jerrrrry / infinicore · Commits

Commit c2e87202, authored Jun 04, 2025 by Catheriany

    Merge remote-tracking branch 'origin/main' into issue/142

Parents: 41818f84, c203635b

Changes: 175 files in total; this page (6 of 9) shows 20 changed files with 1293 additions and 281 deletions (+1293 / -281).
src/utils/custom_types.cc                              +1    -1
src/utils/rearrange.cc                                 +69   -0
src/utils/rearrange.h                                  +3    -0
test/infiniop-test/test_generate/__init__.py           +1    -1
test/infiniop-test/test_generate/infiniop_test.py      +8    -0
test/infiniop-test/test_generate/testcases/add.py      +132  -0
test/infiniop-test/test_generate/testcases/clip.py     +242  -0
test/infiniop-test/test_generate/testcases/mul.py      +124  -0
test/infiniop-test/test_generate/testcases/swiglu.py   +117  -0
test/infiniop/add.py                                   +155  -95
test/infiniop/attention.py                             +84   -165
test/infiniop/avg_pool.py                              +5    -0
test/infiniop/causal_softmax.py                        +58   -14
test/infiniop/clip.py                                  +246  -0
test/infiniop/conv.py                                  +5    -1
test/infiniop/expand.py                                +5    -1
test/infiniop/gemm.py                                  +4    -0
test/infiniop/global_avg_pool.py                       +5    -1
test/infiniop/libinfiniop/utils.py                     +24   -1
test/infiniop/max_pool.py                              +5    -1
src/utils/custom_types.cc
...
...
@@ -43,7 +43,7 @@ fp16_t _f32_to_f16(float val) {
     int32_t exponent = ((f32 >> 23) & 0xFF) - 127; // Extract and de-bias the exponent
     uint32_t mantissa = f32 & 0x7FFFFF;            // Extract the mantissa (fraction part)
-    if (exponent >= 31) { // Special cases for Inf and NaN
+    if (exponent >= 16) { // Special cases for Inf and NaN
         // NaN
         if (exponent == 128 && mantissa != 0) {
             return fp16_t{static_cast<uint16_t>(sign | 0x7E00)};
...
...
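The threshold is the substance of this one-line fix: `exponent` has already been de-biased to the float32 scale, and float16's largest de-biased exponent is 15 (5-bit field, bias 15), so anything with exponent >= 16 must take the Inf/NaN path. The old comparison against 31 (the raw fp16 field maximum) let large-but-finite floats slip past. A quick numpy check of the boundary (a sketch, independent of the C++ helper):

    import numpy as np

    assert np.isfinite(np.float16(65504.0))  # 65504 is the largest fp16 normal (de-biased exponent 15)
    assert np.isinf(np.float16(65536.0))     # 2**16 needs de-biased exponent 16 -> overflows to Inf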
src/utils/rearrange.cc
...
...
@@ -138,4 +138,73 @@ void rearrange(
         }
     }
 }
+
+utils::Result<RearrangeMeta> RearrangeMeta::distributeUnit(const std::vector<size_t> &candidates) const {
+    // Get the current unit size
+    size_t current_unit = _meta[0];
+    // Find a candidate unit that evenly divides the current unit
+    size_t new_unit = 0;
+    for (size_t candidate : candidates) {
+        if (current_unit % candidate == 0) {
+            new_unit = candidate;
+            break;
+        }
+    }
+    // If no suitable candidate was found, return an error
+    if (new_unit == 0) {
+        return INFINI_STATUS_BAD_PARAM;
+    }
+    // If the chosen unit equals the current one, return a copy of this meta
+    if (new_unit == current_unit) {
+        return Result<RearrangeMeta>(_meta);
+    }
+    // Get the current number of dimensions
+    size_t ndim_value = this->ndim();
+    // Allocate the new layout array
+    std::vector<ptrdiff_t> layout(2 + (ndim_value + 1) * 3, 0);
+    // Set the new unit size
+    layout[0] = new_unit;
+    // Compute the expansion factor
+    ptrdiff_t extra = current_unit / new_unit;
+    // Offset of the index strides within the meta array
+    ptrdiff_t idx_offset = 1;
+    // Set up the corresponding pointers into the new layout
+    ptrdiff_t *new_idx = layout.data() + 1;
+    ptrdiff_t *new_dst = layout.data() + 2 + (ndim_value + 1);
+    ptrdiff_t *new_src = layout.data() + 2 + (ndim_value + 1) * 2;
+    // Copy and rescale the index strides;
+    // they must be recomputed for the smaller unit.
+    // First copy the original index strides:
+    for (size_t i = 0; i < ndim_value + 1; ++i) {
+        new_idx[i] = _meta[idx_offset + i] * extra;
+    }
+    // The stride of the last (new) dimension is 1
+    new_idx[ndim_value + 1] = 1;
+    // Copy the destination strides and append the new unit size
+    for (size_t i = 0; i < ndim_value; ++i) {
+        new_dst[i] = dst_strides()[i];
+    }
+    new_dst[ndim_value] = new_unit;
+    // Copy the source strides and append the new unit size
+    for (size_t i = 0; i < ndim_value; ++i) {
+        new_src[i] = src_strides()[i];
+    }
+    new_src[ndim_value] = new_unit;
+    return Result<RearrangeMeta>(layout);
+}
+
 } // namespace utils
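`distributeUnit` picks the first candidate that evenly divides the current copy unit, then rebuilds the layout metadata around it, appending one extra dimension whose stride is the new unit. The selection rule alone, as a minimal Python sketch (assuming the candidates are ordered from most to least preferred):

    def pick_unit(current_unit, candidates):
        # First candidate that evenly divides the current unit wins.
        for candidate in candidates:
            if current_unit % candidate == 0:
                return candidate
        return None  # corresponds to INFINI_STATUS_BAD_PARAM in the C++ version

    assert pick_unit(16, [8, 4, 2]) == 8
    assert pick_unit(6, [4, 3, 2]) == 3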
src/utils/rearrange.h
...
...
@@ -28,6 +28,9 @@ public:
     const ptrdiff_t *src_strides() const;

     void launch(void *dst, const void *src) const;

+    // Split the unit into smaller sizes to make parallelization easier
+    utils::Result<RearrangeMeta> distributeUnit(const std::vector<size_t> &candidates) const;
+
 };

 void rearrange(
...
...
test/infiniop-test/test_generate/__init__.py
-from .infiniop_test import InfiniopTestCase, InfiniopTestWriter, np_dtype_to_ggml, gguf_strides
+from .infiniop_test import InfiniopTestCase, InfiniopTestWriter, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides
test/infiniop-test/test_generate/infiniop_test.py
...
...
@@ -29,6 +29,14 @@ def gguf_strides(*args: int) -> list[int] | None:
     return list(args)[::-1] if args else None


+def contiguous_gguf_strides(shape: tuple[int, ...]) -> list[int]:
+    strides = []
+    acc = 1
+    for size in reversed(shape):
+        strides.append(acc)
+        acc *= size
+    return strides[::-1]
+
+
 class InfiniopTestCase:
     op_name: str
...
...
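The two helpers encode complementary conventions: `gguf_strides` reverses whatever strides you pass in (GGUF stores strides innermost-first), while `contiguous_gguf_strides` computes ordinary row-major element strides for a shape. A quick sanity sketch (values chosen arbitrarily):

    assert gguf_strides(12, 4, 1) == [1, 4, 12]              # reversed for GGUF
    assert contiguous_gguf_strides((2, 3, 4)) == [12, 4, 1]  # row-major element strides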
test/infiniop-test/test_generate/testcases/add.py
0 → 100644
import numpy as np
import gguf
from typing import List
from numpy.lib.stride_tricks import as_strided

from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides


def add(
    a: np.ndarray,
    b: np.ndarray,
):
    return a + b


def process_tensor(a, b, stride_a=None, stride_b=None):
    def normalize_stride(tensor, stride):
        if stride:
            slices = tuple(
                slice(0, 1) if s == 0 else slice(None) for s in stride
            )
            return tensor[slices]
        else:
            return tensor

    a_unique = normalize_stride(a, stride_a)
    b_unique = normalize_stride(b, stride_b)
    return a_unique, b_unique


class AddTestCase(InfiniopTestCase):
    def __init__(
        self,
        a: np.ndarray,
        shape_a: List[int] | None,
        stride_a: List[int] | None,
        b: np.ndarray,
        shape_b: List[int] | None,
        stride_b: List[int] | None,
        c: np.ndarray,
        shape_c: List[int] | None,
        stride_c: List[int] | None,
    ):
        super().__init__("add")
        self.a = a
        self.shape_a = shape_a
        self.stride_a = stride_a
        self.b = b
        self.shape_b = shape_b
        self.stride_b = stride_b
        self.c = c
        self.shape_c = shape_c
        self.stride_c = stride_c

    def write_test(self, test_writer: "InfiniopTestWriter"):
        super().write_test(test_writer)
        if self.shape_a is not None:
            test_writer.add_array(test_writer.gguf_key("a.shape"), self.shape_a)
        if self.shape_b is not None:
            test_writer.add_array(test_writer.gguf_key("b.shape"), self.shape_b)
        if self.shape_c is not None:
            test_writer.add_array(test_writer.gguf_key("c.shape"), self.shape_c)
        if self.stride_a is not None:
            test_writer.add_array(test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a))
        if self.stride_b is not None:
            test_writer.add_array(test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b))
        test_writer.add_array(
            test_writer.gguf_key("c.strides"),
            gguf_strides(
                *self.stride_c
                if self.stride_c is not None
                else contiguous_gguf_strides(self.shape_c)
            ),
        )
        test_writer.add_tensor(test_writer.gguf_key("a"), self.a, raw_dtype=np_dtype_to_ggml(self.a.dtype))
        test_writer.add_tensor(test_writer.gguf_key("b"), self.b, raw_dtype=np_dtype_to_ggml(self.b.dtype))
        test_writer.add_tensor(test_writer.gguf_key("c"), self.c, raw_dtype=np_dtype_to_ggml(self.c.dtype))
        ans = add(
            self.a.astype(np.float64),
            self.b.astype(np.float64),
        )
        test_writer.add_tensor(test_writer.gguf_key("ans"), ans, raw_dtype=gguf.GGMLQuantizationType.F64)


if __name__ == "__main__":
    test_writer = InfiniopTestWriter("add.gguf")
    test_cases = []
    # ==============================================================================
    #  Configuration (Internal Use Only)
    # ==============================================================================
    # These are not meant to be imported from other modules
    _TEST_CASES_ = [
        # shape, a_stride, b_stride, c_stride
        ((13, 4), None, None, None),
        ((13, 4), (10, 1), (10, 1), (10, 1)),
        ((13, 4), (0, 1), None, None),
        ((13, 4, 4), None, None, None),
        ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)),
        ((13, 4, 4), (4, 0, 1), (0, 4, 1), None),
        ((16, 5632), None, None, None),
        ((16, 5632), (13312, 1), (13312, 1), (13312, 1)),
        ((4, 4, 5632), None, None, None),
        ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
    ]
    _TENSOR_DTYPES_ = [np.float32, np.float16]
    for dtype in _TENSOR_DTYPES_:
        for shape, stride_a, stride_b, stride_c in _TEST_CASES_:
            a = np.random.rand(*shape).astype(dtype)
            b = np.random.rand(*shape).astype(dtype)
            c = np.empty(tuple(0 for _ in shape), dtype=dtype)
            a, b = process_tensor(a, b, stride_a, stride_b)
            if stride_c is None:
                stride_c = contiguous_gguf_strides(shape)
            test_case = AddTestCase(
                a=a,
                shape_a=shape,
                stride_a=stride_a,
                b=b,
                shape_b=shape,
                stride_b=stride_b,
                c=c,
                shape_c=shape,
                stride_c=stride_c,
            )
            test_cases.append(test_case)
    test_writer.add_tests(test_cases)
    test_writer.save()
\ No newline at end of file
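The `(0, 1)`-style strides in `_TEST_CASES_` model broadcast inputs: a zero stride means every index along that axis reads the same memory, which is why `normalize_stride` slices those axes down to length 1 before the tensor is written out. A minimal numpy illustration of what a zero stride aliases (a sketch, not part of the generator):

    import numpy as np
    from numpy.lib.stride_tricks import as_strided

    row = np.arange(4, dtype=np.float32)  # shape (4,)
    # View `row` as (13, 4) with byte-stride 0 on the first axis:
    # all 13 "rows" alias the same underlying data, as a (0, 1) stride encodes.
    broadcast = as_strided(row, shape=(13, 4), strides=(0, row.itemsize))
    assert (broadcast == row).all()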
test/infiniop-test/test_generate/testcases/clip.py
0 → 100644
import numpy as np
import gguf
from typing import List, Optional, Tuple

from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides


def clip(
    x: np.ndarray,
    min_val: np.ndarray,
    max_val: np.ndarray,
) -> np.ndarray:
    """
    Clip the values in input tensor x to the range [min_val, max_val].

    Args:
        x: Input tensor
        min_val: Tensor with minimum values (same shape as x)
        max_val: Tensor with maximum values (same shape as x)

    Returns:
        Clipped tensor with the same shape as x
    """
    return np.maximum(np.minimum(x, max_val), min_val)


def random_tensor(shape, dtype):
    """
    Generate a random tensor with values in the range [-2, 2].

    Args:
        shape: Shape of the tensor
        dtype: Data type of the tensor

    Returns:
        Random tensor with the specified shape and dtype
    """
    return np.random.rand(*shape).astype(dtype) * 4.0 - 2.0


class ClipTestCase(InfiniopTestCase):
    """
    Test case for the Clip operator.
    """

    def __init__(
        self,
        x: np.ndarray,
        x_stride: Optional[List[int]],
        min_val: np.ndarray,
        min_stride: Optional[List[int]],
        max_val: np.ndarray,
        max_stride: Optional[List[int]],
        y: np.ndarray,
        y_stride: Optional[List[int]],
    ):
        super().__init__("clip")
        self.x = x
        self.x_stride = x_stride
        self.min_val = min_val
        self.min_stride = min_stride
        self.max_val = max_val
        self.max_stride = max_stride
        self.y = y
        self.y_stride = y_stride

    def write_test(self, test_writer: "InfiniopTestWriter"):
        super().write_test(test_writer)
        # Add strides as arrays if they exist
        if self.x_stride is not None:
            test_writer.add_array(test_writer.gguf_key("x.strides"), self.x_stride)
        if self.min_stride is not None:
            test_writer.add_array(test_writer.gguf_key("min_val.strides"), self.min_stride)
        if self.max_stride is not None:
            test_writer.add_array(test_writer.gguf_key("max_val.strides"), self.max_stride)
        if self.y_stride is not None:
            test_writer.add_array(test_writer.gguf_key("y.strides"), self.y_stride)
        # Add tensors to the test
        test_writer.add_tensor(test_writer.gguf_key("x"), self.x, raw_dtype=np_dtype_to_ggml(self.x.dtype))
        test_writer.add_tensor(test_writer.gguf_key("min_val"), self.min_val, raw_dtype=np_dtype_to_ggml(self.min_val.dtype))
        test_writer.add_tensor(test_writer.gguf_key("max_val"), self.max_val, raw_dtype=np_dtype_to_ggml(self.max_val.dtype))
        test_writer.add_tensor(test_writer.gguf_key("y"), self.y, raw_dtype=np_dtype_to_ggml(self.y.dtype))
        # Calculate the expected result
        ans = clip(
            self.x.astype(np.float64),
            self.min_val.astype(np.float64),
            self.max_val.astype(np.float64),
        )
        # Add the expected result to the test
        test_writer.add_tensor(test_writer.gguf_key("ans"), ans, raw_dtype=gguf.GGMLQuantizationType.F64)


if __name__ == "__main__":
    test_writer = InfiniopTestWriter("clip.gguf")
    # Create test cases for different shapes, strides, and data types
    test_cases = []

    # Test case shapes
    shapes = [
        (10,),          # 1D tensor
        (5, 10),        # 2D tensor
        (2, 3, 4),      # 3D tensor
        (7, 13),        # Prime dimensions
        (1, 1),         # Minimum shape
        (100, 100),     # Large shape
        (16, 16, 16),   # Large 3D
    ]

    # Test case min/max values
    min_max_values = [
        (-1.0, 1.0),        # Standard range
        (0.0, 2.0),         # Positive range
        (-2.0, 0.0),        # Negative range
        (-1000.0, 1000.0),  # Large range
        (-0.001, 0.001),    # Small range
        (0.0, 0.0),         # min=max
    ]

    # Data types to test
    dtypes = [np.float16, np.float32, np.float64]

    # Generate test cases with contiguous tensors
    for shape in shapes:
        for min_val, max_val in min_max_values:
            for dtype in dtypes:
                x = random_tensor(shape, dtype)
                min_tensor = np.full(shape, min_val, dtype=dtype)
                max_tensor = np.full(shape, max_val, dtype=dtype)
                y = np.zeros(shape, dtype=dtype)
                test_cases.append(
                    ClipTestCase(
                        x=x, x_stride=None,
                        min_val=min_tensor, min_stride=None,
                        max_val=max_tensor, max_stride=None,
                        y=y, y_stride=None,
                    )
                )

    # Generate test cases with strided tensors (for 2D shapes only)
    for shape in [s for s in shapes if len(s) == 2]:
        for dtype in dtypes:
            # Row-major stride
            row_stride = gguf_strides(shape[1], 1)
            # Column-major stride
            col_stride = gguf_strides(1, shape[0])

            # Test case with row-major input and output
            x = random_tensor(shape, dtype)
            min_tensor = np.full(shape, -1.0, dtype=dtype)
            max_tensor = np.full(shape, 1.0, dtype=dtype)
            y = np.zeros(shape, dtype=dtype)
            test_cases.append(
                ClipTestCase(
                    x=x, x_stride=row_stride,
                    min_val=min_tensor, min_stride=row_stride,
                    max_val=max_tensor, max_stride=row_stride,
                    y=y, y_stride=row_stride,
                )
            )

            # Test case with column-major input and output
            x = random_tensor(shape, dtype)
            min_tensor = np.full(shape, -1.0, dtype=dtype)
            max_tensor = np.full(shape, 1.0, dtype=dtype)
            y = np.zeros(shape, dtype=dtype)
            test_cases.append(
                ClipTestCase(
                    x=x, x_stride=col_stride,
                    min_val=min_tensor, min_stride=col_stride,
                    max_val=max_tensor, max_stride=col_stride,
                    y=y, y_stride=col_stride,
                )
            )

            # Test case with different strides for input and output
            x = random_tensor(shape, dtype)
            min_tensor = np.full(shape, -1.0, dtype=dtype)
            max_tensor = np.full(shape, 1.0, dtype=dtype)
            y = np.zeros(shape, dtype=dtype)
            test_cases.append(
                ClipTestCase(
                    x=x, x_stride=row_stride,
                    min_val=min_tensor, min_stride=row_stride,
                    max_val=max_tensor, max_stride=row_stride,
                    y=y, y_stride=col_stride,
                )
            )

    # Add all test cases to the writer
    test_writer.add_tests(test_cases)
    # Save the test cases to a GGUF file
    test_writer.save()
    print(f"Generated {len(test_cases)} test cases for the Clip operator")
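One subtlety in the reference `clip`: `np.minimum` is applied first, so `min_val` has the last word whenever the interval is degenerate. The min=max cases above therefore pin every element to that single value (a quick sketch):

    x = np.array([-3.0, 0.5, 3.0])
    zeros = np.zeros(3)
    assert np.array_equal(clip(x, zeros, zeros), zeros)  # min == max collapses the range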
test/infiniop-test/test_generate/testcases/mul.py
0 → 100644
import numpy as np
import gguf
from typing import List

from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides


def mul(a: np.ndarray, b: np.ndarray):
    return np.multiply(a, b)


def random_tensor(shape, dtype):
    rate = 1e-3
    var = 0.5 * rate  # values fall in the range [-5e-4, 5e-4]
    return rate * np.random.rand(*shape).astype(dtype) - var


class MulTestCase(InfiniopTestCase):
    def __init__(
        self,
        a: np.ndarray,
        shape_a: List[int] | None,
        stride_a: List[int] | None,
        b: np.ndarray,
        shape_b: List[int] | None,
        stride_b: List[int] | None,
        c: np.ndarray,
        shape_c: List[int] | None,
        stride_c: List[int] | None,
    ):
        super().__init__("mul")
        self.a = a
        self.shape_a = shape_a
        self.stride_a = stride_a
        self.b = b
        self.shape_b = shape_b
        self.stride_b = stride_b
        self.c = c
        self.shape_c = shape_c
        self.stride_c = stride_c

    def write_test(self, test_writer: "InfiniopTestWriter"):
        super().write_test(test_writer)
        if self.shape_a is not None:
            test_writer.add_array(test_writer.gguf_key("a.shape"), self.shape_a)
        if self.shape_b is not None:
            test_writer.add_array(test_writer.gguf_key("b.shape"), self.shape_b)
        if self.shape_c is not None:
            test_writer.add_array(test_writer.gguf_key("c.shape"), self.shape_c)
        if self.stride_a is not None:
            test_writer.add_array(test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a))
        if self.stride_b is not None:
            test_writer.add_array(test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b))
        test_writer.add_array(
            test_writer.gguf_key("c.strides"),
            gguf_strides(
                *self.stride_c
                if self.stride_c is not None
                else contiguous_gguf_strides(self.shape_c)
            ),
        )
        test_writer.add_tensor(test_writer.gguf_key("a"), self.a, raw_dtype=np_dtype_to_ggml(self.a.dtype))
        test_writer.add_tensor(test_writer.gguf_key("b"), self.b, raw_dtype=np_dtype_to_ggml(self.b.dtype))
        test_writer.add_tensor(test_writer.gguf_key("c"), self.c, raw_dtype=np_dtype_to_ggml(self.c.dtype))
        a_fp64 = self.a.astype(np.float64)
        b_fp64 = self.b.astype(np.float64)
        ans_fp64 = np.multiply(a_fp64, b_fp64)
        ans = mul(self.a, self.b)
        test_writer.add_tensor(test_writer.gguf_key("ans"), ans, raw_dtype=np_dtype_to_ggml(ans.dtype))
        test_writer.add_tensor(
            test_writer.gguf_key("ans_fp64"),
            ans_fp64,
            raw_dtype=np_dtype_to_ggml(ans_fp64.dtype),
        )


if __name__ == '__main__':
    test_writer = InfiniopTestWriter("mul.gguf")
    test_cases = []
    _TEST_CASES_ = [
        ((2, 3), (3, 1), (1, 2), (3, 1)),
        ((2, 3), (1, 2), (3, 1), (1, 2)),
        ((2, 3), (3, 1), (3, 1), (1, 2)),
        ((4, 6), (1, 4), (1, 5), (6, 1)),
        ((1, 2048), (1, 1), (2048, 1), (1, 1)),
        ((2048, 2048), None, (1, 2048), None),
        ((2, 4, 2048), (4 * 2048, 2048, 1), (1, 2, 8), (4 * 2048, 2048, 1)),
        ((2, 4, 2048), (1, 2, 8), None, (1, 2, 8)),
        ((2048, 2560), (2560, 1), (1, 2048), (2560, 1)),
        ((4, 48, 64), (64 * 48, 64, 1), (1, 4, 192), None),
        ((4, 48, 64), None, (1, 4, 192), (48 * 64, 64, 1)),
    ]
    _TENSOR_DTYPES_ = [np.float32, np.float16]
    for dtype in _TENSOR_DTYPES_:
        for shape, stride_a, stride_b, stride_c in _TEST_CASES_:
            a = random_tensor(shape, dtype)
            b = random_tensor(shape, dtype)
            c = np.empty(tuple(0 for _ in shape), dtype=dtype)
            test_cases.append(
                MulTestCase(
                    a=a, shape_a=shape, stride_a=stride_a,
                    b=b, shape_b=shape, stride_b=stride_b,
                    c=c, shape_c=shape, stride_c=stride_c,
                )
            )
    test_writer.add_tests(test_cases)
    test_writer.save()
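Unlike add.py, this generator stores both `ans` in the working dtype and `ans_fp64`, letting the harness separate real kernel bugs from rounding: with inputs confined to [-5e-4, 5e-4], products land around 1e-7, where float16 (subnormal at that magnitude) cannot be exact. A quick sketch of why the float64 reference matters:

    p = 3e-4 * 4e-4            # a typical product magnitude, ~1.2e-07
    assert np.float16(p) != p  # float16 loses precision in the subnormal range
    assert np.float64(p) == p  # the float64 reference keeps it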
test/infiniop-test/test_generate/testcases/swiglu.py
0 → 100644
import numpy as np
import gguf
from typing import List

from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides


def swiglu(
    a: np.ndarray,
    b: np.ndarray,
):
    c = a * b / (1.0 + np.exp(-b))
    return c


class SwiGLUTestCase(InfiniopTestCase):
    def __init__(
        self,
        a: np.ndarray,
        shape_a: List[int] | None,
        stride_a: List[int] | None,
        b: np.ndarray,
        shape_b: List[int] | None,
        stride_b: List[int] | None,
        c: np.ndarray,
        shape_c: List[int] | None,
        stride_c: List[int] | None,
    ):
        super().__init__("swiglu")
        self.a = a
        self.shape_a = shape_a
        self.stride_a = stride_a
        self.b = b
        self.shape_b = shape_b
        self.stride_b = stride_b
        self.c = c
        self.shape_c = shape_c
        self.stride_c = stride_c

    def write_test(self, test_writer: "InfiniopTestWriter"):
        super().write_test(test_writer)
        if self.shape_a is not None:
            test_writer.add_array(test_writer.gguf_key("a.shape"), self.shape_a)
        if self.shape_b is not None:
            test_writer.add_array(test_writer.gguf_key("b.shape"), self.shape_b)
        if self.shape_c is not None:
            test_writer.add_array(test_writer.gguf_key("c.shape"), self.shape_c)
        if self.stride_a is not None:
            test_writer.add_array(test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a))
        if self.stride_b is not None:
            test_writer.add_array(test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b))
        test_writer.add_array(
            test_writer.gguf_key("c.strides"),
            gguf_strides(
                *self.stride_c
                if self.stride_c is not None
                else contiguous_gguf_strides(self.shape_c)
            ),
        )
        test_writer.add_tensor(test_writer.gguf_key("a"), self.a, raw_dtype=np_dtype_to_ggml(self.a.dtype))
        test_writer.add_tensor(test_writer.gguf_key("b"), self.b, raw_dtype=np_dtype_to_ggml(self.b.dtype))
        test_writer.add_tensor(test_writer.gguf_key("c"), self.c, raw_dtype=np_dtype_to_ggml(self.c.dtype))
        ans = swiglu(
            self.a.astype(np.float64),
            self.b.astype(np.float64),
        )
        test_writer.add_tensor(test_writer.gguf_key("ans"), ans, raw_dtype=gguf.GGMLQuantizationType.F64)


if __name__ == "__main__":
    test_writer = InfiniopTestWriter("swiglu.gguf")
    test_cases = []
    _TEST_CASES_ = [
        ((64, 128), None, None, None),
        ((64, 121), None, None, None),
        ((15, 512), None, None, None),
        ((13, 4), None, None, None),
        ((13, 4), (10, 1), (10, 1), (10, 1)),
        ((13, 4, 4), None, None, None),
        ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)),
        ((16, 5632), None, None, None),
        ((16, 5632), (13312, 1), (13312, 1), (13312, 1)),
        ((16, 5632), (5632, 1), (5632, 1), (1, 16)),
        ((2, 3, 400), (1200, 400, 1), (1200, 400, 1), (1, 2, 6)),
        ((4, 4, 5632), None, None, None),
        ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
    ]
    _TENSOR_DTYPES_ = [np.float32, np.float16]
    for dtype in _TENSOR_DTYPES_:
        for shape, stride_a, stride_b, stride_c in _TEST_CASES_:
            a = np.random.rand(*shape).astype(dtype)
            b = np.random.rand(*shape).astype(dtype)
            c = np.empty(tuple(0 for _ in shape), dtype=dtype)
            test_case = SwiGLUTestCase(
                a=a,
                shape_a=list(shape),
                stride_a=stride_a,
                b=b,
                shape_b=list(shape),
                stride_b=stride_b,
                c=c,
                shape_c=list(shape),
                stride_c=stride_c,
            )
            test_cases.append(test_case)
    test_writer.add_tests(test_cases)
    test_writer.save()
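The reference computes SwiGLU as a · b · σ(b): the gate input b passes through SiLU (b · sigmoid(b)) and scales a. A one-line numpy check of that identity (a sketch):

    a, b = np.random.rand(4, 8), np.random.rand(4, 8)
    silu_b = b / (1.0 + np.exp(-b))  # SiLU(b) = b * sigmoid(b)
    assert np.allclose(swiglu(a, b), a * silu_b)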
test/infiniop/add.py
-from ctypes import POINTER, Structure, c_int32, c_void_p
-import torch
+from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
 import ctypes
 import sys
 import os

 sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
-from operatorspy import (
-    open_lib,
-    to_tensor,
-    DeviceEnum,
-)
-from operatorspy.tests.test_utils import get_args
+from libinfiniop import (
+    infiniopHandle_t,
+    infiniopTensorDescriptor_t,
+    create_handle,
+    destroy_handle,
+    open_lib,
+    to_tensor,
+    get_test_devices,
+    check_error,
+    rearrange_if_needed,
+    test_operator,
+    get_args,
+    debug,
+    get_tolerance,
+    profile_operation,
+    create_workspace,
+)
 from enum import Enum, auto
 import torch

+# ==============================================================================
+#  Configuration (Internal Use Only)
+# ==============================================================================
+# These are not meant to be imported from other modules
+_TEST_CASES_ = [
+    # shape, a_stride, b_stride, c_stride
+    ((13, 4), None, None, None),
+    ((13, 4), (10, 1), (10, 1), (10, 1)),
+    ((13, 4), (0, 1), None, None),
+    ((13, 4, 4), None, None, None),
+    ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)),
+    ((13, 4, 4), (4, 0, 1), (0, 4, 1), None),
+    ((16, 5632), None, None, None),
+    ((16, 5632), (13312, 1), (13312, 1), (13312, 1)),
+    ((4, 4, 5632), None, None, None),
+    ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
+]


 class Inplace(Enum):
...
...
@@ -26,6 +43,35 @@ class Inplace(Enum):
     INPLACE_B = auto()

+# Inplace options applied for each test case in _TEST_CASES_
+_INPLACE = [
+    Inplace.OUT_OF_PLACE,
+    Inplace.INPLACE_A,
+    Inplace.INPLACE_B,
+]
+
+# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
+_TEST_CASES = [
+    test_case + (inplace_item,)
+    for test_case in _TEST_CASES_
+    for inplace_item in _INPLACE
+]
+
+# Data types used for testing
+_TENSOR_DTYPES = [torch.float16, torch.float32]
+
+# Tolerance map for different data types
+_TOLERANCE_MAP = {
+    torch.float16: {"atol": 1e-3, "rtol": 1e-3},
+    torch.float32: {"atol": 1e-7, "rtol": 1e-7},
+}
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
+

 class AddDescriptor(Structure):
     _fields_ = [("device", c_int32)]
...
...
@@ -37,42 +83,71 @@ def add(x, y):
     return torch.add(x, y)

+def process_tensors(c, c_strides, a, a_stride, b, b_stride, inplace):
+    """
+    rearrange the tensors if needed and apply the inplace config.
+    if inplace is true and the output (i.e., c) is placed to the broadcasted input,
+    the inplace config is ignored and out-of-place is used
+    """
+    original_c_strides = c_strides if c_strides else c.stride()
+
+    def _rearrange(tensor, strides):
+        if strides and 0 in strides:
+            tensor.set_(tensor.untyped_storage(), 0, tensor.shape, strides)
+            return tensor
+        else:
+            return rearrange_if_needed(tensor, strides)
+
+    a, b, c = [
+        _rearrange(tensor, stride)
+        for tensor, stride in zip([a, b, c], [a_stride, b_stride, c_strides])
+    ]
+    c = (
+        c
+        if inplace == Inplace.OUT_OF_PLACE
+        else (a if inplace == Inplace.INPLACE_A else b)
+    )
+    # if inplace is true and c has a broadcasted config, reset it to the original unbroadcasted strides
+    if 0 in c.stride():
+        c.set_(c.untyped_storage(), 0, c.shape, original_c_strides)
+    return a, b, c
+

 def test(
     lib,
     handle,
     torch_device,
-    c_shape,
-    a_shape,
-    b_shape,
-    tensor_dtype=torch.float16,
+    shape,
+    a_stride=None,
+    b_stride=None,
+    c_stride=None,
+    inplace=Inplace.OUT_OF_PLACE,
+    dtype=torch.float16,
+    sync=None,
 ):
     print(
-        f"Testing Add on {torch_device} with c_shape:{c_shape} a_shape:{a_shape} b_shape:{b_shape} dtype:{tensor_dtype} inplace:{inplace.name}"
+        f"Testing Add on {torch_device} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} "
+        f"dtype:{dtype} inplace:{inplace}"
     )
-    if a_shape != b_shape and inplace != Inplace.OUT_OF_PLACE:
-        print("Unsupported test: broadcasting does not support in-place")
-        return
-
-    a = torch.rand(a_shape, dtype=tensor_dtype).to(torch_device)
-    b = torch.rand(b_shape, dtype=tensor_dtype).to(torch_device)
-    c = (
-        torch.rand(c_shape, dtype=tensor_dtype).to(torch_device)
-        if inplace == Inplace.OUT_OF_PLACE
-        else (a if inplace == Inplace.INPLACE_A else b)
-    )
+    a = torch.rand(shape, dtype=dtype).to(torch_device)
+    b = torch.rand(shape, dtype=dtype).to(torch_device)
+    c = torch.rand(shape, dtype=dtype).to(torch_device)
+
+    a, b, c = process_tensors(c, c_stride, a, a_stride, b, b_stride, inplace)

     ans = add(a, b)

-    a_tensor = to_tensor(a, lib)
-    b_tensor = to_tensor(b, lib)
+    a_tensor, b_tensor = [to_tensor(tensor, lib) for tensor in [a, b]]
     c_tensor = (
         to_tensor(c, lib)
         if inplace == Inplace.OUT_OF_PLACE
         else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor)
     )
-    descriptor = infiniopAddDescriptor_t()
+
+    if sync is not None:
+        sync()
+
+    descriptor = infiniopAddDescriptor_t()
     check_error(
         lib.infiniopCreateAddDescriptor(
             handle,
...
@@ -84,74 +159,48 @@ def test(
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
c_tensor
.
descriptor
.
contents
.
invalidate
()
a_tensor
.
descriptor
.
contents
.
invalidate
()
b_tensor
.
descriptor
.
contents
.
invalidate
()
for
tensor
in
[
a_tensor
,
b_tensor
,
c_tensor
]:
tensor
.
destroyDesc
(
lib
)
workspace_size
=
c_uint64
(
0
)
check_error
(
lib
.
infiniop
Add
(
descriptor
,
c_tensor
.
data
,
a_tensor
.
data
,
b_tensor
.
data
,
Non
e
)
lib
.
infiniop
GetAddWorkspaceSize
(
descriptor
,
ctypes
.
byref
(
workspace_siz
e
)
)
)
assert
torch
.
allclose
(
c
,
ans
,
atol
=
0
,
rtol
=
1e-3
)
check_error
(
lib
.
infiniopDestroyAddDescriptor
(
descriptor
))
def
test_cpu
(
lib
,
test_cases
):
device
=
DeviceEnum
.
DEVICE_CPU
handle
=
create_handle
(
lib
,
device
)
for
c_shape
,
a_shape
,
b_shape
,
inplace
in
test_cases
:
# fmt: off
test
(
lib
,
handle
,
"cpu"
,
c_shape
,
a_shape
,
b_shape
,
tensor_dtype
=
torch
.
float16
,
inplace
=
inplace
)
test
(
lib
,
handle
,
"cpu"
,
c_shape
,
a_shape
,
b_shape
,
tensor_dtype
=
torch
.
float32
,
inplace
=
inplace
)
# fmt: on
destroy_handle
(
lib
,
handle
)
def
test_cuda
(
lib
,
test_cases
):
device
=
DeviceEnum
.
DEVICE_CUDA
handle
=
create_handle
(
lib
,
device
)
for
c_shape
,
a_shape
,
b_shape
,
inplace
in
test_cases
:
# fmt: off
test
(
lib
,
handle
,
"cuda"
,
c_shape
,
a_shape
,
b_shape
,
tensor_dtype
=
torch
.
float16
,
inplace
=
inplace
)
test
(
lib
,
handle
,
"cuda"
,
c_shape
,
a_shape
,
b_shape
,
tensor_dtype
=
torch
.
float32
,
inplace
=
inplace
)
# fmt: on
destroy_handle
(
lib
,
handle
)
workspace
=
create_workspace
(
workspace_size
.
value
,
c
.
device
)
def
lib_add
():
check_error
(
lib
.
infiniopAdd
(
descriptor
,
workspace
.
data_ptr
()
if
workspace
is
not
None
else
None
,
workspace_size
.
value
,
c_tensor
.
data
,
a_tensor
.
data
,
b_tensor
.
data
,
None
,
)
)
lib_add
()
def
test_bang
(
lib
,
test_cases
):
import
torch_mlu
atol
,
rtol
=
get_tolerance
(
_TOLERANCE_MAP
,
dtype
)
if
DEBUG
:
debug
(
c
,
ans
,
atol
=
atol
,
rtol
=
rtol
)
assert
torch
.
allclose
(
c
,
ans
,
atol
=
atol
,
rtol
=
rtol
)
device
=
DeviceEnum
.
DEVICE_BANG
handle
=
create_handle
(
lib
,
device
)
for
c_shape
,
a_shape
,
b_shape
,
inplace
in
test_cases
:
# Profiling workflow
if
PROFILE
:
# fmt: off
test
(
lib
,
handle
,
"mlu"
,
c_shape
,
a_shape
,
b_shape
,
tensor_dtype
=
torch
.
float16
,
inplace
=
inplace
)
test
(
lib
,
handle
,
"mlu"
,
c_shape
,
a_shape
,
b_shape
,
tensor_dtype
=
torch
.
float32
,
inplace
=
inplace
)
profile_operation
(
"PyTorch"
,
lambda
:
add
(
a
,
b
),
torch_device
,
NUM_PRERUN
,
NUM_ITERATIONS
)
profile_operation
(
" lib"
,
lambda
:
lib_add
(),
torch_device
,
NUM_PRERUN
,
NUM_ITERATIONS
)
# fmt: on
destroy_handle
(
lib
,
handle
)
check_error
(
lib
.
infiniopDestroyAddDescriptor
(
descriptor
)
)
 if __name__ == "__main__":
-    test_cases = [
-        # fmt: off
-        # c_shape, a_shape, b_shape, inplace
-        # ((32, 150, 512000), (32, 150, 512000), (32, 150, 512000), Inplace.OUT_OF_PLACE),
-        # ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE),
-        # ((32, 150, 51200), (32, 150, 51200), (32, 150, 51200), Inplace.OUT_OF_PLACE),
-        ((1, 3), (1, 3), (1, 3), Inplace.OUT_OF_PLACE),
-        ((), (), (), Inplace.OUT_OF_PLACE),
-        ((3, 3), (3, 3), (3, 3), Inplace.OUT_OF_PLACE),
-        ((2, 20, 3), (2, 1, 3), (2, 20, 3), Inplace.OUT_OF_PLACE),
-        ((32, 20, 512), (32, 20, 512), (32, 20, 512), Inplace.INPLACE_A),
-        ((32, 20, 512), (32, 20, 512), (32, 20, 512), Inplace.INPLACE_B),
-        ((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE),
-        ((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE),
-        ((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE),
-        ((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE),
-        ((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE),
-        # fmt: on
-    ]
     args = get_args()
     lib = open_lib()
     lib.infiniopCreateAddDescriptor.restype = c_int32
     lib.infiniopCreateAddDescriptor.argtypes = [
         infiniopHandle_t,
...
...
@@ -160,25 +209,36 @@ if __name__ == "__main__":
infiniopTensorDescriptor_t
,
infiniopTensorDescriptor_t
,
]
lib
.
infiniopGetAddWorkspaceSize
.
restype
=
c_int32
lib
.
infiniopGetAddWorkspaceSize
.
argtypes
=
[
infiniopAddDescriptor_t
,
POINTER
(
c_uint64
),
]
lib
.
infiniopAdd
.
restype
=
c_int32
lib
.
infiniopAdd
.
argtypes
=
[
infiniopAddDescriptor_t
,
c_void_p
,
c_uint64
,
c_void_p
,
c_void_p
,
c_void_p
,
c_void_p
,
]
lib
.
infiniopDestroyAddDescriptor
.
restype
=
c_int32
lib
.
infiniopDestroyAddDescriptor
.
argtypes
=
[
infiniopAddDescriptor_t
,
]
if
args
.
cpu
:
test_cpu
(
lib
,
test_cases
)
if
args
.
cuda
:
test_cuda
(
lib
,
test_cases
)
if
args
.
bang
:
test_bang
(
lib
,
test_cases
)
if
not
(
args
.
cpu
or
args
.
cuda
or
args
.
bang
):
test_cpu
(
lib
,
test_cases
)
# Configure testing options
DEBUG
=
args
.
debug
PROFILE
=
args
.
profile
NUM_PRERUN
=
args
.
num_prerun
NUM_ITERATIONS
=
args
.
num_iterations
for
device
in
get_test_devices
(
args
):
test_operator
(
lib
,
device
,
test
,
_TEST_CASES
,
_TENSOR_DTYPES
)
print
(
"
\033
[92mTest passed!
\033
[0m"
)
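The migration above also changes the infiniopAdd calling convention: the kernel now takes an explicitly queried workspace. Consolidating the scattered hunks into one sequence, the new lifecycle looks like this (a sketch assembled from the diff; error handling elided):

    descriptor = infiniopAddDescriptor_t()
    check_error(lib.infiniopCreateAddDescriptor(
        handle, ctypes.byref(descriptor),
        c_tensor.descriptor, a_tensor.descriptor, b_tensor.descriptor))

    workspace_size = c_uint64(0)
    check_error(lib.infiniopGetAddWorkspaceSize(descriptor, ctypes.byref(workspace_size)))
    workspace = create_workspace(workspace_size.value, c.device)

    check_error(lib.infiniopAdd(
        descriptor,
        workspace.data_ptr() if workspace is not None else None,
        workspace_size.value,
        c_tensor.data, a_tensor.data, b_tensor.data, None))

    check_error(lib.infiniopDestroyAddDescriptor(descriptor))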
test/infiniop/attention.py
-from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float, c_bool
+from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p
 import ctypes
 import sys
 import os

 sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
-from operatorspy import (
+from libinfiniop import (
     open_lib,
     to_tensor,
     CTensor,
-    DeviceEnum,
+    infiniopHandle_t,
+    infiniopTensorDescriptor_t,
+    create_handle,
+    destroy_handle,
     check_error,
     rearrange_tensor,
     create_workspace,
+    get_args,
+    get_test_devices,
+    test_operator,
+    debug,
+    get_tolerance,
+    profile_operation,
 )
-from operatorspy.tests.test_utils import get_args
 import torch
 import torch.nn.functional as F


 class AttentionDescriptor(Structure):
...
...
@@ -95,12 +95,13 @@ def test(
     pos,
     k_cache_buf_len,
     v_cache_buf_len,
-    dtype=torch.float16,
     q_stride=None,
     k_stride=None,
     v_stride=None,
     k_cache_stride=None,
     v_cache_stride=None,
+    dtype=torch.float16,
+    sync=None,
 ):
     print(
         f"Testing Attention on {torch_device} with n_q_head:{n_q_head} n_kv_head:{n_kv_head} seq_len:{seq_len} head_dim:{head_dim} pos:{pos} "
...
...
@@ -140,6 +141,9 @@ def test(
     k_cache_tensor = to_tensor(k_cache, lib)
     v_cache_tensor = to_tensor(v_cache, lib)

+    if sync is not None:
+        sync()
+
     descriptor = infiniopAttentionDescriptor_t()
     check_error(
         lib.infiniopCreateAttentionDescriptor(
...
...
@@ -156,12 +160,15 @@ def test(
     )

     # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
-    out_tensor.descriptor.contents.invalidate()
-    q_tensor.descriptor.contents.invalidate()
-    k_tensor.descriptor.contents.invalidate()
-    v_tensor.descriptor.contents.invalidate()
-    k_cache_tensor.descriptor.contents.invalidate()
-    v_cache_tensor.descriptor.contents.invalidate()
+    for tensor in [
+        out_tensor,
+        q_tensor,
+        k_tensor,
+        v_tensor,
+        k_cache_tensor,
+        v_cache_tensor,
+    ]:
+        tensor.destroyDesc(lib)

     workspace_size = c_uint64(0)
     check_error(
...
...
@@ -169,152 +176,52 @@ def test(
     )
     workspace = create_workspace(workspace_size.value, out.device)

-    check_error(
-        lib.infiniopAttention(
-            descriptor,
-            workspace.data_ptr() if workspace is not None else None,
-            workspace_size.value,
-            out_tensor.data,
-            q_tensor.data,
-            k_tensor.data,
-            v_tensor.data,
-            k_cache_tensor.data,
-            v_cache_tensor.data,
-            None,
-        )
-    )
-
-    assert torch.allclose(out, ans, atol=1e-4, rtol=1e-2)
-    check_error(lib.infiniopDestroyAttentionDescriptor(descriptor))
-
-
-def test_cpu(lib, test_cases):
-    device = DeviceEnum.DEVICE_CPU
-    handle = create_handle(lib, device)
-    for (
-        n_q_head,
-        n_kv_head,
-        seq_len,
-        head_dim,
-        pos,
-        k_cache_buf_len,
-        v_cache_buf_len,
-        dtype,
-        q_stride,
-        k_stride,
-        v_stride,
-        k_cache_stride,
-        v_cache_stride,
-    ) in test_cases:
-        test(
-            lib,
-            handle,
-            "cpu",
-            n_q_head,
-            n_kv_head,
-            seq_len,
-            head_dim,
-            pos,
-            k_cache_buf_len,
-            v_cache_buf_len,
-            dtype,
-            q_stride,
-            k_stride,
-            v_stride,
-            k_cache_stride,
-            v_cache_stride,
-        )
-    destroy_handle(lib, handle)
-
-
-def test_cuda(lib, test_cases):
-    device = DeviceEnum.DEVICE_CUDA
-    handle = create_handle(lib, device)
-    for (
-        n_q_head,
-        n_kv_head,
-        seq_len,
-        head_dim,
-        pos,
-        k_cache_buf_len,
-        v_cache_buf_len,
-        dtype,
-        q_stride,
-        k_stride,
-        v_stride,
-        k_cache_stride,
-        v_cache_stride,
-    ) in test_cases:
-        test(
-            lib,
-            handle,
-            "cuda",
-            n_q_head,
-            n_kv_head,
-            seq_len,
-            head_dim,
-            pos,
-            k_cache_buf_len,
-            v_cache_buf_len,
-            dtype,
-            q_stride,
-            k_stride,
-            v_stride,
-            k_cache_stride,
-            v_cache_stride,
-        )
-    destroy_handle(lib, handle)
-
-
-def test_bang(lib, test_cases):
-    import torch_mlu
-
-    device = DeviceEnum.DEVICE_BANG
-    handle = create_handle(lib, device)
-    for (
-        n_q_head,
-        n_kv_head,
-        seq_len,
-        head_dim,
-        pos,
-        k_cache_buf_len,
-        v_cache_buf_len,
-        dtype,
-        q_stride,
-        k_stride,
-        v_stride,
-        k_cache_stride,
-        v_cache_stride,
-    ) in test_cases:
-        test(
-            lib,
-            handle,
-            "mlu",
-            n_q_head,
-            n_kv_head,
-            seq_len,
-            head_dim,
-            pos,
-            k_cache_buf_len,
-            v_cache_buf_len,
-            dtype,
-            q_stride,
-            k_stride,
-            v_stride,
-            k_cache_stride,
-            v_cache_stride,
-        )
-    destroy_handle(lib, handle)
+    def lib_attention():
+        check_error(
+            lib.infiniopAttention(
+                descriptor,
+                workspace.data_ptr() if workspace is not None else None,
+                workspace_size.value,
+                out_tensor.data,
+                q_tensor.data,
+                k_tensor.data,
+                v_tensor.data,
+                k_cache_tensor.data,
+                v_cache_tensor.data,
+                None,
+            )
+        )
+
+    lib_attention()
+
+    # Validate results
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        debug(out, ans, atol=atol, rtol=rtol)
+    assert torch.allclose(out, ans, atol=atol, rtol=rtol)
+
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
+        profile_operation("PyTorch", lambda: attention(q, k, v, k_cache, v_cache, pos), torch_device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_attention(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
+    check_error(lib.infiniopDestroyAttentionDescriptor(descriptor))
 if __name__ == "__main__":
+    _TENSOR_DTYPES = [torch.float16, torch.float32]
+
+    # Tolerance map for different data types
+    _TOLERANCE_MAP = {
+        torch.float16: {"atol": 1e-4, "rtol": 1e-2},
+        torch.float32: {"atol": 1e-5, "rtol": 1e-3},
+    }
+
+    DEBUG = False
+    PROFILE = False
+    NUM_PRERUN = 10
+    NUM_ITERATIONS = 1000
+
     test_cases = [
         # prefill
         (
...
...
@@ -325,7 +232,6 @@ if __name__ == "__main__":
0
,
# pos
2048
,
# k_cache_buf_len
2048
,
# v_cache_buf_len
torch
.
float16
,
# dtype
[
64
,
2560
,
1
],
# q_stride
[
64
,
2560
,
1
],
# k_stride
[
64
,
2560
,
1
],
# v_stride
...
...
@@ -341,7 +247,6 @@ if __name__ == "__main__":
3
,
# pos
2048
,
# k_cache_buf_len
2048
,
# v_cache_buf_len
torch
.
float16
,
# dtype
[
64
,
2560
,
1
],
# q_stride
[
64
,
2560
,
1
],
# k_stride
[
64
,
2560
,
1
],
# v_stride
...
...
@@ -357,13 +262,26 @@ if __name__ == "__main__":
1
,
# pos
8
,
# k_cache_buf_len
8
,
# v_cache_buf_len
torch
.
float16
,
# dtype
None
,
# q_stride
None
,
# k_stride
None
,
# v_stride
None
,
# k_cache_stride
None
,
# v_cache_stride
),
(
28
,
# n_q_head
28
,
# n_kv_head
15
,
# seq_len
128
,
# head_dim
0
,
# pos
2048
,
# k_cache_buf_len
2048
,
# v_cache_buf_len
[
128
,
10752
,
1
],
# q_stride
[
128
,
10752
,
1
],
# k_stride
[
128
,
10752
,
1
],
# v_stride
[
128
,
3584
,
1
],
# k_cache_stride
[
128
,
3584
,
1
],
# v_cache_stride
),
]
args
=
get_args
()
lib
=
open_lib
()
...
...
@@ -406,12 +324,13 @@ if __name__ == "__main__":
infiniopAttentionDescriptor_t
,
]
if
args
.
cpu
:
test_cpu
(
lib
,
test_cases
)
if
args
.
cuda
:
test_cuda
(
lib
,
test_cases
)
if
args
.
bang
:
test_bang
(
lib
,
test_cases
)
if
not
(
args
.
cpu
or
args
.
cuda
or
args
.
bang
):
test_cpu
(
lib
,
test_cases
)
# Configure testing options
DEBUG
=
args
.
debug
PROFILE
=
args
.
profile
NUM_PRERUN
=
args
.
num_prerun
NUM_ITERATIONS
=
args
.
num_iterations
# Execute tests
for
device
in
get_test_devices
(
args
):
test_operator
(
lib
,
device
,
test
,
test_cases
,
_TENSOR_DTYPES
)
print
(
"
\033
[92mTest passed!
\033
[0m"
)
test/infiniop/avg_pool.py
...
...
@@ -88,6 +88,7 @@ def test(
     padding,
     strides,
     tensor_dtype=torch.float16,
+    sync=None,
 ):
     print(
         f"Testing AvgPool on {torch_device} with x_shape:{x_shape} kernel_shape:{k_shape} padding:{padding} strides:{strides} dtype:{tensor_dtype}"
...
...
@@ -109,6 +110,10 @@ def test(
     x_tensor = to_tensor(x, lib)
     y_tensor = to_tensor(y, lib)

+    if sync is not None:
+        sync()
+
     descriptor = infiniopAvgPoolDescriptor_t()
     check_error(
...
...
test/infiniop/causal_softmax.py
...
...
@@ -16,18 +16,20 @@ from libinfiniop import (
     get_tolerance,
     profile_operation,
 )
+from enum import Enum, auto

 # ==============================================================================
 #  Configuration (Internal Use Only)
 # ==============================================================================
 # These are not meant to be imported from other modules
-_TEST_CASES = [
-    # x_shape, x_stride
-    ((32, 512), None),
-    ((32, 512), (1024, 1)),
-    ((32, 5, 5), None),
-    ((32, 20, 512), None),
-    ((32, 20, 512), (20480, 512, 1)),
-    # Ascend does not yet support non-contiguous layouts
+_TEST_CASES_ = [
+    # shape, x_stride, y_stride
+    ((3, 3), None, None),
+    ((32, 512), None, None),
+    ((32, 512), (1024, 1), (1024, 1)),
+    ((32, 5, 5), None, None),
+    ((32, 20, 512), None, None),
+    ((32, 20, 512), (20480, 512, 1), None),
 ]
# Data types used for testing
...
...
@@ -35,9 +37,26 @@ _TENSOR_DTYPES = [torch.float16]

 # Tolerance map for different data types
 _TOLERANCE_MAP = {
-    torch.float16: {"atol": 0, "rtol": 1e-2},
+    torch.float16: {"atol": 1e-3, "rtol": 1e-2},
 }

+
+class Inplace(Enum):
+    OUT_OF_PLACE = auto()
+    INPLACE_X = auto()
+
+
+_INPLACE = [
+    Inplace.INPLACE_X,
+    Inplace.OUT_OF_PLACE,
+]
+
+_TEST_CASES = [
+    test_case + (inplace_item,)
+    for test_case in _TEST_CASES_
+    for inplace_item in _INPLACE
+]
+
 DEBUG = False
 PROFILE = False
 NUM_PRERUN = 10
...
...
@@ -59,12 +78,22 @@ def causal_softmax(x):
     return torch.nn.functional.softmax(masked, dim=-1).to(type)


-def test(lib, handle, torch_device, x_shape, x_stride=None, dtype=torch.float16):
+def test(lib, handle, torch_device, shape, x_stride=None, y_stride=None, inplace=Inplace.OUT_OF_PLACE, dtype=torch.float16, sync=None):
     print(
-        f"Testing CausalSoftmax on {torch_device} with x_shape:{x_shape} x_stride:{x_stride} dtype:{dtype}"
+        f"Testing CausalSoftmax on {torch_device} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} "
+        f"dtype:{dtype} inplace:{inplace}"
     )
-    x = torch.rand(x_shape, dtype=dtype).to(torch_device)
+    x = torch.rand(shape, dtype=dtype).to(torch_device)

     ans = causal_softmax(x)
...
...
@@ -72,10 +101,21 @@ def test(lib, handle, torch_device, x_shape, x_stride=None, dtype=torch.float16)
     x_tensor = to_tensor(x, lib)

+    if inplace == Inplace.INPLACE_X:
+        y = x
+        y_tensor = x_tensor
+    else:
+        y = torch.zeros(shape, dtype=dtype).to(torch_device)
+        y = rearrange_if_needed(y, y_stride)
+        y_tensor = to_tensor(y, lib)
+
+    if sync is not None:
+        sync()
+
     descriptor = infiniopCausalSoftmaxDescriptor_t()
     check_error(
         lib.infiniopCreateCausalSoftmaxDescriptor(
-            handle, ctypes.byref(descriptor), x_tensor.descriptor
+            handle, ctypes.byref(descriptor), y_tensor.descriptor, x_tensor.descriptor
         )
     )
...
...
@@ -96,17 +136,21 @@ def test(lib, handle, torch_device, x_shape, x_stride=None, dtype=torch.float16)
             descriptor,
             workspace.data_ptr() if workspace is not None else None,
             workspace_size.value,
+            y_tensor.data,
             x_tensor.data,
             None,
         )
     )

     lib_causal_softmax()

+    if sync is not None:
+        sync()
+
     atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
     if DEBUG:
-        debug(x, ans, atol=atol, rtol=rtol)
-    assert torch.allclose(x, ans, atol=atol, rtol=rtol)
+        debug(y, ans, atol=atol, rtol=rtol)
+    assert torch.allclose(y, ans, atol=atol, rtol=rtol)

     # Profiling workflow
     if PROFILE:
...
...
test/infiniop/clip.py
0 → 100644
#!/usr/bin/env python3

import torch
import ctypes
from ctypes import POINTER, Structure, c_int32, c_size_t, c_uint64, c_void_p, c_float
from libinfiniop import (
    infiniopHandle_t,
    infiniopTensorDescriptor_t,
    open_lib,
    to_tensor,
    get_test_devices,
    check_error,
    rearrange_if_needed,
    create_workspace,
    test_operator,
    get_args,
    debug,
    get_tolerance,
    profile_operation,
)
from enum import Enum, auto

# ==============================================================================
#  Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES_ = [
    # shape, x_stride, y_stride, min_val, max_val
    # Basic shape tests
    ((10,), None, None, -1.0, 1.0),
    ((5, 10), None, None, -1.0, 1.0),
    ((2, 3, 4), None, None, -1.0, 1.0),
    # Different min_val / max_val combinations
    ((10,), None, None, 0.0, 2.0),
    ((5, 10), None, None, 0.0, 2.0),
    ((2, 3, 4), None, None, 0.0, 2.0),
    ((10,), None, None, -2.0, 0.0),
    ((5, 10), None, None, -2.0, 0.0),
    ((2, 3, 4), None, None, -2.0, 0.0),
    # Unusual shape tests
    ((7, 13), None, None, -1.0, 1.0),    # prime dimensions
    ((3, 5, 7), None, None, -1.0, 1.0),  # 3-D primes
    # Non-standard shape tests
    ((1, 1), None, None, -1.0, 1.0),        # minimal shape
    ((100, 100), None, None, -1.0, 1.0),    # large shape
    ((16, 16, 16), None, None, -1.0, 1.0),  # large 3-D
    # Extreme value tests
    ((10,), None, None, -1000.0, 1000.0),  # large range
    ((10,), None, None, -0.001, 0.001),    # small range
    ((10,), None, None, 0.0, 0.0),         # min=max
    # Special shape tests
    ((0,), None, None, -1.0, 1.0),   # empty tensor
    ((1, 0), None, None, -1.0, 1.0), # empty dimension
]

_TENSOR_DTYPES = [torch.float16, torch.float32]

_TOLERANCE_MAP = {
    torch.float16: {"atol": 1e-3, "rtol": 1e-3},
    torch.float32: {"atol": 1e-7, "rtol": 1e-6},
}


class Inplace(Enum):
    OUT_OF_PLACE = auto()
    INPLACE_X = auto()


_INPLACE = [
    Inplace.INPLACE_X,
    Inplace.OUT_OF_PLACE,
]

_TEST_CASES = [
    test_case + (inplace_item,)
    for test_case in _TEST_CASES_
    for inplace_item in _INPLACE
]

DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000


class ClipDescriptor(Structure):
    _fields_ = [("device_type", c_int32), ("device_id", c_int32)]


infiniopClipDescriptor_t = POINTER(ClipDescriptor)


def clip(x, min_val, max_val):
    return torch.clamp(x, min_val, max_val)


def create_tensor_with_stride(shape, stride, dtype, device):
    """Create a tensor with a specific stride without using view() calls that might fail."""
    x = torch.rand(shape, dtype=dtype, device=device) * 4.0 - 2.0  # Range: [-2, 2]
    if stride is None:
        return x
    if len(shape) == 2 and len(stride) == 2:
        if stride == (shape[1], 1):
            return x.contiguous()
        elif stride == (1, shape[0]):
            return x.transpose(0, 1).contiguous().transpose(0, 1)
        else:
            y = torch.zeros(shape, dtype=dtype, device=device)
            for i in range(shape[0]):
                for j in range(shape[1]):
                    y[i, j] = x[i, j]
            return y.contiguous()
    return x


def test(
    lib,
    handle,
    torch_device,
    shape,
    x_stride=None,
    y_stride=None,
    min_val=-1.0,
    max_val=1.0,
    inplace=Inplace.OUT_OF_PLACE,
    dtype=torch.float32,
    sync=None,  # added: test_operator passes a sync callable as the last argument
):
    print(
        f"Testing Clip on {torch_device} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} "
        f"min_val:{min_val} max_val:{max_val} dtype:{dtype} inplace:{inplace}"
    )

    x = create_tensor_with_stride(shape, x_stride, dtype, torch_device)
    ans = clip(x, min_val, max_val)

    x = rearrange_if_needed(x, x_stride)
    x_tensor = to_tensor(x, lib)

    if inplace == Inplace.INPLACE_X:
        y = x
        y_tensor = x_tensor
    else:
        y = torch.zeros(shape, dtype=dtype).to(torch_device)
        y = rearrange_if_needed(y, y_stride)
        y_tensor = to_tensor(y, lib)

    descriptor = infiniopClipDescriptor_t()
    check_error(
        lib.infiniopCreateClipDescriptor(
            handle, ctypes.byref(descriptor), y_tensor.descriptor, x_tensor.descriptor
        )
    )

    workspace_size = c_uint64(0)
    check_error(
        lib.infiniopGetClipWorkspaceSize(descriptor, ctypes.byref(workspace_size))
    )
    workspace = create_workspace(workspace_size.value, x.device)

    def lib_clip():
        check_error(
            lib.infiniopClip(
                descriptor,
                workspace.data_ptr() if workspace is not None else None,
                workspace_size.value,
                y_tensor.data,
                x_tensor.data,
                c_float(min_val),
                c_float(max_val),
                None,
            )
        )

    lib_clip()

    # Now we can destroy the tensor descriptors
    x_tensor.destroyDesc(lib)
    if inplace != Inplace.INPLACE_X:
        y_tensor.destroyDesc(lib)

    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
    if DEBUG or not torch.allclose(y, ans, atol=atol, rtol=rtol):
        print("\nExpected:")
        print(ans)
        print("\nActual:")
        print(y)
        print("\nDifference:")
        print(torch.abs(y - ans))
        print("\nMax difference:", torch.max(torch.abs(y - ans)).item())
        debug(y, ans, atol=atol, rtol=rtol)
    assert torch.allclose(y, ans, atol=atol, rtol=rtol)

    # Profiling workflow
    if PROFILE:
        # fmt: off
        profile_operation("PyTorch", lambda: clip(x, min_val, max_val), torch_device, NUM_PRERUN, NUM_ITERATIONS)
        profile_operation("    lib", lambda: lib_clip(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
        # fmt: on
    check_error(lib.infiniopDestroyClipDescriptor(descriptor))


if __name__ == "__main__":
    args = get_args()
    lib = open_lib()

    lib.infiniopCreateClipDescriptor.restype = c_int32
    lib.infiniopCreateClipDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopClipDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
    ]
    lib.infiniopGetClipWorkspaceSize.restype = c_int32
    lib.infiniopGetClipWorkspaceSize.argtypes = [
        infiniopClipDescriptor_t,
        POINTER(c_uint64),
    ]
    lib.infiniopClip.restype = c_int32
    lib.infiniopClip.argtypes = [
        infiniopClipDescriptor_t,
        c_void_p,
        c_uint64,
        c_void_p,
        c_void_p,
        c_float,
        c_float,
        c_void_p,
    ]
    lib.infiniopDestroyClipDescriptor.restype = c_int32
    lib.infiniopDestroyClipDescriptor.argtypes = [
        infiniopClipDescriptor_t,
    ]

    # Configure testing options
    DEBUG = args.debug
    PROFILE = args.profile
    NUM_PRERUN = args.num_prerun
    NUM_ITERATIONS = args.num_iterations

    for device in get_test_devices(args):
        test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)

    print("\033[92mTest passed!\033[0m")
test/infiniop/conv.py
...
...
@@ -95,6 +95,7 @@ def test(
     dilations,
     tensor_stride=None,
     tensor_dtype=torch.float16,
+    sync=None,
 ):
     assert len(pads) == len(strides) == len(dilations)
     print(
...
...
@@ -118,8 +119,11 @@ def test(
     x_tensor = to_tensor(x, lib)
     w_tensor = to_tensor(w, lib)
     y_tensor = to_tensor(y, lib)
-    descriptor = infiniopConvDescriptor_t()

+    if sync is not None:
+        sync()
+
+    descriptor = infiniopConvDescriptor_t()
     check_error(
         lib.infiniopCreateConvDescriptor(
             handle,
...
...
test/infiniop/expand.py
...
...
@@ -52,6 +52,7 @@ def test(
     y_stride=None,
     x_stride=None,
     tensor_dtype=torch.float16,
+    sync=None,
 ):
     print(
         f"Testing Expand on {torch_device} with x_shape:{x_shape} y_shape:{y_shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{tensor_dtype}"
...
...
@@ -76,8 +77,11 @@ def test(
     x_tensor = to_tensor(x, lib)
     y_tensor = to_tensor(y, lib)
-    descriptor = infiniopExpandDescriptor_t()

+    if sync is not None:
+        sync()
+
+    descriptor = infiniopExpandDescriptor_t()
     check_error(
         lib.infiniopCreateExpandDescriptor(
             handle,
...
...
test/infiniop/gemm.py
...
...
@@ -83,6 +83,7 @@ def test(
     b_stride=None,
     c_stride=None,
     dtype=torch.float16,
+    sync=None,
 ):
     print(
         f"Testing Gemm on {torch_device} with alpha:{alpha}, beta:{beta},"
...
...
@@ -104,6 +105,9 @@ def test(
     ]
     a_tensor, b_tensor, c_tensor = [to_tensor(tensor, lib) for tensor in [a, b, c]]

+    if sync is not None:
+        sync()
+
     descriptor = infiniopGemmDescriptor_t()
     check_error(
         lib.infiniopCreateGemmDescriptor(
...
...
test/infiniop/global_avg_pool.py
...
...
@@ -51,6 +51,7 @@ def test(
     torch_device,
     x_shape,
     tensor_dtype=torch.float16,
+    sync=None,
 ):
     print(
         f"Testing GlobalAvgPool on {torch_device} with input tensor_shape:{x_shape} dtype:{tensor_dtype}"
...
...
@@ -70,8 +71,11 @@ def test(
     x_tensor = to_tensor(x, lib)
     y_tensor = to_tensor(y, lib)
-    descriptor = infiniopGlobalAvgPoolDescriptor_t()

+    if sync is not None:
+        sync()
+
+    descriptor = infiniopGlobalAvgPoolDescriptor_t()
     check_error(
         lib.infiniopCreateGlobalAvgPoolDescriptor(
             handle,
...
...
test/infiniop/libinfiniop/utils.py
...
...
@@ -10,7 +10,7 @@ def check_error(status):
         raise Exception("Error code " + str(status))


-def to_tensor(tensor, lib):
+def to_tensor(tensor, lib, force_unsigned=False):
     """
     Convert a PyTorch tensor to a library Tensor(descriptor, data).
     """
...
...
@@ -37,6 +37,16 @@ def to_tensor(tensor, lib):
         InfiniDtype.U64 if tensor.dtype == torch.uint64 else
         None
     )
+    if force_unsigned:
+        dt = (
+            InfiniDtype.U8 if dt == InfiniDtype.I8 else
+            InfiniDtype.U16 if dt == InfiniDtype.I16 else
+            InfiniDtype.U32 if dt == InfiniDtype.I32 else
+            InfiniDtype.U64 if dt == InfiniDtype.I64 else
+            dt
+        )
     # fmt: on
     assert dt is not None
     # Create TensorDescriptor
...
...
@@ -413,6 +423,7 @@ def test_operator(lib, device, test_func, test_cases, tensor_dtypes):
                 infiniDeviceEnum_str_map[device],
                 *test_case,
                 tensor_dtype,
+                get_sync_func(device),
             )
     finally:
         destroy_handle(lib, handle)
...
...
@@ -461,3 +472,15 @@ def get_test_devices(args):
         devices_to_test = [InfiniDeviceEnum.CPU]

     return devices_to_test
+
+
+def get_sync_func(device):
+    import torch
+
+    device_str = infiniDeviceEnum_str_map[device]
+    if device == InfiniDeviceEnum.CPU:
+        sync = None
+    else:
+        sync = getattr(torch, device_str).synchronize
+    return sync
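`get_sync_func` is what threads the new `sync=None` parameter through the per-operator tests above: CPU gets None (a no-op), and accelerators get the matching `torch.<device>.synchronize` (e.g. `torch.cuda.synchronize`), so descriptor creation and result checks only run after pending device work has completed. The call-site pattern used throughout this commit (a sketch):

    sync = get_sync_func(device)  # None on CPU
    # ... build the input/output tensors ...
    if sync is not None:
        sync()  # drain outstanding kernels before creating the descriptor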
test/infiniop/max_pool.py
...
...
@@ -83,6 +83,7 @@ def test(
     padding,
     strides,
     tensor_dtype=torch.float16,
+    sync=None,
 ):
     print(
         f"Testing MaxPool on {torch_device} with x_shape:{x_shape} kernel_shape:{k_shape} padding:{padding} strides:{strides} dtype:{tensor_dtype}"
...
...
@@ -104,8 +105,11 @@ def test(
     x_tensor = to_tensor(x, lib)
     y_tensor = to_tensor(y, lib)
-    descriptor = infiniopMaxPoolDescriptor_t()

+    if sync is not None:
+        sync()
+
+    descriptor = infiniopMaxPoolDescriptor_t()
     check_error(
         lib.infiniopCreateMaxPoolDescriptor(
             handle,
...
...