jerrrrry / infinicore · Commits · 4e4d3415

Unverified commit 4e4d3415, authored Apr 29, 2025 by Catheriany; committed by GitHub, Apr 29, 2025.

    Merge branch 'main' into issue/150

Parents: d1c46889, 1a4cfb99
Changes: 51

Showing 11 changed files with 823 additions and 141 deletions (+823 −141).
src/infinirt/infinirt.cc                            +30   −30
src/utils/rearrange.cc                              +69   −0
src/utils/rearrange.h                               +3    −0
test/infiniop-test/test_generate/testcases/mul.py   +159  −0
test/infiniop/add.py                                +155  −95
test/infiniop/mul.py                                +246  −0
test/infiniop/rearrange.py                          +83   −14
test/infiniop/swiglu.py                             +2    −1
xmake.lua                                           +28   −1
xmake/cuda.lua                                      +31   −0
xmake/test.lua                                      +17   −0
src/infinirt/infinirt.cc

@@ -44,46 +44,46 @@ __C infiniStatus_t infinirtGetDevice(infiniDevice_t *device_ptr, int *device_id_
     return INFINI_STATUS_SUCCESS;
 }
 
-#define INFINIRT_CALL_DEVICE_API_AND(API, PARAMS, ACTION)               \
+#define INFINIRT_CALL_DEVICE_API_AND(DEVICE_TYPE, API, PARAMS, ACTION)  \
     {                                                                   \
         infiniStatus_t _status;                                         \
-        switch (CURRENT_DEVICE_TYPE) {                                  \
+        switch (DEVICE_TYPE) {                                          \
         case INFINI_DEVICE_CPU:                                         \
             _status = infinirt::cpu::API PARAMS;                        \
             break;                                                      \
         case INFINI_DEVICE_NVIDIA:                                      \
             _status = infinirt::cuda::API PARAMS;                       \
             break;                                                      \
         case INFINI_DEVICE_CAMBRICON:                                   \
             _status = infinirt::bang::API PARAMS;                       \
             break;                                                      \
         case INFINI_DEVICE_ASCEND:                                      \
             _status = infinirt::ascend::API PARAMS;                     \
             break;                                                      \
         case INFINI_DEVICE_METAX:                                       \
             _status = infinirt::maca::API PARAMS;                       \
             break;                                                      \
         case INFINI_DEVICE_MOORE:                                       \
             _status = infinirt::musa::API PARAMS;                       \
             break;                                                      \
         default:                                                        \
             return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;             \
         }                                                               \
         { ACTION; }                                                     \
         return _status;                                                 \
     }
 
-#define INFINIRT_CALL_DEVICE_API(API, PARAMS) INFINIRT_CALL_DEVICE_API_AND(API, PARAMS, )
+#define INFINIRT_CALL_DEVICE_API(API, PARAMS) INFINIRT_CALL_DEVICE_API_AND(CURRENT_DEVICE_TYPE, API, PARAMS, )
 
 __C infiniStatus_t infinirtGetDeviceCount(infiniDevice_t device, int *count) {
     if (count == nullptr) {
         return INFINI_STATUS_NULL_POINTER;
     }
-    INFINIRT_CALL_DEVICE_API(getDeviceCount, (count));
+    INFINIRT_CALL_DEVICE_API_AND(device, getDeviceCount, (count), {});
 }
 
 __C infiniStatus_t infinirtSetDevice(infiniDevice_t device, int device_id) {
-    INFINIRT_CALL_DEVICE_API_AND(setDevice, (device_id), {
+    INFINIRT_CALL_DEVICE_API_AND(device, setDevice, (device_id), {
         CURRENT_DEVICE_TYPE = device;
         CURRENT_DEVICE_ID = device_id;
     });
 }
src/utils/rearrange.cc

@@ -138,4 +138,73 @@ void rearrange(
         }
     }
 }
+
+utils::Result<RearrangeMeta> RearrangeMeta::distributeUnit(const std::vector<size_t> &candidates) const {
+    // Get the current unit size
+    size_t current_unit = _meta[0];
+
+    // Look for a unit value that satisfies the condition: the current unit is divisible by it
+    size_t new_unit = 0;
+    for (size_t candidate : candidates) {
+        if (current_unit % candidate == 0) {
+            new_unit = candidate;
+            break;
+        }
+    }
+
+    // If no suitable value was found, return an error
+    if (new_unit == 0) {
+        return INFINI_STATUS_BAD_PARAM;
+    }
+
+    // If the value found is the current unit itself, return a copy of this meta
+    if (new_unit == current_unit) {
+        return Result<RearrangeMeta>(_meta);
+    }
+
+    // Get the current number of dimensions
+    size_t ndim_value = this->ndim();
+
+    // Create the new layout array
+    std::vector<ptrdiff_t> layout(2 + (ndim_value + 1) * 3, 0);
+
+    // Set the new unit value
+    layout[0] = new_unit;
+
+    // Compute the expansion factor
+    ptrdiff_t extra = current_unit / new_unit;
+
+    // Offset of the index strides within the meta
+    ptrdiff_t idx_offset = 1;
+
+    // Set up the corresponding pointers into the new layout
+    ptrdiff_t *new_idx = layout.data() + 1;
+    ptrdiff_t *new_dst = layout.data() + 2 + (ndim_value + 1);
+    ptrdiff_t *new_src = layout.data() + 2 + (ndim_value + 1) * 2;
+
+    // Copy and rescale the index strides.
+    // The index strides must be recomputed: first copy the original ones
+    for (size_t i = 0; i < ndim_value + 1; ++i) {
+        new_idx[i] = _meta[idx_offset + i] * extra;
+    }
+    // Set the stride of the last dimension to 1
+    new_idx[ndim_value + 1] = 1;
+
+    // Copy the destination strides and append the new unit size
+    for (size_t i = 0; i < ndim_value; ++i) {
+        new_dst[i] = dst_strides()[i];
+    }
+    new_dst[ndim_value] = new_unit;
+
+    // Copy the source strides and append the new unit size
+    for (size_t i = 0; i < ndim_value; ++i) {
+        new_src[i] = src_strides()[i];
+    }
+    new_src[ndim_value] = new_unit;
+
+    return Result<RearrangeMeta>(layout);
+}
+
 } // namespace utils
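
The candidate scan at the top of distributeUnit is the heart of the change: walk the candidate list in order and take the first size that evenly divides the current unit, then rescale the index strides by the ratio. A minimal Python sketch of that selection rule (illustration only, not part of the commit):

def pick_unit(current_unit, candidates):
    # Mirrors the candidate scan in RearrangeMeta::distributeUnit: take the
    # first candidate that evenly divides the current unit.
    for candidate in candidates:
        if current_unit % candidate == 0:
            return candidate
    return None  # the C++ version returns INFINI_STATUS_BAD_PARAM here

# e.g. a 32-element unit with candidates (16, 8, 4) splits into 16-element
# units, and the index strides get rescaled by extra = 32 // 16 = 2.
assert pick_unit(32, (16, 8, 4)) == 16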
src/utils/rearrange.h

@@ -28,6 +28,9 @@ public:
     const ptrdiff_t *src_strides() const;
     void launch(void *dst, const void *src) const;
 
+    // Split the unit into smaller sizes to facilitate parallelism
+    utils::Result<RearrangeMeta> distributeUnit(const std::vector<size_t> &candidates) const;
+
 };
 
 void rearrange(
test/infiniop-test/test_generate/testcases/mul.py (new file, 0 → 100644)

import numpy as np
import gguf
from typing import List

from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides


def mul(a: np.ndarray, b: np.ndarray):
    return np.multiply(a, b)


def random_tensor(shape, dtype):
    rate = 1e-3
    var = 0.5 * rate
    # values lie in [-5e-4, 5e-4]
    return rate * np.random.rand(*shape).astype(dtype) - var


class MulTestCase(InfiniopTestCase):
    def __init__(
        self,
        a: np.ndarray,
        stride_a: List[int] | None,
        b: np.ndarray,
        stride_b: List[int] | None,
        c: np.ndarray,
        stride_c: List[int] | None,
    ):
        super().__init__("mul")
        self.a = a
        self.stride_a = stride_a
        self.b = b
        self.stride_b = stride_b
        self.c = c
        self.stride_c = stride_c

    def write_test(self, test_writer: "InfiniopTestWriter"):
        super().write_test(test_writer)
        if self.stride_a is not None:
            test_writer.add_array(test_writer.gguf_key("a.strides"), self.stride_a)
        if self.stride_b is not None:
            test_writer.add_array(test_writer.gguf_key("b.strides"), self.stride_b)
        if self.stride_c is not None:
            test_writer.add_array(test_writer.gguf_key("c.strides"), self.stride_c)
        test_writer.add_tensor(
            test_writer.gguf_key("a"), self.a, raw_dtype=np_dtype_to_ggml(self.a.dtype)
        )
        test_writer.add_tensor(
            test_writer.gguf_key("b"), self.b, raw_dtype=np_dtype_to_ggml(self.b.dtype)
        )
        test_writer.add_tensor(
            test_writer.gguf_key("c"), self.c, raw_dtype=np_dtype_to_ggml(self.c.dtype)
        )
        a_fp64 = self.a.astype(np.float64)
        b_fp64 = self.b.astype(np.float64)
        ans_fp64 = np.multiply(a_fp64, b_fp64)
        ans = mul(self.a, self.b)
        test_writer.add_tensor(
            test_writer.gguf_key("ans"), ans, raw_dtype=np_dtype_to_ggml(ans.dtype)
        )
        test_writer.add_tensor(
            test_writer.gguf_key("ans_fp64"),
            ans_fp64,
            raw_dtype=np_dtype_to_ggml(ans_fp64.dtype),
        )


if __name__ == '__main__':
    test_writer = InfiniopTestWriter("mul.gguf")
    test_cases = [
        MulTestCase(
            random_tensor((2, 3), np.float32), gguf_strides(3, 1),
            random_tensor((2, 3), np.float32), gguf_strides(1, 2),
            random_tensor((2, 3), np.float32), gguf_strides(3, 1),
        ),
        MulTestCase(
            random_tensor((2, 3), np.float16), gguf_strides(1, 2),
            random_tensor((2, 3), np.float16), gguf_strides(3, 1),
            random_tensor((2, 3), np.float16), gguf_strides(1, 2),
        ),
        MulTestCase(
            random_tensor((2, 3), np.float64), gguf_strides(3, 1),
            random_tensor((2, 3), np.float64), gguf_strides(3, 1),
            random_tensor((2, 3), np.float64), gguf_strides(1, 2),
        ),
        MulTestCase(
            random_tensor((4, 6), np.float16), gguf_strides(1, 4),
            random_tensor((4, 6), np.float16), gguf_strides(1, 5),
            random_tensor((4, 6), np.float16), gguf_strides(6, 1),
        ),
        MulTestCase(
            random_tensor((1, 2048), np.float16), gguf_strides(1, 1),
            random_tensor((1, 2048), np.float16), gguf_strides(2048, 1),
            random_tensor((1, 2048), np.float16), gguf_strides(1, 1),
        ),
        MulTestCase(
            random_tensor((2048, 2048), np.float32), None,
            random_tensor((2048, 2048), np.float32), gguf_strides(1, 2048),
            random_tensor((2048, 2048), np.float32), None,
        ),
        MulTestCase(
            random_tensor((2, 4, 2048), np.float16), gguf_strides(4 * 2048, 2048, 1),
            random_tensor((2, 4, 2048), np.float16), gguf_strides(1, 2, 2 * 4),
            random_tensor((2, 4, 2048), np.float16), gguf_strides(4 * 2048, 2048, 1),
        ),
        MulTestCase(
            random_tensor((2, 4, 2048), np.float32), gguf_strides(1, 2, 2 * 4),
            random_tensor((2, 4, 2048), np.float32), None,
            random_tensor((2, 4, 2048), np.float32), gguf_strides(1, 2, 2 * 4),
        ),
        MulTestCase(
            random_tensor((2048, 2560), np.float32), gguf_strides(2560, 1),
            random_tensor((2048, 2560), np.float32), gguf_strides(1, 2048),
            random_tensor((2048, 2560), np.float32), gguf_strides(2560, 1),
        ),
        MulTestCase(
            random_tensor((4, 48, 64), np.float16), gguf_strides(64 * 48, 64, 1),
            random_tensor((4, 48, 64), np.float16), gguf_strides(1, 4, 4 * 48),
            random_tensor((4, 48, 64), np.float16), None
        ),
        MulTestCase(
            random_tensor((4, 48, 64), np.float32), None,
            random_tensor((4, 48, 64), np.float32), gguf_strides(1, 4, 4 * 48),
            random_tensor((4, 48, 64), np.float32), gguf_strides(48 * 64, 64, 1),
        )
    ]
    test_writer.add_tests(test_cases)
    test_writer.save()
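
The gguf_strides(...) arguments above are element strides, so cases like gguf_strides(1, 2) for a (2, 3) tensor describe a column-major (transposed) layout. A small numpy sketch of that mapping, for illustration only (not part of the test file):

import numpy as np

# Element (i, j) of a (2, 3) view with element strides (1, 2) lives at flat
# offset i*1 + j*2; as_strided takes byte strides, hence the *itemsize.
base = np.arange(6, dtype=np.float32)  # flat buffer: [0, 1, 2, 3, 4, 5]
view = np.lib.stride_tricks.as_strided(
    base, shape=(2, 3), strides=(1 * base.itemsize, 2 * base.itemsize)
)
print(view)
# [[0. 2. 4.]
#  [1. 3. 5.]]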
test/infiniop/add.py

-from ctypes import POINTER, Structure, c_int32, c_void_p
-import ctypes
-import sys
-import os
-
-sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
-from operatorspy import (
-    open_lib,
-    to_tensor,
-    DeviceEnum,
-    infiniopHandle_t,
-    infiniopTensorDescriptor_t,
-    create_handle,
-    destroy_handle,
-    check_error,
-)
-from operatorspy.tests.test_utils import get_args
-from enum import Enum, auto
-import torch
+import torch
+import ctypes
+from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
+from libinfiniop import (
+    infiniopHandle_t,
+    infiniopTensorDescriptor_t,
+    open_lib,
+    to_tensor,
+    get_test_devices,
+    check_error,
+    rearrange_if_needed,
+    test_operator,
+    get_args,
+    debug,
+    get_tolerance,
+    profile_operation,
+    create_workspace,
+)
+from enum import Enum, auto
+
+# ==============================================================================
+#  Configuration (Internal Use Only)
+# ==============================================================================
+# These are not meant to be imported from other modules
+_TEST_CASES_ = [
+    # shape, a_stride, b_stride, c_stride
+    ((13, 4), None, None, None),
+    ((13, 4), (10, 1), (10, 1), (10, 1)),
+    ((13, 4), (0, 1), None, None),
+    ((13, 4, 4), None, None, None),
+    ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)),
+    ((13, 4, 4), (4, 0, 1), (0, 4, 1), None),
+    ((16, 5632), None, None, None),
+    ((16, 5632), (13312, 1), (13312, 1), (13312, 1)),
+    ((4, 4, 5632), None, None, None),
+    ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
+]
 
 class Inplace(Enum):
@@ -26,6 +43,35 @@ class Inplace(Enum):
     INPLACE_B = auto()
 
+# Inplace options applied for each test case in _TEST_CASES_
+_INPLACE = [
+    Inplace.OUT_OF_PLACE,
+    Inplace.INPLACE_A,
+    Inplace.INPLACE_B,
+]
+
+# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
+_TEST_CASES = [
+    test_case + (inplace_item,)
+    for test_case in _TEST_CASES_
+    for inplace_item in _INPLACE
+]
+
+# Data types used for testing
+_TENSOR_DTYPES = [torch.float16, torch.float32]
+
+# Tolerance map for different data types
+_TOLERANCE_MAP = {
+    torch.float16: {"atol": 1e-3, "rtol": 1e-3},
+    torch.float32: {"atol": 1e-7, "rtol": 1e-7},
+}
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
 
 class AddDescriptor(Structure):
     _fields_ = [("device", c_int32)]
@@ -37,42 +83,71 @@ def add(x, y):
     return torch.add(x, y)
 
+def process_tensors(c, c_strides, a, a_stride, b, b_stride, inplace):
+    """
+    Rearrange the tensors if needed and apply the inplace config.
+    If inplace is requested and the output (i.e., c) would be placed onto a
+    broadcasted input, the inplace config is ignored and out-of-place is used.
+    """
+    original_c_strides = c_strides if c_strides else c.stride()
+
+    def _rearrange(tensor, strides):
+        if strides and 0 in strides:
+            tensor.set_(tensor.untyped_storage(), 0, tensor.shape, strides)
+            return tensor
+        else:
+            return rearrange_if_needed(tensor, strides)
+
+    a, b, c = [
+        _rearrange(tensor, stride)
+        for tensor, stride in zip([a, b, c], [a_stride, b_stride, c_strides])
+    ]
+    c = (
+        c
+        if inplace == Inplace.OUT_OF_PLACE
+        else (a if inplace == Inplace.INPLACE_A else b)
+    )
+    # if inplace is requested and c has a broadcasted config, reset it to the original unbroadcasted strides
+    if 0 in c.stride():
+        c.set_(c.untyped_storage(), 0, c.shape, original_c_strides)
+    return a, b, c
+
 def test(
     lib,
     handle,
     torch_device,
-    c_shape,
-    a_shape,
-    b_shape,
-    tensor_dtype=torch.float16,
+    shape,
+    a_stride=None,
+    b_stride=None,
+    c_stride=None,
     inplace=Inplace.OUT_OF_PLACE,
+    dtype=torch.float16,
+    sync=None,
 ):
     print(
-        f"Testing Add on {torch_device} with c_shape:{c_shape} a_shape:{a_shape} b_shape:{b_shape} dtype:{tensor_dtype} inplace:{inplace.name}"
+        f"Testing Add on {torch_device} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} "
+        f"dtype:{dtype} inplace:{inplace}"
     )
-    if a_shape != b_shape and inplace != Inplace.OUT_OF_PLACE:
-        print("Unsupported test: broadcasting does not support in-place")
-        return
-    a = torch.rand(a_shape, dtype=tensor_dtype).to(torch_device)
-    b = torch.rand(b_shape, dtype=tensor_dtype).to(torch_device)
-    c = (
-        torch.rand(c_shape, dtype=tensor_dtype).to(torch_device)
-        if inplace == Inplace.OUT_OF_PLACE
-        else (a if inplace == Inplace.INPLACE_A else b)
-    )
+    a = torch.rand(shape, dtype=dtype).to(torch_device)
+    b = torch.rand(shape, dtype=dtype).to(torch_device)
+    c = torch.rand(shape, dtype=dtype).to(torch_device)
+    a, b, c = process_tensors(c, c_stride, a, a_stride, b, b_stride, inplace)
 
     ans = add(a, b)
 
-    a_tensor = to_tensor(a, lib)
-    b_tensor = to_tensor(b, lib)
+    a_tensor, b_tensor = [to_tensor(tensor, lib) for tensor in [a, b]]
     c_tensor = (
         to_tensor(c, lib)
         if inplace == Inplace.OUT_OF_PLACE
         else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor)
     )
+    if sync is not None:
+        sync()
+
     descriptor = infiniopAddDescriptor_t()
     check_error(
         lib.infiniopCreateAddDescriptor(
             handle,
@@ -84,74 +159,48 @@ def test(
     )
 
     # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
-    c_tensor.descriptor.contents.invalidate()
-    a_tensor.descriptor.contents.invalidate()
-    b_tensor.descriptor.contents.invalidate()
+    for tensor in [a_tensor, b_tensor, c_tensor]:
+        tensor.destroyDesc(lib)
 
-    check_error(
-        lib.infiniopAdd(descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None)
-    )
-    assert torch.allclose(c, ans, atol=0, rtol=1e-3)
-    check_error(lib.infiniopDestroyAddDescriptor(descriptor))
-
-def test_cpu(lib, test_cases):
-    device = DeviceEnum.DEVICE_CPU
-    handle = create_handle(lib, device)
-    for c_shape, a_shape, b_shape, inplace in test_cases:
-        # fmt: off
-        test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
-        test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
-        # fmt: on
-    destroy_handle(lib, handle)
-
-def test_cuda(lib, test_cases):
-    device = DeviceEnum.DEVICE_CUDA
-    handle = create_handle(lib, device)
-    for c_shape, a_shape, b_shape, inplace in test_cases:
-        # fmt: off
-        test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
-        test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
-        # fmt: on
-    destroy_handle(lib, handle)
-
-def test_bang(lib, test_cases):
-    import torch_mlu
-    device = DeviceEnum.DEVICE_BANG
-    handle = create_handle(lib, device)
-    for c_shape, a_shape, b_shape, inplace in test_cases:
-        # fmt: off
-        test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
-        test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
-        # fmt: on
-    destroy_handle(lib, handle)
+    workspace_size = c_uint64(0)
+    check_error(
+        lib.infiniopGetAddWorkspaceSize(descriptor, ctypes.byref(workspace_size))
+    )
+    workspace = create_workspace(workspace_size.value, c.device)
+
+    def lib_add():
+        check_error(
+            lib.infiniopAdd(
+                descriptor,
+                workspace.data_ptr() if workspace is not None else None,
+                workspace_size.value,
+                c_tensor.data,
+                a_tensor.data,
+                b_tensor.data,
+                None,
+            )
+        )
+
+    lib_add()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        debug(c, ans, atol=atol, rtol=rtol)
+    assert torch.allclose(c, ans, atol=atol, rtol=rtol)
+
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
+        profile_operation("PyTorch", lambda: add(a, b), torch_device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_add(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
+    check_error(lib.infiniopDestroyAddDescriptor(descriptor))
 
 if __name__ == "__main__":
-    test_cases = [
-        # fmt: off
-        # c_shape, a_shape, b_shape, inplace
-        # ((32, 150, 512000), (32, 150, 512000), (32, 150, 512000), Inplace.OUT_OF_PLACE),
-        # ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE),
-        # ((32, 150, 51200), (32, 150, 51200), (32, 150, 51200), Inplace.OUT_OF_PLACE),
-        ((1, 3), (1, 3), (1, 3), Inplace.OUT_OF_PLACE),
-        ((), (), (), Inplace.OUT_OF_PLACE),
-        ((3, 3), (3, 3), (3, 3), Inplace.OUT_OF_PLACE),
-        ((2, 20, 3), (2, 1, 3), (2, 20, 3), Inplace.OUT_OF_PLACE),
-        ((32, 20, 512), (32, 20, 512), (32, 20, 512), Inplace.INPLACE_A),
-        ((32, 20, 512), (32, 20, 512), (32, 20, 512), Inplace.INPLACE_B),
-        ((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE),
-        ((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE),
-        ((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE),
-        ((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE),
-        ((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE),
-        # fmt: on
-    ]
     args = get_args()
     lib = open_lib()
+
     lib.infiniopCreateAddDescriptor.restype = c_int32
     lib.infiniopCreateAddDescriptor.argtypes = [
         infiniopHandle_t,
@@ -160,25 +209,36 @@ if __name__ == "__main__":
         infiniopTensorDescriptor_t,
         infiniopTensorDescriptor_t,
     ]
+    lib.infiniopGetAddWorkspaceSize.restype = c_int32
+    lib.infiniopGetAddWorkspaceSize.argtypes = [
+        infiniopAddDescriptor_t,
+        POINTER(c_uint64),
+    ]
     lib.infiniopAdd.restype = c_int32
     lib.infiniopAdd.argtypes = [
         infiniopAddDescriptor_t,
         c_void_p,
+        c_uint64,
+        c_void_p,
         c_void_p,
         c_void_p,
         c_void_p,
     ]
     lib.infiniopDestroyAddDescriptor.restype = c_int32
     lib.infiniopDestroyAddDescriptor.argtypes = [
         infiniopAddDescriptor_t,
     ]
 
-    if args.cpu:
-        test_cpu(lib, test_cases)
-    if args.cuda:
-        test_cuda(lib, test_cases)
-    if args.bang:
-        test_bang(lib, test_cases)
-    if not (args.cpu or args.cuda or args.bang):
-        test_cpu(lib, test_cases)
+    # Configure testing options
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    for device in get_test_devices(args):
+        test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
+
     print("\033[92mTest passed!\033[0m")
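
A note on the stride-0 cases that process_tensors handles specially: a stride of 0 makes every index along that dimension alias the same storage, which is how entries like ((13, 4), (0, 1), None, None) express a broadcast input. A quick torch sketch, for illustration only (not part of the commit):

import torch

t = torch.arange(4.0)  # storage: [0, 1, 2, 3]
# Three "rows" that all view the same four values -- no data is copied.
bcast = torch.as_strided(t, (3, 4), (0, 1))
print(bcast)
# tensor([[0., 1., 2., 3.],
#         [0., 1., 2., 3.],
#         [0., 1., 2., 3.]])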
test/infiniop/mul.py (new file, 0 → 100644)

import torch
import ctypes
from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
from libinfiniop import (
    infiniopHandle_t,
    infiniopTensorDescriptor_t,
    open_lib,
    to_tensor,
    get_test_devices,
    check_error,
    rearrange_if_needed,
    test_operator,
    get_args,
    debug,
    get_tolerance,
    profile_operation,
    create_workspace,
)
from enum import Enum, auto

# ==============================================================================
#  Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES_ = [
    # shape, a_stride, b_stride, c_stride
    ((13, 4), None, None, None),
    ((13, 4), (10, 1), (10, 1), (10, 1)),
    ((13, 4), (0, 1), None, None),
    ((13, 4, 4), None, None, None),
    ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)),
    ((13, 4, 4), (4, 0, 1), (0, 4, 1), None),
    ((16, 5632), None, None, None),
    ((16, 5632), (13312, 1), (13312, 1), (13312, 1)),
    ((4, 4, 5632), None, None, None),
    ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
]

class Inplace(Enum):
    OUT_OF_PLACE = auto()
    INPLACE_A = auto()
    INPLACE_B = auto()

# Inplace options applied for each test case in _TEST_CASES_
_INPLACE = [
    Inplace.OUT_OF_PLACE,
    Inplace.INPLACE_A,
    Inplace.INPLACE_B,
]

# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
_TEST_CASES = [
    test_case + (inplace_item,)
    for test_case in _TEST_CASES_
    for inplace_item in _INPLACE
]

# Data types used for testing
_TENSOR_DTYPES = [torch.float16, torch.float32]

# Tolerance map for different data types
_TOLERANCE_MAP = {
    torch.float16: {"atol": 1e-3, "rtol": 1e-3},
    torch.float32: {"atol": 1e-7, "rtol": 1e-7},
}

DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000

class MulDescriptor(Structure):
    _fields_ = [("device", c_int32)]

infiniopMulDescriptor_t = POINTER(MulDescriptor)

def mul(x, y):
    return torch.mul(x, y)

def process_tensors(c, c_strides, a, a_stride, b, b_stride, inplace):
    """
    Rearrange the tensors if needed and apply the inplace config.
    If inplace is requested and the output (i.e., c) would be placed onto a
    broadcasted input, the inplace config is ignored and out-of-place is used.
    """
    original_c_strides = c_strides if c_strides else c.stride()

    def _rearrange(tensor, strides):
        if strides and 0 in strides:
            tensor.set_(tensor.untyped_storage(), 0, tensor.shape, strides)
            return tensor
        else:
            return rearrange_if_needed(tensor, strides)

    a, b, c = [
        _rearrange(tensor, stride)
        for tensor, stride in zip([a, b, c], [a_stride, b_stride, c_strides])
    ]
    c = (
        c
        if inplace == Inplace.OUT_OF_PLACE
        else (a if inplace == Inplace.INPLACE_A else b)
    )
    # if inplace is requested and c has a broadcasted config, reset it to the original unbroadcasted strides
    if 0 in c.stride():
        c.set_(c.untyped_storage(), 0, c.shape, original_c_strides)
    return a, b, c

def test(
    lib,
    handle,
    torch_device,
    shape,
    a_stride=None,
    b_stride=None,
    c_stride=None,
    inplace=Inplace.OUT_OF_PLACE,
    dtype=torch.float16,
    sync=None,
):
    print(
        f"Testing Mul on {torch_device} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} "
        f"dtype:{dtype} inplace:{inplace}"
    )
    a = torch.rand(shape, dtype=dtype).to(torch_device)
    b = torch.rand(shape, dtype=dtype).to(torch_device)
    c = torch.rand(shape, dtype=dtype).to(torch_device)
    a, b, c = process_tensors(c, c_stride, a, a_stride, b, b_stride, inplace)

    ans = mul(a, b)

    a_tensor, b_tensor = [to_tensor(tensor, lib) for tensor in [a, b]]
    c_tensor = (
        to_tensor(c, lib)
        if inplace == Inplace.OUT_OF_PLACE
        else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor)
    )
    if sync is not None:
        sync()

    descriptor = infiniopMulDescriptor_t()
    check_error(
        lib.infiniopCreateMulDescriptor(
            handle,
            ctypes.byref(descriptor),
            c_tensor.descriptor,
            a_tensor.descriptor,
            b_tensor.descriptor,
        )
    )

    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    for tensor in [a_tensor, b_tensor, c_tensor]:
        tensor.destroyDesc(lib)

    workspace_size = c_uint64(0)
    check_error(
        lib.infiniopGetMulWorkspaceSize(descriptor, ctypes.byref(workspace_size))
    )
    workspace = create_workspace(workspace_size.value, c.device)

    def lib_mul():
        check_error(
            lib.infiniopMul(
                descriptor,
                workspace.data_ptr() if workspace is not None else None,
                workspace_size.value,
                c_tensor.data,
                a_tensor.data,
                b_tensor.data,
                None,
            )
        )

    lib_mul()

    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
    if DEBUG:
        debug(c, ans, atol=atol, rtol=rtol)
    assert torch.allclose(c, ans, atol=atol, rtol=rtol)

    # Profiling workflow
    if PROFILE:
        # fmt: off
        profile_operation("PyTorch", lambda: mul(a, b), torch_device, NUM_PRERUN, NUM_ITERATIONS)
        profile_operation("    lib", lambda: lib_mul(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
        # fmt: on
    check_error(lib.infiniopDestroyMulDescriptor(descriptor))

if __name__ == "__main__":
    args = get_args()
    lib = open_lib()

    lib.infiniopCreateMulDescriptor.restype = c_int32
    lib.infiniopCreateMulDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopMulDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
    ]
    lib.infiniopGetMulWorkspaceSize.restype = c_int32
    lib.infiniopGetMulWorkspaceSize.argtypes = [
        infiniopMulDescriptor_t,
        POINTER(c_uint64),
    ]
    lib.infiniopMul.restype = c_int32
    lib.infiniopMul.argtypes = [
        infiniopMulDescriptor_t,
        c_void_p,
        c_uint64,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyMulDescriptor.restype = c_int32
    lib.infiniopDestroyMulDescriptor.argtypes = [
        infiniopMulDescriptor_t,
    ]

    # Configure testing options
    DEBUG = args.debug
    PROFILE = args.profile
    NUM_PRERUN = args.num_prerun
    NUM_ITERATIONS = args.num_iterations

    for device in get_test_devices(args):
        test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)

    print("\033[92mTest passed!\033[0m")
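
Both add.py and mul.py build their final case list the same way: the comprehension over _TEST_CASES_ and _INPLACE is a plain cross product, so the 10 stride configurations times 3 in-place options yield 30 concrete cases per dtype. For example, the first base case expands as follows (illustration only):

# ((13, 4), None, None, None) becomes
#   ((13, 4), None, None, None, Inplace.OUT_OF_PLACE)
#   ((13, 4), None, None, None, Inplace.INPLACE_A)
#   ((13, 4), None, None, None, Inplace.INPLACE_B)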
test/infiniop/rearrange.py

@@ -17,19 +17,88 @@ from libinfiniop import (
     profile_operation,
 )
 
+def row_major_strides(shape):
+    """Generate the row-major (C-style) strides for a tensor.
+    Args:
+        shape: tensor shape
+    Returns:
+        list of row-major strides
+    """
+    # Row-major (C style: from the last dimension to the first)
+    stride = 1
+    strides = [1]
+    for dim in reversed(shape[1:]):
+        stride *= dim
+        strides.insert(0, stride)
+    return strides
+
+def column_major_strides(shape):
+    """Generate the column-major (Fortran-style) strides for a tensor.
+    Args:
+        shape: tensor shape
+    Returns:
+        list of column-major strides
+    """
+    # Column-major (Fortran style: from the first dimension to the last)
+    stride = 1
+    strides = [stride]
+    for dim in shape[:-1]:
+        stride *= dim
+        strides.append(stride)
+    return strides
+
 # ==============================================================================
 #  Configuration (Internal Use Only)
 # ==============================================================================
 # These are not meant to be imported from other modules
 _TEST_CASES = [
-    # ((src_shape, src_stride), (dst_shape, dst_stride))
-    (((2, 4, 32), None), ((2, 4, 32), (256, 64, 1))),
-    (((32, 6, 64), (64, 2560, 1)), ((32, 6, 64), None)),
-    (((4, 6, 64), (64, 2560, 1)), ((4, 6, 64), (131072, 64, 1))),
-    (((1, 32, 64), (2048, 64, 1)), ((1, 32, 64), (2048, 64, 1))),
-    (((32, 1, 64), (64, 2560, 1)), ((32, 1, 64), (64, 64, 1))),
-    (((4, 1, 64), (64, 2560, 1)), ((4, 1, 64), (64, 11264, 1))),
-    (((64,), (1,)), ((64,), (1,))),
+    # (shape, x_stride, y_stride)
+    (
+        (2, 4, 64),     # shape
+        (2, 4, 8),      # x_stride
+        (512, 128, 2)   # y_stride
+    ),
+    (
+        (100, 100),  # shape
+        (1, 100),    # x_stride
+        (100, 1)     # y_stride
+    ),
+    (
+        (4, 4),  # shape
+        (1, 4),  # x_stride
+        (4, 1)   # y_stride
+    ),
+    (
+        (4, 6, 64),       # shape
+        (64, 4 * 64, 1),  # x_stride
+        (6 * 64, 64, 1)   # y_stride
+    ),
+    (
+        (2000, 2000),  # shape
+        (1, 2000),     # x_stride
+        (2000, 1)      # y_stride
+    ),
+    (
+        (2001, 2001),  # shape
+        (1, 2001),     # x_stride
+        (2001, 1)      # y_stride
+    ),
+    (
+        (3, 4, 7, 53, 9),                       # shape
+        row_major_strides((3, 4, 7, 53, 9)),    # x_stride
+        column_major_strides((3, 4, 7, 53, 9))  # y_stride
+    ),
+    (
+        (3, 4, 50, 50, 5, 7),                       # shape
+        row_major_strides((3, 4, 50, 50, 5, 7)),    # x_stride
+        column_major_strides((3, 4, 50, 50, 5, 7))  # y_stride
+    ),
 ]
 
 # Data types used for testing
@@ -58,23 +127,23 @@ def test(
     lib,
     handle,
     torch_device,
-    x_shape,
+    shape,
     x_stride,
-    y_shape,
     y_stride,
     dtype=torch.float16,
 ):
     print(
-        f"Testing Rearrange on {torch_device} with x_shape:{x_shape} x_stride:{x_stride} y_shape:{y_shape} y_stride:{y_stride} dtype:{dtype}"
+        f"Testing Rearrange on {torch_device} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{dtype}"
     )
-    x = torch.rand(x_shape, dtype=dtype).to(torch_device)
-    y = torch.zeros(y_shape, dtype=dtype).to(torch_device)
+    x = torch.rand(shape, dtype=dtype).to(torch_device)
+    y = torch.zeros(shape, dtype=dtype).to(torch_device)
 
     x, y = [
         rearrange_if_needed(tensor, stride)
         for tensor, stride in zip([x, y], [x_stride, y_stride])
     ]
     x_tensor, y_tensor = [to_tensor(tensor, lib) for tensor in [x, y]]
 
     descriptor = infiniopRearrangeDescriptor_t()
@@ -86,7 +155,7 @@ def test(
     # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
     for tensor in [x_tensor, y_tensor]:
-        tensor.descriptor.contents.invalidate()
+        tensor.destroyDesc(lib)
 
     def lib_rearrange():
         check_error(
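
For a concrete sense of the two helpers added above, the strides for shape (3, 4, 5) work out as follows (a usage sketch assuming the helpers are in scope; values checked by hand against the loops):

print(row_major_strides((3, 4, 5)))     # [20, 5, 1]  -- C order, last dim contiguous
print(column_major_strides((3, 4, 5)))  # [1, 3, 12]  -- Fortran order, first dim contiguous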
test/infiniop/swiglu.py

@@ -61,7 +61,8 @@ _TENSOR_DTYPES = [torch.float16, torch.float32]
 
 # Tolerance map for different data types
 _TOLERANCE_MAP = {
-    torch.float16: {"atol": 1e-4, "rtol": 1e-2},
+    torch.float16: {"atol": 1e-3, "rtol": 1e-3},
+    torch.float32: {"atol": 2e-7, "rtol": 1e-7},
 }
 
 DEBUG = False
...
xmake.lua
View file @
4e4d3415
...
@@ -118,6 +118,18 @@ if has_config("kunlun-xpu") then
...
@@ -118,6 +118,18 @@ if has_config("kunlun-xpu") then
includes
(
"xmake/kunlun.lua"
)
includes
(
"xmake/kunlun.lua"
)
end
end
-- InfiniCCL
option
(
"ccl"
)
set_default
(
false
)
set_default
(
false
)
set_showmenu
(
true
)
set_description
(
"Wether to complie implementations for InfiniCCL"
)
option_end
()
if
has_config
(
"ccl"
)
then
add_defines
(
"ENABLE_CCL"
)
end
target
(
"infini-utils"
)
target
(
"infini-utils"
)
set_kind
(
"static"
)
set_kind
(
"static"
)
on_install
(
function
(
target
)
end
)
on_install
(
function
(
target
)
end
)
...
@@ -220,10 +232,25 @@ target("infiniop")
...
@@ -220,10 +232,25 @@ target("infiniop")
add_installfiles
(
"include/infinicore.h"
,
{
prefixdir
=
"include"
})
add_installfiles
(
"include/infinicore.h"
,
{
prefixdir
=
"include"
})
target_end
()
target_end
()
target
(
"infiniccl"
)
set_kind
(
"shared"
)
add_deps
(
"infinirt"
)
if
has_config
(
"nv-gpu"
)
then
add_deps
(
"infiniccl-cuda"
)
end
set_languages
(
"cxx17"
)
add_files
(
"src/infiniccl/*.cc"
)
add_installfiles
(
"include/infiniccl.h"
,
{
prefixdir
=
"include"
})
set_installdir
(
os.getenv
(
"INFINI_ROOT"
)
or
(
os.getenv
(
is_host
(
"windows"
)
and
"HOMEPATH"
or
"HOME"
)
..
"/.infini"
))
target_end
()
target
(
"all"
)
target
(
"all"
)
set_kind
(
"phony"
)
set_kind
(
"phony"
)
add_deps
(
"infiniop"
,
"infinirt"
)
add_deps
(
"infiniop"
,
"infinirt"
,
"infiniccl"
)
after_build
(
function
(
target
)
print
(
YELLOW
..
"[Congratulations!] Now you can install the libraries with \"
xmake
install
\
""
..
NC
)
end
)
after_build
(
function
(
target
)
print
(
YELLOW
..
"[Congratulations!] Now you can install the libraries with \"
xmake
install
\
""
..
NC
)
end
)
target_end
()
target_end
()
...
...
xmake/cuda.lua

@@ -58,3 +58,34 @@ target("infinirt-cuda")
     set_languages("cxx17")
     add_files("../src/infinirt/cuda/*.cu")
 target_end()
 
+target("infiniccl-cuda")
+    set_kind("static")
+    add_deps("infinirt")
+    on_install(function (target) end)
+    if has_config("ccl") then
+        set_policy("build.cuda.devlink", true)
+        set_toolchains("cuda")
+        add_links("cudart")
+        if not is_plat("windows") then
+            add_cuflags("-Xcompiler=-fPIC")
+            add_culdflags("-Xcompiler=-fPIC")
+            add_cxflags("-fPIC")
+            local nccl_root = os.getenv("NCCL_ROOT")
+            if nccl_root then
+                add_includedirs(nccl_root .. "/include")
+                add_links(nccl_root .. "/lib/libnccl.so")
+            else
+                add_links("nccl") -- Fall back to default nccl linking
+            end
+            add_files("../src/infiniccl/cuda/*.cu")
+        else
+            print("[Warning] NCCL is not supported on Windows")
+        end
+    end
+    set_languages("cxx17")
+target_end()
xmake/test.lua

@@ -34,3 +34,20 @@ target("infiniop-test")
     set_installdir(INFINI_ROOT)
 target_end()
 
+target("infiniccl-test")
+    set_kind("binary")
+    add_deps("infini-utils")
+    set_default(false)
+    set_warnings("all", "error")
+    set_languages("cxx17")
+    local INFINI_ROOT = os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini")
+    add_includedirs(INFINI_ROOT .. "/include")
+    add_linkdirs(INFINI_ROOT .. "/lib")
+    add_links("infinirt", "infiniccl")
+    add_files(os.projectdir() .. "/src/infiniccl-test/*.cpp")
+    set_installdir(os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini"))
+target_end()