sunzhq2 / bytemlperf-dcu · Commits

Commit 24b257f1, authored Nov 19, 2024 by sunzhq2

init

parent 920b3c0f

Changes 330

Showing 20 changed files with 2634 additions and 0 deletions (+2634, -0)
ByteMLPerf/byte_micro_perf/backends/module_store.py          +1035  -0
ByteMLPerf/byte_micro_perf/backends/utils.py                  +132   -0
ByteMLPerf/byte_micro_perf/core/perf_engine.py                +556   -0
ByteMLPerf/byte_micro_perf/launch.py                          +244   -0
ByteMLPerf/byte_micro_perf/requirements.txt                   +14    -0
ByteMLPerf/byte_micro_perf/run.sh                             +4     -0
ByteMLPerf/byte_micro_perf/scripts/convert.py                 +386   -0
ByteMLPerf/byte_micro_perf/workloads/add.json                 +21    -0
ByteMLPerf/byte_micro_perf/workloads/allgather.json           +22    -0
ByteMLPerf/byte_micro_perf/workloads/allreduce.json           +22    -0
ByteMLPerf/byte_micro_perf/workloads/alltoall.json            +26    -0
ByteMLPerf/byte_micro_perf/workloads/batch_gemm.json          +26    -0
ByteMLPerf/byte_micro_perf/workloads/broadcast.json           +22    -0
ByteMLPerf/byte_micro_perf/workloads/cast.json                +17    -0
ByteMLPerf/byte_micro_perf/workloads/cos.json                 +17    -0
ByteMLPerf/byte_micro_perf/workloads/device2host.json         +17    -0
ByteMLPerf/byte_micro_perf/workloads/div.json                 +22    -0
ByteMLPerf/byte_micro_perf/workloads/exp.json                 +17    -0
ByteMLPerf/byte_micro_perf/workloads/exponential.json         +17    -0
ByteMLPerf/byte_micro_perf/workloads/gather.json              +17    -0
ByteMLPerf/byte_micro_perf/backends/module_store.py  0 → 100644
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import random

import torch
import torch.distributed as dist


def gemm_compute_size(input_shapes, torch_dtype):
    # input_shapes: [[M, K], [K, N]]
    a_shape, b_shape = input_shapes
    M, _ = a_shape
    _, N = b_shape
    d_shape = [M, N]

    # get element_size and dtype_size
    input_element_num = sum([math.prod(shape) for shape in [a_shape, b_shape]])
    output_element_num = sum([math.prod(shape) for shape in [d_shape]])
    dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
    input_tensor_size = dtype_size * input_element_num
    if torch_dtype == torch.int8:
        output_tensor_size = 4 * output_element_num
    else:
        output_tensor_size = dtype_size * output_element_num
    batch_size = M
    tensor_size = input_tensor_size + output_tensor_size
    return (batch_size, tensor_size, input_tensor_size, output_tensor_size)


def gemm_create_tensors(input_shapes, torch_dtype, xpu_device):
    # input_shapes: [[M, K], [K, N]]
    a_shape, b_shape = input_shapes
    M, _ = a_shape
    _, N = b_shape
    d_shape = [M, N]

    # create input tensors
    a_tensor = torch.randint(0, 7, a_shape, dtype=torch_dtype, device=xpu_device)
    b_tensor = torch.randint(0, 7, b_shape, dtype=torch_dtype, device=xpu_device)

    # create output tensors
    d_tensor = torch.randint(0, 7, d_shape, dtype=torch_dtype, device=xpu_device)

    return [a_tensor, b_tensor, d_tensor]


def batch_gemm_compute_size(input_shapes, torch_dtype):
    # input_shapes: [[bs, M, K], [bs, K, N]]
    a_shape, b_shape = input_shapes
    bs, M, _ = a_shape
    bs, _, N = b_shape
    d_shape = [bs, M, N]

    # get element_size and dtype_size
    input_element_num = sum([math.prod(shape) for shape in [a_shape, b_shape]])
    output_element_num = sum([math.prod(shape) for shape in [d_shape]])
    dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
    input_tensor_size = dtype_size * input_element_num
    if torch_dtype == torch.int8:
        output_tensor_size = 4 * output_element_num
    else:
        output_tensor_size = dtype_size * output_element_num
    batch_size = bs
    tensor_size = input_tensor_size + output_tensor_size
    return (batch_size, tensor_size, input_tensor_size, output_tensor_size)


def batch_gemm_create_tensors(input_shapes, torch_dtype, xpu_device):
    # input_shapes: [[bs, M, K], [bs, K, N]]
    a_shape, b_shape = input_shapes
    bs, M, _ = a_shape
    bs, _, N = b_shape
    d_shape = [bs, M, N]

    # create input tensors
    a_tensor = torch.randint(0, 7, a_shape, dtype=torch_dtype, device=xpu_device)
    b_tensor = torch.randint(0, 7, b_shape, dtype=torch_dtype, device=xpu_device)

    # create output tensors
    d_tensor = torch.randint(0, 7, d_shape, dtype=torch_dtype, device=xpu_device)

    return [a_tensor, b_tensor, d_tensor]


def group_gemm_compute_size(input_shapes, torch_dtype):
    """
    [
        [[M1, K1], [K1, N1]],
        [[M2, K2], [K2, N2]]
    ]
    """
    input_tensor_size = 0
    output_tensor_size = 0
    for problem_shape in input_shapes:
        a_shape, b_shape = problem_shape
        M, _ = a_shape
        _, N = b_shape
        d_shape = [M, N]

        # get element_size and dtype_size
        input_element_num = sum([math.prod(shape) for shape in [a_shape, b_shape]])
        output_element_num = sum([math.prod(shape) for shape in [d_shape]])
        dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
        input_tensor_size += dtype_size * input_element_num
        if torch_dtype == torch.int8:
            output_tensor_size += 4 * output_element_num
        else:
            output_tensor_size += dtype_size * output_element_num
    batch_size = 1
    tensor_size = input_tensor_size + output_tensor_size
    return batch_size, tensor_size, input_tensor_size, output_tensor_size


def group_gemm_create_tensors(input_shapes, torch_dtype, xpu_device):
    """
    [
        [[M1, K1], [K1, N1]],
        [[M2, K2], [K2, N2]]
    ]
    """
    left_tensors = []
    right_tensors = []
    output_tensors = []

    for problem_shape in input_shapes:
        a_shape, b_shape = problem_shape
        M, _ = a_shape
        _, N = b_shape
        d_shape = [M, N]

        # create input tensors
        left_tensor = torch.randint(0, 7, a_shape, dtype=torch_dtype, device=xpu_device)
        right_tensor = torch.randint(0, 7, b_shape, dtype=torch_dtype, device=xpu_device)

        # create output tensors
        output_tensor = torch.randint(0, 7, d_shape, dtype=torch_dtype, device=xpu_device)

        left_tensors.append(left_tensor)
        right_tensors.append(right_tensor)
        output_tensors.append(output_tensor)

    return [left_tensors, right_tensors, output_tensors]


def sin_compute_size(input_shapes, torch_dtype):
    a_shape, = input_shapes
    c_shape = a_shape

    input_element_num = sum([math.prod(shape) for shape in [a_shape]])
    output_element_num = sum([math.prod(shape) for shape in [c_shape]])
    dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
    input_tensor_size = dtype_size * input_element_num
    output_tensor_size = dtype_size * output_element_num

    batch_size = c_shape[0]
    tensor_size = input_tensor_size + output_tensor_size
    return batch_size, tensor_size, input_tensor_size, output_tensor_size


def sin_create_tensors(input_shapes, torch_dtype, xpu_device):
    a_shape, = input_shapes
    c_shape = a_shape

    # create input tensors
    a_tensor = torch.randint(0, 7, a_shape, dtype=torch_dtype, device=xpu_device)

    # create output tensors
    c_tensor = torch.randint(0, 7, c_shape, dtype=torch_dtype, device=xpu_device)

    return [a_tensor, c_tensor]


def cast_compute_size(input_shapes, torch_dtype):
    a_shape, = input_shapes
    c_shape = a_shape

    input_element_num = sum([math.prod(shape) for shape in [a_shape]])
    output_element_num = sum([math.prod(shape) for shape in [c_shape]])

    if torch_dtype == torch.float32:
        dst_torch_dtype = torch.bfloat16
    elif torch_dtype == torch.bfloat16 or torch_dtype == torch.float16:
        dst_torch_dtype = torch.float32
    elif torch_dtype == torch.int8:
        dst_torch_dtype = torch.int32
    else:
        dst_torch_dtype = torch_dtype

    src_dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
    dst_dtype_size = torch.tensor([], dtype=dst_torch_dtype).element_size()
    input_tensor_size = src_dtype_size * input_element_num
    output_tensor_size = dst_dtype_size * output_element_num

    batch_size = c_shape[0]
    tensor_size = input_tensor_size + output_tensor_size
    return batch_size, tensor_size, input_tensor_size, output_tensor_size


def cast_create_tensors(input_shapes, torch_dtype, xpu_device):
    a_shape, = input_shapes
    c_shape = a_shape

    if torch_dtype == torch.float32:
        dst_torch_dtype = torch.bfloat16
    elif torch_dtype == torch.bfloat16 or torch_dtype == torch.float16:
        dst_torch_dtype = torch.float32
    elif torch_dtype == torch.int8:
        dst_torch_dtype = torch.int32
    else:
        dst_torch_dtype = torch_dtype

    # create input tensors
    a_tensor = torch.randint(0, 7, a_shape, dtype=torch_dtype, device=xpu_device)

    # create output tensors
    c_tensor = torch.randint(0, 7, c_shape, dtype=dst_torch_dtype, device=xpu_device)

    return [a_tensor, c_tensor]


def swiglu_compute_size(input_shapes, torch_dtype):
    a_shape, = input_shapes
    batch_size, hidden_size = a_shape

    input_tensor_shape = [batch_size, hidden_size]
    output_tensor_shape = [batch_size, hidden_size]

    input_element_num = sum([math.prod(shape) for shape in [input_tensor_shape]])
    output_element_num = sum([math.prod(shape) for shape in [output_tensor_shape]])
    dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
    input_tensor_size = dtype_size * input_element_num
    output_tensor_size = dtype_size * output_element_num
    tensor_size = input_tensor_size + output_tensor_size
    return batch_size, tensor_size, input_tensor_size, output_tensor_size


def swiglu_create_tensors(input_shapes, torch_dtype, xpu_device):
    a_shape, = input_shapes
    batch_size, hidden_size = a_shape

    input_tensor_shape = [batch_size, hidden_size]
    output_tensor_shape = [batch_size, hidden_size]

    # create input tensors
    input_tensor = torch.randint(0, 7, input_tensor_shape, dtype=torch_dtype, device=xpu_device)

    # create output tensors
    output_tensor = torch.randint(0, 7, output_tensor_shape, dtype=torch_dtype, device=xpu_device)

    return [input_tensor, output_tensor]


def add_compute_size(input_shapes, torch_dtype):
    a_shape, b_shape = input_shapes
    c_shape = a_shape
    batch_size, hidden_size = a_shape

    input_element_num = sum([math.prod(shape) for shape in [a_shape, b_shape]])
    output_element_num = sum([math.prod(shape) for shape in [c_shape]])
    dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
    input_tensor_size = dtype_size * input_element_num
    output_tensor_size = dtype_size * output_element_num
    tensor_size = input_tensor_size + output_tensor_size
    return batch_size, tensor_size, input_tensor_size, output_tensor_size


def add_create_tensors(input_shapes, torch_dtype, xpu_device):
    a_shape, b_shape = input_shapes
    c_shape = a_shape

    # create input tensors
    a_tensor = torch.randint(0, 7, a_shape, dtype=torch_dtype, device=xpu_device)
    b_tensor = torch.randint(0, 7, b_shape, dtype=torch_dtype, device=xpu_device)

    # create output tensors
    c_tensor = torch.randint(0, 7, c_shape, dtype=torch_dtype, device=xpu_device)

    return [a_tensor, b_tensor, c_tensor]


def layer_norm_compute_size(input_shapes, torch_dtype):
    a_shape, = input_shapes
    batch_size, hidden_size = a_shape
    c_shape = a_shape
    w_shape = a_shape[-1:]

    input_element_num = sum([math.prod(shape) for shape in [a_shape, w_shape]])
    output_element_num = sum([math.prod(shape) for shape in [c_shape]])
    dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
    input_tensor_size = dtype_size * input_element_num
    output_tensor_size = dtype_size * output_element_num
    tensor_size = input_tensor_size + output_tensor_size
    return batch_size, tensor_size, input_tensor_size, output_tensor_size


def layer_norm_create_tensors(input_shapes, torch_dtype, xpu_device):
    a_shape, = input_shapes
    batch_size, hidden_size = a_shape
    c_shape = a_shape
    w_shape = a_shape[-1:]

    # create input tensors
    a_tensor = torch.randint(0, 7, a_shape, dtype=torch_dtype, device=xpu_device)

    # create output tensors
    c_tensor = torch.randint(0, 7, c_shape, dtype=torch_dtype, device=xpu_device)

    # create weight tensors
    w_tensor = torch.randint(0, 7, w_shape, dtype=torch_dtype, device=xpu_device)

    return [a_tensor, c_tensor, w_tensor]


def softmax_compute_size(input_shapes, torch_dtype):
    a_shape, = input_shapes
    batch_size, hidden_size = a_shape
    c_shape = a_shape

    input_element_num = sum([math.prod(shape) for shape in [a_shape]])
    output_element_num = sum([math.prod(shape) for shape in [c_shape]])
    dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
    input_tensor_size = dtype_size * input_element_num
    output_tensor_size = dtype_size * output_element_num
    tensor_size = input_tensor_size + output_tensor_size
    return batch_size, tensor_size, input_tensor_size, output_tensor_size


def softmax_create_tensors(input_shapes, torch_dtype, xpu_device):
    a_shape, = input_shapes
    batch_size, hidden_size = a_shape
    c_shape = a_shape

    # create input tensors
    a_tensor = torch.randint(0, 7, a_shape, dtype=torch_dtype, device=xpu_device)

    # create output tensors
    c_tensor = torch.randint(0, 7, c_shape, dtype=torch_dtype, device=xpu_device)

    return [a_tensor, c_tensor]


def reduce_sum_compute_size(input_shapes, torch_dtype):
    a_shape, = input_shapes
    batch_size, hidden_size = a_shape
    c_shape = [batch_size, 1]

    input_element_num = sum([math.prod(shape) for shape in [a_shape]])
    output_element_num = sum([math.prod(shape) for shape in [c_shape]])
    dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
    input_tensor_size = dtype_size * input_element_num
    output_tensor_size = dtype_size * output_element_num
    tensor_size = input_tensor_size + output_tensor_size
    return batch_size, tensor_size, input_tensor_size, output_tensor_size


def reduce_sum_create_tensors(input_shapes, torch_dtype, xpu_device):
    a_shape, = input_shapes
    batch_size, hidden_size = a_shape
    c_shape = [batch_size, 1]

    # create input tensors
    a_tensor = torch.randint(0, 7, a_shape, dtype=torch_dtype, device=xpu_device)

    # create output tensors
    c_tensor = torch.randint(0, 7, c_shape, dtype=torch_dtype, device=xpu_device)

    return [a_tensor, c_tensor]


def reduce_min_compute_size(input_shapes, torch_dtype):
    a_shape, = input_shapes
    batch_size, hidden_size = a_shape
    values_shape = [batch_size, 1]
    indices_shape = [batch_size, 1]

    input_element_num = sum([math.prod(shape) for shape in [a_shape]])
    values_element_num = sum([math.prod(shape) for shape in [values_shape]])
    indices_element_num = sum([math.prod(shape) for shape in [indices_shape]])
    dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
    indices_dtype_size = torch.tensor([], dtype=torch.int64).element_size()
    input_tensor_size = dtype_size * input_element_num
    output_tensor_size = dtype_size * values_element_num + indices_dtype_size * indices_element_num
    tensor_size = input_tensor_size + output_tensor_size
    return batch_size, tensor_size, input_tensor_size, output_tensor_size


def reduce_min_create_tensors(input_shapes, torch_dtype, xpu_device):
    a_shape, = input_shapes
    batch_size, hidden_size = a_shape
    values_shape = [batch_size, 1]
    indices_shape = [batch_size, 1]

    # create input tensors
    a_tensor = torch.randint(0, 7, a_shape, dtype=torch_dtype, device=xpu_device)

    # create output tensors
    values_tensor = torch.randint(0, 7, values_shape, dtype=torch_dtype, device=xpu_device)
    indices_tensor = torch.randint(0, 7, indices_shape, dtype=torch.int64, device=xpu_device)

    return [a_tensor, values_tensor, indices_tensor]


def index_add_compute_size(input_shapes, torch_dtype):
    # src_tensor -->(index_tensor) dst_tensor
    dst_shape, src_shape = input_shapes

    src_batch_size = src_shape[0]
    dst_batch_size = dst_shape[0]
    index_shape = [src_batch_size]

    src_element_num = sum([math.prod(shape) for shape in [src_shape]])
    index_element_num = sum([math.prod(shape) for shape in [index_shape]])

    dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
    index_dtype_size = torch.tensor([], dtype=torch.int64).element_size()

    src_tensor_size = dtype_size * src_element_num
    index_tensor_size = index_dtype_size * index_element_num

    input_tensor_size = 2 * src_tensor_size + index_tensor_size
    output_tensor_size = src_tensor_size
    tensor_size = input_tensor_size + output_tensor_size

    return src_batch_size, tensor_size, input_tensor_size, output_tensor_size


def index_add_create_tensors(input_shapes, torch_dtype, xpu_device):
    # src_tensor -->(index_tensor) dst_tensor
    dst_shape, src_shape = input_shapes

    src_batch_size = src_shape[0]
    dst_batch_size = dst_shape[0]
    index_shape = [src_batch_size]

    # create output tensors
    dst_tensor = torch.randint(0, 7, dst_shape, dtype=torch_dtype, device=xpu_device)

    # create input tensors
    src_tensor = torch.randint(0, 7, src_shape, dtype=torch_dtype, device=xpu_device)
    index_tensor = torch.randint(0, dst_batch_size, index_shape, dtype=torch.int64, device=xpu_device)

    return [dst_tensor, src_tensor, index_tensor]


def sort_compute_size(input_shapes, torch_dtype):
    a_shape, = input_shapes
    batch_size, hidden_size = a_shape
    c_shape = a_shape

    input_element_num = sum([math.prod(shape) for shape in [a_shape]])
    output_element_num = sum([math.prod(shape) for shape in [c_shape]])
    indice_element_num = output_element_num

    dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
    indice_dtype_size = torch.tensor([], dtype=torch.int64).element_size()

    input_tensor_size = dtype_size * input_element_num
    output_tensor_size = dtype_size * output_element_num + indice_dtype_size * indice_element_num
    tensor_size = input_tensor_size + output_tensor_size
    return batch_size, tensor_size, input_tensor_size, output_tensor_size


def sort_create_tensors(input_shapes, torch_dtype, xpu_device):
    a_shape, = input_shapes
    batch_size, hidden_size = a_shape
    c_shape = a_shape

    # create input tensors
    a_tensor = torch.randint(0, 7, a_shape, dtype=torch_dtype, device=xpu_device)

    # create output tensors
    c_tensor = torch.randint(0, 7, c_shape, dtype=torch_dtype, device=xpu_device)
    indice_tensor = torch.randint(0, 7, c_shape, dtype=torch.int64, device=xpu_device)

    return [a_tensor, c_tensor, indice_tensor]


def unique_compute_size(input_shapes, torch_dtype):
    a_shape, = input_shapes
    batch_size, hidden_size = a_shape
    c_shape = a_shape

    input_element_num = sum([math.prod(shape) for shape in [a_shape]])
    output_element_num = sum([math.prod(shape) for shape in [c_shape]])

    dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
    indice_dtype_size = torch.tensor([], dtype=torch.int64).element_size()

    input_tensor_size = dtype_size * input_element_num
    output_tensor_size = dtype_size * output_element_num + indice_dtype_size * output_element_num
    tensor_size = input_tensor_size + output_tensor_size
    return batch_size, tensor_size, input_tensor_size, output_tensor_size


def unique_create_tensors(input_shapes, torch_dtype, xpu_device):
    a_shape, = input_shapes
    batch_size, hidden_size = a_shape
    c_shape = a_shape

    # create input tensors
    torch.manual_seed(1)
    a_tensor = torch.randint(0, 1024, a_shape, dtype=torch_dtype, device="cpu").to(device=xpu_device)

    # create output tensors
    c_tensor = torch.empty(c_shape, dtype=torch_dtype, device=xpu_device)
    count_tensor = torch.empty(c_shape, dtype=torch.int64, device=xpu_device)

    return [a_tensor, c_tensor, count_tensor]


def scatter_compute_size(input_shapes, torch_dtype):
    tensor_shape = input_shapes[0]
    batch_size, hidden_size = tensor_shape
    index_shape = [batch_size]

    input_element_num = sum([math.prod(shape) for shape in [tensor_shape]])
    output_element_num = sum([math.prod(shape) for shape in [tensor_shape]])
    index_element_num = sum([math.prod(shape) for shape in [index_shape]])

    dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
    index_dtype_size = torch.tensor([], dtype=torch.int64).element_size()

    input_element_num = dtype_size * input_element_num + index_dtype_size * index_element_num
    output_element_num = dtype_size * output_element_num
    tensor_size = input_element_num + output_element_num
    return batch_size, tensor_size, input_element_num, output_element_num


def scatter_create_tensors(input_shapes, torch_dtype, xpu_device):
    tensor_shape = input_shapes[0]
    batch_size, hidden_size = tensor_shape
    index_shape = [batch_size]

    # create output tensors
    dst_tensor = torch.randint(0, 7, tensor_shape, dtype=torch_dtype, device=xpu_device)

    # create input tensors
    src_tensor = torch.randint(0, 7, tensor_shape, dtype=torch_dtype, device=xpu_device)

    index = [i for i in range(batch_size)]
    random.shuffle(index)
    index_tensor = torch.tensor(index, dtype=torch.int64, device=xpu_device)
    index_tensor = index_tensor.reshape(-1, 1).expand(-1, hidden_size)

    return [dst_tensor, src_tensor, index_tensor]


def host2device_compute_size(input_shapes, torch_dtype):
    a_shape, = input_shapes
    batch_size, hidden_size = a_shape

    output_element_num = sum([math.prod(shape) for shape in [a_shape]])
    dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
    output_tensor_size = dtype_size * output_element_num
    tensor_size = output_tensor_size
    return batch_size, tensor_size, 0, output_tensor_size


def host2device_create_tensors(input_shapes, torch_dtype, xpu_device):
    a_shape, = input_shapes
    batch_size, hidden_size = a_shape

    host_tensor = torch.empty(a_shape, dtype=torch_dtype, device="cpu").pin_memory()
    device_tensor = torch.empty(a_shape, dtype=torch_dtype, device=xpu_device)

    return [host_tensor, device_tensor]


def allreduce_create_tensors(input_shapes, torch_dtype, xpu_device):
    a_shape, = input_shapes
    a_tensor = torch.zeros(a_shape, dtype=torch_dtype, device=xpu_device)
    return [a_tensor]


def allgather_compute_size(input_shapes, torch_dtype):
    a_shape, = input_shapes
    batch_size, hidden_size = a_shape

    output_element_num = sum([math.prod(shape) for shape in [a_shape]])
    dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
    output_tensor_size = dtype_size * output_element_num
    tensor_size = output_tensor_size
    return batch_size, tensor_size, 0, output_tensor_size


def allgather_create_tensors(input_shapes, torch_dtype, xpu_device):
    a_shape, = input_shapes
    batch_size, hidden_size = a_shape

    world_size = dist.get_world_size()

    tensor = torch.empty([batch_size, hidden_size], dtype=torch_dtype, device=xpu_device)
    tensors = list(torch.chunk(tensor, world_size, dim=0))

    return [tensors]


def alltoall_compute_size(input_shapes, torch_dtype):
    a_shape, b_shape = input_shapes
    batch_size, hidden_size = a_shape

    world_size = dist.get_world_size()

    output_element_num = sum([math.prod(shape) for shape in [a_shape]]) * 2
    dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
    output_tensor_size = dtype_size * output_element_num
    tensor_size = output_tensor_size
    return batch_size, tensor_size, 0, output_tensor_size


def alltoall_create_tensors(input_shapes, torch_dtype, xpu_device):
    a_shape, b_shape = input_shapes
    batch_size, hidden_size = a_shape

    world_size = dist.get_world_size()

    input_tensor = torch.empty([batch_size, hidden_size], dtype=torch_dtype, device=xpu_device)
    input_tensors = list(torch.chunk(input_tensor, world_size, dim=0))

    output_tensor = torch.empty([batch_size, hidden_size], dtype=torch_dtype, device=xpu_device)
    output_tensors = list(torch.chunk(output_tensor, world_size, dim=0))

    return [input_tensors, output_tensors]


def p2p_compute_size(input_shapes, torch_dtype):
    a_shape, b_shape = input_shapes
    batch_size, hidden_size = a_shape

    input_element_num = sum([math.prod(shape) for shape in [a_shape]])
    output_element_num = sum([math.prod(shape) for shape in [b_shape]])
    dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
    input_tensor_size = dtype_size * input_element_num
    output_tensor_size = dtype_size * output_element_num
    tensor_size = input_tensor_size + output_tensor_size
    return batch_size, tensor_size, input_tensor_size, output_tensor_size


def p2p_create_tensors(input_shapes, torch_dtype, xpu_device):
    a_shape, b_shape = input_shapes
    batch_size, hidden_size = a_shape

    a_tensor = torch.empty(a_shape, dtype=torch_dtype, device=xpu_device)
    b_tensor = torch.empty(b_shape, dtype=torch_dtype, device=xpu_device)

    return [a_tensor, b_tensor]


"""
gemm ops
"""
class GemmOp(torch.nn.Module):
    def forward(self, input_tensor_a, input_tensor_b, input_tensor_d):
        compute_dtype = input_tensor_a.dtype
        if compute_dtype in [torch.float32, torch.float16, torch.bfloat16]:
            torch.mm(input_tensor_a, input_tensor_b, out=input_tensor_d)
        else:
            raise Exception(f"GemmOp with dtype {compute_dtype} is not implemented")


class BatchGemmOp(torch.nn.Module):
    def forward(self, input_tensor_a, input_tensor_b, input_tensor_d):
        compute_dtype = input_tensor_a.dtype
        if compute_dtype in [torch.float32, torch.float16, torch.bfloat16]:
            torch.bmm(input_tensor_a, input_tensor_b, out=input_tensor_d)
        else:
            raise Exception(f"BatchGemmOp with dtype {compute_dtype} is not implemented")


class GroupGemmOp(torch.nn.Module):
    def forward(self, input_tensor_a, input_tensor_b, input_tensor_d):
        compute_dtype = input_tensor_a[0].dtype
        for a, b, d in zip(input_tensor_a, input_tensor_b, input_tensor_d):
            if compute_dtype in [torch.float32, torch.float16, torch.bfloat16]:
                torch.mm(a, b, out=d)
            else:
                raise Exception(f"GroupGemmOp with dtype {compute_dtype} is not implemented")


"""
unary ops
"""
class SinOp(torch.nn.Module):
    def forward(self, input_tensor, output_tensor):
        torch.sin(input_tensor, out=output_tensor)


class CosOp(torch.nn.Module):
    def forward(self, input_tensor, output_tensor):
        torch.cos(input_tensor, out=output_tensor)


class ExpOp(torch.nn.Module):
    def forward(self, input_tensor, output_tensor):
        torch.exp(input_tensor, out=output_tensor)


class ExponentialOp(torch.nn.Module):
    def forward(self, input_tensor, output_tensor):
        input_tensor.exponential_()


class LogOp(torch.nn.Module):
    def forward(self, input_tensor, output_tensor):
        torch.log(input_tensor, out=output_tensor)


class SqrtOp(torch.nn.Module):
    def forward(self, input_tensor, output_tensor):
        torch.sqrt(input_tensor, out=output_tensor)


class CastOp(torch.nn.Module):
    def forward(self, input_tensor, output_tensor):
        output_tensor = input_tensor.to(output_tensor.dtype)


class SiluOp(torch.nn.Module):
    def forward(self, input_tensor, output_tensor):
        output_tensor = torch.nn.functional.silu(input_tensor)


class GeluOp(torch.nn.Module):
    def forward(self, input_tensor, output_tensor):
        output_tensor = torch.nn.functional.gelu(input_tensor)


class SwiGLUOp(torch.nn.Module):
    def forward(self, input_tensor, output_tensor):
        torch.mul(torch.nn.functional.silu(input_tensor), input_tensor, out=output_tensor)


"""
Binary ops
"""
class AddOp(torch.nn.Module):
    def forward(self, input_tensor_a, input_tensor_b, input_tensor_c):
        torch.add(input_tensor_a, input_tensor_b, out=input_tensor_c)


class MulOp(torch.nn.Module):
    def forward(self, input_tensor_a, input_tensor_b, input_tensor_c):
        torch.mul(input_tensor_a, input_tensor_b, out=input_tensor_c)


class SubOp(torch.nn.Module):
    def forward(self, input_tensor_a, input_tensor_b, input_tensor_c):
        torch.sub(input_tensor_a, input_tensor_b, out=input_tensor_c)


class DivOp(torch.nn.Module):
    def forward(self, input_tensor_a, input_tensor_b, input_tensor_c):
        torch.div(input_tensor_a, input_tensor_b, out=input_tensor_c)


"""
reduction ops
"""
class LayerNormOp(torch.nn.Module):
    def forward(self, input_tensor, output_tensor, weight_tensor):
        output_tensor = torch.nn.functional.layer_norm(input_tensor, (input_tensor.shape[-1],), weight_tensor)


class SoftmaxOp(torch.nn.Module):
    def forward(self, input_tensor, output_tensor):
        output_tensor = torch.nn.functional.softmax(input_tensor, dim=-1, dtype=output_tensor.dtype)


class ReduceSumOp(torch.nn.Module):
    def forward(self, input_tensor, output_tensor):
        torch.sum(input_tensor, dim=-1, keepdim=True, dtype=output_tensor.dtype, out=output_tensor)


class ReduceMinOp(torch.nn.Module):
    def forward(self, input_tensor, value_tensor, indice_tensor):
        torch.min(input_tensor, dim=-1, keepdim=True, out=(value_tensor, indice_tensor))


class ReduceMaxOp(torch.nn.Module):
    def forward(self, input_tensor, value_tensor, indice_tensor):
        torch.max(input_tensor, dim=-1, keepdim=True, out=(value_tensor, indice_tensor))


"""
index_ops
"""
class IndexAddOp(torch.nn.Module):
    def forward(self, dst_tensor, src_tensor, index_tensor):
        dst_tensor.index_add_(0, index_tensor, src_tensor)


class SortOp(torch.nn.Module):
    def forward(self, input_tensor, output_tensor, indice_tensor):
        torch.sort(input_tensor, dim=-1, out=(output_tensor, indice_tensor))


class UniqueOp(torch.nn.Module):
    def forward(self, input_tensor, output_tensor, count_tensor):
        output_tensor, count_tensor = torch.unique(input=input_tensor, sorted=False, return_counts=True, return_inverse=False)


class ScatterOp(torch.nn.Module):
    def forward(self, dst_tensor, src_tensor, index_tensor):
        dst_tensor.scatter_(0, index_tensor, src_tensor)


class GatherOp(torch.nn.Module):
    def forward(self, dst_tensor, src_tensor, index_tensor):
        torch.gather(src_tensor, 0, index_tensor, out=dst_tensor)


"""
h2d_ops
"""
class Host2DeviceOp(torch.nn.Module):
    def forward(self, host_tensor, device_tensor):
        device_tensor.copy_(host_tensor)


class Device2HostOp(torch.nn.Module):
    def forward(self, host_tensor, device_tensor):
        host_tensor.copy_(device_tensor)


"""
communication ops
"""
class AllReduceOp(torch.nn.Module):
    def forward(self, input_tensor):
        dist.all_reduce(input_tensor, op=dist.ReduceOp.SUM)


class AllGatherOp(torch.nn.Module):
    def forward(self, input_tensors):
        dist.all_gather(input_tensors, input_tensors[dist.get_rank()])


class ReduceScatterOp(torch.nn.Module):
    def forward(self, input_tensors):
        dist.reduce_scatter(input_tensors[dist.get_rank()], input_tensors)


class AllToAllOp(torch.nn.Module):
    def forward(self, input_tensors, output_tensors):
        dist.all_to_all(output_tensors, input_tensors)


class BroadcastOp(torch.nn.Module):
    def forward(self, input_tensor):
        dist.broadcast(input_tensor, 0)


class P2POp(torch.nn.Module):
    def forward(self, send_tensor, recv_tensor):
        world_size = dist.get_world_size()
        rank = dist.get_rank()

        reqs = []
        if rank != world_size - 1:
            reqs.append(dist.isend(send_tensor, (rank + 1) % world_size))
        if rank != 0:
            reqs.append(dist.irecv(recv_tensor, (rank - 1 + world_size) % world_size))
        for req in reqs:
            req.wait()


op_registry = {
    # gemm ops
    "gemm": GemmOp(),
    "gemv": GemmOp(),
    "batch_gemm": BatchGemmOp(),
    "group_gemm": GroupGemmOp(),

    # unary ops
    "sin": SinOp(),
    "cos": CosOp(),
    "exp": ExpOp(),
    "exponential": ExponentialOp(),
    "log": LogOp(),
    "sqrt": SqrtOp(),
    "cast": CastOp(),
    "silu": SiluOp(),
    "gelu": GeluOp(),
    "swiglu": SwiGLUOp(),

    # binary ops
    "add": AddOp(),
    "sub": SubOp(),
    "mul": MulOp(),
    "div": DivOp(),

    # reduction ops
    "layernorm": LayerNormOp(),
    "softmax": SoftmaxOp(),
    "reduce_sum": ReduceSumOp(),
    "reduce_max": ReduceMaxOp(),
    "reduce_min": ReduceMinOp(),

    # index_ops
    "index_add": IndexAddOp(),
    "sort": SortOp(),
    "unique": UniqueOp(),
    "scatter": ScatterOp(),
    "gather": GatherOp(),

    # h2d_ops
    "device2host": Device2HostOp(),
    "host2device": Host2DeviceOp(),

    # ccl ops
    "broadcast": BroadcastOp(),
    "allreduce": AllReduceOp(),
    "allgather": AllGatherOp(),
    "alltoall": AllToAllOp(),
    "reducescatter": ReduceScatterOp(),
    "p2p": P2POp(),
}


op_compute_size_funcs = {
    # gemm_ops
    "gemm": gemm_compute_size,
    "gemv": gemm_compute_size,
    "batch_gemm": batch_gemm_compute_size,
    "group_gemm": group_gemm_compute_size,

    # unary_ops
    "sin": sin_compute_size,
    "cos": sin_compute_size,
    "exp": sin_compute_size,
    "exponential": sin_compute_size,
    "log": sin_compute_size,
    "sqrt": sin_compute_size,
    "cast": cast_compute_size,
    "silu": sin_compute_size,
    "gelu": sin_compute_size,
    "swiglu": swiglu_compute_size,

    # binary_ops
    "add": add_compute_size,
    "mul": add_compute_size,
    "sub": add_compute_size,
    "div": add_compute_size,

    # reduction_ops
    "layernorm": layer_norm_compute_size,
    "softmax": softmax_compute_size,
    "reduce_sum": reduce_sum_compute_size,
    "reduce_min": reduce_min_compute_size,
    "reduce_max": reduce_min_compute_size,

    # index_ops
    "index_add": index_add_compute_size,
    "sort": sort_compute_size,
    "unique": unique_compute_size,
    "scatter": scatter_compute_size,
    "gather": scatter_compute_size,

    # h2d_ops
    "host2device": host2device_compute_size,
    "device2host": host2device_compute_size,

    # ccl_ops
    "broadcast": host2device_compute_size,
    "allreduce": host2device_compute_size,
    "allgather": allgather_compute_size,
    "alltoall": alltoall_compute_size,
    "reducescatter": allgather_compute_size,
    "p2p": p2p_compute_size,
}


op_create_tensors_funcs = {
    # gemm ops
    "gemm": gemm_create_tensors,
    "gemv": gemm_create_tensors,
    "batch_gemm": batch_gemm_create_tensors,
    "group_gemm": group_gemm_create_tensors,

    # unary ops
    "sin": sin_create_tensors,
    "cos": sin_create_tensors,
    "exp": sin_create_tensors,
    "exponential": sin_create_tensors,
    "log": sin_create_tensors,
    "sqrt": sin_create_tensors,
    "cast": cast_create_tensors,
    "silu": sin_create_tensors,
    "gelu": sin_create_tensors,
    "swiglu": swiglu_create_tensors,

    # binary ops
    "add": add_create_tensors,
    "mul": add_create_tensors,
    "sub": add_create_tensors,
    "div": add_create_tensors,

    # reduction ops
    "layernorm": layer_norm_create_tensors,
    "softmax": softmax_create_tensors,
    "reduce_sum": reduce_sum_create_tensors,
    "reduce_min": reduce_min_create_tensors,
    "reduce_max": reduce_min_create_tensors,

    # index ops
    "index_add": index_add_create_tensors,
    "sort": sort_create_tensors,
    "unique": unique_create_tensors,
    "scatter": scatter_create_tensors,
    "gather": scatter_create_tensors,

    # h2d_ops
    "host2device": host2device_create_tensors,
    "device2host": host2device_create_tensors,

    # ccl_ops
    "broadcast": allreduce_create_tensors,
    "allreduce": allreduce_create_tensors,
    "allgather": allgather_create_tensors,
    "alltoall": alltoall_create_tensors,
    "reducescatter": allgather_create_tensors,
    "p2p": p2p_create_tensors,
}
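Note: as a reading aid, the sketch below shows how these three registries are typically consumed together: look up the op module, build its tensors, and time the forward call. The run_once helper, the device string, the iteration count, and the timing loop are illustrative assumptions for this example, not part of the committed file, which only defines the registries.

# Illustrative sketch only: drive one op end-to-end using the registries above.
import time
import torch
from backends.module_store import (
    op_registry, op_compute_size_funcs, op_create_tensors_funcs
)

def run_once(op_name, input_shapes, torch_dtype, device="cuda"):
    op = op_registry[op_name]
    tensors = op_create_tensors_funcs[op_name](input_shapes, torch_dtype, device)
    _, tensor_size, _, _ = op_compute_size_funcs[op_name](input_shapes, torch_dtype)

    # time a handful of iterations; a real backend would also synchronize the device here
    start = time.perf_counter()
    for _ in range(10):
        op(*tensors)
    latency_us = (time.perf_counter() - start) / 10 * 1e6
    return tensor_size, latency_us

if __name__ == "__main__":
    # e.g. a [1024, 1024] x [1024, 1024] fp16 gemm
    size, lat = run_once("gemm", [[1024, 1024], [1024, 1024]], torch.float16)
    print(size, lat)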
ByteMLPerf/byte_micro_perf/backends/utils.py  0 → 100644
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from typing import List

import numpy as np
import torch

from backends import module_store


def dump_communication_ops_report(
    op_name: str,
    torch_dtype,
    input_shapes: List[List[int]],
    compute_size_func,
    group_size: int,
    bandwidth_limit: float,
    latency: float,
    error: str = ""
):
    # get dtype name and dtype_size
    dtype_name = str(torch_dtype).split(".")[-1]
    dtype_size = torch.tensor([], dtype=torch_dtype).element_size()
    element_num = math.prod(input_shapes[0])
    tensor_size = dtype_size * element_num

    mb = tensor_size / 1024 / 1024

    if error == "":
        algo_bw = tensor_size / latency / 1e3

        """
        allreduce:      2 * (group_size - 1) * (tensor_size / group_size)
        allgather:      1 * (group_size - 1) * (tensor_size / group_size)
        reducescatter:  1 * (group_size - 1) * (tensor_size / group_size)
        alltoall:       1 * (group_size - 1) * (tensor_size / group_size)
        broadcast:      tensor_size
        p2p:            tensor_size
        """
        if op_name in ["allgather", "reducescatter", "alltoall"]:
            bus_bw = algo_bw * (group_size - 1) / group_size
        elif op_name in ["allreduce"]:
            bus_bw = 2 * algo_bw * (group_size - 1) / group_size
        elif op_name in ["broadcast", "p2p", "device2host", "host2device"]:
            bus_bw = algo_bw

        bandwidth_utils = None
        if bandwidth_limit is not None:
            bandwidth_utils = round((algo_bw / bandwidth_limit) * 1e2, 2)

        report = {
            "Dtype": str(dtype_name),
            "Tensor Shapes": input_shapes,
            "Memory Size(MB)": round(mb, 2),
            "Group": group_size,
            "Kernel bandwidth(GB/s)": round(algo_bw, 2),
            "Bus bandwidth(GB/s)": round(bus_bw, 2),
            "Bandwidth Utilization(%)": bandwidth_utils,
            "Avg latency(us)": round(latency, 2),
        }
    else:
        report = {
            "Dtype": str(dtype_name),
            "Tensor Shapes": input_shapes,
            "Memory Size(MB)": round(mb, 2),
            "Group": group_size,
            "Kernel bandwidth(GB/s)": 0,
            "Bus bandwidth(GB/s)": 0,
            "Bandwidth Utilization(%)": None,
            "Avg latency(us)": 0,
            "Error": error,
        }
    return report


def dump_computation_ops_report(
    op_name: str,
    torch_dtype: str,
    input_shapes: List[List[int]],
    compute_size_func,
    bandwidth_limit: float,
    latency: float,
    error: str = ""
):
    # get dtype name and dtype_size
    dtype_name = str(torch_dtype).split(".")[-1]
    batch_size, tensor_size, input_tensor_size, output_tensor_size = compute_size_func(input_shapes, torch_dtype)

    if error == "":
        qps = round(1e6 / latency * batch_size, 2)
        algo_bw = tensor_size / latency / 1e3

        bandwidth_utils = None
        if bandwidth_limit is not None:
            bandwidth_utils = round((algo_bw / bandwidth_limit) * 1e2, 2)

        report = {
            "Dtype": str(dtype_name),
            "Tensor Shapes": input_shapes,
            "Read IO Size(MB)": round(input_tensor_size / 1024 / 1024, 2),
            "Write IO Size(MB)": round(output_tensor_size / 1024 / 1024, 2),
            "Memory Size(MB)": round(tensor_size / 1024 / 1024, 2),
            "Kernel bandwidth(GB/s)": round(algo_bw, 2),
            "Bandwidth Utilization(%)": bandwidth_utils,
            "Avg latency(us)": round(latency, 2),
            "QPS": qps,
        }
    else:
        report = {
            "Dtype": str(dtype_name),
            "Tensor Shapes": input_shapes,
            "Read IO Size(MB)": round(input_tensor_size / 1024 / 1024, 2),
            "Write IO Size(MB)": round(output_tensor_size / 1024 / 1024, 2),
            "Memory Size(MB)": round(tensor_size / 1024 / 1024, 2),
            "Kernel bandwidth(GB/s)": 0,
            "Bandwidth Utilization(%)": None,
            "Avg latency(us)": 0,
            "QPS": 0,
            "Error": error,
        }
    return report
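Note: a minimal hedged example of how dump_computation_ops_report turns a measured latency into the report dictionary above. The shapes, the latency value, and the bandwidth limit are invented numbers for illustration only; only the function signature comes from this file.

# Illustrative only: feed a hypothetical measurement into the report helper above.
import torch
from backends.utils import dump_computation_ops_report
from backends.module_store import op_compute_size_funcs

report = dump_computation_ops_report(
    op_name="gemm",
    torch_dtype=torch.float16,
    input_shapes=[[4096, 4096], [4096, 4096]],
    compute_size_func=op_compute_size_funcs["gemm"],
    bandwidth_limit=1600.0,   # GB/s, hypothetical device peak
    latency=250.0,            # us, hypothetical measurement
)
# "Kernel bandwidth(GB/s)" is tensor_size / latency / 1e3,
# i.e. total bytes moved divided by the measured time.
print(report)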
ByteMLPerf/byte_micro_perf/core/perf_engine.py  0 → 100644
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
sys
import
json
import
time
import
datetime
import
signal
import
argparse
import
importlib
import
logging
import
subprocess
import
pathlib
import
traceback
import
random
from
typing
import
Any
,
Dict
,
List
import
itertools
from
collections
import
namedtuple
import
torch.distributed
import
torch.multiprocessing
as
mp
import
virtualenv
import
torch
# directory config
CUR_DIR
=
pathlib
.
Path
.
cwd
().
absolute
()
FILE_DIR
=
pathlib
.
Path
(
__file__
).
parent
.
absolute
()
BYTE_MLPERF_ROOT
=
FILE_DIR
.
parent
sys
.
path
.
insert
(
0
,
str
(
BYTE_MLPERF_ROOT
))
# logger config
logging
.
basicConfig
(
level
=
logging
.
INFO
)
logger
=
logging
.
getLogger
(
"PerfEngine"
)
def
get_args
():
parser
=
argparse
.
ArgumentParser
()
# hardware config
parser
.
add_argument
(
"--hardware_type"
,
default
=
"GPU"
,
help
=
"The backend going to be evaluted, refs to backends/"
,
)
parser
.
add_argument
(
"--vendor_path"
,
help
=
"The hardware configs need to be loaded, refs to vendor_zoo/NVIDIA/A100-PCIe.json"
,
)
# task config
parser
.
add_argument
(
"--task_dir"
,
default
=
str
(
BYTE_MLPERF_ROOT
.
joinpath
(
"workloads"
)),
help
=
"The direcotry of tasks going to be evaluted, e.g., set to workloads"
)
parser
.
add_argument
(
"--task"
,
default
=
"gemm"
,
help
=
"The task going to be evaluted, refs to workloads/"
,
)
# feature control
parser
.
add_argument
(
"--parallel"
,
type
=
int
,
default
=
1
,
help
=
"Run all tasks in parallel if available"
)
parser
.
add_argument
(
"--activate_venv"
,
action
=
"store_true"
,
help
=
"Enable virtual environment to run the task"
,
)
args
=
parser
.
parse_args
()
return
args
def
load_workload
(
task
:
str
,
task_dir
:
str
)
->
Dict
[
str
,
Any
]:
"""
Return a list of dictionary with model Configuration
Args: List[str]
Returns: List[dic]
"""
modules_dir
=
pathlib
.
Path
(
task_dir
).
absolute
()
# create empty workload json data
workload_dict
=
{}
for
file
in
modules_dir
.
iterdir
():
if
(
file
.
stem
.
startswith
(
'_'
)
or
file
.
stem
.
startswith
(
'.'
)
or
file
.
is_dir
()
or
file
.
suffix
!=
'.json'
or
file
.
stem
!=
task
):
continue
workload_dict
=
json
.
loads
(
file
.
read_text
())
if
not
workload_dict
:
logger
.
error
(
f
"could not find
{
task
}
.json in
{
modules_dir
}
."
)
exit
(
1
)
return
workload_dict
def
parse_workload
(
workload
):
shape_list
=
[]
if
"input_shape_groups"
in
workload
:
input_shape_groups
=
workload
[
"input_shape_groups"
]
if
isinstance
(
workload
[
"input_shape_groups"
],
list
)
else
[
workload
[
"input_shape_groups"
]]
for
input_shape_group
in
input_shape_groups
:
if
"inputs"
in
input_shape_group
:
input_shape_list
=
[]
for
input_shapes
in
input_shape_group
[
"inputs"
]:
input_shape_list
.
append
([
list
(
shape
)
for
shape
in
itertools
.
product
(
*
input_shapes
)])
if
len
(
input_shape_list
)
==
1
:
shape_list
.
extend
(
input_shape_list
[
0
])
else
:
shape_list
.
extend
([
list
(
input_shape
)
for
input_shape
in
zip
(
*
input_shape_list
)])
else
:
gemm_keys
=
[
"M"
,
"K"
,
"N"
,
"MN"
,
"MK"
,
"KN"
]
gemm_values
=
[
input_shape_group
.
get
(
k
,
[])
for
k
in
gemm_keys
]
if
any
(
gemm_values
):
m
,
k
,
n
,
mn
,
mk
,
kn
=
gemm_values
# batch gemm
if
"batch_size"
in
input_shape_group
:
bs
=
input_shape_group
.
get
(
"batch_size"
,
[])
if
m
and
n
and
k
:
for
p
in
itertools
.
product
(
bs
,
m
,
k
,
n
):
shape_list
.
append
([[
p
[
0
],
p
[
1
],
p
[
2
]],
[
p
[
0
],
p
[
2
],
p
[
3
]]])
if
mn
and
k
:
for
p
in
itertools
.
product
(
bs
,
mn
,
k
):
shape_list
.
append
([[
p
[
0
],
p
[
1
][
0
],
p
[
2
]],
[
p
[
0
],
p
[
2
],
p
[
1
][
1
]]])
if
mk
and
n
:
for
p
in
itertools
.
product
(
bs
,
mk
,
n
):
shape_list
.
append
([[
p
[
0
],
p
[
1
][
0
],
p
[
1
][
1
]],
[
p
[
0
],
p
[
1
][
1
],
p
[
2
]]])
if
m
and
kn
:
for
p
in
itertools
.
product
(
bs
,
m
,
kn
):
shape_list
.
append
([[
p
[
0
],
p
[
1
],
p
[
2
][
0
]],
[
p
[
0
],
p
[
2
][
0
],
p
[
2
][
1
]]])
# group gemm
elif
"gemm_group"
in
input_shape_group
:
groups
=
input_shape_group
.
get
(
"gemm_group"
,
[])
batches
=
input_shape_group
.
get
(
"batch"
,
[])
kn
=
input_shape_group
.
get
(
"KN"
,
[])
if
k
and
n
:
kn
.
append
([
list
(
shape
)
for
shape
in
itertools
.
product
(
k
,
n
)])
for
batch
in
batches
:
for
_kn
in
kn
:
group_input_shape_list
=
[]
for
group
in
groups
:
group_input_shape_list
.
append
([[
group
*
batch
,
_kn
[
0
]],
[
_kn
[
0
],
_kn
[
1
]]])
shape_list
.
append
(
group_input_shape_list
)
# gemm
else
:
if
m
and
n
and
k
:
for
p
in
itertools
.
product
(
m
,
k
,
n
):
shape_list
.
append
([[
p
[
0
],
p
[
1
]],
[
p
[
1
],
p
[
2
]]])
if
mn
and
k
:
for
p
in
itertools
.
product
(
mn
,
k
):
shape_list
.
append
([[
p
[
0
][
0
],
p
[
1
]],
[
p
[
1
],
p
[
0
][
1
]]])
if
mk
and
n
:
for
p
in
itertools
.
product
(
mk
,
n
):
shape_list
.
append
([[
p
[
0
][
0
],
p
[
0
][
1
]],
[
p
[
0
][
1
],
p
[
1
]]])
if
m
and
kn
:
for
p
in
itertools
.
product
(
m
,
kn
):
shape_list
.
append
([[
p
[
0
],
p
[
1
][
0
]],
[
p
[
1
][
0
],
p
[
1
][
1
]]])
return
shape_list
ConfigInstance
=
namedtuple
(
"ConfigInstance"
,
[
"dtype"
,
"tensor_shapes"
,
"index"
,
"total"
])
ResultItem
=
namedtuple
(
"ResultItem"
,
[
"config"
,
"report"
])
class
PerfEngine
:
def
__init__
(
self
)
->
None
:
super
().
__init__
()
self
.
args
=
get_args
()
self
.
workload
=
load_workload
(
self
.
args
.
task
,
self
.
args
.
task_dir
)
self
.
backend_type
=
self
.
args
.
hardware_type
self
.
old_os_path
=
os
.
environ
[
"PATH"
]
self
.
prev_sys_path
=
list
(
sys
.
path
)
self
.
real_prefix
=
sys
.
prefix
self
.
version
=
self
.
get_version
()
def
get_version
(
self
):
version
=
""
try
:
version_file
=
os
.
path
.
join
(
str
(
BYTE_MLPERF_ROOT
),
"../VERSION"
)
with
open
(
version_file
)
as
f
:
_version
=
f
.
read
().
splitlines
()
version
=
'.'
.
join
(
v
.
split
(
'='
)[
1
]
for
v
in
_version
)
except
Exception
as
e
:
traceback
.
print_exc
()
logger
.
warning
(
f
"get bytemlperf version failed, error msg:
{
e
}
"
)
return
version
def
get_cpu_name
(
self
):
command
=
"lscpu | grep 'Model name' | awk -F: '{print $2}'"
cpu_name
=
subprocess
.
check_output
(
command
,
shell
=
True
)
return
cpu_name
.
decode
().
strip
()
def
start_engine
(
self
)
->
None
:
if
self
.
args
.
activate_venv
:
self
.
activate_venv
(
self
.
backend_type
)
# init backend
hardware_type
=
self
.
backend_type
logger
.
info
(
"Loading Heterogeneous Backend: {}"
.
format
(
hardware_type
))
backend_module
=
importlib
.
import_module
(
"backends."
+
hardware_type
+
".backend_"
+
hardware_type
.
lower
())
self
.
backend_class
=
getattr
(
backend_module
,
"Backend"
+
hardware_type
)
self
.
backend
=
self
.
backend_class
(
self
.
workload
,
self
.
args
.
vendor_path
)
# create output dir based on task
# {BYTEMLPERF_ROOT}/byte_micro_perf/reports/{backend_type}/{task_name}
hardware_reports_dir
=
BYTE_MLPERF_ROOT
.
joinpath
(
"reports"
,
self
.
backend_type
)
output_dir
=
BYTE_MLPERF_ROOT
.
joinpath
(
"reports"
,
self
.
backend_type
,
self
.
workload
[
"operator"
]
)
output_dir
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
# get input shape info
target_group_list
=
self
.
workload
.
get
(
"group"
,
[
1
])
target_group_list
.
sort
()
device_count
=
getattr
(
self
.
backend
,
"get_device_count"
)()
group_list
=
[]
for
group
in
target_group_list
:
if
group
<=
device_count
:
group_list
.
append
(
group
)
else
:
break
dtype_list
=
self
.
workload
.
get
(
"dtype"
,
[
"float32"
])
shape_list
=
parse_workload
(
self
.
workload
)
if
not
group_list
or
not
dtype_list
or
not
shape_list
:
logger
.
error
(
"empty group/dtype/shape"
)
exit
(
1
)
test_list
=
[]
case_index
=
0
for
dtype
in
dtype_list
:
for
shape
in
shape_list
:
test_list
.
append
(
ConfigInstance
(
dtype
,
shape
,
case_index
+
1
,
len
(
dtype_list
)
*
len
(
shape_list
)))
case_index
=
case_index
+
1
try
:
mp
.
set_start_method
(
"spawn"
,
force
=
True
)
except
Exception
as
e
:
traceback
.
print_exc
()
logger
.
error
(
f
"Set start method failed, error msg:
{
e
}
"
)
# terminate subprocesses
subprocess_pids
=
[]
def
signal_handler
(
signum
,
frame
):
logger
.
info
(
f
"Received signal
{
signum
}
, exiting..."
)
if
subprocess_pids
:
for
pid
in
subprocess_pids
:
logger
.
info
(
f
"terminate subprocess:
{
pid
}
"
)
os
.
kill
(
pid
,
signal
.
SIGTERM
)
sys
.
exit
(
0
)
signal
.
signal
(
signal
.
SIGINT
,
signal_handler
)
signal
.
signal
(
signal
.
SIGTERM
,
signal_handler
)
# all operations will enter subprocess to test in parallel
for
group
in
group_list
:
logger
.
info
(
f
"Start to test group size:
{
group
}
"
)
instance_num
=
min
(
device_count
,
max
(
1
,
self
.
args
.
parallel
))
if
group
==
1
else
group
if
self
.
workload
[
"operator"
]
in
[
"device2host"
,
"host2device"
]:
instance_num
=
1
input_queues
=
mp
.
Queue
()
output_queues
=
mp
.
Queue
(
maxsize
=
1
)
try
:
_subprocesses
=
mp
.
spawn
(
fn
=
self
.
perf_func
,
args
=
(
instance_num
,
group
,
output_dir
,
test_list
,
input_queues
,
output_queues
),
nprocs
=
instance_num
,
join
=
False
,
daemon
=
False
)
subprocess_pids
=
_subprocesses
.
pids
()
for
_
in
range
(
instance_num
):
assert
"ready"
==
output_queues
.
get
()
logger
.
info
(
"all ranks are ready and listening, init done"
)
start_time
=
time
.
perf_counter_ns
()
if
group
==
1
:
for
test_instance
in
test_list
:
input_queues
.
put
(
test_instance
,
False
)
for
_
in
range
(
instance_num
):
input_queues
.
put
(
None
,
False
)
result_list
=
[]
if
group
==
1
:
for
_
in
range
(
instance_num
):
result_list
.
extend
(
output_queues
.
get
())
elif
group
>
1
:
result_list
.
extend
(
output_queues
.
get
())
result_list
=
sorted
(
result_list
,
key
=
lambda
x
:
x
.
config
.
index
)
dtype_results_mapping
=
{}
for
result
in
result_list
:
if
result
.
config
.
dtype
not
in
dtype_results_mapping
:
dtype_results_mapping
[
result
.
config
.
dtype
]
=
[]
dtype_results_mapping
[
result
.
config
.
dtype
].
append
(
result
)
for
dtype
,
results
in
dtype_results_mapping
.
items
():
dtype_results_mapping
[
dtype
]
=
sorted
(
results
,
key
=
lambda
x
:
x
.
config
.
index
)
base_report
=
{
"Operator"
:
self
.
workload
[
"operator"
].
upper
(),
"Backend"
:
self
.
backend_type
,
"Host Info"
:
self
.
get_cpu_name
(),
"Device Info"
:
getattr
(
self
.
backend
,
"get_device_name"
)(),
"Version"
:
self
.
version
,
"Execution Date"
:
time
.
strftime
(
"%Y-%m-%d %H:%M:%S"
),
"Performance"
:
[
result
.
report
for
result
in
dtype_results_mapping
[
dtype
]]
}
filename
=
(
f
"result-
{
str
(
dtype
)
}
"
+
(
f
"-group
{
group
}
"
if
group
>
1
else
""
)
+
".json"
)
filepath
=
output_dir
.
joinpath
(
filename
)
with
open
(
filepath
,
"w"
)
as
f
:
json
.
dump
(
base_report
,
f
,
indent
=
4
)
for
process
in
_subprocesses
.
processes
:
process
.
join
()
end_time
=
time
.
perf_counter_ns
()
duration
=
(
end_time
-
start_time
)
/
1e9
duration
=
round
(
duration
,
3
)
current_time
=
datetime
.
datetime
.
now
().
strftime
(
"%Y-%m-%d %H:%M:%S"
)
ret_code
=
0
for
process
in
_subprocesses
.
processes
:
if
process
.
exitcode
!=
0
:
ret_code
=
process
.
exitcode
break
if
ret_code
!=
0
:
with
open
(
f
"
{
hardware_reports_dir
}
/_run_report.log"
,
"a"
)
as
f
:
print
(
f
"[failed]
{
self
.
args
.
task
}
, group_size=
{
group
}
,
{
current_time
}
,
{
duration
}
s"
,
file
=
f
)
else
:
with
open
(
f
"
{
hardware_reports_dir
}
/_run_report.log"
,
"a"
)
as
f
:
print
(
f
"[success]
{
self
.
args
.
task
}
, group_size=
{
group
}
,
{
current_time
}
,
{
duration
}
s"
,
file
=
f
)
except
Exception
as
e
:
traceback
.
print_exc()
                logger.error(f"Execute task: {self.args.task} failed, group: {group}, error msg: {e}")
                current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                with open(f"{hardware_reports_dir}/_run_report.log", "a") as f:
                    print(f"[error] {self.args.task}, group_size={group}, {current_time}", file=f)

            subprocess_pids = []
            time.sleep(1)

        if self.args.activate_venv:
            self.deactivate_venv()

    def perf_func(self, rank: int, *args):
        world_size, group_size, output_dir, test_list, input_queues, output_queues = args

        backend_instance = self.backend_class(self.workload, self.args.vendor_path)
        backend_instance.rank = rank
        backend_instance.world_size = world_size
        backend_instance.set_device(rank)

        if group_size > 1:
            backend_instance.initialize_ccl(rank, world_size)

        op_name = self.workload["operator"]
        backend_instance.get_op_instance()
        output_queues.put("ready")

        result_list = []
        if group_size == 1:
            while True:
                test_instance = input_queues.get()
                if test_instance is None:
                    break

                test_dtype = test_instance.dtype
                test_shape = test_instance.tensor_shapes
                """
                input_shape could be:
                    List[int]: single shape. cos
                    List[List[int]]: multiple inputs. add
                    List[List[List[int]]]: multiple inputs with multiple problems. group_gemm
                """
                if isinstance(test_shape[0], int):
                    test_shape = [test_shape]

                try:
                    reports = backend_instance.perf(test_shape, test_dtype)
                except Exception as e:
                    traceback.print_exc()
                    logger.error(f"Execute op: {op_name.lower()} failed, input_shape: {test_shape}, dtype: {test_dtype}, error msg: {e}")
                    reports = {}

                if reports and "Error" not in reports:
                    result_list.append(ResultItem(test_instance, reports))

                    latency = reports.get("Avg latency(us)", 0)
                    kernel_bw = reports.get("Kernel bandwidth(GB/s)", 0)
                    bus_bw = reports.get("Bus bandwidth(GB/s)", 0)
                    print(f"rank {rank}, {test_instance}, latency: {latency}\nkernel_bw: {kernel_bw}, bus_bw: {bus_bw}")
                else:
                    print(f"rank {rank}, {test_instance}, error")

            output_queues.put(result_list)

        elif group_size > 1:
            for test_instance in test_list:
                test_dtype = test_instance.dtype
                test_shape = test_instance.tensor_shapes
                """
                input_shape could be:
                    List[int]: single shape. cos
                    List[List[int]]: multiple inputs. add
                    List[List[List[int]]]: multiple inputs with multiple problems. group_gemm
                """
                if isinstance(test_shape[0], int):
                    test_shape = [test_shape]

                try:
                    reports = backend_instance.perf(test_shape, test_dtype)
                except Exception as e:
                    traceback.print_exc()
                    logger.error(f"Execute op: {op_name.lower()} failed, input_shape: {test_shape}, dtype: {test_dtype}, error msg: {e}")
                    reports = {}

                if reports and "Error" not in reports:
                    result_list.append(ResultItem(test_instance, reports))

                    latency = reports.get("Avg latency(us)", 0)
                    kernel_bw = reports.get("Kernel bandwidth(GB/s)", 0)
                    bus_bw = reports.get("Bus bandwidth(GB/s)", 0)
                    if rank == 0:
                        print(f"rank {rank}, {test_instance}, latency: {latency}\nkernel_bw: {kernel_bw}, bus_bw: {bus_bw}")
                else:
                    if rank == 0:
                        print(f"rank {rank}, {test_instance}, error")

            if rank == 0:
                output_queues.put(result_list)

        if group_size > 1:
            backend_instance.destroy_process_group()

    def activate_venv(self, hardware_type: str) -> bool:
        if os.path.exists("backends/" + hardware_type + "/requirements.txt"):
            logger.info("Activating Virtual Env for " + hardware_type)

            venv_dir = os.path.join("backends", hardware_type + "/venv")
            activate_file = os.path.join(venv_dir, "bin", "activate_this.py")
            if not os.path.exists(venv_dir):
                logger.info("venv not exist, Creating Virtual Env for " + hardware_type)

                virtualenv.create_environment(venv_dir, True)
                exec(open(activate_file).read(), {"__file__": activate_file})
                python_path = os.path.join(venv_dir, "bin", "python3")
                subprocess.call([python_path, "-m", "pip", "install", "--upgrade", "pip", "--quiet"])
                subprocess.call([python_path, "-m", "pip", "install", "-r", "backends/" + hardware_type + "/requirements.txt", "-q"])
            else:
                exec(open(activate_file).read(), {"__file__": activate_file})
                """
                just in case install failed in pre-run.
                """
                python_path = os.path.join(venv_dir, "bin", "python3")
                subprocess.call([python_path, "-m", "pip", "install", "--upgrade", "pip", "--quiet"])
                subprocess.call([python_path, "-m", "pip", "install", "-r", "backends/" + hardware_type + "/requirements.txt", "-q"])

            if not hasattr(sys, "real_prefix"):
                return False
            return True
        return True

    def deactivate_venv(self):
        sys.path[:0] = self.prev_sys_path  # will also revert the added site-packages
        sys.prefix = self.real_prefix
        os.environ["PATH"] = self.old_os_path


if __name__ == "__main__":
    engine = PerfEngine()
    engine.start_engine()
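A detail of perf_func that is easy to miss above is the shape normalization: a bare List[int] (e.g. a cos workload) is wrapped once so that backend_instance.perf always receives a list of input shapes, while add-style and group_gemm-style shapes pass through unchanged. A minimal standalone sketch of that behavior, with illustrative shapes that are not taken from any workload file:

def normalize_shapes(tensor_shapes):
    # List[int] (single input, e.g. cos) becomes List[List[int]];
    # List[List[int]] (multiple inputs, e.g. add) and
    # List[List[List[int]]] (grouped problems, e.g. group_gemm) pass through unchanged.
    if isinstance(tensor_shapes[0], int):
        return [tensor_shapes]
    return tensor_shapes

print(normalize_shapes([1024, 8192]))                  # [[1024, 8192]]
print(normalize_shapes([[1024, 8192], [1024, 8192]]))  # unchanged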
ByteMLPerf/byte_micro_perf/launch.py
0 → 100644
View file @
24b257f1
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import argparse
import pathlib
import logging
import subprocess
import signal

# directory config
CUR_DIR = pathlib.Path.cwd().absolute()
FILE_DIR = pathlib.Path(__file__).parent.absolute()
BYTE_MLPERF_ROOT = FILE_DIR
sys.path.insert(0, str(BYTE_MLPERF_ROOT))

# logger config
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("launch")


def parse_task(task_dir):
    tasks = []
    if os.path.isdir(task_dir):
        for root, _, files in os.walk(task_dir, topdown=False):
            for name in files:
                if name.endswith(".json"):
                    tasks.append(name.rsplit('.', 1)[0])
    return tasks


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # hardware config
    parser.add_argument(
        "--hardware_type",
        default="GPU",
        help="The backend going to be evaluated, refs to backends/",
    )
    parser.add_argument(
        "--vendor_path",
        help="The hardware configs need to be loaded, refs to vendor_zoo/",
    )

    # task config
    parser.add_argument(
        "--task_dir",
        default=str(BYTE_MLPERF_ROOT.joinpath("workloads").absolute()),
        help="The directory of tasks going to be evaluated, e.g., set to workloads"
    )
    parser.add_argument(
        "--task",
        default="all",
        help="The task going to be evaluated, refs to workloads/, defaults to all tasks in workloads/"
    )

    # list all supported task and hardware
    parser.add_argument(
        "--show_task_list",
        action="store_true",
        help="Print all available task names"
    )
    parser.add_argument(
        "--show_hardware_list",
        action="store_true",
        help="Print all hardware backends bytemlperf supports",
    )

    # feature control
    parser.add_argument(
        "--parallel",
        type=int,
        default=1,
        help="Run all tasks in parallel if available"
    )
    parser.add_argument(
        "--install_requirements",
        action="store_true",
        help="Install all required packages"
    )
    parser.add_argument(
        "--activate_venv",
        action="store_true",
        help="Enable python virtual environment"
    )
    args = parser.parse_args()

    args.vendor_path = pathlib.Path(args.vendor_path).absolute() if args.vendor_path else None
    args.task_dir = pathlib.Path(args.task_dir).absolute()
    os.chdir(str(BYTE_MLPERF_ROOT))

    # show tasks
    task_list = [file.stem for file in args.task_dir.iterdir()]
    task_list.sort()
    task_mapping = {
        "all": task_list,
        "gemm_ops": [],
        "unary_ops": [],
        "binary_ops": [],
        "reduction_ops": [],
        "index_ops": [],
        "h2d_ops": [],
        "ccl_ops": []
    }
    for task in task_list:
        if task in ["gemm", "gemv", "batch_gemm", "group_gemm"]:
            task_mapping["gemm_ops"].append(task)
        if task in ["sin", "cos", "exp", "exponential", "log", "sqrt", "cast", "silu", "gelu", "swiglu"]:
            task_mapping["unary_ops"].append(task)
        if task in ["add", "mul", "sub", "div"]:
            task_mapping["binary_ops"].append(task)
        if task in ["layernorm", "softmax", "reduce_sum", "reduce_max", "reduce_min"]:
            task_mapping["reduction_ops"].append(task)
        if task in ["index_add", "sort", "unique", "gather", "scatter"]:
            task_mapping["index_ops"].append(task)
        if task in ["host2device", "device2host", "device2device"]:
            task_mapping["h2d_ops"].append(task)
        if task in ["allgather", "allreduce", "alltoall", "broadcast", "p2p", "reduce_scatter"]:
            task_mapping["ccl_ops"].append(task)

    if args.show_task_list:
        logger.info("******************* Supported Task *******************")
        print(task_list)
        exit(0)

    # show hardwares
    hardware_list = []
    for file in BYTE_MLPERF_ROOT.joinpath("backends").iterdir():
        if file.is_dir() and file.stem.startswith("_") is False:
            hardware_list.append(file.stem)
    if args.show_hardware_list:
        logger.info("***************** Supported Hardware Backend *****************")
        print(hardware_list)
        exit(0)

    # check task
    test_cases = []
    if args.task in task_mapping.keys():
        test_cases = task_mapping[args.task]
    else:
        specified_tasks = args.task.split(",")
        for task in specified_tasks:
            if task not in task_list:
                logger.error(f"Task {task} not found in {args.task_dir}")
                exit(1)
            test_cases.append(task)
    logger.info(f"******************* Tasks: *****************")
    logger.info(f"{test_cases}\n")

    # check hardware
    hardware = args.hardware_type
    if hardware not in hardware_list:
        logger.error(f"Hardware {hardware} not found in {BYTE_MLPERF_ROOT.joinpath('backends')}")
        exit(1)
    logger.info(f"******************* hardware: *****************")
    logger.info(f"{hardware}\n")

    if args.install_requirements:
        logger.info("******************* Pip Package Installing *******************")
        subprocess.run(["python3", "-m", "pip", "install", "pip", "--upgrade", "--quiet"])
        subprocess.run(["python3", "-m", "pip", "install", "-r", "requirements.txt", "--quiet"])
        if not args.activate_venv:
            subprocess.run(["python3", "-m", "pip", "install", "-r", f"backends/{hardware}/requirements.txt", "--quiet"])

    outputs_dir = pathlib.Path(BYTE_MLPERF_ROOT).joinpath("reports", args.hardware_type)
    if not outputs_dir.exists():
        outputs_dir.mkdir(parents=True)
    with open(f"{BYTE_MLPERF_ROOT}/reports/{args.hardware_type}/_run_report.log", "w") as file:
        pass

    # terminate task perf process
    subprocess_pid = -1

    def signal_handler(signum, frame):
        logger.info(f"Received signal {signum}, exiting...")
        if subprocess_pid != -1:
            logger.info(f"terminate subprocess: {subprocess_pid}")
            os.kill(subprocess_pid, signal.SIGTERM)
        sys.exit(0)

    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    failed_ops = []
    for task in test_cases:
        cmds = [
            "python3", "./core/perf_engine.py",
            "--hardware_type", args.hardware_type,
            "--vendor_path", str(args.vendor_path),
            "--task", task,
            "--task_dir", str(args.task_dir),
            "--parallel", str(args.parallel)
        ]
        if args.activate_venv:
            cmds.append("--activate_venv")

        print(f"******************************************* Start to test op: [{task}]. *******************************************")
        process = subprocess.Popen(cmds)
        subprocess_pid = process.pid
        ret = process.wait()
        if ret != 0:
            failed_ops.append(task)
        print("")

    if failed_ops:
        logger.error(f"Failed ops: {failed_ops}")
        exit(1)
    else:
        logger.info("All ops passed")
ByteMLPerf/byte_micro_perf/requirements.txt
0 → 100644
View file @
24b257f1
matplotlib
pandas
virtualenv==16.7.12
scikit-learn
prompt_toolkit
tqdm
opencv-python
transformers
tokenization
fpdf
attrs
decorator
typing-extensions
pydot
\ No newline at end of file
ByteMLPerf/byte_micro_perf/run.sh
0 → 100644
View file @
24b257f1
source /home/workspace/dtk-24.04.3/env.sh
python3 ./launch.py --parallel 8
ByteMLPerf/byte_micro_perf/scripts/convert.py
0 → 100644
View file @
24b257f1
import sys
import csv
import json
import pathlib
import argparse
import logging

CUR_DIR = pathlib.Path(__file__).parent.absolute()
PRJ_ROOT_DIR = CUR_DIR.parent
sys.path.insert(0, str(PRJ_ROOT_DIR))

unique_attrs = [
    "op_name",
    "sku_name",
    "owner",
    "perf_mode"
]

def get_unique_key(op_name, sku_name, owner, perf_mode, *args, **kwargs):
    return ".".join([sku_name, owner, op_name, perf_mode]).replace(" ", "_")

arguments_map = {
    # unary operators
    # [batch, len] --> [batch, len]
    "sin": ["dtype", "batch", "len"],
    "cos": ["dtype", "batch", "len"],
    "exp": ["dtype", "batch", "len"],
    "exponential": ["dtype", "batch", "len"],
    "silu": ["dtype", "batch", "len"],
    "gelu": ["dtype", "batch", "len"],
    "swiglu": ["dtype", "batch", "len"],

    # float32: float32 --> float16/bfloat16
    # float16: float16 --> float32
    # bfloat16: bfloat16 --> float32
    "cast": ["dtype", "batch", "len"],

    # binary operators
    # [batch, len] (op) [batch, len] --> [batch, len]
    "add": ["dtype", "batch", "len"],
    "mul": ["dtype", "batch", "len"],
    "sub": ["dtype", "batch", "len"],
    "div": ["dtype", "batch", "len"],

    # reduction operators
    # [batch, len] --> [batch, len]
    "layernorm": ["dtype", "batch", "len"],
    "softmax": ["dtype", "batch", "len"],
    # [batch, len] --> [batch, 1]
    "reduce_sum": ["dtype", "batch", "len"],
    "reduce_min": ["dtype", "batch", "len"],
    "reduce_max": ["dtype", "batch", "len"],

    # index operators
    # [batch, len] (op) [batch] --> [batch, len]
    "index_add": ["dtype", "batch", "len"],
    # [batch, len] --> [batch, len]
    "sort": ["dtype", "batch", "len"],
    "unique": ["dtype", "batch", "len"],
    "gather": ["dtype", "batch", "len"],
    "scatter": ["dtype", "batch", "len"],

    # matrix operators
    # [M, K] * [K, N] --> [M, N]
    "gemm": ["dtype", "M", "N", "K"],
    # [batch, M, K] * [batch, K, N] --> [batch, M, N]
    "batch_gemm": ["dtype", "batch", "M", "N", "K"],
    # group * { [M, K] * [K, N] --> [M, N] }
    "group_gemm": ["dtype", "batch", "group", "M_str", "N", "K"],

    # communication (collective) operators
    # [batch, len] --> [batch, len]
    # tp_size split over batch
    "broadcast": ["dtype", "tp_size", "batch", "len"],
    "allreduce": ["dtype", "tp_size", "batch", "len"],
    "allgather": ["dtype", "tp_size", "batch", "len"],
    "alltoall": ["dtype", "tp_size", "batch", "len"],
    "reducescatter": ["dtype", "tp_size", "batch", "len"],
    "p2p": ["dtype", "tp_size", "batch", "len"],

    "device2host": ["dtype", "batch", "len"],
    "host2device": ["dtype", "batch", "len"]
}

target_attrs = [
    # latency in us
    "latency"
]

def get_csv_headers(op_name):
    return unique_attrs + arguments_map.get(op_name, []) + target_attrs

logger = logging.getLogger("bytemlperf_aeolus")

def setup_logger(loglevel: str):
    fmt = logging.Formatter(
        fmt="%(asctime)s.%(msecs)03d %(filename)s:%(lineno)d [%(levelname)s]: %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )
    handler = logging.StreamHandler(stream=sys.stdout)
    handler.setFormatter(fmt)
    logger.addHandler(handler)
    logger.setLevel(loglevel.upper())
    logger.propagate = False

sku_name_mapping = {
    "MLU590-M9": "MLU590 M9",
    "MLU590-M9D": "MLU590 M9D",
    "MLU590-M9DK": "MLU590 M9D",
    "Iluvatar BI-V150": "BI-V150",
    "NVIDIA A800-SXM4-80GB": "A800 80GB SXM",
    "NVIDIA H800": "H800 80GB SXM",
    "NVIDIA H20": "H20 96GB SXM",
    "Ascend910B2C": "Ascend910B2"
}

dtype_map = {
    "float": "float32",
    "half": "float16",
    "int": "int32"
}

def normal_ops_func(op, sku_name, frame, perf_mode, json_data):
    if not json_data or "Error" in json_data:
        return
    dtype = json_data["Dtype"]
    if dtype in dtype_map:
        dtype = dtype_map[dtype]
    batch = json_data["Tensor Shapes"][0][0]
    len = json_data["Tensor Shapes"][0][1]
    latency = json_data["Avg latency(us)"]
    return [op, sku_name, frame, perf_mode, dtype, batch, len, latency]

def gemm_func(op, sku_name, frame, perf_mode, json_data):
    if not json_data or "Error" in json_data:
        return
    dtype = json_data["Dtype"]
    if dtype in dtype_map:
        dtype = dtype_map[dtype]
    M = json_data["Tensor Shapes"][0][0]
    K = json_data["Tensor Shapes"][0][1]
    N = json_data["Tensor Shapes"][1][1]
    latency = json_data["Avg latency(us)"]
    return [op, sku_name, frame, perf_mode, dtype, M, N, K, latency]

def batch_gemm_func(op, sku_name, frame, perf_mode, json_data):
    if not json_data or "Error" in json_data:
        return
    dtype = json_data["Dtype"]
    if dtype in dtype_map:
        dtype = dtype_map[dtype]
    batch_size = json_data["Tensor Shapes"][0][0]
    M = json_data["Tensor Shapes"][0][1]
    K = json_data["Tensor Shapes"][0][2]
    N = json_data["Tensor Shapes"][1][2]
    latency = json_data["Avg latency(us)"]
    return [op, sku_name, frame, perf_mode, dtype, batch_size, M, N, K, latency]

def group_gemm_func(op, sku_name, frame, perf_mode, json_data):
    if not json_data or "Error" in json_data:
        return
    dtype = json_data["Dtype"]
    if dtype in dtype_map:
        dtype = dtype_map[dtype]
    batch_size = json_data["Tensor Shapes"][0][0][0]
    group = len(json_data["Tensor Shapes"])
    M_list = [int(json_data["Tensor Shapes"][i][0][0]) // batch_size for i in range(group)]
    M_list_str = "/".join([str(m) for m in M_list])
    K = json_data["Tensor Shapes"][0][0][1]
    N = json_data["Tensor Shapes"][0][1][1]
    latency = json_data["Avg latency(us)"]
    return [op, sku_name, frame, perf_mode, dtype, batch_size, group, M_list_str, N, K, latency]

def ccl_ops_func(op, sku_name, frame, perf_mode, json_data):
    if not json_data or "Error" in json_data:
        return
    dtype = json_data["Dtype"]
    if dtype in dtype_map:
        dtype = dtype_map[dtype]
    tp_size = json_data["Group"]
    batch = json_data["Tensor Shapes"][0][0]
    len = json_data["Tensor Shapes"][0][1]
    latency = json_data["Avg latency(us)"]
    return [op, sku_name, frame, perf_mode, dtype, tp_size, batch, len, latency]

def d2h_h2d_func(op, sku_name, frame, perf_mode, json_data):
    if not json_data or "Error" in json_data:
        return
    dtype = json_data["Dtype"]
    if dtype in dtype_map:
        dtype = dtype_map[dtype]
    batch = json_data["Tensor Shapes"][0][0]
    len = json_data["Tensor Shapes"][0][1]
    latency = json_data["Avg latency(us)"]
    return [op, sku_name, frame, perf_mode, dtype, batch, len, latency]

post_func_map = {
    "sin": normal_ops_func,
    "cos": normal_ops_func,
    "exp": normal_ops_func,
    "exponential": normal_ops_func,
    "silu": normal_ops_func,
    "gelu": normal_ops_func,
    "swiglu": normal_ops_func,
    "cast": normal_ops_func,
    "add": normal_ops_func,
    "mul": normal_ops_func,
    "sub": normal_ops_func,
    "div": normal_ops_func,
    "layernorm": normal_ops_func,
    "softmax": normal_ops_func,
    "reduce_sum": normal_ops_func,
    "reduce_min": normal_ops_func,
    "reduce_max": normal_ops_func,
    "index_add": normal_ops_func,
    "sort": normal_ops_func,
    "unique": normal_ops_func,
    "gather": normal_ops_func,
    "scatter": normal_ops_func,
    "gemm": gemm_func,
    "batch_gemm": batch_gemm_func,
    "group_gemm": group_gemm_func,
    "broadcast": ccl_ops_func,
    "allreduce": ccl_ops_func,
    "allgather": ccl_ops_func,
    "alltoall": ccl_ops_func,
    "reducescatter": ccl_ops_func,
    "p2p": ccl_ops_func,
    "device2host": d2h_h2d_func,
    "host2device": d2h_h2d_func
}

def postprocess(op, file_list, dst_dir):
    json_data_list = [json.load(open(file)) for file in file_list]
    if not json_data_list:
        logger.error(f"no data found in {file_list}")
        return

    sku_name = json_data_list[0]["Device Info"]
    sku_name = sku_name_mapping.get(sku_name, sku_name)

    perf_datas = []
    for json_data in json_data_list:
        if "Performance" not in json_data:
            logger.error(f"no performance data")
            continue
        perf_data = json_data["Performance"]
        if not perf_datas:
            perf_datas = perf_data
        else:
            perf_datas.extend(perf_data)

    unique_name = get_unique_key(op, sku_name, "torch", "host")
    unique_csv_file = f"{unique_name}.csv"
    unique_csv_path = dst_dir / unique_csv_file
    with open(unique_csv_path, "w") as f:
        writer = csv.writer(f)
        writer.writerow(get_csv_headers(op))
        for perf_data in perf_datas:
            if op in post_func_map:
                row = post_func_map[op](op, sku_name, "torch", "host", perf_data)
                if row:
                    writer.writerow(row)

def convert_src(src, dst):
    logger.info(f"src: {src}")
    logger.info(f"dst: {dst}")

    op_data_map = {}
    for file in src.rglob("*.json"):
        dir_name = file.parent.name
        if dir_name == "gemv":
            dir_name = "gemm"
        if not dir_name in op_data_map:
            op_data_map[dir_name] = []
        op_data_map[dir_name].append(file)

    for op, files in op_data_map.items():
        logger.info(f"op: {op}")
        if op not in arguments_map and op != "gemv":
            logger.error(f"invalid op: {op}")
            continue
        postprocess(op, files, dst)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--src", type=str, required=True)
    parser.add_argument("--output_dir", type=str, default="./temp")
    parser.add_argument("--log_level", type=str, default="INFO")
    args = parser.parse_args()

    setup_logger(args.log_level)

    src_dir = pathlib.Path(args.src).absolute()
    if not src_dir.exists():
        logger.error(f"{args.src} does not exist")
        exit(1)
    elif not src_dir.is_dir():
        logger.error(f"{args.src} is not a directory")
        exit(1)

    output_dir = pathlib.Path(args.output_dir).absolute()
    if not output_dir.exists():
        output_dir.mkdir(parents=True, exist_ok=True)
    elif not output_dir.is_dir():
        logger.error(f"{args.output_dir} is not a directory")
        exit(1)

    convert_src(src_dir, output_dir)
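As a quick sanity check of the conversion helpers above, the key and header logic can be exercised directly. This is only a sketch; the SKU name is one of the entries in sku_name_mapping, and the shapes and latency value are made up for illustration:

headers = get_csv_headers("gemm")
# ['op_name', 'sku_name', 'owner', 'perf_mode', 'dtype', 'M', 'N', 'K', 'latency']

key = get_unique_key("gemm", "A800 80GB SXM", "torch", "host")
# 'A800_80GB_SXM.torch.gemm.host' -> rows land in A800_80GB_SXM.torch.gemm.host.csv

# a row produced by gemm_func for a hypothetical per-case report
sample_report = {
    "Dtype": "float16",
    "Tensor Shapes": [[1024, 4096], [4096, 8192]],
    "Avg latency(us)": 123.4,
}
row = gemm_func("gemm", "A800 80GB SXM", "torch", "host", sample_report)
# ['gemm', 'A800 80GB SXM', 'torch', 'host', 'float16', 1024, 8192, 4096, 123.4]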
ByteMLPerf/byte_micro_perf/workloads/add.json
0 → 100644
View file @
24b257f1
{
    "operator": "add",
    "iterations": 100,
    "input_shape_groups": {
        "inputs": [
            [
                [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072],
                [8192]
            ],
            [
                [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072],
                [8192]
            ]
        ]
    },
    "dtype": ["float32", "bfloat16", "float16"]
}
\ No newline at end of file
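The workload files that follow all share the input_shape_groups layout shown in add.json above. How the lists are expanded into concrete test shapes is decided by the perf engine's shape-parsing code, which is not part of this excerpt; the sketch below only illustrates the apparent intent, under the assumption that each inner list holds the candidate sizes for one tensor dimension and that test shapes are formed by taking one value per dimension:

import itertools

# assumed expansion of one "inputs" entry from add.json: [[4, 8, ..., 131072], [8192]]
dim_candidates = [[4, 8, 16, 32], [8192]]   # truncated for brevity
shapes = [list(shape) for shape in itertools.product(*dim_candidates)]
# [[4, 8192], [8, 8192], [16, 8192], [32, 8192]]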
ByteMLPerf/byte_micro_perf/workloads/allgather.json
0 → 100644
View file @
24b257f1
{
    "operator": "allgather",
    "iterations": 100,
    "input_shape_groups": {
        "inputs": [
            [
                [8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152],
                [1024]
            ]
        ]
    },
    "dtype": ["float32", "bfloat16", "float16"],
    "group": [2, 4, 8]
}
\ No newline at end of file
ByteMLPerf/byte_micro_perf/workloads/allreduce.json
0 → 100644
View file @
24b257f1
{
    "operator": "allreduce",
    "iterations": 100,
    "input_shape_groups": {
        "inputs": [
            [
                [8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152],
                [1024]
            ]
        ]
    },
    "dtype": ["float32", "bfloat16", "float16"],
    "group": [2, 4, 8]
}
\ No newline at end of file
ByteMLPerf/byte_micro_perf/workloads/alltoall.json
0 → 100644
View file @
24b257f1
{
    "operator": "alltoall",
    "iterations": 100,
    "input_shape_groups": {
        "inputs": [
            [
                [8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152],
                [1024]
            ],
            [
                [8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152],
                [1024]
            ]
        ]
    },
    "dtype": ["float32", "bfloat16", "float16"],
    "group": [2, 4, 8]
}
\ No newline at end of file
ByteMLPerf/byte_micro_perf/workloads/batch_gemm.json
0 → 100644
View file @
24b257f1
{
    "operator": "batch_gemm",
    "iterations": 100,
    "input_shape_groups": {
        "batch_size": [8, 12, 16, 20, 24, 28, 32],
        "M": [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192],
        "KN": [
            [1024, 1024], [4096, 4096], [8192, 8192],
            [16384, 32], [16384, 128], [16384, 1024],
            [32, 16384], [128, 16384], [1024, 16384]
        ]
    },
    "dtype": ["float32", "bfloat16", "float16", "int8"]
}
\ No newline at end of file
ByteMLPerf/byte_micro_perf/workloads/broadcast.json
0 → 100644
View file @
24b257f1
{
    "operator": "broadcast",
    "iterations": 100,
    "input_shape_groups": {
        "inputs": [
            [
                [8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152],
                [1024]
            ]
        ]
    },
    "dtype": ["float32", "bfloat16", "float16"],
    "group": [2, 4, 8]
}
\ No newline at end of file
ByteMLPerf/byte_micro_perf/workloads/cast.json
0 → 100644
View file @
24b257f1
{
    "operator": "cast",
    "iterations": 100,
    "input_shape_groups": {
        "inputs": [
            [
                [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072],
                [8192]
            ]
        ]
    },
    "dtype": ["float32", "bfloat16", "float16"]
}
\ No newline at end of file
ByteMLPerf/byte_micro_perf/workloads/cos.json
0 → 100644
View file @
24b257f1
{
    "operator": "cos",
    "iterations": 100,
    "input_shape_groups": {
        "inputs": [
            [
                [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072],
                [8192]
            ]
        ]
    },
    "dtype": ["float32", "bfloat16", "float16"]
}
\ No newline at end of file
ByteMLPerf/byte_micro_perf/workloads/device2host.json
0 → 100644
View file @
24b257f1
{
    "operator": "device2host",
    "iterations": 100,
    "input_shape_groups": {
        "inputs": [
            [
                [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072],
                [1024]
            ]
        ]
    },
    "dtype": ["float32", "bfloat16", "float16"]
}
\ No newline at end of file
ByteMLPerf/byte_micro_perf/workloads/div.json
0 → 100644
View file @
24b257f1
{
    "operator": "div",
    "iterations": 100,
    "input_shape_groups": {
        "inputs": [
            [
                [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072],
                [8192]
            ],
            [
                [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072],
                [8192]
            ]
        ]
    },
    "dtype": ["float32", "bfloat16", "float16"]
}
\ No newline at end of file
ByteMLPerf/byte_micro_perf/workloads/exp.json
0 → 100644
View file @
24b257f1
{
    "operator": "exp",
    "iterations": 100,
    "input_shape_groups": {
        "inputs": [
            [
                [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072],
                [8192]
            ]
        ]
    },
    "dtype": ["float32", "bfloat16", "float16"]
}
\ No newline at end of file
ByteMLPerf/byte_micro_perf/workloads/exponential.json
0 → 100644
View file @
24b257f1
{
    "operator": "exponential",
    "iterations": 100,
    "input_shape_groups": {
        "inputs": [
            [
                [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072],
                [8192]
            ]
        ]
    },
    "dtype": ["float32", "bfloat16", "float16"]
}
\ No newline at end of file
ByteMLPerf/byte_micro_perf/workloads/gather.json
0 → 100644
View file @
24b257f1
{
    "operator": "gather",
    "iterations": 100,
    "input_shape_groups": {
        "inputs": [
            [
                [1024],
                [1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288]
            ]
        ]
    },
    "dtype": ["float32", "bfloat16", "float16"]
}
\ No newline at end of file