Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
sunzhq2
bytemlperf-dcu
Commits
e4cefa34
Commit
e4cefa34
authored
Nov 27, 2024
by
wangkaixiong
🚴🏼
Browse files
update datasets
parent
24b257f1
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
401 additions
and
0 deletions
+401
-0
.gitignore
.gitignore
+10
-0
ByteMLPerf/byte_infer_perf/general_perf/datasets/data_loader.py
...Perf/byte_infer_perf/general_perf/datasets/data_loader.py
+91
-0
ByteMLPerf/byte_infer_perf/general_perf/datasets/fake_dataset/data_loader.py
...er_perf/general_perf/datasets/fake_dataset/data_loader.py
+132
-0
ByteMLPerf/byte_infer_perf/general_perf/datasets/fake_dataset/test_accuracy.py
..._perf/general_perf/datasets/fake_dataset/test_accuracy.py
+50
-0
ByteMLPerf/byte_infer_perf/general_perf/datasets/test_accuracy.py
...rf/byte_infer_perf/general_perf/datasets/test_accuracy.py
+118
-0
No files found.
.gitignore
0 → 100644
View file @
e4cefa34
*.tar.gz
*.whl
*.zip
*.json
*.pyc
*.pickle
*.torrent
*.pyc
*.npy
*.csv
\ No newline at end of file
ByteMLPerf/byte_infer_perf/general_perf/datasets/data_loader.py
0 → 100644
View file @
e4cefa34
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
logging
import
numpy
as
np
# Module-level logger for the dataset loading code.
log = logging.getLogger("Dataset")

# Maps input-type names to NumPy scalar types; looked up by name when fake
# samples are generated.
#
# ``np.long`` and ``np.bool`` were removed in NumPy 1.24 (and only came back
# in 2.0), so referencing them directly makes this module fail at import time
# on NumPy 1.24-1.26.  ``np.bool_`` exists in every release, and ``np.int_``
# is the C ``long`` type on the releases where ``np.long`` is missing.
INPUT_TYPE = {
    "UINT8": np.uint8,
    "FLOAT32": np.float32,
    "FLOAT16": np.float16,
    "LONG": getattr(np, "long", np.int_),
    "INT32": np.int32,
    "INT64": np.int64,
    "BOOL": np.bool_,
}
class Dataset():
    """Base class for dataset loaders.

    Holds the config, the current batch size, and the batched data/labels.
    ``name`` and ``rebatch`` must be provided by subclasses.
    """

    def __init__(self, config):
        self.config = config
        self.cur_bs = 1
        self.batched_data = []
        self.labels = []
        self.items = 0
        # Nothing is loaded yet, so this starts out as 0.
        self.batch_num = int(self.items / self.cur_bs)

    def name(self) -> str:
        """
        Return the name of dataset
        """
        raise NotImplementedError("Dataset:name")

    def get_item_count(self) -> int:
        """
        Return the number of data loaded
        """
        return self.items

    def get_batch_count(self) -> int:
        """
        Return the number of batched data
        """
        return self.batch_num

    def preprocess(self):
        """
        Hook for data preprocessing; the base implementation does nothing
        """
        return None

    def get_samples(self, sample_id):
        """
        Return the (batched_data, label) pair stored at ``sample_id``.

        Raises ValueError when ``sample_id`` is negative or not smaller than
        the number of batches held in ``batched_data``.
        """
        past_the_end = sample_id >= len(self.batched_data)
        if past_the_end or sample_id < 0:
            raise ValueError("Your Input ID is out of range")
        batch = self.batched_data[sample_id]
        label = self.labels[sample_id]
        return batch, label

    def rebatch(self, new_bs, skip=True) -> None:
        """
        Rebatch Datasets to specified number
        """
        raise NotImplementedError("Dataset:rebatch")

    def get_fake_samples(self, batch_size, shape, input_type):
        """
        Generate fake data for testing.

        ``shape`` maps input names to dimension lists; the first dimension of
        each is multiplied by ``batch_size``. ``input_type`` holds one
        INPUT_TYPE key per input, in the iteration order of ``shape``.
        Raises ValueError when ``input_type`` is empty.
        """
        if not input_type:
            raise ValueError("Please provide input type")

        fake = {}
        for position, (input_name, dims) in enumerate(shape.items()):
            batched_dims = [dims[0] * batch_size] + dims[1:]
            # Draw uniform values in [0, 1) first, then cast to the
            # requested dtype.
            values = np.random.random(size=batched_dims)
            fake[input_name] = values.astype(INPUT_TYPE[input_type[position]])
        return fake
ByteMLPerf/byte_infer_perf/general_perf/datasets/fake_dataset/data_loader.py
0 → 100644
View file @
e4cefa34
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
logging
import
numpy
as
np
from
general_perf.datasets
import
data_loader
# Maps input-type names to NumPy scalar types; looked up by name when fake
# samples are generated.
#
# ``np.long`` and ``np.bool`` were removed in NumPy 1.24 (and only came back
# in 2.0), so referencing them directly makes this module fail at import time
# on NumPy 1.24-1.26.  ``np.bool_`` exists in every release, and ``np.int_``
# is the C ``long`` type on the releases where ``np.long`` is missing.
INPUT_TYPE = {
    "UINT8": np.uint8,
    "FLOAT16": np.float16,
    "FLOAT32": np.float32,
    "LONG": getattr(np, "long", np.int_),
    "INT32": np.int32,
    "INT64": np.int64,
    "BOOL": np.bool_,
}

# Module-level logger for the fake-data loader.
log = logging.getLogger("FAKE_DATA")
class DataLoader(data_loader.Dataset):
    """Dataset that synthesizes random inputs instead of loading real data.

    It always reports 100 batches. ``get_samples`` seeds NumPy's global RNG
    with the sample id before generating, so the same id yields the same
    data on every call. Unlike the base class, ``get_samples`` returns only
    the input dict, not a ``(data, label)`` pair.
    """

    def __init__(self, config):
        super(DataLoader, self).__init__(config)
        self.config = config
        self.cur_bs = 1

    def name(self):
        """Return the dataset name."""
        return 'fake_dataset'

    def get_batch_count(self):
        """Return the number of available batches."""
        # always return 100
        return 100

    def generate_fake_data(self):
        """Build one batch of fake inputs from the config.

        Reads ``config["input_shape"]`` (input name -> dimension list) and
        ``config["input_type"]`` (comma-separated type names, one per input).
        """
        input_shape = self.config["input_shape"]
        input_type = self.config["input_type"].split(',')
        return self.get_fake_samples_regular(self.cur_bs, input_shape,
                                             input_type)

    def rebatch(self, new_bs, skip=True):
        """Set the batch size used for subsequently generated samples."""
        log.info("Rebatching batch size to: {} ...".format(new_bs))
        if self.cur_bs == new_bs and skip:
            return
        self.cur_bs = new_bs

    def get_samples(self, sample_id):
        """Return the fake input dict for ``sample_id``.

        Raises:
            ValueError: if ``sample_id`` is negative or not below
                ``get_batch_count()``.
        """
        # Use get_batch_count() rather than a literal so the bound cannot
        # drift away from the advertised number of batches.
        if sample_id >= self.get_batch_count() or sample_id < 0:
            raise ValueError("Your Input ID is out of range")

        # Seeding with the id makes each sample reproducible across calls.
        np.random.seed(sample_id)
        return self.generate_fake_data()

    def get_fake_samples_regular(self, batch_size, shape, input_type):
        """Generate one fake array (or string) per entry in ``shape``.

        Args:
            batch_size: Replaces the first dimension of every input shape.
            shape: Mapping of input name -> list of dimensions.
            input_type: Sequence of type names, one per input, in the
                iteration order of ``shape``.

        Returns:
            Dict of input name -> generated value.

        Raises:
            ValueError: if ``input_type`` is empty.
        """
        if not input_type:
            raise ValueError("Please provide input type")

        data = {}
        for i, (key, val) in enumerate(shape.items()):
            val = [batch_size] + val[1:]
            type_name = input_type[i]

            if 'LONG' in type_name or 'INT' in type_name:
                if "mask" in key or "segment" in key:
                    # Mask/segment inputs only take the values 0 and 1.
                    data[key] = np.random.randint(
                        low=0, high=2,
                        size=val).astype(INPUT_TYPE[type_name])
                elif (self.config["model"] == "internal_videobert01-onnx-fp32"
                      and key == "1_input_1"):
                    # np.random has no ones(); the original call raised
                    # AttributeError. An all-ones array is what was intended.
                    data[key] = np.ones(val).astype(INPUT_TYPE[type_name])
                else:
                    data[key] = np.random.randint(
                        low=0, high=1000,
                        size=val).astype(INPUT_TYPE[type_name])
            elif 'STRING' in type_name:
                data[key] = 'This is a test string.'
            elif 'BOOL' in type_name:
                data[key] = np.zeros(shape=val, dtype=bool)
            else:
                # Uniform values in [-1, 1).
                sample_data = np.random.random(size=val) * 2 - 1
                data[key] = sample_data.astype(INPUT_TYPE[type_name])
        return data

    def get_fake_samples_bert(self, batch_size, shape, input_type):
        """Generate fake inputs where the second input is a length mask.

        The first input gets random ids, the second a 0/1 mask whose leading
        entries per row are 1, and any further inputs are all zeros.

        Args:
            batch_size: Multiplies the first dimension of every input shape.
            shape: Mapping of input name -> list of dimensions.
            input_type: Sequence of type names, one per input, in the
                iteration order of ``shape``.

        Returns:
            Dict of input name -> generated array.

        Raises:
            ValueError: if ``input_type`` is empty.
        """
        avg_seq_len = 192
        max_seq_len = 384

        if not input_type:
            raise ValueError("Please provide input type")

        data = {}
        for i, (key, val) in enumerate(shape.items()):
            val = [val[0] * batch_size] + val[1:]
            dtype = INPUT_TYPE[input_type[i]]
            if i == 0:
                # fake input id and mask
                # NOTE(review): 30523 is presumably the vocabulary size -
                # confirm against the model.
                data[key] = np.random.randint(low=0, high=30523,
                                              size=val).astype(dtype)
            elif i == 1:
                # fake input array length: drawn uniformly from
                # [2 * avg - max, max], so the mean length is avg_seq_len.
                input_len = np.random.randint(low=2 * avg_seq_len -
                                              max_seq_len,
                                              high=max_seq_len + 1,
                                              size=batch_size,
                                              dtype=np.int32)
                input_mask = np.zeros(val).astype(dtype)
                # NOTE(review): only the first batch_size rows are filled;
                # rows beyond that (when val[0] > batch_size) stay zero.
                for b_idx, s_len in enumerate(input_len):
                    input_mask[b_idx][:s_len] = 1
                data[key] = input_mask
            else:
                data[key] = np.zeros(val).astype(dtype)
        return data
ByteMLPerf/byte_infer_perf/general_perf/datasets/fake_dataset/test_accuracy.py
0 → 100644
View file @
e4cefa34
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
logging
import
numpy
as
np
from
general_perf.datasets
import
test_accuracy
from
tqdm
import
tqdm
# Module-level logger used by the fake-dataset accuracy checker below.
log = logging.getLogger("TestAccuracy")
class AccuracyChecker(test_accuracy.AccuracyChecker):
    """Accuracy checker for the fake dataset.

    There is no ground truth for fake data, so this only runs the backend
    over a share of the batches, dumps the flattened outputs to
    ``<output_dir>/<dataset name>.npy`` and reports an accuracy of 0.
    """

    def calculate_acc(self, data_percent=10):
        """Run inference on ``data_percent`` percent of the batches.

        A falsy ``data_percent`` means every batch is used. Returns
        ``{"Fake Dataset Accuracy": 0}``.
        """
        log.info("Start to calculate accuracy...")

        loader = self.dataloader
        if data_percent:
            batch_total = int((data_percent / 100) * loader.get_batch_count())
        else:
            batch_total = loader.get_batch_count()

        collected = []
        for batch_idx in tqdm(range(batch_total)):
            batch = loader.get_samples(batch_idx)
            outputs = self.runtime_backend.predict(batch)

            if isinstance(outputs, dict):
                # Walk the outputs in sorted key order so the dump layout
                # does not depend on the dict's insertion order.
                for out_name in sorted(outputs.keys()):
                    collected.extend(outputs[out_name].flatten())
            elif isinstance(outputs, list):
                for tensor in outputs:
                    collected.extend(tensor.flatten())
            else:
                collected.extend(outputs)

        log.info('Batch size is {}, Accuracy: {}'.format(loader.cur_bs, 0.0))

        dump_path = self.output_dir + "/{}.npy".format(loader.name())
        np.save(dump_path, np.array(collected), allow_pickle=True)
        return {"Fake Dataset Accuracy": 0}
ByteMLPerf/byte_infer_perf/general_perf/datasets/test_accuracy.py
0 → 100644
View file @
e4cefa34
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
logging
from
typing
import
Any
,
Dict
import
matplotlib.pyplot
as
plt
import
numpy
as
np
# Module-level logger used by draw_all_diff and AccuracyChecker below.
log = logging.getLogger("TestAccuracy")
def draw_all_diff(ori_outs, cur_outs, file_name) -> Dict[str, Any]:
    """Compare two output arrays, save histograms and return diff statistics.

    Args:
        ori_outs: NumPy array of reference outputs (the "CPU" panel).
        cur_outs: NumPy array of outputs under test (the "Backend" panel).
        file_name: Path the three-panel histogram figure is written to.

    Returns:
        Dict with the keys "Mean Diff", "Std Diff", "Max Diff",
        "Max Rel-Diff" and "Mean Rel-Diff", each a float rounded to 5
        decimal places.
    """
    # flatten() returns copies, so the caller's arrays are never modified.
    ori_data = ori_outs.flatten()
    cur_data = cur_outs.flatten()

    # NaN and Inf are not comparable, replace them with 0.
    ori_data[np.isnan(ori_data)] = 0.0
    ori_data[np.isinf(ori_data)] = 0.0
    cur_data[np.isnan(cur_data)] = 0.0
    cur_data[np.isinf(cur_data)] = 0.0

    # Number of histogram bins: one per element, capped at 300.
    length = min(ori_data.shape[0], 300)

    diff = ori_data - cur_data

    # Divide by 1 wherever the reference is 0 to avoid division by zero.
    # Keep this in its own array: overwriting ori_data (as the code used to)
    # made the "CPU Result" histogram show zeros as ones.
    denominator = np.where(ori_data == 0, 1, ori_data)
    rel_diff = np.nan_to_num(np.divide(diff, denominator))

    # Compute each statistic once and reuse it for the log and the result.
    abs_diff = abs(diff)
    abs_rel_diff = abs(rel_diff)
    mean_diff = np.mean(abs_diff)
    std_diff = np.std(abs_diff)
    max_diff = abs_diff.max()
    max_rel_diff = abs_rel_diff.max()
    mean_rel_diff = np.mean(abs_rel_diff)

    log.info(
        'Mean Diff: {}, Std Diff: {}, Max Diff: {}, Max Rel-Diff: {}, Mean Rel-Diff: {}'
        .format(mean_diff, std_diff, max_diff, max_rel_diff, mean_rel_diff))

    result = {
        "Mean Diff": round(float(mean_diff), 5),
        "Std Diff": round(float(std_diff), 5),
        "Max Diff": round(float(max_diff), 5),
        "Max Rel-Diff": round(float(max_rel_diff), 5),
        "Mean Rel-Diff": round(float(mean_rel_diff), 5),
    }

    panels = (
        (diff, 'Diff', "Diff Distribute"),
        (ori_data, 'CPU', "CPU Result"),
        (cur_data, 'Backend', "Backend Result"),
    )

    fig = plt.figure(figsize=(16, 8))
    try:
        # NOTE(review): cla() on a brand-new figure looks unnecessary; kept
        # so the rendered output stays as before - confirm before removing.
        plt.cla()
        for position, (values, label, x_label) in enumerate(panels, start=1):
            plt.subplot(1, 3, position)
            plt.yscale('log')
            plt.hist(values,
                     bins=length,
                     alpha=0.5,
                     label=label,
                     range=(values.min(), values.max()))
            plt.xlabel(x_label)
        plt.savefig(file_name, dpi=300)
    finally:
        # pyplot keeps every figure alive until it is closed explicitly;
        # without this each call leaks one figure.
        plt.close(fig)

    return result
class AccuracyChecker():
    """Base accuracy checker.

    ``configs``, ``dataloader``, ``runtime_backend`` and ``output_dir`` start
    out empty and are expected to be assigned before the methods are called.
    Subclasses implement ``calculate_acc``.
    """

    def __init__(self):
        self.configs = None
        self.dataloader = None
        self.runtime_backend = None
        self.output_dir = ""

    def calculate_diff(self) -> Dict[str, float]:
        """
        Compare the saved backend outputs with the saved CPU outputs.

        Loads ``<output_dir>/<dataset>.npy`` and the matching file under
        ``general_perf/reports/CPU/<model>`` and passes both to
        ``draw_all_diff``. Returns an empty dict when the CPU report
        directory does not exist.
        Args: None
        Returns: Dict[str, float]
        """
        model_name = self.configs["model"]
        cpu_data_dir = os.path.abspath('general_perf/reports/CPU/' +
                                       model_name)
        if not os.path.exists(cpu_data_dir):
            log.info("Fetch CPU Data Failed")
            return {}

        vendor_file = self.output_dir + "/{}.npy".format(
            self.dataloader.name())
        vendor_data = np.load(vendor_file)

        cpu_file = cpu_data_dir + "/{}.npy".format(self.dataloader.name())
        cpu_data = np.load(cpu_file)

        precision = self.configs['compile_precision'].lower()
        plot_file = self.output_dir + "/" + model_name + '-to-' + precision + '.png'
        return draw_all_diff(cpu_data, vendor_data, plot_file)

    def calculate_acc(self, data_percent) -> Dict[str, Any]:
        """Compute accuracy on ``data_percent`` of the data; subclass hook."""
        raise NotImplementedError("Dataset: caculate_acc")
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment