Commit e4cefa34 authored by wangkaixiong's avatar wangkaixiong 🚴🏼
Browse files

Update datasets

parent 24b257f1
*.tar.gz
*.whl
*.zip
*.json
*.pyc
*.pickle
*.torrent
*.npy
*.csv
\ No newline at end of file
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import numpy as np
log = logging.getLogger("Dataset")

# Maps the dtype strings used in model configs to concrete numpy dtypes.
# NOTE: the aliases np.long and np.bool were removed in NumPy 1.24 / 1.20;
# use np.int64 / np.bool_ instead so this works on modern NumPy.
INPUT_TYPE = {
    "UINT8": np.uint8,
    "FLOAT32": np.float32,
    "FLOAT16": np.float16,
    "LONG": np.int64,
    "INT32": np.int32,
    "INT64": np.int64,
    "BOOL": np.bool_
}
class Dataset():
    """Base class for benchmark datasets.

    Subclasses load real data and implement ``name``/``rebatch``; this
    base class keeps the shared bookkeeping (current batch size, batched
    data and labels) and can generate random fake samples for testing.
    """

    def __init__(self, config):
        # Benchmark configuration dict for this dataset.
        self.config = config
        # Current batch size; starts at 1 until rebatch() changes it.
        self.cur_bs = 1
        # Batched inputs and the labels aligned with them by index.
        self.batched_data = []
        self.labels = []
        # Number of raw samples loaded; subclasses are expected to set it.
        self.items = 0
        self.batch_num = int(self.items / self.cur_bs)

    def name(self) -> str:
        """Return the name of the dataset."""
        raise NotImplementedError("Dataset:name")

    def get_item_count(self) -> int:
        """Return the number of data items loaded."""
        return self.items

    def get_batch_count(self) -> int:
        """Return the number of batches currently available."""
        return self.batch_num

    def preprocess(self):
        """Hook for data preprocessing; a no-op in the base class."""
        return

    def get_samples(self, sample_id):
        """Return the ``(data, label)`` pair for the given batch id.

        Raises:
            ValueError: if ``sample_id`` is outside the batched range.
        """
        if not 0 <= sample_id < len(self.batched_data):
            raise ValueError("Your Input ID is out of range")
        return self.batched_data[sample_id], self.labels[sample_id]

    def rebatch(self, new_bs, skip=True) -> None:
        """Re-batch the dataset to the specified batch size."""
        raise NotImplementedError("Dataset:rebatch")

    def get_fake_samples(self, batch_size, shape, input_type):
        """Generate one batch of random data for testing.

        Args:
            batch_size: multiplier applied to each shape's leading dim.
            shape: dict mapping input name -> per-sample shape list.
            input_type: dtype strings positionally aligned with ``shape``.

        Returns:
            Dict mapping input name -> random ndarray.

        Raises:
            ValueError: if ``input_type`` is empty or missing.
        """
        if not input_type:
            raise ValueError("Please provide input type")
        data = {}
        for idx, (key, dims) in enumerate(shape.items()):
            batched_dims = [dims[0] * batch_size] + dims[1:]
            data[key] = np.random.random(size=batched_dims).astype(
                INPUT_TYPE[input_type[idx]])
        return data
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import numpy as np
from general_perf.datasets import data_loader
# Maps the dtype strings used in model configs to concrete numpy dtypes.
# NOTE: the aliases np.long and np.bool were removed in NumPy 1.24 / 1.20;
# use np.int64 / np.bool_ instead so this works on modern NumPy.
INPUT_TYPE = {
    "UINT8": np.uint8,
    "FLOAT16": np.float16,
    "FLOAT32": np.float32,
    "LONG": np.int64,
    "INT32": np.int32,
    "INT64": np.int64,
    "BOOL": np.bool_
}

log = logging.getLogger("FAKE_DATA")
class DataLoader(data_loader.Dataset):
    """Dataset that synthesizes random inputs instead of loading real data.

    Used for performance smoke tests: every sample is generated on the fly
    from the model's declared input shapes and dtypes.
    """

    def __init__(self, config):
        super(DataLoader, self).__init__(config)
        self.config = config
        self.cur_bs = 1

    def name(self):
        """Return the dataset identifier."""
        return 'fake_dataset'

    def get_batch_count(self):
        # always return 100: the fake dataset pretends to hold 100 batches
        return 100

    def generate_fake_data(self):
        """Build one batch of random tensors described by the model config."""
        input_shape = self.config["input_shape"]
        input_type = self.config["input_type"].split(',')
        return self.get_fake_samples_regular(self.cur_bs, input_shape,
                                             input_type)

    def rebatch(self, new_bs, skip=True):
        """Switch the generator to a new batch size.

        Args:
            new_bs: batch size to use from now on.
            skip: if True, do nothing when the size is already ``new_bs``.
        """
        log.info("Rebatching batch size to: {} ...".format(new_bs))
        if self.cur_bs == new_bs and skip:
            return
        self.cur_bs = new_bs

    def get_samples(self, sample_id):
        """Return the generated batch for ``sample_id`` (0..99).

        Seeding numpy with the sample id makes each id reproducible.

        Raises:
            ValueError: if ``sample_id`` is outside 0..99.
        """
        # valid ids are 0..99, matching get_batch_count()
        if sample_id > 99 or sample_id < 0:
            raise ValueError("Your Input ID is out of range")
        np.random.seed(sample_id)
        return self.generate_fake_data()

    def get_fake_samples_regular(self, batch_size, shape, input_type):
        """Generate one batch of random tensors.

        Args:
            batch_size: leading dimension of every generated tensor.
            shape: dict mapping input name -> shape list; index 0 is
                replaced by ``batch_size``.
            input_type: dtype strings positionally aligned with ``shape``.

        Returns:
            Dict mapping input name -> ndarray (or str for STRING inputs).

        Raises:
            ValueError: if ``input_type`` is empty or missing.
        """
        data = {}
        if not input_type:
            raise ValueError("Please provide input type")
        i = 0
        for key, val in shape.items():
            val = [batch_size] + val[1:]
            if 'LONG' in input_type[i] or 'INT' in input_type[i]:
                if "mask" in key or "segment" in key:
                    # mask/segment style inputs only contain 0 or 1
                    data[key] = np.random.randint(
                        low=0, high=2,
                        size=val).astype(INPUT_TYPE[input_type[i]])
                elif self.config[
                        "model"] == "internal_videobert01-onnx-fp32" and key == "1_input_1":
                    # BUGFIX: np.random.ones does not exist (AttributeError);
                    # build the all-ones tensor with np.ones instead.
                    data[key] = np.ones(val).astype(
                        INPUT_TYPE[input_type[i]])
                else:
                    # generic token-id style input in [0, 1000)
                    data[key] = np.random.randint(
                        low=0, high=1000,
                        size=val).astype(INPUT_TYPE[input_type[i]])
            elif 'STRING' in input_type[i]:
                data[key] = 'This is a test string.'
            elif 'BOOL' in input_type[i]:
                data[key] = np.zeros(shape=val, dtype=bool)
            else:
                # float inputs drawn uniformly from [-1, 1)
                sample_data = np.random.random(size=val) * 2 - 1
                data[key] = sample_data.astype(INPUT_TYPE[input_type[i]])
            i += 1
        return data

    def get_fake_samples_bert(self, batch_size, shape, input_type):
        """Generate one batch of BERT-style inputs (ids, mask, segments).

        Assumes ``shape`` iterates as (input_ids, input_mask, rest...) --
        TODO confirm against the BERT model configs.

        Raises:
            ValueError: if ``input_type`` is empty or missing.
        """
        data = {}
        avg_seq_len = 192
        max_seq_len = 384
        if not input_type:
            raise ValueError("Please provide input type")
        i = 0
        for key, val in shape.items():
            val = [val[0] * batch_size] + val[1:]
            if i == 0:
                # fake input ids: 30523 matches the BERT vocab size + 1
                input_ids = np.random.randint(low=0, high=30523,
                                              size=val).astype(
                                                  INPUT_TYPE[input_type[i]])
                data[key] = input_ids
            elif i == 1:
                # fake sequence lengths with mean avg_seq_len, then expand
                # into a 0/1 attention mask
                input_len = np.random.randint(low=2 * avg_seq_len -
                                              max_seq_len,
                                              high=max_seq_len + 1,
                                              size=(batch_size),
                                              dtype=np.int32)
                input_mask = np.zeros(val).astype(INPUT_TYPE[input_type[i]])
                for b_idx, s_len in enumerate(input_len):
                    input_mask[b_idx][:s_len] = 1
                data[key] = input_mask
            else:
                # remaining inputs (e.g. segment ids) are all zeros
                data[key] = np.zeros(val).astype(INPUT_TYPE[input_type[i]])
            i += 1
        return data
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import numpy as np
from general_perf.datasets import test_accuracy
from tqdm import tqdm
log = logging.getLogger("TestAccuracy")
class AccuracyChecker(test_accuracy.AccuracyChecker):
    """Accuracy checker for the fake dataset.

    There is no ground truth for random data, so the reported accuracy is
    always 0; the flattened outputs are still saved so they can be diffed
    against a CPU reference run.
    """

    def calculate_acc(self, data_percent=10):
        """Run inference over a slice of batches and dump the outputs.

        Args:
            data_percent: percentage of batches to run; falsy means all.

        Returns:
            Dict with the (always zero) fake-dataset accuracy.
        """
        log.info("Start to calculate accuracy...")
        batch_total = self.dataloader.get_batch_count()
        num = int((data_percent / 100) * batch_total) if data_percent \
            else batch_total
        flattened = []
        for batch_idx in tqdm(range(num)):
            batch = self.dataloader.get_samples(batch_idx)
            outputs = self.runtime_backend.predict(batch)
            if isinstance(outputs, dict):
                # iterate keys in sorted order for a deterministic dump
                for out_name in sorted(outputs):
                    flattened.extend(outputs[out_name].flatten())
            elif isinstance(outputs, list):
                for tensor in outputs:
                    flattened.extend(tensor.flatten())
            else:
                flattened.extend(outputs)
        log.info('Batch size is {}, Accuracy: {}'.format(
            self.dataloader.cur_bs, 0.0))
        np.save(self.output_dir + "/{}.npy".format(self.dataloader.name()),
                np.array(flattened),
                allow_pickle=True)
        return {"Fake Dataset Accuracy": 0}
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import logging
from typing import Any, Dict
import matplotlib.pyplot as plt
import numpy as np
log = logging.getLogger("TestAccuracy")
def draw_all_diff(ori_outs, cur_outs, file_name) -> Dict[str, Any]:
    """Compare reference vs backend outputs and plot their distributions.

    Args:
        ori_outs: reference (CPU) output array.
        cur_outs: backend output array; assumed to flatten to the same
            length as ``ori_outs`` -- TODO confirm callers guarantee this.
        file_name: path of the histogram image to write.

    Returns:
        Dict with rounded Mean/Std/Max absolute diff and relative diff.
    """
    # flatten() returns copies, so the sanitizing writes below do not
    # mutate the caller's arrays
    ori_data = ori_outs.flatten()
    cur_data = cur_outs.flatten()
    # NaN & Inf are not comparable; replace with 0 before differencing
    ori_data[np.isnan(ori_data)] = 0.0
    ori_data[np.isinf(ori_data)] = 0.0
    cur_data[np.isnan(cur_data)] = 0.0
    cur_data[np.isinf(cur_data)] = 0.0
    length = min(ori_data.shape[0], 300)
    diff = ori_data - cur_data
    # avoid division by zero when computing the relative diff
    ori_data = np.where(ori_data == 0, 1, ori_data)
    rel_diff = np.divide(diff, ori_data)
    rel_diff = np.nan_to_num(rel_diff)
    log.info('Mean Diff: {}, Std Diff: {}, Max Diff: {}, Max Rel-Diff: {}, Mean Rel-Diff: {}'.format(
        np.mean(abs(diff)), np.std(abs(diff)),
        abs(diff).max(), abs(rel_diff).max(), np.mean(abs(rel_diff))))
    result = {}
    result["Mean Diff"] = round(float(np.mean(abs(diff))), 5)
    result["Std Diff"] = round(float(np.std(abs(diff))), 5)
    result["Max Diff"] = round(float(abs(diff).max()), 5)
    result["Max Rel-Diff"] = round(float(abs(rel_diff).max()), 5)
    result["Mean Rel-Diff"] = round(float(np.mean(abs(rel_diff))), 5)
    plt.figure(figsize=(16, 8))
    plt.cla()
    plt.subplot(1, 3, 1)
    plt.yscale('log')
    plt.hist(diff,
             bins=length,
             alpha=0.5,
             label='Diff',
             range=(diff.min(), diff.max()))
    plt.xlabel("Diff Distribute")
    plt.subplot(1, 3, 2)
    plt.yscale('log')
    plt.hist(ori_data,
             bins=length,
             alpha=0.5,
             label='CPU',
             range=(ori_data.min(), ori_data.max()))
    plt.xlabel("CPU Result")
    plt.subplot(1, 3, 3)
    plt.yscale('log')
    plt.hist(cur_data,
             bins=length,
             alpha=0.5,
             label='Backend',
             range=(cur_data.min(), cur_data.max()))
    plt.xlabel("Backend Result")
    plt.savefig(file_name, dpi=300)
    # BUGFIX: close the figure so repeated calls do not accumulate open
    # matplotlib figures (a memory leak when diffing many models).
    plt.close()
    return result
class AccuracyChecker():
    """Base class for per-dataset accuracy checkers.

    The benchmark harness populates ``configs``, ``dataloader``,
    ``runtime_backend`` and ``output_dir`` after construction.
    """

    def __init__(self):
        # all attributes are injected by the harness before use
        self.configs = None
        self.dataloader = None
        self.runtime_backend = None
        self.output_dir = ""

    def calculate_diff(self) -> Dict[str, float]:
        """Diff this backend's saved outputs against the CPU reference.

        Returns:
            Dict of Mean/Std/Max Diff stats from ``draw_all_diff``, or an
            empty dict when no CPU reference data exists.
        """
        cpu_data_path = os.path.abspath(
            os.path.join('general_perf/reports/CPU', self.configs["model"]))
        if not os.path.exists(cpu_data_path):
            log.info("Fetch CPU Data Failed")
            return {}
        result_file = "{}.npy".format(self.dataloader.name())
        vendor_data = np.load(os.path.join(self.output_dir, result_file))
        cpu_data = np.load(os.path.join(cpu_data_path, result_file))
        plot_name = (self.configs["model"] + '-to-' +
                     self.configs['compile_precision'].lower() + '.png')
        return draw_all_diff(cpu_data, vendor_data,
                             os.path.join(self.output_dir, plot_name))

    def calculate_acc(self, data_percent) -> Dict[str, Any]:
        """Compute dataset accuracy; must be overridden by subclasses."""
        # BUGFIX: error message previously said "caculate_acc" (typo)
        raise NotImplementedError("Dataset: calculate_acc")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment