Commit 347dae81 authored by wangkaixiong

update datasets

parent e4cefa34
__pycache__
*.tar.gz
*.whl
*.zip
*.torrent
*.pyc
*.npy
*.csv
__pycache__
*.pyc
*.prototxt
*.deploy
.vscode/
*.npy
*.tar
span.log
byte_micro_perf/backends/*/venv/
byte_micro_perf/reports/
byte_infer_perf/general_perf/tools/venv/
byte_infer_perf/general_perf/backends/*/venv/
byte_infer_perf/general_perf/model_zoo/*
!byte_infer_perf/general_perf/model_zoo/*.json
byte_infer_perf/general_perf/download/*.*
!byte_infer_perf/general_perf/download/README.md
byte_infer_perf/general_perf/datasets/open_imagenet/preprocessed/
byte_infer_perf/general_perf/datasets/*
!byte_infer_perf/general_perf/datasets/fake_dataset
!*.py
byte_infer_perf/general_perf/reports/*
!byte_infer_perf/general_perf/_inference/general_perf/reports/README
format_code.sh
init_env.sh
byte_infer_perf/llm_perf/download
byte_infer_perf/llm_perf/model_zoo/sota
byte_infer_perf/llm_perf/reports
workspace
test
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
import numpy as np
from general_perf.datasets import data_loader
from tqdm import tqdm
import collections
log = logging.getLogger("CAIL2019")
maxlen = 1024
class DataLoader(data_loader.Dataset):
def __init__(self, config):
super(DataLoader, self).__init__(config)
log.info("Initial...")
self.config = config
self.cur_bs = 2
batch_token_ids = np.load(
"general_perf/datasets/{}/batch_token_ids.npy".format(
self.config['dataset_name']),
allow_pickle=True)
batch_segment_ids = np.load(
"general_perf/datasets/{}/batch_segment_ids.npy".format(
self.config['dataset_name']),
allow_pickle=True)
labels = np.load("general_perf/datasets/{}/label.npy".format(
self.config['dataset_name']),
allow_pickle=True)
self.feed_dict = collections.defaultdict(list)
self.feed_dict['batch_token_ids'] = batch_token_ids.tolist()
self.feed_dict['batch_segment_ids'] = batch_segment_ids.tolist()
self.feed_dict['label'] = labels.tolist()
self.items = len(self.feed_dict['label'])
self.batch_num = int(self.items / self.cur_bs)
        # Zero-pad every sequence up to the fixed model length (maxlen).
        for i in range(self.items):
            batch_token_id = np.pad(
                self.feed_dict['batch_token_ids'][i],
                (0, maxlen - len(self.feed_dict['batch_token_ids'][i])),
                'constant').astype(np.float32)
            batch_segment_id = np.pad(
                self.feed_dict['batch_segment_ids'][i],
                (0, maxlen - len(self.feed_dict['batch_segment_ids'][i])),
                'constant').astype(np.float32)
            self.feed_dict['batch_token_ids'][i] = batch_token_id.tolist()
            self.feed_dict['batch_segment_ids'][i] = batch_segment_id.tolist()
def name(self):
return self.config['dataset_name']
def preprocess(self):
log.info("Preprocessing...")
self.rebatch(self.cur_bs, skip=False)
def rebatch(self, new_bs, skip=True):
log.info("Rebatching batch size to: {} ...".format(new_bs))
if self.cur_bs == new_bs and skip:
return
self.cur_bs = new_bs
self.batch_num = int(self.items / self.cur_bs)
self.batched_data = []
self.labels = []
for i in tqdm(range(self.batch_num)):
split_data = {
'input_segment:0':
self.feed_dict["batch_segment_ids"][i * self.cur_bs:(i + 1) *
self.cur_bs],
'input_token:0':
self.feed_dict["batch_token_ids"][i * self.cur_bs:(i + 1) *
self.cur_bs],
}
self.labels.append(
self.feed_dict["label"][i * self.cur_bs:(i + 1) * self.cur_bs])
self.batched_data.append(split_data)
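# Minimal usage sketch for this loader, assuming the CAIL2019 .npy files are
# already present under general_perf/datasets/<dataset_name>/ and that the
# dataset directory is named open_cail2019 (harness config keys beyond
# 'dataset_name' are hypothetical here):
#
#   loader = DataLoader({'dataset_name': 'open_cail2019'})
#   loader.preprocess()              # batches the data at the default bs of 2
#   batch = loader.batched_data[0]   # {'input_segment:0': ..., 'input_token:0': ...}
#   labels = loader.labels[0]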
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from tqdm import tqdm
import json
import collections
import numpy as np
from bert4keras.tokenizers import Tokenizer
import jieba
jieba.initialize()
test_data = []
with open("test.json", encoding='utf-8') as f:
for l in f:
l = json.loads(l)
assert l['label'] in 'BC'
if l['label'] == 'B':
test_data.append((l['A'], l['B'], l['C']))
else:
test_data.append((l['A'], l['C'], l['B']))
tokenizer = Tokenizer("vocab.txt",
do_lower_case=True,
pre_tokenize=lambda s: jieba.cut(s, HMM=False))
feed_dict = collections.defaultdict(list)
maxlen = 1024
for i in tqdm(range(len(test_data))):
(text1, text2, text3) = test_data[i]
token_ids, segment_ids = tokenizer.encode(text1, text2, maxlen=maxlen)
feed_dict["batch_token_ids"].append(token_ids)
feed_dict["batch_segment_ids"].append(segment_ids)
feed_dict["label"].append([1])
token_ids, segment_ids = tokenizer.encode(text1, text3, maxlen=maxlen)
feed_dict["batch_token_ids"].append(token_ids)
feed_dict["batch_segment_ids"].append(segment_ids)
feed_dict["label"].append([0])
np.save("{}.npy".format('batch_token_ids'),
feed_dict["batch_token_ids"],
allow_pickle=True)
np.save("{}.npy".format('batch_segment_ids'),
feed_dict["batch_segment_ids"],
allow_pickle=True)
np.save("{}.npy".format('label'), feed_dict["label"], allow_pickle=True)
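# Each test record above yields two consecutive rows: (A, B) encoded with
# label [1], then (A, C) with label [0]. Downstream accuracy checking relies
# on this interleaving to compare even-indexed (positive) scores against
# odd-indexed (negative) ones, e.g.:
#
#   feed_dict["label"]  ->  [[1], [0], [1], [0], ...]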
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import numpy as np
from general_perf.datasets import test_accuracy
from tqdm import tqdm
log = logging.getLogger("TestAccuracy")
class AccuracyChecker(test_accuracy.AccuracyChecker):
def calculate_acc(self, data_percent):
log.info("Start to calculate accuracy...")
num = int((data_percent / 100) * self.dataloader.get_batch_count()
) if data_percent else self.dataloader.get_batch_count()
good, total = 0, 0
diffs = []
for i in tqdm(range(num)):
test_data, labels = self.dataloader.get_samples(i)
results = self.runtime_backend.predict(test_data)
results = results[list(results)[0]]
diffs.append(results)
total += len(results) // 2
good += (results[::2] > results[1::2]).sum()
accuracy = round((good / total), 5)
np.save(self.output_dir + "/{}.npy".format(self.dataloader.name()),
diffs)
log.info('Batch size is {}, Accuracy: {}'.format(
self.dataloader.cur_bs, accuracy))
return {"Top-1": accuracy}
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import logging
import numpy as np
import os
import pickle
from tqdm import tqdm
from typing import Any
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
from PIL import Image
try:
from torchvision.transforms import InterpolationMode
BICUBIC = InterpolationMode.BICUBIC
except ImportError:
BICUBIC = Image.BICUBIC
from general_perf.datasets import data_loader
log = logging.getLogger("CIFAR100")
INPUT_TYPE = {
"UINT8": np.uint8,
"FLOAT32": np.float32,
"LONG": np.long,
"INT32": np.int32,
"INT64": np.int64
}
class DataLoader(data_loader.Dataset):
def __init__(self, config):
super(DataLoader, self).__init__(config)
log.info("Initial...")
base_folder = "general_perf/datasets/{}/cifar-100-python".format(
self.config['dataset_name'])
test_list = [
['test', 'f0ef6b0ae62326f3e7ffdfab6717acfc'],
]
meta = {
'filename': 'meta',
'key': 'fine_label_names',
'md5': '7973b15100ade9c7d40fb424638fde48',
}
self.data: Any = []
self.targets = []
# now load the picked numpy arrays
for file_name, checksum in test_list:
file_path = os.path.join(base_folder, file_name)
with open(file_path, 'rb') as f:
entry = pickle.load(f, encoding='latin1')
self.data.append(entry['data'])
if 'labels' in entry:
self.targets.extend(entry['labels'])
else:
self.targets.extend(entry['fine_labels'])
self.data = np.vstack(self.data).reshape(-1, 3, 32, 32)
self.data = self.data.transpose((0, 2, 3, 1)) # convert to HWC
transformer = _transform()
path = os.path.join(base_folder, meta['filename'])
with open(path, 'rb') as infile:
data = pickle.load(infile, encoding='latin1')
self.classes = data[meta['key']]
self.class_to_idx = {
_class: i
for i, _class in enumerate(self.classes)
}
self.test_data = []
for i in tqdm(range(len(self.data))):
img = self.data[i]
img = Image.fromarray(img)
img = transformer(img).detach().numpy()
self.test_data.append(img)
self.text_input = np.load(os.path.join(base_folder, 'text.npy'))
self.config = config
self.cur_bs = 1
self.items = len(self.data)
self.batch_num = int(self.items / self.cur_bs)
def name(self):
return self.config['dataset_name']
def preprocess(self):
log.info("Preprocessing...")
self.rebatch(self.cur_bs, skip=False)
def rebatch(self, new_bs, skip=True):
log.info("Rebatching batch size to: {} ...".format(new_bs))
if self.cur_bs == new_bs and skip:
return
self.cur_bs = new_bs
self.batch_num = int(self.items / self.cur_bs)
self.batched_data = []
self.labels = []
for i in tqdm(range(self.batch_num)):
split_data = {
'image': self.test_data[i * self.cur_bs:(i + 1) * self.cur_bs],
'text': self.text_input,
}
self.labels.append(self.targets[i * self.cur_bs:(i + 1) *
self.cur_bs])
self.batched_data.append(split_data)
    def get_fake_samples(self, batch_size, shape, input_type):
        data = {}
        if input_type:
            i = 0
            for key, val in shape.items():
                # Only the image input scales with batch size; the text input
                # (class-name tokens) is shared across the whole batch.
                if key == "image":
                    val = [val[0] * batch_size] + val[1:]
                data[key] = np.random.random(size=val).astype(
                    INPUT_TYPE[input_type[i]])
                i += 1
            return data
        else:
            raise ValueError("Please provide input type")
def _convert_image_to_rgb(image):
return image.convert("RGB")
def _transform():
return Compose([
Resize(224, interpolation=BICUBIC),
CenterCrop(224),
_convert_image_to_rgb,
ToTensor(),
Normalize((0.48145466, 0.4578275, 0.40821073),
(0.26862954, 0.26130258, 0.27577711)),
])
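# Shape check for the pipeline above, assuming torchvision and PIL are
# installed: a 32x32 CIFAR image comes out as a CLIP-style normalized
# (3, 224, 224) float tensor.
#
#   from PIL import Image
#   import numpy as np
#   img = Image.fromarray(np.zeros((32, 32, 3), dtype=np.uint8))
#   _transform()(img).shape   # torch.Size([3, 224, 224])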
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import numpy as np
from general_perf.datasets import test_accuracy
from tqdm import tqdm
log = logging.getLogger("TestAccuracy")
class AccuracyChecker(test_accuracy.AccuracyChecker):
def calculate_acc(self, data_percent):
log.info("Start to calculate accuracy...")
num = int((data_percent / 100) * self.dataloader.get_batch_count()
) if data_percent else self.dataloader.get_batch_count()
good, total = 0, 0
diffs = []
for i in tqdm(range(num)):
test_data, labels = self.dataloader.get_samples(i)
logits_per_image, logits_per_text = self.runtime_backend.predict(
test_data)
diffs.append(logits_per_image)
for j in range(len(logits_per_image)):
probs = logits_per_image[j]
if np.argmax(probs) == labels[j]:
good += 1
total += 1
accuracy = round((good / total), 5)
np.save(self.output_dir + "/{}.npy".format(self.dataloader.name()),
diffs,
allow_pickle=True)
log.info('Batch size is {}, Accuracy: {}'.format(
self.dataloader.cur_bs, accuracy))
return {"Top-1": accuracy}
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
import numpy as np
from general_perf.datasets import data_loader
from tqdm import tqdm
log = logging.getLogger("CriteoKaggle")
class DataLoader(data_loader.Dataset):
def __init__(self, config):
super(DataLoader, self).__init__(config)
log.info("Initial...")
self.config = config
self.cur_bs = 1
if not os.path.exists("general_perf/datasets/{}/numeric.npy".format(
self.config['dataset_name'])):
from general_perf.datasets.open_criteo_kaggle.preprocess_dataset import csv_to_numpy
csv_to_numpy(
"general_perf/datasets/{}/eval.csv".format(
self.config['dataset_name']),
"general_perf/datasets/{}/".format(self.config['dataset_name']))
num = np.load("general_perf/datasets/{}/numeric.npy".format(
self.config['dataset_name']))
cat = np.load("general_perf/datasets/{}/categorical.npy".format(
self.config['dataset_name']))
label = np.load("general_perf/datasets/{}/label.npy".format(
self.config['dataset_name']))
self.items = len(num)
self.batch_num = int(self.items / self.cur_bs)
self.feed_dict = {}
for i in tqdm(range(cat.shape[0])):
if i == 0:
self.feed_dict["new_categorical_placeholder:0"] = list(
cat[i].reshape(-1, 2))
self.feed_dict["new_numeric_placeholder:0"] = list(
num[i].reshape(1, -1))
self.feed_dict["label"] = list(label[i])
else:
self.feed_dict["new_categorical_placeholder:0"].extend(
cat[i].reshape(-1, 2))
self.feed_dict["new_numeric_placeholder:0"].extend(
num[i].reshape(1, -1))
self.feed_dict["label"].extend(label[i])
self.feed_dict['new_categorical_placeholder:0'] = np.array(
self.feed_dict['new_categorical_placeholder:0'], dtype=np.int64)
self.feed_dict['new_numeric_placeholder:0'] = np.array(
self.feed_dict['new_numeric_placeholder:0'], dtype=np.float32)
self.feed_dict['label'] = np.array(self.feed_dict['label'],
dtype=np.int64)
def name(self):
return self.config['dataset_name']
def preprocess(self):
log.info("Preprocessing...")
self.rebatch(self.cur_bs, skip=False)
def rebatch(self, new_bs, skip=True):
log.info("Rebatching batch size to: {} ...".format(new_bs))
if self.cur_bs == new_bs and skip:
return
self.cur_bs = new_bs
self.batch_num = int(self.items / self.cur_bs)
self.batched_data = []
self.labels = []
for i in tqdm(range(self.batch_num)):
split_data = {
'new_categorical_placeholder:0':
self.feed_dict["new_categorical_placeholder:0"][i *
self.cur_bs *
26:(i + 1) *
self.cur_bs *
26, ],
'new_numeric_placeholder:0':
self.feed_dict["new_numeric_placeholder:0"][
i * self.cur_bs:(i + 1) * self.cur_bs, ],
}
self.labels.append(
self.feed_dict["label"][i * self.cur_bs:(i + 1) *
self.cur_bs, ])
self.batched_data.append(split_data)
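# Each Criteo sample carries 26 categorical (index, hash) pairs and 13
# numeric features, which is why the categorical tensor is sliced in steps of
# cur_bs * 26 while the numeric tensor is sliced in steps of cur_bs.
# Expected shapes for one batch with cur_bs = 4:
#
#   split_data['new_categorical_placeholder:0'].shape  # (104, 2)
#   split_data['new_numeric_placeholder:0'].shape      # (4, 13)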
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import pandas
import argparse
import numpy as np
import tensorflow as tf
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('--input',
type=str,
default="eval.csv",
help='full path of data file e.g. eval.csv',
dest='evaldatafile_path',
required=True)
args = parser.parse_args()
return args
def version_is_less_than(a, b):
    """Compare dotted version strings component by component."""
    a_parts = a.split('.')
    b_parts = b.split('.')
    for i in range(len(a_parts)):
        if int(a_parts[i]) == int(b_parts[i]):
            continue
        if int(a_parts[i]) < int(b_parts[i]):
            print('{} < {}, version_is_less_than() returning True'.format(
                a_parts[i], b_parts[i]))
            return True
        # A greater component decides the comparison immediately.
        return False
    return False
def csv_to_numpy(eval_csv_file, output):
print("TensorFlow version {}".format(tf.__version__))
required_tf_version = '2.0.0'
if version_is_less_than(tf.__version__, required_tf_version):
tf.compat.v1.enable_eager_execution()
# args = parse_args()
# eval_csv_file = args.evaldatafile_path
csv = pandas.read_csv(eval_csv_file, header=None)
if len(csv.columns) == 39:
dataset_type = 'test'
else:
dataset_type = 'eval'
fill_na_dict = {}
if dataset_type == 'test':
for i in range(0, 13):
fill_na_dict[i] = 0.0
for i in range(13, 39):
fill_na_dict[i] = ""
else:
for i in range(1, 14):
fill_na_dict[i] = 0.0
for i in range(14, 40):
fill_na_dict[i] = ""
csv = csv.fillna(value=fill_na_dict).values
LABEL_COLUMN = ["clicked"]
CATEGORICAL_COLUMNS1 = ["C" + str(i) + "_embedding" for i in range(1, 27)]
NUMERIC_COLUMNS1 = ["I" + str(i) for i in range(1, 14)]
CATEGORICAL_COLUMNS2 = ["C" + str(i) + "_embedding" for i in range(1, 27)]
NUMERIC_COLUMNS2 = ["I" + str(i) for i in range(1, 14)]
DATA_COLUMNS = LABEL_COLUMN + NUMERIC_COLUMNS1 + CATEGORICAL_COLUMNS1
CATEGORICAL_COLUMNS1.sort()
NUMERIC_COLUMNS1.sort()
with open(eval_csv_file, 'r') as f:
nums = [line.strip('\n\r').split(',') for line in f.readlines()]
numpy_arr = np.array(nums)
numpy_arr[numpy_arr == ''] = '0'
min_list, max_list, range_list = [], [], []
for i in range(len(DATA_COLUMNS)):
if DATA_COLUMNS[i] in NUMERIC_COLUMNS1:
col_min = numpy_arr[:, i].astype(np.float32).min()
col_max = numpy_arr[:, i].astype(np.float32).max()
min_list.append(col_min)
max_list.append(col_max)
range_list.append(col_max - col_min)
print('min list', min_list)
print('max list', max_list)
print('range list', range_list)
all_data = []
no_of_rows = 0
for row in csv:
no_of_rows = no_of_rows + 1
unnormalized_vals = np.array(row[1:14])
normalized_vals = (unnormalized_vals - min_list) / range_list
new_categorical_dict = dict(zip(CATEGORICAL_COLUMNS2, row[14:40]))
new_categorical_list = []
for i in CATEGORICAL_COLUMNS1:
if pandas.isnull(new_categorical_dict[i]):
new_categorical_list.append("")
else:
new_categorical_list.append(new_categorical_dict[i])
if tf.executing_eagerly():
hash_values = tf.strings.to_hash_bucket_fast(
new_categorical_list, 1000).numpy()
else:
hash_tensor = tf.strings.to_hash_bucket_fast(
new_categorical_list, 1000)
with tf.compat.v1.Session() as sess:
hash_values = hash_tensor.eval()
new_numerical_dict = dict(zip(NUMERIC_COLUMNS2, normalized_vals))
item_data = {
"new_numeric_placeholder": [],
"new_categorical_placeholder": [],
"label": []
}
for i in NUMERIC_COLUMNS1:
item_data["new_numeric_placeholder"].extend(
[new_numerical_dict[i]])
for i in range(0, 26):
item_data["new_categorical_placeholder"].extend([i])
item_data["new_categorical_placeholder"].extend([hash_values[i]])
item_data["label"].append(row[0])
all_data.append(item_data)
    wnd_num = []
    wnd_cate = []
    wnd_label = []
    for data in all_data:
        wnd_num.append(data["new_numeric_placeholder"])
        wnd_cate.append(data["new_categorical_placeholder"])
        wnd_label.append(data["label"])
    np.save(os.path.join(output, "numeric.npy"), np.array(wnd_num))
    np.save(os.path.join(output, "categorical.npy"), np.array(wnd_cate))
    np.save(os.path.join(output, "label.npy"), np.array(wnd_label))
    print('Total number of rows ', no_of_rows)
    print('Generated output files: numeric.npy, categorical.npy, label.npy')
if __name__ == "__main__":
    # Standalone usage: parse --input and write the .npy files next to it.
    args = parse_args()
    csv_to_numpy(args.evaldatafile_path,
                 os.path.dirname(args.evaldatafile_path) or ".")
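# Example standalone invocation, assuming eval.csv sits in the current
# directory; it writes numeric.npy, categorical.npy and label.npy alongside:
#
#   python preprocess_dataset.py --input eval.csv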
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import numpy as np
from general_perf.datasets import test_accuracy
from tqdm import tqdm
log = logging.getLogger("TestAccuracy")
class AccuracyChecker(test_accuracy.AccuracyChecker):
def calculate_acc(self, data_percent):
log.info("Start to calculate accuracy...")
num = int((data_percent / 100) * self.dataloader.get_batch_count()
) if data_percent else self.dataloader.get_batch_count()
good, total = 0, 0
diffs = []
for i in tqdm(range(num)):
test_data, labels = self.dataloader.get_samples(i)
results = self.runtime_backend.predict(test_data)
results = results[list(results)[0]]
diffs.append(results)
for j in range(len(results)):
if np.argmax(results[j].round()) == labels[j].round():
good += 1
total += 1
accuracy = round((good / total), 5)
np.save(self.output_dir + "/{}.npy".format(self.dataloader.name()),
diffs)
log.info('Batch size is {}, Accuracy: {}'.format(
self.dataloader.cur_bs, accuracy))
return {"Top-1": accuracy}
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
import re
import time
import cv2
import numpy as np
import random
from tqdm import tqdm
from general_perf.datasets import data_loader
log = logging.getLogger("Imagenet")
INPUT_TYPE = {
"UINT8": np.uint8,
"FLOAT32": np.float32,
}
class DataLoader(data_loader.Dataset):
def __init__(self, config):
super(DataLoader, self).__init__(config)
log.info("Initial...")
self.config = config
self.cur_bs = 1
self.image_size = [224, 224, 3]
if self.config['framework'] == 'Tensorflow':
image_format = "NHWC"
pre_process = pre_process_vgg
else:
image_format = "NCHW"
if 'resnet50' in self.config['model']:
pre_process = pre_process_imagenet_pytorch
else:
pre_process = pre_process_imagenet_vit
cache_dir = os.getcwd() + \
"/general_perf/datasets/{}".format(self.config['dataset_name'])
self.input_name = self.config['inputs']
self.image_list = []
self.label_list = []
self.count = None
self.use_cache = 0
self.cache_dir = os.path.join(cache_dir, "preprocessed",
self.config['model'])
self.data_path = "general_perf/datasets/{}/ILSVRC2012_img_val".format(
self.config['dataset_name'])
self.pre_process = pre_process
self.items = 0
# input images are in HWC
self.need_transpose = True if image_format == "NCHW" else False
not_found = 0
os.makedirs(self.cache_dir, exist_ok=True)
image_list = 'general_perf/datasets/{}/val_map.txt'.format(
self.config['dataset_name'])
start = time.time()
with open(image_list, 'r') as f:
for s in tqdm(f):
image_name, label = re.split(r"\s+", s.strip())
src = os.path.join(self.data_path, image_name)
if not os.path.exists(src):
                    # if the image does not exist, ignore it
not_found += 1
continue
os.makedirs(os.path.dirname(
os.path.join(self.cache_dir, image_name)),
exist_ok=True)
dst = os.path.join(self.cache_dir, image_name)
if not os.path.exists(dst + ".npy"):
img_org = cv2.imread(src)
processed = self.pre_process(
img_org,
need_transpose=self.need_transpose,
dims=self.image_size)
np.save(dst, processed)
self.image_list.append(image_name)
self.label_list.append(int(label) + 1)
self.items = len(self.image_list)
# limit the dataset if requested
if self.count and len(self.image_list) >= self.count:
break
time_taken = time.time() - start
if not self.image_list:
log.error("no images in image list found")
raise ValueError("no images in image list found")
if not_found > 0:
log.info("reduced image list, %d images not found", not_found)
log.info("loaded {} images, cache={}, took={:.1f}sec".format(
len(self.image_list), self.use_cache, time_taken))
self.label_list = np.array(self.label_list)
self.batch_num = int(self.items / self.cur_bs)
self.shuffle_index = [i for i in range(self.items)]
random.seed(7)
random.shuffle(self.shuffle_index)
def name(self):
return self.config['dataset_name']
def preprocess(self):
log.info("Preprocessing...")
self.rebatch(self.cur_bs, skip=False)
def rebatch(self, new_bs, skip=True):
log.info("Rebatching batch size to: {} ...".format(new_bs))
if self.cur_bs == new_bs and skip:
return
self.cur_bs = new_bs
self.batch_num = int(self.items / self.cur_bs)
self.batched_data = []
self.labels = []
for i in tqdm(range(self.batch_num)):
split_data, labels = [], []
for j in range(i * self.cur_bs, (i + 1) * self.cur_bs):
output, label = self.get_item(self.shuffle_index[j])
split_data.append(output)
labels.append(label)
self.labels.append(labels)
self.batched_data.append({self.input_name: np.array(split_data)})
def get_samples(self, sample_id):
if sample_id >= len(self.batched_data) or sample_id < 0:
raise ValueError("Your Input ID: {} is out of range: {}".format(
sample_id, len(self.batched_data)))
return self.batched_data[sample_id], self.labels[sample_id]
def get_item(self, nr):
"""Get image by number in the list."""
dst = os.path.join(self.cache_dir, self.image_list[nr])
img = np.load(dst + ".npy")
return img, self.label_list[nr]
#
# pre-processing
#
def center_crop(img, out_height, out_width):
height, width, _ = img.shape
left = int((width - out_width) / 2)
right = int((width + out_width) / 2)
top = int((height - out_height) / 2)
bottom = int((height + out_height) / 2)
img = img[top:bottom, left:right]
return img
def resize_with_aspectratio(img,
out_height,
out_width,
scale=87.5,
inter_pol=cv2.INTER_LINEAR):
height, width, _ = img.shape
new_height = int(100. * out_height / scale)
new_width = int(100. * out_width / scale)
if height > width:
w = new_width
h = int(new_height * height / width)
else:
h = new_height
w = int(new_width * width / height)
img = cv2.resize(img, (w, h), interpolation=inter_pol)
return img
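# Worked example: with out_height = out_width = 224 and the default
# scale = 87.5, the resize target is int(100 * 224 / 87.5) = 256 on the
# shorter side, which the subsequent 224x224 center crop then trims, matching
# the standard ImageNet evaluation pipeline.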
def pre_process_vgg(img, dims=None, need_transpose=False):
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
output_height, output_width, _ = dims
cv2_interpol = cv2.INTER_AREA
img = resize_with_aspectratio(img,
output_height,
output_width,
inter_pol=cv2_interpol)
img = center_crop(img, output_height, output_width)
img = np.asarray(img, dtype='float32')
# normalize image
means = np.array([123.68, 116.78, 103.94], dtype=np.float32)
img -= means
# transpose if needed
if need_transpose:
img = img.transpose([2, 0, 1])
return img
def pre_process_imagenet_pytorch(img, dims=None, need_transpose=False):
from PIL import Image
import torchvision.transforms.functional as F
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
img = Image.fromarray(img)
img = F.resize(img, 256, Image.BILINEAR)
img = F.center_crop(img, 224)
img = F.to_tensor(img)
img = F.normalize(img,
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225],
inplace=False)
if not need_transpose:
img = img.permute(1, 2, 0) # NHWC
img = np.asarray(img, dtype='float32')
return img
def pre_process_imagenet_vit(img, dims=None, need_transpose=False):
from PIL import Image
import torchvision.transforms.functional as F
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
img = Image.fromarray(img)
img = F.resize(img, 256, Image.BILINEAR)
img = F.center_crop(img, 384)
img = F.to_tensor(img)
img = F.normalize(img,
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225],
inplace=False)
if not need_transpose:
img = img.permute(1, 2, 0) # NHWC
img = np.asarray(img, dtype='float32')
return img
def maybe_resize(img, dims):
img = np.array(img, dtype=np.float32)
if len(img.shape) < 3 or img.shape[2] != 3:
# some images might be grayscale
img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    if dims is not None:
im_height, im_width, _ = dims
img = cv2.resize(img, (im_width, im_height),
interpolation=cv2.INTER_LINEAR)
return img
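# Quick shape sanity check for the preprocessing paths above, using a fake
# BGR image (cv2, torch and torchvision assumed installed):
#
#   import numpy as np
#   img = np.zeros((300, 400, 3), dtype=np.uint8)
#   pre_process_vgg(img, dims=[224, 224, 3]).shape                 # (224, 224, 3)
#   pre_process_imagenet_pytorch(img, need_transpose=True).shape   # (3, 224, 224)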
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import numpy as np
from general_perf.datasets import test_accuracy
from tqdm import tqdm
import torch
log = logging.getLogger("TestAccuracy")
class AccuracyChecker(test_accuracy.AccuracyChecker):
def calculate_acc(self, data_percent):
log.info("Start to calculate accuracy...")
num = int((data_percent / 100) * self.dataloader.get_batch_count()
) if data_percent else self.dataloader.get_batch_count()
good, total = 0, 0
diffs = []
for i in tqdm(range(num)):
test_data, labels = self.dataloader.get_samples(i)
results = self.runtime_backend.predict(test_data)
if "resnet50-tf-fp16" in self.configs["model"]:
if 'classes' in results:
del results['classes']
results = self._post_processing(results, self.configs['framework'])
diffs.append(results)
for j in range(len(results)):
if np.argmax(results[j]) == labels[j]:
good += 1
total += 1
accuracy = round((good / total), 5)
log.info('Batch size is {}, Accuracy: {}'.format(
self.dataloader.cur_bs, accuracy))
np.save(self.output_dir + "/{}.npy".format(self.dataloader.name()),
diffs)
return {"Top-1": accuracy}
def _post_processing(self, inputs, framework):
if framework == "Onnx":
if self.configs["model"] != "resnet50-onnxruntime-fp16":
if isinstance(inputs, list):
inputs = list(inputs[0])
elif isinstance(inputs, dict):
key = list(inputs.keys())[0]
inputs = list(inputs[key])
else:
            if isinstance(inputs, tuple):
                # bfloat16 has no native numpy dtype, so upcast to float first.
                if inputs[0].dtype == torch.bfloat16:
                    inputs = inputs[0].float().cpu().numpy().astype(float)
                else:
                    inputs = inputs[0].cpu().numpy().astype(float)
else:
inputs = inputs[list(inputs)[0]]
if framework == "Pytorch" or framework == "Onnx":
inputs = np.array(
[np.insert(inputs[i], 0, 0) for i in range(len(inputs))])
return inputs
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import collections
import json
import math
import os
import sys
sys.path.insert(0, os.path.dirname(__file__))
import numpy as np
import six
from bert import tokenization
# To support feature cache.
import pickle
max_seq_length = 384
max_query_length = 64
doc_stride = 128
RawResult = collections.namedtuple("RawResult",
["unique_id", "start_logits", "end_logits"])
dtype_map = {
"int8": np.int8,
"int16": np.int16,
"int32": np.int32,
"int64": np.int64,
"float16": np.float16,
"float32": np.float32,
"float64": np.float64
}
def get_final_text(pred_text, orig_text, do_lower_case):
"""Project the tokenized prediction back to the original text."""
# When we created the data, we kept track of the alignment between original
# (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
# now `orig_text` contains the span of our original text corresponding to the
# span that we predicted.
#
# However, `orig_text` may contain extra characters that we don't want in
# our prediction.
#
# For example, let's say:
# pred_text = steve smith
# orig_text = Steve Smith's
#
# We don't want to return `orig_text` because it contains the extra "'s".
#
# We don't want to return `pred_text` because it's already been normalized
# (the SQuAD eval script also does punctuation stripping/lower casing but
# our tokenizer does additional normalization like stripping accent
# characters).
#
# What we really want to return is "Steve Smith".
#
    # Therefore, we have to apply a semi-complicated alignment heuristic between
    # `pred_text` and `orig_text` to get a character-to-character alignment. This
# can fail in certain cases in which case we just return `orig_text`.
def _strip_spaces(text):
ns_chars = []
ns_to_s_map = collections.OrderedDict()
for (i, c) in enumerate(text):
if c == " ":
continue
ns_to_s_map[len(ns_chars)] = i
ns_chars.append(c)
ns_text = "".join(ns_chars)
return (ns_text, ns_to_s_map)
# We first tokenize `orig_text`, strip whitespace from the result
# and `pred_text`, and check if they are the same length. If they are
# NOT the same length, the heuristic has failed. If they are the same
# length, we assume the characters are one-to-one aligned.
tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)
tok_text = " ".join(tokenizer.tokenize(orig_text))
start_position = tok_text.find(pred_text)
if start_position == -1:
return orig_text
end_position = start_position + len(pred_text) - 1
(orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
(tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
if len(orig_ns_text) != len(tok_ns_text):
return orig_text
# We then project the characters in `pred_text` back to `orig_text` using
# the character-to-character alignment.
tok_s_to_ns_map = {}
for (i, tok_index) in six.iteritems(tok_ns_to_s_map):
tok_s_to_ns_map[tok_index] = i
orig_start_position = None
if start_position in tok_s_to_ns_map:
ns_start_position = tok_s_to_ns_map[start_position]
if ns_start_position in orig_ns_to_s_map:
orig_start_position = orig_ns_to_s_map[ns_start_position]
if orig_start_position is None:
return orig_text
orig_end_position = None
if end_position in tok_s_to_ns_map:
ns_end_position = tok_s_to_ns_map[end_position]
if ns_end_position in orig_ns_to_s_map:
orig_end_position = orig_ns_to_s_map[ns_end_position]
if orig_end_position is None:
return orig_text
output_text = orig_text[orig_start_position:(orig_end_position + 1)]
return output_text
def _get_best_indexes(logits, n_best_size):
"""Get the n-best logits from a list."""
index_and_score = sorted(enumerate(logits),
key=lambda x: x[1],
reverse=True)
best_indexes = []
for i in range(len(index_and_score)):
if i >= n_best_size:
break
best_indexes.append(index_and_score[i][0])
return best_indexes
def _compute_softmax(scores):
"""Compute softmax probability over raw logits."""
if not scores:
return []
max_score = None
for score in scores:
if max_score is None or score > max_score:
max_score = score
exp_scores = []
total_sum = 0.0
for score in scores:
x = math.exp(score - max_score)
exp_scores.append(x)
total_sum += x
probs = []
for score in exp_scores:
probs.append(score / total_sum)
return probs
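# Sanity check for the numerically stable softmax above:
#
#   _compute_softmax([1.0, 2.0, 3.0])
#   # -> [0.09003..., 0.24472..., 0.66524...]  (sums to 1.0)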
def write_predictions(all_examples,
all_features,
all_results,
n_best_size,
max_answer_length,
do_lower_case,
output_prediction_file,
max_examples=None):
"""Write final predictions to the json file and log-odds of null if needed."""
print("Writing predictions to: %s" % (output_prediction_file))
example_index_to_features = collections.defaultdict(list)
for feature in all_features:
example_index_to_features[feature.example_index].append(feature)
unique_id_to_result = {}
for result in all_results:
unique_id_to_result[result.unique_id] = result
_PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name
"PrelimPrediction", [
"feature_index", "start_index", "end_index", "start_logit",
"end_logit"
])
all_predictions = collections.OrderedDict()
all_nbest_json = collections.OrderedDict()
scores_diff_json = collections.OrderedDict()
for (example_index, example) in enumerate(all_examples):
if max_examples and example_index == max_examples: break
features = example_index_to_features[example_index]
prelim_predictions = []
# keep track of the minimum score of null start+end of position 0
score_null = 1000000 # large and positive
        min_null_feature_index = 0  # the paragraph slice with min null score
null_start_logit = 0 # the start logit at the slice with min null score
null_end_logit = 0 # the end logit at the slice with min null score
for (feature_index, feature) in enumerate(features):
# FIX: During compliance/audit runs, we only generate a small subset of
# all entries from the dataset. As a result, sometimes dict retrieval
# fails because a key is missing.
# result = unique_id_to_result[feature.unique_id]
result = unique_id_to_result.get(feature.unique_id, None)
if result is None:
continue
start_indexes = _get_best_indexes(result.start_logits, n_best_size)
end_indexes = _get_best_indexes(result.end_logits, n_best_size)
# if we could have irrelevant answers, get the min score of irrelevant
for start_index in start_indexes:
for end_index in end_indexes:
# We could hypothetically create invalid predictions, e.g., predict
# that the start of the span is in the question. We throw out all
# invalid predictions.
if start_index >= len(feature.tokens):
continue
if end_index >= len(feature.tokens):
continue
if start_index not in feature.token_to_orig_map:
continue
if end_index not in feature.token_to_orig_map:
continue
if not feature.token_is_max_context.get(
start_index, False):
continue
if end_index < start_index:
continue
length = end_index - start_index + 1
if length > max_answer_length:
continue
prelim_predictions.append(
_PrelimPrediction(
feature_index=feature_index,
start_index=start_index,
end_index=end_index,
start_logit=result.start_logits[start_index],
end_logit=result.end_logits[end_index]))
prelim_predictions = sorted(prelim_predictions,
key=lambda x:
(x.start_logit + x.end_logit),
reverse=True)
_NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name
"NbestPrediction", ["text", "start_logit", "end_logit"])
seen_predictions = {}
nbest = []
for pred in prelim_predictions:
if len(nbest) >= n_best_size:
break
feature = features[pred.feature_index]
tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]
orig_doc_start = feature.token_to_orig_map[pred.start_index]
orig_doc_end = feature.token_to_orig_map[pred.end_index]
orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
tok_text = " ".join(tok_tokens)
# De-tokenize WordPieces that have been split off.
tok_text = tok_text.replace(" ##", "")
tok_text = tok_text.replace("##", "")
# Clean whitespace
tok_text = tok_text.strip()
tok_text = " ".join(tok_text.split())
orig_text = " ".join(orig_tokens)
final_text = get_final_text(tok_text, orig_text, do_lower_case)
if final_text in seen_predictions:
continue
seen_predictions[final_text] = True
nbest.append(
_NbestPrediction(text=final_text,
start_logit=pred.start_logit,
end_logit=pred.end_logit))
# In very rare edge cases we could have no valid predictions. So we
# just create a nonce prediction in this case to avoid failure.
if not nbest:
nbest.append(
_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
assert len(nbest) >= 1
total_scores = []
best_non_null_entry = None
for entry in nbest:
total_scores.append(entry.start_logit + entry.end_logit)
if not best_non_null_entry:
if entry.text:
best_non_null_entry = entry
probs = _compute_softmax(total_scores)
nbest_json = []
for (i, entry) in enumerate(nbest):
output = collections.OrderedDict()
output["text"] = entry.text
output["probability"] = probs[i]
output["start_logit"] = entry.start_logit
output["end_logit"] = entry.end_logit
nbest_json.append(output)
assert len(nbest_json) >= 1
all_predictions[example.qas_id] = nbest_json[0]["text"]
with open(output_prediction_file, "w") as writer:
writer.write(json.dumps(all_predictions, indent=4) + "\n")
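# Backend outputs are expected to arrive as RawResult tuples keyed by the
# feature's unique_id; a minimal sketch of driving write_predictions (paths
# and logits here are placeholders):
#
#   results = [RawResult(unique_id=f.unique_id,
#                        start_logits=[...], end_logits=[...])
#              for f in eval_features]
#   write_predictions(eval_examples, eval_features, results,
#                     n_best_size=20, max_answer_length=30,
#                     do_lower_case=True,
#                     output_prediction_file="predictions.json")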
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from collections import Counter
import string
import re
import argparse
import json
import sys
def normalize_answer(s):
"""Lower text and remove punctuation, articles and extra whitespace."""
def remove_articles(text):
return re.sub(r'\b(a|an|the)\b', ' ', text)
def white_space_fix(text):
return ' '.join(text.split())
def remove_punc(text):
exclude = set(string.punctuation)
return ''.join(ch for ch in text if ch not in exclude)
def lower(text):
return text.lower()
return white_space_fix(remove_articles(remove_punc(lower(s))))
def f1_score(prediction, ground_truth):
prediction_tokens = normalize_answer(prediction).split()
ground_truth_tokens = normalize_answer(ground_truth).split()
common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
num_same = sum(common.values())
if num_same == 0:
return 0
precision = 1.0 * num_same / len(prediction_tokens)
recall = 1.0 * num_same / len(ground_truth_tokens)
f1 = (2 * precision * recall) / (precision + recall)
return f1
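# Worked example of the token-overlap F1 above:
#
#   f1_score("Steve Smith", "the Steve Smith's")
#   # normalized tokens: ["steve", "smith"] vs ["steve", "smiths"]
#   # overlap = 1, precision = 1/2, recall = 1/2  ->  F1 = 0.5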
def exact_match_score(prediction, ground_truth):
return (normalize_answer(prediction) == normalize_answer(ground_truth))
def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
scores_for_ground_truths = []
for ground_truth in ground_truths:
score = metric_fn(prediction, ground_truth)
scores_for_ground_truths.append(score)
return max(scores_for_ground_truths)
def evaluate(dataset, predictions, num):
f1 = exact_match = total = 0
for article in dataset:
for paragraph in article['paragraphs']:
for qa in paragraph['qas']:
total += 1
if qa['id'] not in predictions:
message = 'Unanswered question ' + qa['id'] + \
' will receive score 0.'
print(message, file=sys.stderr)
continue
ground_truths = list(map(lambda x: x['text'], qa['answers']))
prediction = predictions[qa['id']]
exact_match += metric_max_over_ground_truths(
exact_match_score, prediction, ground_truths)
f1 += metric_max_over_ground_truths(f1_score, prediction,
ground_truths)
total = num
exact_match = round(100.0 * exact_match / total, 5)
f1 = round(100.0 * f1 / total, 5)
return {'Exact Match': exact_match, 'F1 Score': f1}
def check_accuracy(dataset_file, prediction_file, num):
expected_version = '1.1'
with open(dataset_file) as dataset_file:
dataset_json = json.load(dataset_file)
if (dataset_json['version'] != expected_version):
print('Evaluation expects v-' + expected_version +
', but got dataset with v-' + dataset_json['version'],
file=sys.stderr)
dataset = dataset_json['data']
with open(prediction_file) as prediction_file:
predictions = json.load(prediction_file)
return evaluate(dataset, predictions, num)
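# Example: scoring a prediction file against SQuAD v1.1, where num is the
# number of evaluated samples (file names are placeholders):
#
#   check_accuracy("dev-v1.1.json", "predictions.json", num=100)
#   # -> {'Exact Match': ..., 'F1 Score': ...}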
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import json
import tokenization
import six
class SquadExample(object):
"""A single training/test example for simple sequence classification.
For examples without an answer, the start and end position are -1.
"""
def __init__(self,
qas_id,
question_text,
doc_tokens,
orig_answer_text=None,
start_position=None,
end_position=None,
is_impossible=False):
self.qas_id = qas_id
self.question_text = question_text
self.doc_tokens = doc_tokens
self.orig_answer_text = orig_answer_text
self.start_position = start_position
self.end_position = end_position
self.is_impossible = is_impossible
def __str__(self):
return self.__repr__()
def __repr__(self):
s = ""
s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
s += ", question_text: %s" % (tokenization.printable_text(
self.question_text))
s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
if self.start_position:
s += ", start_position: %d" % (self.start_position)
if self.start_position:
s += ", end_position: %d" % (self.end_position)
if self.start_position:
s += ", is_impossible: %r" % (self.is_impossible)
return s
class InputFeatures(object):
"""A single set of features of data."""
def __init__(self,
unique_id,
example_index,
doc_span_index,
tokens,
token_to_orig_map,
token_is_max_context,
input_ids,
input_mask,
segment_ids,
start_position=None,
end_position=None,
is_impossible=None):
self.unique_id = unique_id
self.example_index = example_index
self.doc_span_index = doc_span_index
self.tokens = tokens
self.token_to_orig_map = token_to_orig_map
self.token_is_max_context = token_is_max_context
self.input_ids = input_ids
self.input_mask = input_mask
self.segment_ids = segment_ids
self.start_position = start_position
self.end_position = end_position
self.is_impossible = is_impossible
def read_squad_examples(input_file,
is_training,
version_2_with_negative=False):
"""Read a SQuAD json file into a list of SquadExample."""
with open(input_file) as reader:
input_data = json.load(reader)["data"]
def is_whitespace(c):
if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
return True
return False
examples = []
for entry in input_data:
for paragraph in entry["paragraphs"]:
paragraph_text = paragraph["context"]
doc_tokens = []
char_to_word_offset = []
prev_is_whitespace = True
for c in paragraph_text:
if is_whitespace(c):
prev_is_whitespace = True
else:
if prev_is_whitespace:
doc_tokens.append(c)
else:
doc_tokens[-1] += c
prev_is_whitespace = False
char_to_word_offset.append(len(doc_tokens) - 1)
for qa in paragraph["qas"]:
qas_id = qa["id"]
question_text = qa["question"]
start_position = None
end_position = None
orig_answer_text = None
is_impossible = False
if is_training:
if version_2_with_negative:
is_impossible = qa["is_impossible"]
if (len(qa["answers"]) != 1) and (not is_impossible):
raise ValueError(
"For training, each question should have exactly 1 answer."
)
if not is_impossible:
answer = qa["answers"][0]
orig_answer_text = answer["text"]
answer_offset = answer["answer_start"]
answer_length = len(orig_answer_text)
start_position = char_to_word_offset[answer_offset]
end_position = char_to_word_offset[answer_offset +
answer_length - 1]
# Only add answers where the text can be exactly recovered from the
# document. If this CAN'T happen it's likely due to weird Unicode
# stuff so we will just skip the example.
#
# Note that this means for training mode, every example is NOT
# guaranteed to be preserved.
actual_text = " ".join(
doc_tokens[start_position:(end_position + 1)])
cleaned_answer_text = " ".join(
tokenization.whitespace_tokenize(orig_answer_text))
if actual_text.find(cleaned_answer_text) == -1:
print("Could not find answer: '%s' vs. '%s'",
actual_text, cleaned_answer_text)
continue
else:
start_position = -1
end_position = -1
orig_answer_text = ""
example = SquadExample(qas_id=qas_id,
question_text=question_text,
doc_tokens=doc_tokens,
orig_answer_text=orig_answer_text,
start_position=start_position,
end_position=end_position,
is_impossible=is_impossible)
examples.append(example)
return examples
def _check_is_max_context(doc_spans, cur_span_index, position):
"""Check if this is the 'max context' doc span for the token."""
# Because of the sliding window approach taken to scoring documents, a single
# token can appear in multiple documents. E.g.
# Doc: the man went to the store and bought a gallon of milk
# Span A: the man went to the
# Span B: to the store and bought
# Span C: and bought a gallon of
# ...
#
# Now the word 'bought' will have two scores from spans B and C. We only
# want to consider the score with "maximum context", which we define as
# the *minimum* of its left and right context (the *sum* of left and
# right context will always be the same, of course).
#
# In the example the maximum context for 'bought' would be span C since
# it has 1 left context and 3 right context, while span B has 4 left context
# and 0 right context.
best_score = None
best_span_index = None
for (span_index, doc_span) in enumerate(doc_spans):
end = doc_span.start + doc_span.length - 1
if position < doc_span.start:
continue
if position > end:
continue
num_left_context = position - doc_span.start
num_right_context = end - position
score = min(num_left_context,
num_right_context) + 0.01 * doc_span.length
if best_score is None or score > best_score:
best_score = score
best_span_index = span_index
return cur_span_index == best_span_index
def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
orig_answer_text):
"""Returns tokenized answer spans that better match the annotated answer."""
# The SQuAD annotations are character based. We first project them to
# whitespace-tokenized words. But then after WordPiece tokenization, we can
# often find a "better match". For example:
#
# Question: What year was John Smith born?
# Context: The leader was John Smith (1895-1943).
# Answer: 1895
#
# The original whitespace-tokenized answer will be "(1895-1943).". However
# after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match
# the exact answer, 1895.
#
# However, this is not always possible. Consider the following:
#
    # Question: What country is the top exporter of electronics?
    # Context: The Japanese electronics industry is the largest in the world.
# Answer: Japan
#
# In this case, the annotator chose "Japan" as a character sub-span of
# the word "Japanese". Since our WordPiece tokenizer does not split
# "Japanese", we just use "Japanese" as the annotation. This is fairly rare
# in SQuAD, but does happen.
tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
for new_start in range(input_start, input_end + 1):
for new_end in range(input_end, new_start - 1, -1):
text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
if text_span == tok_answer_text:
return (new_start, new_end)
return (input_start, input_end)
def convert_examples_to_features(examples,
tokenizer,
max_seq_length,
doc_stride,
max_query_length,
is_training,
output_fn,
verbose_logging=False):
"""Loads a data file into a list of `InputBatch`s."""
unique_id = 1000000000
for (example_index, example) in enumerate(examples):
query_tokens = tokenizer.tokenize(example.question_text)
if len(query_tokens) > max_query_length:
query_tokens = query_tokens[0:max_query_length]
tok_to_orig_index = []
orig_to_tok_index = []
all_doc_tokens = []
for (i, token) in enumerate(example.doc_tokens):
orig_to_tok_index.append(len(all_doc_tokens))
sub_tokens = tokenizer.tokenize(token)
for sub_token in sub_tokens:
tok_to_orig_index.append(i)
all_doc_tokens.append(sub_token)
tok_start_position = None
tok_end_position = None
if is_training and example.is_impossible:
tok_start_position = -1
tok_end_position = -1
if is_training and not example.is_impossible:
tok_start_position = orig_to_tok_index[example.start_position]
if example.end_position < len(example.doc_tokens) - 1:
tok_end_position = orig_to_tok_index[example.end_position +
1] - 1
else:
tok_end_position = len(all_doc_tokens) - 1
(tok_start_position, tok_end_position) = _improve_answer_span(
all_doc_tokens, tok_start_position, tok_end_position,
tokenizer, example.orig_answer_text)
# The -3 accounts for [CLS], [SEP] and [SEP]
max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
# We can have documents that are longer than the maximum sequence length.
# To deal with this we do a sliding window approach, where we take chunks
# of the up to our max length with a stride of `doc_stride`.
_DocSpan = collections.namedtuple( # pylint: disable=invalid-name
"DocSpan", ["start", "length"])
doc_spans = []
start_offset = 0
while start_offset < len(all_doc_tokens):
length = len(all_doc_tokens) - start_offset
if length > max_tokens_for_doc:
length = max_tokens_for_doc
doc_spans.append(_DocSpan(start=start_offset, length=length))
if start_offset + length == len(all_doc_tokens):
break
start_offset += min(length, doc_stride)
for (doc_span_index, doc_span) in enumerate(doc_spans):
tokens = []
token_to_orig_map = {}
token_is_max_context = {}
segment_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
for token in query_tokens:
tokens.append(token)
segment_ids.append(0)
tokens.append("[SEP]")
segment_ids.append(0)
for i in range(doc_span.length):
split_token_index = doc_span.start + i
token_to_orig_map[len(
tokens)] = tok_to_orig_index[split_token_index]
is_max_context = _check_is_max_context(doc_spans,
doc_span_index,
split_token_index)
token_is_max_context[len(tokens)] = is_max_context
tokens.append(all_doc_tokens[split_token_index])
segment_ids.append(1)
tokens.append("[SEP]")
segment_ids.append(1)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
while len(input_ids) < max_seq_length:
input_ids.append(0)
input_mask.append(0)
segment_ids.append(0)
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length
start_position = None
end_position = None
if is_training and not example.is_impossible:
# For training, if our document chunk does not contain an annotation
# we throw it out, since there is nothing to predict.
doc_start = doc_span.start
doc_end = doc_span.start + doc_span.length - 1
out_of_span = False
if not (tok_start_position >= doc_start
and tok_end_position <= doc_end):
out_of_span = True
if out_of_span:
start_position = 0
end_position = 0
else:
doc_offset = len(query_tokens) + 2
start_position = tok_start_position - doc_start + doc_offset
end_position = tok_end_position - doc_start + doc_offset
if is_training and example.is_impossible:
start_position = 0
end_position = 0
if verbose_logging and example_index < 20:
print("*** Example ***")
print("unique_id: %s" % (unique_id))
print("example_index: %s" % (example_index))
print("doc_span_index: %s" % (doc_span_index))
print(
"tokens: %s" %
" ".join([tokenization.printable_text(x) for x in tokens]))
print("token_to_orig_map: %s" % " ".join([
"%d:%d" % (x, y)
for (x, y) in six.iteritems(token_to_orig_map)
]))
print("token_is_max_context: %s" % " ".join([
"%d:%s" % (x, y)
for (x, y) in six.iteritems(token_is_max_context)
]))
print("input_ids: %s" % " ".join([str(x) for x in input_ids]))
print("input_mask: %s" % " ".join([str(x)
for x in input_mask]))
print("segment_ids: %s" %
" ".join([str(x) for x in segment_ids]))
if is_training and example.is_impossible:
print("impossible example")
if is_training and not example.is_impossible:
answer_text = " ".join(
tokens[start_position:(end_position + 1)])
print("start_position: %d" % (start_position))
print("end_position: %d" % (end_position))
print("answer: %s" %
(tokenization.printable_text(answer_text)))
feature = InputFeatures(unique_id=unique_id,
example_index=example_index,
doc_span_index=doc_span_index,
tokens=tokens,
token_to_orig_map=token_to_orig_map,
token_is_max_context=token_is_max_context,
input_ids=input_ids,
input_mask=input_mask,
segment_ids=segment_ids,
start_position=start_position,
end_position=end_position,
is_impossible=example.is_impossible)
# Run callback
output_fn(feature)
unique_id += 1
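
# A minimal sketch of how this conversion is typically driven (it mirrors the
# DataLoader in the next file; `tokenizer` is whatever tokenizer the target
# model expects):
#
#   eval_features = []
#   eval_examples = read_squad_examples(input_file="dev-v1.1.json",
#                                       is_training=False,
#                                       version_2_with_negative=False)
#   convert_examples_to_features(examples=eval_examples, tokenizer=tokenizer,
#                                max_seq_length=384, doc_stride=128,
#                                max_query_length=64, is_training=False,
#                                output_fn=eval_features.append,
#                                verbose_logging=False)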
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
# To support the feature cache below.
import pickle
from transformers import BertTokenizer, AutoTokenizer
from general_perf.datasets.open_squad.create_squad_data import read_squad_examples, convert_examples_to_features
import collections
from general_perf.datasets import data_loader
import logging
from tqdm import tqdm
import numpy as np
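# Maps the dtype strings used in model configs to numpy dtypes; consumed by
# DataLoader.get_fake_samples below.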
INPUT_TYPE = {
"UINT8": np.uint8,
"FLOAT32": np.float32,
"LONG": np.long,
"INT32": np.int32,
"INT64": np.int64
}
max_seq_length = 384
max_query_length = 64
doc_stride = 128
log = logging.getLogger("SQUAD")
class DataLoader(data_loader.Dataset):
def __init__(self, config):
super(DataLoader, self).__init__(config)
log.info("Initial...")
self.config = config
model = self.config["model"]
total_count_override = None
perf_count_override = None
eval_features = []
# Load features if cached, convert from examples otherwise.
input_file = "general_perf/datasets/open_squad/dev-v1.1.json"
        cache_path = 'general_perf/datasets/open_squad/eval_features_' + self.config['model'] + '.pickle'
if os.path.exists(cache_path):
with open(cache_path, 'rb') as cache_file:
eval_features = pickle.load(cache_file)
eval_examples = read_squad_examples(input_file=input_file,
is_training=False,
version_2_with_negative=False)
else:
log.info("Start to generate data")
if "roberta" in self.config['model']:
tokenizer = AutoTokenizer.from_pretrained(
"csarron/roberta-base-squad-v1")
elif "albert" in self.config['model']:
tokenizer = AutoTokenizer.from_pretrained(
"madlag/albert-base-v2-squad")
elif "deberta" in self.config['model']:
tokenizer = AutoTokenizer.from_pretrained(
"Palak/microsoft_deberta-base_squad")
else:
tokenizer = BertTokenizer(
"general_perf/datasets/open_squad/vocab.txt")
eval_examples = read_squad_examples(input_file=input_file,
is_training=False,
version_2_with_negative=False)
def append_feature(feature):
eval_features.append(feature)
convert_examples_to_features(examples=eval_examples,
tokenizer=tokenizer,
max_seq_length=max_seq_length,
doc_stride=doc_stride,
max_query_length=max_query_length,
is_training=False,
output_fn=append_feature,
verbose_logging=False)
with open(cache_path, 'wb') as cache_file:
pickle.dump(eval_features, cache_file)
self.eval_features = eval_features
self.eval_examples = eval_examples
self.count = total_count_override or len(self.eval_features)
self.items = len(self.eval_features)
self.perf_count = perf_count_override or self.count
self.model = model
self.cur_bs = 1
self.batch_num = int(self.items / self.cur_bs)
        # save the mask input name to help set the results at padded
        # (mask == 0) positions to zero
if "roberta" in self.model or "torch" in self.model or "onnxruntime" in self.model:
self.mask_name = "attention_mask.1"
else:
self.mask_name = "input_mask:0"
def name(self):
return self.config['dataset_name']
def preprocess(self):
log.info("Preprocessing...")
        self.rebatch(self.cur_bs, skip=False)
def rebatch(self, new_bs, skip=True):
log.info("Rebatching batch size to: {} ...".format(new_bs))
if self.cur_bs == new_bs and skip:
return
self.cur_bs = new_bs
self.batch_num = int(self.items / self.cur_bs)
self.batched_data = []
for i in tqdm(range(self.batch_num)):
features = collections.defaultdict(list)
for j in range(i * self.cur_bs, (i + 1) * self.cur_bs):
if "torch" in self.model or "onnxruntime" in self.model:
features['input_ids.1'].append(
self.eval_features[j].input_ids)
features['attention_mask.1'].append(
self.eval_features[j].input_mask)
if "roberta" in self.model:
                        features['token_type_ids.1'].append(
                            np.zeros((max_seq_length, )))
elif "deberta" in self.model:
features['token_type_ids'].append(
self.eval_features[j].segment_ids)
else:
features['token_type_ids.1'].append(
self.eval_features[j].segment_ids)
else:
features['input_ids:0'].append(
self.eval_features[j].input_ids)
features['input_mask:0'].append(
self.eval_features[j].input_mask)
features['segment_ids:0'].append(
self.eval_features[j].segment_ids)
self.batched_data.append(features)
def get_samples(self, sample_id):
        if sample_id >= len(self.batched_data) or sample_id < 0:
            raise ValueError(
                "Sample id {} is out of range".format(sample_id))
return self.batched_data[sample_id], []
def get_id(self, sample_id):
        if sample_id >= len(self.batched_data) or sample_id < 0:
            raise ValueError(
                "Sample id {} is out of range".format(sample_id))
return [
self.eval_features[i].unique_id
for i in range(sample_id * self.cur_bs, (sample_id + 1) *
self.cur_bs)
]
def get_fake_samples(self, batch_size, shape, input_type):
data = {}
avg_seq_len = 192
max_seq_len = 384
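        # Valid lengths are drawn uniformly from
        # [2 * avg_seq_len - max_seq_len, max_seq_len] = [0, 384], so the
        # expected valid length is avg_seq_len.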
if input_type:
i = 0
for key, val in shape.items():
val = [val[0] * batch_size] + val[1:]
if i == 0:
                    # fake input ids (all zeros)
input_ids = np.zeros(val).astype(INPUT_TYPE[input_type[i]])
data[key] = input_ids
elif i == 1:
                    # fake per-sample valid lengths for the attention mask
input_len = np.random.randint(low=2 * avg_seq_len -
max_seq_len,
high=max_seq_len + 1,
size=(batch_size),
dtype=np.int32)
input_mask = np.zeros(val).astype(
INPUT_TYPE[input_type[i]])
for b_idx, s_len in enumerate(input_len):
input_mask[b_idx][:s_len] = 1
data[key] = input_mask
else:
data[key] = np.zeros(val).astype(INPUT_TYPE[input_type[i]])
i += 1
return data
else:
raise ValueError("Please provide input type")
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
import collections
import numpy as np
import tensorflow as tf
import torch
from tqdm import tqdm
from general_perf.datasets.open_squad.bert.accuracy_squad import write_predictions
from general_perf.datasets.open_squad.bert.evaluate import check_accuracy
from general_perf.datasets import test_accuracy
RawResult = collections.namedtuple("RawResult",
["unique_id", "start_logits", "end_logits"])
log = logging.getLogger("TestAccuracy")
class AccuracyChecker(test_accuracy.AccuracyChecker):
def calculate_acc(self, data_percent):
log.info("Start to calculate accuracy...")
results, diffs = [], []
        num = (int((data_percent / 100) * self.dataloader.get_batch_count())
               if data_percent else self.dataloader.get_batch_count())
for i in tqdm(range(num)):
test_data, _ = self.dataloader.get_samples(i)
unique_ids = self.dataloader.get_id(i)
result = self.runtime_backend.predict(test_data)
start_logits, end_logits = self._post_processing(
result, self.configs['framework'])
            # zero out logits at padded (mask == 0) positions, since vendor
            # backends may return arbitrary values there
def set_unmask_to_zero(res, mask):
arr = np.array(res)
arr[mask == 0] = 0.0
return list(arr)
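            # e.g. set_unmask_to_zero([0.3, 0.5, 0.1, 0.2],
            #                         np.array([1, 1, 1, 0]))
            # returns [0.3, 0.5, 0.1, 0.0]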
            # apply each sample's own mask to its logits
            masks = np.array(test_data[self.dataloader.mask_name])
            for j, mask in enumerate(masks):
                start_logits[j] = set_unmask_to_zero(start_logits[j], mask)
                end_logits[j] = set_unmask_to_zero(end_logits[j], mask)
            for j, u_id in enumerate(unique_ids):
                results.append(
                    RawResult(unique_id=u_id,
                              start_logits=start_logits[j],
                              end_logits=end_logits[j]))
diffs.append(start_logits + end_logits)
np.save(self.output_dir + "/{}.npy".format(self.dataloader.name()),
diffs)
data_file = os.path.abspath('.') + "/general_perf/datasets/open_squad/dev-v1.1.json"
        predict_file = self.output_dir[:self.output_dir.rindex('/')] + "/predictions.json"
write_predictions(self.dataloader.eval_examples,
self.dataloader.eval_features, results, 20, 30, True,
predict_file)
result = check_accuracy(data_file, predict_file,
num * self.dataloader.cur_bs)
        log.info('Batch size is {}, F1: {}, Exact Match: {}'.format(
            self.dataloader.cur_bs, result['F1 Score'], result['Exact Match']))
return result
def _post_processing(self, inputs, framework):
start_results, end_results = [], []
if framework == "Tensorflow":
if 'distill' in self.configs['model']:
(start_logits, end_logits) = (inputs["output_0"],
inputs["output_1"])
for i in range(self.dataloader.cur_bs):
start_logit = [float(x) for x in start_logits[i].flat]
end_logit = [float(x) for x in end_logits[i].flat]
start_results.append(start_logit)
end_results.append(end_logit)
else:
tensor_name = list(inputs)[0]
for i in range(len(inputs[tensor_name])):
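                    # each per-example output is presumably [seq_len, 2];
                    # wrap to [1, seq_len, 2], transpose to [2, 1, seq_len],
                    # then unstack into start / end logits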
logits = tf.transpose(np.array([inputs[tensor_name][i]]),
[2, 0, 1])
unstacked_logits = tf.unstack(logits, axis=0)
if tf.executing_eagerly():
(start_logit,
end_logit) = (unstacked_logits[0].numpy(),
unstacked_logits[1].numpy())
else:
with tf.compat.v1.Session():
(start_logit,
end_logit) = (unstacked_logits[0].eval(),
unstacked_logits[1].eval())
start_logit = [float(x) for x in start_logit.flat]
end_logit = [float(x) for x in end_logit.flat]
start_results.append(start_logit)
end_results.append(end_logit)
else:
if isinstance(inputs, dict):
(start_logits, end_logits) = (
inputs["start_logits"],
inputs["end_logits"],
)
            elif isinstance(inputs[0], torch.Tensor):
                # bfloat16 has no numpy equivalent, so upcast to float32 first
                def to_numpy(tensor):
                    if tensor.dtype == torch.bfloat16:
                        tensor = tensor.float()
                    return tensor.cpu().detach().numpy()

                (start_logits, end_logits) = (to_numpy(inputs[0]),
                                              to_numpy(inputs[1]))
else:
(start_logits, end_logits) = (inputs[0], inputs[1])
for i in range(self.dataloader.cur_bs):
start_logit = [float(x) for x in start_logits[i].flat]
end_logit = [float(x) for x in end_logits[i].flat]
start_results.append(start_logit)
end_results.append(end_logit)
return start_results, end_results
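
# Rough usage sketch (the perf framework normally drives this; names are
# illustrative): checker.calculate_acc(100) runs the full dev set, writes
# predictions.json, and returns a dict like
# {"F1 Score": ..., "Exact Match": ...}.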