Commit 347dae81 authored by wangkaixiong

update datasets

parent e4cefa34
__pycache__
*.tar.gz
*.whl
*.zip
*.torrent
*.pyc
*.npy
*.csv
__pycache__
*.pyc
*.prototxt
*.deploy
.vscode/
*.npy
*.tar
span.log
byte_micro_perf/backends/*/venv/
byte_micro_perf/reports/
byte_infer_perf/general_perf/tools/venv/
byte_infer_perf/general_perf/backends/*/venv/
byte_infer_perf/general_perf/model_zoo/*
!byte_infer_perf/general_perf/model_zoo/*.json
byte_infer_perf/general_perf/download/*.*
!byte_infer_perf/general_perf/download/README.md
byte_infer_perf/general_perf/datasets/open_imagenet/preprocessed/
byte_infer_perf/general_perf/datasets/*
!byte_infer_perf/general_perf/datasets/fake_dataset
!*.py
byte_infer_perf/general_perf/reports/*
!byte_infer_perf/general_perf/_inference/general_perf/reports/README
format_code.sh
init_env.sh
byte_infer_perf/llm_perf/download
byte_infer_perf/llm_perf/model_zoo/sota
byte_infer_perf/llm_perf/reports
workspace
test
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
import numpy as np
from general_perf.datasets import data_loader
from tqdm import tqdm
import collections
log = logging.getLogger("CAIL2019")
maxlen = 1024
class DataLoader(data_loader.Dataset):
def __init__(self, config):
super(DataLoader, self).__init__(config)
log.info("Initial...")
self.config = config
self.cur_bs = 2
batch_token_ids = np.load(
"general_perf/datasets/{}/batch_token_ids.npy".format(
self.config['dataset_name']),
allow_pickle=True)
batch_segment_ids = np.load(
"general_perf/datasets/{}/batch_segment_ids.npy".format(
self.config['dataset_name']),
allow_pickle=True)
labels = np.load("general_perf/datasets/{}/label.npy".format(
self.config['dataset_name']),
allow_pickle=True)
self.feed_dict = collections.defaultdict(list)
self.feed_dict['batch_token_ids'] = batch_token_ids.tolist()
self.feed_dict['batch_segment_ids'] = batch_segment_ids.tolist()
self.feed_dict['label'] = labels.tolist()
self.items = len(self.feed_dict['label'])
self.batch_num = int(self.items / self.cur_bs)
        # Zero-pad every sequence up to the fixed model length (maxlen).
        for i in range(self.items):
            batch_token_id = np.pad(
                self.feed_dict['batch_token_ids'][i],
                (0, maxlen - len(self.feed_dict['batch_token_ids'][i])),
                'constant').astype(np.float32)
            batch_segment_id = np.pad(
                self.feed_dict['batch_segment_ids'][i],
                (0, maxlen - len(self.feed_dict['batch_segment_ids'][i])),
                'constant').astype(np.float32)
            self.feed_dict['batch_token_ids'][i] = batch_token_id.tolist()
            self.feed_dict['batch_segment_ids'][i] = batch_segment_id.tolist()
def name(self):
return self.config['dataset_name']
def preprocess(self):
log.info("Preprocessing...")
self.rebatch(self.cur_bs, skip=False)
def rebatch(self, new_bs, skip=True):
log.info("Rebatching batch size to: {} ...".format(new_bs))
if self.cur_bs == new_bs and skip:
return
self.cur_bs = new_bs
self.batch_num = int(self.items / self.cur_bs)
self.batched_data = []
self.labels = []
for i in tqdm(range(self.batch_num)):
split_data = {
'input_segment:0':
self.feed_dict["batch_segment_ids"][i * self.cur_bs:(i + 1) *
self.cur_bs],
'input_token:0':
self.feed_dict["batch_token_ids"][i * self.cur_bs:(i + 1) *
self.cur_bs],
}
self.labels.append(
self.feed_dict["label"][i * self.cur_bs:(i + 1) * self.cur_bs])
self.batched_data.append(split_data)
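# Minimal usage sketch for this loader, assuming the CAIL2019 .npy files are
# already present under general_perf/datasets/<dataset_name>/ and that the
# dataset directory is named open_cail2019 (harness config keys beyond
# 'dataset_name' are hypothetical here):
#
#   loader = DataLoader({'dataset_name': 'open_cail2019'})
#   loader.preprocess()              # batches the data at the default bs of 2
#   batch = loader.batched_data[0]   # {'input_segment:0': ..., 'input_token:0': ...}
#   labels = loader.labels[0]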
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from tqdm import tqdm
import json
import collections
import numpy as np
from bert4keras.tokenizers import Tokenizer
import jieba
jieba.initialize()
test_data = []
with open("test.json", encoding='utf-8') as f:
for l in f:
l = json.loads(l)
assert l['label'] in 'BC'
if l['label'] == 'B':
test_data.append((l['A'], l['B'], l['C']))
else:
test_data.append((l['A'], l['C'], l['B']))
tokenizer = Tokenizer("vocab.txt",
do_lower_case=True,
pre_tokenize=lambda s: jieba.cut(s, HMM=False))
feed_dict = collections.defaultdict(list)
maxlen = 1024
for i in tqdm(range(len(test_data))):
(text1, text2, text3) = test_data[i]
token_ids, segment_ids = tokenizer.encode(text1, text2, maxlen=maxlen)
feed_dict["batch_token_ids"].append(token_ids)
feed_dict["batch_segment_ids"].append(segment_ids)
feed_dict["label"].append([1])
token_ids, segment_ids = tokenizer.encode(text1, text3, maxlen=maxlen)
feed_dict["batch_token_ids"].append(token_ids)
feed_dict["batch_segment_ids"].append(segment_ids)
feed_dict["label"].append([0])
np.save("{}.npy".format('batch_token_ids'),
feed_dict["batch_token_ids"],
allow_pickle=True)
np.save("{}.npy".format('batch_segment_ids'),
feed_dict["batch_segment_ids"],
allow_pickle=True)
np.save("{}.npy".format('label'), feed_dict["label"], allow_pickle=True)
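# Each test record above yields two consecutive rows: (A, B) encoded with
# label [1], then (A, C) with label [0]. Downstream accuracy checking relies
# on this interleaving to compare even-indexed (positive) scores against
# odd-indexed (negative) ones, e.g.:
#
#   feed_dict["label"]  ->  [[1], [0], [1], [0], ...]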
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import numpy as np
from general_perf.datasets import test_accuracy
from tqdm import tqdm
log = logging.getLogger("TestAccuracy")
class AccuracyChecker(test_accuracy.AccuracyChecker):
def calculate_acc(self, data_percent):
log.info("Start to calculate accuracy...")
num = int((data_percent / 100) * self.dataloader.get_batch_count()
) if data_percent else self.dataloader.get_batch_count()
good, total = 0, 0
diffs = []
for i in tqdm(range(num)):
test_data, labels = self.dataloader.get_samples(i)
results = self.runtime_backend.predict(test_data)
results = results[list(results)[0]]
diffs.append(results)
total += len(results) // 2
good += (results[::2] > results[1::2]).sum()
accuracy = round((good / total), 5)
np.save(self.output_dir + "/{}.npy".format(self.dataloader.name()),
diffs)
log.info('Batch size is {}, Accuracy: {}'.format(
self.dataloader.cur_bs, accuracy))
return {"Top-1": accuracy}
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import logging
import numpy as np
import os
import pickle
from tqdm import tqdm
from typing import Any
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
from PIL import Image
try:
from torchvision.transforms import InterpolationMode
BICUBIC = InterpolationMode.BICUBIC
except ImportError:
BICUBIC = Image.BICUBIC
from general_perf.datasets import data_loader
log = logging.getLogger("CIFAR100")
INPUT_TYPE = {
"UINT8": np.uint8,
"FLOAT32": np.float32,
"LONG": np.long,
"INT32": np.int32,
"INT64": np.int64
}
class DataLoader(data_loader.Dataset):
def __init__(self, config):
super(DataLoader, self).__init__(config)
log.info("Initial...")
base_folder = "general_perf/datasets/{}/cifar-100-python".format(
self.config['dataset_name'])
test_list = [
['test', 'f0ef6b0ae62326f3e7ffdfab6717acfc'],
]
meta = {
'filename': 'meta',
'key': 'fine_label_names',
'md5': '7973b15100ade9c7d40fb424638fde48',
}
self.data: Any = []
self.targets = []
# now load the picked numpy arrays
for file_name, checksum in test_list:
file_path = os.path.join(base_folder, file_name)
with open(file_path, 'rb') as f:
entry = pickle.load(f, encoding='latin1')
self.data.append(entry['data'])
if 'labels' in entry:
self.targets.extend(entry['labels'])
else:
self.targets.extend(entry['fine_labels'])
self.data = np.vstack(self.data).reshape(-1, 3, 32, 32)
self.data = self.data.transpose((0, 2, 3, 1)) # convert to HWC
transformer = _transform()
path = os.path.join(base_folder, meta['filename'])
with open(path, 'rb') as infile:
data = pickle.load(infile, encoding='latin1')
self.classes = data[meta['key']]
self.class_to_idx = {
_class: i
for i, _class in enumerate(self.classes)
}
self.test_data = []
for i in tqdm(range(len(self.data))):
img = self.data[i]
img = Image.fromarray(img)
img = transformer(img).detach().numpy()
self.test_data.append(img)
self.text_input = np.load(os.path.join(base_folder, 'text.npy'))
self.config = config
self.cur_bs = 1
self.items = len(self.data)
self.batch_num = int(self.items / self.cur_bs)
def name(self):
return self.config['dataset_name']
def preprocess(self):
log.info("Preprocessing...")
self.rebatch(self.cur_bs, skip=False)
def rebatch(self, new_bs, skip=True):
log.info("Rebatching batch size to: {} ...".format(new_bs))
if self.cur_bs == new_bs and skip:
return
self.cur_bs = new_bs
self.batch_num = int(self.items / self.cur_bs)
self.batched_data = []
self.labels = []
for i in tqdm(range(self.batch_num)):
split_data = {
'image': self.test_data[i * self.cur_bs:(i + 1) * self.cur_bs],
'text': self.text_input,
}
self.labels.append(self.targets[i * self.cur_bs:(i + 1) *
self.cur_bs])
self.batched_data.append(split_data)
    def get_fake_samples(self, batch_size, shape, input_type):
        data = {}
        if input_type:
            i = 0
            for key, val in shape.items():
                # Only the image input scales with batch size; the text input
                # (class-name tokens) is shared across the whole batch.
                if key == "image":
                    val = [val[0] * batch_size] + val[1:]
                data[key] = np.random.random(size=val).astype(
                    INPUT_TYPE[input_type[i]])
                i += 1
            return data
        else:
            raise ValueError("Please provide input type")
def _convert_image_to_rgb(image):
return image.convert("RGB")
def _transform():
return Compose([
Resize(224, interpolation=BICUBIC),
CenterCrop(224),
_convert_image_to_rgb,
ToTensor(),
Normalize((0.48145466, 0.4578275, 0.40821073),
(0.26862954, 0.26130258, 0.27577711)),
])
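# Shape check for the pipeline above, assuming torchvision and PIL are
# installed: a 32x32 CIFAR image comes out as a CLIP-style normalized
# (3, 224, 224) float tensor.
#
#   from PIL import Image
#   import numpy as np
#   img = Image.fromarray(np.zeros((32, 32, 3), dtype=np.uint8))
#   _transform()(img).shape   # torch.Size([3, 224, 224])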
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import numpy as np
from general_perf.datasets import test_accuracy
from tqdm import tqdm
log = logging.getLogger("TestAccuracy")
class AccuracyChecker(test_accuracy.AccuracyChecker):
def calculate_acc(self, data_percent):
log.info("Start to calculate accuracy...")
num = int((data_percent / 100) * self.dataloader.get_batch_count()
) if data_percent else self.dataloader.get_batch_count()
good, total = 0, 0
diffs = []
for i in tqdm(range(num)):
test_data, labels = self.dataloader.get_samples(i)
logits_per_image, logits_per_text = self.runtime_backend.predict(
test_data)
diffs.append(logits_per_image)
for j in range(len(logits_per_image)):
probs = logits_per_image[j]
if np.argmax(probs) == labels[j]:
good += 1
total += 1
accuracy = round((good / total), 5)
np.save(self.output_dir + "/{}.npy".format(self.dataloader.name()),
diffs,
allow_pickle=True)
log.info('Batch size is {}, Accuracy: {}'.format(
self.dataloader.cur_bs, accuracy))
return {"Top-1": accuracy}
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
import numpy as np
from general_perf.datasets import data_loader
from tqdm import tqdm
log = logging.getLogger("CriteoKaggle")
class DataLoader(data_loader.Dataset):
def __init__(self, config):
super(DataLoader, self).__init__(config)
log.info("Initial...")
self.config = config
self.cur_bs = 1
if not os.path.exists("general_perf/datasets/{}/numeric.npy".format(
self.config['dataset_name'])):
from general_perf.datasets.open_criteo_kaggle.preprocess_dataset import csv_to_numpy
csv_to_numpy(
"general_perf/datasets/{}/eval.csv".format(
self.config['dataset_name']),
"general_perf/datasets/{}/".format(self.config['dataset_name']))
num = np.load("general_perf/datasets/{}/numeric.npy".format(
self.config['dataset_name']))
cat = np.load("general_perf/datasets/{}/categorical.npy".format(
self.config['dataset_name']))
label = np.load("general_perf/datasets/{}/label.npy".format(
self.config['dataset_name']))
self.items = len(num)
self.batch_num = int(self.items / self.cur_bs)
self.feed_dict = {}
for i in tqdm(range(cat.shape[0])):
if i == 0:
self.feed_dict["new_categorical_placeholder:0"] = list(
cat[i].reshape(-1, 2))
self.feed_dict["new_numeric_placeholder:0"] = list(
num[i].reshape(1, -1))
self.feed_dict["label"] = list(label[i])
else:
self.feed_dict["new_categorical_placeholder:0"].extend(
cat[i].reshape(-1, 2))
self.feed_dict["new_numeric_placeholder:0"].extend(
num[i].reshape(1, -1))
self.feed_dict["label"].extend(label[i])
self.feed_dict['new_categorical_placeholder:0'] = np.array(
self.feed_dict['new_categorical_placeholder:0'], dtype=np.int64)
self.feed_dict['new_numeric_placeholder:0'] = np.array(
self.feed_dict['new_numeric_placeholder:0'], dtype=np.float32)
self.feed_dict['label'] = np.array(self.feed_dict['label'],
dtype=np.int64)
def name(self):
return self.config['dataset_name']
def preprocess(self):
log.info("Preprocessing...")
self.rebatch(self.cur_bs, skip=False)
def rebatch(self, new_bs, skip=True):
log.info("Rebatching batch size to: {} ...".format(new_bs))
if self.cur_bs == new_bs and skip:
return
self.cur_bs = new_bs
self.batch_num = int(self.items / self.cur_bs)
self.batched_data = []
self.labels = []
for i in tqdm(range(self.batch_num)):
split_data = {
'new_categorical_placeholder:0':
self.feed_dict["new_categorical_placeholder:0"][i *
self.cur_bs *
26:(i + 1) *
self.cur_bs *
26, ],
'new_numeric_placeholder:0':
self.feed_dict["new_numeric_placeholder:0"][
i * self.cur_bs:(i + 1) * self.cur_bs, ],
}
self.labels.append(
self.feed_dict["label"][i * self.cur_bs:(i + 1) *
self.cur_bs, ])
self.batched_data.append(split_data)
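# Each Criteo sample carries 26 categorical (index, hash) pairs and 13
# numeric features, which is why the categorical tensor is sliced in steps of
# cur_bs * 26 while the numeric tensor is sliced in steps of cur_bs.
# Expected shapes for one batch with cur_bs = 4:
#
#   split_data['new_categorical_placeholder:0'].shape  # (104, 2)
#   split_data['new_numeric_placeholder:0'].shape      # (4, 13)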
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import pandas
import argparse
import numpy as np
import tensorflow as tf
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('--input',
type=str,
default="eval.csv",
help='full path of data file e.g. eval.csv',
dest='evaldatafile_path',
required=True)
args = parser.parse_args()
return args
def version_is_less_than(a, b):
    """Compare dotted version strings component by component."""
    a_parts = a.split('.')
    b_parts = b.split('.')
    for i in range(len(a_parts)):
        if int(a_parts[i]) == int(b_parts[i]):
            continue
        if int(a_parts[i]) < int(b_parts[i]):
            print('{} < {}, version_is_less_than() returning True'.format(
                a_parts[i], b_parts[i]))
            return True
        # A greater component decides the comparison immediately.
        return False
    return False
def csv_to_numpy(eval_csv_file, output):
print("TensorFlow version {}".format(tf.__version__))
required_tf_version = '2.0.0'
if version_is_less_than(tf.__version__, required_tf_version):
tf.compat.v1.enable_eager_execution()
# args = parse_args()
# eval_csv_file = args.evaldatafile_path
csv = pandas.read_csv(eval_csv_file, header=None)
if len(csv.columns) == 39:
dataset_type = 'test'
else:
dataset_type = 'eval'
fill_na_dict = {}
if dataset_type == 'test':
for i in range(0, 13):
fill_na_dict[i] = 0.0
for i in range(13, 39):
fill_na_dict[i] = ""
else:
for i in range(1, 14):
fill_na_dict[i] = 0.0
for i in range(14, 40):
fill_na_dict[i] = ""
csv = csv.fillna(value=fill_na_dict).values
LABEL_COLUMN = ["clicked"]
CATEGORICAL_COLUMNS1 = ["C" + str(i) + "_embedding" for i in range(1, 27)]
NUMERIC_COLUMNS1 = ["I" + str(i) for i in range(1, 14)]
CATEGORICAL_COLUMNS2 = ["C" + str(i) + "_embedding" for i in range(1, 27)]
NUMERIC_COLUMNS2 = ["I" + str(i) for i in range(1, 14)]
DATA_COLUMNS = LABEL_COLUMN + NUMERIC_COLUMNS1 + CATEGORICAL_COLUMNS1
CATEGORICAL_COLUMNS1.sort()
NUMERIC_COLUMNS1.sort()
with open(eval_csv_file, 'r') as f:
nums = [line.strip('\n\r').split(',') for line in f.readlines()]
numpy_arr = np.array(nums)
numpy_arr[numpy_arr == ''] = '0'
min_list, max_list, range_list = [], [], []
for i in range(len(DATA_COLUMNS)):
if DATA_COLUMNS[i] in NUMERIC_COLUMNS1:
col_min = numpy_arr[:, i].astype(np.float32).min()
col_max = numpy_arr[:, i].astype(np.float32).max()
min_list.append(col_min)
max_list.append(col_max)
range_list.append(col_max - col_min)
print('min list', min_list)
print('max list', max_list)
print('range list', range_list)
all_data = []
no_of_rows = 0
for row in csv:
no_of_rows = no_of_rows + 1
unnormalized_vals = np.array(row[1:14])
normalized_vals = (unnormalized_vals - min_list) / range_list
new_categorical_dict = dict(zip(CATEGORICAL_COLUMNS2, row[14:40]))
new_categorical_list = []
for i in CATEGORICAL_COLUMNS1:
if pandas.isnull(new_categorical_dict[i]):
new_categorical_list.append("")
else:
new_categorical_list.append(new_categorical_dict[i])
if tf.executing_eagerly():
hash_values = tf.strings.to_hash_bucket_fast(
new_categorical_list, 1000).numpy()
else:
hash_tensor = tf.strings.to_hash_bucket_fast(
new_categorical_list, 1000)
with tf.compat.v1.Session() as sess:
hash_values = hash_tensor.eval()
new_numerical_dict = dict(zip(NUMERIC_COLUMNS2, normalized_vals))
item_data = {
"new_numeric_placeholder": [],
"new_categorical_placeholder": [],
"label": []
}
for i in NUMERIC_COLUMNS1:
item_data["new_numeric_placeholder"].extend(
[new_numerical_dict[i]])
for i in range(0, 26):
item_data["new_categorical_placeholder"].extend([i])
item_data["new_categorical_placeholder"].extend([hash_values[i]])
item_data["label"].append(row[0])
all_data.append(item_data)
    wnd_num = []
    wnd_cate = []
    wnd_label = []
    for data in all_data:
        wnd_num.append(data["new_numeric_placeholder"])
        wnd_cate.append(data["new_categorical_placeholder"])
        wnd_label.append(data["label"])
    np.save(os.path.join(output, "numeric.npy"), np.array(wnd_num))
    np.save(os.path.join(output, "categorical.npy"), np.array(wnd_cate))
    np.save(os.path.join(output, "label.npy"), np.array(wnd_label))
    print('Total number of rows ', no_of_rows)
    print('Generated output files: numeric.npy, categorical.npy, label.npy')
if __name__ == "__main__":
    # Standalone usage: parse --input and write the .npy files next to it.
    args = parse_args()
    csv_to_numpy(args.evaldatafile_path,
                 os.path.dirname(args.evaldatafile_path) or ".")
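# Example standalone invocation, assuming eval.csv sits in the current
# directory; it writes numeric.npy, categorical.npy and label.npy alongside:
#
#   python preprocess_dataset.py --input eval.csv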
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import numpy as np
from general_perf.datasets import test_accuracy
from tqdm import tqdm
log = logging.getLogger("TestAccuracy")
class AccuracyChecker(test_accuracy.AccuracyChecker):
def calculate_acc(self, data_percent):
log.info("Start to calculate accuracy...")
num = int((data_percent / 100) * self.dataloader.get_batch_count()
) if data_percent else self.dataloader.get_batch_count()
good, total = 0, 0
diffs = []
for i in tqdm(range(num)):
test_data, labels = self.dataloader.get_samples(i)
results = self.runtime_backend.predict(test_data)
results = results[list(results)[0]]
diffs.append(results)
for j in range(len(results)):
if np.argmax(results[j].round()) == labels[j].round():
good += 1
total += 1
accuracy = round((good / total), 5)
np.save(self.output_dir + "/{}.npy".format(self.dataloader.name()),
diffs)
log.info('Batch size is {}, Accuracy: {}'.format(
self.dataloader.cur_bs, accuracy))
return {"Top-1": accuracy}
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
import re
import time
import cv2
import numpy as np
import random
from tqdm import tqdm
from general_perf.datasets import data_loader
log = logging.getLogger("Imagenet")
INPUT_TYPE = {
"UINT8": np.uint8,
"FLOAT32": np.float32,
}
class DataLoader(data_loader.Dataset):
def __init__(self, config):
super(DataLoader, self).__init__(config)
log.info("Initial...")
self.config = config
self.cur_bs = 1
self.image_size = [224, 224, 3]
if self.config['framework'] == 'Tensorflow':
image_format = "NHWC"
pre_process = pre_process_vgg
else:
image_format = "NCHW"
if 'resnet50' in self.config['model']:
pre_process = pre_process_imagenet_pytorch
else:
pre_process = pre_process_imagenet_vit
cache_dir = os.getcwd() + \
"/general_perf/datasets/{}".format(self.config['dataset_name'])
self.input_name = self.config['inputs']
self.image_list = []
self.label_list = []
self.count = None
self.use_cache = 0
self.cache_dir = os.path.join(cache_dir, "preprocessed",
self.config['model'])
self.data_path = "general_perf/datasets/{}/ILSVRC2012_img_val".format(
self.config['dataset_name'])
self.pre_process = pre_process
self.items = 0
# input images are in HWC
self.need_transpose = True if image_format == "NCHW" else False
not_found = 0
os.makedirs(self.cache_dir, exist_ok=True)
image_list = 'general_perf/datasets/{}/val_map.txt'.format(
self.config['dataset_name'])
start = time.time()
with open(image_list, 'r') as f:
for s in tqdm(f):
image_name, label = re.split(r"\s+", s.strip())
src = os.path.join(self.data_path, image_name)
if not os.path.exists(src):
                    # if the image does not exist, ignore it
not_found += 1
continue
os.makedirs(os.path.dirname(
os.path.join(self.cache_dir, image_name)),
exist_ok=True)
dst = os.path.join(self.cache_dir, image_name)
if not os.path.exists(dst + ".npy"):
img_org = cv2.imread(src)
processed = self.pre_process(
img_org,
need_transpose=self.need_transpose,
dims=self.image_size)
np.save(dst, processed)
self.image_list.append(image_name)
self.label_list.append(int(label) + 1)
self.items = len(self.image_list)
# limit the dataset if requested
if self.count and len(self.image_list) >= self.count:
break
time_taken = time.time() - start
if not self.image_list:
log.error("no images in image list found")
raise ValueError("no images in image list found")
if not_found > 0:
log.info("reduced image list, %d images not found", not_found)
log.info("loaded {} images, cache={}, took={:.1f}sec".format(
len(self.image_list), self.use_cache, time_taken))
self.label_list = np.array(self.label_list)
self.batch_num = int(self.items / self.cur_bs)
self.shuffle_index = [i for i in range(self.items)]
random.seed(7)
random.shuffle(self.shuffle_index)
def name(self):
return self.config['dataset_name']
def preprocess(self):
log.info("Preprocessing...")
self.rebatch(self.cur_bs, skip=False)
def rebatch(self, new_bs, skip=True):
log.info("Rebatching batch size to: {} ...".format(new_bs))
if self.cur_bs == new_bs and skip:
return
self.cur_bs = new_bs
self.batch_num = int(self.items / self.cur_bs)
self.batched_data = []
self.labels = []
for i in tqdm(range(self.batch_num)):
split_data, labels = [], []
for j in range(i * self.cur_bs, (i + 1) * self.cur_bs):
output, label = self.get_item(self.shuffle_index[j])
split_data.append(output)
labels.append(label)
self.labels.append(labels)
self.batched_data.append({self.input_name: np.array(split_data)})
def get_samples(self, sample_id):
if sample_id >= len(self.batched_data) or sample_id < 0:
raise ValueError("Your Input ID: {} is out of range: {}".format(
sample_id, len(self.batched_data)))
return self.batched_data[sample_id], self.labels[sample_id]
def get_item(self, nr):
"""Get image by number in the list."""
dst = os.path.join(self.cache_dir, self.image_list[nr])
img = np.load(dst + ".npy")
return img, self.label_list[nr]
#
# pre-processing
#
def center_crop(img, out_height, out_width):
height, width, _ = img.shape
left = int((width - out_width) / 2)
right = int((width + out_width) / 2)
top = int((height - out_height) / 2)
bottom = int((height + out_height) / 2)
img = img[top:bottom, left:right]
return img
def resize_with_aspectratio(img,
out_height,
out_width,
scale=87.5,
inter_pol=cv2.INTER_LINEAR):
height, width, _ = img.shape
new_height = int(100. * out_height / scale)
new_width = int(100. * out_width / scale)
if height > width:
w = new_width
h = int(new_height * height / width)
else:
h = new_height
w = int(new_width * width / height)
img = cv2.resize(img, (w, h), interpolation=inter_pol)
return img
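# Worked example: with out_height = out_width = 224 and the default
# scale = 87.5, the resize target is int(100 * 224 / 87.5) = 256 on the
# shorter side, which the subsequent 224x224 center crop then trims, matching
# the standard ImageNet evaluation pipeline.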
def pre_process_vgg(img, dims=None, need_transpose=False):
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
output_height, output_width, _ = dims
cv2_interpol = cv2.INTER_AREA
img = resize_with_aspectratio(img,
output_height,
output_width,
inter_pol=cv2_interpol)
img = center_crop(img, output_height, output_width)
img = np.asarray(img, dtype='float32')
# normalize image
means = np.array([123.68, 116.78, 103.94], dtype=np.float32)
img -= means
# transpose if needed
if need_transpose:
img = img.transpose([2, 0, 1])
return img
def pre_process_imagenet_pytorch(img, dims=None, need_transpose=False):
from PIL import Image
import torchvision.transforms.functional as F
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
img = Image.fromarray(img)
img = F.resize(img, 256, Image.BILINEAR)
img = F.center_crop(img, 224)
img = F.to_tensor(img)
img = F.normalize(img,
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225],
inplace=False)
if not need_transpose:
img = img.permute(1, 2, 0) # NHWC
img = np.asarray(img, dtype='float32')
return img
def pre_process_imagenet_vit(img, dims=None, need_transpose=False):
from PIL import Image
import torchvision.transforms.functional as F
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
img = Image.fromarray(img)
img = F.resize(img, 256, Image.BILINEAR)
img = F.center_crop(img, 384)
img = F.to_tensor(img)
img = F.normalize(img,
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225],
inplace=False)
if not need_transpose:
img = img.permute(1, 2, 0) # NHWC
img = np.asarray(img, dtype='float32')
return img
def maybe_resize(img, dims):
img = np.array(img, dtype=np.float32)
if len(img.shape) < 3 or img.shape[2] != 3:
# some images might be grayscale
img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    if dims is not None:
im_height, im_width, _ = dims
img = cv2.resize(img, (im_width, im_height),
interpolation=cv2.INTER_LINEAR)
return img
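# Quick shape sanity check for the preprocessing paths above, using a fake
# BGR image (cv2, torch and torchvision assumed installed):
#
#   import numpy as np
#   img = np.zeros((300, 400, 3), dtype=np.uint8)
#   pre_process_vgg(img, dims=[224, 224, 3]).shape                 # (224, 224, 3)
#   pre_process_imagenet_pytorch(img, need_transpose=True).shape   # (3, 224, 224)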
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import numpy as np
from general_perf.datasets import test_accuracy
from tqdm import tqdm
import torch
log = logging.getLogger("TestAccuracy")
class AccuracyChecker(test_accuracy.AccuracyChecker):
def calculate_acc(self, data_percent):
log.info("Start to calculate accuracy...")
num = int((data_percent / 100) * self.dataloader.get_batch_count()
) if data_percent else self.dataloader.get_batch_count()
good, total = 0, 0
diffs = []
for i in tqdm(range(num)):
test_data, labels = self.dataloader.get_samples(i)
results = self.runtime_backend.predict(test_data)
if "resnet50-tf-fp16" in self.configs["model"]:
if 'classes' in results:
del results['classes']
results = self._post_processing(results, self.configs['framework'])
diffs.append(results)
for j in range(len(results)):
if np.argmax(results[j]) == labels[j]:
good += 1
total += 1
accuracy = round((good / total), 5)
log.info('Batch size is {}, Accuracy: {}'.format(
self.dataloader.cur_bs, accuracy))
np.save(self.output_dir + "/{}.npy".format(self.dataloader.name()),
diffs)
return {"Top-1": accuracy}
def _post_processing(self, inputs, framework):
if framework == "Onnx":
if self.configs["model"] != "resnet50-onnxruntime-fp16":
if isinstance(inputs, list):
inputs = list(inputs[0])
elif isinstance(inputs, dict):
key = list(inputs.keys())[0]
inputs = list(inputs[key])
else:
            if isinstance(inputs, tuple):
                # bfloat16 has no native numpy dtype, so upcast to float first.
                if inputs[0].dtype == torch.bfloat16:
                    inputs = inputs[0].float().cpu().numpy().astype(float)
                else:
                    inputs = inputs[0].cpu().numpy().astype(float)
else:
inputs = inputs[list(inputs)[0]]
if framework == "Pytorch" or framework == "Onnx":
inputs = np.array(
[np.insert(inputs[i], 0, 0) for i in range(len(inputs))])
return inputs
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import collections
import json
import math
import os
import sys
sys.path.insert(0, os.path.dirname(__file__))
import numpy as np
import six
from bert import tokenization
# To support feature cache.
import pickle
max_seq_length = 384
max_query_length = 64
doc_stride = 128
RawResult = collections.namedtuple("RawResult",
["unique_id", "start_logits", "end_logits"])
dtype_map = {
"int8": np.int8,
"int16": np.int16,
"int32": np.int32,
"int64": np.int64,
"float16": np.float16,
"float32": np.float32,
"float64": np.float64
}
def get_final_text(pred_text, orig_text, do_lower_case):
"""Project the tokenized prediction back to the original text."""
# When we created the data, we kept track of the alignment between original
# (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
# now `orig_text` contains the span of our original text corresponding to the
# span that we predicted.
#
# However, `orig_text` may contain extra characters that we don't want in
# our prediction.
#
# For example, let's say:
# pred_text = steve smith
# orig_text = Steve Smith's
#
# We don't want to return `orig_text` because it contains the extra "'s".
#
# We don't want to return `pred_text` because it's already been normalized
# (the SQuAD eval script also does punctuation stripping/lower casing but
# our tokenizer does additional normalization like stripping accent
# characters).
#
# What we really want to return is "Steve Smith".
#
    # Therefore, we have to apply a semi-complicated alignment heuristic between
    # `pred_text` and `orig_text` to get a character-to-character alignment. This
# can fail in certain cases in which case we just return `orig_text`.
def _strip_spaces(text):
ns_chars = []
ns_to_s_map = collections.OrderedDict()
for (i, c) in enumerate(text):
if c == " ":
continue
ns_to_s_map[len(ns_chars)] = i
ns_chars.append(c)
ns_text = "".join(ns_chars)
return (ns_text, ns_to_s_map)
# We first tokenize `orig_text`, strip whitespace from the result
# and `pred_text`, and check if they are the same length. If they are
# NOT the same length, the heuristic has failed. If they are the same
# length, we assume the characters are one-to-one aligned.
tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)
tok_text = " ".join(tokenizer.tokenize(orig_text))
start_position = tok_text.find(pred_text)
if start_position == -1:
return orig_text
end_position = start_position + len(pred_text) - 1
(orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
(tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
if len(orig_ns_text) != len(tok_ns_text):
return orig_text
# We then project the characters in `pred_text` back to `orig_text` using
# the character-to-character alignment.
tok_s_to_ns_map = {}
for (i, tok_index) in six.iteritems(tok_ns_to_s_map):
tok_s_to_ns_map[tok_index] = i
orig_start_position = None
if start_position in tok_s_to_ns_map:
ns_start_position = tok_s_to_ns_map[start_position]
if ns_start_position in orig_ns_to_s_map:
orig_start_position = orig_ns_to_s_map[ns_start_position]
if orig_start_position is None:
return orig_text
orig_end_position = None
if end_position in tok_s_to_ns_map:
ns_end_position = tok_s_to_ns_map[end_position]
if ns_end_position in orig_ns_to_s_map:
orig_end_position = orig_ns_to_s_map[ns_end_position]
if orig_end_position is None:
return orig_text
output_text = orig_text[orig_start_position:(orig_end_position + 1)]
return output_text
def _get_best_indexes(logits, n_best_size):
"""Get the n-best logits from a list."""
index_and_score = sorted(enumerate(logits),
key=lambda x: x[1],
reverse=True)
best_indexes = []
for i in range(len(index_and_score)):
if i >= n_best_size:
break
best_indexes.append(index_and_score[i][0])
return best_indexes
def _compute_softmax(scores):
"""Compute softmax probability over raw logits."""
if not scores:
return []
max_score = None
for score in scores:
if max_score is None or score > max_score:
max_score = score
exp_scores = []
total_sum = 0.0
for score in scores:
x = math.exp(score - max_score)
exp_scores.append(x)
total_sum += x
probs = []
for score in exp_scores:
probs.append(score / total_sum)
return probs
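# Sanity check for the numerically stable softmax above:
#
#   _compute_softmax([1.0, 2.0, 3.0])
#   # -> [0.09003..., 0.24472..., 0.66524...]  (sums to 1.0)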
def write_predictions(all_examples,
all_features,
all_results,
n_best_size,
max_answer_length,
do_lower_case,
output_prediction_file,
max_examples=None):
"""Write final predictions to the json file and log-odds of null if needed."""
print("Writing predictions to: %s" % (output_prediction_file))
example_index_to_features = collections.defaultdict(list)
for feature in all_features:
example_index_to_features[feature.example_index].append(feature)
unique_id_to_result = {}
for result in all_results:
unique_id_to_result[result.unique_id] = result
_PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name
"PrelimPrediction", [
"feature_index", "start_index", "end_index", "start_logit",
"end_logit"
])
all_predictions = collections.OrderedDict()
all_nbest_json = collections.OrderedDict()
scores_diff_json = collections.OrderedDict()
for (example_index, example) in enumerate(all_examples):
if max_examples and example_index == max_examples: break
features = example_index_to_features[example_index]
prelim_predictions = []
# keep track of the minimum score of null start+end of position 0
score_null = 1000000 # large and positive
        min_null_feature_index = 0  # the paragraph slice with min null score
null_start_logit = 0 # the start logit at the slice with min null score
null_end_logit = 0 # the end logit at the slice with min null score
for (feature_index, feature) in enumerate(features):
# FIX: During compliance/audit runs, we only generate a small subset of
# all entries from the dataset. As a result, sometimes dict retrieval
# fails because a key is missing.
# result = unique_id_to_result[feature.unique_id]
result = unique_id_to_result.get(feature.unique_id, None)
if result is None:
continue
start_indexes = _get_best_indexes(result.start_logits, n_best_size)
end_indexes = _get_best_indexes(result.end_logits, n_best_size)
# if we could have irrelevant answers, get the min score of irrelevant
for start_index in start_indexes:
for end_index in end_indexes:
# We could hypothetically create invalid predictions, e.g., predict
# that the start of the span is in the question. We throw out all
# invalid predictions.
if start_index >= len(feature.tokens):
continue
if end_index >= len(feature.tokens):
continue
if start_index not in feature.token_to_orig_map:
continue
if end_index not in feature.token_to_orig_map:
continue
if not feature.token_is_max_context.get(
start_index, False):
continue
if end_index < start_index:
continue
length = end_index - start_index + 1
if length > max_answer_length:
continue
prelim_predictions.append(
_PrelimPrediction(
feature_index=feature_index,
start_index=start_index,
end_index=end_index,
start_logit=result.start_logits[start_index],
end_logit=result.end_logits[end_index]))
prelim_predictions = sorted(prelim_predictions,
key=lambda x:
(x.start_logit + x.end_logit),
reverse=True)
_NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name
"NbestPrediction", ["text", "start_logit", "end_logit"])
seen_predictions = {}
nbest = []
for pred in prelim_predictions:
if len(nbest) >= n_best_size:
break
feature = features[pred.feature_index]
tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]
orig_doc_start = feature.token_to_orig_map[pred.start_index]
orig_doc_end = feature.token_to_orig_map[pred.end_index]
orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
tok_text = " ".join(tok_tokens)
# De-tokenize WordPieces that have been split off.
tok_text = tok_text.replace(" ##", "")
tok_text = tok_text.replace("##", "")
# Clean whitespace
tok_text = tok_text.strip()
tok_text = " ".join(tok_text.split())
orig_text = " ".join(orig_tokens)
final_text = get_final_text(tok_text, orig_text, do_lower_case)
if final_text in seen_predictions:
continue
seen_predictions[final_text] = True
nbest.append(
_NbestPrediction(text=final_text,
start_logit=pred.start_logit,
end_logit=pred.end_logit))
# In very rare edge cases we could have no valid predictions. So we
# just create a nonce prediction in this case to avoid failure.
if not nbest:
nbest.append(
_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
assert len(nbest) >= 1
total_scores = []
best_non_null_entry = None
for entry in nbest:
total_scores.append(entry.start_logit + entry.end_logit)
if not best_non_null_entry:
if entry.text:
best_non_null_entry = entry
probs = _compute_softmax(total_scores)
nbest_json = []
for (i, entry) in enumerate(nbest):
output = collections.OrderedDict()
output["text"] = entry.text
output["probability"] = probs[i]
output["start_logit"] = entry.start_logit
output["end_logit"] = entry.end_logit
nbest_json.append(output)
assert len(nbest_json) >= 1
all_predictions[example.qas_id] = nbest_json[0]["text"]
with open(output_prediction_file, "w") as writer:
writer.write(json.dumps(all_predictions, indent=4) + "\n")
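# Backend outputs are expected to arrive as RawResult tuples keyed by the
# feature's unique_id; a minimal sketch of driving write_predictions (paths
# and logits here are placeholders):
#
#   results = [RawResult(unique_id=f.unique_id,
#                        start_logits=[...], end_logits=[...])
#              for f in eval_features]
#   write_predictions(eval_examples, eval_features, results,
#                     n_best_size=20, max_answer_length=30,
#                     do_lower_case=True,
#                     output_prediction_file="predictions.json")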
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from collections import Counter
import string
import re
import argparse
import json
import sys
def normalize_answer(s):
"""Lower text and remove punctuation, articles and extra whitespace."""
def remove_articles(text):
return re.sub(r'\b(a|an|the)\b', ' ', text)
def white_space_fix(text):
return ' '.join(text.split())
def remove_punc(text):
exclude = set(string.punctuation)
return ''.join(ch for ch in text if ch not in exclude)
def lower(text):
return text.lower()
return white_space_fix(remove_articles(remove_punc(lower(s))))
def f1_score(prediction, ground_truth):
prediction_tokens = normalize_answer(prediction).split()
ground_truth_tokens = normalize_answer(ground_truth).split()
common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
num_same = sum(common.values())
if num_same == 0:
return 0
precision = 1.0 * num_same / len(prediction_tokens)
recall = 1.0 * num_same / len(ground_truth_tokens)
f1 = (2 * precision * recall) / (precision + recall)
return f1
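# Worked example of the token-overlap F1 above:
#
#   f1_score("Steve Smith", "the Steve Smith's")
#   # normalized tokens: ["steve", "smith"] vs ["steve", "smiths"]
#   # overlap = 1, precision = 1/2, recall = 1/2  ->  F1 = 0.5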
def exact_match_score(prediction, ground_truth):
return (normalize_answer(prediction) == normalize_answer(ground_truth))
def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
scores_for_ground_truths = []
for ground_truth in ground_truths:
score = metric_fn(prediction, ground_truth)
scores_for_ground_truths.append(score)
return max(scores_for_ground_truths)
def evaluate(dataset, predictions, num):
f1 = exact_match = total = 0
for article in dataset:
for paragraph in article['paragraphs']:
for qa in paragraph['qas']:
total += 1
if qa['id'] not in predictions:
message = 'Unanswered question ' + qa['id'] + \
' will receive score 0.'
print(message, file=sys.stderr)
continue
ground_truths = list(map(lambda x: x['text'], qa['answers']))
prediction = predictions[qa['id']]
exact_match += metric_max_over_ground_truths(
exact_match_score, prediction, ground_truths)
f1 += metric_max_over_ground_truths(f1_score, prediction,
ground_truths)
total = num
exact_match = round(100.0 * exact_match / total, 5)
f1 = round(100.0 * f1 / total, 5)
return {'Exact Match': exact_match, 'F1 Score': f1}
def check_accuracy(dataset_file, prediction_file, num):
expected_version = '1.1'
with open(dataset_file) as dataset_file:
dataset_json = json.load(dataset_file)
if (dataset_json['version'] != expected_version):
print('Evaluation expects v-' + expected_version +
', but got dataset with v-' + dataset_json['version'],
file=sys.stderr)
dataset = dataset_json['data']
with open(prediction_file) as prediction_file:
predictions = json.load(prediction_file)
return evaluate(dataset, predictions, num)
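# Example: scoring a prediction file against SQuAD v1.1, where num is the
# number of evaluated samples (file names are placeholders):
#
#   check_accuracy("dev-v1.1.json", "predictions.json", num=100)
#   # -> {'Exact Match': ..., 'F1 Score': ...}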
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import json
import tokenization
import six
class SquadExample(object):
"""A single training/test example for simple sequence classification.
For examples without an answer, the start and end position are -1.
"""
def __init__(self,
qas_id,
question_text,
doc_tokens,
orig_answer_text=None,
start_position=None,
end_position=None,
is_impossible=False):
self.qas_id = qas_id
self.question_text = question_text
self.doc_tokens = doc_tokens
self.orig_answer_text = orig_answer_text
self.start_position = start_position
self.end_position = end_position
self.is_impossible = is_impossible
def __str__(self):
return self.__repr__()
def __repr__(self):
s = ""
s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
s += ", question_text: %s" % (tokenization.printable_text(
self.question_text))
s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
if self.start_position:
s += ", start_position: %d" % (self.start_position)
if self.start_position:
s += ", end_position: %d" % (self.end_position)
if self.start_position:
s += ", is_impossible: %r" % (self.is_impossible)
return s
class InputFeatures(object):
"""A single set of features of data."""
def __init__(self,
unique_id,
example_index,
doc_span_index,
tokens,
token_to_orig_map,
token_is_max_context,
input_ids,
input_mask,
segment_ids,
start_position=None,
end_position=None,
is_impossible=None):
self.unique_id = unique_id
self.example_index = example_index
self.doc_span_index = doc_span_index
self.tokens = tokens
self.token_to_orig_map = token_to_orig_map
self.token_is_max_context = token_is_max_context
self.input_ids = input_ids
self.input_mask = input_mask
self.segment_ids = segment_ids
self.start_position = start_position
self.end_position = end_position
self.is_impossible = is_impossible
def read_squad_examples(input_file,
is_training,
version_2_with_negative=False):
"""Read a SQuAD json file into a list of SquadExample."""
with open(input_file) as reader:
input_data = json.load(reader)["data"]
def is_whitespace(c):
if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
return True
return False
examples = []
for entry in input_data:
for paragraph in entry["paragraphs"]:
paragraph_text = paragraph["context"]
doc_tokens = []
char_to_word_offset = []
prev_is_whitespace = True
for c in paragraph_text:
if is_whitespace(c):
prev_is_whitespace = True
else:
if prev_is_whitespace:
doc_tokens.append(c)
else:
doc_tokens[-1] += c
prev_is_whitespace = False
char_to_word_offset.append(len(doc_tokens) - 1)
for qa in paragraph["qas"]:
qas_id = qa["id"]
question_text = qa["question"]
start_position = None
end_position = None
orig_answer_text = None
is_impossible = False
if is_training:
if version_2_with_negative:
is_impossible = qa["is_impossible"]
if (len(qa["answers"]) != 1) and (not is_impossible):
raise ValueError(
"For training, each question should have exactly 1 answer."
)
if not is_impossible:
answer = qa["answers"][0]
orig_answer_text = answer["text"]
answer_offset = answer["answer_start"]
answer_length = len(orig_answer_text)
start_position = char_to_word_offset[answer_offset]
end_position = char_to_word_offset[answer_offset +
answer_length - 1]
# Only add answers where the text can be exactly recovered from the
# document. If this CAN'T happen it's likely due to weird Unicode
# stuff so we will just skip the example.
#
# Note that this means for training mode, every example is NOT
# guaranteed to be preserved.
actual_text = " ".join(
doc_tokens[start_position:(end_position + 1)])
cleaned_answer_text = " ".join(
tokenization.whitespace_tokenize(orig_answer_text))
if actual_text.find(cleaned_answer_text) == -1:
print("Could not find answer: '%s' vs. '%s'",
actual_text, cleaned_answer_text)
continue
else:
start_position = -1
end_position = -1
orig_answer_text = ""
example = SquadExample(qas_id=qas_id,
question_text=question_text,
doc_tokens=doc_tokens,
orig_answer_text=orig_answer_text,
start_position=start_position,
end_position=end_position,
is_impossible=is_impossible)
examples.append(example)
return examples
def _check_is_max_context(doc_spans, cur_span_index, position):
"""Check if this is the 'max context' doc span for the token."""
# Because of the sliding window approach taken to scoring documents, a single
# token can appear in multiple documents. E.g.
# Doc: the man went to the store and bought a gallon of milk
# Span A: the man went to the
# Span B: to the store and bought
# Span C: and bought a gallon of
# ...
#
# Now the word 'bought' will have two scores from spans B and C. We only
# want to consider the score with "maximum context", which we define as
# the *minimum* of its left and right context (the *sum* of left and
# right context will always be the same, of course).
#
# In the example the maximum context for 'bought' would be span C since
# it has 1 left context and 3 right context, while span B has 4 left context
# and 0 right context.
best_score = None
best_span_index = None
for (span_index, doc_span) in enumerate(doc_spans):
end = doc_span.start + doc_span.length - 1
if position < doc_span.start:
continue
if position > end:
continue
num_left_context = position - doc_span.start
num_right_context = end - position
score = min(num_left_context,
num_right_context) + 0.01 * doc_span.length
if best_score is None or score > best_score:
best_score = score
best_span_index = span_index
return cur_span_index == best_span_index
def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
orig_answer_text):
"""Returns tokenized answer spans that better match the annotated answer."""
# The SQuAD annotations are character based. We first project them to
# whitespace-tokenized words. But then after WordPiece tokenization, we can
# often find a "better match". For example:
#
# Question: What year was John Smith born?
# Context: The leader was John Smith (1895-1943).
# Answer: 1895
#
# The original whitespace-tokenized answer will be "(1895-1943).". However
# after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match
# the exact answer, 1895.
#
# However, this is not always possible. Consider the following:
#
    # Question: What country is the top exporter of electronics?
    # Context: The Japanese electronics industry is the largest in the world.
# Answer: Japan
#
# In this case, the annotator chose "Japan" as a character sub-span of
# the word "Japanese". Since our WordPiece tokenizer does not split
# "Japanese", we just use "Japanese" as the annotation. This is fairly rare
# in SQuAD, but does happen.
tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
for new_start in range(input_start, input_end + 1):
for new_end in range(input_end, new_start - 1, -1):
text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
if text_span == tok_answer_text:
return (new_start, new_end)
return (input_start, input_end)
def convert_examples_to_features(examples,
tokenizer,
max_seq_length,
doc_stride,
max_query_length,
is_training,
output_fn,
verbose_logging=False):
"""Loads a data file into a list of `InputBatch`s."""
unique_id = 1000000000
for (example_index, example) in enumerate(examples):
query_tokens = tokenizer.tokenize(example.question_text)
if len(query_tokens) > max_query_length:
query_tokens = query_tokens[0:max_query_length]
tok_to_orig_index = []
orig_to_tok_index = []
all_doc_tokens = []
for (i, token) in enumerate(example.doc_tokens):
orig_to_tok_index.append(len(all_doc_tokens))
sub_tokens = tokenizer.tokenize(token)
for sub_token in sub_tokens:
tok_to_orig_index.append(i)
all_doc_tokens.append(sub_token)
tok_start_position = None
tok_end_position = None
if is_training and example.is_impossible:
tok_start_position = -1
tok_end_position = -1
if is_training and not example.is_impossible:
tok_start_position = orig_to_tok_index[example.start_position]
if example.end_position < len(example.doc_tokens) - 1:
tok_end_position = orig_to_tok_index[example.end_position +
1] - 1
else:
tok_end_position = len(all_doc_tokens) - 1
(tok_start_position, tok_end_position) = _improve_answer_span(
all_doc_tokens, tok_start_position, tok_end_position,
tokenizer, example.orig_answer_text)
# The -3 accounts for [CLS], [SEP] and [SEP]
max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
# We can have documents that are longer than the maximum sequence length.
# To deal with this we do a sliding window approach, where we take chunks
# of the up to our max length with a stride of `doc_stride`.
_DocSpan = collections.namedtuple( # pylint: disable=invalid-name
"DocSpan", ["start", "length"])
doc_spans = []
start_offset = 0
while start_offset < len(all_doc_tokens):
length = len(all_doc_tokens) - start_offset
if length > max_tokens_for_doc:
length = max_tokens_for_doc
doc_spans.append(_DocSpan(start=start_offset, length=length))
if start_offset + length == len(all_doc_tokens):
break
start_offset += min(length, doc_stride)
for (doc_span_index, doc_span) in enumerate(doc_spans):
tokens = []
token_to_orig_map = {}
token_is_max_context = {}
segment_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
for token in query_tokens:
tokens.append(token)
segment_ids.append(0)
tokens.append("[SEP]")
segment_ids.append(0)
for i in range(doc_span.length):
split_token_index = doc_span.start + i
token_to_orig_map[len(
tokens)] = tok_to_orig_index[split_token_index]
is_max_context = _check_is_max_context(doc_spans,
doc_span_index,
split_token_index)
token_is_max_context[len(tokens)] = is_max_context
tokens.append(all_doc_tokens[split_token_index])
segment_ids.append(1)
tokens.append("[SEP]")
segment_ids.append(1)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
while len(input_ids) < max_seq_length:
input_ids.append(0)
input_mask.append(0)
segment_ids.append(0)
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length
start_position = None
end_position = None
if is_training and not example.is_impossible:
# For training, if our document chunk does not contain an annotation
# we throw it out, since there is nothing to predict.
doc_start = doc_span.start
doc_end = doc_span.start + doc_span.length - 1
out_of_span = False
if not (tok_start_position >= doc_start
and tok_end_position <= doc_end):
out_of_span = True
if out_of_span:
start_position = 0
end_position = 0
else:
doc_offset = len(query_tokens) + 2
start_position = tok_start_position - doc_start + doc_offset
end_position = tok_end_position - doc_start + doc_offset
if is_training and example.is_impossible:
start_position = 0
end_position = 0
if verbose_logging and example_index < 20:
print("*** Example ***")
print("unique_id: %s" % (unique_id))
print("example_index: %s" % (example_index))
print("doc_span_index: %s" % (doc_span_index))
print(
"tokens: %s" %
" ".join([tokenization.printable_text(x) for x in tokens]))
print("token_to_orig_map: %s" % " ".join([
"%d:%d" % (x, y)
for (x, y) in six.iteritems(token_to_orig_map)
]))
print("token_is_max_context: %s" % " ".join([
"%d:%s" % (x, y)
for (x, y) in six.iteritems(token_is_max_context)
]))
print("input_ids: %s" % " ".join([str(x) for x in input_ids]))
print("input_mask: %s" % " ".join([str(x)
for x in input_mask]))
print("segment_ids: %s" %
" ".join([str(x) for x in segment_ids]))
if is_training and example.is_impossible:
print("impossible example")
if is_training and not example.is_impossible:
answer_text = " ".join(
tokens[start_position:(end_position + 1)])
print("start_position: %d" % (start_position))
print("end_position: %d" % (end_position))
print("answer: %s" %
(tokenization.printable_text(answer_text)))
feature = InputFeatures(unique_id=unique_id,
example_index=example_index,
doc_span_index=doc_span_index,
tokens=tokens,
token_to_orig_map=token_to_orig_map,
token_is_max_context=token_is_max_context,
input_ids=input_ids,
input_mask=input_mask,
segment_ids=segment_ids,
start_position=start_position,
end_position=end_position,
is_impossible=example.is_impossible)
# Run callback
output_fn(feature)
unique_id += 1
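
# A minimal sketch of how this conversion is typically driven (it mirrors the
# DataLoader in the next file; `tokenizer` is whatever tokenizer the target
# model expects):
#
#   eval_features = []
#   eval_examples = read_squad_examples(input_file="dev-v1.1.json",
#                                       is_training=False,
#                                       version_2_with_negative=False)
#   convert_examples_to_features(examples=eval_examples, tokenizer=tokenizer,
#                                max_seq_length=384, doc_stride=128,
#                                max_query_length=64, is_training=False,
#                                output_fn=eval_features.append,
#                                verbose_logging=False)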
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
# To support the feature cache below.
import pickle
from transformers import BertTokenizer, AutoTokenizer
from general_perf.datasets.open_squad.create_squad_data import read_squad_examples, convert_examples_to_features
import collections
from general_perf.datasets import data_loader
import logging
from tqdm import tqdm
import numpy as np
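# Maps the dtype strings used in model configs to numpy dtypes; consumed by
# DataLoader.get_fake_samples below.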
INPUT_TYPE = {
"UINT8": np.uint8,
"FLOAT32": np.float32,
"LONG": np.long,
"INT32": np.int32,
"INT64": np.int64
}
max_seq_length = 384
max_query_length = 64
doc_stride = 128
log = logging.getLogger("SQUAD")
class DataLoader(data_loader.Dataset):
def __init__(self, config):
super(DataLoader, self).__init__(config)
log.info("Initial...")
self.config = config
model = self.config["model"]
total_count_override = None
perf_count_override = None
eval_features = []
# Load features if cached, convert from examples otherwise.
input_file = "general_perf/datasets/open_squad/dev-v1.1.json"
        cache_path = 'general_perf/datasets/open_squad/eval_features_' + self.config['model'] + '.pickle'
if os.path.exists(cache_path):
with open(cache_path, 'rb') as cache_file:
eval_features = pickle.load(cache_file)
eval_examples = read_squad_examples(input_file=input_file,
is_training=False,
version_2_with_negative=False)
else:
log.info("Start to generate data")
if "roberta" in self.config['model']:
tokenizer = AutoTokenizer.from_pretrained(
"csarron/roberta-base-squad-v1")
elif "albert" in self.config['model']:
tokenizer = AutoTokenizer.from_pretrained(
"madlag/albert-base-v2-squad")
elif "deberta" in self.config['model']:
tokenizer = AutoTokenizer.from_pretrained(
"Palak/microsoft_deberta-base_squad")
else:
tokenizer = BertTokenizer(
"general_perf/datasets/open_squad/vocab.txt")
eval_examples = read_squad_examples(input_file=input_file,
is_training=False,
version_2_with_negative=False)
def append_feature(feature):
eval_features.append(feature)
convert_examples_to_features(examples=eval_examples,
tokenizer=tokenizer,
max_seq_length=max_seq_length,
doc_stride=doc_stride,
max_query_length=max_query_length,
is_training=False,
output_fn=append_feature,
verbose_logging=False)
with open(cache_path, 'wb') as cache_file:
pickle.dump(eval_features, cache_file)
self.eval_features = eval_features
self.eval_examples = eval_examples
self.count = total_count_override or len(self.eval_features)
self.items = len(self.eval_features)
self.perf_count = perf_count_override or self.count
self.model = model
self.cur_bs = 1
self.batch_num = int(self.items / self.cur_bs)
        # save the mask input name to help set the results at padded
        # (mask == 0) positions to zero
if "roberta" in self.model or "torch" in self.model or "onnxruntime" in self.model:
self.mask_name = "attention_mask.1"
else:
self.mask_name = "input_mask:0"
def name(self):
return self.config['dataset_name']
def preprocess(self):
log.info("Preprocessing...")
        self.rebatch(self.cur_bs, skip=False)
def rebatch(self, new_bs, skip=True):
log.info("Rebatching batch size to: {} ...".format(new_bs))
if self.cur_bs == new_bs and skip:
return
self.cur_bs = new_bs
self.batch_num = int(self.items / self.cur_bs)
self.batched_data = []
for i in tqdm(range(self.batch_num)):
features = collections.defaultdict(list)
for j in range(i * self.cur_bs, (i + 1) * self.cur_bs):
if "torch" in self.model or "onnxruntime" in self.model:
features['input_ids.1'].append(
self.eval_features[j].input_ids)
features['attention_mask.1'].append(
self.eval_features[j].input_mask)
if "roberta" in self.model:
                        features['token_type_ids.1'].append(
                            np.zeros((max_seq_length, )))
elif "deberta" in self.model:
features['token_type_ids'].append(
self.eval_features[j].segment_ids)
else:
features['token_type_ids.1'].append(
self.eval_features[j].segment_ids)
else:
features['input_ids:0'].append(
self.eval_features[j].input_ids)
features['input_mask:0'].append(
self.eval_features[j].input_mask)
features['segment_ids:0'].append(
self.eval_features[j].segment_ids)
self.batched_data.append(features)
def get_samples(self, sample_id):
        if sample_id >= len(self.batched_data) or sample_id < 0:
            raise ValueError(
                "Sample id {} is out of range".format(sample_id))
return self.batched_data[sample_id], []
def get_id(self, sample_id):
        if sample_id >= len(self.batched_data) or sample_id < 0:
            raise ValueError(
                "Sample id {} is out of range".format(sample_id))
return [
self.eval_features[i].unique_id
for i in range(sample_id * self.cur_bs, (sample_id + 1) *
self.cur_bs)
]
def get_fake_samples(self, batch_size, shape, input_type):
data = {}
avg_seq_len = 192
max_seq_len = 384
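        # Valid lengths are drawn uniformly from
        # [2 * avg_seq_len - max_seq_len, max_seq_len] = [0, 384], so the
        # expected valid length is avg_seq_len.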
if input_type:
i = 0
for key, val in shape.items():
val = [val[0] * batch_size] + val[1:]
if i == 0:
                    # fake input ids (all zeros)
input_ids = np.zeros(val).astype(INPUT_TYPE[input_type[i]])
data[key] = input_ids
elif i == 1:
                    # fake per-sample valid lengths for the attention mask
input_len = np.random.randint(low=2 * avg_seq_len -
max_seq_len,
high=max_seq_len + 1,
size=(batch_size),
dtype=np.int32)
input_mask = np.zeros(val).astype(
INPUT_TYPE[input_type[i]])
for b_idx, s_len in enumerate(input_len):
input_mask[b_idx][:s_len] = 1
data[key] = input_mask
else:
data[key] = np.zeros(val).astype(INPUT_TYPE[input_type[i]])
i += 1
return data
else:
raise ValueError("Please provide input type")
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
import collections
import numpy as np
import tensorflow as tf
import torch
from tqdm import tqdm
from general_perf.datasets.open_squad.bert.accuracy_squad import write_predictions
from general_perf.datasets.open_squad.bert.evaluate import check_accuracy
from general_perf.datasets import test_accuracy
RawResult = collections.namedtuple("RawResult",
["unique_id", "start_logits", "end_logits"])
log = logging.getLogger("TestAccuracy")
class AccuracyChecker(test_accuracy.AccuracyChecker):
def calculate_acc(self, data_percent):
log.info("Start to calculate accuracy...")
results, diffs = [], []
        num = (int((data_percent / 100) * self.dataloader.get_batch_count())
               if data_percent else self.dataloader.get_batch_count())
for i in tqdm(range(num)):
test_data, _ = self.dataloader.get_samples(i)
unique_ids = self.dataloader.get_id(i)
result = self.runtime_backend.predict(test_data)
start_logits, end_logits = self._post_processing(
result, self.configs['framework'])
            # zero out logits at padded (mask == 0) positions, since vendor
            # backends may return arbitrary values there
def set_unmask_to_zero(res, mask):
arr = np.array(res)
arr[mask == 0] = 0.0
return list(arr)
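            # e.g. set_unmask_to_zero([0.3, 0.5, 0.1, 0.2],
            #                         np.array([1, 1, 1, 0]))
            # returns [0.3, 0.5, 0.1, 0.0]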
            # apply each sample's own mask to its logits
            masks = np.array(test_data[self.dataloader.mask_name])
            for j, mask in enumerate(masks):
                start_logits[j] = set_unmask_to_zero(start_logits[j], mask)
                end_logits[j] = set_unmask_to_zero(end_logits[j], mask)
            for j, u_id in enumerate(unique_ids):
                results.append(
                    RawResult(unique_id=u_id,
                              start_logits=start_logits[j],
                              end_logits=end_logits[j]))
diffs.append(start_logits + end_logits)
np.save(self.output_dir + "/{}.npy".format(self.dataloader.name()),
diffs)
data_file = os.path.abspath('.') + "/general_perf/datasets/open_squad/dev-v1.1.json"
        predict_file = self.output_dir[:self.output_dir.rindex('/')] + "/predictions.json"
write_predictions(self.dataloader.eval_examples,
self.dataloader.eval_features, results, 20, 30, True,
predict_file)
result = check_accuracy(data_file, predict_file,
num * self.dataloader.cur_bs)
        log.info('Batch size is {}, F1: {}, Exact Match: {}'.format(
            self.dataloader.cur_bs, result['F1 Score'], result['Exact Match']))
return result
def _post_processing(self, inputs, framework):
start_results, end_results = [], []
if framework == "Tensorflow":
if 'distill' in self.configs['model']:
(start_logits, end_logits) = (inputs["output_0"],
inputs["output_1"])
for i in range(self.dataloader.cur_bs):
start_logit = [float(x) for x in start_logits[i].flat]
end_logit = [float(x) for x in end_logits[i].flat]
start_results.append(start_logit)
end_results.append(end_logit)
else:
tensor_name = list(inputs)[0]
for i in range(len(inputs[tensor_name])):
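                    # each per-example output is presumably [seq_len, 2];
                    # wrap to [1, seq_len, 2], transpose to [2, 1, seq_len],
                    # then unstack into start / end logits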
logits = tf.transpose(np.array([inputs[tensor_name][i]]),
[2, 0, 1])
unstacked_logits = tf.unstack(logits, axis=0)
if tf.executing_eagerly():
(start_logit,
end_logit) = (unstacked_logits[0].numpy(),
unstacked_logits[1].numpy())
else:
with tf.compat.v1.Session():
(start_logit,
end_logit) = (unstacked_logits[0].eval(),
unstacked_logits[1].eval())
start_logit = [float(x) for x in start_logit.flat]
end_logit = [float(x) for x in end_logit.flat]
start_results.append(start_logit)
end_results.append(end_logit)
else:
if isinstance(inputs, dict):
(start_logits, end_logits) = (
inputs["start_logits"],
inputs["end_logits"],
)
            elif isinstance(inputs[0], torch.Tensor):
                # bfloat16 has no numpy equivalent, so upcast to float32 first
                def to_numpy(tensor):
                    if tensor.dtype == torch.bfloat16:
                        tensor = tensor.float()
                    return tensor.cpu().detach().numpy()

                (start_logits, end_logits) = (to_numpy(inputs[0]),
                                              to_numpy(inputs[1]))
else:
(start_logits, end_logits) = (inputs[0], inputs[1])
for i in range(self.dataloader.cur_bs):
start_logit = [float(x) for x in start_logits[i].flat]
end_logit = [float(x) for x in end_logits[i].flat]
start_results.append(start_logit)
end_results.append(end_logit)
return start_results, end_results
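
# Rough usage sketch (the perf framework normally drives this; names are
# illustrative): checker.calculate_acc(100) runs the full dev set, writes
# predictions.json, and returns a dict like
# {"F1 Score": ..., "Exact Match": ...}.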