Commit e4cefa34 authored by wangkaixiong's avatar wangkaixiong 🚴🏼
Browse files

Update datasets

parent 24b257f1
*.tar.gz
*.whl
*.zip
*.json
*.pyc
*.pickle
*.torrent
*.npy
*.csv
\ No newline at end of file
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import numpy as np
log = logging.getLogger("Dataset")

# Maps the dtype strings used in model configs to concrete numpy dtypes.
# NOTE: the aliases np.long and np.bool were removed in NumPy 1.24 / 1.20;
# use np.int64 / np.bool_ instead so this works on modern NumPy.
INPUT_TYPE = {
    "UINT8": np.uint8,
    "FLOAT32": np.float32,
    "FLOAT16": np.float16,
    "LONG": np.int64,
    "INT32": np.int32,
    "INT64": np.int64,
    "BOOL": np.bool_
}
class Dataset():
    """Base class for benchmark datasets.

    Subclasses load real data and implement ``name``/``rebatch``; this
    base class keeps the shared bookkeeping (current batch size, batched
    data and labels) and can generate random fake samples for testing.
    """

    def __init__(self, config):
        # Benchmark configuration dict for this dataset.
        self.config = config
        # Current batch size; starts at 1 until rebatch() changes it.
        self.cur_bs = 1
        # Batched inputs and the labels aligned with them by index.
        self.batched_data = []
        self.labels = []
        # Number of raw samples loaded; subclasses are expected to set it.
        self.items = 0
        self.batch_num = int(self.items / self.cur_bs)

    def name(self) -> str:
        """Return the name of the dataset."""
        raise NotImplementedError("Dataset:name")

    def get_item_count(self) -> int:
        """Return the number of data items loaded."""
        return self.items

    def get_batch_count(self) -> int:
        """Return the number of batches currently available."""
        return self.batch_num

    def preprocess(self):
        """Hook for data preprocessing; a no-op in the base class."""
        return

    def get_samples(self, sample_id):
        """Return the ``(data, label)`` pair for the given batch id.

        Raises:
            ValueError: if ``sample_id`` is outside the batched range.
        """
        if not 0 <= sample_id < len(self.batched_data):
            raise ValueError("Your Input ID is out of range")
        return self.batched_data[sample_id], self.labels[sample_id]

    def rebatch(self, new_bs, skip=True) -> None:
        """Re-batch the dataset to the specified batch size."""
        raise NotImplementedError("Dataset:rebatch")

    def get_fake_samples(self, batch_size, shape, input_type):
        """Generate one batch of random data for testing.

        Args:
            batch_size: multiplier applied to each shape's leading dim.
            shape: dict mapping input name -> per-sample shape list.
            input_type: dtype strings positionally aligned with ``shape``.

        Returns:
            Dict mapping input name -> random ndarray.

        Raises:
            ValueError: if ``input_type`` is empty or missing.
        """
        if not input_type:
            raise ValueError("Please provide input type")
        data = {}
        for idx, (key, dims) in enumerate(shape.items()):
            batched_dims = [dims[0] * batch_size] + dims[1:]
            data[key] = np.random.random(size=batched_dims).astype(
                INPUT_TYPE[input_type[idx]])
        return data
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import numpy as np
from general_perf.datasets import data_loader
# Maps the dtype strings used in model configs to concrete numpy dtypes.
# NOTE: the aliases np.long and np.bool were removed in NumPy 1.24 / 1.20;
# use np.int64 / np.bool_ instead so this works on modern NumPy.
INPUT_TYPE = {
    "UINT8": np.uint8,
    "FLOAT16": np.float16,
    "FLOAT32": np.float32,
    "LONG": np.int64,
    "INT32": np.int32,
    "INT64": np.int64,
    "BOOL": np.bool_
}

log = logging.getLogger("FAKE_DATA")
class DataLoader(data_loader.Dataset):
    """Dataset that synthesizes random inputs instead of loading real data.

    Used for performance smoke tests: every sample is generated on the fly
    from the model's declared input shapes and dtypes.
    """

    def __init__(self, config):
        super(DataLoader, self).__init__(config)
        self.config = config
        self.cur_bs = 1

    def name(self):
        """Return the dataset identifier."""
        return 'fake_dataset'

    def get_batch_count(self):
        # always return 100: the fake dataset pretends to hold 100 batches
        return 100

    def generate_fake_data(self):
        """Build one batch of random tensors described by the model config."""
        input_shape = self.config["input_shape"]
        input_type = self.config["input_type"].split(',')
        return self.get_fake_samples_regular(self.cur_bs, input_shape,
                                             input_type)

    def rebatch(self, new_bs, skip=True):
        """Switch the generator to a new batch size.

        Args:
            new_bs: batch size to use from now on.
            skip: if True, do nothing when the size is already ``new_bs``.
        """
        log.info("Rebatching batch size to: {} ...".format(new_bs))
        if self.cur_bs == new_bs and skip:
            return
        self.cur_bs = new_bs

    def get_samples(self, sample_id):
        """Return the generated batch for ``sample_id`` (0..99).

        Seeding numpy with the sample id makes each id reproducible.

        Raises:
            ValueError: if ``sample_id`` is outside 0..99.
        """
        # valid ids are 0..99, matching get_batch_count()
        if sample_id > 99 or sample_id < 0:
            raise ValueError("Your Input ID is out of range")
        np.random.seed(sample_id)
        return self.generate_fake_data()

    def get_fake_samples_regular(self, batch_size, shape, input_type):
        """Generate one batch of random tensors.

        Args:
            batch_size: leading dimension of every generated tensor.
            shape: dict mapping input name -> shape list; index 0 is
                replaced by ``batch_size``.
            input_type: dtype strings positionally aligned with ``shape``.

        Returns:
            Dict mapping input name -> ndarray (or str for STRING inputs).

        Raises:
            ValueError: if ``input_type`` is empty or missing.
        """
        data = {}
        if not input_type:
            raise ValueError("Please provide input type")
        i = 0
        for key, val in shape.items():
            val = [batch_size] + val[1:]
            if 'LONG' in input_type[i] or 'INT' in input_type[i]:
                if "mask" in key or "segment" in key:
                    # mask/segment style inputs only contain 0 or 1
                    data[key] = np.random.randint(
                        low=0, high=2,
                        size=val).astype(INPUT_TYPE[input_type[i]])
                elif self.config[
                        "model"] == "internal_videobert01-onnx-fp32" and key == "1_input_1":
                    # BUGFIX: np.random.ones does not exist (AttributeError);
                    # build the all-ones tensor with np.ones instead.
                    data[key] = np.ones(val).astype(
                        INPUT_TYPE[input_type[i]])
                else:
                    # generic token-id style input in [0, 1000)
                    data[key] = np.random.randint(
                        low=0, high=1000,
                        size=val).astype(INPUT_TYPE[input_type[i]])
            elif 'STRING' in input_type[i]:
                data[key] = 'This is a test string.'
            elif 'BOOL' in input_type[i]:
                data[key] = np.zeros(shape=val, dtype=bool)
            else:
                # float inputs drawn uniformly from [-1, 1)
                sample_data = np.random.random(size=val) * 2 - 1
                data[key] = sample_data.astype(INPUT_TYPE[input_type[i]])
            i += 1
        return data

    def get_fake_samples_bert(self, batch_size, shape, input_type):
        """Generate one batch of BERT-style inputs (ids, mask, segments).

        Assumes ``shape`` iterates as (input_ids, input_mask, rest...) --
        TODO confirm against the BERT model configs.

        Raises:
            ValueError: if ``input_type`` is empty or missing.
        """
        data = {}
        avg_seq_len = 192
        max_seq_len = 384
        if not input_type:
            raise ValueError("Please provide input type")
        i = 0
        for key, val in shape.items():
            val = [val[0] * batch_size] + val[1:]
            if i == 0:
                # fake input ids: 30523 matches the BERT vocab size + 1
                input_ids = np.random.randint(low=0, high=30523,
                                              size=val).astype(
                                                  INPUT_TYPE[input_type[i]])
                data[key] = input_ids
            elif i == 1:
                # fake sequence lengths with mean avg_seq_len, then expand
                # into a 0/1 attention mask
                input_len = np.random.randint(low=2 * avg_seq_len -
                                              max_seq_len,
                                              high=max_seq_len + 1,
                                              size=(batch_size),
                                              dtype=np.int32)
                input_mask = np.zeros(val).astype(INPUT_TYPE[input_type[i]])
                for b_idx, s_len in enumerate(input_len):
                    input_mask[b_idx][:s_len] = 1
                data[key] = input_mask
            else:
                # remaining inputs (e.g. segment ids) are all zeros
                data[key] = np.zeros(val).astype(INPUT_TYPE[input_type[i]])
            i += 1
        return data
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import numpy as np
from general_perf.datasets import test_accuracy
from tqdm import tqdm
log = logging.getLogger("TestAccuracy")
class AccuracyChecker(test_accuracy.AccuracyChecker):
    """Accuracy checker for the fake dataset.

    There is no ground truth for random data, so the reported accuracy is
    always 0; the flattened outputs are still saved so they can be diffed
    against a CPU reference run.
    """

    def calculate_acc(self, data_percent=10):
        """Run inference over a slice of batches and dump the outputs.

        Args:
            data_percent: percentage of batches to run; falsy means all.

        Returns:
            Dict with the (always zero) fake-dataset accuracy.
        """
        log.info("Start to calculate accuracy...")
        batch_total = self.dataloader.get_batch_count()
        num = int((data_percent / 100) * batch_total) if data_percent \
            else batch_total
        flattened = []
        for batch_idx in tqdm(range(num)):
            batch = self.dataloader.get_samples(batch_idx)
            outputs = self.runtime_backend.predict(batch)
            if isinstance(outputs, dict):
                # iterate keys in sorted order for a deterministic dump
                for out_name in sorted(outputs):
                    flattened.extend(outputs[out_name].flatten())
            elif isinstance(outputs, list):
                for tensor in outputs:
                    flattened.extend(tensor.flatten())
            else:
                flattened.extend(outputs)
        log.info('Batch size is {}, Accuracy: {}'.format(
            self.dataloader.cur_bs, 0.0))
        np.save(self.output_dir + "/{}.npy".format(self.dataloader.name()),
                np.array(flattened),
                allow_pickle=True)
        return {"Fake Dataset Accuracy": 0}
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import logging
from typing import Any, Dict
import matplotlib.pyplot as plt
import numpy as np
log = logging.getLogger("TestAccuracy")
def draw_all_diff(ori_outs, cur_outs, file_name) -> Dict[str, Any]:
    """Compare reference vs backend outputs and plot their distributions.

    Args:
        ori_outs: reference (CPU) output array.
        cur_outs: backend output array; assumed to flatten to the same
            length as ``ori_outs`` -- TODO confirm callers guarantee this.
        file_name: path of the histogram image to write.

    Returns:
        Dict with rounded Mean/Std/Max absolute diff and relative diff.
    """
    # flatten() returns copies, so the sanitizing writes below do not
    # mutate the caller's arrays
    ori_data = ori_outs.flatten()
    cur_data = cur_outs.flatten()
    # NaN & Inf are not comparable; replace with 0 before differencing
    ori_data[np.isnan(ori_data)] = 0.0
    ori_data[np.isinf(ori_data)] = 0.0
    cur_data[np.isnan(cur_data)] = 0.0
    cur_data[np.isinf(cur_data)] = 0.0
    length = min(ori_data.shape[0], 300)
    diff = ori_data - cur_data
    # avoid division by zero when computing the relative diff
    ori_data = np.where(ori_data == 0, 1, ori_data)
    rel_diff = np.divide(diff, ori_data)
    rel_diff = np.nan_to_num(rel_diff)
    log.info('Mean Diff: {}, Std Diff: {}, Max Diff: {}, Max Rel-Diff: {}, Mean Rel-Diff: {}'.format(
        np.mean(abs(diff)), np.std(abs(diff)),
        abs(diff).max(), abs(rel_diff).max(), np.mean(abs(rel_diff))))
    result = {}
    result["Mean Diff"] = round(float(np.mean(abs(diff))), 5)
    result["Std Diff"] = round(float(np.std(abs(diff))), 5)
    result["Max Diff"] = round(float(abs(diff).max()), 5)
    result["Max Rel-Diff"] = round(float(abs(rel_diff).max()), 5)
    result["Mean Rel-Diff"] = round(float(np.mean(abs(rel_diff))), 5)
    plt.figure(figsize=(16, 8))
    plt.cla()
    plt.subplot(1, 3, 1)
    plt.yscale('log')
    plt.hist(diff,
             bins=length,
             alpha=0.5,
             label='Diff',
             range=(diff.min(), diff.max()))
    plt.xlabel("Diff Distribute")
    plt.subplot(1, 3, 2)
    plt.yscale('log')
    plt.hist(ori_data,
             bins=length,
             alpha=0.5,
             label='CPU',
             range=(ori_data.min(), ori_data.max()))
    plt.xlabel("CPU Result")
    plt.subplot(1, 3, 3)
    plt.yscale('log')
    plt.hist(cur_data,
             bins=length,
             alpha=0.5,
             label='Backend',
             range=(cur_data.min(), cur_data.max()))
    plt.xlabel("Backend Result")
    plt.savefig(file_name, dpi=300)
    # BUGFIX: close the figure so repeated calls do not accumulate open
    # matplotlib figures (a memory leak when diffing many models).
    plt.close()
    return result
class AccuracyChecker():
    """Base class for per-dataset accuracy checkers.

    The benchmark harness populates ``configs``, ``dataloader``,
    ``runtime_backend`` and ``output_dir`` after construction.
    """

    def __init__(self):
        # all attributes are injected by the harness before use
        self.configs = None
        self.dataloader = None
        self.runtime_backend = None
        self.output_dir = ""

    def calculate_diff(self) -> Dict[str, float]:
        """Diff this backend's saved outputs against the CPU reference.

        Returns:
            Dict of Mean/Std/Max Diff stats from ``draw_all_diff``, or an
            empty dict when no CPU reference data exists.
        """
        cpu_data_path = os.path.abspath(
            os.path.join('general_perf/reports/CPU', self.configs["model"]))
        if not os.path.exists(cpu_data_path):
            log.info("Fetch CPU Data Failed")
            return {}
        result_file = "{}.npy".format(self.dataloader.name())
        vendor_data = np.load(os.path.join(self.output_dir, result_file))
        cpu_data = np.load(os.path.join(cpu_data_path, result_file))
        plot_name = (self.configs["model"] + '-to-' +
                     self.configs['compile_precision'].lower() + '.png')
        return draw_all_diff(cpu_data, vendor_data,
                             os.path.join(self.output_dir, plot_name))

    def calculate_acc(self, data_percent) -> Dict[str, Any]:
        """Compute dataset accuracy; must be overridden by subclasses."""
        # BUGFIX: error message previously said "caculate_acc" (typo)
        raise NotImplementedError("Dataset: calculate_acc")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment