Commit b0f41f60 authored by yuguo960516's avatar yuguo960516
Browse files

resnet50

parent 456eb360
Pipeline #144 failed with stages
in 0 seconds
import os
import sys
from shutil import copy
import numpy as np
import matplotlib.pyplot as plt
from numpy.core.fromnumeric import var
def npy_compare(lhs_path, rhs_path):
    """Return True if the two .npy files hold elementwise-close arrays."""
    return np.allclose(np.load(lhs_path), np.load(rhs_path))
def npy_diff(lhs_path, rhs_path):
    """Return (mean, std, max, min) of the absolute elementwise difference
    between the arrays stored in the two .npy files."""
    delta = np.absolute(np.load(lhs_path) - np.load(rhs_path))
    return delta.mean(), delta.std(), delta.max(), delta.min()
def walk_compare_npy(lhs, rhs):
    """Walk every .npy file under ``lhs``, compare it with the file at the
    same relative path under ``rhs``, print a per-file diff summary for
    mismatches, and print same/diff/ignore totals at the end."""
    assert os.path.isdir(lhs)
    assert os.path.isdir(rhs)
    counts = {"same": 0, "diff": 0, "ignore": 0}
    for walk_root, _dirs, filenames in os.walk(lhs):
        for fname in filenames:
            if not fname.endswith(".npy"):
                continue
            lhs_path = os.path.join(walk_root, fname)
            rhs_path = os.path.join(rhs, os.path.relpath(lhs_path, lhs))
            if not (os.path.exists(rhs_path) and os.path.isfile(rhs_path)):
                # No counterpart on the rhs side.
                print("{} ignore".format(lhs_path))
                counts["ignore"] += 1
                continue
            if npy_compare(lhs_path, rhs_path):
                counts["same"] += 1
            else:
                mean, std, max_, min_ = npy_diff(lhs_path, rhs_path)
                print(lhs_path, f"mean={mean}, std={std}, max={max_}, min={min_}")
                counts["diff"] += 1
    print("same:", counts["same"])
    print("diff:", counts["diff"])
    print("ignore:", counts["ignore"])
import zlib
def crc32(filename):
    """Return the CRC32 checksum of the file's bytes (also prints it)."""
    with open(filename, "rb") as f:
        checksum = zlib.crc32(f.read())
    print(filename, checksum)
    return checksum
def var_compare(lhs_path, rhs_path):
    """Compare two files by CRC32 checksum; print both values on mismatch."""
    lhs = crc32(lhs_path)
    rhs = crc32(rhs_path)
    equal = lhs == rhs
    if not equal:
        print(lhs)
        print(rhs)
    return equal
def walk_compare_of_variable(lhs, rhs):
    """Walk every ``*out`` variable file under ``lhs``, compare it by CRC32
    with its counterpart under ``rhs``, and print same/diff/ignore totals."""
    assert os.path.isdir(lhs)
    assert os.path.isdir(rhs)
    counts = {"same": 0, "diff": 0, "ignore": 0}
    for walk_root, _dirs, filenames in os.walk(lhs):
        for fname in filenames:
            if not fname.endswith("out"):
                continue
            lhs_path = os.path.join(walk_root, fname)
            rhs_path = os.path.join(rhs, os.path.relpath(lhs_path, lhs))
            if not (os.path.exists(rhs_path) and os.path.isfile(rhs_path)):
                print("{} ignore".format(lhs_path))
                counts["ignore"] += 1
                continue
            if var_compare(lhs_path, rhs_path):
                counts["same"] += 1
            else:
                print("{} False".format(lhs_path))
                counts["diff"] += 1
    print("same:", counts["same"])
    print("diff:", counts["diff"])
    print("ignore:", counts["ignore"])
def get_varible_name(var_org):
    """Best-effort reverse lookup of the name bound to ``var_org``.

    Inspects the locals of the frame two levels up (the caller's caller —
    the frame depth is load-bearing, do not wrap this call) and returns the
    first local name whose value *is* ``var_org``, or None if not found.
    """
    for local_name, local_val in sys._getframe(2).f_locals.items():
        if local_val is var_org:
            return local_name
def dump_to_npy(tensor, root="./output", sub="", name=""):
    """Save ``tensor`` as ``<root>[/<sub>]/<name>.npy``.

    When ``name`` is empty, the variable name is recovered from the caller's
    frame via get_varible_name. Non-ndarray inputs are assumed to be oneflow
    tensors and converted via ``to_local().numpy()`` — TODO confirm for other
    tensor types.
    """
    if sub != "":
        root = os.path.join(root, str(sub))
    # exist_ok avoids the racy isdir-then-makedirs check the original used.
    os.makedirs(root, exist_ok=True)
    var_org_name = get_varible_name(tensor) if name == "" else name
    if var_org_name is None:
        # get_varible_name can fail to find the object in the caller's frame;
        # don't silently produce a file literally named "None.npy".
        var_org_name = "unnamed"
    path = os.path.join(root, f"{var_org_name}.npy")
    if not isinstance(tensor, np.ndarray):
        tensor = tensor.to_local().numpy()
    np.save(path, tensor)
def save_param_npy(module, root="./output"):
    """Dump every named parameter of ``module`` as <root>/0/<name>.npy."""
    for param_name, param in module.named_parameters():
        dump_to_npy(param.numpy(), root=root, sub=0, name=param_name)
def param_hist(param, name, root="output"):
    """Save a histogram of ``param``'s values to <root>/<name>.png."""
    print(name, param.shape)
    plt.hist(param.flatten(), density=False, facecolor="g")
    plt.title(f"Histogram of {name}")
    plt.grid(True)
    plt.savefig(os.path.join(root, f"{name}.png"))
    # Close the figure so repeated calls don't accumulate state.
    plt.close()
def save_param_hist_pngs(module, root="output"):
    """Write one histogram PNG per named parameter of ``module``."""
    for param_name, param in module.named_parameters():
        param_hist(param.numpy(), param_name, root=root)
if __name__ == "__main__":
    # Entry point: compare the local ./output .npy dump against the reference
    # OneFlow-Benchmark dump (hard-coded absolute path on the dev machine).
    # walk_compare_of_variable('init_ckpt', '/ssd/xiexuan/OneFlow-Benchmark/Classification/cnns/loaded_init_ckpt')
    walk_compare_npy(
        "output", "/ssd/xiexuan/OneFlow-Benchmark/Classification/cnns/output"
    )
This diff is collapsed.
import time
from datetime import datetime
import numpy as np
import oneflow as flow
_GLOBAL_LOGGER = None
def get_logger(rank, print_ranks):
    """Return the process-wide Logger singleton, creating it on first call.

    NOTE: ``rank``/``print_ranks`` are only honored by the first call; later
    calls return the already-built instance unchanged.
    """
    global _GLOBAL_LOGGER
    if _GLOBAL_LOGGER is not None:
        return _GLOBAL_LOGGER
    _GLOBAL_LOGGER = Logger(rank, print_ranks)
    return _GLOBAL_LOGGER
class Logger(object):
    """Rank-aware metrics logger.

    Meters are registered under string keys; print_metrics() formats every
    registered meter's current value on one line and prints it only when this
    process's rank is in the active print-rank list. Meters flagged with
    ``reset_after_print`` are reset on every print_metrics() call, whether or
    not this rank actually printed.
    """

    def __init__(self, rank, print_ranks):
        self.rank = rank
        self.print_ranks = print_ranks
        self.step = 0
        self.m = dict()  # metric_key -> {meter, print_format, reset_after_print}

    def register_metric(
        self, metric_key, meter, print_format=None, reset_after_print=False
    ):
        assert metric_key not in self.m
        entry = {
            "meter": meter,
            "print_format": print_format or (metric_key + ": {}"),
            "reset_after_print": reset_after_print,
        }
        self.m[metric_key] = entry

    def metric(self, mkey):
        """Return the meter registered under ``mkey``, or None."""
        entry = self.m.get(mkey)
        return None if entry is None else entry["meter"]

    def meter(self, mkey, *args):
        """Record ``args`` into the meter registered under ``mkey``."""
        assert mkey in self.m
        self.m[mkey]["meter"].record(*args)

    def print_metrics(self, print_ranks=None):
        fields = []
        for entry in self.m.values():
            meter = entry["meter"]
            fmt = entry["print_format"]
            value = meter.get()
            if isinstance(value, (list, tuple)):
                fields.append(fmt.format(*value))
            else:
                fields.append(fmt.format(value))
            if entry["reset_after_print"]:
                meter.reset()
        if self.rank in (print_ranks or self.print_ranks):
            stamp = datetime.now().strftime("| %Y-%m-%d %H:%M:%S.%f")[:-3]
            print("[rank:{}] {}".format(self.rank, ", ".join(fields)), stamp)

    def print(self, *args, print_ranks=None):
        """Plain print, gated on this process's rank."""
        if self.rank in (print_ranks or self.print_ranks):
            print(*args)
class IterationMeter(object):
    """Meter that remembers only the most recently recorded value."""

    def __init__(self):
        self.val = 0

    def record(self, val):
        # Overwrite — no accumulation.
        self.val = val

    def get(self):
        return self.val
def _zeros_by_val(val):
    """Return a zero value of the same kind as ``val``.

    Supports flow.Tensor, np.ndarray, int, and float; raises ValueError for
    anything else. (Note: bools are ints in Python, so a bool yields 0.)
    """
    if isinstance(val, flow.Tensor):
        return flow.zeros_like(val)
    if isinstance(val, np.ndarray):
        return np.zeros_like(val)
    if isinstance(val, int):
        return 0
    if isinstance(val, float):
        return 0.0
    # Was a bare ``raise ValueError`` after a dead ``ret = 0`` initializer.
    raise ValueError(f"unsupported value type: {type(val)}")
class AverageMeter(object):
    """Weighted running average of recorded values.

    The accumulator type is derived from the first recorded value via
    _zeros_by_val, so tensors, ndarrays, ints, and floats all work.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        self.sum = None
        self.n = 0

    def record(self, val, n=1):
        """Add ``val`` with weight ``n`` to the running total."""
        self.n += n
        if self.sum is None:
            self.sum = _zeros_by_val(val)
        self.sum += val if n == 1 else val * n

    def get(self):
        """Return the current average as a Python scalar where possible."""
        if self.n == 0:
            return 0
        avg = self.sum / self.n
        if isinstance(avg, flow.Tensor):
            # NOTE: device sync happens here.
            return avg.numpy().item()
        if isinstance(avg, np.ndarray):
            return avg.item()
        return avg
class RunningMeter(object):
    # Placeholder: presumably intended for running/windowed statistics,
    # never implemented.
    pass
class ProgressMeter(object):
    """Tracks progress as a (current, total) pair."""

    def __init__(self, total):
        self.cur = 0
        self.total = total

    def record(self, cur, total=None):
        """Update the current position; optionally also update the total."""
        self.cur = cur
        self.total = self.total if total is None else total

    def get(self):
        return self.cur, self.total
class TimeMeter(object):
    """Throughput meter: items recorded divided by elapsed wall time.

    Each get() stamps the end time; a subsequent reset() starts the next
    measurement window where the previous one ended, so back-to-back windows
    don't lose time between them.
    """

    def __init__(self, return_timestamp=False):
        self.return_timestamp = return_timestamp
        self.n = 0
        self.ets = None
        self.bts = None
        self.reset()

    def reset(self):
        self.n = 0
        # Continue from the last window's end if one exists.
        self.bts = time.perf_counter() if self.ets is None else self.ets
        self.ets = None

    def record(self, n):
        self.n += n

    def get(self):
        self.ets = time.perf_counter()
        assert self.ets > self.bts, f"{self.ets} > {self.bts}"
        rate = self.n / (self.ets - self.bts)
        return (rate, self.ets) if self.return_timestamp else rate
from PIL import Image
import numpy as np
import os
import random
def load_image(image_path="data/fish.jpg"):
    """Load an image, resize to 224x224, normalize with ImageNet RGB stats,
    and return a contiguous float32 array in NCHW layout with batch dim 1."""
    mean = [123.68, 116.779, 103.939]
    std = [58.393, 57.12, 57.375]
    img = Image.open(image_path).resize((224, 224)).convert("RGB")
    arr = np.array(img).astype("float32")
    arr = (arr - mean) / std
    # HWC -> CHW, then prepend the batch axis.
    arr = np.expand_dims(np.transpose(arr, (2, 0, 1)), axis=0)
    return np.ascontiguousarray(arr, "float32")
class NumpyDataLoader(object):
    """Minimal image-folder loader: each sub-folder of ``dataset_root`` is a
    class, and batches of (images, labels) are consumed sequentially."""

    def __init__(self, dataset_root: str, batch_size: int = 1):
        self.dataset_root = dataset_root
        self.batch_size = batch_size
        self.image_2_class_label_list = []
        self.label_2_class_name = {}
        # Assign integer labels 0..N-1 in os.listdir order of the sub-folders.
        for label, sub_name in enumerate(os.listdir(self.dataset_root)):
            self.label_2_class_name[label] = sub_name
            sub_root = os.path.join(self.dataset_root, sub_name)
            for image_name in os.listdir(sub_root):
                self.image_2_class_label_list.append(
                    (os.path.join(sub_root, image_name), label)
                )
        self.curr_idx = 0
        self.shuffle_data()

    def shuffle_data(self):
        """Shuffle the sample list in place and restart from the beginning."""
        random.shuffle(self.image_2_class_label_list)
        self.curr_idx = 0

    def __getitem__(self, index):
        # NOTE: ``index`` is ignored; batches advance an internal cursor.
        images, labels = [], []
        for _ in range(self.batch_size):
            image_path, label = self.image_2_class_label_list[self.curr_idx]
            images.append(load_image(image_path))
            labels.append(int(label))
            self.curr_idx += 1
        batch = np.concatenate(tuple(images), axis=0)
        return np.ascontiguousarray(batch, "float32"), np.array(labels, dtype=np.int32)

    def __len__(self):
        return len(self.image_2_class_label_list) // self.batch_size
from typing import List, Union
import oneflow as flow
import oneflow.nn as nn
import os
class OFRecordDataLoader(nn.Module):
    """OFRecord-based ImageNet-style input pipeline for oneflow.

    Train mode: shuffled reader, random-crop decode, resize to 224x224,
    random horizontal flip, crop-mirror-normalize. Eval mode: plain decode,
    shorter-side resize to 256, center crop to 224x224, normalize. Each
    forward() call yields one (image, label) batch.
    """

    def __init__(
        self,
        ofrecord_root: str = "./ofrecord",
        mode: str = "train",  # "train" or "val"
        dataset_size: int = 9469,
        batch_size: int = 1,
        total_batch_size: int = 1,
        ofrecord_part_num: int = 1,
        placement: flow.placement = None,
        sbp: Union[flow.sbp.sbp, List[flow.sbp.sbp]] = None,
    ):
        super().__init__()
        channel_last = False
        output_layout = "NHWC" if channel_last else "NCHW"
        self.train_record_reader = flow.nn.OfrecordReader(
            os.path.join(ofrecord_root, mode),
            batch_size=batch_size,
            data_part_num=ofrecord_part_num,
            part_name_suffix_length=5,
            random_shuffle=True if mode == "train" else False,
            shuffle_after_epoch=True if mode == "train" else False,
            placement=placement,
            sbp=sbp,
        )
        self.record_label_decoder = flow.nn.OFRecordRawDecoder(
            "class/label", shape=(), dtype=flow.int32
        )
        color_space = "RGB"
        height = 224
        width = 224
        self.record_image_decoder = (
            flow.nn.OFRecordImageDecoderRandomCrop("encoded", color_space=color_space)
            if mode == "train"
            else flow.nn.OFRecordImageDecoder("encoded", color_space=color_space)
        )
        self.resize = (
            flow.nn.image.Resize(target_size=[height, width])
            if mode == "train"
            else flow.nn.image.Resize(
                resize_side="shorter", keep_aspect_ratio=True, target_size=256
            )
        )
        # Coin flip drives random horizontal mirroring; train mode only.
        self.flip = (
            flow.nn.CoinFlip(batch_size=batch_size, placement=placement, sbp=sbp)
            if mode == "train"
            else None
        )
        rgb_mean = [123.68, 116.779, 103.939]
        rgb_std = [58.393, 57.12, 57.375]
        self.crop_mirror_norm = (
            flow.nn.CropMirrorNormalize(
                color_space=color_space,
                output_layout=output_layout,
                mean=rgb_mean,
                std=rgb_std,
                output_dtype=flow.float,
            )
            if mode == "train"
            else flow.nn.CropMirrorNormalize(
                color_space=color_space,
                output_layout=output_layout,
                crop_h=height,
                crop_w=width,
                crop_pos_y=0.5,
                crop_pos_x=0.5,
                mean=rgb_mean,
                std=rgb_std,
                output_dtype=flow.float,
            )
        )
        self.batch_size = batch_size
        self.total_batch_size = total_batch_size
        self.dataset_size = dataset_size

    def __len__(self):
        # Number of whole global batches per epoch.
        return self.dataset_size // self.total_batch_size

    def forward(self):
        """Decode, resize, (optionally) flip, and normalize one batch."""
        train_record = self.train_record_reader()
        label = self.record_label_decoder(train_record)
        image_raw_buffer = self.record_image_decoder(train_record)
        image = self.resize(image_raw_buffer)[0]
        # FIX: identity comparison with None (`is not None`), not `!= None`.
        rng = self.flip() if self.flip is not None else None
        image = self.crop_mirror_norm(image, rng)
        return image, label
import matplotlib.pyplot as plt


def _read_losses(path):
    """Read one float loss value per line from ``path``."""
    with open(path, "r") as f:
        return [float(line.strip()) for line in f]


# The two parsing loops were duplicated; use one helper for both files.
of_losses = _read_losses("of_losses.txt")
torch_losses = _read_losses("torch_losses.txt")

iters = [i for i in range(len(of_losses))]
plt.plot(iters, of_losses, label="oneflow")
plt.plot(iters, torch_losses, label="pytorch")
plt.xlabel("iter - axis")
# Set the y axis label of the current axis.
plt.ylabel("loss - axis")
# Set a title of the current axes.
plt.title("compare ")
# show a legend on the plot
plt.legend()
# Display a figure.
plt.show()
import pandas as pd
from collections import namedtuple
class Printer(object):
    """Buffers records and prints per-field aggregates as a table or one-liners.

    Workflow: construct -> register_handler()/register_str_len() for every
    field -> finish() -> record(...) repeatedly -> print(). Each print() runs
    the per-field handlers over a DataFrame of the buffered records, prints
    the aggregated row, optionally appends the raw records to a CSV file,
    and clears the buffer.
    """

    def __init__(self, field_names, print_format="table", persistent_file=None):
        assert print_format in ("table", "normal")
        self.field_names_ = field_names
        self.format_ = print_format
        self.records_ = []  # buffered Record namedtuples
        self.handlers_ = dict()  # field -> aggregation fn over a pd.Series
        self.str_lens_ = dict()  # field -> column width for table output
        self.title_printed_ = False
        if persistent_file is not None:
            self.csv_ = open(persistent_file, "a")
        else:
            self.csv_ = None
        self.Record = None  # namedtuple type, created by finish()

    def __del__(self):
        # BUG FIX: this was misspelled ``__def__`` so the CSV handle was
        # never closed. getattr guards against a partially-initialized object.
        csv_file = getattr(self, "csv_", None)
        if csv_file is not None:
            csv_file.close()

    def finish(self):
        """Validate that every field is fully registered and freeze Record."""
        err = f"{len(self.field_names_)} vs. {len(self.handlers_)}"
        assert len(self.field_names_) == len(self.handlers_), err
        err = f"{len(self.field_names_)} vs. {len(self.str_lens_)}"
        assert len(self.field_names_) == len(self.str_lens_), err
        for fname in self.field_names_:
            assert fname in self.handlers_, f"{fname} handler not register"
            assert fname in self.str_lens_, f"{fname} str_len not register"
        self.Record = namedtuple("Record", self.field_names_)

    def record(self, *args, **kwargs):
        """Buffer one record; finish() must have been called first."""
        assert self.Record is not None
        r = self.Record(*args, **kwargs)
        self.records_.append(r)

    def register_handler(self, field, handler):
        """Register the aggregation callable for ``field``."""
        assert callable(handler)
        self.handlers_[field] = handler

    def register_str_len(self, field, str_len):
        """Register the table column width for ``field``."""
        assert isinstance(str_len, int)
        self.str_lens_[field] = str_len

    def reset_records(self):
        self.records_ = []

    def print_table_title(self):
        """Print the markdown-style header and separator rows once."""
        fields = ""
        sep = ""
        for fname in self.field_names_:
            str_len = self.str_lens_[fname]
            fields += "| {} ".format(fname.ljust(str_len))
            sep += f"| {'-' * str_len} "
        fields += "|"
        sep += "|"
        print(fields)
        print(sep)
        self.title_printed_ = True

    def reset_title_printed(self):
        self.title_printed_ = False

    def print(self):
        """Aggregate buffered records, print them, persist to CSV, reset."""
        df = pd.DataFrame(self.records_)
        fields = []
        for fname in self.field_names_:
            assert fname in self.handlers_
            handler = self.handlers_[fname]
            field_value = handler(df[fname])
            fields.append(field_value)
        if self.format_ == "table":
            if not self.title_printed_:
                self.print_table_title()
            record = ""
            for i, str_len in enumerate(self.str_lens_.values()):
                record += "| {} ".format(str(fields[i]).ljust(str_len))
            record += "|"
            print(record)
        elif self.format_ == "normal":
            record = ""
            for i, fval in enumerate(fields):
                fname = self.field_names_[i]
                record += f"{fname}: {fval}, "
            print(record)
        else:
            raise ValueError
        if self.csv_ is not None:
            df.to_csv(self.csv_, header=False)
        self.reset_records()
import sys
import signal
import subprocess
import threading
class CudaUtilMemStat:
    """Periodically samples GPU utilization / memory via ``nvidia-smi`` and
    appends the values as CSV rows to ``stat_file_path``.

    When ``only_ordinal`` is set, only that GPU's columns are written.
    """

    def __init__(
        self, stat_file_path, stat_util=True, stat_mem=True, only_ordinal=None
    ):
        self.stat_file = open(stat_file_path, "wt")
        self.stat_util = stat_util
        self.stat_mem = stat_mem
        self.only_ordinal = only_ordinal
        self._write_titles()

    def __del__(self):
        # NOTE(review): raises AttributeError if __init__ failed before
        # self.stat_file was assigned — confirm acceptable.
        self.stat_file.close()

    def _write_titles(self):
        # ``nvidia-smi -L`` prints one GPU per line, e.g.
        # "GPU 0: <name> (UUID: ...)"; anything on stderr is treated as fatal.
        proc = subprocess.Popen(
            ["nvidia-smi", "-L"], stdout=subprocess.PIPE, stderr=subprocess.PIPE,
        )
        stdout, stderr = proc.communicate()
        if stderr != b"":
            raise RuntimeError(stderr)
        gpus = []
        lines = stdout.decode("utf-8").split("\n")
        for line in lines:
            if line.strip() == "":
                continue
            # Keep only the "GPU <i>" prefix before the first colon.
            gpus.append(line.split(":")[0])
        util_titles, mem_titles = [], []
        for gpu in gpus:
            if self.stat_util:
                util_titles.append(gpu + " utilization")
            if self.stat_mem:
                mem_titles.append(gpu + " memory used")
        if self.only_ordinal is None:
            self.stat_file.write(",".join(util_titles + mem_titles) + "\n")
        else:
            titles = (
                util_titles[self.only_ordinal] + "," + mem_titles[self.only_ordinal]
            )
            self.stat_file.write(titles + "\n")
        self.stat_file.flush()

    def stat(self):
        # command: nvidia-smi --query-gpu=utilization.gpu,memory.used --format=csv
        proc = subprocess.Popen(
            ["nvidia-smi", "--query-gpu=utilization.gpu,memory.used", "--format=csv"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        stdout, stderr = proc.communicate()
        if stderr != b"":
            raise RuntimeError(stderr)
        lines = stdout.decode("utf-8").split("\n")
        util_vals = []
        mem_vals = []
        # Skip the CSV header row; each data row looks like "37 %, 1024 MiB".
        for line in lines[1:]:
            if line.strip() == "":
                continue
            util_mem = line.split(",")
            assert len(util_mem) == 2, lines
            util = util_mem[0].strip()
            mem = util_mem[1].strip()
            assert "%" in util
            assert "MiB" in mem
            # Split "<value> <unit>" and keep only the numeric part as a string.
            util = util.split(" ")
            mem = mem.split(" ")
            assert len(util) == 2
            assert len(mem) == 2
            util_val = util[0].strip()
            mem_val = mem[0].strip()
            if self.stat_util:
                util_vals.append(util_val)
            if self.stat_mem:
                mem_vals.append(mem_val)
        if self.only_ordinal is None:
            self.stat_file.write(",".join(util_vals + mem_vals) + "\n")
        else:
            vals = util_vals[self.only_ordinal] + "," + mem_vals[self.only_ordinal]
            self.stat_file.write(vals + "\n")
        self.stat_file.flush()

    def start(self, interval):
        # Samples on a background StatThread; this thread blocks until
        # SIGINT/SIGTERM sets the stop event.
        # NOTE(review): signal.signal only works from the main thread — confirm callers.
        stop = threading.Event()
        stat_thrd = StatThread(self.stat, interval, stop)
        stat_thrd.start()

        def close(signum, frame):
            print("Closing...")
            stop.set()

        signal.signal(signal.SIGTERM, close)
        signal.signal(signal.SIGINT, close)
        print("Start stat")
        print("Print Ctrl+C to stop")
        stop.wait()
class StatThread(threading.Thread):
    """Worker thread that invokes ``handler`` every ``interval`` seconds
    until ``stop_event`` is set."""

    def __init__(self, handler, interval, stop_event):
        super().__init__()
        self.handler = handler
        self.interval = interval
        self.stopped = stop_event
        self.count = 0

    def run(self):
        while True:
            # Event.wait returns True once the stop event is set,
            # False on a plain timeout (keep sampling).
            if self.stopped.wait(self.interval):
                break
            print(f"{self.count} th run stat")
            self.handler()
            self.count += 1
if __name__ == "__main__":
    # Usage: python gpu_stat.py [output_file] [interval_seconds]
    stat_file_path = sys.argv[1] if len(sys.argv) > 1 else "gpu_stat.log"
    # BUG FIX: argv entries are strings; Event.wait(interval) needs a number.
    interval = float(sys.argv[2]) if len(sys.argv) > 2 else 1.0
    stat = CudaUtilMemStat(stat_file_path)
    stat.start(interval)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment