Commit b0f41f60 authored by yuguo960516's avatar yuguo960516
Browse files

resnet50

parent 456eb360
Pipeline #144 failed with stages
in 0 seconds
import os
import sys
from shutil import copy
import numpy as np
import matplotlib.pyplot as plt
from numpy.core.fromnumeric import var
def npy_compare(lhs_path, rhs_path):
    """Return True if the two .npy files hold elementwise-close arrays."""
    return np.allclose(np.load(lhs_path), np.load(rhs_path))
def npy_diff(lhs_path, rhs_path):
    """Return (mean, std, max, min) of the absolute elementwise difference
    between the arrays stored in the two .npy files."""
    delta = np.absolute(np.load(lhs_path) - np.load(rhs_path))
    return delta.mean(), delta.std(), delta.max(), delta.min()
def walk_compare_npy(lhs, rhs):
    """Walk every .npy file under ``lhs``, compare it with the file at the
    same relative path under ``rhs``, print a per-file diff summary for
    mismatches, and print same/diff/ignore totals at the end."""
    assert os.path.isdir(lhs)
    assert os.path.isdir(rhs)
    counts = {"same": 0, "diff": 0, "ignore": 0}
    for walk_root, _dirs, filenames in os.walk(lhs):
        for fname in filenames:
            if not fname.endswith(".npy"):
                continue
            lhs_path = os.path.join(walk_root, fname)
            rhs_path = os.path.join(rhs, os.path.relpath(lhs_path, lhs))
            if not (os.path.exists(rhs_path) and os.path.isfile(rhs_path)):
                # No counterpart on the rhs side.
                print("{} ignore".format(lhs_path))
                counts["ignore"] += 1
                continue
            if npy_compare(lhs_path, rhs_path):
                counts["same"] += 1
            else:
                mean, std, max_, min_ = npy_diff(lhs_path, rhs_path)
                print(lhs_path, f"mean={mean}, std={std}, max={max_}, min={min_}")
                counts["diff"] += 1
    print("same:", counts["same"])
    print("diff:", counts["diff"])
    print("ignore:", counts["ignore"])
import zlib
def crc32(filename):
    """Return the CRC32 checksum of the file's bytes (also prints it)."""
    with open(filename, "rb") as f:
        checksum = zlib.crc32(f.read())
    print(filename, checksum)
    return checksum
def var_compare(lhs_path, rhs_path):
    """Compare two files by CRC32 checksum; print both values on mismatch."""
    lhs = crc32(lhs_path)
    rhs = crc32(rhs_path)
    equal = lhs == rhs
    if not equal:
        print(lhs)
        print(rhs)
    return equal
def walk_compare_of_variable(lhs, rhs):
    """Walk every ``*out`` variable file under ``lhs``, compare it by CRC32
    with its counterpart under ``rhs``, and print same/diff/ignore totals."""
    assert os.path.isdir(lhs)
    assert os.path.isdir(rhs)
    counts = {"same": 0, "diff": 0, "ignore": 0}
    for walk_root, _dirs, filenames in os.walk(lhs):
        for fname in filenames:
            if not fname.endswith("out"):
                continue
            lhs_path = os.path.join(walk_root, fname)
            rhs_path = os.path.join(rhs, os.path.relpath(lhs_path, lhs))
            if not (os.path.exists(rhs_path) and os.path.isfile(rhs_path)):
                print("{} ignore".format(lhs_path))
                counts["ignore"] += 1
                continue
            if var_compare(lhs_path, rhs_path):
                counts["same"] += 1
            else:
                print("{} False".format(lhs_path))
                counts["diff"] += 1
    print("same:", counts["same"])
    print("diff:", counts["diff"])
    print("ignore:", counts["ignore"])
def get_varible_name(var_org):
    """Best-effort reverse lookup of the name bound to ``var_org``.

    Inspects the locals of the frame two levels up (the caller's caller —
    the frame depth is load-bearing, do not wrap this call) and returns the
    first local name whose value *is* ``var_org``, or None if not found.
    """
    for local_name, local_val in sys._getframe(2).f_locals.items():
        if local_val is var_org:
            return local_name
def dump_to_npy(tensor, root="./output", sub="", name=""):
    """Save ``tensor`` as ``<root>[/<sub>]/<name>.npy``.

    When ``name`` is empty, the variable name is recovered from the caller's
    frame via get_varible_name. Non-ndarray inputs are assumed to be oneflow
    tensors and converted via ``to_local().numpy()`` — TODO confirm for other
    tensor types.
    """
    if sub != "":
        root = os.path.join(root, str(sub))
    # exist_ok avoids the racy isdir-then-makedirs check the original used.
    os.makedirs(root, exist_ok=True)
    var_org_name = get_varible_name(tensor) if name == "" else name
    if var_org_name is None:
        # get_varible_name can fail to find the object in the caller's frame;
        # don't silently produce a file literally named "None.npy".
        var_org_name = "unnamed"
    path = os.path.join(root, f"{var_org_name}.npy")
    if not isinstance(tensor, np.ndarray):
        tensor = tensor.to_local().numpy()
    np.save(path, tensor)
def save_param_npy(module, root="./output"):
    """Dump every named parameter of ``module`` as <root>/0/<name>.npy."""
    for param_name, param in module.named_parameters():
        dump_to_npy(param.numpy(), root=root, sub=0, name=param_name)
def param_hist(param, name, root="output"):
    """Save a histogram of ``param``'s values to <root>/<name>.png."""
    print(name, param.shape)
    plt.hist(param.flatten(), density=False, facecolor="g")
    plt.title(f"Histogram of {name}")
    plt.grid(True)
    plt.savefig(os.path.join(root, f"{name}.png"))
    # Close the figure so repeated calls don't accumulate state.
    plt.close()
def save_param_hist_pngs(module, root="output"):
    """Write one histogram PNG per named parameter of ``module``."""
    for param_name, param in module.named_parameters():
        param_hist(param.numpy(), param_name, root=root)
if __name__ == "__main__":
    # Entry point: compare the local ./output .npy dump against the reference
    # OneFlow-Benchmark dump (hard-coded absolute path on the dev machine).
    # walk_compare_of_variable('init_ckpt', '/ssd/xiexuan/OneFlow-Benchmark/Classification/cnns/loaded_init_ckpt')
    walk_compare_npy(
        "output", "/ssd/xiexuan/OneFlow-Benchmark/Classification/cnns/output"
    )
This diff is collapsed.
import time
from datetime import datetime
import numpy as np
import oneflow as flow
_GLOBAL_LOGGER = None
def get_logger(rank, print_ranks):
    """Return the process-wide Logger singleton, creating it on first call.

    NOTE: ``rank``/``print_ranks`` are only honored by the first call; later
    calls return the already-built instance unchanged.
    """
    global _GLOBAL_LOGGER
    if _GLOBAL_LOGGER is not None:
        return _GLOBAL_LOGGER
    _GLOBAL_LOGGER = Logger(rank, print_ranks)
    return _GLOBAL_LOGGER
class Logger(object):
    """Rank-aware metrics logger.

    Meters are registered under string keys; print_metrics() formats every
    registered meter's current value on one line and prints it only when this
    process's rank is in the active print-rank list. Meters flagged with
    ``reset_after_print`` are reset on every print_metrics() call, whether or
    not this rank actually printed.
    """

    def __init__(self, rank, print_ranks):
        self.rank = rank
        self.print_ranks = print_ranks
        self.step = 0
        self.m = dict()  # metric_key -> {meter, print_format, reset_after_print}

    def register_metric(
        self, metric_key, meter, print_format=None, reset_after_print=False
    ):
        assert metric_key not in self.m
        entry = {
            "meter": meter,
            "print_format": print_format or (metric_key + ": {}"),
            "reset_after_print": reset_after_print,
        }
        self.m[metric_key] = entry

    def metric(self, mkey):
        """Return the meter registered under ``mkey``, or None."""
        entry = self.m.get(mkey)
        return None if entry is None else entry["meter"]

    def meter(self, mkey, *args):
        """Record ``args`` into the meter registered under ``mkey``."""
        assert mkey in self.m
        self.m[mkey]["meter"].record(*args)

    def print_metrics(self, print_ranks=None):
        fields = []
        for entry in self.m.values():
            meter = entry["meter"]
            fmt = entry["print_format"]
            value = meter.get()
            if isinstance(value, (list, tuple)):
                fields.append(fmt.format(*value))
            else:
                fields.append(fmt.format(value))
            if entry["reset_after_print"]:
                meter.reset()
        if self.rank in (print_ranks or self.print_ranks):
            stamp = datetime.now().strftime("| %Y-%m-%d %H:%M:%S.%f")[:-3]
            print("[rank:{}] {}".format(self.rank, ", ".join(fields)), stamp)

    def print(self, *args, print_ranks=None):
        """Plain print, gated on this process's rank."""
        if self.rank in (print_ranks or self.print_ranks):
            print(*args)
class IterationMeter(object):
    """Meter that remembers only the most recently recorded value."""

    def __init__(self):
        self.val = 0

    def record(self, val):
        # Overwrite — no accumulation.
        self.val = val

    def get(self):
        return self.val
def _zeros_by_val(val):
    """Return a zero value of the same kind as ``val``.

    Supports flow.Tensor, np.ndarray, int, and float; raises ValueError for
    anything else. (Note: bools are ints in Python, so a bool yields 0.)
    """
    if isinstance(val, flow.Tensor):
        return flow.zeros_like(val)
    if isinstance(val, np.ndarray):
        return np.zeros_like(val)
    if isinstance(val, int):
        return 0
    if isinstance(val, float):
        return 0.0
    # Was a bare ``raise ValueError`` after a dead ``ret = 0`` initializer.
    raise ValueError(f"unsupported value type: {type(val)}")
class AverageMeter(object):
    """Weighted running average of recorded values.

    The accumulator type is derived from the first recorded value via
    _zeros_by_val, so tensors, ndarrays, ints, and floats all work.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        self.sum = None
        self.n = 0

    def record(self, val, n=1):
        """Add ``val`` with weight ``n`` to the running total."""
        self.n += n
        if self.sum is None:
            self.sum = _zeros_by_val(val)
        self.sum += val if n == 1 else val * n

    def get(self):
        """Return the current average as a Python scalar where possible."""
        if self.n == 0:
            return 0
        avg = self.sum / self.n
        if isinstance(avg, flow.Tensor):
            # NOTE: device sync happens here.
            return avg.numpy().item()
        if isinstance(avg, np.ndarray):
            return avg.item()
        return avg
class RunningMeter(object):
    # Placeholder: presumably intended for running/windowed statistics,
    # never implemented.
    pass
class ProgressMeter(object):
    """Tracks progress as a (current, total) pair."""

    def __init__(self, total):
        self.cur = 0
        self.total = total

    def record(self, cur, total=None):
        """Update the current position; optionally also update the total."""
        self.cur = cur
        self.total = self.total if total is None else total

    def get(self):
        return self.cur, self.total
class TimeMeter(object):
    """Throughput meter: items recorded divided by elapsed wall time.

    Each get() stamps the end time; a subsequent reset() starts the next
    measurement window where the previous one ended, so back-to-back windows
    don't lose time between them.
    """

    def __init__(self, return_timestamp=False):
        self.return_timestamp = return_timestamp
        self.n = 0
        self.ets = None
        self.bts = None
        self.reset()

    def reset(self):
        self.n = 0
        # Continue from the last window's end if one exists.
        self.bts = time.perf_counter() if self.ets is None else self.ets
        self.ets = None

    def record(self, n):
        self.n += n

    def get(self):
        self.ets = time.perf_counter()
        assert self.ets > self.bts, f"{self.ets} > {self.bts}"
        rate = self.n / (self.ets - self.bts)
        return (rate, self.ets) if self.return_timestamp else rate
from PIL import Image
import numpy as np
import os
import random
def load_image(image_path="data/fish.jpg"):
    """Load an image, resize to 224x224, normalize with ImageNet RGB stats,
    and return a contiguous float32 array in NCHW layout with batch dim 1."""
    mean = [123.68, 116.779, 103.939]
    std = [58.393, 57.12, 57.375]
    img = Image.open(image_path).resize((224, 224)).convert("RGB")
    arr = np.array(img).astype("float32")
    arr = (arr - mean) / std
    # HWC -> CHW, then prepend the batch axis.
    arr = np.expand_dims(np.transpose(arr, (2, 0, 1)), axis=0)
    return np.ascontiguousarray(arr, "float32")
class NumpyDataLoader(object):
    """Minimal image-folder loader: each sub-folder of ``dataset_root`` is a
    class, and batches of (images, labels) are consumed sequentially."""

    def __init__(self, dataset_root: str, batch_size: int = 1):
        self.dataset_root = dataset_root
        self.batch_size = batch_size
        self.image_2_class_label_list = []
        self.label_2_class_name = {}
        # Assign integer labels 0..N-1 in os.listdir order of the sub-folders.
        for label, sub_name in enumerate(os.listdir(self.dataset_root)):
            self.label_2_class_name[label] = sub_name
            sub_root = os.path.join(self.dataset_root, sub_name)
            for image_name in os.listdir(sub_root):
                self.image_2_class_label_list.append(
                    (os.path.join(sub_root, image_name), label)
                )
        self.curr_idx = 0
        self.shuffle_data()

    def shuffle_data(self):
        """Shuffle the sample list in place and restart from the beginning."""
        random.shuffle(self.image_2_class_label_list)
        self.curr_idx = 0

    def __getitem__(self, index):
        # NOTE: ``index`` is ignored; batches advance an internal cursor.
        images, labels = [], []
        for _ in range(self.batch_size):
            image_path, label = self.image_2_class_label_list[self.curr_idx]
            images.append(load_image(image_path))
            labels.append(int(label))
            self.curr_idx += 1
        batch = np.concatenate(tuple(images), axis=0)
        return np.ascontiguousarray(batch, "float32"), np.array(labels, dtype=np.int32)

    def __len__(self):
        return len(self.image_2_class_label_list) // self.batch_size
from typing import List, Union
import oneflow as flow
import oneflow.nn as nn
import os
class OFRecordDataLoader(nn.Module):
    """OFRecord-based ImageNet-style input pipeline for oneflow.

    Train mode: shuffled reader, random-crop decode, resize to 224x224,
    random horizontal flip, crop-mirror-normalize. Eval mode: plain decode,
    shorter-side resize to 256, center crop to 224x224, normalize. Each
    forward() call yields one (image, label) batch.
    """

    def __init__(
        self,
        ofrecord_root: str = "./ofrecord",
        mode: str = "train",  # "train" or "val"
        dataset_size: int = 9469,
        batch_size: int = 1,
        total_batch_size: int = 1,
        ofrecord_part_num: int = 1,
        placement: flow.placement = None,
        sbp: Union[flow.sbp.sbp, List[flow.sbp.sbp]] = None,
    ):
        super().__init__()
        channel_last = False
        output_layout = "NHWC" if channel_last else "NCHW"
        self.train_record_reader = flow.nn.OfrecordReader(
            os.path.join(ofrecord_root, mode),
            batch_size=batch_size,
            data_part_num=ofrecord_part_num,
            part_name_suffix_length=5,
            random_shuffle=True if mode == "train" else False,
            shuffle_after_epoch=True if mode == "train" else False,
            placement=placement,
            sbp=sbp,
        )
        self.record_label_decoder = flow.nn.OFRecordRawDecoder(
            "class/label", shape=(), dtype=flow.int32
        )
        color_space = "RGB"
        height = 224
        width = 224
        self.record_image_decoder = (
            flow.nn.OFRecordImageDecoderRandomCrop("encoded", color_space=color_space)
            if mode == "train"
            else flow.nn.OFRecordImageDecoder("encoded", color_space=color_space)
        )
        self.resize = (
            flow.nn.image.Resize(target_size=[height, width])
            if mode == "train"
            else flow.nn.image.Resize(
                resize_side="shorter", keep_aspect_ratio=True, target_size=256
            )
        )
        # Coin flip drives random horizontal mirroring; train mode only.
        self.flip = (
            flow.nn.CoinFlip(batch_size=batch_size, placement=placement, sbp=sbp)
            if mode == "train"
            else None
        )
        rgb_mean = [123.68, 116.779, 103.939]
        rgb_std = [58.393, 57.12, 57.375]
        self.crop_mirror_norm = (
            flow.nn.CropMirrorNormalize(
                color_space=color_space,
                output_layout=output_layout,
                mean=rgb_mean,
                std=rgb_std,
                output_dtype=flow.float,
            )
            if mode == "train"
            else flow.nn.CropMirrorNormalize(
                color_space=color_space,
                output_layout=output_layout,
                crop_h=height,
                crop_w=width,
                crop_pos_y=0.5,
                crop_pos_x=0.5,
                mean=rgb_mean,
                std=rgb_std,
                output_dtype=flow.float,
            )
        )
        self.batch_size = batch_size
        self.total_batch_size = total_batch_size
        self.dataset_size = dataset_size

    def __len__(self):
        # Number of whole global batches per epoch.
        return self.dataset_size // self.total_batch_size

    def forward(self):
        """Decode, resize, (optionally) flip, and normalize one batch."""
        train_record = self.train_record_reader()
        label = self.record_label_decoder(train_record)
        image_raw_buffer = self.record_image_decoder(train_record)
        image = self.resize(image_raw_buffer)[0]
        # FIX: identity comparison with None (`is not None`), not `!= None`.
        rng = self.flip() if self.flip is not None else None
        image = self.crop_mirror_norm(image, rng)
        return image, label
import matplotlib.pyplot as plt


def _read_losses(path):
    """Read one float loss value per line from ``path``."""
    with open(path, "r") as f:
        return [float(line.strip()) for line in f]


# The two parsing loops were duplicated; use one helper for both files.
of_losses = _read_losses("of_losses.txt")
torch_losses = _read_losses("torch_losses.txt")

iters = [i for i in range(len(of_losses))]
plt.plot(iters, of_losses, label="oneflow")
plt.plot(iters, torch_losses, label="pytorch")
plt.xlabel("iter - axis")
# Set the y axis label of the current axis.
plt.ylabel("loss - axis")
# Set a title of the current axes.
plt.title("compare ")
# show a legend on the plot
plt.legend()
# Display a figure.
plt.show()
import pandas as pd
from collections import namedtuple
class Printer(object):
    """Buffers records and prints per-field aggregates as a table or one-liners.

    Workflow: construct -> register_handler()/register_str_len() for every
    field -> finish() -> record(...) repeatedly -> print(). Each print() runs
    the per-field handlers over a DataFrame of the buffered records, prints
    the aggregated row, optionally appends the raw records to a CSV file,
    and clears the buffer.
    """

    def __init__(self, field_names, print_format="table", persistent_file=None):
        assert print_format in ("table", "normal")
        self.field_names_ = field_names
        self.format_ = print_format
        self.records_ = []  # buffered Record namedtuples
        self.handlers_ = dict()  # field -> aggregation fn over a pd.Series
        self.str_lens_ = dict()  # field -> column width for table output
        self.title_printed_ = False
        if persistent_file is not None:
            self.csv_ = open(persistent_file, "a")
        else:
            self.csv_ = None
        self.Record = None  # namedtuple type, created by finish()

    def __del__(self):
        # BUG FIX: this was misspelled ``__def__`` so the CSV handle was
        # never closed. getattr guards against a partially-initialized object.
        csv_file = getattr(self, "csv_", None)
        if csv_file is not None:
            csv_file.close()

    def finish(self):
        """Validate that every field is fully registered and freeze Record."""
        err = f"{len(self.field_names_)} vs. {len(self.handlers_)}"
        assert len(self.field_names_) == len(self.handlers_), err
        err = f"{len(self.field_names_)} vs. {len(self.str_lens_)}"
        assert len(self.field_names_) == len(self.str_lens_), err
        for fname in self.field_names_:
            assert fname in self.handlers_, f"{fname} handler not register"
            assert fname in self.str_lens_, f"{fname} str_len not register"
        self.Record = namedtuple("Record", self.field_names_)

    def record(self, *args, **kwargs):
        """Buffer one record; finish() must have been called first."""
        assert self.Record is not None
        r = self.Record(*args, **kwargs)
        self.records_.append(r)

    def register_handler(self, field, handler):
        """Register the aggregation callable for ``field``."""
        assert callable(handler)
        self.handlers_[field] = handler

    def register_str_len(self, field, str_len):
        """Register the table column width for ``field``."""
        assert isinstance(str_len, int)
        self.str_lens_[field] = str_len

    def reset_records(self):
        self.records_ = []

    def print_table_title(self):
        """Print the markdown-style header and separator rows once."""
        fields = ""
        sep = ""
        for fname in self.field_names_:
            str_len = self.str_lens_[fname]
            fields += "| {} ".format(fname.ljust(str_len))
            sep += f"| {'-' * str_len} "
        fields += "|"
        sep += "|"
        print(fields)
        print(sep)
        self.title_printed_ = True

    def reset_title_printed(self):
        self.title_printed_ = False

    def print(self):
        """Aggregate buffered records, print them, persist to CSV, reset."""
        df = pd.DataFrame(self.records_)
        fields = []
        for fname in self.field_names_:
            assert fname in self.handlers_
            handler = self.handlers_[fname]
            field_value = handler(df[fname])
            fields.append(field_value)
        if self.format_ == "table":
            if not self.title_printed_:
                self.print_table_title()
            record = ""
            for i, str_len in enumerate(self.str_lens_.values()):
                record += "| {} ".format(str(fields[i]).ljust(str_len))
            record += "|"
            print(record)
        elif self.format_ == "normal":
            record = ""
            for i, fval in enumerate(fields):
                fname = self.field_names_[i]
                record += f"{fname}: {fval}, "
            print(record)
        else:
            raise ValueError
        if self.csv_ is not None:
            df.to_csv(self.csv_, header=False)
        self.reset_records()
import sys
import signal
import subprocess
import threading
class CudaUtilMemStat:
    """Periodically samples GPU utilization / memory via ``nvidia-smi`` and
    appends the values as CSV rows to ``stat_file_path``.

    When ``only_ordinal`` is set, only that GPU's columns are written.
    """

    def __init__(
        self, stat_file_path, stat_util=True, stat_mem=True, only_ordinal=None
    ):
        self.stat_file = open(stat_file_path, "wt")
        self.stat_util = stat_util
        self.stat_mem = stat_mem
        self.only_ordinal = only_ordinal
        self._write_titles()

    def __del__(self):
        # NOTE(review): raises AttributeError if __init__ failed before
        # self.stat_file was assigned — confirm acceptable.
        self.stat_file.close()

    def _write_titles(self):
        # ``nvidia-smi -L`` prints one GPU per line, e.g.
        # "GPU 0: <name> (UUID: ...)"; anything on stderr is treated as fatal.
        proc = subprocess.Popen(
            ["nvidia-smi", "-L"], stdout=subprocess.PIPE, stderr=subprocess.PIPE,
        )
        stdout, stderr = proc.communicate()
        if stderr != b"":
            raise RuntimeError(stderr)
        gpus = []
        lines = stdout.decode("utf-8").split("\n")
        for line in lines:
            if line.strip() == "":
                continue
            # Keep only the "GPU <i>" prefix before the first colon.
            gpus.append(line.split(":")[0])
        util_titles, mem_titles = [], []
        for gpu in gpus:
            if self.stat_util:
                util_titles.append(gpu + " utilization")
            if self.stat_mem:
                mem_titles.append(gpu + " memory used")
        if self.only_ordinal is None:
            self.stat_file.write(",".join(util_titles + mem_titles) + "\n")
        else:
            titles = (
                util_titles[self.only_ordinal] + "," + mem_titles[self.only_ordinal]
            )
            self.stat_file.write(titles + "\n")
        self.stat_file.flush()

    def stat(self):
        # command: nvidia-smi --query-gpu=utilization.gpu,memory.used --format=csv
        proc = subprocess.Popen(
            ["nvidia-smi", "--query-gpu=utilization.gpu,memory.used", "--format=csv"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        stdout, stderr = proc.communicate()
        if stderr != b"":
            raise RuntimeError(stderr)
        lines = stdout.decode("utf-8").split("\n")
        util_vals = []
        mem_vals = []
        # Skip the CSV header row; each data row looks like "37 %, 1024 MiB".
        for line in lines[1:]:
            if line.strip() == "":
                continue
            util_mem = line.split(",")
            assert len(util_mem) == 2, lines
            util = util_mem[0].strip()
            mem = util_mem[1].strip()
            assert "%" in util
            assert "MiB" in mem
            # Split "<value> <unit>" and keep only the numeric part as a string.
            util = util.split(" ")
            mem = mem.split(" ")
            assert len(util) == 2
            assert len(mem) == 2
            util_val = util[0].strip()
            mem_val = mem[0].strip()
            if self.stat_util:
                util_vals.append(util_val)
            if self.stat_mem:
                mem_vals.append(mem_val)
        if self.only_ordinal is None:
            self.stat_file.write(",".join(util_vals + mem_vals) + "\n")
        else:
            vals = util_vals[self.only_ordinal] + "," + mem_vals[self.only_ordinal]
            self.stat_file.write(vals + "\n")
        self.stat_file.flush()

    def start(self, interval):
        # Samples on a background StatThread; this thread blocks until
        # SIGINT/SIGTERM sets the stop event.
        # NOTE(review): signal.signal only works from the main thread — confirm callers.
        stop = threading.Event()
        stat_thrd = StatThread(self.stat, interval, stop)
        stat_thrd.start()

        def close(signum, frame):
            print("Closing...")
            stop.set()

        signal.signal(signal.SIGTERM, close)
        signal.signal(signal.SIGINT, close)
        print("Start stat")
        print("Print Ctrl+C to stop")
        stop.wait()
class StatThread(threading.Thread):
    """Worker thread that invokes ``handler`` every ``interval`` seconds
    until ``stop_event`` is set."""

    def __init__(self, handler, interval, stop_event):
        super().__init__()
        self.handler = handler
        self.interval = interval
        self.stopped = stop_event
        self.count = 0

    def run(self):
        while True:
            # Event.wait returns True once the stop event is set,
            # False on a plain timeout (keep sampling).
            if self.stopped.wait(self.interval):
                break
            print(f"{self.count} th run stat")
            self.handler()
            self.count += 1
if __name__ == "__main__":
    # Usage: python gpu_stat.py [output_file] [interval_seconds]
    stat_file_path = sys.argv[1] if len(sys.argv) > 1 else "gpu_stat.log"
    # BUG FIX: argv entries are strings; Event.wait(interval) needs a number.
    interval = float(sys.argv[2]) if len(sys.argv) > 2 else 1.0
    stat = CudaUtilMemStat(stat_file_path)
    stat.start(interval)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment