# Commit e129194a (Sugon_ldc): add new model resnet50v1.5
import collections
import itertools
import os
import pathlib
import re
from enum import Enum, auto
from typing import Union
import pynvml
class Device:
# assume nvml returns list of 64 bit ints
_nvml_bit_affinity = 64
_nvml_affinity_elements = (
os.cpu_count() + _nvml_bit_affinity - 1
) // _nvml_bit_affinity
def __init__(self, device_idx):
super().__init__()
self.handle = pynvml.nvmlDeviceGetHandleByIndex(device_idx)
def get_name(self):
return pynvml.nvmlDeviceGetName(self.handle)
def get_uuid(self):
return pynvml.nvmlDeviceGetUUID(self.handle)
def get_cpu_affinity(self):
affinity_string = ""
for j in pynvml.nvmlDeviceGetCpuAffinity(
self.handle, Device._nvml_affinity_elements
):
# assume nvml returns list of 64 bit ints
affinity_string = "{:064b}".format(j) + affinity_string
affinity_list = [int(x) for x in affinity_string]
affinity_list.reverse() # so core 0 is in 0th element of list
ret = [i for i, e in enumerate(affinity_list) if e != 0]
return ret
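# Worked example (hypothetical mask, not queried from real hardware):
# if nvmlDeviceGetCpuAffinity returned the single 64-bit word 0b1111,
# affinity_string would be "00...01111", affinity_list (after the reverse)
# would start [1, 1, 1, 1, 0, ...], and get_cpu_affinity would return
# [0, 1, 2, 3], i.e. the four lowest logical cores.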
def get_thread_siblings_list():
"""
Returns a list of 2-element integer tuples representing pairs of
hyperthreading cores.
"""
path = "/sys/devices/system/cpu/cpu*/topology/thread_siblings_list"
thread_siblings_list = []
pattern = re.compile(r"(\d+)\D(\d+)")
for fname in pathlib.Path(path[0]).glob(path[1:]):
with open(fname) as f:
content = f.read().strip()
res = pattern.findall(content)
if res:
pair = tuple(sorted(map(int, res[0])))
thread_siblings_list.append(pair)
thread_siblings_list = list(set(thread_siblings_list))
return thread_siblings_list
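# Example output on a hypothetical 2-socket machine with hyperthreading,
# where logical cores 0-39 are the first hardware thread of each physical
# core and 40-79 the second:
# get_thread_siblings_list() -> [(0, 40), (1, 41), ..., (39, 79)]
# (order is not guaranteed, since the pairs pass through a set).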
def build_thread_siblings_dict(siblings_list):
siblings_dict = {}
for siblings_tuple in siblings_list:
for core in siblings_tuple:
siblings_dict[core] = siblings_tuple
return siblings_dict
def group_list_by_dict(affinity, siblings_dict):
sorted_affinity = sorted(affinity, key=lambda x: siblings_dict.get(x, (x,)))
grouped = itertools.groupby(
sorted_affinity, key=lambda x: siblings_dict.get(x, (x,))
)
grouped_affinity = []
for key, group in grouped:
grouped_affinity.append(tuple(group))
return grouped_affinity
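# Worked example (hypothetical values): with affinity = [0, 1, 40, 41] and
# siblings_dict = {0: (0, 40), 40: (0, 40), 1: (1, 41), 41: (1, 41)},
# group_list_by_dict returns [(0, 40), (1, 41)] - one tuple per physical core.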
def group_affinity_by_siblings(socket_affinities):
siblings_list = get_thread_siblings_list()
siblings_dict = build_thread_siblings_dict(siblings_list)
grouped_socket_affinities = []
for socket_affinity in socket_affinities:
grouped_socket_affinities.append(
group_list_by_dict(socket_affinity, siblings_dict)
)
return grouped_socket_affinities
def ungroup_affinities(affinities, cores):
ungrouped_affinities = []
for affinity in affinities:
if cores == "all_logical":
ungrouped_affinities.append(list(itertools.chain(*affinity)))
elif cores == "single_logical":
ungrouped_affinities.append([group[0] for group in affinity])
else:
raise RuntimeError("Unknown cores mode")
return ungrouped_affinities
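# Worked example (continuing the values above): for
# affinities = [[(0, 40), (1, 41)]],
# cores='all_logical'    -> [[0, 40, 1, 41]]  (physical cores plus siblings)
# cores='single_logical' -> [[0, 1]]          (one hardware thread per core)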
def check_socket_affinities(socket_affinities):
# sets of cores should be either identical or disjoint
for i, j in itertools.product(socket_affinities, socket_affinities):
if not set(i) == set(j) and not set(i).isdisjoint(set(j)):
raise RuntimeError(
f"Sets of cores should be either identical or disjoint, "
f"but got {i} and {j}."
)
def get_socket_affinities(nproc_per_node, exclude_unavailable_cores=True):
devices = [Device(i) for i in range(nproc_per_node)]
socket_affinities = [dev.get_cpu_affinity() for dev in devices]
if exclude_unavailable_cores:
available_cores = os.sched_getaffinity(0)
socket_affinities = [
list(set(affinity) & available_cores) for affinity in socket_affinities
]
check_socket_affinities(socket_affinities)
return socket_affinities
def get_grouped_socket_affinities(nproc_per_node, exclude_unavailable_cores=True):
socket_affinities = get_socket_affinities(nproc_per_node, exclude_unavailable_cores)
grouped_socket_affinities = group_affinity_by_siblings(socket_affinities)
return grouped_socket_affinities
def set_socket_affinity(gpu_id, nproc_per_node, cores):
"""
The process is assigned with all available physical CPU cores from the CPU
socket connected to the GPU with a given id.
Args:
gpu_id: index of a GPU
nproc_per_node: number of processes per node
cores: 'all_logical' or 'single_logical'
"""
grouped_socket_affinities = get_grouped_socket_affinities(nproc_per_node)
ungrouped_affinities = ungroup_affinities(grouped_socket_affinities, cores)
os.sched_setaffinity(0, ungrouped_affinities[gpu_id])
def set_socket_single_affinity(gpu_id, nproc_per_node, cores):
"""
The process is assigned with the first available physical CPU core from the
list of all CPU physical cores from the CPU socket connected to the GPU with
a given id.
Args:
gpu_id: index of a GPU
nproc_per_node: number of processes per node
cores: 'all_logical' or 'single_logical'
"""
grouped_socket_affinities = get_grouped_socket_affinities(nproc_per_node)
single_grouped_socket_affinities = [
group[:1] for group in grouped_socket_affinities
]
ungrouped_affinities = ungroup_affinities(single_grouped_socket_affinities, cores)
os.sched_setaffinity(0, ungrouped_affinities[gpu_id])
def set_socket_single_unique_affinity(gpu_id, nproc_per_node, cores):
"""
The process is assigned with a single unique available physical CPU core
from the list of all CPU cores from the CPU socket connected to the GPU with
a given id.
Args:
gpu_id: index of a GPU
nproc_per_node: number of processes per node
cores: 'all_logical' or 'single_logical'
"""
grouped_socket_affinities = get_grouped_socket_affinities(nproc_per_node)
affinities = []
assigned_groups = set()
for grouped_socket_affinity in grouped_socket_affinities:
for group in grouped_socket_affinity:
if group not in assigned_groups:
affinities.append([group])
assigned_groups.add(group)
break
ungrouped_affinities = ungroup_affinities(affinities, cores)
os.sched_setaffinity(0, ungrouped_affinities[gpu_id])
def set_socket_unique_affinity(gpu_id, nproc_per_node, cores, mode, balanced=True):
"""
The process is assigned with a unique subset of available physical CPU
cores from the CPU socket connected to a GPU with a given id.
Assignment automatically includes hyperthreading siblings (if siblings are
available).
Args:
gpu_id: index of a GPU
nproc_per_node: number of processes per node
cores: 'all_logical' or 'single_logical'
mode: 'contiguous' or 'interleaved'
balanced: assign an equal number of physical cores to each process.
"""
grouped_socket_affinities = get_grouped_socket_affinities(nproc_per_node)
grouped_socket_affinities_to_device_ids = collections.defaultdict(list)
for idx, grouped_socket_affinity in enumerate(grouped_socket_affinities):
grouped_socket_affinities_to_device_ids[tuple(grouped_socket_affinity)].append(
idx
)
# compute minimal number of physical cores per GPU across all GPUs and
# sockets, code assigns this number of cores per GPU if balanced == True
min_physical_cores_per_gpu = min(
[
len(cores) // len(gpus)
for cores, gpus in grouped_socket_affinities_to_device_ids.items()
]
)
grouped_unique_affinities = [None] * nproc_per_node
for (
grouped_socket_affinity,
device_ids,
) in grouped_socket_affinities_to_device_ids.items():
devices_per_group = len(device_ids)
if balanced:
cores_per_device = min_physical_cores_per_gpu
grouped_socket_affinity = grouped_socket_affinity[
: devices_per_group * min_physical_cores_per_gpu
]
else:
cores_per_device = len(grouped_socket_affinity) // devices_per_group
for socket_subgroup_id, device_id in enumerate(device_ids):
# In theory there should be no performance difference between the
# 'interleaved' and 'contiguous' patterns on Intel-based DGX-1, but
# 'contiguous' should be better for DGX A100 because on AMD Rome
# every 4 consecutive cores share an L3 cache.
# TODO: code doesn't attempt to automatically detect layout of
# L3 cache, also external environment may already exclude some
# cores, this code makes no attempt to detect it and to align
# mapping to multiples of 4.
if mode == "interleaved":
unique_grouped_affinity = list(
grouped_socket_affinity[socket_subgroup_id::devices_per_group]
)
elif mode == "contiguous":
unique_grouped_affinity = list(
grouped_socket_affinity[
socket_subgroup_id
* cores_per_device : (socket_subgroup_id + 1)
* cores_per_device
]
)
else:
raise RuntimeError("Unknown set_socket_unique_affinity mode")
grouped_unique_affinities[device_id] = unique_grouped_affinity
ungrouped_affinities = ungroup_affinities(grouped_unique_affinities, cores)
os.sched_setaffinity(0, ungrouped_affinities[gpu_id])
class AffinityMode(Enum):
none = auto()
socket = auto()
socket_single = auto()
socket_single_unique = auto()
socket_unique_interleaved = auto()
socket_unique_contiguous = auto()
def set_affinity(
gpu_id,
nproc_per_node=None,
*,
mode: Union[str, AffinityMode] = AffinityMode.socket_unique_contiguous,
cores="all_logical",
balanced=True,
):
"""
The process is assigned with a proper CPU affinity that matches CPU-GPU
hardware architecture on a given platform. Usually, it improves and
stabilizes the performance of deep learning training workloads.
This function assumes that the workload runs in multi-process single-device
mode (there are multiple training processes, and each process is running on
a single GPU). This is typical for multi-GPU data-parallel training
workloads (e.g., using `torch.nn.parallel.DistributedDataParallel`).
Available affinity modes:
* 'socket' - the process is assigned with all available physical CPU cores
from the CPU socket connected to the GPU with a given id.
* 'socket_single' - the process is assigned with the first available
physical CPU core from the list of all CPU cores from the CPU socket
connected to the GPU with a given id (multiple GPUs could be assigned with
the same CPU core).
* 'socket_single_unique' - the process is assigned with a single unique
available physical CPU core from the list of all CPU cores from the CPU
socket connected to the GPU with a given id.
* 'socket_unique_interleaved' - the process is assigned with a unique
subset of available physical CPU cores from the CPU socket connected to a
GPU with a given id; cores are assigned with an interleaved indexing pattern.
* 'socket_unique_contiguous' - (the default) the process is assigned with a
unique subset of available physical CPU cores from the CPU socket connected
to a GPU with a given id; cores are assigned with a contiguous indexing
pattern.
Available "cores" modes:
* 'all_logical' - assigns the process with all logical cores associated with
a given corresponding physical core (i.e., automatically includes all
available hyperthreading siblings)
* 'single_logical' - assigns the process with only one logical core
associated with a given corresponding physical core (i.e., excludes
hyperthreading siblings)
'socket_unique_contiguous' is the recommended mode for deep learning
training workloads on NVIDIA DGX machines.
Args:
gpu_id: integer index of a GPU, value from 0 to 'nproc_per_node' - 1
nproc_per_node: number of processes per node
mode: affinity mode
balanced: assign an equal number of physical cores to each process,
affects only 'socket_unique_interleaved' and
'socket_unique_contiguous' affinity modes
cores: 'all_logical' or 'single_logical'
Returns a set of logical CPU cores on which the process is eligible to run.
Example:
import argparse
import os
import gpu_affinity
import torch
def main():
parser = argparse.ArgumentParser()
parser.add_argument(
'--local_rank',
type=int,
default=os.getenv('LOCAL_RANK', 0),
)
args = parser.parse_args()
nproc_per_node = torch.cuda.device_count()
affinity = gpu_affinity.set_affinity(args.local_rank, nproc_per_node)
print(f'{args.local_rank}: core affinity: {affinity}')
if __name__ == "__main__":
main()
Launch the example with:
python -m torch.distributed.launch --nproc_per_node <#GPUs> example.py
WARNING: On DGX A100, only half of the CPU cores have direct access to GPUs.
This function restricts execution only to the CPU cores directly connected
to GPUs, so on DGX A100, it will limit the code to half of the CPU cores and
half of CPU memory bandwidth (which may be fine for many DL models).
WARNING: Intel's OpenMP implementation resets affinity on the first call to
an OpenMP function after a fork. It's recommended to run with env variable:
`KMP_AFFINITY=disabled` if the affinity set by gpu_affinity should be
preserved after a fork (e.g. in PyTorch DataLoader workers).
"""
if not isinstance(mode, AffinityMode):
mode = AffinityMode[mode]
pynvml.nvmlInit()
if nproc_per_node is None:
nproc_per_node = pynvml.nvmlDeviceGetCount()
if mode == AffinityMode.none:
pass
elif mode == AffinityMode.socket:
set_socket_affinity(gpu_id, nproc_per_node, cores)
elif mode == AffinityMode.socket_single:
set_socket_single_affinity(gpu_id, nproc_per_node, cores)
elif mode == AffinityMode.socket_single_unique:
set_socket_single_unique_affinity(gpu_id, nproc_per_node, cores)
elif mode == AffinityMode.socket_unique_interleaved:
set_socket_unique_affinity(
gpu_id, nproc_per_node, cores, "interleaved", balanced
)
elif mode == AffinityMode.socket_unique_contiguous:
set_socket_unique_affinity(
gpu_id, nproc_per_node, cores, "contiguous", balanced
)
else:
raise RuntimeError("Unknown affinity mode")
affinity = os.sched_getaffinity(0)
return affinity
# Copyright (c) 2018-2019, NVIDIA CORPORATION
# Copyright (c) 2017- Facebook, Inc
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
from collections import OrderedDict
from numbers import Number
import dllogger
import numpy as np
def format_step(step):
if isinstance(step, str):
return step
s = ""
if len(step) > 0:
if isinstance(step[0], Number):
s += "Epoch: {} ".format(step[0])
else:
s += "{} ".format(step[0])
if len(step) > 1:
s += "Iteration: {} ".format(step[1])
if len(step) > 2:
s += "Validation Iteration: {} ".format(step[2])
if len(step) == 0:
s = "Summary:"
return s
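# Examples of the step formats produced above (illustrative only):
# format_step(tuple())     -> "Summary:"
# format_step((1,))        -> "Epoch: 1 "
# format_step((1, 20))     -> "Epoch: 1 Iteration: 20 "
# format_step((1, 20, 5))  -> "Epoch: 1 Iteration: 20 Validation Iteration: 5 "
# format_step("PARAMETER") -> "PARAMETER"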
PERF_METER = lambda: Meter(AverageMeter(), AverageMeter(), AverageMeter())
LOSS_METER = lambda: Meter(AverageMeter(), AverageMeter(), MinMeter())
ACC_METER = lambda: Meter(AverageMeter(), AverageMeter(), MaxMeter())
LR_METER = lambda: Meter(LastMeter(), LastMeter(), LastMeter())
LAT_100 = lambda: Meter(QuantileMeter(1), QuantileMeter(1), QuantileMeter(1))
LAT_99 = lambda: Meter(QuantileMeter(0.99), QuantileMeter(0.99), QuantileMeter(0.99))
LAT_95 = lambda: Meter(QuantileMeter(0.95), QuantileMeter(0.95), QuantileMeter(0.95))
class Meter(object):
def __init__(self, iteration_aggregator, epoch_aggregator, run_aggregator):
self.run_aggregator = run_aggregator
self.epoch_aggregator = epoch_aggregator
self.iteration_aggregator = iteration_aggregator
def record(self, val, n=1):
self.iteration_aggregator.record(val, n=n)
def get_iteration(self):
v, n = self.iteration_aggregator.get_val()
return v
def reset_iteration(self):
v, n = self.iteration_aggregator.get_data()
self.iteration_aggregator.reset()
if v is not None:
self.epoch_aggregator.record(v, n=n)
def get_epoch(self):
v, n = self.epoch_aggregator.get_val()
return v
def reset_epoch(self):
v, n = self.epoch_aggregator.get_data()
self.epoch_aggregator.reset()
if v is not None:
self.run_aggregator.record(v, n=n)
def get_run(self):
v, n = self.run_aggregator.get_val()
return v
def reset_run(self):
self.run_aggregator.reset()
class QuantileMeter(object):
def __init__(self, q):
self.q = q
self.reset()
def reset(self):
self.vals = []
self.n = 0
def record(self, val, n=1):
if isinstance(val, list):
self.vals += val
self.n += len(val)
else:
self.vals += [val] * n
self.n += n
def get_val(self):
if not self.vals:
return None, self.n
return np.quantile(self.vals, self.q, interpolation="nearest"), self.n
def get_data(self):
return self.vals, self.n
class MaxMeter(object):
def __init__(self):
self.reset()
def reset(self):
self.max = None
self.n = 0
def record(self, val, n=1):
if self.max is None:
self.max = val
else:
self.max = max(self.max, val)
self.n = n
def get_val(self):
return self.max, self.n
def get_data(self):
return self.max, self.n
class MinMeter(object):
def __init__(self):
self.reset()
def reset(self):
self.min = None
self.n = 0
def record(self, val, n=1):
if self.min is None:
self.min = val
else:
self.min = min(self.min, val)
self.n = n
def get_val(self):
return self.min, self.n
def get_data(self):
return self.min, self.n
class LastMeter(object):
def __init__(self):
self.reset()
def reset(self):
self.last = None
self.n = 0
def record(self, val, n=1):
self.last = val
self.n = n
def get_val(self):
return self.last, self.n
def get_data(self):
return self.last, self.n
class AverageMeter(object):
def __init__(self):
self.reset()
def reset(self):
self.n = 0
self.val = 0
def record(self, val, n=1):
self.n += n
self.val += val * n
def get_val(self):
if self.n == 0:
return None, 0
return self.val / self.n, self.n
def get_data(self):
if self.n == 0:
return None, 0
return self.val / self.n, self.n
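# Minimal sketch of the iteration -> epoch -> run aggregation cascade of
# Meter; the _demo_meter name is hypothetical and not part of the original API.
def _demo_meter():
    m = Meter(AverageMeter(), AverageMeter(), AverageMeter())
    m.record(10.0)
    m.record(20.0)
    print(m.get_iteration())  # 15.0 (average within the current iteration)
    m.reset_iteration()       # folds 15.0 into the epoch aggregator
    print(m.get_epoch())      # 15.0
    m.reset_epoch()           # folds the epoch average into the run aggregator
    print(m.get_run())        # 15.0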
class Logger(object):
def __init__(self, print_interval, backends, start_epoch=-1, verbose=False):
self.epoch = start_epoch
self.iteration = -1
self.val_iteration = -1
self.calib_iteration = -1
self.metrics = OrderedDict()
self.backends = backends
self.print_interval = print_interval
self.verbose = verbose
dllogger.init(backends)
def log_parameter(self, data, verbosity=0):
dllogger.log(step="PARAMETER", data=data, verbosity=verbosity)
def register_metric(self, metric_name, meter, verbosity=0, metadata={}):
if self.verbose:
print("Registering metric: {}".format(metric_name))
self.metrics[metric_name] = {"meter": meter, "level": verbosity}
dllogger.metadata(metric_name, metadata)
def log_metric(self, metric_name, val, n=1):
self.metrics[metric_name]["meter"].record(val, n=n)
def start_iteration(self, mode="train"):
if mode == "val":
self.val_iteration += 1
elif mode == "train":
self.iteration += 1
elif mode == "calib":
self.calib_iteration += 1
def end_iteration(self, mode="train"):
if mode == "val":
it = self.val_iteration
elif mode == "train":
it = self.iteration
elif mode == "calib":
it = self.calib_iteration
if it % self.print_interval == 0 or mode == "calib":
metrics = {n: m for n, m in self.metrics.items() if n.startswith(mode)}
if mode == "train":
step = (self.epoch, self.iteration)
elif mode == "val":
step = (self.epoch, self.iteration, self.val_iteration)
elif mode == "calib":
step = ("Calibration", self.calib_iteration)
verbositys = {m["level"] for _, m in metrics.items()}
for ll in verbositys:
llm = {n: m for n, m in metrics.items() if m["level"] == ll}
dllogger.log(
step=step,
data={n: m["meter"].get_iteration() for n, m in llm.items()},
verbosity=ll,
)
for n, m in metrics.items():
m["meter"].reset_iteration()
dllogger.flush()
def start_epoch(self):
self.epoch += 1
self.iteration = 0
self.val_iteration = 0
for n, m in self.metrics.items():
if not n.startswith("calib"):
m["meter"].reset_epoch()
def end_epoch(self):
for n, m in self.metrics.items():
if not n.startswith("calib"):
m["meter"].reset_iteration()
verbositys = {m["level"] for _, m in self.metrics.items()}
for ll in verbositys:
llm = {n: m for n, m in self.metrics.items() if m["level"] == ll}
dllogger.log(
step=(self.epoch,),
data={n: m["meter"].get_epoch() for n, m in llm.items()},
)
def start_calibration(self):
self.calib_iteration = 0
for n, m in self.metrics.items():
if n.startswith("calib"):
m["meter"].reset_epoch()
def end_calibration(self):
for n, m in self.metrics.items():
if n.startswith("calib"):
m["meter"].reset_iteration()
def end(self):
for n, m in self.metrics.items():
m["meter"].reset_epoch()
verbositys = {m["level"] for _, m in self.metrics.items()}
for ll in verbositys:
llm = {n: m for n, m in self.metrics.items() if m["level"] == ll}
dllogger.log(
step=tuple(), data={n: m["meter"].get_run() for n, m in llm.items()}
)
for n, m in self.metrics.items():
m["meter"].reset_epoch()
dllogger.flush()
def iteration_generator_wrapper(self, gen, mode="train"):
for g in gen:
self.start_iteration(mode=mode)
yield g
self.end_iteration(mode=mode)
def epoch_generator_wrapper(self, gen):
for g in gen:
self.start_epoch()
yield g
self.end_epoch()
class Metrics:
ACC_METADATA = {"unit": "%", "format": ":.2f"}
IPS_METADATA = {"unit": "images/s", "format": ":.2f"}
TIME_METADATA = {"unit": "s", "format": ":.5f"}
LOSS_METADATA = {"unit": None, "format": ":.5f"}
LR_METADATA = {"unit": None, "format": ":.5f"}
def __init__(self, logger):
self.logger = logger
self.map = {}
def log(self, **kwargs):
if self.logger is None:
return
for k, v in kwargs.items():
tks = self.map.get(k, [k])
for tk in tks:
if isinstance(v, tuple):
self.logger.log_metric(tk, v[0], v[1])
else:
self.logger.log_metric(tk, v)
class TrainingMetrics(Metrics):
def __init__(self, logger):
super().__init__(logger)
if self.logger is not None:
self.map = {
"loss": ["train.loss"],
"compute_ips": ["train.compute_ips"],
"total_ips": ["train.total_ips"],
"data_time": ["train.data_time"],
"compute_time": ["train.compute_time"],
"lr": ["train.lr"],
"grad_scale": ["train.grad_scale"],
}
logger.register_metric(
"train.loss",
LOSS_METER(),
verbosity=dllogger.Verbosity.DEFAULT,
metadata=Metrics.LOSS_METADATA,
)
logger.register_metric(
"train.compute_ips",
PERF_METER(),
verbosity=dllogger.Verbosity.DEFAULT,
metadata=Metrics.IPS_METADATA,
)
logger.register_metric(
"train.total_ips",
PERF_METER(),
verbosity=dllogger.Verbosity.DEFAULT,
metadata=Metrics.IPS_METADATA,
)
logger.register_metric(
"train.data_time",
PERF_METER(),
verbosity=dllogger.Verbosity.VERBOSE,
metadata=Metrics.TIME_METADATA,
)
logger.register_metric(
"train.compute_time",
PERF_METER(),
verbosity=dllogger.Verbosity.VERBOSE,
metadata=Metrics.TIME_METADATA,
)
logger.register_metric(
"train.lr",
LR_METER(),
verbosity=dllogger.Verbosity.DEFAULT,
)
logger.register_metric(
"train.grad_scale",
PERF_METER(),
verbosity=dllogger.Verbosity.DEFAULT,
metadata=Metrics.LOSS_METADATA,
)
class ValidationMetrics(Metrics):
def __init__(self, logger, prefix, topk):
super().__init__(logger)
if self.logger is not None:
self.map = {
"loss": [f"{prefix}.loss"],
"top1": [f"{prefix}.top1"],
f"top{topk}": [f"{prefix}.top{topk}"],
"compute_ips": [f"{prefix}.compute_ips"],
"total_ips": [f"{prefix}.total_ips"],
"data_time": [f"{prefix}.data_time"],
"compute_time": [
f"{prefix}.compute_latency",
f"{prefix}.compute_latency_at100",
f"{prefix}.compute_latency_at99",
f"{prefix}.compute_latency_at95",
],
}
logger.register_metric(
f"{prefix}.top1",
ACC_METER(),
verbosity=dllogger.Verbosity.DEFAULT,
metadata=Metrics.ACC_METADATA,
)
logger.register_metric(
f"{prefix}.top{topk}",
ACC_METER(),
verbosity=dllogger.Verbosity.DEFAULT,
metadata=Metrics.ACC_METADATA,
)
logger.register_metric(
f"{prefix}.loss",
LOSS_METER(),
verbosity=dllogger.Verbosity.DEFAULT,
metadata=Metrics.LOSS_METADATA,
)
logger.register_metric(
f"{prefix}.compute_ips",
PERF_METER(),
verbosity=dllogger.Verbosity.DEFAULT,
metadata=Metrics.IPS_METADATA,
)
logger.register_metric(
f"{prefix}.total_ips",
PERF_METER(),
verbosity=dllogger.Verbosity.DEFAULT,
metadata=Metrics.IPS_METADATA,
)
logger.register_metric(
f"{prefix}.data_time",
PERF_METER(),
verbosity=dllogger.Verbosity.VERBOSE,
metadata=Metrics.TIME_METADATA,
)
logger.register_metric(
f"{prefix}.compute_latency",
PERF_METER(),
verbosity=dllogger.Verbosity.DEFAULT,
metadata=Metrics.TIME_METADATA,
)
logger.register_metric(
f"{prefix}.compute_latency_at100",
LAT_100(),
verbosity=dllogger.Verbosity.VERBOSE,
metadata=Metrics.TIME_METADATA,
)
logger.register_metric(
f"{prefix}.compute_latency_at99",
LAT_99(),
verbosity=dllogger.Verbosity.VERBOSE,
metadata=Metrics.TIME_METADATA,
)
logger.register_metric(
f"{prefix}.compute_latency_at95",
LAT_95(),
verbosity=dllogger.Verbosity.VERBOSE,
metadata=Metrics.TIME_METADATA,
)
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the BSD 3-Clause License (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://opensource.org/licenses/BSD-3-Clause
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.nn as nn
import numpy as np
def mixup(alpha, data, target):
with torch.no_grad():
bs = data.size(0)
c = np.random.beta(alpha, alpha)
perm = torch.randperm(bs).cuda()
md = c * data + (1 - c) * data[perm, :]
mt = c * target + (1 - c) * target[perm, :]
return md, mt
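# Hedged usage sketch (requires a CUDA device; names are illustrative):
# data, target = batch_images, one_hot_labels    # both on GPU
# mixed_data, mixed_target = mixup(0.2, data, target)
# With alpha=0.2 the Beta(0.2, 0.2) coefficient c is usually close to 0 or 1,
# so most mixed samples stay near one of the two originals.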
class MixUpWrapper(object):
def __init__(self, alpha, dataloader):
self.alpha = alpha
self.dataloader = dataloader
def mixup_loader(self, loader):
for input, target in loader:
i, t = mixup(self.alpha, input, target)
yield i, t
def __iter__(self):
return self.mixup_loader(self.dataloader)
def __len__(self):
return len(self.dataloader)
class NLLMultiLabelSmooth(nn.Module):
def __init__(self, smoothing=0.0):
super(NLLMultiLabelSmooth, self).__init__()
self.confidence = 1.0 - smoothing
self.smoothing = smoothing
def forward(self, x, target):
if self.training:
x = x.float()
target = target.float()
logprobs = torch.nn.functional.log_softmax(x, dim=-1)
nll_loss = -logprobs * target
nll_loss = nll_loss.sum(-1)
smooth_loss = -logprobs.mean(dim=-1)
loss = self.confidence * nll_loss + self.smoothing * smooth_loss
return loss.mean()
else:
return torch.nn.functional.cross_entropy(x, target)
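# Minimal CPU-only sketch showing NLLMultiLabelSmooth on soft (mixup-style)
# targets; the _demo_smooth_loss name is hypothetical.
def _demo_smooth_loss():
    criterion = NLLMultiLabelSmooth(smoothing=0.1)
    criterion.train()                       # take the soft-target branch
    logits = torch.randn(4, 10)             # batch of 4, 10 classes
    target = torch.zeros(4, 10)
    target[torch.arange(4), torch.randint(10, (4,))] = 1.0  # one-hot targets
    print(criterion(logits, target))        # scalar smoothed NLL loss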
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the BSD 3-Clause License (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://opensource.org/licenses/BSD-3-Clause
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .entrypoints import nvidia_convnets_processing_utils, nvidia_efficientnet
from .resnet import resnet50, resnext101_32x4d, se_resnext101_32x4d
from .efficientnet import (
efficientnet_b0,
efficientnet_b4,
efficientnet_widese_b0,
efficientnet_widese_b4,
efficientnet_quant_b0,
efficientnet_quant_b4,
)
import copy
from collections import OrderedDict
from dataclasses import dataclass
from typing import Optional
import torch
import warnings
from torch import nn
import torch.nn.functional as F
try:
from pytorch_quantization import nn as quant_nn
except ImportError as e:
warnings.warn(
"pytorch_quantization module not found, quantization will not be available"
)
quant_nn = None
# LayerBuilder {{{
class LayerBuilder(object):
@dataclass
class Config:
activation: str = "relu"
conv_init: str = "fan_in"
bn_momentum: Optional[float] = None
bn_epsilon: Optional[float] = None
def __init__(self, config: "LayerBuilder.Config"):
self.config = config
def conv(
self,
kernel_size,
in_planes,
out_planes,
groups=1,
stride=1,
bn=False,
zero_init_bn=False,
act=False,
):
conv = nn.Conv2d(
in_planes,
out_planes,
kernel_size=kernel_size,
groups=groups,
stride=stride,
padding=int((kernel_size - 1) / 2),
bias=False,
)
nn.init.kaiming_normal_(
conv.weight, mode=self.config.conv_init, nonlinearity="relu"
)
layers = [("conv", conv)]
if bn:
layers.append(("bn", self.batchnorm(out_planes, zero_init_bn)))
if act:
layers.append(("act", self.activation()))
if bn or act:
return nn.Sequential(OrderedDict(layers))
else:
return conv
def convDepSep(
self, kernel_size, in_planes, out_planes, stride=1, bn=False, act=False
):
"""3x3 depthwise separable convolution with padding"""
c = self.conv(
kernel_size,
in_planes,
out_planes,
groups=in_planes,
stride=stride,
bn=bn,
act=act,
)
return c
def conv3x3(self, in_planes, out_planes, stride=1, groups=1, bn=False, act=False):
"""3x3 convolution with padding"""
c = self.conv(
3, in_planes, out_planes, groups=groups, stride=stride, bn=bn, act=act
)
return c
def conv1x1(self, in_planes, out_planes, stride=1, groups=1, bn=False, act=False):
"""1x1 convolution with padding"""
c = self.conv(
1, in_planes, out_planes, groups=groups, stride=stride, bn=bn, act=act
)
return c
def conv7x7(self, in_planes, out_planes, stride=1, groups=1, bn=False, act=False):
"""7x7 convolution with padding"""
c = self.conv(
7, in_planes, out_planes, groups=groups, stride=stride, bn=bn, act=act
)
return c
def conv5x5(self, in_planes, out_planes, stride=1, groups=1, bn=False, act=False):
"""5x5 convolution with padding"""
c = self.conv(
5, in_planes, out_planes, groups=groups, stride=stride, bn=bn, act=act
)
return c
def batchnorm(self, planes, zero_init=False):
bn_cfg = {}
if self.config.bn_momentum is not None:
bn_cfg["momentum"] = self.config.bn_momentum
if self.config.bn_epsilon is not None:
bn_cfg["eps"] = self.config.bn_epsilon
bn = nn.BatchNorm2d(planes, **bn_cfg)
gamma_init_val = 0 if zero_init else 1
nn.init.constant_(bn.weight, gamma_init_val)
nn.init.constant_(bn.bias, 0)
return bn
def activation(self):
return {
"silu": lambda: nn.SiLU(inplace=True),
"relu": lambda: nn.ReLU(inplace=True),
"onnx-silu": ONNXSiLU,
}[self.config.activation]()
# LayerBuilder }}}
# LambdaLayer {{{
class LambdaLayer(nn.Module):
def __init__(self, lmbd):
super().__init__()
self.lmbd = lmbd
def forward(self, x):
return self.lmbd(x)
# }}}
# SqueezeAndExcitation {{{
class SqueezeAndExcitation(nn.Module):
def __init__(self, in_channels, squeeze, activation):
super(SqueezeAndExcitation, self).__init__()
self.squeeze = nn.Linear(in_channels, squeeze)
self.expand = nn.Linear(squeeze, in_channels)
self.activation = activation
self.sigmoid = nn.Sigmoid()
def forward(self, x):
return self._attention(x)
def _attention(self, x):
out = torch.mean(x, [2, 3])
out = self.squeeze(out)
out = self.activation(out)
out = self.expand(out)
out = self.sigmoid(out)
out = out.unsqueeze(2).unsqueeze(3)
return out
class SqueezeAndExcitationTRT(nn.Module):
def __init__(self, in_channels, squeeze, activation):
super(SqueezeAndExcitationTRT, self).__init__()
self.pooling = nn.AdaptiveAvgPool2d(1)
self.squeeze = nn.Conv2d(in_channels, squeeze, 1)
self.expand = nn.Conv2d(squeeze, in_channels, 1)
self.activation = activation
self.sigmoid = nn.Sigmoid()
def forward(self, x):
return self._attention(x)
def _attention(self, x):
out = self.pooling(x)
out = self.squeeze(out)
out = self.activation(out)
out = self.expand(out)
out = self.sigmoid(out)
return out
# }}}
# EMA {{{
class EMA:
def __init__(self, mu, module_ema):
self.mu = mu
self.module_ema = module_ema
def __call__(self, module, step=None):
if step is None:
mu = self.mu
else:
mu = min(self.mu, (1.0 + step) / (10 + step))
def strip_module(s: str) -> str:
return s
mesd = self.module_ema.state_dict()
with torch.no_grad():
for name, x in module.state_dict().items():
if name.endswith("num_batches_tracked"):
continue
n = strip_module(name)
mesd[n].mul_(mu)
mesd[n].add_((1.0 - mu) * x)
# }}}
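# Hedged usage sketch for EMA (names are illustrative): keep a deep copy of
# the model and blend its weights toward the live model after each step.
# model_ema = copy.deepcopy(model)
# ema = EMA(mu=0.999, module_ema=model_ema)
# for step, batch in enumerate(loader):
#     ...  # forward/backward/optimizer.step()
#     ema(model, step=step)  # model_ema <- mu * model_ema + (1 - mu) * model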
# ONNXSiLU {{{
# Since torch.nn.SiLU is not supported by ONNX export, this implementation
# must be used in the exported model (it needs 15-20% more GPU memory)
class ONNXSiLU(nn.Module):
def __init__(self, *args, **kwargs):
super(ONNXSiLU, self).__init__()
def forward(self, x):
return x * torch.sigmoid(x)
# }}}
class SequentialSqueezeAndExcitation(SqueezeAndExcitation):
def __init__(self, in_channels, squeeze, activation, quantized=False):
super().__init__(in_channels, squeeze, activation)
self.quantized = quantized
if quantized:
assert quant_nn is not None, "pytorch_quantization is not available"
self.mul_a_quantizer = quant_nn.TensorQuantizer(
quant_nn.QuantConv2d.default_quant_desc_input
)
self.mul_b_quantizer = quant_nn.TensorQuantizer(
quant_nn.QuantConv2d.default_quant_desc_input
)
else:
self.mul_a_quantizer = nn.Identity()
self.mul_b_quantizer = nn.Identity()
def forward(self, x):
out = self._attention(x)
if not self.quantized:
return out * x
else:
x_quant = self.mul_a_quantizer(out)
return x_quant * self.mul_b_quantizer(x)
class SequentialSqueezeAndExcitationTRT(SqueezeAndExcitationTRT):
def __init__(self, in_channels, squeeze, activation, quantized=False):
super().__init__(in_channels, squeeze, activation)
self.quantized = quantized
if quantized:
assert quant_nn is not None, "pytorch_quantization is not available"
self.mul_a_quantizer = quant_nn.TensorQuantizer(
quant_nn.QuantConv2d.default_quant_desc_input
)
self.mul_b_quantizer = quant_nn.TensorQuantizer(
quant_nn.QuantConv2d.default_quant_desc_input
)
else:
self.mul_a_quantizer = nn.Identity()
self.mul_b_quantizer = nn.Identity()
def forward(self, x):
out = self._attention(x)
if not self.quantized:
return out * x
else:
x_quant = self.mul_a_quantizer(out)
return x_quant * self.mul_b_quantizer(x)
class StochasticDepthResidual(nn.Module):
def __init__(self, survival_prob: float):
super().__init__()
self.survival_prob = survival_prob
self.register_buffer("mask", torch.ones(()), persistent=False)
def forward(self, residual: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
if not self.training:
return torch.add(residual, other=x)
else:
with torch.no_grad():
mask = F.dropout(
self.mask,
p=1 - self.survival_prob,
training=self.training,
inplace=False,
)
return torch.addcmul(residual, mask, x)
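# Note on the training branch above: F.dropout with p = 1 - survival_prob
# either zeroes the mask or scales it to 1 / survival_prob, so the residual
# branch is dropped with probability 1 - survival_prob and rescaled so the
# expected value of the output is unchanged.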
class Flatten(nn.Module):
def forward(self, x: torch.Tensor) -> torch.Tensor:
return x.squeeze(-1).squeeze(-1)
import argparse
import random
import math
import warnings
from typing import List, Any, Optional
from collections import namedtuple, OrderedDict
from dataclasses import dataclass, replace
import torch
from torch import nn
from functools import partial
try:
from pytorch_quantization import nn as quant_nn
from ..quantization import switch_on_quantization
except ImportError as e:
warnings.warn(
"pytorch_quantization module not found, quantization will not be available"
)
quant_nn = None
import contextlib
@contextlib.contextmanager
def switch_on_quantization(do_quantization=False):
assert not do_quantization, "quantization is not available"
try:
yield
finally:
pass
from .common import (
SequentialSqueezeAndExcitation,
SequentialSqueezeAndExcitationTRT,
LayerBuilder,
StochasticDepthResidual,
Flatten,
)
from .model import (
Model,
ModelParams,
ModelArch,
OptimizerParams,
create_entrypoint,
EntryPoint,
)
# EffNetArch {{{
@dataclass
class EffNetArch(ModelArch):
block: Any
stem_channels: int
feature_channels: int
kernel: List[int]
stride: List[int]
num_repeat: List[int]
expansion: List[int]
channels: List[int]
default_image_size: int
squeeze_excitation_ratio: float = 0.25
def enumerate(self):
return enumerate(
zip(
self.kernel, self.stride, self.num_repeat, self.expansion, self.channels
)
)
def num_layers(self):
_f = lambda l: len(set(map(len, l)))
l = [self.kernel, self.stride, self.num_repeat, self.expansion, self.channels]
assert _f(l) == 1
return len(self.kernel)
@staticmethod
def _scale_width(width_coeff, divisor=8):
def _sw(num_channels):
num_channels *= width_coeff
# Rounding should not go down by more than 10%
rounded_num_channels = max(
divisor, int(num_channels + divisor / 2) // divisor * divisor
)
if rounded_num_channels < 0.9 * num_channels:
rounded_num_channels += divisor
return rounded_num_channels
return _sw
@staticmethod
def _scale_depth(depth_coeff):
def _sd(num_repeat):
return int(math.ceil(num_repeat * depth_coeff))
return _sd
def scale(self, wc, dc, dis, divisor=8) -> "EffNetArch":
sw = EffNetArch._scale_width(wc, divisor=divisor)
sd = EffNetArch._scale_depth(dc)
return EffNetArch(
block=self.block,
stem_channels=sw(self.stem_channels),
feature_channels=sw(self.feature_channels),
kernel=self.kernel,
stride=self.stride,
num_repeat=list(map(sd, self.num_repeat)),
expansion=self.expansion,
channels=list(map(sw, self.channels)),
default_image_size=dis,
squeeze_excitation_ratio=self.squeeze_excitation_ratio,
)
# }}}
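# Worked example of scale() (values follow from the arithmetic above):
# effnet_b0_layers.scale(wc=1.4, dc=1.8, dis=380) - i.e. EfficientNet-B4 -
# maps stem_channels 32 -> 48, channels [16, 24, 40, 80, 112, 192, 320] ->
# [24, 32, 56, 112, 160, 272, 448], and num_repeat [1, 2, 2, 3, 3, 4, 1] ->
# [2, 4, 4, 6, 6, 8, 2].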
# EffNetParams {{{
@dataclass
class EffNetParams(ModelParams):
dropout: float
num_classes: int = 1000
activation: str = "silu"
conv_init: str = "fan_in"
bn_momentum: float = 1 - 0.99
bn_epsilon: float = 1e-3
survival_prob: float = 1
quantized: bool = False
trt: bool = False
def parser(self, name):
p = super().parser(name)
p.add_argument(
"--num_classes",
metavar="N",
default=self.num_classes,
type=int,
help="number of classes",
)
p.add_argument(
"--conv_init",
default=self.conv_init,
choices=["fan_in", "fan_out"],
type=str,
help="initialization mode for convolutional layers, see https://pytorch.org/docs/stable/nn.init.html#torch.nn.init.kaiming_normal_",
)
p.add_argument(
"--bn_momentum",
default=self.bn_momentum,
type=float,
help="Batch Norm momentum",
)
p.add_argument(
"--bn_epsilon",
default=self.bn_epsilon,
type=float,
help="Batch Norm epsilon",
)
p.add_argument(
"--survival_prob",
default=self.survival_prob,
type=float,
help="Survival probability for stochastic depth",
)
p.add_argument(
"--dropout", default=self.dropout, type=float, help="Dropout drop prob"
)
p.add_argument("--trt", metavar="True|False", default=self.trt, type=bool)
return p
# }}}
class EfficientNet(nn.Module):
def __init__(
self,
arch: EffNetArch,
dropout: float,
num_classes: int = 1000,
activation: str = "silu",
conv_init: str = "fan_in",
bn_momentum: float = 1 - 0.99,
bn_epsilon: float = 1e-3,
survival_prob: float = 1,
quantized: bool = False,
trt: bool = False,
):
self.quantized = quantized
with switch_on_quantization(self.quantized):
super(EfficientNet, self).__init__()
self.arch = arch
self.num_layers = arch.num_layers()
self.num_blocks = sum(arch.num_repeat)
self.survival_prob = survival_prob
self.builder = LayerBuilder(
LayerBuilder.Config(
activation=activation,
conv_init=conv_init,
bn_momentum=bn_momentum,
bn_epsilon=bn_epsilon,
)
)
self.stem = self._make_stem(arch.stem_channels)
out_channels = arch.stem_channels
plc = 0
layers = []
for i, (k, s, r, e, c) in arch.enumerate():
layer, out_channels = self._make_layer(
block=arch.block,
kernel_size=k,
stride=s,
num_repeat=r,
expansion=e,
in_channels=out_channels,
out_channels=c,
squeeze_excitation_ratio=arch.squeeze_excitation_ratio,
prev_layer_count=plc,
trt=trt,
)
plc = plc + r
layers.append(layer)
self.layers = nn.Sequential(*layers)
self.features = self._make_features(out_channels, arch.feature_channels)
self.classifier = self._make_classifier(
arch.feature_channels, num_classes, dropout
)
def forward(self, x):
x = self.stem(x)
x = self.layers(x)
x = self.features(x)
x = self.classifier(x)
return x
def extract_features(self, x, layers=None):
if layers is None:
layers = [f"layer{i+1}" for i in range(self.num_layers)] + [
"features",
"classifier",
]
run = [
i
for i in range(self.num_layers)
if "classifier" in layers
or "features" in layers
or any([f"layer{j+1}" in layers for j in range(i, self.num_layers)])
]
output = {}
x = self.stem(x)
for l in run:
fn = self.layers[l]
x = fn(x)
if f"layer{l+1}" in layers:
output[f"layer{l+1}"] = x
if "features" in layers or "classifier" in layers:
x = self.features(x)
if "features" in layers:
output["features"] = x
if "classifier" in layers:
output["classifier"] = self.classifier(x)
return output
# helper functions {{{
def _make_stem(self, stem_width):
return nn.Sequential(
OrderedDict(
[
("conv", self.builder.conv3x3(3, stem_width, stride=2)),
("bn", self.builder.batchnorm(stem_width)),
("activation", self.builder.activation()),
]
)
)
def _get_survival_prob(self, block_id):
drop_rate = 1.0 - self.survival_prob
sp = 1.0 - drop_rate * float(block_id) / self.num_blocks
return sp
def _make_features(self, in_channels, num_features):
return nn.Sequential(
OrderedDict(
[
("conv", self.builder.conv1x1(in_channels, num_features)),
("bn", self.builder.batchnorm(num_features)),
("activation", self.builder.activation()),
]
)
)
def _make_classifier(self, num_features, num_classes, dropout):
return nn.Sequential(
OrderedDict(
[
("pooling", nn.AdaptiveAvgPool2d(1)),
("squeeze", Flatten()),
("dropout", nn.Dropout(dropout)),
("fc", nn.Linear(num_features, num_classes)),
]
)
)
def _make_layer(
self,
block,
kernel_size,
stride,
num_repeat,
expansion,
in_channels,
out_channels,
squeeze_excitation_ratio,
prev_layer_count,
trt,
):
layers = []
idx = 0
survival_prob = self._get_survival_prob(idx + prev_layer_count)
blk = block(
self.builder,
kernel_size,
in_channels,
out_channels,
expansion,
stride,
self.arch.squeeze_excitation_ratio,
survival_prob if stride == 1 and in_channels == out_channels else 1.0,
self.quantized,
trt=trt,
)
layers.append((f"block{idx}", blk))
for idx in range(1, num_repeat):
survival_prob = self._get_survival_prob(idx + prev_layer_count)
blk = block(
self.builder,
kernel_size,
out_channels,
out_channels,
expansion,
1, # stride
squeeze_excitation_ratio,
survival_prob,
self.quantized,
trt=trt,
)
layers.append((f"block{idx}", blk))
return nn.Sequential(OrderedDict(layers)), out_channels
def ngc_checkpoint_remap(self, url=None, version=None):
if version is None:
version = url.split("/")[8]
def to_sequential_remap(s):
parts = s.split(".")
if parts[0].startswith("layer"):
return ".".join(
["layers." + str(int(parts[0][len("layer") :]) - 1)] + parts[1:]
)
else:
return s
def no_remap(s):
return s
return {"20.12.0": to_sequential_remap, "21.03.0": to_sequential_remap}.get(
version, no_remap
)
# }}}
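# Worked example for the checkpoint remap above: for NGC checkpoint versions
# 20.12.0 and 21.03.0, to_sequential_remap("layer1.block0.conv.weight")
# returns "layers.0.block0.conv.weight"; keys that do not start with "layer"
# pass through unchanged.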
# MBConvBlock {{{
class MBConvBlock(nn.Module):
__constants__ = ["quantized"]
def __init__(
self,
builder: LayerBuilder,
depsep_kernel_size: int,
in_channels: int,
out_channels: int,
expand_ratio: int,
stride: int,
squeeze_excitation_ratio: float,
squeeze_hidden=False,
survival_prob: float = 1.0,
quantized: bool = False,
trt: bool = False,
):
super().__init__()
self.quantized = quantized
self.residual = stride == 1 and in_channels == out_channels
hidden_dim = in_channels * expand_ratio
squeeze_base = hidden_dim if squeeze_hidden else in_channels
squeeze_dim = max(1, int(squeeze_base * squeeze_excitation_ratio))
self.expand = (
None
if in_channels == hidden_dim
else builder.conv1x1(in_channels, hidden_dim, bn=True, act=True)
)
self.depsep = builder.convDepSep(
depsep_kernel_size, hidden_dim, hidden_dim, stride, bn=True, act=True
)
if trt or self.quantized:
# Need TRT mode for quantized in order to automatically insert quantization before pooling
self.se: nn.Module = SequentialSqueezeAndExcitationTRT(
hidden_dim, squeeze_dim, builder.activation(), self.quantized
)
else:
self.se: nn.Module = SequentialSqueezeAndExcitation(
hidden_dim, squeeze_dim, builder.activation(), self.quantized
)
self.proj = builder.conv1x1(hidden_dim, out_channels, bn=True)
if survival_prob == 1.0:
self.residual_add = torch.add
else:
self.residual_add = StochasticDepthResidual(survival_prob=survival_prob)
if self.quantized and self.residual:
assert quant_nn is not None, "pytorch_quantization is not available"
self.residual_quantizer = quant_nn.TensorQuantizer(
quant_nn.QuantConv2d.default_quant_desc_input
) # TODO QuantConv2d ?!?
else:
self.residual_quantizer = nn.Identity()
def forward(self, x: torch.Tensor) -> torch.Tensor:
if not self.residual:
return self.proj(
self.se(self.depsep(x if self.expand is None else self.expand(x)))
)
b = self.proj(
self.se(self.depsep(x if self.expand is None else self.expand(x)))
)
if self.quantized:
x = self.residual_quantizer(x)
return self.residual_add(x, b)
def original_mbconv(
builder: LayerBuilder,
depsep_kernel_size: int,
in_channels: int,
out_channels: int,
expand_ratio: int,
stride: int,
squeeze_excitation_ratio: int,
survival_prob: float,
quantized: bool,
trt: bool,
):
return MBConvBlock(
builder,
depsep_kernel_size,
in_channels,
out_channels,
expand_ratio,
stride,
squeeze_excitation_ratio,
squeeze_hidden=False,
survival_prob=survival_prob,
quantized=quantized,
trt=trt,
)
def widese_mbconv(
builder: LayerBuilder,
depsep_kernel_size: int,
in_channels: int,
out_channels: int,
expand_ratio: int,
stride: int,
squeeze_excitation_ratio: int,
survival_prob: float,
quantized: bool,
trt: bool,
):
return MBConvBlock(
builder,
depsep_kernel_size,
in_channels,
out_channels,
expand_ratio,
stride,
squeeze_excitation_ratio,
squeeze_hidden=True,
survival_prob=survival_prob,
quantized=quantized,
trt=trt,
)
# }}}
# EffNet configs {{{
# fmt: off
effnet_b0_layers = EffNetArch(
block = original_mbconv,
stem_channels = 32,
feature_channels=1280,
kernel = [ 3, 3, 5, 3, 5, 5, 3],
stride = [ 1, 2, 2, 2, 1, 2, 1],
num_repeat = [ 1, 2, 2, 3, 3, 4, 1],
expansion = [ 1, 6, 6, 6, 6, 6, 6],
channels = [16, 24, 40, 80, 112, 192, 320],
default_image_size=224,
)
effnet_b1_layers=effnet_b0_layers.scale(wc=1, dc=1.1, dis=240)
effnet_b2_layers=effnet_b0_layers.scale(wc=1.1, dc=1.2, dis=260)
effnet_b3_layers=effnet_b0_layers.scale(wc=1.2, dc=1.4, dis=300)
effnet_b4_layers=effnet_b0_layers.scale(wc=1.4, dc=1.8, dis=380)
effnet_b5_layers=effnet_b0_layers.scale(wc=1.6, dc=2.2, dis=456)
effnet_b6_layers=effnet_b0_layers.scale(wc=1.8, dc=2.6, dis=528)
effnet_b7_layers=effnet_b0_layers.scale(wc=2.0, dc=3.1, dis=600)
urls = {
"efficientnet-b0": "https://api.ngc.nvidia.com/v2/models/nvidia/efficientnet_b0_pyt_amp/versions/20.12.0/files/nvidia_efficientnet-b0_210412.pth",
"efficientnet-b4": "https://api.ngc.nvidia.com/v2/models/nvidia/efficientnet_b4_pyt_amp/versions/20.12.0/files/nvidia_efficientnet-b4_210412.pth",
"efficientnet-widese-b0": "https://api.ngc.nvidia.com/v2/models/nvidia/efficientnet_widese_b0_pyt_amp/versions/20.12.0/files/nvidia_efficientnet-widese-b0_210412.pth",
"efficientnet-widese-b4": "https://api.ngc.nvidia.com/v2/models/nvidia/efficientnet_widese_b4_pyt_amp/versions/20.12.0/files/nvidia_efficientnet-widese-b4_210412.pth",
"efficientnet-quant-b0": "https://api.ngc.nvidia.com/v2/models/nvidia/efficientnet_b0_pyt_qat_ckpt_fp32/versions/21.03.0/files/nvidia-efficientnet-quant-b0-130421.pth",
"efficientnet-quant-b4": "https://api.ngc.nvidia.com/v2/models/nvidia/efficientnet_b4_pyt_qat_ckpt_fp32/versions/21.03.0/files/nvidia-efficientnet-quant-b4-130421.pth",
}
def _m(*args, **kwargs):
return Model(constructor=EfficientNet, *args, **kwargs)
architectures = {
"efficientnet-b0": _m(arch=effnet_b0_layers, params=EffNetParams(dropout=0.2), checkpoint_url=urls["efficientnet-b0"]),
"efficientnet-b1": _m(arch=effnet_b1_layers, params=EffNetParams(dropout=0.2)),
"efficientnet-b2": _m(arch=effnet_b2_layers, params=EffNetParams(dropout=0.3)),
"efficientnet-b3": _m(arch=effnet_b3_layers, params=EffNetParams(dropout=0.3)),
"efficientnet-b4": _m(arch=effnet_b4_layers, params=EffNetParams(dropout=0.4, survival_prob=0.8), checkpoint_url=urls["efficientnet-b4"]),
"efficientnet-b5": _m(arch=effnet_b5_layers, params=EffNetParams(dropout=0.4)),
"efficientnet-b6": _m(arch=effnet_b6_layers, params=EffNetParams(dropout=0.5)),
"efficientnet-b7": _m(arch=effnet_b7_layers, params=EffNetParams(dropout=0.5)),
"efficientnet-widese-b0": _m(arch=replace(effnet_b0_layers, block=widese_mbconv), params=EffNetParams(dropout=0.2), checkpoint_url=urls["efficientnet-widese-b0"]),
"efficientnet-widese-b1": _m(arch=replace(effnet_b1_layers, block=widese_mbconv), params=EffNetParams(dropout=0.2)),
"efficientnet-widese-b2": _m(arch=replace(effnet_b2_layers, block=widese_mbconv), params=EffNetParams(dropout=0.3)),
"efficientnet-widese-b3": _m(arch=replace(effnet_b3_layers, block=widese_mbconv), params=EffNetParams(dropout=0.3)),
"efficientnet-widese-b4": _m(arch=replace(effnet_b4_layers, block=widese_mbconv), params=EffNetParams(dropout=0.4, survival_prob=0.8), checkpoint_url=urls["efficientnet-widese-b4"]),
"efficientnet-widese-b5": _m(arch=replace(effnet_b5_layers, block=widese_mbconv), params=EffNetParams(dropout=0.4)),
"efficientnet-widese-b6": _m(arch=replace(effnet_b6_layers, block=widese_mbconv), params=EffNetParams(dropout=0.5)),
"efficientnet-widese-b7": _m(arch=replace(effnet_b7_layers, block=widese_mbconv), params=EffNetParams(dropout=0.5)),
"efficientnet-quant-b0": _m(arch=effnet_b0_layers, params=EffNetParams(dropout=0.2, quantized=True), checkpoint_url=urls["efficientnet-quant-b0"]),
"efficientnet-quant-b1": _m(arch=effnet_b1_layers, params=EffNetParams(dropout=0.2, quantized=True)),
"efficientnet-quant-b2": _m(arch=effnet_b2_layers, params=EffNetParams(dropout=0.3, quantized=True)),
"efficientnet-quant-b3": _m(arch=effnet_b3_layers, params=EffNetParams(dropout=0.3, quantized=True)),
"efficientnet-quant-b4": _m(arch=effnet_b4_layers, params=EffNetParams(dropout=0.4, survival_prob=0.8, quantized=True), checkpoint_url=urls["efficientnet-quant-b4"]),
"efficientnet-quant-b5": _m(arch=effnet_b5_layers, params=EffNetParams(dropout=0.4, quantized=True)),
"efficientnet-quant-b6": _m(arch=effnet_b6_layers, params=EffNetParams(dropout=0.5, quantized=True)),
"efficientnet-quant-b7": _m(arch=effnet_b7_layers, params=EffNetParams(dropout=0.5, quantized=True)),
}
# fmt: on
# }}}
_ce = lambda n: EntryPoint.create(n, architectures[n])
efficientnet_b0 = _ce("efficientnet-b0")
efficientnet_b4 = _ce("efficientnet-b4")
efficientnet_widese_b0 = _ce("efficientnet-widese-b0")
efficientnet_widese_b4 = _ce("efficientnet-widese-b4")
efficientnet_quant_b0 = _ce("efficientnet-quant-b0")
efficientnet_quant_b4 = _ce("efficientnet-quant-b4")
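# Hedged usage sketch: each entrypoint above is callable, e.g.
# model = efficientnet_b0(pretrained=False)  # random init, no NGC download
# model = efficientnet_b4(pretrained=True)   # downloads the NGC checkpoint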
# Copyright (c) 2018-2019, NVIDIA CORPORATION
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
def nvidia_efficientnet(type='efficientnet-b0', pretrained=True, **kwargs):
"""Constructs a EfficientNet model.
For detailed information on model input and output, training recipies, inference and performance
visit: github.com/NVIDIA/DeepLearningExamples and/or ngc.nvidia.com
Args:
pretrained (bool, True): If True, returns a model pretrained on IMAGENET dataset.
"""
from .efficientnet import _ce
return _ce(type)(pretrained=pretrained, **kwargs)
def nvidia_convnets_processing_utils():
import json
import numpy as np
import requests
import torch
import torchvision.transforms as transforms
import validators
from PIL import Image
class Processing:
@staticmethod
def prepare_input_from_uri(uri, cuda=False):
img_transforms = transforms.Compose(
[transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor()]
)
if validators.url(uri):
img = Image.open(requests.get(uri, stream=True).raw)
else:
img = Image.open(uri)
img = img_transforms(img)
with torch.no_grad():
# mean and std are not multiplied by 255 as they are in the training script:
# the torch dataloader reads data as bytes, whereas loading directly
# through PIL creates a tensor with floats in the [0, 1] range
mean = torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1)
std = torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1)
img = img.float()
if cuda:
mean = mean.cuda()
std = std.cuda()
img = img.cuda()
input = img.unsqueeze(0).sub_(mean).div_(std)
return input
@staticmethod
def pick_n_best(predictions, n=5):
predictions = predictions.float().cpu().numpy()
topN = np.argsort(-1*predictions, axis=-1)[:,:n]
imgnet_classes = Processing.get_imgnet_classes()
results=[]
for idx,case in enumerate(topN):
r = []
for c, v in zip(imgnet_classes[case], predictions[idx, case]):
r.append((f"{c}", f"{100*v:.1f}%"))
print(f"sample {idx}: {r}")
results.append(r)
return results
@staticmethod
def get_imgnet_classes():
import os
import json
imgnet_classes_json = "LOC_synset_mapping.json"
if not os.path.exists(imgnet_classes_json):
print("Downloading Imagenet Classes names.")
import urllib
urllib.request.urlretrieve(
"https://raw.githubusercontent.com/NVIDIA/DeepLearningExamples/master/PyTorch/Classification/ConvNets/LOC_synset_mapping.json",
filename=imgnet_classes_json)
print("Downloading finished.")
imgnet_classes = np.array(json.load(open(imgnet_classes_json, "r")))
return imgnet_classes
return Processing()
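# Hedged usage sketch (the URI and model are placeholders):
# utils = nvidia_convnets_processing_utils()
# batch = utils.prepare_input_from_uri("http://example.com/cat.jpg")
# with torch.no_grad():
#     scores = torch.nn.functional.softmax(model(batch), dim=-1)
# results = utils.pick_n_best(scores, n=5)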
from dataclasses import dataclass, asdict, replace
from .common import (
SequentialSqueezeAndExcitationTRT,
SequentialSqueezeAndExcitation,
SqueezeAndExcitation,
SqueezeAndExcitationTRT,
)
from typing import Optional, Callable
import os
import torch
import argparse
from functools import partial
@dataclass
class ModelArch:
pass
@dataclass
class ModelParams:
def parser(self, name):
return argparse.ArgumentParser(
description=f"{name} arguments", add_help=False, usage=""
)
@dataclass
class OptimizerParams:
pass
@dataclass
class Model:
constructor: Callable
arch: ModelArch
params: Optional[ModelParams]
optimizer_params: Optional[OptimizerParams] = None
checkpoint_url: Optional[str] = None
def torchhub_docstring(name: str):
return f"""Constructs a {name} model.
For detailed information on model input and output, training recipes, inference and performance
visit: github.com/NVIDIA/DeepLearningExamples and/or ngc.nvidia.com
Args:
pretrained (bool, True): If True, returns a model pretrained on the ImageNet dataset.
"""
class EntryPoint:
@staticmethod
def create(name: str, model: Model):
ep = EntryPoint(name, model)
ep.__doc__ = torchhub_docstring(name)
return ep
def __init__(self, name: str, model: Model):
self.name = name
self.model = model
def __call__(
self,
pretrained=True,
pretrained_from_file=None,
state_dict_key_map_fn=None,
**kwargs,
):
assert not (pretrained and (pretrained_from_file is not None))
params = replace(self.model.params, **kwargs)
model = self.model.constructor(arch=self.model.arch, **asdict(params))
state_dict = None
if pretrained:
assert self.model.checkpoint_url is not None
state_dict = torch.hub.load_state_dict_from_url(
self.model.checkpoint_url,
map_location=torch.device("cpu"),
progress=True,
)
if pretrained_from_file is not None:
if os.path.isfile(pretrained_from_file):
print(
"=> loading pretrained weights from '{}'".format(
pretrained_from_file
)
)
state_dict = torch.load(
pretrained_from_file, map_location=torch.device("cpu")
)
else:
print(
"=> no pretrained weights found at '{}'".format(
pretrained_from_file
)
)
if state_dict is not None:
state_dict = {
k[len("module.") :] if k.startswith("module.") else k: v
for k, v in state_dict.items()
}
def reshape(t, conv):
if conv:
if len(t.shape) == 4:
return t
else:
return t.view(t.shape[0], -1, 1, 1)
else:
if len(t.shape) == 4:
return t.view(t.shape[0], t.shape[1])
else:
return t
if state_dict_key_map_fn is not None:
state_dict = {
state_dict_key_map_fn(k): v for k, v in state_dict.items()
}
if pretrained and hasattr(model, "ngc_checkpoint_remap"):
remap_fn = model.ngc_checkpoint_remap(url=self.model.checkpoint_url)
state_dict = {remap_fn(k): v for k, v in state_dict.items()}
def _se_layer_uses_conv(m):
return any(
map(
partial(isinstance, m),
[
SqueezeAndExcitationTRT,
SequentialSqueezeAndExcitationTRT,
],
)
)
state_dict = {
k: reshape(
v,
conv=_se_layer_uses_conv(
dict(model.named_modules())[".".join(k.split(".")[:-2])]
),
)
if is_se_weight(k, v)
else v
for k, v in state_dict.items()
}
model.load_state_dict(state_dict)
return model
def parser(self):
if self.model.params is None:
return None
parser = self.model.params.parser(self.name)
parser.add_argument(
"--pretrained-from-file",
default=None,
type=str,
metavar="PATH",
help="load weights from local file",
)
if self.model.checkpoint_url is not None:
parser.add_argument(
"--pretrained",
default=False,
action="store_true",
help="load pretrained weights from NGC",
)
return parser
def is_se_weight(key, value):
return key.endswith("squeeze.weight") or key.endswith("expand.weight")
def create_entrypoint(m: Model):
def _ep(**kwargs):
params = replace(m.params, **kwargs)
return m.constructor(arch=m.arch, **asdict(params))
return _ep
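# Hedged usage sketch (illustrative addition; `model_description` is an
# assumed placeholder Model instance, not a name defined here):
#
#   ep = EntryPoint.create("resnet50", model_description)
#   net = ep(pretrained=False, num_classes=10)  # kwargs override ModelParams
#   cli = ep.parser()  # argparse parser exposing the model's parameters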
# Copyright (c) 2018-2019, NVIDIA CORPORATION
# Copyright (c) 2017- Facebook, Inc
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import argparse
from collections import OrderedDict
from dataclasses import dataclass
from typing import List, Dict, Callable, Any, Type
import torch
import torch.nn as nn
from .common import (
SqueezeAndExcitation,
LayerBuilder,
SqueezeAndExcitationTRT,
)
from .model import (
Model,
ModelParams,
ModelArch,
EntryPoint,
)
__all__ = ["ResNet", "resnet_configs"]
# BasicBlock {{{
class BasicBlock(nn.Module):
def __init__(
self,
builder,
inplanes,
planes,
expansion,
stride=1,
cardinality=1,
downsample=None,
fused_se=True,
last_bn_0_init=False,
trt=False,
):
super(BasicBlock, self).__init__()
self.conv1 = builder.conv3x3(inplanes, planes, stride, groups=cardinality)
self.bn1 = builder.batchnorm(planes)
self.relu = builder.activation()
self.conv2 = builder.conv3x3(
planes, planes * expansion, groups=cardinality
)
self.bn2 = builder.batchnorm(planes * expansion, zero_init=last_bn_0_init)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
if self.bn1 is not None:
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
if self.bn2 is not None:
out = self.bn2(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
# BasicBlock }}}
# Bottleneck {{{
class Bottleneck(nn.Module):
def __init__(
self,
builder,
inplanes,
planes,
expansion,
stride=1,
cardinality=1,
se=False,
se_squeeze=16,
downsample=None,
fused_se=True,
last_bn_0_init=False,
trt=False,
):
super(Bottleneck, self).__init__()
self.conv1 = builder.conv1x1(inplanes, planes)
self.bn1 = builder.batchnorm(planes)
self.conv2 = builder.conv3x3(planes, planes, groups=cardinality, stride=stride)
self.bn2 = builder.batchnorm(planes)
self.conv3 = builder.conv1x1(planes, planes * expansion)
self.bn3 = builder.batchnorm(planes * expansion, zero_init=last_bn_0_init)
self.relu = builder.activation()
self.downsample = downsample
self.stride = stride
self.fused_se = fused_se
if se:
self.squeeze = (
SqueezeAndExcitation(
planes * expansion, se_squeeze, builder.activation()
)
if not trt
else SqueezeAndExcitationTRT(
planes * expansion, se_squeeze, builder.activation()
)
)
else:
self.squeeze = None
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(x)
if self.squeeze is None:
out += residual
else:
if self.fused_se:
out = torch.addcmul(residual, out, self.squeeze(out), value=1)
else:
out = residual + out * self.squeeze(out)
out = self.relu(out)
return out
class SEBottleneck(Bottleneck):
def __init__(
self,
builder,
inplanes,
planes,
expansion,
stride=1,
cardinality=1,
downsample=None,
fused_se=True,
last_bn_0_init=False,
trt=False,
):
super(SEBottleneck, self).__init__(
builder,
inplanes,
planes,
expansion,
stride=stride,
cardinality=cardinality,
se=True,
se_squeeze=16,
downsample=downsample,
fused_se=fused_se,
last_bn_0_init=last_bn_0_init,
trt=trt,
)
# Bottleneck }}}
class ResNet(nn.Module):
@dataclass
class Arch(ModelArch):
block: Type[Bottleneck]
layers: List[int] # arch
widths: List[int] # arch
expansion: int
cardinality: int = 1
stem_width: int = 64
activation: str = "relu"
default_image_size: int = 224
@dataclass
class Params(ModelParams):
num_classes: int = 1000
last_bn_0_init: bool = False
conv_init: str = "fan_in"
trt: bool = False
fused_se: bool = True
def parser(self, name):
p = super().parser(name)
# argparse's type=bool treats any non-empty string as True, so parse
# boolean flags explicitly
def str2bool(s):
return str(s).lower() in ("true", "1", "yes")
p.add_argument(
"--num_classes",
metavar="N",
default=self.num_classes,
type=int,
help="number of classes",
)
p.add_argument(
"--last_bn_0_init",
metavar="True|False",
default=self.last_bn_0_init,
type=str2bool,
)
p.add_argument(
"--conv_init",
default=self.conv_init,
choices=["fan_in", "fan_out"],
type=str,
help="initialization mode for convolutional layers, see https://pytorch.org/docs/stable/nn.init.html#torch.nn.init.kaiming_normal_",
)
p.add_argument("--trt", metavar="True|False", default=self.trt, type=str2bool)
p.add_argument(
"--fused_se", metavar="True|False", default=self.fused_se, type=str2bool
)
return p
def __init__(
self,
arch: Arch,
num_classes: int = 1000,
last_bn_0_init: bool = False,
conv_init: str = "fan_in",
trt: bool = False,
fused_se: bool = True,
):
super(ResNet, self).__init__()
self.arch = arch
self.builder = LayerBuilder(
LayerBuilder.Config(activation=arch.activation, conv_init=conv_init)
)
self.last_bn_0_init = last_bn_0_init
self.conv1 = self.builder.conv7x7(3, arch.stem_width, stride=2)
self.bn1 = self.builder.batchnorm(arch.stem_width)
self.relu = self.builder.activation()
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
inplanes = arch.stem_width
assert len(arch.widths) == len(arch.layers)
self.num_layers = len(arch.widths)
layers = []
for i, (w, l) in enumerate(zip(arch.widths, arch.layers)):
layer, inplanes = self._make_layer(
arch.block,
arch.expansion,
inplanes,
w,
l,
cardinality=arch.cardinality,
stride=1 if i == 0 else 2,
trt=trt,
fused_se=fused_se,
)
layers.append(layer)
self.layers = nn.Sequential(*layers)
self.avgpool = nn.AdaptiveAvgPool2d(1)
self.fc = nn.Linear(arch.widths[-1] * arch.expansion, num_classes)
def stem(self, x):
x = self.conv1(x)
if self.bn1 is not None:
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
return x
def classifier(self, x):
x = self.avgpool(x)
x = x.view(x.size(0), -1)
x = self.fc(x)
return x
def forward(self, x):
x = self.stem(x)
x = self.layers(x)
x = self.classifier(x)
return x
def extract_features(self, x, layers=None):
if layers is None:
layers = [f"layer{i+1}" for i in range(self.num_layers)] + ["classifier"]
run = [
i
for i in range(self.num_layers)
if "classifier" in layers
or any([f"layer{j+1}" in layers for j in range(i, self.num_layers)])
]
output = {}
x = self.stem(x)
for l in run:
fn = self.layers[l]
x = fn(x)
if f"layer{l+1}" in layers:
output[f"layer{l+1}"] = x
if "classifier" in layers:
output["classifier"] = self.classifier(x)
return output
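# Hedged usage sketch (illustrative addition; `model` and `x` are assumed
# placeholders):
#
#   feats = model.extract_features(x, layers=["layer2", "classifier"])
#   feats["layer2"]      # stage-2 feature map
#   feats["classifier"]  # final logits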
# helper functions {{{
def _make_layer(
self,
block,
expansion,
inplanes,
planes,
blocks,
stride=1,
cardinality=1,
trt=False,
fused_se=True,
):
downsample = None
if stride != 1 or inplanes != planes * expansion:
dconv = self.builder.conv1x1(inplanes, planes * expansion, stride=stride)
dbn = self.builder.batchnorm(planes * expansion)
if dbn is not None:
downsample = nn.Sequential(dconv, dbn)
else:
downsample = dconv
layers = []
for i in range(blocks):
layers.append(
block(
self.builder,
inplanes,
planes,
expansion,
stride=stride if i == 0 else 1,
cardinality=cardinality,
downsample=downsample if i == 0 else None,
fused_se=fused_se,
last_bn_0_init=self.last_bn_0_init,
trt=trt,
)
)
inplanes = planes * expansion
return nn.Sequential(*layers), inplanes
def ngc_checkpoint_remap(self, url=None, version=None):
if version is None:
# version is the 9th path segment of the NGC checkpoint URL
version = url.split("/")[8]
def to_sequential_remap(s):
parts = s.split(".")
if parts[0].startswith("layer"):
return ".".join(
["layers." + str(int(parts[0][len("layer") :]) - 1)] + parts[1:]
)
else:
return s
def no_remap(s):
return s
return {"20.06.0": to_sequential_remap}.get(version, no_remap)
# }}}
__models: Dict[str, Model] = {
"resnet50": Model(
constructor=ResNet,
arch=ResNet.Arch(
stem_width=64,
block=Bottleneck,
layers=[3, 4, 6, 3],
widths=[64, 128, 256, 512],
expansion=4,
default_image_size=224,
),
params=ResNet.Params(),
checkpoint_url="https://api.ngc.nvidia.com/v2/models/nvidia/resnet50_pyt_amp/versions/20.06.0/files/nvidia_resnet50_200821.pth.tar",
),
"resnext101-32x4d": Model(
constructor=ResNet,
arch=ResNet.Arch(
stem_width=64,
block=Bottleneck,
layers=[3, 4, 23, 3],
widths=[128, 256, 512, 1024],
expansion=2,
cardinality=32,
default_image_size=224,
),
params=ResNet.Params(),
checkpoint_url="https://api.ngc.nvidia.com/v2/models/nvidia/resnext101_32x4d_pyt_amp/versions/20.06.0/files/nvidia_resnext101-32x4d_200821.pth.tar",
),
"se-resnext101-32x4d": Model(
constructor=ResNet,
arch=ResNet.Arch(
stem_width=64,
block=SEBottleneck,
layers=[3, 4, 23, 3],
widths=[128, 256, 512, 1024],
expansion=2,
cardinality=32,
default_image_size=224,
),
params=ResNet.Params(),
checkpoint_url="https://api.ngc.nvidia.com/v2/models/nvidia/seresnext101_32x4d_pyt_amp/versions/20.06.0/files/nvidia_se-resnext101-32x4d_200821.pth.tar",
),
}
def _ce(n):
return EntryPoint.create(n, __models[n])
resnet50 = _ce("resnet50")
resnext101_32x4d = _ce("resnext101-32x4d")
se_resnext101_32x4d = _ce("se-resnext101-32x4d")
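# Hedged usage sketch (illustrative addition, not part of the upstream file):
# the entrypoints above follow the torch.hub calling convention; passing
# pretrained=True instead would download the NGC checkpoint listed in __models.
if __name__ == "__main__":
    model = resnet50(pretrained=False)  # untrained ResNet-50, default params
    model.eval()
    with torch.no_grad():
        logits = model(torch.randn(1, 3, 224, 224))
    print(logits.shape)  # expected: torch.Size([1, 1000])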
import math
import numpy as np
import torch
from torch import optim
def get_optimizer(parameters, lr, args, state=None):
if args.optimizer == "sgd":
optimizer = get_sgd_optimizer(
parameters,
lr,
momentum=args.momentum,
weight_decay=args.weight_decay,
nesterov=args.nesterov,
bn_weight_decay=args.bn_weight_decay,
)
elif args.optimizer == "rmsprop":
optimizer = get_rmsprop_optimizer(
parameters,
lr,
alpha=args.rmsprop_alpha,
momentum=args.momentum,
weight_decay=args.weight_decay,
eps=args.rmsprop_eps,
bn_weight_decay=args.bn_weight_decay,
)
else:
raise ValueError(f"Unknown optimizer: {args.optimizer}")
if state is not None:
optimizer.load_state_dict(state)
return optimizer
def get_sgd_optimizer(
parameters, lr, momentum, weight_decay, nesterov=False, bn_weight_decay=False
):
if bn_weight_decay:
print(" ! Weight decay applied to BN parameters ")
params = [v for n, v in parameters]
else:
print(" ! Weight decay NOT applied to BN parameters ")
bn_params = [v for n, v in parameters if "bn" in n]
rest_params = [v for n, v in parameters if "bn" not in n]
print(f"BN parameters: {len(bn_params)}, other parameters: {len(rest_params)}")
params = [
{"params": bn_params, "weight_decay": 0},
{"params": rest_params, "weight_decay": weight_decay},
]
optimizer = torch.optim.SGD(
params, lr, momentum=momentum, weight_decay=weight_decay, nesterov=nesterov
)
return optimizer
def get_rmsprop_optimizer(
parameters, lr, alpha, weight_decay, momentum, eps, bn_weight_decay=False
):
bn_params = [v for n, v in parameters if "bn" in n]
rest_params = [v for n, v in parameters if "bn" not in n]
params = [
{"params": bn_params, "weight_decay": weight_decay if bn_weight_decay else 0},
{"params": rest_params, "weight_decay": weight_decay},
]
optimizer = torch.optim.RMSprop(
params,
lr=lr,
alpha=alpha,
weight_decay=weight_decay,
momentum=momentum,
eps=eps,
)
return optimizer
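# Hedged usage sketch (illustrative addition): both helpers expect `parameters`
# as an iterable of (name, tensor) pairs, e.g. model.named_parameters(), so
# that batch-norm tensors (matched by "bn" in the name) can skip weight decay.
if __name__ == "__main__":
    from collections import OrderedDict
    net = torch.nn.Sequential(OrderedDict([
        ("conv", torch.nn.Conv2d(3, 8, 3)),
        ("bn", torch.nn.BatchNorm2d(8)),
    ]))
    opt = get_sgd_optimizer(
        list(net.named_parameters()), lr=0.1, momentum=0.9,
        weight_decay=1e-4, bn_weight_decay=False,
    )
    print([g["weight_decay"] for g in opt.param_groups])  # [0, 0.0001]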
def lr_policy(lr_fn):
def _alr(optimizer, iteration, epoch):
lr = lr_fn(iteration, epoch)
for param_group in optimizer.param_groups:
param_group["lr"] = lr
return lr
return _alr
def lr_step_policy(base_lr, steps, decay_factor, warmup_length):
def _lr_fn(iteration, epoch):
if epoch < warmup_length:
lr = base_lr * (epoch + 1) / warmup_length
else:
lr = base_lr
for s in steps:
if epoch >= s:
lr *= decay_factor
return lr
return lr_policy(_lr_fn)
def lr_linear_policy(base_lr, warmup_length, epochs):
def _lr_fn(iteration, epoch):
if epoch < warmup_length:
lr = base_lr * (epoch + 1) / warmup_length
else:
e = epoch - warmup_length
es = epochs - warmup_length
lr = base_lr * (1 - (e / es))
return lr
return lr_policy(_lr_fn)
def lr_cosine_policy(base_lr, warmup_length, epochs, end_lr=0):
def _lr_fn(iteration, epoch):
if epoch < warmup_length:
lr = base_lr * (epoch + 1) / warmup_length
else:
e = epoch - warmup_length
es = epochs - warmup_length
lr = end_lr + (0.5 * (1 + np.cos(np.pi * e / es)) * (base_lr - end_lr))
return lr
return lr_policy(_lr_fn)
def lr_exponential_policy(
base_lr,
warmup_length,
epochs,
final_multiplier=0.001,
decay_factor=None,
decay_step=1,
logger=None,
):
"""Exponential lr policy. Setting decay factor parameter overrides final_multiplier"""
es = epochs - warmup_length
if decay_factor is not None:
epoch_decay = decay_factor
else:
epoch_decay = np.power(
2, np.log2(final_multiplier) / math.floor(es / decay_step)
)
def _lr_fn(iteration, epoch):
if epoch < warmup_length:
lr = base_lr * (epoch + 1) / warmup_length
else:
e = epoch - warmup_length
lr = base_lr * (epoch_decay ** math.floor(e / decay_step))
return lr
return lr_policy(_lr_fn)
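# Hedged sketch (illustrative addition): every policy above returns a function
# of (optimizer, iteration, epoch) that writes the new lr into all parameter
# groups and returns it; the SGD instance below exists only for demonstration.
if __name__ == "__main__":
    opt = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=0.1)
    schedule = lr_cosine_policy(base_lr=0.1, warmup_length=5, epochs=90)
    for epoch in (0, 4, 5, 50, 89):
        print(f"epoch {epoch:2d}: lr = {schedule(opt, 0, epoch):.5f}")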
from tqdm import tqdm
import torch
import contextlib
import time
import logging
from pytorch_quantization import quant_modules
from pytorch_quantization import nn as quant_nn
from pytorch_quantization import calib
from pytorch_quantization.tensor_quant import QuantDescriptor
from . import logger as log
from .utils import calc_ips
import dllogger
initialize = quant_modules.initialize
deactivate = quant_modules.deactivate
IPS_METADATA = {"unit": "img/s", "format": ":.2f"}
TIME_METADATA = {"unit": "s", "format": ":.5f"}
def select_default_calib_method(calib_method='histogram'):
"""Set up selected calibration method in whole network"""
quant_desc_input = QuantDescriptor(calib_method=calib_method)
quant_nn.QuantConv1d.set_default_quant_desc_input(quant_desc_input)
quant_nn.QuantConv2d.set_default_quant_desc_input(quant_desc_input)
quant_nn.QuantLinear.set_default_quant_desc_input(quant_desc_input)
quant_nn.QuantAdaptiveAvgPool2d.set_default_quant_desc_input(quant_desc_input)
def quantization_setup(calib_method='histogram'):
"""Change network into quantized version "automatically" and selects histogram as default quantization method"""
select_default_calib_method(calib_method)
initialize()
def disable_calibration(model):
"""Disables calibration in whole network. Should be run always before running interference."""
for name, module in model.named_modules():
if isinstance(module, quant_nn.TensorQuantizer):
if module._calibrator is not None:
module.enable_quant()
module.disable_calib()
else:
module.enable()
def collect_stats(model, data_loader, logger, num_batches):
"""Feed data to the network and collect statistic"""
if logger is not None:
logger.register_metric(
f"calib.total_ips",
log.PERF_METER(),
verbosity=dllogger.Verbosity.DEFAULT,
metadata=IPS_METADATA,
)
logger.register_metric(
f"calib.data_time",
log.PERF_METER(),
verbosity=dllogger.Verbosity.DEFAULT,
metadata=TIME_METADATA,
)
logger.register_metric(
f"calib.compute_latency",
log.PERF_METER(),
verbosity=dllogger.Verbosity.DEFAULT,
metadata=TIME_METADATA,
)
data_iter = enumerate(data_loader)
if logger is not None:
data_iter = logger.iteration_generator_wrapper(data_iter, mode='calib')
# Enable calibrators (and disable quantization) while statistics are collected
for name, module in model.named_modules():
if isinstance(module, quant_nn.TensorQuantizer):
if module._calibrator is not None:
module.disable_quant()
module.enable_calib()
else:
module.disable()
end = time.time()
if logger is not None:
logger.start_calibration()
for i, (image, _) in data_iter:
bs = image.size(0)
data_time = time.time() - end
model(image.cuda())
it_time = time.time() - end
if logger is not None:
logger.log_metric(f"calib.total_ips", calc_ips(bs, it_time))
logger.log_metric(f"calib.data_time", data_time)
logger.log_metric(f"calib.compute_latency", it_time - data_time)
if i >= num_batches:
time.sleep(5)
break
end = time.time()
if logger is not None:
logger.end_calibration()
logging.disable(logging.WARNING)
disable_calibration(model)
def compute_amax(model, **kwargs):
"""Loads statistics of data and calculates quantization parameters in whole network"""
for name, module in model.named_modules():
if isinstance(module, quant_nn.TensorQuantizer) and module._calibrator is not None:
if isinstance(module._calibrator, calib.MaxCalibrator):
module.load_calib_amax()
else:
module.load_calib_amax(**kwargs)
model.cuda()
def calibrate(model, train_loader, logger, calib_iter=1, percentile=99.99):
"""Calibrates whole network i.e. gathers data for quantization and calculates quantization parameters"""
model.eval()
with torch.no_grad():
collect_stats(model, train_loader, logger, num_batches=calib_iter)
compute_amax(model, method="percentile", percentile=percentile)
logging.disable(logging.NOTSET)
@contextlib.contextmanager
def switch_on_quantization(do_quantization=True):
"""Context manager for quantization activation"""
if do_quantization:
initialize()
try:
yield
finally:
if do_quantization:
deactivate()
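# Hedged sketch of the intended calibration flow (illustrative addition;
# `build_model` and `train_loader` are assumed placeholders):
#
#   quantization_setup(calib_method="histogram")  # before model construction
#   model = build_model().cuda()
#   calibrate(model, train_loader, logger=None, calib_iter=10)
#   # quantizers now hold amax values; the model can be fine-tuned (QAT)
#   # or exported for TensorRT deployment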
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the BSD 3-Clause License (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://opensource.org/licenses/BSD-3-Clause
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.nn as nn
class LabelSmoothing(nn.Module):
"""
NLL loss with label smoothing.
"""
def __init__(self, smoothing=0.0):
"""
Constructor for the LabelSmoothing module.
:param smoothing: label smoothing factor
"""
super(LabelSmoothing, self).__init__()
self.confidence = 1.0 - smoothing
self.smoothing = smoothing
def forward(self, x, target):
logprobs = torch.nn.functional.log_softmax(x, dim=-1)
nll_loss = -logprobs.gather(dim=-1, index=target.unsqueeze(1))
nll_loss = nll_loss.squeeze(1)
smooth_loss = -logprobs.mean(dim=-1)
loss = self.confidence * nll_loss + self.smoothing * smooth_loss
return loss.mean()
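# Hedged usage sketch (illustrative addition): with smoothing=0.1 the target
# class keeps weight 0.9 and the remaining 0.1 of probability mass is spread
# uniformly over all classes via the mean log-probability term above.
if __name__ == "__main__":
    criterion = LabelSmoothing(smoothing=0.1)
    logits = torch.randn(4, 10)
    target = torch.randint(0, 10, (4,))
    print(criterion(logits, target))  # scalar loss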
# Copyright (c) 2018-2019, NVIDIA CORPORATION
# Copyright (c) 2017- Facebook, Inc
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import time
from copy import deepcopy
from functools import wraps
from typing import Callable, Dict, Optional, Tuple
import torch
import torch.nn as nn
from torch.cuda.amp import autocast
from torch.nn.parallel import DistributedDataParallel as DDP
from . import logger as log
from . import utils
from .logger import TrainingMetrics, ValidationMetrics
from .models.common import EMA
class Executor:
def __init__(
self,
model: nn.Module,
loss: Optional[nn.Module],
cuda: bool = True,
memory_format: torch.memory_format = torch.contiguous_format,
amp: bool = False,
scaler: Optional[torch.cuda.amp.GradScaler] = None,
divide_loss: int = 1,
ts_script: bool = False,
):
assert not (amp and scaler is None), "Gradient Scaler is needed for AMP"
def xform(m: nn.Module) -> nn.Module:
if cuda:
m = m.cuda()
m.to(memory_format=memory_format)
return m
self.model = xform(model)
if ts_script:
self.model = torch.jit.script(self.model)
self.ts_script = ts_script
self.loss = xform(loss) if loss is not None else None
self.amp = amp
self.scaler = scaler
self.is_distributed = False
self.divide_loss = divide_loss
self._fwd_bwd = None
self._forward = None
def distributed(self, gpu_id):
self.is_distributed = True
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
self.model = DDP(self.model, device_ids=[gpu_id], output_device=gpu_id)
torch.cuda.current_stream().wait_stream(s)
def _fwd_bwd_fn(
self,
input: torch.Tensor,
target: torch.Tensor,
) -> torch.Tensor:
with autocast(enabled=self.amp):
loss = self.loss(self.model(input), target)
loss /= self.divide_loss
self.scaler.scale(loss).backward()
return loss
def _forward_fn(
self, input: torch.Tensor, target: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
with torch.no_grad(), autocast(enabled=self.amp):
output = self.model(input)
loss = None if self.loss is None else self.loss(output, target)
return (output if loss is None else loss), output
def optimize(self, fn):
return fn
@property
def forward_backward(self):
if self._fwd_bwd is None:
if self.loss is None:
raise NotImplementedError(
"Loss must not be None for forward+backward step"
)
self._fwd_bwd = self.optimize(self._fwd_bwd_fn)
return self._fwd_bwd
@property
def forward(self):
if self._forward is None:
self._forward = self.optimize(self._forward_fn)
return self._forward
def train(self):
self.model.train()
if self.loss is not None:
self.loss.train()
def eval(self):
self.model.eval()
if self.loss is not None:
self.loss.eval()
class Trainer:
def __init__(
self,
executor: Executor,
optimizer: torch.optim.Optimizer,
grad_acc_steps: int,
ema: Optional[float] = None,
):
self.executor = executor
self.optimizer = optimizer
self.grad_acc_steps = grad_acc_steps
self.use_ema = False
if ema is not None:
self.ema_executor = deepcopy(self.executor)
self.ema = EMA(ema, self.ema_executor.model)
self.use_ema = True
self.optimizer.zero_grad(set_to_none=True)
self.steps_since_update = 0
def train(self):
self.executor.train()
if self.use_ema:
self.ema_executor.train()
def eval(self):
self.executor.eval()
if self.use_ema:
self.ema_executor.eval()
def train_step(self, input, target, step=None):
loss = self.executor.forward_backward(input, target)
self.steps_since_update += 1
if self.steps_since_update == self.grad_acc_steps:
if self.executor.scaler is not None:
self.executor.scaler.step(self.optimizer)
self.executor.scaler.update()
else:
self.optimizer.step()
self.optimizer.zero_grad()
self.steps_since_update = 0
torch.cuda.synchronize()
if self.use_ema:
self.ema(self.executor.model, step=step)
return loss
def validation_steps(self) -> Dict[str, Callable]:
vsd: Dict[str, Callable] = {"val": self.executor.forward}
if self.use_ema:
vsd["val_ema"] = self.ema_executor.forward
return vsd
def state_dict(self) -> dict:
res = {
"state_dict": self.executor.model.state_dict(),
"optimizer": self.optimizer.state_dict(),
}
if self.use_ema:
res["state_dict_ema"] = self.ema_executor.model.state_dict()
return res
def train(
train_step,
train_loader,
lr_scheduler,
grad_scale_fn,
log_fn,
timeout_handler,
prof=-1,
step=0,
):
interrupted = False
end = time.time()
data_iter = enumerate(train_loader)
for i, (input, target) in data_iter:
bs = input.size(0)
lr = lr_scheduler(i)
data_time = time.time() - end
loss = train_step(input, target, step=step + i)
it_time = time.time() - end
with torch.no_grad():
if torch.distributed.is_initialized():
reduced_loss = utils.reduce_tensor(loss.detach())
else:
reduced_loss = loss.detach()
log_fn(
compute_ips=utils.calc_ips(bs, it_time - data_time),
total_ips=utils.calc_ips(bs, it_time),
data_time=data_time,
compute_time=it_time - data_time,
lr=lr,
loss=reduced_loss.item(),
grad_scale=grad_scale_fn(),
)
end = time.time()
if prof > 0 and (i + 1 >= prof):
time.sleep(5)
break
if ((i + 1) % 20 == 0) and timeout_handler.interrupted:
time.sleep(5)
interrupted = True
break
return interrupted
def validate(infer_fn, val_loader, log_fn, prof=-1, with_loss=True, topk=5):
top1 = log.AverageMeter()
# the model is expected to already be in eval mode (see Trainer.eval)
end = time.time()
data_iter = enumerate(val_loader)
for i, (input, target) in data_iter:
bs = input.size(0)
data_time = time.time() - end
if with_loss:
loss, output = infer_fn(input, target)
else:
output = infer_fn(input)
with torch.no_grad():
precs = utils.accuracy(output.data, target, topk=(1, topk))
if torch.distributed.is_initialized():
if with_loss:
reduced_loss = utils.reduce_tensor(loss.detach())
precs = map(utils.reduce_tensor, precs)
else:
if with_loss:
reduced_loss = loss.detach()
precs = map(lambda t: t.item(), precs)
infer_result = {f"top{k}": (p, bs) for k, p in zip((1, topk), precs)}
if with_loss:
infer_result["loss"] = (reduced_loss.item(), bs)
torch.cuda.synchronize()
it_time = time.time() - end
top1.record(infer_result["top1"][0], bs)
log_fn(
compute_ips=utils.calc_ips(bs, it_time - data_time),
total_ips=utils.calc_ips(bs, it_time),
data_time=data_time,
compute_time=it_time - data_time,
**infer_result,
)
end = time.time()
if (prof > 0) and (i + 1 >= prof):
time.sleep(5)
break
return top1.get_val()
# Train loop {{{
def train_loop(
trainer: Trainer,
lr_scheduler,
train_loader,
train_loader_len,
val_loader,
logger,
best_prec1=0,
start_epoch=0,
end_epoch=0,
early_stopping_patience=-1,
prof=-1,
skip_training=False,
skip_validation=False,
save_checkpoints=True,
checkpoint_dir="./",
checkpoint_filename="checkpoint.pth.tar",
keep_last_n_checkpoints=0,
topk=5,
):
checkpointer = utils.Checkpointer(
last_filename=checkpoint_filename,
checkpoint_dir=checkpoint_dir,
keep_last_n=keep_last_n_checkpoints,
)
train_metrics = TrainingMetrics(logger)
val_metrics = {
k: ValidationMetrics(logger, k, topk) for k in trainer.validation_steps().keys()
}
training_step = trainer.train_step
prec1 = -1
if early_stopping_patience > 0:
epochs_since_improvement = 0
print(f"RUNNING EPOCHS FROM {start_epoch} TO {end_epoch}")
with utils.TimeoutHandler() as timeout_handler:
interrupted = False
for epoch in range(start_epoch, end_epoch):
if logger is not None:
logger.start_epoch()
if not skip_training:
if logger is not None:
data_iter = logger.iteration_generator_wrapper(
train_loader, mode="train"
)
else:
data_iter = train_loader
trainer.train()
interrupted = train(
training_step,
data_iter,
lambda i: lr_scheduler(trainer.optimizer, i, epoch),
trainer.executor.scaler.get_scale,
train_metrics.log,
timeout_handler,
prof=prof,
step=epoch * train_loader_len,
)
if not skip_validation:
trainer.eval()
for k, infer_fn in trainer.validation_steps().items():
if logger is not None:
data_iter = logger.iteration_generator_wrapper(
val_loader, mode="val"
)
else:
data_iter = val_loader
step_prec1, _ = validate(
infer_fn,
data_iter,
val_metrics[k].log,
prof=prof,
topk=topk,
)
if k == "val":
prec1 = step_prec1
if prec1 > best_prec1:
is_best = True
best_prec1 = prec1
else:
is_best = False
else:
is_best = False
best_prec1 = 0
if logger is not None:
logger.end_epoch()
if save_checkpoints and (
not torch.distributed.is_initialized()
or torch.distributed.get_rank() == 0
):
checkpoint_state = {
"epoch": epoch + 1,
"best_prec1": best_prec1,
**trainer.state_dict(),
}
checkpointer.save_checkpoint(
checkpoint_state,
is_best,
filename=f"checkpoint_{epoch:04}.pth.tar",
)
if early_stopping_patience > 0:
if not is_best:
epochs_since_improvement += 1
else:
epochs_since_improvement = 0
if epochs_since_improvement >= early_stopping_patience:
break
if interrupted:
break
# }}}
# Copyright (c) 2018-2019, NVIDIA CORPORATION
# Copyright (c) 2017- Facebook, Inc
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import math
import os
import shutil
import signal
import time  # needed by timed_generator/timed_function below
import numpy as np
import torch
import torch.distributed as dist
class Checkpointer:
def __init__(self, last_filename, checkpoint_dir="./", keep_last_n=0):
self.last_filename = last_filename
self.checkpoints = []
self.checkpoint_dir = checkpoint_dir
self.keep_last_n = keep_last_n
def cleanup(self):
to_delete = self.checkpoints[: -self.keep_last_n]
self.checkpoints = self.checkpoints[-self.keep_last_n :]
for f in to_delete:
full_path = os.path.join(self.checkpoint_dir, f)
os.remove(full_path)
def get_full_path(self, filename):
return os.path.join(self.checkpoint_dir, filename)
def save_checkpoint(
self,
state,
is_best,
filename,
):
if torch.distributed.is_initialized() and torch.distributed.get_rank() != 0:
raise RuntimeError("save_checkpoint must only be called on rank 0")
full_path = self.get_full_path(filename)
print("SAVING {}".format(full_path))
torch.save(state, full_path)
self.checkpoints.append(filename)
shutil.copyfile(
full_path, self.get_full_path(self.last_filename)
)
if is_best:
shutil.copyfile(
full_path, self.get_full_path("model_best.pth.tar")
)
self.cleanup()
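# Hedged usage sketch (illustrative addition): keep_last_n=0 keeps every
# checkpoint; a positive value prunes older epoch files, while the
# last_filename copy and "model_best.pth.tar" always track the latest and
# best states.
#
#   ckpt = Checkpointer("checkpoint.pth.tar", checkpoint_dir="./", keep_last_n=2)
#   ckpt.save_checkpoint(state, is_best=True, filename="checkpoint_0001.pth.tar")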
def timed_generator(gen):
start = time.time()
for g in gen:
end = time.time()
t = end - start
yield g, t
start = time.time()
def timed_function(f):
def _timed_function(*args, **kwargs):
start = time.time()
ret = f(*args, **kwargs)
return ret, time.time() - start
return _timed_function
def accuracy(output, target, topk=(1,)):
"""Computes the precision@k for the specified values of k"""
maxk = max(topk)
batch_size = target.size(0)
_, pred = output.topk(maxk, 1, True, True)
pred = pred.t()
correct = pred.eq(target.view(1, -1).expand_as(pred))
res = []
for k in topk:
correct_k = correct[:k].float().sum()
res.append(correct_k.mul_(100.0 / batch_size))
return res
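# Hedged sketch (illustrative addition): accuracy() returns one percentage per
# requested k, each as a 0-dim tensor.
if __name__ == "__main__":
    out = torch.tensor([[0.1, 0.9], [0.8, 0.2], [0.3, 0.7]])
    tgt = torch.tensor([1, 0, 0])
    (top1,) = accuracy(out, tgt, topk=(1,))
    print(top1)  # tensor(66.6667): 2 of 3 predictions are correct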
def reduce_tensor(tensor):
rt = tensor.clone().detach()
dist.all_reduce(rt, op=dist.ReduceOp.SUM)
rt /= (
torch.distributed.get_world_size() if torch.distributed.is_initialized() else 1
)
return rt
def first_n(n, generator):
for i, d in zip(range(n), generator):
yield d
class TimeoutHandler:
def __init__(self, sig=signal.SIGTERM):
self.sig = sig
self.device = torch.device("cuda")
@property
def interrupted(self):
if not dist.is_initialized():
return self._interrupted
interrupted = torch.tensor(self._interrupted).int().to(self.device)
dist.broadcast(interrupted, 0)
interrupted = bool(interrupted.item())
return interrupted
def __enter__(self):
self._interrupted = False
self.released = False
self.original_handler = signal.getsignal(self.sig)
def master_handler(signum, frame):
self.release()
self._interrupted = True
print(f"Received SIGTERM")
def ignoring_handler(signum, frame):
self.release()
print("Received SIGTERM, ignoring")
rank = dist.get_rank() if dist.is_initialized() else 0
if rank == 0:
signal.signal(self.sig, master_handler)
else:
signal.signal(self.sig, ignoring_handler)
return self
def __exit__(self, type, value, tb):
self.release()
def release(self):
if self.released:
return False
signal.signal(self.sig, self.original_handler)
self.released = True
return True
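# Hedged usage sketch (illustrative addition; `max_steps` and
# `train_one_step` are placeholders): the handler turns SIGTERM into a flag
# broadcast from rank 0, so all ranks leave the loop at the same step.
#
#   with TimeoutHandler() as th:
#       for step in range(max_steps):
#           train_one_step()
#           if th.interrupted:
#               break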
def calc_ips(batch_size, time):
world_size = (
torch.distributed.get_world_size() if torch.distributed.is_initialized() else 1
)
tbs = world_size * batch_size
return tbs / time