# Commit e129194a (Sugon_ldc): add new model resnet50v1.5
import collections
import itertools
import os
import pathlib
import re
from enum import Enum, auto
from typing import Union
import pynvml
class Device:
# assume nvml returns list of 64 bit ints
_nvml_bit_affinity = 64
_nvml_affinity_elements = (
os.cpu_count() + _nvml_bit_affinity - 1
) // _nvml_bit_affinity
def __init__(self, device_idx):
super().__init__()
self.handle = pynvml.nvmlDeviceGetHandleByIndex(device_idx)
def get_name(self):
return pynvml.nvmlDeviceGetName(self.handle)
def get_uuid(self):
return pynvml.nvmlDeviceGetUUID(self.handle)
def get_cpu_affinity(self):
affinity_string = ""
for j in pynvml.nvmlDeviceGetCpuAffinity(
self.handle, Device._nvml_affinity_elements
):
# assume nvml returns list of 64 bit ints
affinity_string = "{:064b}".format(j) + affinity_string
affinity_list = [int(x) for x in affinity_string]
affinity_list.reverse() # so core 0 is in 0th element of list
ret = [i for i, e in enumerate(affinity_list) if e != 0]
return ret
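# Worked example (hypothetical mask, not queried from real hardware):
# if nvmlDeviceGetCpuAffinity returned the single 64-bit word 0b1111,
# affinity_string would be "00...01111", affinity_list (after the reverse)
# would start [1, 1, 1, 1, 0, ...], and get_cpu_affinity would return
# [0, 1, 2, 3], i.e. the four lowest logical cores.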
def get_thread_siblings_list():
"""
Returns a list of 2-element integer tuples representing pairs of
hyperthreading cores.
"""
path = "/sys/devices/system/cpu/cpu*/topology/thread_siblings_list"
thread_siblings_list = []
pattern = re.compile(r"(\d+)\D(\d+)")
for fname in pathlib.Path(path[0]).glob(path[1:]):
with open(fname) as f:
content = f.read().strip()
res = pattern.findall(content)
if res:
pair = tuple(sorted(map(int, res[0])))
thread_siblings_list.append(pair)
thread_siblings_list = list(set(thread_siblings_list))
return thread_siblings_list
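# Example output on a hypothetical 2-socket machine with hyperthreading,
# where logical cores 0-39 are the first hardware thread of each physical
# core and 40-79 the second:
# get_thread_siblings_list() -> [(0, 40), (1, 41), ..., (39, 79)]
# (order is not guaranteed, since the pairs pass through a set).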
def build_thread_siblings_dict(siblings_list):
siblings_dict = {}
for siblings_tuple in siblings_list:
for core in siblings_tuple:
siblings_dict[core] = siblings_tuple
return siblings_dict
def group_list_by_dict(affinity, siblings_dict):
sorted_affinity = sorted(affinity, key=lambda x: siblings_dict.get(x, (x,)))
grouped = itertools.groupby(
sorted_affinity, key=lambda x: siblings_dict.get(x, (x,))
)
grouped_affinity = []
for key, group in grouped:
grouped_affinity.append(tuple(group))
return grouped_affinity
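# Worked example (hypothetical values): with affinity = [0, 1, 40, 41] and
# siblings_dict = {0: (0, 40), 40: (0, 40), 1: (1, 41), 41: (1, 41)},
# group_list_by_dict returns [(0, 40), (1, 41)] - one tuple per physical core.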
def group_affinity_by_siblings(socket_affinities):
siblings_list = get_thread_siblings_list()
siblings_dict = build_thread_siblings_dict(siblings_list)
grouped_socket_affinities = []
for socket_affinity in socket_affinities:
grouped_socket_affinities.append(
group_list_by_dict(socket_affinity, siblings_dict)
)
return grouped_socket_affinities
def ungroup_affinities(affinities, cores):
ungrouped_affinities = []
for affinity in affinities:
if cores == "all_logical":
ungrouped_affinities.append(list(itertools.chain(*affinity)))
elif cores == "single_logical":
ungrouped_affinities.append([group[0] for group in affinity])
else:
raise RuntimeError("Unknown cores mode")
return ungrouped_affinities
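# Worked example (continuing the values above): for
# affinities = [[(0, 40), (1, 41)]],
# cores='all_logical'    -> [[0, 40, 1, 41]]  (physical cores plus siblings)
# cores='single_logical' -> [[0, 1]]          (one hardware thread per core)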
def check_socket_affinities(socket_affinities):
# sets of cores should be either identical or disjoint
for i, j in itertools.product(socket_affinities, socket_affinities):
if not set(i) == set(j) and not set(i).isdisjoint(set(j)):
raise RuntimeError(
f"Sets of cores should be either identical or disjoint, "
f"but got {i} and {j}."
)
def get_socket_affinities(nproc_per_node, exclude_unavailable_cores=True):
devices = [Device(i) for i in range(nproc_per_node)]
socket_affinities = [dev.get_cpu_affinity() for dev in devices]
if exclude_unavailable_cores:
available_cores = os.sched_getaffinity(0)
socket_affinities = [
list(set(affinity) & available_cores) for affinity in socket_affinities
]
check_socket_affinities(socket_affinities)
return socket_affinities
def get_grouped_socket_affinities(nproc_per_node, exclude_unavailable_cores=True):
socket_affinities = get_socket_affinities(nproc_per_node, exclude_unavailable_cores)
grouped_socket_affinities = group_affinity_by_siblings(socket_affinities)
return grouped_socket_affinities
def set_socket_affinity(gpu_id, nproc_per_node, cores):
"""
The process is assigned with all available physical CPU cores from the CPU
socket connected to the GPU with a given id.
Args:
gpu_id: index of a GPU
nproc_per_node: number of processes per node
cores: 'all_logical' or 'single_logical'
"""
grouped_socket_affinities = get_grouped_socket_affinities(nproc_per_node)
ungrouped_affinities = ungroup_affinities(grouped_socket_affinities, cores)
os.sched_setaffinity(0, ungrouped_affinities[gpu_id])
def set_socket_single_affinity(gpu_id, nproc_per_node, cores):
"""
The process is assigned with the first available physical CPU core from the
list of all CPU physical cores from the CPU socket connected to the GPU with
a given id.
Args:
gpu_id: index of a GPU
nproc_per_node: number of processes per node
cores: 'all_logical' or 'single_logical'
"""
grouped_socket_affinities = get_grouped_socket_affinities(nproc_per_node)
single_grouped_socket_affinities = [
group[:1] for group in grouped_socket_affinities
]
ungrouped_affinities = ungroup_affinities(single_grouped_socket_affinities, cores)
os.sched_setaffinity(0, ungrouped_affinities[gpu_id])
def set_socket_single_unique_affinity(gpu_id, nproc_per_node, cores):
"""
The process is assigned with a single unique available physical CPU core
from the list of all CPU cores from the CPU socket connected to the GPU with
a given id.
Args:
gpu_id: index of a GPU
nproc_per_node: number of processes per node
cores: 'all_logical' or 'single_logical'
"""
grouped_socket_affinities = get_grouped_socket_affinities(nproc_per_node)
affinities = []
assigned_groups = set()
for grouped_socket_affinity in grouped_socket_affinities:
for group in grouped_socket_affinity:
if group not in assigned_groups:
affinities.append([group])
assigned_groups.add(group)
break
ungrouped_affinities = ungroup_affinities(affinities, cores)
os.sched_setaffinity(0, ungrouped_affinities[gpu_id])
def set_socket_unique_affinity(gpu_id, nproc_per_node, cores, mode, balanced=True):
"""
The process is assigned with a unique subset of available physical CPU
cores from the CPU socket connected to a GPU with a given id.
Assignment automatically includes hyperthreading siblings (if siblings are
available).
Args:
gpu_id: index of a GPU
nproc_per_node: number of processes per node
cores: 'all_logical' or 'single_logical'
mode: 'contiguous' or 'interleaved'
balanced: assign an equal number of physical cores to each process.
"""
grouped_socket_affinities = get_grouped_socket_affinities(nproc_per_node)
grouped_socket_affinities_to_device_ids = collections.defaultdict(list)
for idx, grouped_socket_affinity in enumerate(grouped_socket_affinities):
grouped_socket_affinities_to_device_ids[tuple(grouped_socket_affinity)].append(
idx
)
# compute minimal number of physical cores per GPU across all GPUs and
# sockets, code assigns this number of cores per GPU if balanced == True
min_physical_cores_per_gpu = min(
[
len(cores) // len(gpus)
for cores, gpus in grouped_socket_affinities_to_device_ids.items()
]
)
grouped_unique_affinities = [None] * nproc_per_node
for (
grouped_socket_affinity,
device_ids,
) in grouped_socket_affinities_to_device_ids.items():
devices_per_group = len(device_ids)
if balanced:
cores_per_device = min_physical_cores_per_gpu
grouped_socket_affinity = grouped_socket_affinity[
: devices_per_group * min_physical_cores_per_gpu
]
else:
cores_per_device = len(grouped_socket_affinity) // devices_per_group
for socket_subgroup_id, device_id in enumerate(device_ids):
# In theory there should be no performance difference between the
# 'interleaved' and 'contiguous' patterns on Intel-based DGX-1, but
# 'contiguous' should be better for DGX A100 because on AMD Rome
# every 4 consecutive cores share an L3 cache.
# TODO: code doesn't attempt to automatically detect layout of
# L3 cache, also external environment may already exclude some
# cores, this code makes no attempt to detect it and to align
# mapping to multiples of 4.
if mode == "interleaved":
unique_grouped_affinity = list(
grouped_socket_affinity[socket_subgroup_id::devices_per_group]
)
elif mode == "contiguous":
unique_grouped_affinity = list(
grouped_socket_affinity[
socket_subgroup_id
* cores_per_device : (socket_subgroup_id + 1)
* cores_per_device
]
)
else:
raise RuntimeError("Unknown set_socket_unique_affinity mode")
grouped_unique_affinities[device_id] = unique_grouped_affinity
ungrouped_affinities = ungroup_affinities(grouped_unique_affinities, cores)
os.sched_setaffinity(0, ungrouped_affinities[gpu_id])
class AffinityMode(Enum):
none = auto()
socket = auto()
socket_single = auto()
socket_single_unique = auto()
socket_unique_interleaved = auto()
socket_unique_contiguous = auto()
def set_affinity(
gpu_id,
nproc_per_node=None,
*,
mode: Union[str, AffinityMode] = AffinityMode.socket_unique_contiguous,
cores="all_logical",
balanced=True,
):
"""
The process is assigned with a proper CPU affinity that matches CPU-GPU
hardware architecture on a given platform. Usually, it improves and
stabilizes the performance of deep learning training workloads.
This function assumes that the workload runs in multi-process single-device
mode (there are multiple training processes, and each process is running on
a single GPU). This is typical for multi-GPU data-parallel training
workloads (e.g., using `torch.nn.parallel.DistributedDataParallel`).
Available affinity modes:
* 'socket' - the process is assigned with all available physical CPU cores
from the CPU socket connected to the GPU with a given id.
* 'socket_single' - the process is assigned with the first available
physical CPU core from the list of all CPU cores from the CPU socket
connected to the GPU with a given id (multiple GPUs could be assigned with
the same CPU core).
* 'socket_single_unique' - the process is assigned with a single unique
available physical CPU core from the list of all CPU cores from the CPU
socket connected to the GPU with a given id.
* 'socket_unique_interleaved' - the process is assigned with a unique
subset of available physical CPU cores from the CPU socket connected to a
GPU with a given id; cores are assigned with an interleaved indexing pattern.
* 'socket_unique_contiguous' - (the default) the process is assigned with a
unique subset of available physical CPU cores from the CPU socket connected
to a GPU with a given id; cores are assigned with a contiguous indexing
pattern.
Available "cores" modes:
* 'all_logical' - assigns the process with all logical cores associated with
a given corresponding physical core (i.e., automatically includes all
available hyperthreading siblings)
* 'single_logical' - assigns the process with only one logical core
associated with a given corresponding physical core (i.e., excludes
hyperthreading siblings)
'socket_unique_contiguous' is the recommended mode for deep learning
training workloads on NVIDIA DGX machines.
Args:
gpu_id: integer index of a GPU, value from 0 to 'nproc_per_node' - 1
nproc_per_node: number of processes per node
mode: affinity mode
balanced: assign an equal number of physical cores to each process,
affects only 'socket_unique_interleaved' and
'socket_unique_contiguous' affinity modes
cores: 'all_logical' or 'single_logical'
Returns a set of logical CPU cores on which the process is eligible to run.
Example:
import argparse
import os
import gpu_affinity
import torch
def main():
parser = argparse.ArgumentParser()
parser.add_argument(
'--local_rank',
type=int,
default=os.getenv('LOCAL_RANK', 0),
)
args = parser.parse_args()
nproc_per_node = torch.cuda.device_count()
affinity = gpu_affinity.set_affinity(args.local_rank, nproc_per_node)
print(f'{args.local_rank}: core affinity: {affinity}')
if __name__ == "__main__":
main()
Launch the example with:
python -m torch.distributed.launch --nproc_per_node <#GPUs> example.py
WARNING: On DGX A100, only half of the CPU cores have direct access to GPUs.
This function restricts execution only to the CPU cores directly connected
to GPUs, so on DGX A100, it will limit the code to half of the CPU cores and
half of CPU memory bandwidth (which may be fine for many DL models).
WARNING: Intel's OpenMP implementation resets affinity on the first call to
an OpenMP function after a fork. It's recommended to run with env variable:
`KMP_AFFINITY=disabled` if the affinity set by gpu_affinity should be
preserved after a fork (e.g. in PyTorch DataLoader workers).
"""
if not isinstance(mode, AffinityMode):
mode = AffinityMode[mode]
pynvml.nvmlInit()
if nproc_per_node is None:
nproc_per_node = pynvml.nvmlDeviceGetCount()
if mode == AffinityMode.none:
pass
elif mode == AffinityMode.socket:
set_socket_affinity(gpu_id, nproc_per_node, cores)
elif mode == AffinityMode.socket_single:
set_socket_single_affinity(gpu_id, nproc_per_node, cores)
elif mode == AffinityMode.socket_single_unique:
set_socket_single_unique_affinity(gpu_id, nproc_per_node, cores)
elif mode == AffinityMode.socket_unique_interleaved:
set_socket_unique_affinity(
gpu_id, nproc_per_node, cores, "interleaved", balanced
)
elif mode == AffinityMode.socket_unique_contiguous:
set_socket_unique_affinity(
gpu_id, nproc_per_node, cores, "contiguous", balanced
)
else:
raise RuntimeError("Unknown affinity mode")
affinity = os.sched_getaffinity(0)
return affinity
# Copyright (c) 2018-2019, NVIDIA CORPORATION
# Copyright (c) 2017- Facebook, Inc
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
from collections import OrderedDict
from numbers import Number
import dllogger
import numpy as np
def format_step(step):
if isinstance(step, str):
return step
s = ""
if len(step) > 0:
if isinstance(step[0], Number):
s += "Epoch: {} ".format(step[0])
else:
s += "{} ".format(step[0])
if len(step) > 1:
s += "Iteration: {} ".format(step[1])
if len(step) > 2:
s += "Validation Iteration: {} ".format(step[2])
if len(step) == 0:
s = "Summary:"
return s
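# Examples of the step formats produced above (illustrative only):
# format_step(tuple())     -> "Summary:"
# format_step((1,))        -> "Epoch: 1 "
# format_step((1, 20))     -> "Epoch: 1 Iteration: 20 "
# format_step((1, 20, 5))  -> "Epoch: 1 Iteration: 20 Validation Iteration: 5 "
# format_step("PARAMETER") -> "PARAMETER"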
PERF_METER = lambda: Meter(AverageMeter(), AverageMeter(), AverageMeter())
LOSS_METER = lambda: Meter(AverageMeter(), AverageMeter(), MinMeter())
ACC_METER = lambda: Meter(AverageMeter(), AverageMeter(), MaxMeter())
LR_METER = lambda: Meter(LastMeter(), LastMeter(), LastMeter())
LAT_100 = lambda: Meter(QuantileMeter(1), QuantileMeter(1), QuantileMeter(1))
LAT_99 = lambda: Meter(QuantileMeter(0.99), QuantileMeter(0.99), QuantileMeter(0.99))
LAT_95 = lambda: Meter(QuantileMeter(0.95), QuantileMeter(0.95), QuantileMeter(0.95))
class Meter(object):
def __init__(self, iteration_aggregator, epoch_aggregator, run_aggregator):
self.run_aggregator = run_aggregator
self.epoch_aggregator = epoch_aggregator
self.iteration_aggregator = iteration_aggregator
def record(self, val, n=1):
self.iteration_aggregator.record(val, n=n)
def get_iteration(self):
v, n = self.iteration_aggregator.get_val()
return v
def reset_iteration(self):
v, n = self.iteration_aggregator.get_data()
self.iteration_aggregator.reset()
if v is not None:
self.epoch_aggregator.record(v, n=n)
def get_epoch(self):
v, n = self.epoch_aggregator.get_val()
return v
def reset_epoch(self):
v, n = self.epoch_aggregator.get_data()
self.epoch_aggregator.reset()
if v is not None:
self.run_aggregator.record(v, n=n)
def get_run(self):
v, n = self.run_aggregator.get_val()
return v
def reset_run(self):
self.run_aggregator.reset()
class QuantileMeter(object):
def __init__(self, q):
self.q = q
self.reset()
def reset(self):
self.vals = []
self.n = 0
def record(self, val, n=1):
if isinstance(val, list):
self.vals += val
self.n += len(val)
else:
self.vals += [val] * n
self.n += n
def get_val(self):
if not self.vals:
return None, self.n
return np.quantile(self.vals, self.q, interpolation="nearest"), self.n
def get_data(self):
return self.vals, self.n
class MaxMeter(object):
def __init__(self):
self.reset()
def reset(self):
self.max = None
self.n = 0
def record(self, val, n=1):
if self.max is None:
self.max = val
else:
self.max = max(self.max, val)
self.n = n
def get_val(self):
return self.max, self.n
def get_data(self):
return self.max, self.n
class MinMeter(object):
def __init__(self):
self.reset()
def reset(self):
self.min = None
self.n = 0
def record(self, val, n=1):
if self.min is None:
self.min = val
else:
self.min = min(self.min, val)
self.n = n
def get_val(self):
return self.min, self.n
def get_data(self):
return self.min, self.n
class LastMeter(object):
def __init__(self):
self.reset()
def reset(self):
self.last = None
self.n = 0
def record(self, val, n=1):
self.last = val
self.n = n
def get_val(self):
return self.last, self.n
def get_data(self):
return self.last, self.n
class AverageMeter(object):
def __init__(self):
self.reset()
def reset(self):
self.n = 0
self.val = 0
def record(self, val, n=1):
self.n += n
self.val += val * n
def get_val(self):
if self.n == 0:
return None, 0
return self.val / self.n, self.n
def get_data(self):
if self.n == 0:
return None, 0
return self.val / self.n, self.n
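# Minimal sketch of the iteration -> epoch -> run aggregation cascade of
# Meter; the _demo_meter name is hypothetical and not part of the original API.
def _demo_meter():
    m = Meter(AverageMeter(), AverageMeter(), AverageMeter())
    m.record(10.0)
    m.record(20.0)
    print(m.get_iteration())  # 15.0 (average within the current iteration)
    m.reset_iteration()       # folds 15.0 into the epoch aggregator
    print(m.get_epoch())      # 15.0
    m.reset_epoch()           # folds the epoch average into the run aggregator
    print(m.get_run())        # 15.0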
class Logger(object):
def __init__(self, print_interval, backends, start_epoch=-1, verbose=False):
self.epoch = start_epoch
self.iteration = -1
self.val_iteration = -1
self.calib_iteration = -1
self.metrics = OrderedDict()
self.backends = backends
self.print_interval = print_interval
self.verbose = verbose
dllogger.init(backends)
def log_parameter(self, data, verbosity=0):
dllogger.log(step="PARAMETER", data=data, verbosity=verbosity)
def register_metric(self, metric_name, meter, verbosity=0, metadata={}):
if self.verbose:
print("Registering metric: {}".format(metric_name))
self.metrics[metric_name] = {"meter": meter, "level": verbosity}
dllogger.metadata(metric_name, metadata)
def log_metric(self, metric_name, val, n=1):
self.metrics[metric_name]["meter"].record(val, n=n)
def start_iteration(self, mode="train"):
if mode == "val":
self.val_iteration += 1
elif mode == "train":
self.iteration += 1
elif mode == "calib":
self.calib_iteration += 1
def end_iteration(self, mode="train"):
if mode == "val":
it = self.val_iteration
elif mode == "train":
it = self.iteration
elif mode == "calib":
it = self.calib_iteration
if it % self.print_interval == 0 or mode == "calib":
metrics = {n: m for n, m in self.metrics.items() if n.startswith(mode)}
if mode == "train":
step = (self.epoch, self.iteration)
elif mode == "val":
step = (self.epoch, self.iteration, self.val_iteration)
elif mode == "calib":
step = ("Calibration", self.calib_iteration)
verbositys = {m["level"] for _, m in metrics.items()}
for ll in verbositys:
llm = {n: m for n, m in metrics.items() if m["level"] == ll}
dllogger.log(
step=step,
data={n: m["meter"].get_iteration() for n, m in llm.items()},
verbosity=ll,
)
for n, m in metrics.items():
m["meter"].reset_iteration()
dllogger.flush()
def start_epoch(self):
self.epoch += 1
self.iteration = 0
self.val_iteration = 0
for n, m in self.metrics.items():
if not n.startswith("calib"):
m["meter"].reset_epoch()
def end_epoch(self):
for n, m in self.metrics.items():
if not n.startswith("calib"):
m["meter"].reset_iteration()
verbositys = {m["level"] for _, m in self.metrics.items()}
for ll in verbositys:
llm = {n: m for n, m in self.metrics.items() if m["level"] == ll}
dllogger.log(
step=(self.epoch,),
data={n: m["meter"].get_epoch() for n, m in llm.items()},
)
def start_calibration(self):
self.calib_iteration = 0
for n, m in self.metrics.items():
if n.startswith("calib"):
m["meter"].reset_epoch()
def end_calibration(self):
for n, m in self.metrics.items():
if n.startswith("calib"):
m["meter"].reset_iteration()
def end(self):
for n, m in self.metrics.items():
m["meter"].reset_epoch()
verbositys = {m["level"] for _, m in self.metrics.items()}
for ll in verbositys:
llm = {n: m for n, m in self.metrics.items() if m["level"] == ll}
dllogger.log(
step=tuple(), data={n: m["meter"].get_run() for n, m in llm.items()}
)
for n, m in self.metrics.items():
m["meter"].reset_epoch()
dllogger.flush()
def iteration_generator_wrapper(self, gen, mode="train"):
for g in gen:
self.start_iteration(mode=mode)
yield g
self.end_iteration(mode=mode)
def epoch_generator_wrapper(self, gen):
for g in gen:
self.start_epoch()
yield g
self.end_epoch()
class Metrics:
ACC_METADATA = {"unit": "%", "format": ":.2f"}
IPS_METADATA = {"unit": "images/s", "format": ":.2f"}
TIME_METADATA = {"unit": "s", "format": ":.5f"}
LOSS_METADATA = {"unit": None, "format": ":.5f"}
LR_METADATA = {"unit": None, "format": ":.5f"}
def __init__(self, logger):
self.logger = logger
self.map = {}
def log(self, **kwargs):
if self.logger is None:
return
for k, v in kwargs.items():
tks = self.map.get(k, [k])
for tk in tks:
if isinstance(v, tuple):
self.logger.log_metric(tk, v[0], v[1])
else:
self.logger.log_metric(tk, v)
class TrainingMetrics(Metrics):
def __init__(self, logger):
super().__init__(logger)
if self.logger is not None:
self.map = {
"loss": ["train.loss"],
"compute_ips": ["train.compute_ips"],
"total_ips": ["train.total_ips"],
"data_time": ["train.data_time"],
"compute_time": ["train.compute_time"],
"lr": ["train.lr"],
"grad_scale": ["train.grad_scale"],
}
logger.register_metric(
"train.loss",
LOSS_METER(),
verbosity=dllogger.Verbosity.DEFAULT,
metadata=Metrics.LOSS_METADATA,
)
logger.register_metric(
"train.compute_ips",
PERF_METER(),
verbosity=dllogger.Verbosity.DEFAULT,
metadata=Metrics.IPS_METADATA,
)
logger.register_metric(
"train.total_ips",
PERF_METER(),
verbosity=dllogger.Verbosity.DEFAULT,
metadata=Metrics.IPS_METADATA,
)
logger.register_metric(
"train.data_time",
PERF_METER(),
verbosity=dllogger.Verbosity.VERBOSE,
metadata=Metrics.TIME_METADATA,
)
logger.register_metric(
"train.compute_time",
PERF_METER(),
verbosity=dllogger.Verbosity.VERBOSE,
metadata=Metrics.TIME_METADATA,
)
logger.register_metric(
"train.lr",
LR_METER(),
verbosity=dllogger.Verbosity.DEFAULT,
)
logger.register_metric(
"train.grad_scale",
PERF_METER(),
verbosity=dllogger.Verbosity.DEFAULT,
metadata=Metrics.LOSS_METADATA,
)
class ValidationMetrics(Metrics):
def __init__(self, logger, prefix, topk):
super().__init__(logger)
if self.logger is not None:
self.map = {
"loss": [f"{prefix}.loss"],
"top1": [f"{prefix}.top1"],
f"top{topk}": [f"{prefix}.top{topk}"],
"compute_ips": [f"{prefix}.compute_ips"],
"total_ips": [f"{prefix}.total_ips"],
"data_time": [f"{prefix}.data_time"],
"compute_time": [
f"{prefix}.compute_latency",
f"{prefix}.compute_latency_at100",
f"{prefix}.compute_latency_at99",
f"{prefix}.compute_latency_at95",
],
}
logger.register_metric(
f"{prefix}.top1",
ACC_METER(),
verbosity=dllogger.Verbosity.DEFAULT,
metadata=Metrics.ACC_METADATA,
)
logger.register_metric(
f"{prefix}.top{topk}",
ACC_METER(),
verbosity=dllogger.Verbosity.DEFAULT,
metadata=Metrics.ACC_METADATA,
)
logger.register_metric(
f"{prefix}.loss",
LOSS_METER(),
verbosity=dllogger.Verbosity.DEFAULT,
metadata=Metrics.LOSS_METADATA,
)
logger.register_metric(
f"{prefix}.compute_ips",
PERF_METER(),
verbosity=dllogger.Verbosity.DEFAULT,
metadata=Metrics.IPS_METADATA,
)
logger.register_metric(
f"{prefix}.total_ips",
PERF_METER(),
verbosity=dllogger.Verbosity.DEFAULT,
metadata=Metrics.IPS_METADATA,
)
logger.register_metric(
f"{prefix}.data_time",
PERF_METER(),
verbosity=dllogger.Verbosity.VERBOSE,
metadata=Metrics.TIME_METADATA,
)
logger.register_metric(
f"{prefix}.compute_latency",
PERF_METER(),
verbosity=dllogger.Verbosity.DEFAULT,
metadata=Metrics.TIME_METADATA,
)
logger.register_metric(
f"{prefix}.compute_latency_at100",
LAT_100(),
verbosity=dllogger.Verbosity.VERBOSE,
metadata=Metrics.TIME_METADATA,
)
logger.register_metric(
f"{prefix}.compute_latency_at99",
LAT_99(),
verbosity=dllogger.Verbosity.VERBOSE,
metadata=Metrics.TIME_METADATA,
)
logger.register_metric(
f"{prefix}.compute_latency_at95",
LAT_95(),
verbosity=dllogger.Verbosity.VERBOSE,
metadata=Metrics.TIME_METADATA,
)
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the BSD 3-Clause License (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://opensource.org/licenses/BSD-3-Clause
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.nn as nn
import numpy as np
def mixup(alpha, data, target):
with torch.no_grad():
bs = data.size(0)
c = np.random.beta(alpha, alpha)
perm = torch.randperm(bs).cuda()
md = c * data + (1 - c) * data[perm, :]
mt = c * target + (1 - c) * target[perm, :]
return md, mt
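# Hedged usage sketch (requires a CUDA device; names are illustrative):
# data, target = batch_images, one_hot_labels    # both on GPU
# mixed_data, mixed_target = mixup(0.2, data, target)
# With alpha=0.2 the Beta(0.2, 0.2) coefficient c is usually close to 0 or 1,
# so most mixed samples stay near one of the two originals.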
class MixUpWrapper(object):
def __init__(self, alpha, dataloader):
self.alpha = alpha
self.dataloader = dataloader
def mixup_loader(self, loader):
for input, target in loader:
i, t = mixup(self.alpha, input, target)
yield i, t
def __iter__(self):
return self.mixup_loader(self.dataloader)
def __len__(self):
return len(self.dataloader)
class NLLMultiLabelSmooth(nn.Module):
def __init__(self, smoothing=0.0):
super(NLLMultiLabelSmooth, self).__init__()
self.confidence = 1.0 - smoothing
self.smoothing = smoothing
def forward(self, x, target):
if self.training:
x = x.float()
target = target.float()
logprobs = torch.nn.functional.log_softmax(x, dim=-1)
nll_loss = -logprobs * target
nll_loss = nll_loss.sum(-1)
smooth_loss = -logprobs.mean(dim=-1)
loss = self.confidence * nll_loss + self.smoothing * smooth_loss
return loss.mean()
else:
return torch.nn.functional.cross_entropy(x, target)
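# Minimal CPU-only sketch showing NLLMultiLabelSmooth on soft (mixup-style)
# targets; the _demo_smooth_loss name is hypothetical.
def _demo_smooth_loss():
    criterion = NLLMultiLabelSmooth(smoothing=0.1)
    criterion.train()                       # take the soft-target branch
    logits = torch.randn(4, 10)             # batch of 4, 10 classes
    target = torch.zeros(4, 10)
    target[torch.arange(4), torch.randint(10, (4,))] = 1.0  # one-hot targets
    print(criterion(logits, target))        # scalar smoothed NLL loss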
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the BSD 3-Clause License (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://opensource.org/licenses/BSD-3-Clause
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .entrypoints import nvidia_convnets_processing_utils, nvidia_efficientnet
from .resnet import resnet50, resnext101_32x4d, se_resnext101_32x4d
from .efficientnet import (
efficientnet_b0,
efficientnet_b4,
efficientnet_widese_b0,
efficientnet_widese_b4,
efficientnet_quant_b0,
efficientnet_quant_b4,
)
import copy
from collections import OrderedDict
from dataclasses import dataclass
from typing import Optional
import torch
import warnings
from torch import nn
import torch.nn.functional as F
try:
from pytorch_quantization import nn as quant_nn
except ImportError as e:
warnings.warn(
"pytorch_quantization module not found, quantization will not be available"
)
quant_nn = None
# LayerBuilder {{{
class LayerBuilder(object):
@dataclass
class Config:
activation: str = "relu"
conv_init: str = "fan_in"
bn_momentum: Optional[float] = None
bn_epsilon: Optional[float] = None
def __init__(self, config: "LayerBuilder.Config"):
self.config = config
def conv(
self,
kernel_size,
in_planes,
out_planes,
groups=1,
stride=1,
bn=False,
zero_init_bn=False,
act=False,
):
conv = nn.Conv2d(
in_planes,
out_planes,
kernel_size=kernel_size,
groups=groups,
stride=stride,
padding=int((kernel_size - 1) / 2),
bias=False,
)
nn.init.kaiming_normal_(
conv.weight, mode=self.config.conv_init, nonlinearity="relu"
)
layers = [("conv", conv)]
if bn:
layers.append(("bn", self.batchnorm(out_planes, zero_init_bn)))
if act:
layers.append(("act", self.activation()))
if bn or act:
return nn.Sequential(OrderedDict(layers))
else:
return conv
def convDepSep(
self, kernel_size, in_planes, out_planes, stride=1, bn=False, act=False
):
"""3x3 depthwise separable convolution with padding"""
c = self.conv(
kernel_size,
in_planes,
out_planes,
groups=in_planes,
stride=stride,
bn=bn,
act=act,
)
return c
def conv3x3(self, in_planes, out_planes, stride=1, groups=1, bn=False, act=False):
"""3x3 convolution with padding"""
c = self.conv(
3, in_planes, out_planes, groups=groups, stride=stride, bn=bn, act=act
)
return c
def conv1x1(self, in_planes, out_planes, stride=1, groups=1, bn=False, act=False):
"""1x1 convolution with padding"""
c = self.conv(
1, in_planes, out_planes, groups=groups, stride=stride, bn=bn, act=act
)
return c
def conv7x7(self, in_planes, out_planes, stride=1, groups=1, bn=False, act=False):
"""7x7 convolution with padding"""
c = self.conv(
7, in_planes, out_planes, groups=groups, stride=stride, bn=bn, act=act
)
return c
def conv5x5(self, in_planes, out_planes, stride=1, groups=1, bn=False, act=False):
"""5x5 convolution with padding"""
c = self.conv(
5, in_planes, out_planes, groups=groups, stride=stride, bn=bn, act=act
)
return c
def batchnorm(self, planes, zero_init=False):
bn_cfg = {}
if self.config.bn_momentum is not None:
bn_cfg["momentum"] = self.config.bn_momentum
if self.config.bn_epsilon is not None:
bn_cfg["eps"] = self.config.bn_epsilon
bn = nn.BatchNorm2d(planes, **bn_cfg)
gamma_init_val = 0 if zero_init else 1
nn.init.constant_(bn.weight, gamma_init_val)
nn.init.constant_(bn.bias, 0)
return bn
def activation(self):
return {
"silu": lambda: nn.SiLU(inplace=True),
"relu": lambda: nn.ReLU(inplace=True),
"onnx-silu": ONNXSiLU,
}[self.config.activation]()
# LayerBuilder }}}
# LambdaLayer {{{
class LambdaLayer(nn.Module):
def __init__(self, lmbd):
super().__init__()
self.lmbd = lmbd
def forward(self, x):
return self.lmbd(x)
# }}}
# SqueezeAndExcitation {{{
class SqueezeAndExcitation(nn.Module):
def __init__(self, in_channels, squeeze, activation):
super(SqueezeAndExcitation, self).__init__()
self.squeeze = nn.Linear(in_channels, squeeze)
self.expand = nn.Linear(squeeze, in_channels)
self.activation = activation
self.sigmoid = nn.Sigmoid()
def forward(self, x):
return self._attention(x)
def _attention(self, x):
out = torch.mean(x, [2, 3])
out = self.squeeze(out)
out = self.activation(out)
out = self.expand(out)
out = self.sigmoid(out)
out = out.unsqueeze(2).unsqueeze(3)
return out
class SqueezeAndExcitationTRT(nn.Module):
def __init__(self, in_channels, squeeze, activation):
super(SqueezeAndExcitationTRT, self).__init__()
self.pooling = nn.AdaptiveAvgPool2d(1)
self.squeeze = nn.Conv2d(in_channels, squeeze, 1)
self.expand = nn.Conv2d(squeeze, in_channels, 1)
self.activation = activation
self.sigmoid = nn.Sigmoid()
def forward(self, x):
return self._attention(x)
def _attention(self, x):
out = self.pooling(x)
out = self.squeeze(out)
out = self.activation(out)
out = self.expand(out)
out = self.sigmoid(out)
return out
# }}}
# EMA {{{
class EMA:
def __init__(self, mu, module_ema):
self.mu = mu
self.module_ema = module_ema
def __call__(self, module, step=None):
if step is None:
mu = self.mu
else:
mu = min(self.mu, (1.0 + step) / (10 + step))
def strip_module(s: str) -> str:
return s
mesd = self.module_ema.state_dict()
with torch.no_grad():
for name, x in module.state_dict().items():
if name.endswith("num_batches_tracked"):
continue
n = strip_module(name)
mesd[n].mul_(mu)
mesd[n].add_((1.0 - mu) * x)
# }}}
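# Hedged usage sketch for EMA (names are illustrative): keep a deep copy of
# the model and blend its weights toward the live model after each step.
# model_ema = copy.deepcopy(model)
# ema = EMA(mu=0.999, module_ema=model_ema)
# for step, batch in enumerate(loader):
#     ...  # forward/backward/optimizer.step()
#     ema(model, step=step)  # model_ema <- mu * model_ema + (1 - mu) * model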
# ONNXSiLU {{{
# Since torch.nn.SiLU is not supported by ONNX export, this implementation
# must be used in the exported model (it needs 15-20% more GPU memory)
class ONNXSiLU(nn.Module):
def __init__(self, *args, **kwargs):
super(ONNXSiLU, self).__init__()
def forward(self, x):
return x * torch.sigmoid(x)
# }}}
class SequentialSqueezeAndExcitation(SqueezeAndExcitation):
def __init__(self, in_channels, squeeze, activation, quantized=False):
super().__init__(in_channels, squeeze, activation)
self.quantized = quantized
if quantized:
assert quant_nn is not None, "pytorch_quantization is not available"
self.mul_a_quantizer = quant_nn.TensorQuantizer(
quant_nn.QuantConv2d.default_quant_desc_input
)
self.mul_b_quantizer = quant_nn.TensorQuantizer(
quant_nn.QuantConv2d.default_quant_desc_input
)
else:
self.mul_a_quantizer = nn.Identity()
self.mul_b_quantizer = nn.Identity()
def forward(self, x):
out = self._attention(x)
if not self.quantized:
return out * x
else:
x_quant = self.mul_a_quantizer(out)
return x_quant * self.mul_b_quantizer(x)
class SequentialSqueezeAndExcitationTRT(SqueezeAndExcitationTRT):
def __init__(self, in_channels, squeeze, activation, quantized=False):
super().__init__(in_channels, squeeze, activation)
self.quantized = quantized
if quantized:
assert quant_nn is not None, "pytorch_quantization is not available"
self.mul_a_quantizer = quant_nn.TensorQuantizer(
quant_nn.QuantConv2d.default_quant_desc_input
)
self.mul_b_quantizer = quant_nn.TensorQuantizer(
quant_nn.QuantConv2d.default_quant_desc_input
)
else:
self.mul_a_quantizer = nn.Identity()
self.mul_b_quantizer = nn.Identity()
def forward(self, x):
out = self._attention(x)
if not self.quantized:
return out * x
else:
x_quant = self.mul_a_quantizer(out)
return x_quant * self.mul_b_quantizer(x)
class StochasticDepthResidual(nn.Module):
def __init__(self, survival_prob: float):
super().__init__()
self.survival_prob = survival_prob
self.register_buffer("mask", torch.ones(()), persistent=False)
def forward(self, residual: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
if not self.training:
return torch.add(residual, other=x)
else:
with torch.no_grad():
mask = F.dropout(
self.mask,
p=1 - self.survival_prob,
training=self.training,
inplace=False,
)
return torch.addcmul(residual, mask, x)
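# Note on the training branch above: F.dropout with p = 1 - survival_prob
# either zeroes the mask or scales it to 1 / survival_prob, so the residual
# branch is dropped with probability 1 - survival_prob and rescaled so the
# expected value of the output is unchanged.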
class Flatten(nn.Module):
def forward(self, x: torch.Tensor) -> torch.Tensor:
return x.squeeze(-1).squeeze(-1)
import argparse
import random
import math
import warnings
from typing import List, Any, Optional
from collections import namedtuple, OrderedDict
from dataclasses import dataclass, replace
import torch
from torch import nn
from functools import partial
try:
from pytorch_quantization import nn as quant_nn
from ..quantization import switch_on_quantization
except ImportError as e:
warnings.warn(
"pytorch_quantization module not found, quantization will not be available"
)
quant_nn = None
import contextlib
@contextlib.contextmanager
def switch_on_quantization(do_quantization=False):
assert not do_quantization, "quantization is not available"
try:
yield
finally:
pass
from .common import (
SequentialSqueezeAndExcitation,
SequentialSqueezeAndExcitationTRT,
LayerBuilder,
StochasticDepthResidual,
Flatten,
)
from .model import (
Model,
ModelParams,
ModelArch,
OptimizerParams,
create_entrypoint,
EntryPoint,
)
# EffNetArch {{{
@dataclass
class EffNetArch(ModelArch):
block: Any
stem_channels: int
feature_channels: int
kernel: List[int]
stride: List[int]
num_repeat: List[int]
expansion: List[int]
channels: List[int]
default_image_size: int
squeeze_excitation_ratio: float = 0.25
def enumerate(self):
return enumerate(
zip(
self.kernel, self.stride, self.num_repeat, self.expansion, self.channels
)
)
def num_layers(self):
_f = lambda l: len(set(map(len, l)))
l = [self.kernel, self.stride, self.num_repeat, self.expansion, self.channels]
assert _f(l) == 1
return len(self.kernel)
@staticmethod
def _scale_width(width_coeff, divisor=8):
def _sw(num_channels):
num_channels *= width_coeff
# Rounding should not go down by more than 10%
rounded_num_channels = max(
divisor, int(num_channels + divisor / 2) // divisor * divisor
)
if rounded_num_channels < 0.9 * num_channels:
rounded_num_channels += divisor
return rounded_num_channels
return _sw
@staticmethod
def _scale_depth(depth_coeff):
def _sd(num_repeat):
return int(math.ceil(num_repeat * depth_coeff))
return _sd
def scale(self, wc, dc, dis, divisor=8) -> "EffNetArch":
sw = EffNetArch._scale_width(wc, divisor=divisor)
sd = EffNetArch._scale_depth(dc)
return EffNetArch(
block=self.block,
stem_channels=sw(self.stem_channels),
feature_channels=sw(self.feature_channels),
kernel=self.kernel,
stride=self.stride,
num_repeat=list(map(sd, self.num_repeat)),
expansion=self.expansion,
channels=list(map(sw, self.channels)),
default_image_size=dis,
squeeze_excitation_ratio=self.squeeze_excitation_ratio,
)
# }}}
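# Worked example of scale() (values follow from the arithmetic above):
# effnet_b0_layers.scale(wc=1.4, dc=1.8, dis=380) - i.e. EfficientNet-B4 -
# maps stem_channels 32 -> 48, channels [16, 24, 40, 80, 112, 192, 320] ->
# [24, 32, 56, 112, 160, 272, 448], and num_repeat [1, 2, 2, 3, 3, 4, 1] ->
# [2, 4, 4, 6, 6, 8, 2].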
# EffNetParams {{{
@dataclass
class EffNetParams(ModelParams):
dropout: float
num_classes: int = 1000
activation: str = "silu"
conv_init: str = "fan_in"
bn_momentum: float = 1 - 0.99
bn_epsilon: float = 1e-3
survival_prob: float = 1
quantized: bool = False
trt: bool = False
def parser(self, name):
p = super().parser(name)
p.add_argument(
"--num_classes",
metavar="N",
default=self.num_classes,
type=int,
help="number of classes",
)
p.add_argument(
"--conv_init",
default=self.conv_init,
choices=["fan_in", "fan_out"],
type=str,
help="initialization mode for convolutional layers, see https://pytorch.org/docs/stable/nn.init.html#torch.nn.init.kaiming_normal_",
)
p.add_argument(
"--bn_momentum",
default=self.bn_momentum,
type=float,
help="Batch Norm momentum",
)
p.add_argument(
"--bn_epsilon",
default=self.bn_epsilon,
type=float,
help="Batch Norm epsilon",
)
p.add_argument(
"--survival_prob",
default=self.survival_prob,
type=float,
help="Survival probability for stochastic depth",
)
p.add_argument(
"--dropout", default=self.dropout, type=float, help="Dropout drop prob"
)
p.add_argument("--trt", metavar="True|False", default=self.trt, type=bool)
return p
# }}}
class EfficientNet(nn.Module):
def __init__(
self,
arch: EffNetArch,
dropout: float,
num_classes: int = 1000,
activation: str = "silu",
conv_init: str = "fan_in",
bn_momentum: float = 1 - 0.99,
bn_epsilon: float = 1e-3,
survival_prob: float = 1,
quantized: bool = False,
trt: bool = False,
):
self.quantized = quantized
with switch_on_quantization(self.quantized):
super(EfficientNet, self).__init__()
self.arch = arch
self.num_layers = arch.num_layers()
self.num_blocks = sum(arch.num_repeat)
self.survival_prob = survival_prob
self.builder = LayerBuilder(
LayerBuilder.Config(
activation=activation,
conv_init=conv_init,
bn_momentum=bn_momentum,
bn_epsilon=bn_epsilon,
)
)
self.stem = self._make_stem(arch.stem_channels)
out_channels = arch.stem_channels
plc = 0
layers = []
for i, (k, s, r, e, c) in arch.enumerate():
layer, out_channels = self._make_layer(
block=arch.block,
kernel_size=k,
stride=s,
num_repeat=r,
expansion=e,
in_channels=out_channels,
out_channels=c,
squeeze_excitation_ratio=arch.squeeze_excitation_ratio,
prev_layer_count=plc,
trt=trt,
)
plc = plc + r
layers.append(layer)
self.layers = nn.Sequential(*layers)
self.features = self._make_features(out_channels, arch.feature_channels)
self.classifier = self._make_classifier(
arch.feature_channels, num_classes, dropout
)
def forward(self, x):
x = self.stem(x)
x = self.layers(x)
x = self.features(x)
x = self.classifier(x)
return x
def extract_features(self, x, layers=None):
if layers is None:
layers = [f"layer{i+1}" for i in range(self.num_layers)] + [
"features",
"classifier",
]
run = [
i
for i in range(self.num_layers)
if "classifier" in layers
or "features" in layers
or any([f"layer{j+1}" in layers for j in range(i, self.num_layers)])
]
output = {}
x = self.stem(x)
for l in run:
fn = self.layers[l]
x = fn(x)
if f"layer{l+1}" in layers:
output[f"layer{l+1}"] = x
if "features" in layers or "classifier" in layers:
x = self.features(x)
if "features" in layers:
output["features"] = x
if "classifier" in layers:
output["classifier"] = self.classifier(x)
return output
# helper functions {{{
def _make_stem(self, stem_width):
return nn.Sequential(
OrderedDict(
[
("conv", self.builder.conv3x3(3, stem_width, stride=2)),
("bn", self.builder.batchnorm(stem_width)),
("activation", self.builder.activation()),
]
)
)
def _get_survival_prob(self, block_id):
drop_rate = 1.0 - self.survival_prob
sp = 1.0 - drop_rate * float(block_id) / self.num_blocks
return sp
def _make_features(self, in_channels, num_features):
return nn.Sequential(
OrderedDict(
[
("conv", self.builder.conv1x1(in_channels, num_features)),
("bn", self.builder.batchnorm(num_features)),
("activation", self.builder.activation()),
]
)
)
def _make_classifier(self, num_features, num_classes, dropout):
return nn.Sequential(
OrderedDict(
[
("pooling", nn.AdaptiveAvgPool2d(1)),
("squeeze", Flatten()),
("dropout", nn.Dropout(dropout)),
("fc", nn.Linear(num_features, num_classes)),
]
)
)
def _make_layer(
self,
block,
kernel_size,
stride,
num_repeat,
expansion,
in_channels,
out_channels,
squeeze_excitation_ratio,
prev_layer_count,
trt,
):
layers = []
idx = 0
survival_prob = self._get_survival_prob(idx + prev_layer_count)
blk = block(
self.builder,
kernel_size,
in_channels,
out_channels,
expansion,
stride,
self.arch.squeeze_excitation_ratio,
survival_prob if stride == 1 and in_channels == out_channels else 1.0,
self.quantized,
trt=trt,
)
layers.append((f"block{idx}", blk))
for idx in range(1, num_repeat):
survival_prob = self._get_survival_prob(idx + prev_layer_count)
blk = block(
self.builder,
kernel_size,
out_channels,
out_channels,
expansion,
1, # stride
squeeze_excitation_ratio,
survival_prob,
self.quantized,
trt=trt,
)
layers.append((f"block{idx}", blk))
return nn.Sequential(OrderedDict(layers)), out_channels
def ngc_checkpoint_remap(self, url=None, version=None):
if version is None:
version = url.split("/")[8]
def to_sequential_remap(s):
parts = s.split(".")
if parts[0].startswith("layer"):
return ".".join(
["layers." + str(int(parts[0][len("layer") :]) - 1)] + parts[1:]
)
else:
return s
def no_remap(s):
return s
return {"20.12.0": to_sequential_remap, "21.03.0": to_sequential_remap}.get(
version, no_remap
)
# }}}
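# Worked example for the checkpoint remap above: for NGC checkpoint versions
# 20.12.0 and 21.03.0, to_sequential_remap("layer1.block0.conv.weight")
# returns "layers.0.block0.conv.weight"; keys that do not start with "layer"
# pass through unchanged.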
# MBConvBlock {{{
class MBConvBlock(nn.Module):
__constants__ = ["quantized"]
def __init__(
self,
builder: LayerBuilder,
depsep_kernel_size: int,
in_channels: int,
out_channels: int,
expand_ratio: int,
stride: int,
squeeze_excitation_ratio: float,
squeeze_hidden=False,
survival_prob: float = 1.0,
quantized: bool = False,
trt: bool = False,
):
super().__init__()
self.quantized = quantized
self.residual = stride == 1 and in_channels == out_channels
hidden_dim = in_channels * expand_ratio
squeeze_base = hidden_dim if squeeze_hidden else in_channels
squeeze_dim = max(1, int(squeeze_base * squeeze_excitation_ratio))
self.expand = (
None
if in_channels == hidden_dim
else builder.conv1x1(in_channels, hidden_dim, bn=True, act=True)
)
self.depsep = builder.convDepSep(
depsep_kernel_size, hidden_dim, hidden_dim, stride, bn=True, act=True
)
if trt or self.quantized:
# Need TRT mode for quantized in order to automatically insert quantization before pooling
self.se: nn.Module = SequentialSqueezeAndExcitationTRT(
hidden_dim, squeeze_dim, builder.activation(), self.quantized
)
else:
self.se: nn.Module = SequentialSqueezeAndExcitation(
hidden_dim, squeeze_dim, builder.activation(), self.quantized
)
self.proj = builder.conv1x1(hidden_dim, out_channels, bn=True)
if survival_prob == 1.0:
self.residual_add = torch.add
else:
self.residual_add = StochasticDepthResidual(survival_prob=survival_prob)
if self.quantized and self.residual:
assert quant_nn is not None, "pytorch_quantization is not available"
self.residual_quantizer = quant_nn.TensorQuantizer(
quant_nn.QuantConv2d.default_quant_desc_input
) # TODO QuantConv2d ?!?
else:
self.residual_quantizer = nn.Identity()
def forward(self, x: torch.Tensor) -> torch.Tensor:
if not self.residual:
return self.proj(
self.se(self.depsep(x if self.expand is None else self.expand(x)))
)
b = self.proj(
self.se(self.depsep(x if self.expand is None else self.expand(x)))
)
if self.quantized:
x = self.residual_quantizer(x)
return self.residual_add(x, b)
def original_mbconv(
builder: LayerBuilder,
depsep_kernel_size: int,
in_channels: int,
out_channels: int,
expand_ratio: int,
stride: int,
squeeze_excitation_ratio: int,
survival_prob: float,
quantized: bool,
trt: bool,
):
return MBConvBlock(
builder,
depsep_kernel_size,
in_channels,
out_channels,
expand_ratio,
stride,
squeeze_excitation_ratio,
squeeze_hidden=False,
survival_prob=survival_prob,
quantized=quantized,
trt=trt,
)
def widese_mbconv(
builder: LayerBuilder,
depsep_kernel_size: int,
in_channels: int,
out_channels: int,
expand_ratio: int,
stride: int,
squeeze_excitation_ratio: int,
survival_prob: float,
quantized: bool,
trt: bool,
):
return MBConvBlock(
builder,
depsep_kernel_size,
in_channels,
out_channels,
expand_ratio,
stride,
squeeze_excitation_ratio,
squeeze_hidden=True,
survival_prob=survival_prob,
quantized=quantized,
trt=trt,
)
# }}}
# EffNet configs {{{
# fmt: off
effnet_b0_layers = EffNetArch(
block = original_mbconv,
stem_channels = 32,
feature_channels=1280,
kernel = [ 3, 3, 5, 3, 5, 5, 3],
stride = [ 1, 2, 2, 2, 1, 2, 1],
num_repeat = [ 1, 2, 2, 3, 3, 4, 1],
expansion = [ 1, 6, 6, 6, 6, 6, 6],
channels = [16, 24, 40, 80, 112, 192, 320],
default_image_size=224,
)
effnet_b1_layers=effnet_b0_layers.scale(wc=1, dc=1.1, dis=240)
effnet_b2_layers=effnet_b0_layers.scale(wc=1.1, dc=1.2, dis=260)
effnet_b3_layers=effnet_b0_layers.scale(wc=1.2, dc=1.4, dis=300)
effnet_b4_layers=effnet_b0_layers.scale(wc=1.4, dc=1.8, dis=380)
effnet_b5_layers=effnet_b0_layers.scale(wc=1.6, dc=2.2, dis=456)
effnet_b6_layers=effnet_b0_layers.scale(wc=1.8, dc=2.6, dis=528)
effnet_b7_layers=effnet_b0_layers.scale(wc=2.0, dc=3.1, dis=600)
urls = {
"efficientnet-b0": "https://api.ngc.nvidia.com/v2/models/nvidia/efficientnet_b0_pyt_amp/versions/20.12.0/files/nvidia_efficientnet-b0_210412.pth",
"efficientnet-b4": "https://api.ngc.nvidia.com/v2/models/nvidia/efficientnet_b4_pyt_amp/versions/20.12.0/files/nvidia_efficientnet-b4_210412.pth",
"efficientnet-widese-b0": "https://api.ngc.nvidia.com/v2/models/nvidia/efficientnet_widese_b0_pyt_amp/versions/20.12.0/files/nvidia_efficientnet-widese-b0_210412.pth",
"efficientnet-widese-b4": "https://api.ngc.nvidia.com/v2/models/nvidia/efficientnet_widese_b4_pyt_amp/versions/20.12.0/files/nvidia_efficientnet-widese-b4_210412.pth",
"efficientnet-quant-b0": "https://api.ngc.nvidia.com/v2/models/nvidia/efficientnet_b0_pyt_qat_ckpt_fp32/versions/21.03.0/files/nvidia-efficientnet-quant-b0-130421.pth",
"efficientnet-quant-b4": "https://api.ngc.nvidia.com/v2/models/nvidia/efficientnet_b4_pyt_qat_ckpt_fp32/versions/21.03.0/files/nvidia-efficientnet-quant-b4-130421.pth",
}
def _m(*args, **kwargs):
return Model(constructor=EfficientNet, *args, **kwargs)
architectures = {
"efficientnet-b0": _m(arch=effnet_b0_layers, params=EffNetParams(dropout=0.2), checkpoint_url=urls["efficientnet-b0"]),
"efficientnet-b1": _m(arch=effnet_b1_layers, params=EffNetParams(dropout=0.2)),
"efficientnet-b2": _m(arch=effnet_b2_layers, params=EffNetParams(dropout=0.3)),
"efficientnet-b3": _m(arch=effnet_b3_layers, params=EffNetParams(dropout=0.3)),
"efficientnet-b4": _m(arch=effnet_b4_layers, params=EffNetParams(dropout=0.4, survival_prob=0.8), checkpoint_url=urls["efficientnet-b4"]),
"efficientnet-b5": _m(arch=effnet_b5_layers, params=EffNetParams(dropout=0.4)),
"efficientnet-b6": _m(arch=effnet_b6_layers, params=EffNetParams(dropout=0.5)),
"efficientnet-b7": _m(arch=effnet_b7_layers, params=EffNetParams(dropout=0.5)),
"efficientnet-widese-b0": _m(arch=replace(effnet_b0_layers, block=widese_mbconv), params=EffNetParams(dropout=0.2), checkpoint_url=urls["efficientnet-widese-b0"]),
"efficientnet-widese-b1": _m(arch=replace(effnet_b1_layers, block=widese_mbconv), params=EffNetParams(dropout=0.2)),
"efficientnet-widese-b2": _m(arch=replace(effnet_b2_layers, block=widese_mbconv), params=EffNetParams(dropout=0.3)),
"efficientnet-widese-b3": _m(arch=replace(effnet_b3_layers, block=widese_mbconv), params=EffNetParams(dropout=0.3)),
"efficientnet-widese-b4": _m(arch=replace(effnet_b4_layers, block=widese_mbconv), params=EffNetParams(dropout=0.4, survival_prob=0.8), checkpoint_url=urls["efficientnet-widese-b4"]),
"efficientnet-widese-b5": _m(arch=replace(effnet_b5_layers, block=widese_mbconv), params=EffNetParams(dropout=0.4)),
"efficientnet-widese-b6": _m(arch=replace(effnet_b6_layers, block=widese_mbconv), params=EffNetParams(dropout=0.5)),
"efficientnet-widese-b7": _m(arch=replace(effnet_b7_layers, block=widese_mbconv), params=EffNetParams(dropout=0.5)),
"efficientnet-quant-b0": _m(arch=effnet_b0_layers, params=EffNetParams(dropout=0.2, quantized=True), checkpoint_url=urls["efficientnet-quant-b0"]),
"efficientnet-quant-b1": _m(arch=effnet_b1_layers, params=EffNetParams(dropout=0.2, quantized=True)),
"efficientnet-quant-b2": _m(arch=effnet_b2_layers, params=EffNetParams(dropout=0.3, quantized=True)),
"efficientnet-quant-b3": _m(arch=effnet_b3_layers, params=EffNetParams(dropout=0.3, quantized=True)),
"efficientnet-quant-b4": _m(arch=effnet_b4_layers, params=EffNetParams(dropout=0.4, survival_prob=0.8, quantized=True), checkpoint_url=urls["efficientnet-quant-b4"]),
"efficientnet-quant-b5": _m(arch=effnet_b5_layers, params=EffNetParams(dropout=0.4, quantized=True)),
"efficientnet-quant-b6": _m(arch=effnet_b6_layers, params=EffNetParams(dropout=0.5, quantized=True)),
"efficientnet-quant-b7": _m(arch=effnet_b7_layers, params=EffNetParams(dropout=0.5, quantized=True)),
}
# fmt: on
# }}}
_ce = lambda n: EntryPoint.create(n, architectures[n])
efficientnet_b0 = _ce("efficientnet-b0")
efficientnet_b4 = _ce("efficientnet-b4")
efficientnet_widese_b0 = _ce("efficientnet-widese-b0")
efficientnet_widese_b4 = _ce("efficientnet-widese-b4")
efficientnet_quant_b0 = _ce("efficientnet-quant-b0")
efficientnet_quant_b4 = _ce("efficientnet-quant-b4")
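# Hedged usage sketch: each entrypoint above is callable, e.g.
# model = efficientnet_b0(pretrained=False)  # random init, no NGC download
# model = efficientnet_b4(pretrained=True)   # downloads the NGC checkpoint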
# Copyright (c) 2018-2019, NVIDIA CORPORATION
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
def nvidia_efficientnet(type='efficientnet-b0', pretrained=True, **kwargs):
"""Constructs a EfficientNet model.
For detailed information on model input and output, training recipies, inference and performance
visit: github.com/NVIDIA/DeepLearningExamples and/or ngc.nvidia.com
Args:
pretrained (bool, True): If True, returns a model pretrained on IMAGENET dataset.
"""
from .efficientnet import _ce
return _ce(type)(pretrained=pretrained, **kwargs)
def nvidia_convnets_processing_utils():
import json
import numpy as np
import requests
import torch
import torchvision.transforms as transforms
import validators
from PIL import Image
class Processing:
@staticmethod
def prepare_input_from_uri(uri, cuda=False):
img_transforms = transforms.Compose(
[transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor()]
)
if validators.url(uri):
img = Image.open(requests.get(uri, stream=True).raw)
else:
img = Image.open(uri)
img = img_transforms(img)
with torch.no_grad():
# mean and std are not multiplied by 255 as they are in the training script:
# the torch dataloader reads data as bytes, whereas loading directly
# through PIL creates a tensor with floats in the [0, 1] range
mean = torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1)
std = torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1)
img = img.float()
if cuda:
mean = mean.cuda()
std = std.cuda()
img = img.cuda()
input = img.unsqueeze(0).sub_(mean).div_(std)
return input
@staticmethod
def pick_n_best(predictions, n=5):
predictions = predictions.float().cpu().numpy()
topN = np.argsort(-1*predictions, axis=-1)[:,:n]
imgnet_classes = Processing.get_imgnet_classes()
results=[]
for idx,case in enumerate(topN):
r = []
for c, v in zip(imgnet_classes[case], predictions[idx, case]):
r.append((f"{c}", f"{100*v:.1f}%"))
print(f"sample {idx}: {r}")
results.append(r)
return results
@staticmethod
def get_imgnet_classes():
import os
import json
imgnet_classes_json = "LOC_synset_mapping.json"
if not os.path.exists(imgnet_classes_json):
print("Downloading Imagenet Classes names.")
import urllib
urllib.request.urlretrieve(
"https://raw.githubusercontent.com/NVIDIA/DeepLearningExamples/master/PyTorch/Classification/ConvNets/LOC_synset_mapping.json",
filename=imgnet_classes_json)
print("Downloading finished.")
imgnet_classes = np.array(json.load(open(imgnet_classes_json, "r")))
return imgnet_classes
return Processing()
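# Hedged usage sketch (the URI and model are placeholders):
# utils = nvidia_convnets_processing_utils()
# batch = utils.prepare_input_from_uri("http://example.com/cat.jpg")
# with torch.no_grad():
#     scores = torch.nn.functional.softmax(model(batch), dim=-1)
# results = utils.pick_n_best(scores, n=5)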
from dataclasses import dataclass, asdict, replace
from .common import (
SequentialSqueezeAndExcitationTRT,
SequentialSqueezeAndExcitation,
SqueezeAndExcitation,
SqueezeAndExcitationTRT,
)
from typing import Optional, Callable
import os
import torch
import argparse
from functools import partial
@dataclass
class ModelArch:
pass
@dataclass
class ModelParams:
def parser(self, name):
return argparse.ArgumentParser(
description=f"{name} arguments", add_help=False, usage=""
)
@dataclass
class OptimizerParams:
pass
@dataclass
class Model:
constructor: Callable
arch: ModelArch
params: Optional[ModelParams]
optimizer_params: Optional[OptimizerParams] = None
checkpoint_url: Optional[str] = None
def torchhub_docstring(name: str):
return f"""Constructs a {name} model.
For detailed information on model input and output, training recipes, inference and performance
visit: github.com/NVIDIA/DeepLearningExamples and/or ngc.nvidia.com
Args:
pretrained (bool, True): If True, returns a model pretrained on the ImageNet dataset.
"""
class EntryPoint:
@staticmethod
def create(name: str, model: Model):
ep = EntryPoint(name, model)
ep.__doc__ = torchhub_docstring(name)
return ep
def __init__(self, name: str, model: Model):
self.name = name
self.model = model
def __call__(
self,
pretrained=True,
pretrained_from_file=None,
state_dict_key_map_fn=None,
**kwargs,
):
assert not (pretrained and (pretrained_from_file is not None))
params = replace(self.model.params, **kwargs)
model = self.model.constructor(arch=self.model.arch, **asdict(params))
state_dict = None
if pretrained:
assert self.model.checkpoint_url is not None
state_dict = torch.hub.load_state_dict_from_url(
self.model.checkpoint_url,
map_location=torch.device("cpu"),
progress=True,
)
if pretrained_from_file is not None:
if os.path.isfile(pretrained_from_file):
print(
"=> loading pretrained weights from '{}'".format(
pretrained_from_file
)
)
state_dict = torch.load(
pretrained_from_file, map_location=torch.device("cpu")
)
else:
print(
"=> no pretrained weights found at '{}'".format(
pretrained_from_file
)
)
if state_dict is not None:
state_dict = {
k[len("module.") :] if k.startswith("module.") else k: v
for k, v in state_dict.items()
}
def reshape(t, conv):
if conv:
if len(t.shape) == 4:
return t
else:
return t.view(t.shape[0], -1, 1, 1)
else:
if len(t.shape) == 4:
return t.view(t.shape[0], t.shape[1])
else:
return t
if state_dict_key_map_fn is not None:
state_dict = {
state_dict_key_map_fn(k): v for k, v in state_dict.items()
}
if pretrained and hasattr(model, "ngc_checkpoint_remap"):
remap_fn = model.ngc_checkpoint_remap(url=self.model.checkpoint_url)
state_dict = {remap_fn(k): v for k, v in state_dict.items()}
def _se_layer_uses_conv(m):
return any(
map(
partial(isinstance, m),
[
SqueezeAndExcitationTRT,
SequentialSqueezeAndExcitationTRT,
],
)
)
state_dict = {
k: reshape(
v,
conv=_se_layer_uses_conv(
dict(model.named_modules())[".".join(k.split(".")[:-2])]
),
)
if is_se_weight(k, v)
else v
for k, v in state_dict.items()
}
model.load_state_dict(state_dict)
return model
def parser(self):
if self.model.params is None:
return None
parser = self.model.params.parser(self.name)
parser.add_argument(
"--pretrained-from-file",
default=None,
type=str,
metavar="PATH",
help="load weights from local file",
)
if self.model.checkpoint_url is not None:
parser.add_argument(
"--pretrained",
default=False,
action="store_true",
help="load pretrained weights from NGC",
)
return parser
def is_se_weight(key, value):
return key.endswith("squeeze.weight") or key.endswith("expand.weight")
def create_entrypoint(m: Model):
def _ep(**kwargs):
params = replace(m.params, **kwargs)
return m.constructor(arch=m.arch, **asdict(params))
return _ep
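# Hedged usage sketch (illustrative addition; `model_description` is an
# assumed placeholder Model instance, not a name defined here):
#
#   ep = EntryPoint.create("resnet50", model_description)
#   net = ep(pretrained=False, num_classes=10)  # kwargs override ModelParams
#   cli = ep.parser()  # argparse parser exposing the model's parameters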
# Copyright (c) 2018-2019, NVIDIA CORPORATION
# Copyright (c) 2017- Facebook, Inc
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import argparse
from collections import OrderedDict
from dataclasses import dataclass
from typing import List, Dict, Callable, Any, Type
import torch
import torch.nn as nn
from .common import (
SqueezeAndExcitation,
LayerBuilder,
SqueezeAndExcitationTRT,
)
from .model import (
Model,
ModelParams,
ModelArch,
EntryPoint,
)
__all__ = ["ResNet", "resnet_configs"]
# BasicBlock {{{
class BasicBlock(nn.Module):
def __init__(
self,
builder,
inplanes,
planes,
expansion,
stride=1,
cardinality=1,
downsample=None,
fused_se=True,
last_bn_0_init=False,
trt=False,
):
super(BasicBlock, self).__init__()
self.conv1 = builder.conv3x3(inplanes, planes, stride, groups=cardinality)
self.bn1 = builder.batchnorm(planes)
self.relu = builder.activation()
self.conv2 = builder.conv3x3(
planes, planes * expansion, groups=cardinality
)
self.bn2 = builder.batchnorm(planes * expansion, zero_init=last_bn_0_init)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
if self.bn1 is not None:
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
if self.bn2 is not None:
out = self.bn2(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
# BasicBlock }}}
# Bottleneck {{{
class Bottleneck(nn.Module):
def __init__(
self,
builder,
inplanes,
planes,
expansion,
stride=1,
cardinality=1,
se=False,
se_squeeze=16,
downsample=None,
fused_se=True,
last_bn_0_init=False,
trt=False,
):
super(Bottleneck, self).__init__()
self.conv1 = builder.conv1x1(inplanes, planes)
self.bn1 = builder.batchnorm(planes)
self.conv2 = builder.conv3x3(planes, planes, groups=cardinality, stride=stride)
self.bn2 = builder.batchnorm(planes)
self.conv3 = builder.conv1x1(planes, planes * expansion)
self.bn3 = builder.batchnorm(planes * expansion, zero_init=last_bn_0_init)
self.relu = builder.activation()
self.downsample = downsample
self.stride = stride
self.fused_se = fused_se
if se:
self.squeeze = (
SqueezeAndExcitation(
planes * expansion, se_squeeze, builder.activation()
)
if not trt
else SqueezeAndExcitationTRT(
planes * expansion, se_squeeze, builder.activation()
)
)
else:
self.squeeze = None
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(x)
if self.squeeze is None:
out += residual
else:
if self.fused_se:
out = torch.addcmul(residual, out, self.squeeze(out), value=1)
else:
out = residual + out * self.squeeze(out)
out = self.relu(out)
return out
class SEBottleneck(Bottleneck):
def __init__(
self,
builder,
inplanes,
planes,
expansion,
stride=1,
cardinality=1,
downsample=None,
fused_se=True,
last_bn_0_init=False,
trt=False,
):
super(SEBottleneck, self).__init__(
builder,
inplanes,
planes,
expansion,
stride=stride,
cardinality=cardinality,
se=True,
se_squeeze=16,
downsample=downsample,
fused_se=fused_se,
last_bn_0_init=last_bn_0_init,
trt=trt,
)
# Bottleneck }}}
class ResNet(nn.Module):
@dataclass
class Arch(ModelArch):
block: Type[Bottleneck]
layers: List[int] # arch
widths: List[int] # arch
expansion: int
cardinality: int = 1
stem_width: int = 64
activation: str = "relu"
default_image_size: int = 224
@dataclass
class Params(ModelParams):
num_classes: int = 1000
last_bn_0_init: bool = False
conv_init: str = "fan_in"
trt: bool = False
fused_se: bool = True
def parser(self, name):
p = super().parser(name)
# argparse's type=bool treats any non-empty string as True, so parse
# boolean flags explicitly
def str2bool(s):
return str(s).lower() in ("true", "1", "yes")
p.add_argument(
"--num_classes",
metavar="N",
default=self.num_classes,
type=int,
help="number of classes",
)
p.add_argument(
"--last_bn_0_init",
metavar="True|False",
default=self.last_bn_0_init,
type=str2bool,
)
p.add_argument(
"--conv_init",
default=self.conv_init,
choices=["fan_in", "fan_out"],
type=str,
help="initialization mode for convolutional layers, see https://pytorch.org/docs/stable/nn.init.html#torch.nn.init.kaiming_normal_",
)
p.add_argument("--trt", metavar="True|False", default=self.trt, type=str2bool)
p.add_argument(
"--fused_se", metavar="True|False", default=self.fused_se, type=str2bool
)
return p
def __init__(
self,
arch: Arch,
num_classes: int = 1000,
last_bn_0_init: bool = False,
conv_init: str = "fan_in",
trt: bool = False,
fused_se: bool = True,
):
super(ResNet, self).__init__()
self.arch = arch
self.builder = LayerBuilder(
LayerBuilder.Config(activation=arch.activation, conv_init=conv_init)
)
self.last_bn_0_init = last_bn_0_init
self.conv1 = self.builder.conv7x7(3, arch.stem_width, stride=2)
self.bn1 = self.builder.batchnorm(arch.stem_width)
self.relu = self.builder.activation()
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
inplanes = arch.stem_width
assert len(arch.widths) == len(arch.layers)
self.num_layers = len(arch.widths)
layers = []
for i, (w, l) in enumerate(zip(arch.widths, arch.layers)):
layer, inplanes = self._make_layer(
arch.block,
arch.expansion,
inplanes,
w,
l,
cardinality=arch.cardinality,
stride=1 if i == 0 else 2,
trt=trt,
fused_se=fused_se,
)
layers.append(layer)
self.layers = nn.Sequential(*layers)
self.avgpool = nn.AdaptiveAvgPool2d(1)
self.fc = nn.Linear(arch.widths[-1] * arch.expansion, num_classes)
def stem(self, x):
x = self.conv1(x)
if self.bn1 is not None:
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
return x
def classifier(self, x):
x = self.avgpool(x)
x = x.view(x.size(0), -1)
x = self.fc(x)
return x
def forward(self, x):
x = self.stem(x)
x = self.layers(x)
x = self.classifier(x)
return x
def extract_features(self, x, layers=None):
if layers is None:
layers = [f"layer{i+1}" for i in range(self.num_layers)] + ["classifier"]
run = [
i
for i in range(self.num_layers)
if "classifier" in layers
or any([f"layer{j+1}" in layers for j in range(i, self.num_layers)])
]
output = {}
x = self.stem(x)
for l in run:
fn = self.layers[l]
x = fn(x)
if f"layer{l+1}" in layers:
output[f"layer{l+1}"] = x
if "classifier" in layers:
output["classifier"] = self.classifier(x)
return output
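# Hedged usage sketch (illustrative addition; `model` and `x` are assumed
# placeholders):
#
#   feats = model.extract_features(x, layers=["layer2", "classifier"])
#   feats["layer2"]      # stage-2 feature map
#   feats["classifier"]  # final logits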
# helper functions {{{
def _make_layer(
self,
block,
expansion,
inplanes,
planes,
blocks,
stride=1,
cardinality=1,
trt=False,
fused_se=True,
):
downsample = None
if stride != 1 or inplanes != planes * expansion:
dconv = self.builder.conv1x1(inplanes, planes * expansion, stride=stride)
dbn = self.builder.batchnorm(planes * expansion)
if dbn is not None:
downsample = nn.Sequential(dconv, dbn)
else:
downsample = dconv
layers = []
for i in range(blocks):
layers.append(
block(
self.builder,
inplanes,
planes,
expansion,
stride=stride if i == 0 else 1,
cardinality=cardinality,
downsample=downsample if i == 0 else None,
fused_se=fused_se,
last_bn_0_init=self.last_bn_0_init,
trt=trt,
)
)
inplanes = planes * expansion
return nn.Sequential(*layers), inplanes
def ngc_checkpoint_remap(self, url=None, version=None):
if version is None:
# version is the 9th path segment of the NGC checkpoint URL
version = url.split("/")[8]
def to_sequential_remap(s):
parts = s.split(".")
if parts[0].startswith("layer"):
return ".".join(
["layers." + str(int(parts[0][len("layer") :]) - 1)] + parts[1:]
)
else:
return s
def no_remap(s):
return s
return {"20.06.0": to_sequential_remap}.get(version, no_remap)
# }}}
__models: Dict[str, Model] = {
"resnet50": Model(
constructor=ResNet,
arch=ResNet.Arch(
stem_width=64,
block=Bottleneck,
layers=[3, 4, 6, 3],
widths=[64, 128, 256, 512],
expansion=4,
default_image_size=224,
),
params=ResNet.Params(),
checkpoint_url="https://api.ngc.nvidia.com/v2/models/nvidia/resnet50_pyt_amp/versions/20.06.0/files/nvidia_resnet50_200821.pth.tar",
),
"resnext101-32x4d": Model(
constructor=ResNet,
arch=ResNet.Arch(
stem_width=64,
block=Bottleneck,
layers=[3, 4, 23, 3],
widths=[128, 256, 512, 1024],
expansion=2,
cardinality=32,
default_image_size=224,
),
params=ResNet.Params(),
checkpoint_url="https://api.ngc.nvidia.com/v2/models/nvidia/resnext101_32x4d_pyt_amp/versions/20.06.0/files/nvidia_resnext101-32x4d_200821.pth.tar",
),
"se-resnext101-32x4d": Model(
constructor=ResNet,
arch=ResNet.Arch(
stem_width=64,
block=SEBottleneck,
layers=[3, 4, 23, 3],
widths=[128, 256, 512, 1024],
expansion=2,
cardinality=32,
default_image_size=224,
),
params=ResNet.Params(),
checkpoint_url="https://api.ngc.nvidia.com/v2/models/nvidia/seresnext101_32x4d_pyt_amp/versions/20.06.0/files/nvidia_se-resnext101-32x4d_200821.pth.tar",
),
}
def _ce(n):
return EntryPoint.create(n, __models[n])
resnet50 = _ce("resnet50")
resnext101_32x4d = _ce("resnext101-32x4d")
se_resnext101_32x4d = _ce("se-resnext101-32x4d")
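# Hedged usage sketch (illustrative addition, not part of the upstream file):
# the entrypoints above follow the torch.hub calling convention; passing
# pretrained=True instead would download the NGC checkpoint listed in __models.
if __name__ == "__main__":
    model = resnet50(pretrained=False)  # untrained ResNet-50, default params
    model.eval()
    with torch.no_grad():
        logits = model(torch.randn(1, 3, 224, 224))
    print(logits.shape)  # expected: torch.Size([1, 1000])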
import math
import numpy as np
import torch
from torch import optim
def get_optimizer(parameters, lr, args, state=None):
if args.optimizer == "sgd":
optimizer = get_sgd_optimizer(
parameters,
lr,
momentum=args.momentum,
weight_decay=args.weight_decay,
nesterov=args.nesterov,
bn_weight_decay=args.bn_weight_decay,
)
elif args.optimizer == "rmsprop":
optimizer = get_rmsprop_optimizer(
parameters,
lr,
alpha=args.rmsprop_alpha,
momentum=args.momentum,
weight_decay=args.weight_decay,
eps=args.rmsprop_eps,
bn_weight_decay=args.bn_weight_decay,
)
else:
raise ValueError(f"Unknown optimizer: {args.optimizer}")
if state is not None:
optimizer.load_state_dict(state)
return optimizer
def get_sgd_optimizer(
parameters, lr, momentum, weight_decay, nesterov=False, bn_weight_decay=False
):
if bn_weight_decay:
print(" ! Weight decay applied to BN parameters ")
params = [v for n, v in parameters]
else:
print(" ! Weight decay NOT applied to BN parameters ")
bn_params = [v for n, v in parameters if "bn" in n]
rest_params = [v for n, v in parameters if "bn" not in n]
print(f"BN parameters: {len(bn_params)}, other parameters: {len(rest_params)}")
params = [
{"params": bn_params, "weight_decay": 0},
{"params": rest_params, "weight_decay": weight_decay},
]
optimizer = torch.optim.SGD(
params, lr, momentum=momentum, weight_decay=weight_decay, nesterov=nesterov
)
return optimizer
def get_rmsprop_optimizer(
parameters, lr, alpha, weight_decay, momentum, eps, bn_weight_decay=False
):
bn_params = [v for n, v in parameters if "bn" in n]
rest_params = [v for n, v in parameters if "bn" not in n]
params = [
{"params": bn_params, "weight_decay": weight_decay if bn_weight_decay else 0},
{"params": rest_params, "weight_decay": weight_decay},
]
optimizer = torch.optim.RMSprop(
params,
lr=lr,
alpha=alpha,
weight_decay=weight_decay,
momentum=momentum,
eps=eps,
)
return optimizer
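# Hedged usage sketch (illustrative addition): both helpers expect `parameters`
# as an iterable of (name, tensor) pairs, e.g. model.named_parameters(), so
# that batch-norm tensors (matched by "bn" in the name) can skip weight decay.
if __name__ == "__main__":
    from collections import OrderedDict
    net = torch.nn.Sequential(OrderedDict([
        ("conv", torch.nn.Conv2d(3, 8, 3)),
        ("bn", torch.nn.BatchNorm2d(8)),
    ]))
    opt = get_sgd_optimizer(
        list(net.named_parameters()), lr=0.1, momentum=0.9,
        weight_decay=1e-4, bn_weight_decay=False,
    )
    print([g["weight_decay"] for g in opt.param_groups])  # [0, 0.0001]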
def lr_policy(lr_fn):
def _alr(optimizer, iteration, epoch):
lr = lr_fn(iteration, epoch)
for param_group in optimizer.param_groups:
param_group["lr"] = lr
return lr
return _alr
def lr_step_policy(base_lr, steps, decay_factor, warmup_length):
def _lr_fn(iteration, epoch):
if epoch < warmup_length:
lr = base_lr * (epoch + 1) / warmup_length
else:
lr = base_lr
for s in steps:
if epoch >= s:
lr *= decay_factor
return lr
return lr_policy(_lr_fn)
def lr_linear_policy(base_lr, warmup_length, epochs):
def _lr_fn(iteration, epoch):
if epoch < warmup_length:
lr = base_lr * (epoch + 1) / warmup_length
else:
e = epoch - warmup_length
es = epochs - warmup_length
lr = base_lr * (1 - (e / es))
return lr
return lr_policy(_lr_fn)
def lr_cosine_policy(base_lr, warmup_length, epochs, end_lr=0):
def _lr_fn(iteration, epoch):
if epoch < warmup_length:
lr = base_lr * (epoch + 1) / warmup_length
else:
e = epoch - warmup_length
es = epochs - warmup_length
lr = end_lr + (0.5 * (1 + np.cos(np.pi * e / es)) * (base_lr - end_lr))
return lr
return lr_policy(_lr_fn)
def lr_exponential_policy(
base_lr,
warmup_length,
epochs,
final_multiplier=0.001,
decay_factor=None,
decay_step=1,
logger=None,
):
"""Exponential lr policy. Setting decay factor parameter overrides final_multiplier"""
es = epochs - warmup_length
if decay_factor is not None:
epoch_decay = decay_factor
else:
epoch_decay = np.power(
2, np.log2(final_multiplier) / math.floor(es / decay_step)
)
def _lr_fn(iteration, epoch):
if epoch < warmup_length:
lr = base_lr * (epoch + 1) / warmup_length
else:
e = epoch - warmup_length
lr = base_lr * (epoch_decay ** math.floor(e / decay_step))
return lr
return lr_policy(_lr_fn)
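# Hedged sketch (illustrative addition): every policy above returns a function
# of (optimizer, iteration, epoch) that writes the new lr into all parameter
# groups and returns it; the SGD instance below exists only for demonstration.
if __name__ == "__main__":
    opt = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=0.1)
    schedule = lr_cosine_policy(base_lr=0.1, warmup_length=5, epochs=90)
    for epoch in (0, 4, 5, 50, 89):
        print(f"epoch {epoch:2d}: lr = {schedule(opt, 0, epoch):.5f}")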
from tqdm import tqdm
import torch
import contextlib
import time
import logging
from pytorch_quantization import quant_modules
from pytorch_quantization import nn as quant_nn
from pytorch_quantization import calib
from pytorch_quantization.tensor_quant import QuantDescriptor
from . import logger as log
from .utils import calc_ips
import dllogger
initialize = quant_modules.initialize
deactivate = quant_modules.deactivate
IPS_METADATA = {"unit": "img/s", "format": ":.2f"}
TIME_METADATA = {"unit": "s", "format": ":.5f"}
def select_default_calib_method(calib_method='histogram'):
"""Set up selected calibration method in whole network"""
quant_desc_input = QuantDescriptor(calib_method=calib_method)
quant_nn.QuantConv1d.set_default_quant_desc_input(quant_desc_input)
quant_nn.QuantConv2d.set_default_quant_desc_input(quant_desc_input)
quant_nn.QuantLinear.set_default_quant_desc_input(quant_desc_input)
quant_nn.QuantAdaptiveAvgPool2d.set_default_quant_desc_input(quant_desc_input)
def quantization_setup(calib_method='histogram'):
"""Change network into quantized version "automatically" and selects histogram as default quantization method"""
select_default_calib_method(calib_method)
initialize()
def disable_calibration(model):
"""Disables calibration in whole network. Should be run always before running interference."""
for name, module in model.named_modules():
if isinstance(module, quant_nn.TensorQuantizer):
if module._calibrator is not None:
module.enable_quant()
module.disable_calib()
else:
module.enable()
def collect_stats(model, data_loader, logger, num_batches):
"""Feed data to the network and collect statistic"""
if logger is not None:
logger.register_metric(
f"calib.total_ips",
log.PERF_METER(),
verbosity=dllogger.Verbosity.DEFAULT,
metadata=IPS_METADATA,
)
logger.register_metric(
f"calib.data_time",
log.PERF_METER(),
verbosity=dllogger.Verbosity.DEFAULT,
metadata=TIME_METADATA,
)
logger.register_metric(
f"calib.compute_latency",
log.PERF_METER(),
verbosity=dllogger.Verbosity.DEFAULT,
metadata=TIME_METADATA,
)
data_iter = enumerate(data_loader)
if logger is not None:
data_iter = logger.iteration_generator_wrapper(data_iter, mode='calib')
# Enable calibrators (and disable quantization) while statistics are collected
for name, module in model.named_modules():
if isinstance(module, quant_nn.TensorQuantizer):
if module._calibrator is not None:
module.disable_quant()
module.enable_calib()
else:
module.disable()
end = time.time()
if logger is not None:
logger.start_calibration()
for i, (image, _) in data_iter:
bs = image.size(0)
data_time = time.time() - end
model(image.cuda())
it_time = time.time() - end
if logger is not None:
logger.log_metric(f"calib.total_ips", calc_ips(bs, it_time))
logger.log_metric(f"calib.data_time", data_time)
logger.log_metric(f"calib.compute_latency", it_time - data_time)
if i >= num_batches:
time.sleep(5)
break
end = time.time()
if logger is not None:
logger.end_calibration()
logging.disable(logging.WARNING)
disable_calibration(model)
def compute_amax(model, **kwargs):
"""Loads statistics of data and calculates quantization parameters in whole network"""
for name, module in model.named_modules():
if isinstance(module, quant_nn.TensorQuantizer) and module._calibrator is not None:
if isinstance(module._calibrator, calib.MaxCalibrator):
module.load_calib_amax()
else:
module.load_calib_amax(**kwargs)
model.cuda()
def calibrate(model, train_loader, logger, calib_iter=1, percentile=99.99):
"""Calibrates whole network i.e. gathers data for quantization and calculates quantization parameters"""
model.eval()
with torch.no_grad():
collect_stats(model, train_loader, logger, num_batches=calib_iter)
compute_amax(model, method="percentile", percentile=percentile)
logging.disable(logging.NOTSET)
@contextlib.contextmanager
def switch_on_quantization(do_quantization=True):
"""Context manager for quantization activation"""
if do_quantization:
initialize()
try:
yield
finally:
if do_quantization:
deactivate()
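# Hedged sketch of the intended calibration flow (illustrative addition;
# `build_model` and `train_loader` are assumed placeholders):
#
#   quantization_setup(calib_method="histogram")  # before model construction
#   model = build_model().cuda()
#   calibrate(model, train_loader, logger=None, calib_iter=10)
#   # quantizers now hold amax values; the model can be fine-tuned (QAT)
#   # or exported for TensorRT deployment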
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the BSD 3-Clause License (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://opensource.org/licenses/BSD-3-Clause
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.nn as nn
class LabelSmoothing(nn.Module):
"""
NLL loss with label smoothing.
"""
def __init__(self, smoothing=0.0):
"""
Constructor for the LabelSmoothing module.
:param smoothing: label smoothing factor
"""
super(LabelSmoothing, self).__init__()
self.confidence = 1.0 - smoothing
self.smoothing = smoothing
def forward(self, x, target):
logprobs = torch.nn.functional.log_softmax(x, dim=-1)
nll_loss = -logprobs.gather(dim=-1, index=target.unsqueeze(1))
nll_loss = nll_loss.squeeze(1)
smooth_loss = -logprobs.mean(dim=-1)
loss = self.confidence * nll_loss + self.smoothing * smooth_loss
return loss.mean()
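# Hedged usage sketch (illustrative addition): with smoothing=0.1 the target
# class keeps weight 0.9 and the remaining 0.1 of probability mass is spread
# uniformly over all classes via the mean log-probability term above.
if __name__ == "__main__":
    criterion = LabelSmoothing(smoothing=0.1)
    logits = torch.randn(4, 10)
    target = torch.randint(0, 10, (4,))
    print(criterion(logits, target))  # scalar loss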
# Copyright (c) 2018-2019, NVIDIA CORPORATION
# Copyright (c) 2017- Facebook, Inc
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import time
from copy import deepcopy
from functools import wraps
from typing import Callable, Dict, Optional, Tuple
import torch
import torch.nn as nn
from torch.cuda.amp import autocast
from torch.nn.parallel import DistributedDataParallel as DDP
from . import logger as log
from . import utils
from .logger import TrainingMetrics, ValidationMetrics
from .models.common import EMA
class Executor:
def __init__(
self,
model: nn.Module,
loss: Optional[nn.Module],
cuda: bool = True,
memory_format: torch.memory_format = torch.contiguous_format,
amp: bool = False,
scaler: Optional[torch.cuda.amp.GradScaler] = None,
divide_loss: int = 1,
ts_script: bool = False,
):
assert not (amp and scaler is None), "Gradient Scaler is needed for AMP"
def xform(m: nn.Module) -> nn.Module:
if cuda:
m = m.cuda()
m.to(memory_format=memory_format)
return m
self.model = xform(model)
if ts_script:
self.model = torch.jit.script(self.model)
self.ts_script = ts_script
self.loss = xform(loss) if loss is not None else None
self.amp = amp
self.scaler = scaler
self.is_distributed = False
self.divide_loss = divide_loss
self._fwd_bwd = None
self._forward = None
def distributed(self, gpu_id):
self.is_distributed = True
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
self.model = DDP(self.model, device_ids=[gpu_id], output_device=gpu_id)
torch.cuda.current_stream().wait_stream(s)
def _fwd_bwd_fn(
self,
input: torch.Tensor,
target: torch.Tensor,
) -> torch.Tensor:
with autocast(enabled=self.amp):
loss = self.loss(self.model(input), target)
loss /= self.divide_loss
self.scaler.scale(loss).backward()
return loss
def _forward_fn(
self, input: torch.Tensor, target: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
with torch.no_grad(), autocast(enabled=self.amp):
output = self.model(input)
loss = None if self.loss is None else self.loss(output, target)
return (output if loss is None else loss), output
def optimize(self, fn):
return fn
@property
def forward_backward(self):
if self._fwd_bwd is None:
if self.loss is None:
raise NotImplementedError(
"Loss must not be None for forward+backward step"
)
self._fwd_bwd = self.optimize(self._fwd_bwd_fn)
return self._fwd_bwd
@property
def forward(self):
if self._forward is None:
self._forward = self.optimize(self._forward_fn)
return self._forward
def train(self):
self.model.train()
if self.loss is not None:
self.loss.train()
def eval(self):
self.model.eval()
if self.loss is not None:
self.loss.eval()
class Trainer:
def __init__(
self,
executor: Executor,
optimizer: torch.optim.Optimizer,
grad_acc_steps: int,
ema: Optional[float] = None,
):
self.executor = executor
self.optimizer = optimizer
self.grad_acc_steps = grad_acc_steps
self.use_ema = False
if ema is not None:
self.ema_executor = deepcopy(self.executor)
self.ema = EMA(ema, self.ema_executor.model)
self.use_ema = True
self.optimizer.zero_grad(set_to_none=True)
self.steps_since_update = 0
def train(self):
self.executor.train()
if self.use_ema:
self.ema_executor.train()
def eval(self):
self.executor.eval()
if self.use_ema:
self.ema_executor.eval()
def train_step(self, input, target, step=None):
loss = self.executor.forward_backward(input, target)
self.steps_since_update += 1
if self.steps_since_update == self.grad_acc_steps:
if self.executor.scaler is not None:
self.executor.scaler.step(self.optimizer)
self.executor.scaler.update()
else:
self.optimizer.step()
self.optimizer.zero_grad()
self.steps_since_update = 0
torch.cuda.synchronize()
if self.use_ema:
self.ema(self.executor.model, step=step)
return loss
def validation_steps(self) -> Dict[str, Callable]:
vsd: Dict[str, Callable] = {"val": self.executor.forward}
if self.use_ema:
vsd["val_ema"] = self.ema_executor.forward
return vsd
def state_dict(self) -> dict:
res = {
"state_dict": self.executor.model.state_dict(),
"optimizer": self.optimizer.state_dict(),
}
if self.use_ema:
res["state_dict_ema"] = self.ema_executor.model.state_dict()
return res
def train(
train_step,
train_loader,
lr_scheduler,
grad_scale_fn,
log_fn,
timeout_handler,
prof=-1,
step=0,
):
interrupted = False
end = time.time()
data_iter = enumerate(train_loader)
for i, (input, target) in data_iter:
bs = input.size(0)
lr = lr_scheduler(i)
data_time = time.time() - end
loss = train_step(input, target, step=step + i)
it_time = time.time() - end
with torch.no_grad():
if torch.distributed.is_initialized():
reduced_loss = utils.reduce_tensor(loss.detach())
else:
reduced_loss = loss.detach()
log_fn(
compute_ips=utils.calc_ips(bs, it_time - data_time),
total_ips=utils.calc_ips(bs, it_time),
data_time=data_time,
compute_time=it_time - data_time,
lr=lr,
loss=reduced_loss.item(),
grad_scale=grad_scale_fn(),
)
end = time.time()
if prof > 0 and (i + 1 >= prof):
time.sleep(5)
break
if ((i + 1) % 20 == 0) and timeout_handler.interrupted:
time.sleep(5)
interrupted = True
break
return interrupted
def validate(infer_fn, val_loader, log_fn, prof=-1, with_loss=True, topk=5):
top1 = log.AverageMeter()
# the model is expected to already be in eval mode (see Trainer.eval)
end = time.time()
data_iter = enumerate(val_loader)
for i, (input, target) in data_iter:
bs = input.size(0)
data_time = time.time() - end
if with_loss:
loss, output = infer_fn(input, target)
else:
output = infer_fn(input)
with torch.no_grad():
precs = utils.accuracy(output.data, target, topk=(1, topk))
if torch.distributed.is_initialized():
if with_loss:
reduced_loss = utils.reduce_tensor(loss.detach())
precs = map(utils.reduce_tensor, precs)
else:
if with_loss:
reduced_loss = loss.detach()
precs = map(lambda t: t.item(), precs)
infer_result = {f"top{k}": (p, bs) for k, p in zip((1, topk), precs)}
if with_loss:
infer_result["loss"] = (reduced_loss.item(), bs)
torch.cuda.synchronize()
it_time = time.time() - end
top1.record(infer_result["top1"][0], bs)
log_fn(
compute_ips=utils.calc_ips(bs, it_time - data_time),
total_ips=utils.calc_ips(bs, it_time),
data_time=data_time,
compute_time=it_time - data_time,
**infer_result,
)
end = time.time()
if (prof > 0) and (i + 1 >= prof):
time.sleep(5)
break
return top1.get_val()
# Train loop {{{
def train_loop(
trainer: Trainer,
lr_scheduler,
train_loader,
train_loader_len,
val_loader,
logger,
best_prec1=0,
start_epoch=0,
end_epoch=0,
early_stopping_patience=-1,
prof=-1,
skip_training=False,
skip_validation=False,
save_checkpoints=True,
checkpoint_dir="./",
checkpoint_filename="checkpoint.pth.tar",
keep_last_n_checkpoints=0,
topk=5,
):
checkpointer = utils.Checkpointer(
last_filename=checkpoint_filename,
checkpoint_dir=checkpoint_dir,
keep_last_n=keep_last_n_checkpoints,
)
train_metrics = TrainingMetrics(logger)
val_metrics = {
k: ValidationMetrics(logger, k, topk) for k in trainer.validation_steps().keys()
}
training_step = trainer.train_step
prec1 = -1
if early_stopping_patience > 0:
epochs_since_improvement = 0
print(f"RUNNING EPOCHS FROM {start_epoch} TO {end_epoch}")
with utils.TimeoutHandler() as timeout_handler:
interrupted = False
for epoch in range(start_epoch, end_epoch):
if logger is not None:
logger.start_epoch()
if not skip_training:
if logger is not None:
data_iter = logger.iteration_generator_wrapper(
train_loader, mode="train"
)
else:
data_iter = train_loader
trainer.train()
interrupted = train(
training_step,
data_iter,
lambda i: lr_scheduler(trainer.optimizer, i, epoch),
trainer.executor.scaler.get_scale,
train_metrics.log,
timeout_handler,
prof=prof,
step=epoch * train_loader_len,
)
if not skip_validation:
trainer.eval()
for k, infer_fn in trainer.validation_steps().items():
if logger is not None:
data_iter = logger.iteration_generator_wrapper(
val_loader, mode="val"
)
else:
data_iter = val_loader
step_prec1, _ = validate(
infer_fn,
data_iter,
val_metrics[k].log,
prof=prof,
topk=topk,
)
if k == "val":
prec1 = step_prec1
if prec1 > best_prec1:
is_best = True
best_prec1 = prec1
else:
is_best = False
else:
is_best = False
best_prec1 = 0
if logger is not None:
logger.end_epoch()
if save_checkpoints and (
not torch.distributed.is_initialized()
or torch.distributed.get_rank() == 0
):
checkpoint_state = {
"epoch": epoch + 1,
"best_prec1": best_prec1,
**trainer.state_dict(),
}
checkpointer.save_checkpoint(
checkpoint_state,
is_best,
filename=f"checkpoint_{epoch:04}.pth.tar",
)
if early_stopping_patience > 0:
if not is_best:
epochs_since_improvement += 1
else:
epochs_since_improvement = 0
if epochs_since_improvement >= early_stopping_patience:
break
if interrupted:
break
# }}}
# Copyright (c) 2018-2019, NVIDIA CORPORATION
# Copyright (c) 2017- Facebook, Inc
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import math
import os
import shutil
import signal
import time  # needed by timed_generator/timed_function below
import numpy as np
import torch
import torch.distributed as dist
class Checkpointer:
def __init__(self, last_filename, checkpoint_dir="./", keep_last_n=0):
self.last_filename = last_filename
self.checkpoints = []
self.checkpoint_dir = checkpoint_dir
self.keep_last_n = keep_last_n
def cleanup(self):
to_delete = self.checkpoints[: -self.keep_last_n]
self.checkpoints = self.checkpoints[-self.keep_last_n :]
for f in to_delete:
full_path = os.path.join(self.checkpoint_dir, f)
os.remove(full_path)
def get_full_path(self, filename):
return os.path.join(self.checkpoint_dir, filename)
def save_checkpoint(
self,
state,
is_best,
filename,
):
if torch.distributed.is_initialized() and torch.distributed.get_rank() != 0:
raise RuntimeError("save_checkpoint must only be called on rank 0")
full_path = self.get_full_path(filename)
print("SAVING {}".format(full_path))
torch.save(state, full_path)
self.checkpoints.append(filename)
shutil.copyfile(
full_path, self.get_full_path(self.last_filename)
)
if is_best:
shutil.copyfile(
full_path, self.get_full_path("model_best.pth.tar")
)
self.cleanup()
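# Hedged usage sketch (illustrative addition): keep_last_n=0 keeps every
# checkpoint; a positive value prunes older epoch files, while the
# last_filename copy and "model_best.pth.tar" always track the latest and
# best states.
#
#   ckpt = Checkpointer("checkpoint.pth.tar", checkpoint_dir="./", keep_last_n=2)
#   ckpt.save_checkpoint(state, is_best=True, filename="checkpoint_0001.pth.tar")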
def timed_generator(gen):
start = time.time()
for g in gen:
end = time.time()
t = end - start
yield g, t
start = time.time()
def timed_function(f):
def _timed_function(*args, **kwargs):
start = time.time()
ret = f(*args, **kwargs)
return ret, time.time() - start
return _timed_function
def accuracy(output, target, topk=(1,)):
"""Computes the precision@k for the specified values of k"""
maxk = max(topk)
batch_size = target.size(0)
_, pred = output.topk(maxk, 1, True, True)
pred = pred.t()
correct = pred.eq(target.view(1, -1).expand_as(pred))
res = []
for k in topk:
correct_k = correct[:k].float().sum()
res.append(correct_k.mul_(100.0 / batch_size))
return res
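# Hedged sketch (illustrative addition): accuracy() returns one percentage per
# requested k, each as a 0-dim tensor.
if __name__ == "__main__":
    out = torch.tensor([[0.1, 0.9], [0.8, 0.2], [0.3, 0.7]])
    tgt = torch.tensor([1, 0, 0])
    (top1,) = accuracy(out, tgt, topk=(1,))
    print(top1)  # tensor(66.6667): 2 of 3 predictions are correct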
def reduce_tensor(tensor):
rt = tensor.clone().detach()
dist.all_reduce(rt, op=dist.ReduceOp.SUM)
rt /= (
torch.distributed.get_world_size() if torch.distributed.is_initialized() else 1
)
return rt
def first_n(n, generator):
for i, d in zip(range(n), generator):
yield d
class TimeoutHandler:
def __init__(self, sig=signal.SIGTERM):
self.sig = sig
self.device = torch.device("cuda")
@property
def interrupted(self):
if not dist.is_initialized():
return self._interrupted
interrupted = torch.tensor(self._interrupted).int().to(self.device)
dist.broadcast(interrupted, 0)
interrupted = bool(interrupted.item())
return interrupted
def __enter__(self):
self._interrupted = False
self.released = False
self.original_handler = signal.getsignal(self.sig)
def master_handler(signum, frame):
self.release()
self._interrupted = True
print(f"Received SIGTERM")
def ignoring_handler(signum, frame):
self.release()
print("Received SIGTERM, ignoring")
rank = dist.get_rank() if dist.is_initialized() else 0
if rank == 0:
signal.signal(self.sig, master_handler)
else:
signal.signal(self.sig, ignoring_handler)
return self
def __exit__(self, type, value, tb):
self.release()
def release(self):
if self.released:
return False
signal.signal(self.sig, self.original_handler)
self.released = True
return True
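# Hedged usage sketch (illustrative addition; `max_steps` and
# `train_one_step` are placeholders): the handler turns SIGTERM into a flag
# broadcast from rank 0, so all ranks leave the loop at the same step.
#
#   with TimeoutHandler() as th:
#       for step in range(max_steps):
#           train_one_step()
#           if th.interrupted:
#               break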
def calc_ips(batch_size, time):
world_size = (
torch.distributed.get_world_size() if torch.distributed.is_initialized() else 1
)
tbs = world_size * batch_size
return tbs / time