".circleci/unittest/windows/vscode:/vscode.git/clone" did not exist on "195c41ee7ae3fff8f64a4fda67632259dd4f7454"
Commit e129194a authored by Sugon_ldc's avatar Sugon_ldc
Browse files

add new model resnet50v1.5

parents
Pipeline #571 failed with stages
in 0 seconds
python ./multiproc.py --nproc_per_node 8 ./launch.py --model efficientnet-widese-b0 --precision FP32 --mode convergence --platform DGX1V-16G /imagenet --workspace ${1:-./} --raport-file raport.json
python ./multiproc.py --nproc_per_node 8 ./launch.py --model efficientnet-b0 --precision TF32 --mode convergence --platform DGXA100 /imagenet --workspace ${1:-./} --raport-file raport.json
python ./multiproc.py --nproc_per_node 8 ./launch.py --model efficientnet-b4 --precision TF32 --mode convergence --platform DGXA100 /imagenet --workspace ${1:-./} --raport-file raport.json
python ./multiproc.py --nproc_per_node 8 ./launch.py --model efficientnet-widese-b0 --precision TF32 --mode convergence --platform DGXA100 /imagenet --workspace ${1:-./} --raport-file raport.json
python ./multiproc.py --nproc_per_node 8 ./launch.py --model efficientnet-widese-b4 --precision TF32 --mode convergence --platform DGXA100 /imagenet --workspace ${1:-./} --raport-file raport.json
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the BSD 3-Clause License (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://opensource.org/licenses/BSD-3-Clause
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#from . import logger
#from . import dataloaders
#from . import training
#from . import utils
#from . import mixup
#from . import smoothing
from . import models
from PIL import Image, ImageEnhance, ImageOps
import numpy as np
import random
class AutoaugmentImageNetPolicy(object):
    """
    Randomly choose one of the best 24 Sub-policies on ImageNet.
    Reference: https://arxiv.org/abs/1805.09501
    """

    def __init__(self):
        # (p1, method1, magnitude_idx1, p2, method2, magnitude_idx2) per
        # sub-policy, taken from the AutoAugment ImageNet search results.
        specs = [
            (0.8, "equalize", 1, 0.8, "shearY", 4),
            (0.4, "color", 9, 0.6, "equalize", 3),
            (0.4, "color", 1, 0.6, "rotate", 8),
            (0.8, "solarize", 3, 0.4, "equalize", 7),
            (0.4, "solarize", 2, 0.6, "solarize", 2),
            (0.2, "color", 0, 0.8, "equalize", 8),
            (0.4, "equalize", 8, 0.8, "solarizeadd", 3),
            (0.2, "shearX", 9, 0.6, "rotate", 8),
            (0.6, "color", 1, 1.0, "equalize", 2),
            (0.4, "invert", 9, 0.6, "rotate", 0),
            (1.0, "equalize", 9, 0.6, "shearY", 3),
            (0.4, "color", 7, 0.6, "equalize", 0),
            (0.4, "posterize", 6, 0.4, "autocontrast", 7),
            (0.6, "solarize", 8, 0.6, "color", 9),
            (0.2, "solarize", 4, 0.8, "rotate", 9),
            (1.0, "rotate", 7, 0.8, "translateY", 9),
            (0.0, "shearX", 0, 0.8, "solarize", 4),
            (0.8, "shearY", 0, 0.6, "color", 4),
            (1.0, "color", 0, 0.6, "rotate", 2),
            (0.8, "equalize", 4, 0.0, "equalize", 8),
            (1.0, "equalize", 4, 0.6, "autocontrast", 2),
            (0.4, "shearY", 7, 0.6, "solarizeadd", 7),
            (0.8, "posterize", 2, 0.6, "solarize", 10),
            (0.6, "solarize", 8, 0.6, "equalize", 1),
            (0.8, "color", 6, 0.4, "rotate", 5),
        ]
        self.policies = [SubPolicy(*spec) for spec in specs]

    def __call__(self, img):
        """Apply one uniformly chosen sub-policy to `img`."""
        chosen = random.randint(0, len(self.policies) - 1)
        return self.policies[chosen](img)

    def __repr__(self):
        return "AutoAugment ImageNet Policy"
class SubPolicy(object):
    """A pair of image operations, each applied with its own probability."""

    def __init__(self, p1, method1, magnitude_idx1, p2, method2, magnitude_idx2):
        factory = OperationFactory()
        self.p1 = p1
        self.p2 = p2
        self.operation1 = factory.get_operation(method1, magnitude_idx1)
        self.operation2 = factory.get_operation(method2, magnitude_idx2)

    def __call__(self, img):
        """Apply operation1 then operation2, each with its probability."""
        # One random draw per operation, in order, matching AutoAugment.
        for prob, op in ((self.p1, self.operation1), (self.p2, self.operation2)):
            if random.random() < prob:
                img = op(img)
        return img
class OperationFactory:
    """Build magnitude-parameterized PIL image operations for AutoAugment.

    Each operation name maps to 11 discrete magnitude levels (index 0..10);
    `get_operation` binds one level and returns a one-argument callable
    that transforms a PIL image.
    """

    def __init__(self):
        fillcolor = (128, 128, 128)

        # 11 magnitude levels per operation, following the AutoAugment paper.
        # FIX: `np.int` was removed in NumPy 1.24 -- use the builtin `int`,
        # which is what the alias always meant.
        self.ranges = {
            "shearX": np.linspace(0, 0.3, 11),
            "shearY": np.linspace(0, 0.3, 11),
            "translateX": np.linspace(0, 250, 11),
            "translateY": np.linspace(0, 250, 11),
            "rotate": np.linspace(0, 30, 11),
            "color": np.linspace(0.1, 1.9, 11),
            "posterize": np.round(np.linspace(0, 4, 11), 0).astype(int),
            "solarize": np.linspace(0, 256, 11),
            "solarizeadd": np.linspace(0, 110, 11),
            "contrast": np.linspace(0.1, 1.9, 11),
            "sharpness": np.linspace(0.1, 1.9, 11),
            "brightness": np.linspace(0.1, 1.9, 11),
            # Parameterless ops still need an (ignored) magnitude table.
            "autocontrast": [0] * 10,
            "equalize": [0] * 10,
            "invert": [0] * 10
        }

        def rotate_with_fill(img, magnitude):
            # Rotate in a random direction, compositing onto a gray canvas so
            # the corners exposed by the rotation are filled instead of black.
            magnitude *= random.choice([-1, 1])
            rot = img.convert("RGBA").rotate(magnitude)
            return Image.composite(rot, Image.new("RGBA", rot.size, (128,) * 4), rot).convert(img.mode)

        def solarize_add(image, addition=0, threshold=128):
            # Add `addition` to every pixel below `threshold` (clamped to
            # [0, 255]); pixels at or above the threshold pass through.
            lut = []
            for i in range(256):
                if i < threshold:
                    res = i + addition if i + addition <= 255 else 255
                    res = res if res >= 0 else 0
                    lut.append(res)
                else:
                    lut.append(i)
            # NOTE(review): PIL.ImageOps._lut is private Pillow API and may
            # change between releases -- consider an image.point(...) based
            # replacement. Kept as-is to preserve behavior.
            from PIL.ImageOps import _lut
            return _lut(image, lut)

        # Magnitude-taking transforms; sign of shear/translate is randomized
        # at call time so each level covers both directions.
        self.operations = {
            "shearX": lambda img, magnitude: img.transform(
                img.size, Image.AFFINE, (1, magnitude * random.choice([-1, 1]), 0, 0, 1, 0),
                Image.BICUBIC, fillcolor=fillcolor),
            "shearY": lambda img, magnitude: img.transform(
                img.size, Image.AFFINE, (1, 0, 0, magnitude * random.choice([-1, 1]), 1, 0),
                Image.BICUBIC, fillcolor=fillcolor),
            "translateX": lambda img, magnitude: img.transform(
                img.size, Image.AFFINE, (1, 0, magnitude * random.choice([-1, 1]), 0, 1, 0),
                fillcolor=fillcolor),
            "translateY": lambda img, magnitude: img.transform(
                img.size, Image.AFFINE, (1, 0, 0, 0, 1, magnitude * random.choice([-1, 1])),
                fillcolor=fillcolor),
            "rotate": lambda img, magnitude: rotate_with_fill(img, magnitude),
            "color": lambda img, magnitude: ImageEnhance.Color(img).enhance(magnitude),
            "posterize": lambda img, magnitude: ImageOps.posterize(img, magnitude),
            "solarize": lambda img, magnitude: ImageOps.solarize(img, magnitude),
            "solarizeadd": lambda img, magnitude: solarize_add(img, magnitude),
            "contrast": lambda img, magnitude: ImageEnhance.Contrast(img).enhance(magnitude),
            "sharpness": lambda img, magnitude: ImageEnhance.Sharpness(img).enhance(magnitude),
            "brightness": lambda img, magnitude: ImageEnhance.Brightness(img).enhance(magnitude),
            "autocontrast": lambda img, _: ImageOps.autocontrast(img),
            "equalize": lambda img, _: ImageOps.equalize(img),
            "invert": lambda img, _: ImageOps.invert(img)
        }

    def get_operation(self, method, magnitude_idx):
        """Return a callable applying `method` at magnitude level `magnitude_idx`.

        Raises KeyError for an unknown method and IndexError for an
        out-of-range magnitude index.
        """
        magnitude = self.ranges[method][magnitude_idx]
        return lambda img: self.operations[method](img, magnitude)
# Copyright (c) 2018-2019, NVIDIA CORPORATION
# Copyright (c) 2017- Facebook, Inc
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import os
import torch
import numpy as np
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from PIL import Image
from functools import partial
from torchvision.transforms.functional import InterpolationMode
from image_classification.autoaugment import AutoaugmentImageNetPolicy
DATA_BACKEND_CHOICES = ["pytorch", "synthetic"]
#try:
# from nvidia.dali.plugin.pytorch import DALIClassificationIterator
# from nvidia.dali.pipeline import Pipeline
# import nvidia.dali.ops as ops
# import nvidia.dali.types as types
#
# DATA_BACKEND_CHOICES.append("dali-gpu")
# DATA_BACKEND_CHOICES.append("dali-cpu")
#except ImportError:
# print(
# "Please install DALI from https://www.github.com/NVIDIA/DALI to run this example."
# )
def load_jpeg_from_file(path, cuda=True):
    """Load one JPEG at `path` into a normalized (1, 3, 224, 224) float tensor.

    Resizes to 256, center-crops to 224, converts to [0, 1] floats, then
    applies ImageNet mean/std normalization. Moves everything to the GPU
    when `cuda` is True.
    """
    preprocess = transforms.Compose(
        [transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor()]
    )
    img = preprocess(Image.open(path))

    with torch.no_grad():
        # mean/std are in [0, 1] (not scaled by 255) because ToTensor already
        # produces floats in [0, 1], unlike the byte tensors used in training.
        mean = torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1)
        std = torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1)
        if cuda:
            mean, std, img = mean.cuda(), std.cuda(), img.cuda()
        batch = img.float().unsqueeze(0).sub_(mean).div_(std)

    return batch
#class HybridTrainPipe(Pipeline):
# def __init__(
# self,
# batch_size,
# num_threads,
# device_id,
# data_dir,
# interpolation,
# crop,
# dali_cpu=False,
# ):
# super(HybridTrainPipe, self).__init__(
# batch_size, num_threads, device_id, seed=12 + device_id
# )
# interpolation = {
# "bicubic": types.INTERP_CUBIC,
# "bilinear": types.INTERP_LINEAR,
# "triangular": types.INTERP_TRIANGULAR,
# }[interpolation]
# if torch.distributed.is_initialized():
# rank = torch.distributed.get_rank()
# world_size = torch.distributed.get_world_size()
# else:
# rank = 0
# world_size = 1
#
# self.input = ops.FileReader(
# file_root=data_dir,
# shard_id=rank,
# num_shards=world_size,
# random_shuffle=True,
# pad_last_batch=True,
# )
#
# if dali_cpu:
# dali_device = "cpu"
# self.decode = ops.ImageDecoder(device=dali_device, output_type=types.RGB)
# else:
# dali_device = "gpu"
# # This padding sets the size of the internal nvJPEG buffers to be able to handle all images from full-sized ImageNet
# # without additional reallocations
# self.decode = ops.ImageDecoder(
# device="mixed",
# output_type=types.RGB,
# device_memory_padding=211025920,
# host_memory_padding=140544512,
# )
#
# self.res = ops.RandomResizedCrop(
# device=dali_device,
# size=[crop, crop],
# interp_type=interpolation,
# random_aspect_ratio=[0.75, 4.0 / 3.0],
# random_area=[0.08, 1.0],
# num_attempts=100,
# antialias=False,
# )
#
# self.cmnp = ops.CropMirrorNormalize(
# device="gpu",
# dtype=types.FLOAT,
# output_layout=types.NCHW,
# crop=(crop, crop),
# mean=[0.485 * 255, 0.456 * 255, 0.406 * 255],
# std=[0.229 * 255, 0.224 * 255, 0.225 * 255],
# )
# self.coin = ops.CoinFlip(probability=0.5)
#
# def define_graph(self):
# rng = self.coin()
# self.jpegs, self.labels = self.input(name="Reader")
# images = self.decode(self.jpegs)
# images = self.res(images)
# output = self.cmnp(images.gpu(), mirror=rng)
# return [output, self.labels]
#class HybridValPipe(Pipeline):
# def __init__(
# self, batch_size, num_threads, device_id, data_dir, interpolation, crop, size
# ):
# super(HybridValPipe, self).__init__(
# batch_size, num_threads, device_id, seed=12 + device_id
# )
# interpolation = {
# "bicubic": types.INTERP_CUBIC,
# "bilinear": types.INTERP_LINEAR,
# "triangular": types.INTERP_TRIANGULAR,
# }[interpolation]
# if torch.distributed.is_initialized():
# rank = torch.distributed.get_rank()
# world_size = torch.distributed.get_world_size()
# else:
# rank = 0
# world_size = 1
#
# self.input = ops.FileReader(
# file_root=data_dir,
# shard_id=rank,
# num_shards=world_size,
# random_shuffle=False,
# pad_last_batch=True,
# )
#
# self.decode = ops.ImageDecoder(device="mixed", output_type=types.RGB)
# self.res = ops.Resize(
# device="gpu",
# resize_shorter=size,
# interp_type=interpolation,
# antialias=False,
# )
# self.cmnp = ops.CropMirrorNormalize(
# device="gpu",
# dtype=types.FLOAT,
# output_layout=types.NCHW,
# crop=(crop, crop),
# mean=[0.485 * 255, 0.456 * 255, 0.406 * 255],
# std=[0.229 * 255, 0.224 * 255, 0.225 * 255],
# )
#
# def define_graph(self):
# self.jpegs, self.labels = self.input(name="Reader")
# images = self.decode(self.jpegs)
# images = self.res(images)
# output = self.cmnp(images)
# return [output, self.labels]
#class DALIWrapper(object):
# def gen_wrapper(dalipipeline, num_classes, one_hot, memory_format):
# for data in dalipipeline:
# input = data[0]["data"].contiguous(memory_format=memory_format)
# target = torch.reshape(data[0]["label"], [-1]).cuda().long()
# if one_hot:
# target = expand(num_classes, torch.float, target)
# yield input, target
# dalipipeline.reset()
#
# def __init__(self, dalipipeline, num_classes, one_hot, memory_format):
# self.dalipipeline = dalipipeline
# self.num_classes = num_classes
# self.one_hot = one_hot
# self.memory_format = memory_format
#
# def __iter__(self):
# return DALIWrapper.gen_wrapper(
# self.dalipipeline, self.num_classes, self.one_hot, self.memory_format
# )
#def get_dali_train_loader(dali_cpu=False):
# def gdtl(
# data_path,
# image_size,
# batch_size,
# num_classes,
# one_hot,
# interpolation="bilinear",
# augmentation=None,
# start_epoch=0,
# workers=5,
# _worker_init_fn=None,
# memory_format=torch.contiguous_format,
# **kwargs,
# ):
# if torch.distributed.is_initialized():
# rank = torch.distributed.get_rank()
# world_size = torch.distributed.get_world_size()
# else:
# rank = 0
# world_size = 1
#
# traindir = os.path.join(data_path, "train")
# if augmentation is not None:
# raise NotImplementedError(
# f"Augmentation {augmentation} for dali loader is not supported"
# )
#
# pipe = HybridTrainPipe(
# batch_size=batch_size,
# num_threads=workers,
# device_id=rank % torch.cuda.device_count(),
# data_dir=traindir,
# interpolation=interpolation,
# crop=image_size,
# dali_cpu=dali_cpu,
# )
#
# pipe.build()
# train_loader = DALIClassificationIterator(
# pipe, reader_name="Reader", fill_last_batch=False
# )
#
# return (
# DALIWrapper(train_loader, num_classes, one_hot, memory_format),
# int(pipe.epoch_size("Reader") / (world_size * batch_size)),
# )
#
# return gdtl
#def get_dali_val_loader():
# def gdvl(
# data_path,
# image_size,
# batch_size,
# num_classes,
# one_hot,
# interpolation="bilinear",
# crop_padding=32,
# workers=5,
# _worker_init_fn=None,
# memory_format=torch.contiguous_format,
# **kwargs,
# ):
# if torch.distributed.is_initialized():
# rank = torch.distributed.get_rank()
# world_size = torch.distributed.get_world_size()
# else:
# rank = 0
# world_size = 1
#
# valdir = os.path.join(data_path, "val")
#
# pipe = HybridValPipe(
# batch_size=batch_size,
# num_threads=workers,
# device_id=rank % torch.cuda.device_count(),
# data_dir=valdir,
# interpolation=interpolation,
# crop=image_size,
# size=image_size + crop_padding,
# )
#
# pipe.build()
# val_loader = DALIClassificationIterator(
# pipe, reader_name="Reader", fill_last_batch=False
# )
#
# return (
# DALIWrapper(val_loader, num_classes, one_hot, memory_format),
# int(pipe.epoch_size("Reader") / (world_size * batch_size)),
# )
#
# return gdvl
def fast_collate(memory_format, batch):
    """Collate (PIL image, int label) samples into a uint8 NCHW batch.

    Returns (uint8 tensor of shape (N, 3, H, W), int64 label tensor).
    Normalization is deferred to the GPU (see PrefetchedWrapper).
    """
    images = [sample[0] for sample in batch]
    targets = torch.tensor([sample[1] for sample in batch], dtype=torch.int64)

    # PIL .size is (width, height); all images are assumed equally sized.
    width, height = images[0].size
    out = torch.zeros((len(images), 3, height, width), dtype=torch.uint8).contiguous(
        memory_format=memory_format
    )
    for idx, image in enumerate(images):
        array = np.asarray(image, dtype=np.uint8)
        if array.ndim < 3:
            # Grayscale image: append a channel axis so it broadcasts.
            array = np.expand_dims(array, axis=-1)
        # HWC -> CHW; copy() materializes the rolled view contiguously.
        array = np.rollaxis(array, 2)
        out[idx] += torch.from_numpy(array.copy())
    return out, targets
def expand(num_classes, dtype, tensor):
    """One-hot encode a 1-D label tensor into (N, num_classes) on the GPU."""
    one_hot = torch.zeros(
        tensor.size(0), num_classes, dtype=dtype, device=torch.device("cuda")
    )
    return one_hot.scatter(1, tensor.unsqueeze(1), 1.0)
class PrefetchedWrapper(object):
    """Iterable wrapper that prefetches and normalizes batches on the GPU.

    A side CUDA stream uploads and normalizes batch N+1 (uint8 -> float,
    mean/std, optional one-hot targets) while batch N is being consumed,
    hiding host-to-device copy latency.
    """

    # NOTE(review): invoked as PrefetchedWrapper.prefetched_loader(...) in
    # __iter__, so the missing @staticmethod decorator is harmless in Python 3.
    def prefetched_loader(loader, num_classes, one_hot):
        # Channel statistics scaled by 255 because fast_collate yields uint8
        # tensors with values in [0, 255], not [0, 1] floats.
        mean = (
            torch.tensor([0.485 * 255, 0.456 * 255, 0.406 * 255])
            .cuda()
            .view(1, 3, 1, 1)
        )
        std = (
            torch.tensor([0.229 * 255, 0.224 * 255, 0.225 * 255])
            .cuda()
            .view(1, 3, 1, 1)
        )

        stream = torch.cuda.Stream()
        first = True

        for next_input, next_target in loader:
            with torch.cuda.stream(stream):
                # Upload and normalize the *next* batch asynchronously on the
                # side stream while the caller still works on the current one.
                next_input = next_input.cuda(non_blocking=True)
                next_target = next_target.cuda(non_blocking=True)
                next_input = next_input.float()
                if one_hot:
                    next_target = expand(num_classes, torch.float, next_target)
                next_input = next_input.sub_(mean).div_(std)

            if not first:
                # Yield the batch prepared in the previous iteration.
                yield input, target
            else:
                first = False

            # Make the default stream wait for the side-stream work before the
            # consumer touches these tensors.
            torch.cuda.current_stream().wait_stream(stream)
            input = next_input
            target = next_target

        # Flush the last prepared batch.
        yield input, target

    def __init__(self, dataloader, start_epoch, num_classes, one_hot):
        self.dataloader = dataloader
        self.epoch = start_epoch  # advanced on every __iter__ (see below)
        self.one_hot = one_hot
        self.num_classes = num_classes

    def __iter__(self):
        # Re-seed the distributed sampler each epoch so shards reshuffle.
        if self.dataloader.sampler is not None and isinstance(
            self.dataloader.sampler, torch.utils.data.distributed.DistributedSampler
        ):
            self.dataloader.sampler.set_epoch(self.epoch)
        self.epoch += 1
        return PrefetchedWrapper.prefetched_loader(
            self.dataloader, self.num_classes, self.one_hot
        )

    def __len__(self):
        return len(self.dataloader)
def get_pytorch_train_loader(
    data_path,
    image_size,
    batch_size,
    num_classes,
    one_hot,
    interpolation="bilinear",
    augmentation=None,
    start_epoch=0,
    workers=5,
    _worker_init_fn=None,
    prefetch_factor=2,
    memory_format=torch.contiguous_format,
):
    """Build the training loader over `data_path`/train (ImageFolder layout).

    Returns (PrefetchedWrapper, batches_per_epoch). Batches come out of
    fast_collate as uint8 NCHW tensors; normalization happens on the GPU
    inside PrefetchedWrapper. `augmentation="autoaugment"` appends the
    AutoAugment ImageNet policy; other values are silently ignored.
    """
    interpolation = {
        "bicubic": InterpolationMode.BICUBIC,
        "bilinear": InterpolationMode.BILINEAR,
    }[interpolation]
    traindir = os.path.join(data_path, "train")
    transforms_list = [
        transforms.RandomResizedCrop(image_size, interpolation=interpolation),
        transforms.RandomHorizontalFlip(),
    ]
    if augmentation == "autoaugment":
        transforms_list.append(AutoaugmentImageNetPolicy())
    train_dataset = datasets.ImageFolder(traindir, transforms.Compose(transforms_list))
    if torch.distributed.is_initialized():
        # One shard per rank; reshuffled each epoch via sampler.set_epoch.
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset, shuffle=True
        )
    else:
        train_sampler = None
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        sampler=train_sampler,
        batch_size=batch_size,
        shuffle=(train_sampler is None),
        num_workers=workers,
        worker_init_fn=_worker_init_fn,
        pin_memory=True,  # needed for the wrapper's non_blocking GPU copies
        collate_fn=partial(fast_collate, memory_format),
        drop_last=True,
        persistent_workers=True,
        prefetch_factor=prefetch_factor,
    )
    return (
        PrefetchedWrapper(train_loader, start_epoch, num_classes, one_hot),
        len(train_loader),
    )
def get_pytorch_val_loader(
    data_path,
    image_size,
    batch_size,
    num_classes,
    one_hot,
    interpolation="bilinear",
    workers=5,
    _worker_init_fn=None,
    crop_padding=32,
    memory_format=torch.contiguous_format,
    prefetch_factor=2,
):
    """Build the validation loader over `data_path`/val (ImageFolder layout).

    Resizes the short edge to image_size + crop_padding, center-crops to
    image_size, and returns (PrefetchedWrapper, num_batches). Conversion to
    tensor and normalization happen in fast_collate / PrefetchedWrapper,
    which is why no ToTensor transform appears here.
    """
    interpolation = {
        "bicubic": InterpolationMode.BICUBIC,
        "bilinear": InterpolationMode.BILINEAR,
    }[interpolation]
    valdir = os.path.join(data_path, "val")
    val_dataset = datasets.ImageFolder(
        valdir,
        transforms.Compose(
            [
                transforms.Resize(
                    image_size + crop_padding, interpolation=interpolation
                ),
                transforms.CenterCrop(image_size),
            ]
        ),
    )
    if torch.distributed.is_initialized():
        # Deterministic sharding across ranks; no shuffling for validation.
        val_sampler = torch.utils.data.distributed.DistributedSampler(
            val_dataset, shuffle=False
        )
    else:
        val_sampler = None
    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        sampler=val_sampler,
        batch_size=batch_size,
        shuffle=(val_sampler is None),
        num_workers=workers,
        worker_init_fn=_worker_init_fn,
        pin_memory=True,  # needed for the wrapper's non_blocking GPU copies
        collate_fn=partial(fast_collate, memory_format),
        drop_last=False,
        persistent_workers=True,
        prefetch_factor=prefetch_factor,
    )
    return PrefetchedWrapper(val_loader, 0, num_classes, one_hot), len(val_loader)
class SynteticDataLoader(object):
    """Endlessly yields a single fixed random (input, target) batch on the GPU.

    Useful for benchmarking the training loop with no disk or dataloader
    overhead. (Class name spelling kept as-is for backward compatibility.)
    """

    def __init__(
        self,
        batch_size,
        num_classes,
        num_channels,
        height,
        width,
        one_hot,
        memory_format=torch.contiguous_format,
    ):
        data = (
            torch.randn(batch_size, num_channels, height, width)
            .contiguous(memory_format=memory_format)
            .cuda()
            .normal_(0, 1.0)
        )
        if one_hot:
            # Degenerate one-hot targets: every sample labeled class 0.
            target = torch.empty(batch_size, num_classes).cuda()
            target[:, 0] = 1.0
        else:
            target = torch.randint(0, num_classes, (batch_size,)).cuda()
        self.input_data = data
        self.input_target = target

    def __iter__(self):
        # Infinite iterator: the same batch is reused forever.
        while True:
            yield self.input_data, self.input_target
def get_synthetic_loader(
    data_path,
    image_size,
    batch_size,
    num_classes,
    one_hot,
    interpolation=None,
    augmentation=None,
    start_epoch=0,
    workers=None,
    _worker_init_fn=None,
    memory_format=torch.contiguous_format,
    **kwargs,
):
    """Return (SynteticDataLoader, -1); -1 marks an unbounded epoch length.

    Most parameters exist only to mirror the real loader factories'
    signatures and are ignored.
    """
    loader = SynteticDataLoader(
        batch_size,
        num_classes,
        3,  # RGB channel count
        image_size,
        image_size,
        one_hot,
        memory_format=memory_format,
    )
    return loader, -1
# Copyright (c) 2018-2019, NVIDIA CORPORATION
# Copyright (c) 2017- Facebook, Inc
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import os
import torch
import numpy as np
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from PIL import Image
from functools import partial
from torchvision.transforms.functional import InterpolationMode
from image_classification.autoaugment import AutoaugmentImageNetPolicy
DATA_BACKEND_CHOICES = ["pytorch", "synthetic"]
try:
from nvidia.dali.plugin.pytorch import DALIClassificationIterator
from nvidia.dali.pipeline import Pipeline
import nvidia.dali.ops as ops
import nvidia.dali.types as types
DATA_BACKEND_CHOICES.append("dali-gpu")
DATA_BACKEND_CHOICES.append("dali-cpu")
except ImportError:
print(
"Please install DALI from https://www.github.com/NVIDIA/DALI to run this example."
)
def load_jpeg_from_file(path, cuda=True):
    """Load one JPEG at `path` as a normalized (1, 3, 224, 224) float tensor.

    Resize-256 / center-crop-224 / ToTensor, then ImageNet mean-std
    normalization; moved to the GPU when `cuda` is True.
    NOTE(review): duplicate of an identical definition earlier in this file;
    the later definition wins at import time.
    """
    img_transforms = transforms.Compose(
        [transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor()]
    )
    # assumes a 3-channel RGB input; a grayscale JPEG would not match the
    # (1, 3, 1, 1) mean/std shapes below -- TODO confirm with callers
    img = img_transforms(Image.open(path))
    with torch.no_grad():
        # mean and std are not multiplied by 255 as they are in training script
        # torch dataloader reads data into bytes whereas loading directly
        # through PIL creates a tensor with floats in [0,1] range
        mean = torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1)
        std = torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1)
        if cuda:
            mean = mean.cuda()
            std = std.cuda()
            img = img.cuda()
        img = img.float()
        input = img.unsqueeze(0).sub_(mean).div_(std)
    return input
class HybridTrainPipe(Pipeline):
    """DALI training pipeline: sharded file reading, JPEG decode (GPU "mixed"
    or CPU), RandomResizedCrop, and fused crop/mirror/normalize.
    """

    def __init__(
        self,
        batch_size,
        num_threads,
        device_id,
        data_dir,
        interpolation,
        crop,
        dali_cpu=False,
    ):
        # Per-device seed so shards do not produce identical augmentations.
        super(HybridTrainPipe, self).__init__(
            batch_size, num_threads, device_id, seed=12 + device_id
        )
        # Map torchvision-style interpolation names to DALI enum values.
        interpolation = {
            "bicubic": types.INTERP_CUBIC,
            "bilinear": types.INTERP_LINEAR,
            "triangular": types.INTERP_TRIANGULAR,
        }[interpolation]
        if torch.distributed.is_initialized():
            rank = torch.distributed.get_rank()
            world_size = torch.distributed.get_world_size()
        else:
            rank = 0
            world_size = 1
        # One shard per rank; padded so every shard has equal length.
        self.input = ops.FileReader(
            file_root=data_dir,
            shard_id=rank,
            num_shards=world_size,
            random_shuffle=True,
            pad_last_batch=True,
        )
        if dali_cpu:
            dali_device = "cpu"
            self.decode = ops.ImageDecoder(device=dali_device, output_type=types.RGB)
        else:
            dali_device = "gpu"
            # This padding sets the size of the internal nvJPEG buffers to be able to handle all images from full-sized ImageNet
            # without additional reallocations
            self.decode = ops.ImageDecoder(
                device="mixed",
                output_type=types.RGB,
                device_memory_padding=211025920,
                host_memory_padding=140544512,
            )
        self.res = ops.RandomResizedCrop(
            device=dali_device,
            size=[crop, crop],
            interp_type=interpolation,
            random_aspect_ratio=[0.75, 4.0 / 3.0],
            random_area=[0.08, 1.0],
            num_attempts=100,
            antialias=False,
        )
        # Fused crop + random horizontal mirror + mean/std normalization,
        # emitting NCHW float output (stats scaled by 255 for byte input).
        self.cmnp = ops.CropMirrorNormalize(
            device="gpu",
            dtype=types.FLOAT,
            output_layout=types.NCHW,
            crop=(crop, crop),
            mean=[0.485 * 255, 0.456 * 255, 0.406 * 255],
            std=[0.229 * 255, 0.224 * 255, 0.225 * 255],
        )
        self.coin = ops.CoinFlip(probability=0.5)  # drives random mirroring

    def define_graph(self):
        """Wire reader -> decode -> random-resized-crop -> normalize."""
        rng = self.coin()
        self.jpegs, self.labels = self.input(name="Reader")
        images = self.decode(self.jpegs)
        images = self.res(images)
        output = self.cmnp(images.gpu(), mirror=rng)
        return [output, self.labels]
class HybridValPipe(Pipeline):
    """DALI validation pipeline: sequential sharded read, GPU decode,
    short-edge resize to `size`, and fused center-crop/normalize to `crop`.
    """

    def __init__(
        self, batch_size, num_threads, device_id, data_dir, interpolation, crop, size
    ):
        super(HybridValPipe, self).__init__(
            batch_size, num_threads, device_id, seed=12 + device_id
        )
        # Map torchvision-style interpolation names to DALI enum values.
        interpolation = {
            "bicubic": types.INTERP_CUBIC,
            "bilinear": types.INTERP_LINEAR,
            "triangular": types.INTERP_TRIANGULAR,
        }[interpolation]
        if torch.distributed.is_initialized():
            rank = torch.distributed.get_rank()
            world_size = torch.distributed.get_world_size()
        else:
            rank = 0
            world_size = 1
        # Deterministic order (no shuffle) for validation; shards padded equal.
        self.input = ops.FileReader(
            file_root=data_dir,
            shard_id=rank,
            num_shards=world_size,
            random_shuffle=False,
            pad_last_batch=True,
        )
        self.decode = ops.ImageDecoder(device="mixed", output_type=types.RGB)
        self.res = ops.Resize(
            device="gpu",
            resize_shorter=size,
            interp_type=interpolation,
            antialias=False,
        )
        # Center-crop + normalize, NCHW float output (stats scaled by 255).
        self.cmnp = ops.CropMirrorNormalize(
            device="gpu",
            dtype=types.FLOAT,
            output_layout=types.NCHW,
            crop=(crop, crop),
            mean=[0.485 * 255, 0.456 * 255, 0.406 * 255],
            std=[0.229 * 255, 0.224 * 255, 0.225 * 255],
        )

    def define_graph(self):
        """Wire reader -> decode -> resize -> crop/normalize."""
        self.jpegs, self.labels = self.input(name="Reader")
        images = self.decode(self.jpegs)
        images = self.res(images)
        output = self.cmnp(images)
        return [output, self.labels]
class DALIWrapper(object):
    """Adapts a DALIClassificationIterator to yield (input, target) pairs
    in the same format as the PyTorch loader path (optional one-hot targets).
    """

    # NOTE(review): invoked as DALIWrapper.gen_wrapper(...) in __iter__, so
    # the missing @staticmethod decorator is harmless in Python 3.
    def gen_wrapper(dalipipeline, num_classes, one_hot, memory_format):
        for data in dalipipeline:
            input = data[0]["data"].contiguous(memory_format=memory_format)
            target = torch.reshape(data[0]["label"], [-1]).cuda().long()
            if one_hot:
                target = expand(num_classes, torch.float, target)
            yield input, target
        # Re-arm the DALI iterator so the wrapper can be iterated again.
        dalipipeline.reset()

    def __init__(self, dalipipeline, num_classes, one_hot, memory_format):
        self.dalipipeline = dalipipeline
        self.num_classes = num_classes
        self.one_hot = one_hot
        self.memory_format = memory_format

    def __iter__(self):
        return DALIWrapper.gen_wrapper(
            self.dalipipeline, self.num_classes, self.one_hot, self.memory_format
        )
def get_dali_train_loader(dali_cpu=False):
    """Return a factory (`gdtl`) that builds the DALI training loader.

    The inner callable mirrors get_pytorch_train_loader's signature and
    returns (DALIWrapper, batches_per_epoch).
    """

    def gdtl(
        data_path,
        image_size,
        batch_size,
        num_classes,
        one_hot,
        interpolation="bilinear",
        augmentation=None,
        start_epoch=0,
        workers=5,
        _worker_init_fn=None,
        memory_format=torch.contiguous_format,
        **kwargs,
    ):
        if torch.distributed.is_initialized():
            rank = torch.distributed.get_rank()
            world_size = torch.distributed.get_world_size()
        else:
            rank = 0
            world_size = 1
        traindir = os.path.join(data_path, "train")
        # DALI path has no transform hooks, so extra augmentations are refused.
        if augmentation is not None:
            raise NotImplementedError(
                f"Augmentation {augmentation} for dali loader is not supported"
            )
        pipe = HybridTrainPipe(
            batch_size=batch_size,
            num_threads=workers,
            device_id=rank % torch.cuda.device_count(),
            data_dir=traindir,
            interpolation=interpolation,
            crop=image_size,
            dali_cpu=dali_cpu,
        )
        pipe.build()
        train_loader = DALIClassificationIterator(
            pipe, reader_name="Reader", fill_last_batch=False
        )
        # Shards are padded (pad_last_batch), so all ranks compute the
        # same per-epoch batch count.
        return (
            DALIWrapper(train_loader, num_classes, one_hot, memory_format),
            int(pipe.epoch_size("Reader") / (world_size * batch_size)),
        )

    return gdtl
def get_dali_val_loader():
    """Return a factory (`gdvl`) that builds the DALI validation loader.

    The inner callable mirrors get_pytorch_val_loader's signature and
    returns (DALIWrapper, batches_per_epoch).
    """

    def gdvl(
        data_path,
        image_size,
        batch_size,
        num_classes,
        one_hot,
        interpolation="bilinear",
        crop_padding=32,
        workers=5,
        _worker_init_fn=None,
        memory_format=torch.contiguous_format,
        **kwargs,
    ):
        if torch.distributed.is_initialized():
            rank = torch.distributed.get_rank()
            world_size = torch.distributed.get_world_size()
        else:
            rank = 0
            world_size = 1
        valdir = os.path.join(data_path, "val")
        pipe = HybridValPipe(
            batch_size=batch_size,
            num_threads=workers,
            device_id=rank % torch.cuda.device_count(),
            data_dir=valdir,
            interpolation=interpolation,
            crop=image_size,
            # Short edge resized with padding before the center crop.
            size=image_size + crop_padding,
        )
        pipe.build()
        val_loader = DALIClassificationIterator(
            pipe, reader_name="Reader", fill_last_batch=False
        )
        return (
            DALIWrapper(val_loader, num_classes, one_hot, memory_format),
            int(pipe.epoch_size("Reader") / (world_size * batch_size)),
        )

    return gdvl
def fast_collate(memory_format, batch):
    """Collate (PIL image, int label) samples into a uint8 NCHW batch.

    Returns (uint8 tensor (N, 3, H, W), int64 label tensor); normalization
    is deferred to the GPU in PrefetchedWrapper.
    NOTE(review): duplicate of an identical definition earlier in this file.
    """
    imgs = [img[0] for img in batch]
    targets = torch.tensor([target[1] for target in batch], dtype=torch.int64)
    # PIL .size is (width, height); all images assumed equally sized -- TODO confirm
    w = imgs[0].size[0]
    h = imgs[0].size[1]
    tensor = torch.zeros((len(imgs), 3, h, w), dtype=torch.uint8).contiguous(
        memory_format=memory_format
    )
    for i, img in enumerate(imgs):
        nump_array = np.asarray(img, dtype=np.uint8)
        if nump_array.ndim < 3:
            # Grayscale image: append a channel axis so it broadcasts below.
            nump_array = np.expand_dims(nump_array, axis=-1)
        # HWC -> CHW; copy() makes the rolled view contiguous for from_numpy.
        nump_array = np.rollaxis(nump_array, 2)
        tensor[i] += torch.from_numpy(nump_array.copy())
    return tensor, targets
def expand(num_classes, dtype, tensor):
    """One-hot encode a 1-D label tensor into (N, num_classes) on the GPU."""
    e = torch.zeros(
        tensor.size(0), num_classes, dtype=dtype, device=torch.device("cuda")
    )
    # Write 1.0 at each row's label column.
    e = e.scatter(1, tensor.unsqueeze(1), 1.0)
    return e
class PrefetchedWrapper(object):
    """Iterable wrapper that prefetches and normalizes batches on the GPU.

    A side CUDA stream uploads and normalizes batch N+1 (uint8 -> float,
    mean/std, optional one-hot targets) while batch N is being consumed.
    NOTE(review): duplicate of an identical class earlier in this file.
    """

    # NOTE(review): invoked as PrefetchedWrapper.prefetched_loader(...) in
    # __iter__, so the missing @staticmethod decorator is harmless in Python 3.
    def prefetched_loader(loader, num_classes, one_hot):
        # Channel statistics scaled by 255 because fast_collate yields uint8
        # tensors with values in [0, 255].
        mean = (
            torch.tensor([0.485 * 255, 0.456 * 255, 0.406 * 255])
            .cuda()
            .view(1, 3, 1, 1)
        )
        std = (
            torch.tensor([0.229 * 255, 0.224 * 255, 0.225 * 255])
            .cuda()
            .view(1, 3, 1, 1)
        )
        stream = torch.cuda.Stream()
        first = True
        for next_input, next_target in loader:
            with torch.cuda.stream(stream):
                # Prepare the *next* batch asynchronously on the side stream.
                next_input = next_input.cuda(non_blocking=True)
                next_target = next_target.cuda(non_blocking=True)
                next_input = next_input.float()
                if one_hot:
                    next_target = expand(num_classes, torch.float, next_target)
                next_input = next_input.sub_(mean).div_(std)
            if not first:
                # Yield the batch prepared in the previous iteration.
                yield input, target
            else:
                first = False
            # Default stream must wait for side-stream work before tensors
            # are handed to the consumer.
            torch.cuda.current_stream().wait_stream(stream)
            input = next_input
            target = next_target
        # Flush the last prepared batch.
        yield input, target

    def __init__(self, dataloader, start_epoch, num_classes, one_hot):
        self.dataloader = dataloader
        self.epoch = start_epoch  # advanced on every __iter__
        self.one_hot = one_hot
        self.num_classes = num_classes

    def __iter__(self):
        # Re-seed the distributed sampler each epoch so shards reshuffle.
        if self.dataloader.sampler is not None and isinstance(
            self.dataloader.sampler, torch.utils.data.distributed.DistributedSampler
        ):
            self.dataloader.sampler.set_epoch(self.epoch)
        self.epoch += 1
        return PrefetchedWrapper.prefetched_loader(
            self.dataloader, self.num_classes, self.one_hot
        )

    def __len__(self):
        return len(self.dataloader)
def get_pytorch_train_loader(
    data_path,
    image_size,
    batch_size,
    num_classes,
    one_hot,
    interpolation="bilinear",
    augmentation=None,
    start_epoch=0,
    workers=5,
    _worker_init_fn=None,
    prefetch_factor=2,
    memory_format=torch.contiguous_format,
):
    """Build the training DataLoader from <data_path>/train, GPU-prefetched.

    Returns:
        Tuple of (PrefetchedWrapper over the train loader, batch count).
    """
    interp_modes = {
        "bicubic": InterpolationMode.BICUBIC,
        "bilinear": InterpolationMode.BILINEAR,
    }
    interpolation = interp_modes[interpolation]

    # Random crop + flip, plus optional AutoAugment policy.
    augment = [
        transforms.RandomResizedCrop(image_size, interpolation=interpolation),
        transforms.RandomHorizontalFlip(),
    ]
    if augmentation == "autoaugment":
        augment.append(AutoaugmentImageNetPolicy())

    train_dataset = datasets.ImageFolder(
        os.path.join(data_path, "train"), transforms.Compose(augment)
    )

    train_sampler = None
    if torch.distributed.is_initialized():
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset, shuffle=True
        )

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        sampler=train_sampler,
        batch_size=batch_size,
        shuffle=(train_sampler is None),
        num_workers=workers,
        worker_init_fn=_worker_init_fn,
        pin_memory=True,
        collate_fn=partial(fast_collate, memory_format),
        drop_last=True,
        persistent_workers=True,
        prefetch_factor=prefetch_factor,
    )

    return (
        PrefetchedWrapper(train_loader, start_epoch, num_classes, one_hot),
        len(train_loader),
    )
def get_pytorch_val_loader(
    data_path,
    image_size,
    batch_size,
    num_classes,
    one_hot,
    interpolation="bilinear",
    workers=5,
    _worker_init_fn=None,
    crop_padding=32,
    memory_format=torch.contiguous_format,
    prefetch_factor=2,
):
    """Build the validation DataLoader from <data_path>/val, GPU-prefetched.

    Images are resized to image_size + crop_padding and center-cropped
    back to image_size.

    Returns:
        Tuple of (PrefetchedWrapper over the val loader, batch count).
    """
    interp_modes = {
        "bicubic": InterpolationMode.BICUBIC,
        "bilinear": InterpolationMode.BILINEAR,
    }
    interpolation = interp_modes[interpolation]

    eval_transform = transforms.Compose(
        [
            transforms.Resize(
                image_size + crop_padding, interpolation=interpolation
            ),
            transforms.CenterCrop(image_size),
        ]
    )
    val_dataset = datasets.ImageFolder(
        os.path.join(data_path, "val"), eval_transform
    )

    val_sampler = None
    if torch.distributed.is_initialized():
        val_sampler = torch.utils.data.distributed.DistributedSampler(
            val_dataset, shuffle=False
        )

    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        sampler=val_sampler,
        batch_size=batch_size,
        shuffle=(val_sampler is None),
        num_workers=workers,
        worker_init_fn=_worker_init_fn,
        pin_memory=True,
        collate_fn=partial(fast_collate, memory_format),
        drop_last=False,
        persistent_workers=True,
        prefetch_factor=prefetch_factor,
    )

    return PrefetchedWrapper(val_loader, 0, num_classes, one_hot), len(val_loader)
class SynteticDataLoader(object):
    """Endlessly yields one fixed random batch that lives on the GPU.

    Intended for benchmarking: removes data loading and host-to-device
    transfer from the measured pipeline entirely.
    """

    def __init__(
        self,
        batch_size,
        num_classes,
        num_channels,
        height,
        width,
        one_hot,
        memory_format=torch.contiguous_format,
    ):
        # One random image batch, laid out in the requested memory format.
        images = (
            torch.randn(batch_size, num_channels, height, width)
            .contiguous(memory_format=memory_format)
            .cuda()
            .normal_(0, 1.0)
        )
        if one_hot:
            # Constant one-hot targets: class 0 for every sample.
            labels = torch.empty(batch_size, num_classes).cuda()
            labels[:, 0] = 1.0
        else:
            labels = torch.randint(0, num_classes, (batch_size,))
            labels = labels.cuda()
        self.input_data = images
        self.input_target = labels

    def __iter__(self):
        while True:
            yield self.input_data, self.input_target
def get_synthetic_loader(
    data_path,
    image_size,
    batch_size,
    num_classes,
    one_hot,
    interpolation=None,
    augmentation=None,
    start_epoch=0,
    workers=None,
    _worker_init_fn=None,
    memory_format=torch.contiguous_format,
    **kwargs,
):
    """Return (SynteticDataLoader, -1), mirroring the real loader factories.

    Most parameters exist only for signature compatibility with
    get_pytorch_train_loader / get_pytorch_val_loader and are ignored.
    """
    loader = SynteticDataLoader(
        batch_size,
        num_classes,
        3,  # RGB channels
        image_size,
        image_size,
        one_hot,
        memory_format=memory_format,
    )
    # -1 stands in for the batch count, which is unbounded here.
    return loader, -1
import collections
import itertools
import os
import pathlib
import re
import pynvml
from typing import Union
class Device:
    """Thin pynvml wrapper exposing a GPU's name, UUID and CPU affinity."""

    # assume nvml returns list of 64 bit ints
    _nvml_bit_affinity = 64

    # Number of 64-bit words needed to cover every CPU on this host.
    _nvml_affinity_elements = (
        os.cpu_count() + _nvml_bit_affinity - 1
    ) // _nvml_bit_affinity

    def __init__(self, device_idx):
        super().__init__()
        self.handle = pynvml.nvmlDeviceGetHandleByIndex(device_idx)

    def get_name(self):
        return pynvml.nvmlDeviceGetName(self.handle)

    def get_uuid(self):
        return pynvml.nvmlDeviceGetUUID(self.handle)

    def get_cpu_affinity(self):
        """Return the logical CPU ids this GPU is affine to, ascending."""
        words = pynvml.nvmlDeviceGetCpuAffinity(
            self.handle, Device._nvml_affinity_elements
        )
        bits = ""
        for word in words:
            # Prepend so the most significant word ends up leftmost.
            bits = "{:064b}".format(word) + bits
        # Walk the mask right-to-left so index 0 corresponds to core 0.
        mask = [int(ch) for ch in reversed(bits)]
        return [core for core, flag in enumerate(mask) if flag]
def get_thread_siblings_list():
    """
    Returns a list of 2-element integer tuples representing pairs of
    hyperthreading cores.
    """
    path = "/sys/devices/system/cpu/cpu*/topology/thread_siblings_list"
    pattern = re.compile(r"(\d+)\D(\d+)")
    # Collect into a set so duplicate sibling files dedupe automatically.
    pairs = set()
    for fname in pathlib.Path(path[0]).glob(path[1:]):
        with open(fname) as f:
            content = f.read().strip()
        res = pattern.findall(content)
        if res:
            pairs.add(tuple(sorted(map(int, res[0]))))
    return list(pairs)
def build_thread_siblings_dict(siblings_list):
    """Map every core id to the hyperthread-sibling tuple it belongs to."""
    return {
        core: siblings
        for siblings in siblings_list
        for core in siblings
    }
def group_list_by_dict(affinity, siblings_dict):
    """Group logical cores by their hyperthread-sibling tuple.

    Cores not present in siblings_dict form their own singleton group.
    Returns a list of tuples of core ids.
    """
    sibling_key = lambda core: siblings_dict.get(core, (core,))
    ordered = sorted(affinity, key=sibling_key)
    return [
        tuple(members)
        for _, members in itertools.groupby(ordered, key=sibling_key)
    ]
def group_affinity_by_siblings(socket_affinities):
    """Apply hyperthread-sibling grouping to each per-GPU affinity list."""
    siblings_dict = build_thread_siblings_dict(get_thread_siblings_list())
    return [
        group_list_by_dict(socket_affinity, siblings_dict)
        for socket_affinity in socket_affinities
    ]
def ungroup_affinities(affinities, cores):
    """Flatten sibling-grouped affinities according to the cores mode.

    Args:
        affinities: list (one entry per GPU) of lists of sibling tuples.
        cores: 'all_logical' keeps every hyperthread sibling;
            'single_logical' keeps only the first logical core per group.

    Returns:
        List (one entry per GPU) of flat lists of logical core ids.

    Raises:
        RuntimeError: if cores is not a recognized mode.
    """
    # Validate up front: previously an unknown mode slipped through silently
    # whenever affinities was empty.
    if cores == "all_logical":
        return [list(itertools.chain(*groups)) for groups in affinities]
    if cores == "single_logical":
        return [[group[0] for group in groups] for groups in affinities]
    raise RuntimeError("Unknown cores mode")
def check_socket_affinities(socket_affinities):
    """Validate that per-GPU core sets are either identical or disjoint.

    Args:
        socket_affinities: list of lists of logical core ids, one per GPU.

    Raises:
        RuntimeError: if any two affinities partially overlap.
    """
    # Build each set once; the original rebuilt set(i)/set(j) inside an
    # itertools.product loop, testing every pair twice plus self-pairs.
    core_sets = [set(affinity) for affinity in socket_affinities]
    tagged = zip(socket_affinities, core_sets)
    for (list_i, set_i), (list_j, set_j) in itertools.combinations(tagged, 2):
        if set_i != set_j and not set_i.isdisjoint(set_j):
            raise RuntimeError(
                f"Sets of cores should be either identical or disjoint, "
                f"but got {list_i} and {list_j}."
            )
def get_socket_affinities(nproc_per_node, exclude_unavailable_cores=True):
    """Query per-GPU CPU affinity lists from NVML, one list per local GPU.

    When exclude_unavailable_cores is True, cores outside the current
    process's scheduling mask are dropped.
    """
    socket_affinities = [
        Device(idx).get_cpu_affinity() for idx in range(nproc_per_node)
    ]
    if exclude_unavailable_cores:
        usable = os.sched_getaffinity(0)
        socket_affinities = [
            list(set(affinity) & usable) for affinity in socket_affinities
        ]
    check_socket_affinities(socket_affinities)
    return socket_affinities
def get_grouped_socket_affinities(nproc_per_node, exclude_unavailable_cores=True):
    """Return per-GPU affinities grouped into hyperthread-sibling tuples."""
    return group_affinity_by_siblings(
        get_socket_affinities(nproc_per_node, exclude_unavailable_cores)
    )
def set_socket_affinity(gpu_id, nproc_per_node, cores):
    """
    The process is assigned with all available physical CPU cores from the CPU
    socket connected to the GPU with a given id.
    Args:
        gpu_id: index of a GPU
        nproc_per_node: number of processes per node
        cores: 'all_logical' or 'single_logical'
    """
    grouped = get_grouped_socket_affinities(nproc_per_node)
    per_gpu_cores = ungroup_affinities(grouped, cores)
    os.sched_setaffinity(0, per_gpu_cores[gpu_id])
def set_socket_single_affinity(gpu_id, nproc_per_node, cores):
    """
    The process is assigned with the first available physical CPU core from the
    list of all CPU physical cores from the CPU socket connected to the GPU with
    a given id.
    Args:
        gpu_id: index of a GPU
        nproc_per_node: number of processes per node
        cores: 'all_logical' or 'single_logical'
    """
    grouped = get_grouped_socket_affinities(nproc_per_node)
    # Keep only the first sibling group from each socket's core list.
    first_groups = [groups[:1] for groups in grouped]
    per_gpu_cores = ungroup_affinities(first_groups, cores)
    os.sched_setaffinity(0, per_gpu_cores[gpu_id])
def set_socket_single_unique_affinity(gpu_id, nproc_per_node, cores):
    """
    The process is assigned with a single unique available physical CPU core
    from the list of all CPU cores from the CPU socket connected to the GPU with
    a given id.
    Args:
        gpu_id: index of a GPU
        nproc_per_node: number of processes per node
        cores: 'all_logical' or 'single_logical'
    """
    grouped = get_grouped_socket_affinities(nproc_per_node)
    # Greedily hand each GPU the first sibling group nobody has claimed yet.
    per_gpu_groups = []
    taken = set()
    for groups in grouped:
        for group in groups:
            if group not in taken:
                per_gpu_groups.append([group])
                taken.add(group)
                break
    per_gpu_cores = ungroup_affinities(per_gpu_groups, cores)
    os.sched_setaffinity(0, per_gpu_cores[gpu_id])
def set_socket_unique_affinity(gpu_id, nproc_per_node, cores, mode, balanced=True):
    """
    The process is assigned with a unique subset of available physical CPU
    cores from the CPU socket connected to a GPU with a given id.
    Assignment automatically includes hyperthreading siblings (if siblings are
    available).
    Args:
        gpu_id: index of a GPU
        nproc_per_node: number of processes per node
        cores: 'all_logical' or 'single_logical'
        mode: 'contiguous' or 'interleaved'
        balanced: assign an equal number of physical cores to each process,
    """
    grouped_socket_affinities = get_grouped_socket_affinities(nproc_per_node)
    # Bucket GPU indices by their (grouped) socket affinity, so GPUs that
    # share a socket split that socket's sibling groups among themselves.
    grouped_socket_affinities_to_device_ids = collections.defaultdict(list)
    for idx, grouped_socket_affinity in enumerate(grouped_socket_affinities):
        grouped_socket_affinities_to_device_ids[tuple(grouped_socket_affinity)].append(
            idx
        )
    # compute minimal number of physical cores per GPU across all GPUs and
    # sockets, code assigns this number of cores per GPU if balanced == True
    # NOTE: inside this comprehension 'cores' is the loop variable (a tuple
    # of sibling groups) and shadows the 'cores' parameter.
    min_physical_cores_per_gpu = min(
        [
            len(cores) // len(gpus)
            for cores, gpus in grouped_socket_affinities_to_device_ids.items()
        ]
    )
    grouped_unique_affinities = [None] * nproc_per_node
    for (
        grouped_socket_affinity,
        device_ids,
    ) in grouped_socket_affinities_to_device_ids.items():
        devices_per_group = len(device_ids)
        if balanced:
            # Trim the socket's group list so each GPU receives exactly
            # min_physical_cores_per_gpu sibling groups.
            cores_per_device = min_physical_cores_per_gpu
            grouped_socket_affinity = grouped_socket_affinity[
                : devices_per_group * min_physical_cores_per_gpu
            ]
        else:
            cores_per_device = len(grouped_socket_affinity) // devices_per_group
        for socket_subgroup_id, device_id in enumerate(device_ids):
            # In theory there should be no difference in performance between
            # 'interleaved' and 'contiguous' pattern on Intel-based DGX-1,
            # but 'contiguous' should be better for DGX A100 because on AMD
            # Rome 4 consecutive cores are sharing L3 cache.
            # TODO: code doesn't attempt to automatically detect layout of
            # L3 cache, also external environment may already exclude some
            # cores, this code makes no attempt to detect it and to align
            # mapping to multiples of 4.
            if mode == "interleaved":
                # Every devices_per_group-th sibling group, offset by rank.
                unique_grouped_affinity = list(
                    grouped_socket_affinity[socket_subgroup_id::devices_per_group]
                )
            elif mode == "contiguous":
                # A contiguous slice of cores_per_device sibling groups.
                unique_grouped_affinity = list(
                    grouped_socket_affinity[
                        socket_subgroup_id
                        * cores_per_device : (socket_subgroup_id + 1)
                        * cores_per_device
                    ]
                )
            else:
                raise RuntimeError("Unknown set_socket_unique_affinity mode")
            grouped_unique_affinities[device_id] = unique_grouped_affinity
    ungrouped_affinities = ungroup_affinities(grouped_unique_affinities, cores)
    os.sched_setaffinity(0, ungrouped_affinities[gpu_id])
from enum import Enum, auto
class AffinityMode(Enum):
    """CPU-pinning strategies accepted by set_affinity (see its docstring)."""

    none = auto()                       # leave the OS scheduling mask untouched
    socket = auto()                     # all cores of the GPU's socket
    socket_single = auto()              # first socket core (may be shared)
    socket_single_unique = auto()       # one core, unique per GPU
    socket_unique_interleaved = auto()  # unique subset, interleaved layout
    socket_unique_contiguous = auto()   # unique subset, contiguous layout
def set_affinity(
    gpu_id,
    nproc_per_node=None,
    *,
    mode: Union[str, AffinityMode] = AffinityMode.socket_unique_contiguous,
    cores="all_logical",
    balanced=True,
):
    """
    The process is assigned with a proper CPU affinity that matches CPU-GPU
    hardware architecture on a given platform. Usually, it improves and
    stabilizes the performance of deep learning training workloads.
    This function assumes that the workload runs in multi-process single-device
    mode (there are multiple training processes, and each process is running on
    a single GPU). This is typical for multi-GPU data-parallel training
    workloads (e.g., using `torch.nn.parallel.DistributedDataParallel`).
    Available affinity modes:
    * 'socket' - the process is assigned with all available physical CPU cores
    from the CPU socket connected to the GPU with a given id.
    * 'socket_single' - the process is assigned with the first available
    physical CPU core from the list of all CPU cores from the CPU socket
    connected to the GPU with a given id (multiple GPUs could be assigned with
    the same CPU core).
    * 'socket_single_unique' - the process is assigned with a single unique
    available physical CPU core from the list of all CPU cores from the CPU
    socket connected to the GPU with a given id.
    * 'socket_unique_interleaved' - the process is assigned with a unique
    subset of available physical CPU cores from the CPU socket connected to a
    GPU with a given id, cores are assigned with interleaved indexing pattern
    * 'socket_unique_contiguous' - (the default) the process is assigned with a
    unique subset of available physical CPU cores from the CPU socket connected
    to a GPU with a given id, cores are assigned with contiguous indexing
    pattern
    Available "cores" modes:
    * 'all_logical' - assigns the process with all logical cores associated with
    a given corresponding physical core (i.e., automatically includes all
    available hyperthreading siblings)
    * 'single_logical' - assigns the process with only one logical core
    associated with a given corresponding physical core (i.e., excludes
    hyperthreading siblings)
    'socket_unique_contiguous' is the recommended mode for deep learning
    training workloads on NVIDIA DGX machines.
    Args:
        gpu_id: integer index of a GPU, value from 0 to 'nproc_per_node' - 1
        nproc_per_node: number of processes per node
        mode: affinity mode
        balanced: assign an equal number of physical cores to each process,
            affects only 'socket_unique_interleaved' and
            'socket_unique_contiguous' affinity modes
        cores: 'all_logical' or 'single_logical'
    Returns a set of logical CPU cores on which the process is eligible to run.
    Example:
    import argparse
    import os
    import gpu_affinity
    import torch
    def main():
        parser = argparse.ArgumentParser()
        parser.add_argument(
            '--local_rank',
            type=int,
            default=os.getenv('LOCAL_RANK', 0),
        )
        args = parser.parse_args()
        nproc_per_node = torch.cuda.device_count()
        affinity = gpu_affinity.set_affinity(args.local_rank, nproc_per_node)
        print(f'{args.local_rank}: core affinity: {affinity}')
    if __name__ == "__main__":
        main()
    Launch the example with:
    python -m torch.distributed.launch --nproc_per_node <#GPUs> example.py
    WARNING: On DGX A100, only half of the CPU cores have direct access to GPUs.
    This function restricts execution only to the CPU cores directly connected
    to GPUs, so on DGX A100, it will limit the code to half of the CPU cores and
    half of CPU memory bandwidth (which may be fine for many DL models).
    WARNING: Intel's OpenMP implementation resets affinity on the first call to
    an OpenMP function after a fork. It's recommended to run with env variable:
    `KMP_AFFINITY=disabled` if the affinity set by gpu_affinity should be
    preserved after a fork (e.g. in PyTorch DataLoader workers).
    """
    # Accept either an AffinityMode member or its string name.
    if not isinstance(mode, AffinityMode):
        mode = AffinityMode[mode]
    # NOTE(review): the NVML initialization below is commented out, so the
    # caller must have called pynvml.nvmlInit() before any Device handles are
    # created, and nproc_per_node no longer falls back to the detected GPU
    # count -- passing nproc_per_node=None will fail inside the set_socket_*
    # helpers for every mode except 'none'. Confirm the caller initializes
    # NVML and always supplies nproc_per_node.
    #pynvml.nvmlInit()
    #if nproc_per_node is None:
    #    nproc_per_node = pynvml.nvmlDeviceGetCount()
    if mode == AffinityMode.none:
        pass
    elif mode == AffinityMode.socket:
        set_socket_affinity(gpu_id, nproc_per_node, cores)
    elif mode == AffinityMode.socket_single:
        set_socket_single_affinity(gpu_id, nproc_per_node, cores)
    elif mode == AffinityMode.socket_single_unique:
        set_socket_single_unique_affinity(gpu_id, nproc_per_node, cores)
    elif mode == AffinityMode.socket_unique_interleaved:
        set_socket_unique_affinity(
            gpu_id, nproc_per_node, cores, "interleaved", balanced
        )
    elif mode == AffinityMode.socket_unique_contiguous:
        set_socket_unique_affinity(
            gpu_id, nproc_per_node, cores, "contiguous", balanced
        )
    else:
        raise RuntimeError("Unknown affinity mode")
    # Report the mask that is actually in effect after (optional) pinning.
    affinity = os.sched_getaffinity(0)
    return affinity
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment