**distributed_data_parallel.py** and **run.sh** show an example using `FP16_Optimizer` with
`apex.parallel.DistributedDataParallel` in conjunction with the legacy Apex
launcher script, `apex.parallel.multiproc`. See
[FP16_Optimizer_simple/distributed_apex](https://github.com/NVIDIA/apex/tree/torch_launcher/examples/FP16_Optimizer_simple/distributed_apex) for a more up-to-date example that uses the PyTorch launcher
script, `torch.distributed.launch`.
The usage of `FP16_Optimizer` with distributed training does not need to change from ordinary
single-process usage. To use the legacy launcher, run `python -m apex.parallel.multiproc main.py args...`,
adding any `args...` you like. The launch script `apex.parallel.multiproc` will
spawn one process for each of your system's available (visible) GPUs.
Each process will run `python main.py args... --world-size <worldsize> --rank <rank>`
(the `--world-size` and `--rank` arguments are determined and appended by `apex.parallel.multiproc`).
Each `main.py` calls `torch.cuda.set_device()` and `torch.distributed.init_process_group()`
according to the `rank` and `world-size` arguments it receives.
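The following minimal sketch shows what each process does with those arguments; the `init_method` URL here is an assumption for illustration, and the real `main.py` may configure it differently:

```python
import argparse

import torch
import torch.distributed as dist

parser = argparse.ArgumentParser()
# --world-size and --rank are appended automatically by apex.parallel.multiproc.
parser.add_argument('--world-size', type=int, default=1)
parser.add_argument('--rank', type=int, default=0)
args = parser.parse_args()

# Bind this process to its GPU, then join the process group.
torch.cuda.set_device(args.rank)
dist.init_process_group(backend='nccl',
                        init_method='tcp://127.0.0.1:23456',  # assumed; see main.py for the actual value
                        world_size=args.world_size,
                        rank=args.rank)
```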
The number of visible GPU devices (and therefore the number of processes
`apex.parallel.multiproc` will spawn) can be controlled by setting the environment variable
`CUDA_VISIBLE_DEVICES`. For example, if you `export CUDA_VISIBLE_DEVICES=0,1` and run
```python -m apex.parallel.multiproc main.py ...```, the launch utility will spawn two processes
which will run on devices 0 and 1. By default, if `CUDA_VISIBLE_DEVICES` is unset,
`apex.parallel.multiproc` will attempt to use every device on the node.
`torch.distributed.launch` spawns `N` processes, each of which runs as
`python main.py args... --local_rank <rank>`.
The `local_rank` argument for each process is determined and appended by `torch.distributed.launch`,
and ranges from 0 to `N-1`. `torch.distributed.launch` also sets, for each process, the
environment variables needed for initialization.
Internally, each process calls `set_device` according to its local
rank and `init_process_group` with `init_method='env://'` to ingest the provided environment
variables.
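A minimal sketch of that pattern, assuming `nccl` as the backend:

```python
import argparse

import torch
import torch.distributed as dist

parser = argparse.ArgumentParser()
# --local_rank is appended automatically by torch.distributed.launch.
parser.add_argument('--local_rank', type=int, default=0)
args = parser.parse_args()

torch.cuda.set_device(args.local_rank)
# 'env://' reads MASTER_ADDR, MASTER_PORT, RANK, and WORLD_SIZE from the
# environment; torch.distributed.launch sets all of them for each process.
dist.init_process_group(backend='nccl', init_method='env://')
```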
For best performance, set `N` equal to the number of visible CUDA devices on the node.
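For example, on a node with two GPUs you would run
```python -m torch.distributed.launch --nproc_per_node=2 main.py args...```.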
## Converting your own model
To understand how to convert your own model, please see all sections of `main.py` between the ```#=====START: ADDED FOR DISTRIBUTED======``` and ```#=====END: ADDED FOR DISTRIBUTED======``` flags, and the rough sketch below.
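As a rough sketch, those flagged additions amount to the following wrapping steps (the model and hyperparameters below are placeholders, and the process group is assumed to be initialized already):

```python
import torch
from apex.parallel import DistributedDataParallel as DDP
from apex.fp16_utils import FP16_Optimizer

# Placeholder network; main.py performs the equivalent steps on the real model.
model = torch.nn.Linear(1024, 1024).cuda().half()
model = DDP(model)  # apex DDP reads world size and rank from the process group

optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.0)

# In the training loop, FP16_Optimizer replaces loss.backward():
#     optimizer.backward(loss)
#     optimizer.step()
```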
## Requirements
PyTorch with NCCL available as a distributed backend. PyTorch 0.4+, installed as a pip or conda package, should include this by default. Otherwise, you can build PyTorch from source in an environment where NCCL is installed and visible.
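On recent PyTorch versions you can verify NCCL support directly (this helper may not exist on very old builds):

```python
import torch.distributed as dist

# Prints True if this PyTorch build includes the NCCL backend.
print(dist.is_nccl_available())
```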
This example is based on [https://github.com/pytorch/examples/tree/master/imagenet](https://github.com/pytorch/examples/tree/master/imagenet).
It implements training of popular model architectures, such as ResNet, AlexNet, and VGG on the ImageNet dataset.
`main.py` with the `--fp16` argument demonstrates mixed precision training with manual management of master parameters and loss scaling.
`main_fp16_optimizer.py` with `--fp16` demonstrates use of `apex.fp16_utils.FP16_Optimizer` to automatically manage master parameters and loss scaling.
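For reference, the manual approach in `main.py --fp16` boils down to keeping FP32 master copies of the FP16 parameters, scaling the loss, and shuttling gradients and weights between the two copies. A self-contained sketch (all names and values are illustrative, not copied from `main.py`; requires a CUDA device):

```python
import torch

model = torch.nn.Linear(64, 64).cuda().half()
inputs = torch.randn(8, 64, device='cuda', dtype=torch.half)

# FP32 master copies of the FP16 model parameters; the optimizer updates these.
master_params = [p.detach().clone().float().requires_grad_() for p in model.parameters()]
optimizer = torch.optim.SGD(master_params, lr=0.1)

loss_scale = 128.0
loss = model(inputs).float().pow(2).mean()
(loss * loss_scale).backward()  # scale the loss so small gradients survive FP16

# Copy the scaled FP16 gradients into the FP32 master params, then unscale.
for master, p in zip(master_params, model.parameters()):
    master.grad = p.grad.detach().float()
    master.grad.mul_(1.0 / loss_scale)
optimizer.step()

# Copy the updated FP32 master weights back into the FP16 model.
with torch.no_grad():
    for master, p in zip(master_params, model.parameters()):
        p.copy_(master)
model.zero_grad()
```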
The directory at `/path/to/imagenet/directory` should contain two subdirectories, `train`
and `val`, holding the training and validation data respectively. Training images are expected to be 256x256 JPEGs.
**Example commands:** (note: batch size `--b 256` assumes your GPUs have >=16GB of onboard memory)
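An illustrative single-node invocation (flag spellings are assumptions based on the notes above; run `python main.py --help` for the authoritative list):
```python main.py -a resnet50 --fp16 --b 256 /path/to/imagenet/directory```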
## Distributed training
`main.py` and `main_fp16_optimizer.py` have been modified to use the `DistributedDataParallel` module in Apex instead of the one in upstream PyTorch. `apex.parallel.DistributedDataParallel`
is a drop-in replacement for `torch.nn.parallel.DistributedDataParallel` (see our [distributed example](https://github.com/NVIDIA/apex/tree/master/examples/distributed)).
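An illustrative two-GPU launch of this example (flags other than `--nproc_per_node` are assumptions, as above):
```python -m torch.distributed.launch --nproc_per_node=2 main.py -a resnet50 --fp16 --b 256 /path/to/imagenet/directory```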