"...text-generation-inference.git" did not exist on "8b182eb98662ea781990a4a2e869eb8859e26073"
Commit aa817132 authored by Michael Carilli's avatar Michael Carilli
Browse files

Ported examples to use torch.distributed.launch

parent 12b49b98
-**distributed_data_parallel.py** shows an example using `FP16_Optimizer` with
-`apex.parallel.DistributedDataParallel`.
+**distributed_data_parallel.py** and **run.sh** show an example using `FP16_Optimizer` with
+`apex.parallel.DistributedDataParallel` and the Pytorch multiprocess launcher script,
+[torch.distributed.launch](https://pytorch.org/docs/master/distributed.html#launch-utility).
 The usage of `FP16_Optimizer` with distributed does not need to change from ordinary
 single-process usage. Test via
 ```bash
......
@@ -5,20 +5,12 @@ from apex.parallel import DistributedDataParallel as DDP
 from apex.fp16_utils import FP16_Optimizer
 parser = argparse.ArgumentParser()
-parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
-                    help='url used to set up distributed training')
-parser.add_argument('--world-size', default=2, type=int,
-                    help='Number of distributed processes.')
-parser.add_argument("--rank", type=int,
-                    help='Rank of this process')
+parser.add_argument("--local_rank", default=0, type=int)
 args = parser.parse_args()
-torch.cuda.set_device(args.rank)
+torch.cuda.set_device(args.local_rank)
 torch.distributed.init_process_group(backend='nccl',
-                                     init_method=args.dist_url,
-                                     world_size=args.world_size,
-                                     rank=args.rank)
+                                     init_method='env://')
 torch.backends.cudnn.benchmark = True
......
 #!/bin/bash
-# By default, apex.parallel.multiproc will attempt to use all available GPUs on the system.
-# The number of GPUs to use can be limited by setting CUDA_VISIBLE_DEVICES:
-export CUDA_VISIBLE_DEVICES=0,1
-python -m apex.parallel.multiproc distributed_data_parallel.py
+python -m torch.distributed.launch --nproc_per_node=2 distributed_data_parallel.py
**distributed_data_parallel.py** and **run.sh** show an example using `FP16_Optimizer` with
`apex.parallel.DistributedDataParallel` in conjunction with the legacy Apex
launcher script, `apex.parallel.multiproc`. See
[FP16_Optimizer_simple/distributed_apex](https://github.com/NVIDIA/apex/tree/torch_launcher/examples/FP16_Optimizer_simple/distributed_apex) for a more up-to-date example that uses the Pytorch launcher
script, `torch.distributed.launch`.
The usage of `FP16_Optimizer` with distributed does not need to change from ordinary
single-process usage. Test via
```bash
bash run.sh
```
import torch
from torch.autograd import Variable
import argparse
from apex.parallel import DistributedDataParallel as DDP
from apex.fp16_utils import FP16_Optimizer

parser = argparse.ArgumentParser()
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
                    help='url used to set up distributed training')
parser.add_argument('--world-size', default=2, type=int,
                    help='Number of distributed processes.')
parser.add_argument("--rank", type=int,
                    help='Rank of this process')
args = parser.parse_args()

torch.cuda.set_device(args.rank)
torch.distributed.init_process_group(backend='nccl',
                                     init_method=args.dist_url,
                                     world_size=args.world_size,
                                     rank=args.rank)
torch.backends.cudnn.benchmark = True

N, D_in, D_out = 64, 1024, 16
x = Variable(torch.cuda.FloatTensor(N, D_in ).normal_()).half()
y = Variable(torch.cuda.FloatTensor(N, D_out).normal_()).half()

model = torch.nn.Linear(D_in, D_out).cuda().half()
model = DDP(model)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
### Construct FP16_Optimizer ###
optimizer = FP16_Optimizer(optimizer)
###

loss_fn = torch.nn.MSELoss()

for t in range(500):
    optimizer.zero_grad()
    y_pred = model(x)
    loss = loss_fn(y_pred.float(), y.float())
    ### Change loss.backward() to: ###
    optimizer.backward(loss)
    ###
    optimizer.step()

print("final loss = ", loss)
#!/bin/bash
# By default, apex.parallel.multiproc will attempt to use all available GPUs on the system.
# The number of GPUs to use can be limited by setting CUDA_VISIBLE_DEVICES:
export CUDA_VISIBLE_DEVICES=0,1
python -m apex.parallel.multiproc distributed_data_parallel.py
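For reference, `apex.parallel.multiproc` spawns one copy of the script per visible GPU and appends the `--world-size` and `--rank` arguments for each process. A rough manual equivalent of run.sh (a sketch only; rendezvous relies on the script's default `--dist-url`):
```bash
# Approximate manual equivalent of: python -m apex.parallel.multiproc distributed_data_parallel.py
export CUDA_VISIBLE_DEVICES=0,1
python distributed_data_parallel.py --world-size 2 --rank 0 &
python distributed_data_parallel.py --world-size 2 --rank 1 &
wait
```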
-**distributed_data_parallel.py** shows an example using `FP16_Optimizer` with
-`torch.nn.parallel.DistributedDataParallel`.
+**distributed_data_parallel.py** and **run.sh** show an example using `FP16_Optimizer` with
+`torch.nn.parallel.DistributedDataParallel` and the Pytorch multiprocess launcher script,
+[torch.distributed.launch](https://pytorch.org/docs/master/distributed.html#launch-utility).
 The usage of `FP16_Optimizer` with distributed does not need to change from ordinary
 single-process usage. Test via
 ```bash
......
@@ -4,7 +4,7 @@ import argparse
 from apex.fp16_utils import FP16_Optimizer
 parser = argparse.ArgumentParser()
-parser.add_argument("--local_rank", type=int)
+parser.add_argument("--local_rank", default=0, type=int)
 args = parser.parse_args()
 torch.cuda.set_device(args.local_rank)
......
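The wrapping of the model with the upstream `torch.nn.parallel.DistributedDataParallel` sits in the collapsed part of this file. For orientation, a typical one-GPU-per-process construction under `torch.distributed.launch` looks like the sketch below; it mirrors the argument parsing and initialization shown above, but the exact wrapper arguments are an assumption, not copied from this commit.
```python
# Sketch: minimal per-process setup with the upstream PyTorch DDP wrapper.
# Assumes launch via torch.distributed.launch, which supplies --local_rank and
# the env:// rendezvous variables (MASTER_ADDR, MASTER_PORT, RANK, WORLD_SIZE).
import argparse
import torch

parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", default=0, type=int)
args = parser.parse_args()

torch.cuda.set_device(args.local_rank)
torch.distributed.init_process_group(backend='nccl', init_method='env://')

model = torch.nn.Linear(1024, 16).cuda()
model = torch.nn.parallel.DistributedDataParallel(model,
                                                  device_ids=[args.local_rank],
                                                  output_device=args.local_rank)
```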
 # Multiprocess Example based on pytorch/examples/mnist
 main.py demonstrates how to modify a simple model to enable multiprocess distributed data parallel
 training using the module wrapper `apex.parallel.DistributedDataParallel`
 (similar to `torch.nn.parallel.DistributedDataParallel`).
 Multiprocess distributed data parallel training frequently outperforms single-process
@@ -29,27 +29,25 @@ To download the dataset, run
 ```python main.py```
 without any arguments. Once you have downloaded the dataset, you should not need to do this again.
-You can now launch multi-process distributed data parallel jobs via
+`main.py` runs multiprocess distributed data parallel jobs using the Pytorch launcher script
+[torch.distributed.launch](https://pytorch.org/docs/master/distributed.html#launch-utility).
+Jobs are launched via
 ```bash
-python -m apex.parallel.multiproc main.py args...
+python -m torch.distributed.launch --nproc_per_node=N main.py args...
 ```
-adding any `args...` you like. The launch script `apex.parallel.multiproc` will
-spawn one process for each of your system's available (visible) GPUs.
-Each process will run `python main.py args... --world-size <worldsize> --rank <rank>`
-(the `--world-size` and `--rank` arguments are determined and appended by `apex.parallel.multiproc`).
-Each `main.py` calls `torch.cuda.set_device()` and `torch.distributed.init_process_group()`
-according to the `rank` and `world-size` arguments it receives.
-The number of visible GPU devices (and therefore the number of processes
-`DistributedDataParallel` will spawn) can be controlled by setting the environment variable
-`CUDA_VISIBLE_DEVICES`. For example, if you `export CUDA_VISIBLE_DEVICES=0,1` and run
-```python -m apex.parallel.multiproc main.py ...```, the launch utility will spawn two processes
-which will run on devices 0 and 1. By default, if `CUDA_VISIBLE_DEVICES` is unset,
-`apex.parallel.multiproc` will attempt to use every device on the node.
+`torch.distributed.launch` spawns `N` processes, each of which runs as
+`python main.py args... --local_rank <rank>`.
+The `local_rank` argument for each process is determined and appended by `torch.distributed.launch`,
+and varies between 0 and `N-1`. `torch.distributed.launch` also provides environment variables
+for each process.
+Internally, each process calls `set_device` according to its local
+rank and `init_process_group` with `init_method='env://'` to ingest the provided environment
+variables.
+For best performance, set `N` equal to the number of visible CUDA devices on the node.
 ## Converting your own model
 To understand how to convert your own model, please see all sections of main.py within ```#=====START: ADDED FOR DISTRIBUTED======``` and ```#=====END: ADDED FOR DISTRIBUTED======``` flags.
 ## Requirements
-Pytorch master branch built from source. This requirement is to use NCCL as a distributed backend.
+Pytorch with NCCL available as a distributed backend. Pytorch 0.4+, installed as a pip or conda package, should have this by default. Otherwise, you can build Pytorch from source, in an environment where NCCL is installed and visible.
 from __future__ import print_function
 import argparse
+import os
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -49,20 +50,10 @@ parser.add_argument('--log-interval', type=int, default=10, metavar='N',
 Add some distributed options. For explanation of dist-url and dist-backend please see
 http://pytorch.org/tutorials/intermediate/dist_tuto.html
---world-size and --rank are required parameters as they will be used by the multiproc.py launcher
-but do not have to be set explicitly.
+--local_rank will be supplied by the Pytorch launcher wrapper (torch.distributed.launch)
 '''
-parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
-                    help='url used to set up distributed training')
-parser.add_argument('--dist-backend', default='nccl', type=str,
-                    help='distributed backend')
-parser.add_argument('--world-size', default=1, type=int,
-                    help='Number of GPUs to use. Can either be manually set ' +
-                         'or automatically set by using \'python -m multiproc\'.')
-parser.add_argument('--rank', default=0, type=int,
-                    help='Used for multi-process training. Can either be manually set ' +
-                         'or automatically set by using \'python -m multiproc\'.')
+parser.add_argument("--local_rank", default=0, type=int)
 #=====END: ADDED FOR DISTRIBUTED======
 args = parser.parse_args()
@@ -70,24 +61,23 @@ args.cuda = not args.no_cuda and torch.cuda.is_available()
 #======START: ADDED FOR DISTRIBUTED======
 '''Add a convenience flag to see if we are running distributed'''
-args.distributed = args.world_size > 1
+args.distributed = False
+if 'WORLD_SIZE' in os.environ:
+    args.distributed = int(os.environ['WORLD_SIZE']) > 1
+'''Check that we are running with cuda, as distributed is only supported for cuda.'''
 if args.distributed:
-    '''Check that we are running with cuda, as distributed is only supported for cuda.'''
     assert args.cuda, "Distributed mode requires running with CUDA."
-if args.distributed:
     '''
     Set cuda device so everything is done on the right GPU.
    THIS MUST BE DONE AS SOON AS POSSIBLE.
     '''
-    torch.cuda.set_device(args.rank % torch.cuda.device_count())
+    torch.cuda.set_device(args.local_rank)
     '''Initialize distributed communication'''
-    dist.init_process_group(args.dist_backend,
-                            init_method=args.dist_url,
-                            world_size=args.world_size,
-                            rank=args.rank)
+    torch.distributed.init_process_group(backend='nccl',
+                                         init_method='env://')
 #=====END: ADDED FOR DISTRIBUTED======
@@ -145,7 +135,7 @@ class Net(nn.Module):
         x = F.relu(self.fc1(x))
         x = F.dropout(x, training=self.training)
         x = self.fc2(x)
-        return F.log_softmax(x)
+        return F.log_softmax(x, dim=1)
 model = Net()
 if args.cuda:
@@ -174,7 +164,7 @@ def train(epoch):
         loss = F.nll_loss(output, target)
         loss.backward()
         optimizer.step()
-        if batch_idx % args.log_interval == 0:
+        if batch_idx % args.log_interval == 0 and args.local_rank == 0:
             print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                 epoch, batch_idx * len(data), len(train_loader.dataset),
                 100. * batch_idx / len(train_loader), to_python_float(loss.data)))
@@ -194,6 +184,7 @@ def test():
         correct += pred.eq(target.data.view_as(pred)).cpu().sum()
     test_loss /= len(test_loader.dataset)
     print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
         test_loss, correct, len(test_loader.dataset),
         100. * correct / len(test_loader.dataset)))
@@ -206,4 +197,5 @@ for epoch in range(1, args.epochs + 1):
     #=====END: ADDED FOR DISTRIBUTED======
     train(epoch)
-    test()
+    if args.local_rank == 0:
+        test()
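As described in the README above, the ported main.py is driven by `torch.distributed.launch`; run without the launcher it falls back to ordinary single-process training, because `--local_rank` defaults to 0 and `WORLD_SIZE` is not set in the environment, so `args.distributed` stays `False`. A usage sketch (`N` stands for the number of GPUs on the node):
```bash
# Multi-process training on one node (replace N with the number of visible GPUs):
python -m torch.distributed.launch --nproc_per_node=N main.py

# Single-process run: no launcher, so WORLD_SIZE is unset and args.distributed stays False.
python main.py
```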
@@ -3,20 +3,12 @@
 This example is based on [https://github.com/pytorch/examples/tree/master/imagenet](https://github.com/pytorch/examples/tree/master/imagenet).
 It implements training of popular model architectures, such as ResNet, AlexNet, and VGG on the ImageNet dataset.
-`main.py` and `main_fp16_optimizer.py` have been modified to use the `DistributedDataParallel` module in Apex instead of the one in upstream PyTorch. For description of how this works please see the distributed example included in this repo.
 `main.py` with the `--fp16` argument demonstrates mixed precision training with manual management of master parameters and loss scaling.
 `main_fp16_optimizer.py` with `--fp16` demonstrates use of `apex.fp16_utils.FP16_Optimizer` to automatically manage master parameters and loss scaling.
-To run multi-gpu on a single node use the command
-```python -m apex.parallel.multiproc main.py ...```
-adding any normal arguments.
 ## Requirements
-- Apex which can be installed from https://www.github.com/nvidia/apex
-- Install PyTorch from source, master branch of [pytorch on github](https://www.github.com/pytorch/pytorch).
 - `pip install -r requirements.txt`
 - Download the ImageNet dataset and move validation images to labeled subfolders
   - To do this, you can use the following script: https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh
@@ -34,7 +26,21 @@ python main.py -a alexnet --lr 0.01 /path/to/imagenet/folder
 The directory at /path/to/imagenet/directory should contain two subdirectories called "train"
 and "val" that contain the training and validation data respectively. Train images are expected to be 256x256 jpegs.
-**Example commands:** (note: batch size `--b 256` assumes your GPUs have >=16GB of onboard memory)
+## Distributed training
+`main.py` and `main_fp16_optimizer.py` have been modified to use the `DistributedDataParallel` module in Apex instead of the one in upstream PyTorch. `apex.parallel.DistributedDataParallel`
+is a drop-in replacement for `torch.nn.parallel.DistributedDataParallel` (see our [distributed example](https://github.com/NVIDIA/apex/tree/master/examples/distributed)).
+The scripts can interact with
+[torch.distributed.launch](https://pytorch.org/docs/master/distributed.html#launch-utility)
+to spawn multiprocess jobs using the following syntax:
+```
+python -m torch.distributed.launch --nproc_per_node=NUM_GPUS main.py args...
+```
+`NUM_GPUS` should be less than or equal to the number of visible GPU devices on the node.
+## Example commands
+(note: batch size `--b 256` assumes your GPUs have >=16GB of onboard memory)
 ```bash
 ### Softlink training dataset into current directory
@@ -44,69 +50,21 @@ $ ln -sf /data/imagenet/val-jpeg/ val
 ### Single-process training
 $ python main.py -a resnet50 --fp16 --b 256 --workers 4 ./
 ### Multi-process training (uses all visible GPUs on the node)
-$ python -m apex.parallel.multiproc main.py -a resnet50 --fp16 --b 256 --workers 4 ./
+$ python -m torch.distributed.launch --nproc_per_node=NUM_GPUS main.py -a resnet50 --fp16 --b 256 --workers 4 ./
 ### Multi-process training on GPUs 0 and 1 only
 $ export CUDA_VISIBLE_DEVICES=0,1
-$ python -m apex.parallel.multiproc main.py -a resnet50 --fp16 --b 256 --workers 4 ./
+$ python -m torch.distributed.launch --nproc_per_node=2 main.py -a resnet50 --fp16 --b 256 --workers 4 ./
 ### Multi-process training with FP16_Optimizer, default loss scale 1.0 (still uses FP32 master params)
-$ python -m apex.parallel.multiproc main_fp16_optimizer.py -a resnet50 --fp16 --b 256 --workers 4 ./
+$ python -m torch.distributed.launch --nproc_per_node=NUM_GPUS main_fp16_optimizer.py -a resnet50 --fp16 --b 256 --workers 4 ./
 # Multi-process training with FP16_Optimizer, static loss scale
-$ python -m apex.parallel.multiproc main_fp16_optimizer.py -a resnet50 --fp16 --b 256 --static-loss-scale 128.0 --workers 4 ./
+$ python -m torch.distributed.launch --nproc_per_node=NUM_GPUS main_fp16_optimizer.py -a resnet50 --fp16 --b 256 --static-loss-scale 128.0 --workers 4 ./
 ### Multi-process training with FP16_Optimizer, dynamic loss scaling
-$ python -m apex.parallel.multiproc main_fp16_optimizer.py -a resnet50 --fp16 --b 256 --dynamic-loss-scale --workers 4 ./
+$ python -m torch.distributed.launch --nproc_per_node=NUM_GPUS main_fp16_optimizer.py -a resnet50 --fp16 --b 256 --dynamic-loss-scale --workers 4 ./
 ```
 ## Usage for `main.py` and `main_fp16_optimizer.py`
 ```bash
-usage: main.py [-h] [--arch ARCH] [-j N] [--epochs N] [--start-epoch N] [-b N]
-               [--lr LR] [--momentum M] [--weight-decay W] [--print-freq N]
-               [--resume PATH] [-e] [--pretrained] [--fp16]
-               [--static-loss-scale STATIC_LOSS_SCALE] [--prof]
-               [--dist-url DIST_URL] [--dist-backend DIST_BACKEND]
-               [--world-size WORLD_SIZE] [--rank RANK]
-               DIR
-
-PyTorch ImageNet Training
-
-positional arguments:
-  DIR                   path to dataset
-
-optional arguments:
-  -h, --help            show this help message and exit
-  --arch ARCH, -a ARCH  model architecture: alexnet | densenet121 |
-                        densenet161 | densenet169 | densenet201 | inception_v3
-                        | resnet101 | resnet152 | resnet18 | resnet34 |
-                        resnet50 | squeezenet1_0 | squeezenet1_1 | vgg11 |
-                        vgg11_bn | vgg13 | vgg13_bn | vgg16 | vgg16_bn | vgg19
-                        | vgg19_bn (default: resnet18)
-  -j N, --workers N     number of data loading workers (default: 4)
-  --epochs N            number of total epochs to run
-  --start-epoch N       manual epoch number (useful on restarts)
-  -b N, --batch-size N  mini-batch size (default: 256)
-  --lr LR, --learning-rate LR
-                        initial learning rate
-  --momentum M          momentum
-  --weight-decay W, --wd W
-                        weight decay (default: 1e-4)
-  --print-freq N, -p N  print frequency (default: 10)
-  --resume PATH         path to latest checkpoint (default: none)
-  -e, --evaluate        evaluate model on validation set
-  --pretrained          use pre-trained model
-  --fp16                Run model fp16 mode.
-  --static-loss-scale STATIC_LOSS_SCALE
-                        Static loss scale, positive power of 2 values can improve
-                        fp16 convergence.
-  --prof                Only run 10 iterations for profiling.
-  --dist-url DIST_URL   url used to set up distributed training
-  --dist-backend DIST_BACKEND
-                        distributed backend
-  --world-size WORLD_SIZE
-                        Number of GPUs to use. Can either be manually set or
-                        automatically set by using 'python -m multiproc'.
-  --rank RANK           Used for multi-process training. Can either be
-                        manually set or automatically set by using 'python -m
-                        multiproc'.
 ```
 `main_fp16_optimizer.py` also accepts the optional flag
......
@@ -66,17 +66,7 @@ parser.add_argument('--static-loss-scale', type=float, default=1,
 parser.add_argument('--prof', dest='prof', action='store_true',
                     help='Only run 10 iterations for profiling.')
-parser.add_argument('--dist-url', default='file://sync.file', type=str,
-                    help='url used to set up distributed training')
-parser.add_argument('--dist-backend', default='nccl', type=str,
-                    help='distributed backend')
-parser.add_argument('--world-size', default=1, type=int,
-                    help='Number of GPUs to use. Can either be manually set ' +
-                         'or automatically set by using \'python -m multiproc\'.')
-parser.add_argument('--rank', default=0, type=int,
-                    help='Used for multi-process training. Can either be manually set ' +
-                         'or automatically set by using \'python -m multiproc\'.')
+parser.add_argument("--local_rank", default=0, type=int)
 cudnn.benchmark = True
@@ -102,18 +92,19 @@ args = parser.parse_args()
 def main():
     global best_prec1, args
-    args.distributed = args.world_size > 1
+    args.distributed = False
+    if 'WORLD_SIZE' in os.environ:
+        args.distributed = int(os.environ['WORLD_SIZE']) > 1
     args.gpu = 0
-    if args.distributed:
-        args.gpu = args.rank % torch.cuda.device_count()
+    args.world_size = 1
     if args.distributed:
+        args.gpu = args.local_rank % torch.cuda.device_count()
         torch.cuda.set_device(args.gpu)
-        dist.init_process_group(backend=args.dist_backend,
-                                init_method=args.dist_url,
-                                world_size=args.world_size,
-                                rank=args.rank)
+        torch.distributed.init_process_group(backend='nccl',
+                                             init_method='env://')
+        args.world_size = torch.distributed.get_world_size()
     if args.fp16:
         assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."
@@ -130,7 +121,7 @@ def main():
     if args.fp16:
         model = network_to_half(model)
     if args.distributed:
-        #shared param turns off bucketing in DDP, for lower latency runs this can improve perf
+        # shared param turns off bucketing in DDP, for lower latency runs this can improve perf
         model = DDP(model, shared_param=True)
     global model_params, master_params
@@ -215,7 +206,7 @@ def main():
         prec1 = validate(val_loader, model, criterion)
         # remember best prec@1 and save checkpoint
-        if args.rank == 0:
+        if args.local_rank == 0:
             is_best = prec1 > best_prec1
             best_prec1 = max(prec1, best_prec1)
             save_checkpoint({
@@ -328,7 +319,7 @@ def train(train_loader, model, criterion, optimizer, epoch):
         end = time.time()
         input, target = prefetcher.next()
-        if args.rank == 0 and i % args.print_freq == 0 and i > 1:
+        if args.local_rank == 0 and i % args.print_freq == 0 and i > 1:
             print('Epoch: [{0}][{1}/{2}]\t'
                   'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                   'Speed {3:.3f} ({4:.3f})\t'
@@ -387,7 +378,7 @@ def validate(val_loader, model, criterion):
         batch_time.update(time.time() - end)
         end = time.time()
-        if args.rank == 0 and i % args.print_freq == 0:
+        if args.local_rank == 0 and i % args.print_freq == 0:
             print('Test: [{0}/{1}]\t'
                   'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                   'Speed {2:.3f} ({3:.3f})\t'
......
@@ -69,17 +69,7 @@ parser.add_argument('--dynamic-loss-scale', action='store_true',
 parser.add_argument('--prof', dest='prof', action='store_true',
                     help='Only run 10 iterations for profiling.')
-parser.add_argument('--dist-url', default='file://sync.file', type=str,
-                    help='url used to set up distributed training')
-parser.add_argument('--dist-backend', default='nccl', type=str,
-                    help='distributed backend')
-parser.add_argument('--world-size', default=1, type=int,
-                    help='Number of GPUs to use. Can either be manually set ' +
-                         'or automatically set by using \'python -m multiproc\'.')
-parser.add_argument('--rank', default=0, type=int,
-                    help='Used for multi-process training. Can either be manually set ' +
-                         'or automatically set by using \'python -m multiproc\'.')
+parser.add_argument("--local_rank", default=0, type=int)
 cudnn.benchmark = True
@@ -105,18 +95,18 @@ args = parser.parse_args()
 def main():
     global best_prec1, args
-    args.distributed = args.world_size > 1
+    args.distributed = False
+    if 'WORLD_SIZE' in os.environ:
+        args.distributed = int(os.environ['WORLD_SIZE']) > 1
     args.gpu = 0
     if args.distributed:
-        args.gpu = args.rank % torch.cuda.device_count()
+        args.gpu = args.local_rank % torch.cuda.device_count()
     if args.distributed:
         torch.cuda.set_device(args.gpu)
-        dist.init_process_group(backend=args.dist_backend,
-                                init_method=args.dist_url,
-                                world_size=args.world_size,
-                                rank=args.rank)
+        torch.distributed.init_process_group(backend='nccl',
+                                             init_method='env://')
     if args.fp16:
         assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."
@@ -137,7 +127,7 @@ def main():
     if args.fp16:
         model = network_to_half(model)
     if args.distributed:
-        #shared param turns off bucketing in DDP, for lower latency runs this can improve perf
+        # shared param turns off bucketing in DDP, for lower latency runs this can improve perf
         model = DDP(model, shared_param=True)
     # define loss function (criterion) and optimizer
@@ -220,7 +210,7 @@ def main():
         prec1 = validate(val_loader, model, criterion)
         # remember best prec@1 and save checkpoint
-        if args.rank == 0:
+        if args.local_rank == 0:
             is_best = prec1 > best_prec1
             best_prec1 = max(prec1, best_prec1)
             save_checkpoint({
@@ -325,7 +315,7 @@ def train(train_loader, model, criterion, optimizer, epoch):
         end = time.time()
         input, target = prefetcher.next()
-        if args.rank == 0 and i % args.print_freq == 0 and i > 1:
+        if args.local_rank == 0 and i % args.print_freq == 0 and i > 1:
             print('Epoch: [{0}][{1}/{2}]\t'
                   'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                   'Speed {3:.3f} ({4:.3f})\t'
@@ -334,8 +324,8 @@ def train(train_loader, model, criterion, optimizer, epoch):
                   'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                   'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                       epoch, i, len(train_loader),
-                      args.world_size * args.batch_size / batch_time.val,
-                      args.world_size * args.batch_size / batch_time.avg,
+                      torch.distributed.get_world_size() * args.batch_size / batch_time.val,
+                      torch.distributed.get_world_size() * args.batch_size / batch_time.avg,
                       batch_time=batch_time,
                       data_time=data_time, loss=losses, top1=top1, top5=top5))
@@ -384,7 +374,7 @@ def validate(val_loader, model, criterion):
         batch_time.update(time.time() - end)
         end = time.time()
-        if args.rank == 0 and i % args.print_freq == 0:
+        if args.local_rank == 0 and i % args.print_freq == 0:
             print('Test: [{0}/{1}]\t'
                   'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                   'Speed {2:.3f} ({3:.3f})\t'
@@ -392,8 +382,8 @@ def validate(val_loader, model, criterion):
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                  'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                      i, len(val_loader),
-                     args.world_size * args.batch_size / batch_time.val,
-                     args.world_size * args.batch_size / batch_time.avg,
+                     torch.distributed.get_world_size() * args.batch_size / batch_time.val,
+                     torch.distributed.get_world_size() * args.batch_size / batch_time.avg,
                      batch_time=batch_time, loss=losses,
                      top1=top1, top5=top5))
@@ -455,7 +445,7 @@ def accuracy(output, target, topk=(1,)):
 def reduce_tensor(tensor):
     rt = tensor.clone()
     dist.all_reduce(rt, op=dist.reduce_op.SUM)
-    rt /= args.world_size
+    rt /= torch.distributed.get_world_size()
     return rt
 if __name__ == '__main__':
......
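The `reduce_tensor` helper above averages a tensor across all ranks, so that the printing process (local rank 0) reports a value aggregated over the whole job rather than its own shard. A minimal standalone illustration of its behavior, assuming the process group has already been initialized with the NCCL backend as in the scripts in this commit:
```python
# Sketch: every rank contributes a local value; afterwards every rank holds the mean.
import torch
import torch.distributed as dist

def reduce_tensor(tensor):
    # Same definition as in main_fp16_optimizer.py above.
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.reduce_op.SUM)
    rt /= torch.distributed.get_world_size()
    return rt

local_value = torch.cuda.FloatTensor([dist.get_rank()])
mean_value = reduce_tensor(local_value)
print("rank {}: mean = {:.3f}".format(dist.get_rank(), mean_value.item()))
```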