Unverified Commit bfd88cad authored by Benjamin Lefaudeux, committed by GitHub
Browse files

[minor] OSS: bring DDP in the benchmark (#130)

More realistic benchmarks, comparing apples to apples. DDP/OSS+DDP/OSS+SDP
parent 81ac5b28
...@@ -100,7 +100,7 @@ run_oss_benchmark: &run_oss_benchmark ...@@ -100,7 +100,7 @@ run_oss_benchmark: &run_oss_benchmark
- run: - run:
name: Run OSS Benchmark name: Run OSS Benchmark
command: | command: |
python benchmarks/oss.py --check_regression --world_size 4 --reference_speed 21.2 --reference_memory 4220 --reference_loss 0.63 python benchmarks/oss.py --check_regression --world_size 4 --reference_speed 13.7 --reference_memory 4390 --reference_loss 0.595
run_oss_gloo: &run_oss_gloo run_oss_gloo: &run_oss_gloo
- run: - run:
......
...@@ -12,6 +12,7 @@ import torch ...@@ -12,6 +12,7 @@ import torch
import torch.distributed as dist import torch.distributed as dist
import torch.multiprocessing as mp import torch.multiprocessing as mp
import torch.nn as nn import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader from torch.utils.data import DataLoader
from torchvision.datasets import FakeData from torchvision.datasets import FakeData
from torchvision.models import resnet101 from torchvision.models import resnet101
...@@ -40,7 +41,9 @@ def get_problem(rank, data_size, batch_size): ...@@ -40,7 +41,9 @@ def get_problem(rank, data_size, batch_size):
} }
dataloader = DataLoader( dataloader = DataLoader(
dataset=FakeData(transform=ToTensor(), size=data_size), batch_size=batch_size, collate_fn=collate dataset=FakeData(transform=ToTensor(), size=data_size, random_offset=rank),
batch_size=batch_size,
collate_fn=collate,
) )
loss_fn = nn.CrossEntropyLoss() loss_fn = nn.CrossEntropyLoss()
return model, dataloader, loss_fn return model, dataloader, loss_fn
...@@ -85,12 +88,13 @@ def train( ...@@ -85,12 +88,13 @@ def train(
optimizer=OPTIM, optimizer=OPTIM,
optimizer_params={"lr": 1e-4, "momentum": 0.9}, optimizer_params={"lr": 1e-4, "momentum": 0.9},
world_size=world_size, world_size=world_size,
broadcast_buffers=False, broadcast_buffers=True,
) )
ddp.train() ddp.train()
optimizer = ddp.optimizer optimizer = ddp.optimizer
model = ddp model = ddp
else: else:
model = DDP(model, device_ids=[rank], find_unused_parameters=True) # type: ignore
optimizer = ( optimizer = (
OSS(params=model.parameters(), optim=OPTIM, lr=1e-4, momentum=0.9) OSS(params=model.parameters(), optim=OPTIM, lr=1e-4, momentum=0.9)
if use_oss if use_oss
...@@ -216,7 +220,7 @@ if __name__ == "__main__": ...@@ -216,7 +220,7 @@ if __name__ == "__main__":
) )
if args.optim_type == OptimType.oss or args.optim_type == OptimType.everyone: if args.optim_type == OptimType.oss or args.optim_type == OptimType.everyone:
print("\nBenchmark OSS") print("\nBenchmark OSS with DDP")
mp.spawn( mp.spawn(
train, train,
args=( args=(
...@@ -237,7 +241,7 @@ if __name__ == "__main__": ...@@ -237,7 +241,7 @@ if __name__ == "__main__":
) )
if args.optim_type == OptimType.oss_sdp or args.optim_type == OptimType.everyone: if args.optim_type == OptimType.oss_sdp or args.optim_type == OptimType.everyone:
print("\nBenchmark OSS DDP") print("\nBenchmark OSS with SDP")
mp.spawn( mp.spawn(
train, train,
args=( args=(
...@@ -248,7 +252,7 @@ if __name__ == "__main__": ...@@ -248,7 +252,7 @@ if __name__ == "__main__":
backend, backend,
True, # OSS True, # OSS
True, # SDP True, # SDP
args.check_regression, False, # FIXME: @lefaudeux - SDP should give the same results
-1, # Not checking SDP for speed regression for now, still slower than OSS -1, # Not checking SDP for speed regression for now, still slower than OSS
args.reference_memory, args.reference_memory,
args.reference_loss, args.reference_loss,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment