Unverified commit 10062e58, authored by Benjamin Lefaudeux, committed by GitHub

[feat][minor] OSS: benchmark - adding a cpu option (#144)

* adding a cpu option
* adjust the reference loss
parent 61234360
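
In short, the patch routes device placement through a single torch.device and guards the CUDA-only bookkeeping so the same benchmark script can run on CPU. A minimal, self-contained sketch of that pattern is below; the helper names are illustrative and are not part of benchmarks/oss.py.

import torch

def select_device(rank: int, use_cpu: bool) -> torch.device:
    # Mirror the commit's device selection: plain CPU when requested,
    # otherwise the CUDA device matching this process rank.
    return torch.device("cpu") if use_cpu else torch.device(rank)

def reset_perf_counters(rank: int, use_cpu: bool) -> None:
    # CUDA memory statistics and synchronization are only meaningful on GPU runs,
    # so the CPU path skips them entirely, as the diff below does.
    if not use_cpu:
        torch.cuda.reset_peak_memory_stats(rank)
        torch.cuda.synchronize(rank)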
@@ -108,7 +108,7 @@ run_oss_benchmark: &run_oss_benchmark
   - run:
       name: Run OSS Benchmark
       command: |
-        python benchmarks/oss.py --check_regression --world_size 4 --reference_speed 13.7 --reference_memory 4390 --reference_loss 0.152
+        python benchmarks/oss.py --check_regression --world_size 4 --reference_speed 13.7 --reference_memory 4390 --reference_loss 0.302

 run_oss_gloo: &run_oss_gloo
   - run:
@@ -30,15 +30,15 @@ def dist_init(rank, world_size, backend):
     dist.init_process_group(backend=backend, init_method="tcp://localhost:29501", rank=rank, world_size=world_size)


-def get_problem(rank, data_size, batch_size):
+def get_problem(rank, data_size, batch_size, device):
     # Standard RN101
-    model = resnet101(pretrained=False, progress=True).to(rank)
+    model = resnet101(pretrained=False, progress=True).to(device)

     # Data setup, dummy data
     def collate(inputs: List[Any]):
         return {
-            "inputs": torch.stack([i[0] for i in inputs]).to(torch.device(rank)),
-            "label": torch.stack([i[1] for i in inputs]).to(torch.device(rank)),
+            "inputs": torch.stack([i[0] for i in inputs]).to(device),
+            "label": torch.stack([i[1] for i in inputs]).to(device),
         }

     dataloader = DataLoader(
@@ -59,20 +59,13 @@ class OptimType(str, Enum):
 def train(
     rank: int,
-    world_size: int,
-    num_epochs: int = 10,
-    batch_size: int = 32,
-    data_size: int = 200,
+    args: argparse.Namespace,
     backend: str = "gloo",
     optim_type: OptimType = OptimType.vanilla,
-    profile: bool = False,
     check_regression: bool = True,
-    reference_speed: float = -1.0,
-    reference_memory: float = -1.0,
-    reference_loss: float = -1.0,
 ):
     # DDP
-    dist_init(rank=rank, world_size=world_size, backend=backend)
+    dist_init(rank=rank, world_size=args.world_size, backend=backend)

     # Setup
     torch.cuda.set_device(rank)
@@ -84,7 +77,8 @@ def train(
     torch.backends.cudnn.deterministic = True
     torch.backends.cudnn.benchmark = False

-    model, dataloader, loss_fn = get_problem(rank, data_size, batch_size)
+    device = torch.device("cpu") if args.cpu else torch.device(rank)
+    model, dataloader, loss_fn = get_problem(rank, args.data_size, args.batch_size, device)

     # Shard the optimizer
     optimizer: Optional[torch.optim.Optimizer] = None
...@@ -94,7 +88,7 @@ def train( ...@@ -94,7 +88,7 @@ def train(
module=model, module=model,
optimizer=OPTIM, optimizer=OPTIM,
optimizer_params={"lr": 1e-4, "momentum": 0.9}, optimizer_params={"lr": 1e-4, "momentum": 0.9},
world_size=world_size, world_size=args.world_size,
broadcast_buffers=True, broadcast_buffers=True,
) )
ddp.train() ddp.train()
@@ -109,18 +103,19 @@ def train(
         )

     # Reset the memory use counter
-    torch.cuda.reset_peak_memory_stats(rank)
+    if not args.cpu:
+        torch.cuda.reset_peak_memory_stats(rank)
+        torch.cuda.synchronize(rank)

     # Dummy training loop
-    torch.cuda.synchronize(rank)
     training_start = time.monotonic()
     model.train()

     measurements = []
     final_loss: Optional[float] = -1.0
-    need_profiling = profile
+    need_profiling = args.profile

-    for epoch in range(num_epochs):
+    for epoch in range(args.epochs):
         epoch_start = time.monotonic()

         for batch in dataloader:
@@ -129,7 +124,6 @@ def train(
             model.zero_grad()
             outputs = model(batch["inputs"])
             loss = loss_fn(outputs, batch["label"])
-            loss /= world_size
             loss.backward()

             if optim_type == OptimType.oss_sdp:
@@ -137,7 +131,7 @@ def train(
                 return loss

-            if need_profiling:
+            if need_profiling and not args.cpu:
                 print("Profiling the run")
                 with profiler.profile(use_cuda=True) as prof:  # type: ignore
                     with profiler.record_function("batch"):
@@ -163,13 +157,14 @@ def train(
                 _ = optimizer.state_dict()
                 print("... State dict collected")

-        measurements.append(data_size / (epoch_end - epoch_start))
+        measurements.append(args.data_size / (epoch_end - epoch_start))
         if dist.get_rank() == 0:
             print(f"Epoch {epoch} - processed {measurements[-1]:.2f} img per sec. Loss {final_loss:.3f}")

-    torch.cuda.synchronize(rank)
+    if not args.cpu:
+        torch.cuda.synchronize(rank)
     training_stop = time.monotonic()
-    img_per_sec = data_size / (training_stop - training_start) * num_epochs
+    img_per_sec = args.data_size / (training_stop - training_start) * args.epochs
     max_memory = torch.cuda.max_memory_allocated(rank) / 2 ** 20

     print(f"[{dist.get_rank()}] : Training done. {img_per_sec:.2f} img per sec overall")
@@ -182,9 +177,9 @@ def train(
     print(f"[{dist.get_rank()}] : Mean speed: {mean:.2f} +/- {std:.2f}")

     if check_regression and dist.get_rank() == 0:
-        assert (mean + 3.0 * std) > reference_speed, "Speed regression detected"
-        assert max_memory < 1.05 * reference_memory, "Memory use regression detected"
-        assert abs(cast(float, final_loss) - reference_loss) < 1e-3, "Loss regression detected"
+        assert (mean + 3.0 * std) > args.reference_speed, "Speed regression detected"
+        assert max_memory < 1.05 * args.reference_memory, "Memory use regression detected"
+        assert abs(cast(float, final_loss) - args.reference_loss) < 1e-3, "Loss regression detected"

         print("[Regression Test] VALID")
@@ -208,26 +203,18 @@ if __name__ == "__main__":
     )
     parser.add_argument("--gloo", action="store_true", default=False)
     parser.add_argument("--profile", action="store_true", default=False)
+    parser.add_argument("--cpu", action="store_true", default=False)

     args = parser.parse_args()
     print(f"Benchmark arguments: {args}")

-    backend = "nccl" if not args.gloo or not torch.cuda.is_available() else "gloo"
+    backend = "nccl" if (not args.gloo or not torch.cuda.is_available()) and not args.cpu else "gloo"

     if args.optim_type == OptimType.vanilla or args.optim_type == OptimType.everyone:
         print("\nBenchmark vanilla optimizer")
         mp.spawn(
             train,
-            args=(
-                args.world_size,
-                args.epochs,
-                args.batch_size,
-                args.data_size,
-                backend,
-                OptimType.vanilla,
-                args.profile,
-                False,  # no regression check
-            ),
+            args=(args, backend, OptimType.vanilla, False,),  # no regression check
             nprocs=args.world_size,
             join=True,
         )
@@ -235,41 +222,14 @@ if __name__ == "__main__":
     if args.optim_type == OptimType.oss or args.optim_type == OptimType.everyone:
         print("\nBenchmark OSS with DDP")
         mp.spawn(
-            train,
-            args=(
-                args.world_size,
-                args.epochs,
-                args.batch_size,
-                args.data_size,
-                backend,
-                OptimType.oss,
-                args.profile,
-                args.check_regression,
-                args.reference_speed,
-                args.reference_memory,
-                args.reference_loss,
-            ),
-            nprocs=args.world_size,
-            join=True,
+            train, args=(args, backend, OptimType.oss, args.check_regression), nprocs=args.world_size, join=True,
         )

     if args.optim_type == OptimType.oss_sdp or args.optim_type == OptimType.everyone:
         print("\nBenchmark OSS with SDP")
         mp.spawn(
             train,
-            args=(
-                args.world_size,
-                args.epochs,
-                args.batch_size,
-                args.data_size,
-                backend,
-                OptimType.oss_sdp,
-                args.profile,
-                False,  # FIXME: @lefaudeux - SDP should give the same results
-                -1,  # Not checking SDP for speed regression for now, still slower than OSS
-                args.reference_memory,
-                args.reference_loss,
-            ),
+            args=(args, backend, OptimType.oss_sdp, False,),  # FIXME: @lefaudeux - SDP should give the same results
             nprocs=args.world_size,
             join=True,
         )
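
The other change running through the last hunks is that mp.spawn now forwards the parsed argparse.Namespace itself instead of a long positional tuple, so a new flag such as --cpu no longer requires editing every spawn call site. A minimal, runnable sketch of that pattern follows; the worker function and the reduced flag set here are illustrative, not the benchmark's full CLI.

import argparse
import torch.multiprocessing as mp

def worker(rank: int, args: argparse.Namespace, backend: str) -> None:
    # mp.spawn passes the process rank first, then the entries of `args`.
    # Handing over the whole Namespace keeps this signature stable as flags are added.
    print(f"rank {rank}: cpu={args.cpu}, world_size={args.world_size}, backend={backend}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--cpu", action="store_true", default=False)
    parser.add_argument("--world_size", type=int, default=2)
    args = parser.parse_args()
    # Spawn one process per rank; each receives (rank, args, "gloo").
    mp.spawn(worker, args=(args, "gloo"), nprocs=args.world_size, join=True)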