Unverified commit 10062e58, authored by Benjamin Lefaudeux, committed by GitHub

[feat][minor] OSS: benchmark - adding a cpu option (#144)

* adding a cpu option
* adjust the reference loss
parent 61234360
@@ -108,7 +108,7 @@ run_oss_benchmark: &run_oss_benchmark
   - run:
       name: Run OSS Benchmark
       command: |
-        python benchmarks/oss.py --check_regression --world_size 4 --reference_speed 13.7 --reference_memory 4390 --reference_loss 0.152
+        python benchmarks/oss.py --check_regression --world_size 4 --reference_speed 13.7 --reference_memory 4390 --reference_loss 0.302
 run_oss_gloo: &run_oss_gloo
   - run:
@@ -30,15 +30,15 @@ def dist_init(rank, world_size, backend):
     dist.init_process_group(backend=backend, init_method="tcp://localhost:29501", rank=rank, world_size=world_size)
-def get_problem(rank, data_size, batch_size):
+def get_problem(rank, data_size, batch_size, device):
     # Standard RN101
-    model = resnet101(pretrained=False, progress=True).to(rank)
+    model = resnet101(pretrained=False, progress=True).to(device)
     # Data setup, dummy data
     def collate(inputs: List[Any]):
         return {
-            "inputs": torch.stack([i[0] for i in inputs]).to(torch.device(rank)),
-            "label": torch.stack([i[1] for i in inputs]).to(torch.device(rank)),
+            "inputs": torch.stack([i[0] for i in inputs]).to(device),
+            "label": torch.stack([i[1] for i in inputs]).to(device),
         }
     dataloader = DataLoader(
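The hunk above threads an explicit torch.device through get_problem instead of relying on the rank-to-GPU mapping, so the same helper can place the model and every batch on either CPU or GPU. A minimal sketch of that pattern, independent of the benchmark (the helper name and the dummy batch are illustrative, not part of the commit):

import torch
from torchvision.models import resnet101

def build_on_device(device: torch.device):
    # Model and data are moved to the same explicit device, which can be
    # torch.device("cpu") or a CUDA device such as torch.device(0).
    model = resnet101(pretrained=False, progress=True).to(device)
    dummy_inputs = torch.randn(2, 3, 224, 224, device=device)
    return model, dummy_inputs

model, inputs = build_on_device(torch.device("cpu"))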
@@ -59,20 +59,13 @@ class OptimType(str, Enum):
 def train(
     rank: int,
-    world_size: int,
-    num_epochs: int = 10,
-    batch_size: int = 32,
-    data_size: int = 200,
+    args: argparse.Namespace,
     backend: str = "gloo",
     optim_type: OptimType = OptimType.vanilla,
-    profile: bool = False,
     check_regression: bool = True,
-    reference_speed: float = -1.0,
-    reference_memory: float = -1.0,
-    reference_loss: float = -1.0,
 ):
     # DDP
-    dist_init(rank=rank, world_size=world_size, backend=backend)
+    dist_init(rank=rank, world_size=args.world_size, backend=backend)
     # Setup
     torch.cuda.set_device(rank)
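The new signature collapses the long list of benchmark parameters into a single argparse.Namespace. torch.multiprocessing.spawn prepends the worker rank to whatever tuple is passed as args, so the worker reads every flag from one object. A small self-contained sketch of that pattern (the worker name and flags are illustrative):

import argparse
import torch.multiprocessing as mp

def worker(rank: int, args: argparse.Namespace) -> None:
    # mp.spawn calls worker(rank, *args_tuple); all remaining settings come
    # from the shared namespace instead of positional parameters.
    print(f"[rank {rank}] epochs={args.epochs} cpu={args.cpu}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--epochs", type=int, default=2)
    parser.add_argument("--cpu", action="store_true", default=False)
    args = parser.parse_args()
    mp.spawn(worker, args=(args,), nprocs=2, join=True)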
@@ -84,7 +77,8 @@ def train(
     torch.backends.cudnn.deterministic = True
     torch.backends.cudnn.benchmark = False
-    model, dataloader, loss_fn = get_problem(rank, data_size, batch_size)
+    device = torch.device("cpu") if args.cpu else torch.device(rank)
+    model, dataloader, loss_fn = get_problem(rank, args.data_size, args.batch_size, device)
     # Shard the optimizer
     optimizer: Optional[torch.optim.Optimizer] = None
@@ -94,7 +88,7 @@ def train(
             module=model,
             optimizer=OPTIM,
             optimizer_params={"lr": 1e-4, "momentum": 0.9},
-            world_size=world_size,
+            world_size=args.world_size,
             broadcast_buffers=True,
         )
         ddp.train()
@@ -109,18 +103,19 @@ def train(
         )
     # Reset the memory use counter
-    torch.cuda.reset_peak_memory_stats(rank)
+    if not args.cpu:
+        torch.cuda.reset_peak_memory_stats(rank)
+        torch.cuda.synchronize(rank)
     # Dummy training loop
-    torch.cuda.synchronize(rank)
     training_start = time.monotonic()
     model.train()
     measurements = []
     final_loss: Optional[float] = -1.0
-    need_profiling = profile
+    need_profiling = args.profile
-    for epoch in range(num_epochs):
+    for epoch in range(args.epochs):
         epoch_start = time.monotonic()
         for batch in dataloader:
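The hunk above gates the torch.cuda bookkeeping calls so a CPU run never touches the CUDA runtime; the synchronize call is also what makes the wall-clock timings meaningful on GPU, since kernels run asynchronously. A minimal sketch of the same guard, with an illustrative helper name:

import time
import torch

def start_timing(device: torch.device) -> float:
    # CUDA work is asynchronous, so the GPU must be drained before reading the
    # clock; on CPU there is nothing to reset or wait for.
    if device.type == "cuda":
        torch.cuda.reset_peak_memory_stats(device)
        torch.cuda.synchronize(device)
    return time.monotonic()

start = start_timing(torch.device("cpu"))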
@@ -129,7 +124,6 @@ def train(
                 model.zero_grad()
                 outputs = model(batch["inputs"])
                 loss = loss_fn(outputs, batch["label"])
-                loss /= world_size
                 loss.backward()
                 if optim_type == OptimType.oss_sdp:
@@ -137,7 +131,7 @@ def train(
                 return loss
-            if need_profiling:
+            if need_profiling and not args.cpu:
                 print("Profiling the run")
                 with profiler.profile(use_cuda=True) as prof:  # type: ignore
                     with profiler.record_function("batch"):
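Profiling with use_cuda=True records CUDA kernel timings and only makes sense on a GPU run, which is why the guard above also checks args.cpu. For reference, a self-contained snippet in the same style, kept CPU-only so it runs anywhere (the tiny model and sort key are illustrative; torch.autograd.profiler is the legacy profiler API this benchmark uses):

import torch
from torch.autograd import profiler

model = torch.nn.Linear(16, 16)
inputs = torch.randn(4, 16)

# use_cuda=False keeps the example runnable on CPU-only machines.
with profiler.profile(use_cuda=False) as prof:
    with profiler.record_function("batch"):
        model(inputs).sum().backward()
print(prof.key_averages().table(sort_by="cpu_time_total"))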
@@ -163,13 +157,14 @@ def train(
                     _ = optimizer.state_dict()
                     print("... State dict collected")
-        measurements.append(data_size / (epoch_end - epoch_start))
+        measurements.append(args.data_size / (epoch_end - epoch_start))
         if dist.get_rank() == 0:
             print(f"Epoch {epoch} - processed {measurements[-1]:.2f} img per sec. Loss {final_loss:.3f}")
-    torch.cuda.synchronize(rank)
+    if not args.cpu:
+        torch.cuda.synchronize(rank)
     training_stop = time.monotonic()
-    img_per_sec = data_size / (training_stop - training_start) * num_epochs
+    img_per_sec = args.data_size / (training_stop - training_start) * args.epochs
     max_memory = torch.cuda.max_memory_allocated(rank) / 2 ** 20
     print(f"[{dist.get_rank()}] : Training done. {img_per_sec:.2f} img per sec overall")
@@ -182,9 +177,9 @@ def train(
         print(f"[{dist.get_rank()}] : Mean speed: {mean:.2f} +/- {std:.2f}")
     if check_regression and dist.get_rank() == 0:
-        assert (mean + 3.0 * std) > reference_speed, "Speed regression detected"
-        assert max_memory < 1.05 * reference_memory, "Memory use regression detected"
-        assert abs(cast(float, final_loss) - reference_loss) < 1e-3, "Loss regression detected"
+        assert (mean + 3.0 * std) > args.reference_speed, "Speed regression detected"
+        assert max_memory < 1.05 * args.reference_memory, "Memory use regression detected"
+        assert abs(cast(float, final_loss) - args.reference_loss) < 1e-3, "Loss regression detected"
         print("[Regression Test] VALID")
@@ -208,26 +203,18 @@ if __name__ == "__main__":
     )
     parser.add_argument("--gloo", action="store_true", default=False)
     parser.add_argument("--profile", action="store_true", default=False)
+    parser.add_argument("--cpu", action="store_true", default=False)
     args = parser.parse_args()
     print(f"Benchmark arguments: {args}")
-    backend = "nccl" if not args.gloo or not torch.cuda.is_available() else "gloo"
+    backend = "nccl" if (not args.gloo or not torch.cuda.is_available()) and not args.cpu else "gloo"
     if args.optim_type == OptimType.vanilla or args.optim_type == OptimType.everyone:
         print("\nBenchmark vanilla optimizer")
         mp.spawn(
             train,
-            args=(
-                args.world_size,
-                args.epochs,
-                args.batch_size,
-                args.data_size,
-                backend,
-                OptimType.vanilla,
-                args.profile,
-                False,  # no regression check
-            ),
+            args=(args, backend, OptimType.vanilla, False,),  # no regression check
             nprocs=args.world_size,
             join=True,
         )
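With the new flag, any CPU run has to use the Gloo backend, since NCCL only supports CUDA tensors. The intent of the backend line above can be written as a small helper; this is a hedged sketch that simplifies the one-line conditional in the diff rather than reproducing it verbatim:

import torch

def pick_backend(want_gloo: bool, want_cpu: bool) -> str:
    # NCCL requires CUDA; Gloo works on both CPU and GPU, so a CPU run, an
    # explicit Gloo request, or a machine without CUDA all fall back to Gloo.
    if want_cpu or want_gloo or not torch.cuda.is_available():
        return "gloo"
    return "nccl"

backend = pick_backend(want_gloo=False, want_cpu=True)  # -> "gloo"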
@@ -235,41 +222,14 @@ if __name__ == "__main__":
     if args.optim_type == OptimType.oss or args.optim_type == OptimType.everyone:
         print("\nBenchmark OSS with DDP")
         mp.spawn(
-            train,
-            args=(
-                args.world_size,
-                args.epochs,
-                args.batch_size,
-                args.data_size,
-                backend,
-                OptimType.oss,
-                args.profile,
-                args.check_regression,
-                args.reference_speed,
-                args.reference_memory,
-                args.reference_loss,
-            ),
-            nprocs=args.world_size,
-            join=True,
+            train, args=(args, backend, OptimType.oss, args.check_regression), nprocs=args.world_size, join=True,
         )
     if args.optim_type == OptimType.oss_sdp or args.optim_type == OptimType.everyone:
         print("\nBenchmark OSS with SDP")
         mp.spawn(
             train,
-            args=(
-                args.world_size,
-                args.epochs,
-                args.batch_size,
-                args.data_size,
-                backend,
-                OptimType.oss_sdp,
-                args.profile,
-                False,  # FIXME: @lefaudeux - SDP should give the same results
-                -1,  # Not checking SDP for speed regression for now, still slower than OSS
-                args.reference_memory,
-                args.reference_loss,
-            ),
+            args=(args, backend, OptimType.oss_sdp, False,),  # FIXME: @lefaudeux - SDP should give the same results
             nprocs=args.world_size,
             join=True,
         )