Unverified Commit 168c9baa authored by anj-s's avatar anj-s Committed by GitHub
Browse files

[offload] Add golden data for offload benchmarks. (#578)

* add model

* add offload regression benchmarks

* add golden data

* remove mp pipe benchmark

* fix lint

* remove rank

* add check for model type

* lint errors
parent e41452e8
......@@ -166,6 +166,12 @@ upload_coverage: &upload_coverage
file: 'coverage.xml'
token: $CODECOV_TOKEN
run_offload_benchmark: &run_offload_benchmark
- run:
name: Run Offload Benchmark
command: |
python benchmarks/experimental/offload.py
run_pipe_benchmark: &run_pipe_benchmark
- run:
name: Run Pipe Benchmark
......@@ -544,6 +550,8 @@ jobs:
- <<: *run_pipe_benchmark
- <<: *run_offload_benchmark
- <<: *run_oss_amp
- <<: *run_oss_for_each
......
......@@ -120,7 +120,7 @@ def train_seq(model_config, benchmark_config, model_specs, args):
optimizer = model_config["optimizer"](model.parameters(), lr=benchmark_config["lr"])
dataloader, _, _ = model_config["data"]
def train_epoch(args):
def train_epoch(args, num_iters):
model.train()
for batch_inputs, batch_outputs in dataloader:
batch_inputs, batch_outputs = batch_inputs.to("cuda"), batch_outputs.to("cuda")
......@@ -143,10 +143,13 @@ def train_seq(model_config, benchmark_config, model_specs, args):
loss.item(), benchmark_config["batch_size"] / (time.time_ns() - start) * 10 ** 9
)
)
num_iters -= 1
if num_iters == 0:
break
if args.use_profiler:
prof.export_chrome_trace("/tmp/offload_prof")
train_epoch(args)
train_epoch(args, num_iters=5)
def train(model_config, model, benchmark_config, model_specs, args):
......@@ -179,6 +182,10 @@ def train(model_config, model, benchmark_config, model_specs, args):
return data, target
for i, batch in enumerate(lm_dataloader):
# TODO(anj): Make this a flag for both "lm" and "seq" models.
if i == 5:
break
if i == 1:
epoch_start_time = time.time()
......@@ -226,24 +233,20 @@ def train(model_config, model, benchmark_config, model_specs, args):
return wps, loss.item()
def verify_peak_memory(rank, golden_config, std_dev):
print("Peak allocated bytes on cuda:0: {:1d}".format(torch.cuda.memory_stats(rank)["allocated_bytes.all.peak"]))
current_device_usage = torch.cuda.memory_stats(rank)["allocated_bytes.all.peak"]
golden_ref = golden_config["peak_mem_usage"][rank]
def verify_peak_memory(golden_config, std_dev):
    """Verify that peak CUDA memory usage on device 0 is within the golden bound.

    Args:
        golden_config: Dict of golden benchmark data; only the
            "peak_mem_usage" key (reference peak allocated bytes) is read.
        std_dev: Multiplicative tolerance applied to the golden reference
            value before comparing.

    Raises:
        RuntimeError: If the observed peak allocation is not strictly below
            ``golden_ref * std_dev``.
    """
    # Query the allocator stats once and reuse the value for both the log
    # line and the comparison (the original queried torch.cuda twice).
    current_device_usage = torch.cuda.memory_stats(0)["allocated_bytes.all.peak"]
    print("Peak allocated bytes on cuda:0: {:1d}".format(current_device_usage))
    golden_ref = golden_config["peak_mem_usage"]
    if not current_device_usage < golden_ref * std_dev:
        # Bug fix: the raise fires when usage EXCEEDS the tolerated bound, so
        # the message must say "higher than", not "less than"; also add the
        # missing space between the two concatenated string literals
        # (previously rendered as "whichis").
        raise RuntimeError(
            "Peak memory usage for cuda device {:d} is {:d} which "
            "is higher than golden reference value of {:d}".format(0, current_device_usage, golden_ref)
        )
def verify_lm_run(wps, golden_config, args):
def verify_lm_throughput(wps, golden_config, args):
"""Verify that words per second for a given benchmark run matches the golden data."""
# Verify wps only on the last rank in multiprocess pipe
if not args.multiprocess or dist.get_rank() == dist.get_world_size() - 1:
# Assert that words per second is within 3 standard deviations of the average
# of five golden runs
print("Throughput(wps) is {:.2f}.".format(wps))
if not wps > (golden_config["avg_wps"] - (3 * golden_config["std_dev_wps"])):
raise RuntimeError(
......@@ -253,12 +256,6 @@ def verify_lm_run(wps, golden_config, args):
)
)
if args.multiprocess:
verify_peak_memory(dist.get_rank(), golden_config, 1.5)
else:
for i in range(4):
verify_peak_memory(i, golden_config, 1.1)
def benchmark_language_model(model_config, model, benchmark_config, model_specs, args):
epoch = benchmark_config["epochs"]
......@@ -271,9 +268,14 @@ def benchmark_language_model(model_config, model, benchmark_config, model_specs,
print("-" * 110)
print("| end of epoch {:1d} | time: {:5.2f}s | train loss {:5.2f} ".format(epoch, elapsed_time, loss))
print("-" * 110)
print("Throughput(wps) is {:.2f}.".format(wps))
print("Peak allocated bytes on cuda:0: {:1d}".format(torch.cuda.memory_stats(0)["allocated_bytes.all.peak"]))
# TODO(anj-s): Enable golden config data verification.
if args.model_name == "seq":
raise RuntimeError(
f"Golden data verification is only supported for the Transformer(lm) model and not {args.model_name}"
)
golden_config = get_golden_config(args.model_name, args)
verify_lm_throughput(wps, golden_config, args)
verify_peak_memory(golden_config, 1.1)
def get_synthetic_dataloaders(args, device, benchmark_config, model_specs):
......@@ -357,7 +359,7 @@ def get_golden_config(model_name, args):
"""Return a dict with the golden data for throughput and memory usage."""
if model_name == "lm":
return lm_wikitext2.get_golden_real_stats(False)
return lm_wikitext2.get_golden_real_stats()
else:
raise RuntimeError(f"Unrecognized args.model_mame {args.model_name}")
......@@ -403,7 +405,7 @@ def run_benchmark(args):
parser = argparse.ArgumentParser(description="benchmark")
parser.add_argument(
"--dry_run", default=True, action="store_true", help="Run a sample training run without regression testing."
"--dry_run", default=False, action="store_true", help="Run a sample training run without regression testing."
)
parser.add_argument(
"--debug",
......
......@@ -32,6 +32,13 @@ class Offload_Transformer:
"slices": 3,
}
def get_golden_real_stats():
    """Return golden reference data for the offload Transformer benchmark.

    The dict holds the mean and standard deviation of words-per-second over
    the golden runs, plus the reference peak CUDA memory usage in bytes.
    """
    golden_stats = dict(
        avg_wps=192.105,
        std_dev_wps=39.56,
        peak_mem_usage=1180848128,
    )
    return golden_stats
class Offload_Sequential:
def get_model_config():
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment