Unverified Commit 168c9baa authored by anj-s's avatar anj-s Committed by GitHub
Browse files

[offload] Add golden data for offload benchmarks. (#578)

* add model

* add offload regression benchmarks

* add golden data

* remove mp pipe benchmark

* fix lint

* remove rank

* add check for model type

* lint errors
parent e41452e8
......@@ -166,6 +166,12 @@ upload_coverage: &upload_coverage
file: 'coverage.xml'
token: $CODECOV_TOKEN
run_offload_benchmark: &run_offload_benchmark
- run:
name: Run Offload Benchmark
command: |
python benchmarks/experimental/offload.py
run_pipe_benchmark: &run_pipe_benchmark
- run:
name: Run Pipe Benchmark
......@@ -544,6 +550,8 @@ jobs:
- <<: *run_pipe_benchmark
- <<: *run_offload_benchmark
- <<: *run_oss_amp
- <<: *run_oss_for_each
......
......@@ -120,7 +120,7 @@ def train_seq(model_config, benchmark_config, model_specs, args):
optimizer = model_config["optimizer"](model.parameters(), lr=benchmark_config["lr"])
dataloader, _, _ = model_config["data"]
def train_epoch(args):
def train_epoch(args, num_iters):
model.train()
for batch_inputs, batch_outputs in dataloader:
batch_inputs, batch_outputs = batch_inputs.to("cuda"), batch_outputs.to("cuda")
......@@ -143,10 +143,13 @@ def train_seq(model_config, benchmark_config, model_specs, args):
loss.item(), benchmark_config["batch_size"] / (time.time_ns() - start) * 10 ** 9
)
)
num_iters -= 1
if num_iters == 0:
break
if args.use_profiler:
prof.export_chrome_trace("/tmp/offload_prof")
train_epoch(args)
train_epoch(args, num_iters=5)
def train(model_config, model, benchmark_config, model_specs, args):
......@@ -179,6 +182,10 @@ def train(model_config, model, benchmark_config, model_specs, args):
return data, target
for i, batch in enumerate(lm_dataloader):
# TODO(anj): Make this a flag for both "lm" and "seq" models.
if i == 5:
break
if i == 1:
epoch_start_time = time.time()
......@@ -226,24 +233,20 @@ def train(model_config, model, benchmark_config, model_specs, args):
return wps, loss.item()
def verify_peak_memory(rank, golden_config, std_dev):
print("Peak allocated bytes on cuda:0: {:1d}".format(torch.cuda.memory_stats(rank)["allocated_bytes.all.peak"]))
current_device_usage = torch.cuda.memory_stats(rank)["allocated_bytes.all.peak"]
golden_ref = golden_config["peak_mem_usage"][rank]
def verify_peak_memory(golden_config, std_dev):
    """Verify that peak CUDA memory usage on device 0 is within the golden bound.

    Args:
        golden_config: Dict of golden benchmark data; only the
            "peak_mem_usage" key (reference peak allocated bytes) is read.
        std_dev: Multiplicative tolerance applied to the golden reference
            value before comparing.

    Raises:
        RuntimeError: If the observed peak allocation is not strictly below
            ``golden_ref * std_dev``.
    """
    # Query the allocator stats once and reuse the value for both the log
    # line and the comparison (the original queried torch.cuda twice).
    current_device_usage = torch.cuda.memory_stats(0)["allocated_bytes.all.peak"]
    print("Peak allocated bytes on cuda:0: {:1d}".format(current_device_usage))
    golden_ref = golden_config["peak_mem_usage"]
    if not current_device_usage < golden_ref * std_dev:
        # Bug fix: the raise fires when usage EXCEEDS the tolerated bound, so
        # the message must say "higher than", not "less than"; also add the
        # missing space between the two concatenated string literals
        # (previously rendered as "whichis").
        raise RuntimeError(
            "Peak memory usage for cuda device {:d} is {:d} which "
            "is higher than golden reference value of {:d}".format(0, current_device_usage, golden_ref)
        )
def verify_lm_run(wps, golden_config, args):
def verify_lm_throughput(wps, golden_config, args):
"""Verify that words per second for a given benchmark run matches the golden data."""
# Verify wps only on the last rank in multiprocess pipe
if not args.multiprocess or dist.get_rank() == dist.get_world_size() - 1:
# Assert that words per second is within 3 standard deviations of the average
# of five golden runs
print("Throughput(wps) is {:.2f}.".format(wps))
if not wps > (golden_config["avg_wps"] - (3 * golden_config["std_dev_wps"])):
raise RuntimeError(
......@@ -253,12 +256,6 @@ def verify_lm_run(wps, golden_config, args):
)
)
if args.multiprocess:
verify_peak_memory(dist.get_rank(), golden_config, 1.5)
else:
for i in range(4):
verify_peak_memory(i, golden_config, 1.1)
def benchmark_language_model(model_config, model, benchmark_config, model_specs, args):
epoch = benchmark_config["epochs"]
......@@ -271,9 +268,14 @@ def benchmark_language_model(model_config, model, benchmark_config, model_specs,
print("-" * 110)
print("| end of epoch {:1d} | time: {:5.2f}s | train loss {:5.2f} ".format(epoch, elapsed_time, loss))
print("-" * 110)
print("Throughput(wps) is {:.2f}.".format(wps))
print("Peak allocated bytes on cuda:0: {:1d}".format(torch.cuda.memory_stats(0)["allocated_bytes.all.peak"]))
# TODO(anj-s): Enable golden config data verification.
if args.model_name == "seq":
raise RuntimeError(
f"Golden data verification is only supported for the Transformer(lm) model and not {args.model_name}"
)
golden_config = get_golden_config(args.model_name, args)
verify_lm_throughput(wps, golden_config, args)
verify_peak_memory(golden_config, 1.1)
def get_synthetic_dataloaders(args, device, benchmark_config, model_specs):
......@@ -357,7 +359,7 @@ def get_golden_config(model_name, args):
"""Return a dict with the golden data for throughput and memory usage."""
if model_name == "lm":
return lm_wikitext2.get_golden_real_stats(False)
return lm_wikitext2.get_golden_real_stats()
else:
raise RuntimeError(f"Unrecognized args.model_mame {args.model_name}")
......@@ -403,7 +405,7 @@ def run_benchmark(args):
parser = argparse.ArgumentParser(description="benchmark")
parser.add_argument(
"--dry_run", default=True, action="store_true", help="Run a sample training run without regression testing."
"--dry_run", default=False, action="store_true", help="Run a sample training run without regression testing."
)
parser.add_argument(
"--debug",
......
......@@ -32,6 +32,13 @@ class Offload_Transformer:
"slices": 3,
}
def get_golden_real_stats():
    """Return golden reference data for the offload Transformer benchmark.

    The dict holds the mean and standard deviation of words-per-second over
    the golden runs, plus the reference peak CUDA memory usage in bytes.
    """
    golden_stats = dict(
        avg_wps=192.105,
        std_dev_wps=39.56,
        peak_mem_usage=1180848128,
    )
    return golden_stats
class Offload_Sequential:
def get_model_config():
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment