[fix] More robust stats for regression testing (#204)

* use the median and MAD (median absolute deviation) for the regression test
* synchronize on the CUDA kernels to make sure that we're measuring the actual completion time
* adjust the CircleCI threshold: not because the speed has regressed, but because we now measure the proper CUDA execution time

[fix] More robust stats for regression testing (#204)
* use the median and MAD (median absolute deviation) for the regression test
* synchronize on the CUDA kernels to make sure that we're measuring the actual completion time
* adjust the CircleCI threshold: not because the speed has regressed, but because we now measure the proper CUDA execution time
2b121242 · Benjamin Lefaudeux · GitHub · ad933b34 · 2b121242 · 2b121242
Unverified Commit 2b121242 authored Nov 22, 2020 by Benjamin Lefaudeux Committed by GitHub Nov 22, 2020
Hide whitespace changes
Inline Side-by-side

Showing with 17 additions and 8 deletions

.circleci/config.yml .circleci/config.yml +1 -1

benchmarks/oss.py benchmarks/oss.py +16 -7

No files found.
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -121,7 +121,7 @@ run_oss_benchmark: &run_oss_benchmark
  - run:
      name: Run OSS Benchmark
      command: |
-        python benchmarks/oss.py --check_regression --world_size 4 --reference_speed 760 --reference_memory 1120 --reference_loss 0.023
+        python benchmarks/oss.py --check_regression --world_size 4 --reference_speed 670 --reference_memory 1120 --reference_loss 0.023

 run_oss_gloo: &run_oss_gloo
  - run:

--- a/benchmarks/oss.py
+++ b/benchmarks/oss.py
@@ -5,7 +5,6 @@ import argparse
 from enum import Enum
 import importlib
 import logging
-import math
 import shutil
 import tempfile
 import time
@@ -128,6 +127,8 @@ def train(
        epoch_runtime = 0.0

        for batch in dataloader:
+            if not args.cpu:
+                torch.cuda.synchronize(rank)
            batch__start = time.monotonic()

            def closure(data=batch, grad_scaler=None):
@@ -192,6 +193,10 @@ def train(

            n_items += args.batch_size

+            if not args.cpu:
+                # make sure that the cuda kernels are finished before taking a timestamp
+                torch.cuda.synchronize(rank)
+
            batch_end = time.monotonic()
            epoch_runtime += batch_end - batch__start

@@ -218,14 +223,18 @@ def train(
    img_per_sec = n_items / (training_stop - training_start) * args.epochs
    logging.info(f"[{dist.get_rank()}] : Training done. {img_per_sec:.2f} img per sec inc. checkpoint")

-    # Compute the mean and average img per second
-    mean = sum(measurements) / len(measurements)
-    diff = map(lambda x: pow(x - mean, 2.0), measurements)
-    std = math.sqrt(sum(diff) / (len(measurements) - 1)) if args.epochs > 2 else -1
-    logging.info(f"[{dist.get_rank()}] : Mean speed: {mean:.2f} +/- {std:.2f}")
+    # Compute the median and median of absolute differences img per second
+    measurements.sort()
+    median = measurements[len(measurements) // 2]
+
+    abs_diff = list(map(lambda x: abs(x - median), measurements))
+    abs_diff.sort()
+    mad = abs_diff[len(measurements) // 2] if args.epochs > 2 else -1
+
+    logging.info(f"[{dist.get_rank()}] : Median speed: {median:.2f} +/- {mad:.2f}")

    if check_regression and dist.get_rank() == 0:
-        assert (mean + 3.0 * std) > args.reference_speed, "Speed regression detected"
+        assert (median + 3.0 * mad) > args.reference_speed, "Speed regression detected"
        assert max_memory < 1.05 * args.reference_memory, "Memory use regression detected"
        assert abs(cast(float, final_loss) - args.reference_loss) < 1e-3, "Loss regression detected"