Unverified commit 2b121242, authored by Benjamin Lefaudeux, committed by GitHub
Browse files

[fix] More robust stats for regression testing (#204)

* testing median and MAD (median absolute deviation)

* synchronize on kernels to make sure that we're measuring the actual completion time

* adjusting the CircleCI threshold: the speed has not regressed, but we now measure the actual CUDA execution time
parent ad933b34
......@@ -121,7 +121,7 @@ run_oss_benchmark: &run_oss_benchmark
- run:
name: Run OSS Benchmark
command: |
python benchmarks/oss.py --check_regression --world_size 4 --reference_speed 760 --reference_memory 1120 --reference_loss 0.023
python benchmarks/oss.py --check_regression --world_size 4 --reference_speed 670 --reference_memory 1120 --reference_loss 0.023
run_oss_gloo: &run_oss_gloo
- run:
......
......@@ -5,7 +5,6 @@ import argparse
from enum import Enum
import importlib
import logging
import math
import shutil
import tempfile
import time
......@@ -128,6 +127,8 @@ def train(
epoch_runtime = 0.0
for batch in dataloader:
if not args.cpu:
torch.cuda.synchronize(rank)
batch__start = time.monotonic()
def closure(data=batch, grad_scaler=None):
......@@ -192,6 +193,10 @@ def train(
n_items += args.batch_size
if not args.cpu:
# make sure that the cuda kernels are finished before taking a timestamp
torch.cuda.synchronize(rank)
batch_end = time.monotonic()
epoch_runtime += batch_end - batch__start
......@@ -218,14 +223,18 @@ def train(
img_per_sec = n_items / (training_stop - training_start) * args.epochs
logging.info(f"[{dist.get_rank()}] : Training done. {img_per_sec:.2f} img per sec inc. checkpoint")
# Compute the mean and standard deviation of the img per second measurements
mean = sum(measurements) / len(measurements)
diff = map(lambda x: pow(x - mean, 2.0), measurements)
std = math.sqrt(sum(diff) / (len(measurements) - 1)) if args.epochs > 2 else -1
logging.info(f"[{dist.get_rank()}] : Mean speed: {mean:.2f} +/- {std:.2f}")
# Compute the median and the median absolute deviation (MAD) of the img per second measurements
measurements.sort()
median = measurements[len(measurements) // 2]
abs_diff = list(map(lambda x: abs(x - median), measurements))
abs_diff.sort()
mad = abs_diff[len(measurements) // 2] if args.epochs > 2 else -1
logging.info(f"[{dist.get_rank()}] : Median speed: {median:.2f} +/- {mad:.2f}")
if check_regression and dist.get_rank() == 0:
assert (mean + 3.0 * std) > args.reference_speed, "Speed regression detected"
assert (median + 3.0 * mad) > args.reference_speed, "Speed regression detected"
assert max_memory < 1.05 * args.reference_memory, "Memory use regression detected"
assert abs(cast(float, final_loss) - args.reference_loss) < 1e-3, "Loss regression detected"
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment