Unverified Commit 2b121242 authored by Benjamin Lefaudeux's avatar Benjamin Lefaudeux Committed by GitHub
Browse files

[fix] More robust stats for regression testing (#204)

* testing median and MAD

* synchronize on kernels to make sure that we're measuring the actual completion time

* adjusting the circleci threshold, not because the speed has regressed but because we now measure proper cuda execution time
parent ad933b34
...@@ -121,7 +121,7 @@ run_oss_benchmark: &run_oss_benchmark ...@@ -121,7 +121,7 @@ run_oss_benchmark: &run_oss_benchmark
- run: - run:
name: Run OSS Benchmark name: Run OSS Benchmark
command: | command: |
python benchmarks/oss.py --check_regression --world_size 4 --reference_speed 760 --reference_memory 1120 --reference_loss 0.023 python benchmarks/oss.py --check_regression --world_size 4 --reference_speed 670 --reference_memory 1120 --reference_loss 0.023
run_oss_gloo: &run_oss_gloo run_oss_gloo: &run_oss_gloo
- run: - run:
......
...@@ -5,7 +5,6 @@ import argparse ...@@ -5,7 +5,6 @@ import argparse
from enum import Enum from enum import Enum
import importlib import importlib
import logging import logging
import math
import shutil import shutil
import tempfile import tempfile
import time import time
...@@ -128,6 +127,8 @@ def train( ...@@ -128,6 +127,8 @@ def train(
epoch_runtime = 0.0 epoch_runtime = 0.0
for batch in dataloader: for batch in dataloader:
if not args.cpu:
torch.cuda.synchronize(rank)
batch__start = time.monotonic() batch__start = time.monotonic()
def closure(data=batch, grad_scaler=None): def closure(data=batch, grad_scaler=None):
...@@ -192,6 +193,10 @@ def train( ...@@ -192,6 +193,10 @@ def train(
n_items += args.batch_size n_items += args.batch_size
if not args.cpu:
# make sure that the cuda kernels are finished before taking a timestamp
torch.cuda.synchronize(rank)
batch_end = time.monotonic() batch_end = time.monotonic()
epoch_runtime += batch_end - batch__start epoch_runtime += batch_end - batch__start
...@@ -218,14 +223,18 @@ def train( ...@@ -218,14 +223,18 @@ def train(
img_per_sec = n_items / (training_stop - training_start) * args.epochs img_per_sec = n_items / (training_stop - training_start) * args.epochs
logging.info(f"[{dist.get_rank()}] : Training done. {img_per_sec:.2f} img per sec inc. checkpoint") logging.info(f"[{dist.get_rank()}] : Training done. {img_per_sec:.2f} img per sec inc. checkpoint")
# Compute the mean and average img per second # Compute the median and median of absolute differences img per second
mean = sum(measurements) / len(measurements) measurements.sort()
diff = map(lambda x: pow(x - mean, 2.0), measurements) median = measurements[len(measurements) // 2]
std = math.sqrt(sum(diff) / (len(measurements) - 1)) if args.epochs > 2 else -1
logging.info(f"[{dist.get_rank()}] : Mean speed: {mean:.2f} +/- {std:.2f}") abs_diff = list(map(lambda x: abs(x - median), measurements))
abs_diff.sort()
mad = abs_diff[len(measurements) // 2] if args.epochs > 2 else -1
logging.info(f"[{dist.get_rank()}] : Median speed: {median:.2f} +/- {mad:.2f}")
if check_regression and dist.get_rank() == 0: if check_regression and dist.get_rank() == 0:
assert (mean + 3.0 * std) > args.reference_speed, "Speed regression detected" assert (median + 3.0 * mad) > args.reference_speed, "Speed regression detected"
assert max_memory < 1.05 * args.reference_memory, "Memory use regression detected" assert max_memory < 1.05 * args.reference_memory, "Memory use regression detected"
assert abs(cast(float, final_loss) - args.reference_loss) < 1e-3, "Loss regression detected" assert abs(cast(float, final_loss) - args.reference_loss) < 1e-3, "Loss regression detected"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment