Commit a84a5fa0 authored by Jared Casper's avatar Jared Casper Committed by Deepak Narayanan
Browse files

Make an eval iteration the same number of samples as a training iteration

parent 2cf1d6d0
@@ -761,6 +761,7 @@ def evaluate(forward_step_func, data_iterator, model, verbose=False):
             print_rank_0('Evaluating iter {}/{}'.format(iteration,
                                                         args.eval_iters))
+        for _ in range(args.num_microbatches_in_minibatch):
         if not mpu.is_pipeline_first_stage():
             input_tensor, _ = communicate(
                 tensor_send_next=None,
@@ -793,7 +794,7 @@ def evaluate(forward_step_func, data_iterator, model, verbose=False):
     model.train()
     for key in total_loss_dict:
-        total_loss_dict[key] /= args.eval_iters
+        total_loss_dict[key] /= args.eval_iters * args.num_microbatches_in_minibatch
     return total_loss_dict
...
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment