"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "dd4df80f0b77c8f8e07e502298df0121cada9ce8"
Commit 34bdc8b5 authored by thomwolf

remove duplicate accumulate gradient step arguments

parent 0c24db9d
@@ -392,10 +392,6 @@ def main():
                         default=False,
                         action='store_true',
                         help="Whether not to use CUDA when available")
-    parser.add_argument("--accumulate_gradients",
-                        type=int,
-                        default=1,
-                        help="Number of steps to accumulate gradient on (divide the batch_size and accumulate)")
     parser.add_argument("--local_rank",
                         type=int,
                         default=-1,
@@ -426,11 +422,11 @@ def main():
         torch.distributed.init_process_group(backend='nccl')
     logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))
-    if args.accumulate_gradients < 1:
-        raise ValueError("Invalid accumulate_gradients parameter: {}, should be >= 1".format(
-                            args.accumulate_gradients))
+    if args.gradient_accumulation_steps < 1:
+        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
+                            args.gradient_accumulation_steps))
-    args.train_batch_size = int(args.train_batch_size / args.accumulate_gradients)
+    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)
     random.seed(args.seed)
     np.random.seed(args.seed)
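For context, the change keeps a single flag, --gradient_accumulation_steps, and deletes its duplicate --accumulate_gradients. Below is a minimal sketch of how such a flag is typically declared and consumed; the argument names and the validation mirror the diff, but the surrounding script code and the exact help strings are assumptions, not part of this commit.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--train_batch_size",
                    type=int,
                    default=32,
                    help="Total effective batch size for training.")
parser.add_argument("--gradient_accumulation_steps",
                    type=int,
                    default=1,
                    help="Number of update steps to accumulate before performing a backward/update pass.")
args = parser.parse_args()

# Same validation and batch-size adjustment as in the diff above.
if args.gradient_accumulation_steps < 1:
    raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
        args.gradient_accumulation_steps))
# Shrink the per-forward-pass batch size so the effective batch size stays constant.
args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)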
@@ -731,10 +731,6 @@ def main():
                         type=int,
                         default=-1,
                         help="local_rank for distributed training on gpus")
-    parser.add_argument("--accumulate_gradients",
-                        type=int,
-                        default=1,
-                        help="Number of steps to accumulate gradient on (divide the batch_size and accumulate)")
     parser.add_argument('--seed',
                         type=int,
                         default=42,
@@ -756,11 +752,11 @@ def main():
         torch.distributed.init_process_group(backend='nccl')
     logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))
-    if args.accumulate_gradients < 1:
-        raise ValueError("Invalid accumulate_gradients parameter: {}, should be >= 1".format(
-                            args.accumulate_gradients))
+    if args.gradient_accumulation_steps < 1:
+        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
+                            args.gradient_accumulation_steps))
-    args.train_batch_size = int(args.train_batch_size / args.accumulate_gradients)
+    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)
     random.seed(args.seed)
     np.random.seed(args.seed)
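The surviving flag controls how many mini-batches are processed before each optimizer update. The training loop itself is not part of this diff; the following is only a hedged sketch of the usual accumulation pattern, with a placeholder model, optimizer, and data loader standing in for the real BERT model, optimizer, and DataLoader built from the adjusted train_batch_size.

import torch

# Placeholder objects for illustration; the real scripts build these from args.
model = torch.nn.Linear(10, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
loader = [(torch.randn(4, 10), torch.randint(0, 2, (4,))) for _ in range(8)]
gradient_accumulation_steps = 2

model.train()
for step, (inputs, labels) in enumerate(loader):
    loss = torch.nn.functional.cross_entropy(model(inputs), labels)
    # Scale the loss so the accumulated gradients match one large-batch update.
    loss = loss / gradient_accumulation_steps
    loss.backward()
    # Only step the optimizer every `gradient_accumulation_steps` mini-batches.
    if (step + 1) % gradient_accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()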