Commit 34bdc8b5 authored by thomwolf

remove duplicate accumulate gradient step arguments

parent 0c24db9d
@@ -392,10 +392,6 @@ def main():
                         default=False,
                         action='store_true',
                         help="Whether not to use CUDA when available")
-    parser.add_argument("--accumulate_gradients",
-                        type=int,
-                        default=1,
-                        help="Number of steps to accumulate gradient on (divide the batch_size and accumulate)")
     parser.add_argument("--local_rank",
                         type=int,
                         default=-1,
@@ -426,11 +422,11 @@ def main():
         torch.distributed.init_process_group(backend='nccl')
     logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))
-    if args.accumulate_gradients < 1:
-        raise ValueError("Invalid accumulate_gradients parameter: {}, should be >= 1".format(
-                            args.accumulate_gradients))
-    args.train_batch_size = int(args.train_batch_size / args.accumulate_gradients)
+    if args.gradient_accumulation_steps < 1:
+        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
+                            args.gradient_accumulation_steps))
+    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)
     random.seed(args.seed)
     np.random.seed(args.seed)
...
@@ -731,10 +731,6 @@ def main():
                         type=int,
                         default=-1,
                         help="local_rank for distributed training on gpus")
-    parser.add_argument("--accumulate_gradients",
-                        type=int,
-                        default=1,
-                        help="Number of steps to accumulate gradient on (divide the batch_size and accumulate)")
     parser.add_argument('--seed',
                         type=int,
                         default=42,
@@ -756,11 +752,11 @@ def main():
         torch.distributed.init_process_group(backend='nccl')
     logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))
-    if args.accumulate_gradients < 1:
-        raise ValueError("Invalid accumulate_gradients parameter: {}, should be >= 1".format(
-                            args.accumulate_gradients))
-    args.train_batch_size = int(args.train_batch_size / args.accumulate_gradients)
+    if args.gradient_accumulation_steps < 1:
+        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
+                            args.gradient_accumulation_steps))
+    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)
     random.seed(args.seed)
     np.random.seed(args.seed)
...
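The surviving --gradient_accumulation_steps flag controls standard gradient accumulation: the per-step batch size is train_batch_size divided by the number of accumulation steps, and the optimizer is presumably stepped only once per accumulation window so the effective batch size stays the same. A minimal sketch of that pattern follows (hypothetical toy model, data, and hyperparameters; not the repository's actual training loop):

import torch
from torch import nn

gradient_accumulation_steps = 4          # hypothetical value
model = nn.Linear(10, 2)                 # toy model standing in for BERT
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
loss_fn = nn.CrossEntropyLoss()

for step in range(100):
    inputs = torch.randn(8, 10)          # per-step batch = train_batch_size / gradient_accumulation_steps
    labels = torch.randint(0, 2, (8,))
    loss = loss_fn(model(inputs), labels)
    # scale so the accumulated gradient averages over the full effective batch
    (loss / gradient_accumulation_steps).backward()
    if (step + 1) % gradient_accumulation_steps == 0:
        optimizer.step()                 # one parameter update per accumulation window
        optimizer.zero_grad()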