Unverified Commit a765b68a authored by Rasmus Arpe Fogh Jensen, committed by GitHub

Update no_trainer.py scripts to include accelerate gradient accumulation wrapper (#18473)

* Added accelerate gradient accumulation wrapper to run_image_classification_no_trainer.py example script

* make fixup changes

* PR comments

* changed input to Accelerator based on PR comment, ran make fixup

* Added comment explaining the sync_gradients statement

* Fixed lr scheduler max steps

* Changed run_clm_no_trainer.py script to use accelerate gradient accum wrapper

* Fixed all scripts except wav2vec2 pretraining to use accelerate gradient accum wrapper

* Added accelerate gradient accum wrapper for wav2vec2_pretraining_no_trainer.py script

* make fixup and lr_scheduler step inserted back into run_qa_beam_search_no_trainer.py

* removed changes to run_wav2vec2_pretraining_no_trainer.py script and fixed using wrong constant in qa_beam_search_no_trainer.py script
parent f1f5de31
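
Every touched script moves to the same pattern: the `Accelerator` is constructed with `gradient_accumulation_steps`, the forward/backward/step logic is wrapped in `accelerator.accumulate(model)`, and progress is counted via `accelerator.sync_gradients`. Below is a minimal, self-contained sketch of that pattern; the toy model, data, and hyperparameter values are illustrative stand-ins, not code from this PR.

```python
# Minimal sketch of the gradient accumulation pattern these scripts adopt.
# The model, data and hyperparameters are made-up stand-ins; the real
# scripts build them from command-line arguments.
import torch
from accelerate import Accelerator

gradient_accumulation_steps = 4  # stand-in for args.gradient_accumulation_steps

# Accelerate now owns the accumulation logic instead of the training loop.
accelerator = Accelerator(gradient_accumulation_steps=gradient_accumulation_steps)

model = torch.nn.Linear(10, 2)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
dataset = torch.utils.data.TensorDataset(torch.randn(64, 10), torch.randint(0, 2, (64,)))
train_dataloader = torch.utils.data.DataLoader(dataset, batch_size=8)

model, optimizer, train_dataloader = accelerator.prepare(model, optimizer, train_dataloader)

completed_steps = 0
for step, (inputs, labels) in enumerate(train_dataloader):
    # Inside this context manager, Accelerate skips gradient synchronization
    # and turns optimizer.step() into a no-op until enough micro-batches have
    # been accumulated, so no manual `step % gradient_accumulation_steps`
    # bookkeeping (or loss rescaling) is needed in the loop body.
    with accelerator.accumulate(model):
        loss = torch.nn.functional.cross_entropy(model(inputs), labels)
        accelerator.backward(loss)
        optimizer.step()
        optimizer.zero_grad()

    # True only on micro-batches where an optimization step actually ran,
    # so this counts real optimizer updates rather than forward passes.
    if accelerator.sync_gradients:
        completed_steps += 1

print(f"optimizer updates: {completed_steps}")  # 64 samples / batch 8 / accumulation 4 = 2
```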
...
@@ -212,9 +212,14 @@ def main():
     # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
     # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
     # in the environment
-    accelerator = (
-        Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator()
-    )
+    accelerator_log_kwargs = {}
+
+    if args.with_tracking:
+        accelerator_log_kwargs["log_with"] = args.report_to
+        accelerator_log_kwargs["logging_dir"] = args.output_dir
+
+    accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs)
+
     logger.info(accelerator.state)
     # Make one log on every process with the configuration for debugging.
     logging.basicConfig(
...
@@ -384,8 +389,8 @@ def main():
     lr_scheduler = get_scheduler(
         name=args.lr_scheduler_type,
         optimizer=optimizer,
-        num_warmup_steps=args.num_warmup_steps,
-        num_training_steps=args.max_train_steps,
+        num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps,
+        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
     )

     # Prepare everything with our `accelerator`.
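
Both scheduler arguments are scaled by the accumulation factor because, under the `accumulate` wrapper, `lr_scheduler.step()` runs on every micro-batch rather than once per optimizer update; the multiplication keeps the schedule, measured in optimizer updates, unchanged. For example, with `--num_warmup_steps 100` and `--gradient_accumulation_steps 4`, the scheduler is stepped 400 times during warmup, which corresponds to the intended 100 optimizer updates.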
...
@@ -467,17 +472,20 @@ def main():
             if resume_step is not None and step < resume_step:
                 completed_steps += 1
                 continue
-            outputs = model(**batch)
-            loss = outputs.loss
-            # We keep track of the loss at each epoch
-            if args.with_tracking:
-                total_loss += loss.detach().float()
-            loss = loss / args.gradient_accumulation_steps
-            accelerator.backward(loss)
-            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
+
+            with accelerator.accumulate(model):
+                outputs = model(**batch)
+                loss = outputs.loss
+                # We keep track of the loss at each epoch
+                if args.with_tracking:
+                    total_loss += loss.detach().float()
+                accelerator.backward(loss)
                 optimizer.step()
                 lr_scheduler.step()
                 optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
                 progress_bar.update(1)
                 completed_steps += 1
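
`accelerator.sync_gradients` is `True` only on the micro-batch where gradients are actually synchronized and applied, so gating `progress_bar.update(1)` and `completed_steps` on it preserves the semantics of the old `step % args.gradient_accumulation_steps` check: both count optimizer updates, not forward passes. The same three changes repeat in each script below.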
...
...
@@ -249,9 +249,14 @@ def main():
     # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
     # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
     # in the environment
-    accelerator = (
-        Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator()
-    )
+    accelerator_log_kwargs = {}
+
+    if args.with_tracking:
+        accelerator_log_kwargs["log_with"] = args.report_to
+        accelerator_log_kwargs["logging_dir"] = args.output_dir
+
+    accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs)
+
     # Make one log on every process with the configuration for debugging.
     logging.basicConfig(
         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -486,8 +491,8 @@ def main():
     lr_scheduler = get_scheduler(
         name=args.lr_scheduler_type,
         optimizer=optimizer,
-        num_warmup_steps=args.num_warmup_steps,
-        num_training_steps=args.max_train_steps,
+        num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps,
+        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
     )

     # Prepare everything with our `accelerator`.
@@ -567,17 +572,20 @@ def main():
             if resume_step is not None and step < resume_step:
                 completed_steps += 1
                 continue
-            outputs = model(**batch)
-            loss = outputs.loss
-            # We keep track of the loss at each epoch
-            if args.with_tracking:
-                total_loss += loss.detach().float()
-            loss = loss / args.gradient_accumulation_steps
-            accelerator.backward(loss)
-            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
+
+            with accelerator.accumulate(model):
+                outputs = model(**batch)
+                loss = outputs.loss
+                # We keep track of the loss at each epoch
+                if args.with_tracking:
+                    total_loss += loss.detach().float()
+                accelerator.backward(loss)
                 optimizer.step()
                 lr_scheduler.step()
                 optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
                 progress_bar.update(1)
                 completed_steps += 1
...
...
@@ -258,9 +258,14 @@ def main():
     # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
     # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
     # in the environment
-    accelerator = (
-        Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator()
-    )
+    accelerator_log_kwargs = {}
+
+    if args.with_tracking:
+        accelerator_log_kwargs["log_with"] = args.report_to
+        accelerator_log_kwargs["logging_dir"] = args.output_dir
+
+    accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs)
+
     # Make one log on every process with the configuration for debugging.
     logging.basicConfig(
         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -530,8 +535,8 @@ def main():
     lr_scheduler = get_scheduler(
         name=args.lr_scheduler_type,
         optimizer=optimizer,
-        num_warmup_steps=args.num_warmup_steps,
-        num_training_steps=args.max_train_steps,
+        num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps,
+        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
     )

     # Prepare everything with our `accelerator`.
@@ -611,17 +616,20 @@ def main():
             if resume_step is not None and step < resume_step:
                 completed_steps += 1
                 continue
-            outputs = model(**batch)
-            loss = outputs.loss
-            # We keep track of the loss at each epoch
-            if args.with_tracking:
-                total_loss += loss.detach().float()
-            loss = loss / args.gradient_accumulation_steps
-            accelerator.backward(loss)
-            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
+
+            with accelerator.accumulate(model):
+                outputs = model(**batch)
+                loss = outputs.loss
+                # We keep track of the loss at each epoch
+                if args.with_tracking:
+                    total_loss += loss.detach().float()
+                accelerator.backward(loss)
                 optimizer.step()
                 lr_scheduler.step()
                 optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
                 progress_bar.update(1)
                 completed_steps += 1
...
...
@@ -65,7 +65,7 @@ MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)


 def parse_args():
-    parser = argparse.ArgumentParser(description="Finetune a transformers model on a text classification task")
+    parser = argparse.ArgumentParser(description="Finetune a transformers model on a multiple choice task")
     parser.add_argument(
         "--dataset_name",
         type=str,
@@ -284,9 +284,14 @@ def main():
     # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
     # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
     # in the environment
-    accelerator = (
-        Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator()
-    )
+    accelerator_log_kwargs = {}
+
+    if args.with_tracking:
+        accelerator_log_kwargs["log_with"] = args.report_to
+        accelerator_log_kwargs["logging_dir"] = args.output_dir
+
+    accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs)
+
     # Make one log on every process with the configuration for debugging.
     logging.basicConfig(
         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -483,8 +488,8 @@ def main():
     lr_scheduler = get_scheduler(
         name=args.lr_scheduler_type,
         optimizer=optimizer,
-        num_warmup_steps=args.num_warmup_steps,
-        num_training_steps=args.max_train_steps,
+        num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps,
+        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
     )

     # Prepare everything with our `accelerator`.
@@ -567,17 +572,20 @@ def main():
             if resume_step is not None and step < resume_step:
                 completed_steps += 1
                 continue
-            outputs = model(**batch)
-            loss = outputs.loss
-            # We keep track of the loss at each epoch
-            if args.with_tracking:
-                total_loss += loss.detach().float()
-            loss = loss / args.gradient_accumulation_steps
-            accelerator.backward(loss)
-            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
+
+            with accelerator.accumulate(model):
+                outputs = model(**batch)
+                loss = outputs.loss
+                # We keep track of the loss at each epoch
+                if args.with_tracking:
+                    total_loss += loss.detach().float()
+                accelerator.backward(loss)
                 optimizer.step()
                 lr_scheduler.step()
                 optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
                 progress_bar.update(1)
                 completed_steps += 1
...
...
@@ -297,8 +297,16 @@ def main():
     send_example_telemetry("run_qa_beam_search_no_trainer", args)

     # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
-    # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment
-    accelerator = Accelerator(log_with="all", logging_dir=args.output_dir) if args.with_tracking else Accelerator()
+    # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers
+    # in the environment
+    accelerator_log_kwargs = {}
+
+    if args.with_tracking:
+        accelerator_log_kwargs["log_with"] = args.report_to
+        accelerator_log_kwargs["logging_dir"] = args.output_dir
+
+    accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs)
+
     # Make one log on every process with the configuration for debugging.
     logging.basicConfig(
         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -739,8 +747,8 @@ def main():
     lr_scheduler = get_scheduler(
         name=args.lr_scheduler_type,
         optimizer=optimizer,
-        num_warmup_steps=args.num_warmup_steps,
-        num_training_steps=args.max_train_steps,
+        num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps,
+        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
     )

     # Prepare everything with our `accelerator`.
@@ -818,17 +826,22 @@ def main():
             if resume_step is not None and step < resume_step:
                 completed_steps += 1
                 continue
-            outputs = model(**batch)
-            loss = outputs.loss
-            # We keep track of the loss at each epoch
-            if args.with_tracking:
-                total_loss += loss.detach().float()
-            loss = loss / args.gradient_accumulation_steps
-            accelerator.backward(loss)
-            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
+
+            with accelerator.accumulate(model):
+                outputs = model(**batch)
+                loss = outputs.loss
+                # We keep track of the loss at each epoch
+                if args.with_tracking:
+                    total_loss += loss.detach().float()
+
+                accelerator.backward(loss)
+
                 optimizer.step()
                 lr_scheduler.step()
                 optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
                 progress_bar.update(1)
                 completed_steps += 1
...
...
@@ -337,9 +337,14 @@ def main():
     # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
     # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
     # in the environment
-    accelerator = (
-        Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator()
-    )
+    accelerator_log_kwargs = {}
+
+    if args.with_tracking:
+        accelerator_log_kwargs["log_with"] = args.report_to
+        accelerator_log_kwargs["logging_dir"] = args.output_dir
+
+    accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs)
+
     # Make one log on every process with the configuration for debugging.
     logging.basicConfig(
         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -757,8 +762,8 @@ def main():
     lr_scheduler = get_scheduler(
         name=args.lr_scheduler_type,
         optimizer=optimizer,
-        num_warmup_steps=args.num_warmup_steps,
-        num_training_steps=args.max_train_steps,
+        num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps,
+        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
     )

     # Prepare everything with our `accelerator`.
@@ -839,17 +844,21 @@ def main():
             if resume_step is not None and step < resume_step:
                 completed_steps += 1
                 continue
-            outputs = model(**batch)
-            loss = outputs.loss
-            # We keep track of the loss at each epoch
-            if args.with_tracking:
-                total_loss += loss.detach().float()
-            loss = loss / args.gradient_accumulation_steps
-            accelerator.backward(loss)
-            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
+
+            with accelerator.accumulate(model):
+                outputs = model(**batch)
+                loss = outputs.loss
+                # We keep track of the loss at each epoch
+                if args.with_tracking:
+                    total_loss += loss.detach().float()
+
+                accelerator.backward(loss)
                 optimizer.step()
                 lr_scheduler.step()
                 optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
                 progress_bar.update(1)
                 completed_steps += 1
...
...
@@ -326,9 +326,14 @@ def main():
     # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
     # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
     # in the environment
-    accelerator = (
-        Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator()
-    )
+    accelerator_log_kwargs = {}
+
+    if args.with_tracking:
+        accelerator_log_kwargs["log_with"] = args.report_to
+        accelerator_log_kwargs["logging_dir"] = args.output_dir
+
+    accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs)
+
     logger.info(accelerator.state, main_process_only=False)
     if accelerator.is_local_main_process:
         datasets.utils.logging.set_verbosity_warning()
@@ -487,8 +492,8 @@ def main():
     lr_scheduler = get_scheduler(
         name=args.lr_scheduler_type,
         optimizer=optimizer,
-        num_warmup_steps=args.num_warmup_steps,
-        num_training_steps=args.max_train_steps,
+        num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps,
+        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
     )

     # Prepare everything with our `accelerator`.
@@ -563,17 +568,20 @@ def main():
             if resume_step is not None and step < resume_step:
                 completed_steps += 1
                 continue
-            outputs = model(**batch)
-            loss = outputs.loss
-            # We keep track of the loss at each epoch
-            if args.with_tracking:
-                total_loss += loss.detach().float()
-            loss = loss / args.gradient_accumulation_steps
-            accelerator.backward(loss)
-            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
+
+            with accelerator.accumulate(model):
+                outputs = model(**batch)
+                loss = outputs.loss
+                # We keep track of the loss at each epoch
+                if args.with_tracking:
+                    total_loss += loss.detach().float()
+                accelerator.backward(loss)
                 optimizer.step()
                 lr_scheduler.step()
                 optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
                 progress_bar.update(1)
                 completed_steps += 1
...
...
@@ -330,9 +330,13 @@ def main():
     # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
     # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
     # in the environment
-    accelerator = (
-        Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator()
-    )
+    accelerator_log_kwargs = {}
+
+    if args.with_tracking:
+        accelerator_log_kwargs["log_with"] = args.report_to
+        accelerator_log_kwargs["logging_dir"] = args.output_dir
+
+    accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs)
     if args.source_prefix is None and args.model_name_or_path in [
         "t5-small",
         "t5-base",
@@ -552,8 +556,8 @@ def main():
     lr_scheduler = get_scheduler(
         name=args.lr_scheduler_type,
         optimizer=optimizer,
-        num_warmup_steps=args.num_warmup_steps,
-        num_training_steps=args.max_train_steps,
+        num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps,
+        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
    )

     # Prepare everything with our `accelerator`.
@@ -635,17 +639,20 @@ def main():
             if resume_step is not None and step < resume_step:
                 completed_steps += 1
                 continue
-            outputs = model(**batch)
-            loss = outputs.loss
-            # We keep track of the loss at each epoch
-            if args.with_tracking:
-                total_loss += loss.detach().float()
-            loss = loss / args.gradient_accumulation_steps
-            accelerator.backward(loss)
-            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
+
+            with accelerator.accumulate(model):
+                outputs = model(**batch)
+                loss = outputs.loss
+                # We keep track of the loss at each epoch
+                if args.with_tracking:
+                    total_loss += loss.detach().float()
+                accelerator.backward(loss)
                 optimizer.step()
                 lr_scheduler.step()
                 optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
                 progress_bar.update(1)
                 completed_steps += 1
...