Unverified Commit c50aa67b authored by Nicolas, committed by GitHub

Resizing embedding matrix before sending it to the optimizer. (#3532)

* Resizing the embedding matrix after the model's parameters have been passed to the optimizer prevents the optimizer from updating the newly resized matrix.

* Remove an extra space for style.
parent 1b101599
@@ -233,6 +233,9 @@ def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer
     else:
         t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
 
+    model = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
+    model.resize_token_embeddings(len(tokenizer))
+
     # Prepare optimizer and schedule (linear warmup and decay)
     no_decay = ["bias", "LayerNorm.weight"]
     optimizer_grouped_parameters = [
@@ -309,9 +312,6 @@ def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer
     tr_loss, logging_loss = 0.0, 0.0
 
-    model_to_resize = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
-    model_to_resize.resize_token_embeddings(len(tokenizer))
-
     model.zero_grad()
     train_iterator = trange(
         epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
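For context, a minimal standalone sketch of the fixed ordering (not part of this commit; the model name, added tokens, and AdamW settings below are illustrative assumptions, not values from the script):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Illustrative model and tokens; any model with a resizable token embedding behaves the same way.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer.add_tokens(["<new_token_1>", "<new_token_2>"])

# Resize first: resize_token_embeddings swaps in a new, larger embedding weight,
# so it must exist before the optimizer collects the model's parameters.
model.resize_token_embeddings(len(tokenizer))
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# If the resize happened after building the optimizer, the optimizer would still
# reference the old embedding weight, and the rows for the added tokens would
# never receive updates, which is the issue this commit fixes.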