OpenDAS / Megatron-LM
Commit b219ff00, authored Nov 16, 2020 by Jared Casper
Parent: b4b0d739

Update code used for finetuning to latest API.
Showing 3 changed files with 5 additions and 7 deletions (+5, -7):

megatron/model/classification.py   (+1, -2)
megatron/model/multiple_choice.py  (+1, -2)
tasks/finetune_utils.py            (+3, -3)
megatron/model/classification.py
@@ -52,8 +52,7 @@ class Classification(MegatronModule):

     def forward(self, input_ids, attention_mask, tokentype_ids):

-        extended_attention_mask = bert_extended_attention_mask(
-            attention_mask, next(self.language_model.parameters()).dtype)
+        extended_attention_mask = bert_extended_attention_mask(attention_mask)
         position_ids = bert_position_ids(input_ids)

         _, pooled_output = self.language_model(input_ids,
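The old call passed next(self.language_model.parameters()).dtype, presumably so the helper could cast the mask to match the model's parameters; the updated API takes only the padding mask and handles the rest internally. As a rough sketch of what a single-argument bert_extended_attention_mask can look like (an assumed implementation for illustration, not code taken from this commit; PyTorch tensors assumed):

def bert_extended_attention_mask(attention_mask):
    # attention_mask: [batch, seq_len] padding mask.
    attention_mask_b1s = attention_mask.unsqueeze(1)   # [b, 1, s]
    attention_mask_bs1 = attention_mask.unsqueeze(2)   # [b, s, 1]
    # Pairwise mask [b, s, s], then a head dimension -> [b, 1, s, s].
    attention_mask_bss = attention_mask_b1s * attention_mask_bs1
    extended_attention_mask = attention_mask_bss.unsqueeze(1)
    # Return a boolean mask (True = masked position), so the caller no
    # longer has to supply a dtype.
    return extended_attention_mask < 0.5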
megatron/model/multiple_choice.py
@@ -64,8 +64,7 @@ class MultipleChoice(MegatronModule):

         attention_mask = attention_mask.view(-1, attention_mask.size(-1))
         tokentype_ids = tokentype_ids.view(-1, tokentype_ids.size(-1))

-        extended_attention_mask = bert_extended_attention_mask(
-            attention_mask, next(self.language_model.parameters()).dtype)
+        extended_attention_mask = bert_extended_attention_mask(attention_mask)
         position_ids = bert_position_ids(input_ids)

         _, pooled_output = self.language_model(input_ids,
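The context lines above also show the multiple-choice reshaping: inputs arrive with a choice dimension, roughly [batch, num_choices, seq_len], and are flattened so every choice passes through the shared language model as its own row. A quick shape check of the .view(-1, size(-1)) pattern (the concrete sizes below are illustrative):

import torch

batch, num_choices, seq_len = 4, 2, 128
attention_mask = torch.ones(batch, num_choices, seq_len)
# Collapse the batch and choice dimensions into one leading dimension.
flat = attention_mask.view(-1, attention_mask.size(-1))
assert flat.shape == (batch * num_choices, seq_len)   # (8, 128)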
tasks/finetune_utils.py
@@ -161,7 +161,7 @@ def _train(model, optimizer, lr_scheduler, forward_step,

             start_iteration = 0

             # Train for one step.
-            losses_dict, _ = train_step(forward_step, batch, model,
+            losses_dict, skipped_iter = train_step(forward_step, batch, model,
                                         optimizer, lr_scheduler)
             iteration += 1

@@ -169,7 +169,7 @@ def _train(model, optimizer, lr_scheduler, forward_step,

             report_memory_flag = training_log(losses_dict, losses_dict_sum,
                                               optimizer.param_groups[0]['lr'],
                                               iteration, optimizer.loss_scale,
-                                              report_memory_flag)
+                                              report_memory_flag, skipped_iter)

             # Autoresume
             if args.adlr_autoresume and \
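The finetuning loop changes track the updated train_step contract: instead of discarding the second return value, the loop keeps it as skipped_iter and forwards it to training_log as an extra argument. A minimal sketch of the shape of that contract, under the assumption that skipped_iter flags iterations where a mixed-precision overflow caused the optimizer step to be skipped (this is not the Megatron-LM implementation; the overflow attribute and forward_step return convention below are assumptions):

def train_step(forward_step, batch, model, optimizer, lr_scheduler):
    # One forward/backward pass; forward_step is assumed to return
    # (loss, losses_dict) for the given batch.
    optimizer.zero_grad()
    loss, losses_dict = forward_step(batch, model)
    loss.backward()

    # With fp16 training the loss scaler may detect an overflow, in which
    # case the update is skipped; report that so training_log can account
    # for the skipped iteration.
    skipped_iter = 0
    if getattr(optimizer, 'overflow', False):   # assumed fp16-optimizer attribute
        skipped_iter = 1
    else:
        optimizer.step()
        lr_scheduler.step()

    return losses_dict, skipped_iter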