Commit abd7110e authored by Pasquale Minervini's avatar Pasquale Minervini
Browse files

gradient norm clipping should be done right before calling the optimiser -...

gradient norm clipping should be done right before calling the optimiser - fixing run_glue and run_ner as well
parent 3775550c
...@@ -154,13 +154,16 @@ def train(args, train_dataset, model, tokenizer): ...@@ -154,13 +154,16 @@ def train(args, train_dataset, model, tokenizer):
if args.fp16: if args.fp16:
with amp.scale_loss(loss, optimizer) as scaled_loss: with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward() scaled_loss.backward()
torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
else: else:
loss.backward() loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
tr_loss += loss.item() tr_loss += loss.item()
if (step + 1) % args.gradient_accumulation_steps == 0 and not args.tpu: if (step + 1) % args.gradient_accumulation_steps == 0 and not args.tpu:
if args.fp16:
torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
else:
torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
optimizer.step() optimizer.step()
scheduler.step() # Update learning rate schedule scheduler.step() # Update learning rate schedule
model.zero_grad() model.zero_grad()
......
...@@ -133,13 +133,16 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id): ...@@ -133,13 +133,16 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
if args.fp16: if args.fp16:
with amp.scale_loss(loss, optimizer) as scaled_loss: with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward() scaled_loss.backward()
torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
else: else:
loss.backward() loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
tr_loss += loss.item() tr_loss += loss.item()
if (step + 1) % args.gradient_accumulation_steps == 0: if (step + 1) % args.gradient_accumulation_steps == 0:
if args.fp16:
torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
else:
torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
scheduler.step() # Update learning rate schedule scheduler.step() # Update learning rate schedule
optimizer.step() optimizer.step()
model.zero_grad() model.zero_grad()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment