"docs/git@developer.sourcefind.cn:OpenDAS/vision.git" did not exist on "5c1e62ff67fb40e8e1bf70714d8d4b3c7d151f41"
Commit 5d543f9b authored by Naman Goyal, committed by Facebook Github Bot

Fixed RoBERTa fine-tuning with `--find-unused-parameters` on multi-GPU

Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/806

Differential Revision: D16649933

fbshipit-source-id: 6eeda6e2caf8019228e3efc0c27ddfcc3c4d8674
parent 1684e166
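
For context: when only a classification head is trained on top of RoBERTa, some pretrained parameters (for example the masked-LM head) never receive gradients, and PyTorch's `DistributedDataParallel` only tolerates that when it is constructed with `find_unused_parameters=True`. The `--find-unused-parameters` flag added to the example commands below is fairseq's switch for that DDP option. A minimal sketch of the underlying PyTorch behaviour follows; the toy module, sizes, and the single-process `gloo` group are illustrative and not part of this commit.

```python
import os
import torch
import torch.distributed as dist
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP

# Toy module where one branch (lm_head) never contributes to the loss,
# mirroring a RoBERTa checkpoint fine-tuned with only a classification head.
class ToyModel(nn.Module):
    def __init__(self, dim=16, num_classes=2):
        super().__init__()
        self.encoder = nn.Linear(dim, dim)
        self.lm_head = nn.Linear(dim, 100)               # unused during fine-tuning
        self.classification_head = nn.Linear(dim, num_classes)

    def forward(self, x):
        return self.classification_head(self.encoder(x))

if __name__ == '__main__':
    # Single-process gloo group just so DDP can be constructed in this sketch.
    os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
    os.environ.setdefault('MASTER_PORT', '29500')
    dist.init_process_group('gloo', rank=0, world_size=1)

    # Without find_unused_parameters=True, DDP expects every parameter to
    # receive a gradient on each backward pass; the lm_head parameters never
    # do, which is what breaks multi-GPU RoBERTa fine-tuning without the flag.
    model = DDP(ToyModel(), find_unused_parameters=True)

    loss = model(torch.randn(8, 16)).sum()
    loss.backward()
    print('lm_head grad is None:', model.module.lm_head.weight.grad is None)  # True
```
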
@@ -115,6 +115,7 @@ CUDA_VISIBLE_DEVICES=0 python train.py IMDB-bin/ \
     --max-epoch 10 \
     --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \
     --truncate-sequence \
+    --find-unused-parameters \
     --update-freq 4
 ```
 Above will train with effective batch-size of `32`, tested on one `Nvidia V100 32gb`.
...
@@ -42,6 +42,7 @@ CUDA_VISIBLE_DEVICES=0 python train.py RTE-bin/ \
     --lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \
     --fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \
     --max-epoch 10 \
+    --find-unused-parameters \
     --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric;
 ```
...
@@ -30,7 +30,7 @@ class MaskedLmLoss(FairseqCriterion):
         3) logging outputs to display while training
         """
         # compute MLM loss
-        logits = model(**sample['net_input'], last_state_only=True)[0]
+        logits = model(**sample['net_input'], return_all_hiddens=False)[0]
         targets = model.get_targets(sample, [logits])
         loss = F.nll_loss(
             F.log_softmax(
...
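
The hunk above is cut off right after `F.log_softmax(`. The overall pattern the criterion follows is a summed negative log-likelihood over log-softmaxed logits, with padding targets ignored; here is a rough, self-contained sketch of that pattern (tensor shapes, the smoke test, and the padding index are assumptions for illustration, not the exact fairseq code):

```python
import torch
import torch.nn.functional as F

def masked_lm_loss(logits, targets, padding_idx=1):
    """Summed NLL over log-softmaxed logits, skipping padding targets.

    logits:  (batch, seq_len, vocab) float tensor from the model
    targets: (batch, seq_len) long tensor of token ids (padding_idx is ignored)
    """
    lprobs = F.log_softmax(
        logits.view(-1, logits.size(-1)),
        dim=-1,
        dtype=torch.float32,   # compute in fp32 for stability under fp16 training
    )
    return F.nll_loss(
        lprobs,
        targets.view(-1),
        reduction='sum',
        ignore_index=padding_idx,
    )

# Tiny smoke test with random data.
logits = torch.randn(2, 5, 11)
targets = torch.randint(0, 11, (2, 5))
print(masked_lm_loss(logits, targets))
```
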
@@ -31,18 +31,15 @@ class SentencePredictionCriterion(FairseqCriterion):
         2) the sample size, which is used as the denominator for the gradient
         3) logging outputs to display while training
         """
-        features, extra = model(**sample['net_input'], features_only=True)
-        padding_mask = sample['net_input']['src_tokens'].eq(self.padding_idx)
         assert hasattr(model, 'classification_heads') and \
             'sentence_classification_head' in model.classification_heads, \
             "model must provide sentence classification head for --criterion=sentence_prediction"
-        logits = model.classification_heads['sentence_classification_head'](
-            features,
-            padding_mask=padding_mask,
+        logits, _ = model(
+            **sample['net_input'],
+            features_only=True,
+            classification_head_name='sentence_classification_head',
         )
         targets = model.get_targets(sample, [logits]).view(-1)
         sample_size = targets.numel()
...
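
Reading the two criterion hunks together: the classification head is now applied inside the model's own `forward()` (selected via `classification_head_name`) rather than being called separately on features extracted by the criterion, and the explicit `padding_mask` argument to the head disappears. Presumably this keeps the head's parameters inside the forward pass that `DistributedDataParallel` wraps, which is what multi-GPU training with `--find-unused-parameters` needs in order to account for every parameter's gradient.
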
@@ -89,6 +89,16 @@ class RobertaModel(FairseqLanguageModel):
         encoder = RobertaEncoder(args, task.source_dictionary)
         return cls(args, encoder)

+    def forward(self, src_tokens, features_only=False, return_all_hiddens=False, classification_head_name=None, **kwargs):
+        assert classification_head_name is None or features_only, \
+            "If passing classification_head_name argument, features_only must be set to True"
+
+        x, extra = self.decoder(src_tokens, features_only, return_all_hiddens, **kwargs)
+
+        if classification_head_name is not None:
+            x = self.classification_heads[classification_head_name](x)
+        return x, extra
+
     def register_classification_head(self, name, num_classes=None, inner_dim=None, **kwargs):
         """Register a classification head."""
         self.classification_heads[name] = RobertaClassificationHead(
...
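
To make the new calling convention concrete, the toy below mirrors the shape of the added `forward()`: the model owns its `classification_heads` and applies the requested one itself when `features_only=True`, so callers such as `SentencePredictionCriterion` never touch the head directly. Everything here (module, dimensions, the mean-pooled stand-in encoder) is illustrative, not the fairseq implementation:

```python
import torch
import torch.nn as nn

class ToySentenceModel(nn.Module):
    """Toy mirror of the pattern added to RobertaModel.forward (illustrative only)."""

    def __init__(self, vocab=100, dim=16, num_classes=2):
        super().__init__()
        self.embed = nn.Embedding(vocab, dim)
        self.classification_heads = nn.ModuleDict({
            'sentence_classification_head': nn.Linear(dim, num_classes),
        })

    def forward(self, src_tokens, features_only=False, classification_head_name=None):
        assert classification_head_name is None or features_only, \
            "a classification head only makes sense on top of extracted features"
        x = self.embed(src_tokens).mean(dim=1)   # stand-in for the real encoder
        extra = {}
        if classification_head_name is not None:
            # The head is applied inside forward(), so DDP sees its parameters
            # as part of the wrapped computation.
            x = self.classification_heads[classification_head_name](x)
        return x, extra

model = ToySentenceModel()
tokens = torch.randint(0, 100, (4, 7))
logits, _ = model(tokens, features_only=True,
                  classification_head_name='sentence_classification_head')
print(logits.shape)  # torch.Size([4, 2])
```
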