Commit 5d543f9b authored by Naman Goyal's avatar Naman Goyal Committed by Facebook Github Bot

fixed roberta finetuning with --find-unused-parameters on multiGPU

Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/806

Differential Revision: D16649933

fbshipit-source-id: 6eeda6e2caf8019228e3efc0c27ddfcc3c4d8674
parent 1684e166
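Context for the fix: when RoBERTa is finetuned for sentence prediction, the pretrained LM head receives no gradients, and the classification head used to be invoked outside the model's forward(), so DistributedDataParallel could not account for every parameter on multi-GPU runs. fairseq's `--find-unused-parameters` flag is passed through to torch DDP. The sketch below is plain PyTorch, not fairseq code; the module and head names are made up for illustration and only show the mechanism the flag enables.

```python
# Minimal single-process sketch (plain PyTorch, not fairseq code) of why DDP
# needs find_unused_parameters=True when a branch of the model is skipped.
import os
import torch
import torch.nn as nn
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP


class TwoHeadModel(nn.Module):
    """Toy stand-in for RoBERTa: an encoder, an LM head, a classification head."""

    def __init__(self):
        super().__init__()
        self.encoder = nn.Linear(8, 8)
        self.lm_head = nn.Linear(8, 100)            # unused while finetuning a classifier
        self.classification_head = nn.Linear(8, 2)

    def forward(self, x, classification_head_name=None):
        features = self.encoder(x)
        if classification_head_name is not None:
            return self.classification_head(features)  # lm_head gets no gradient
        return self.lm_head(features)


if __name__ == '__main__':
    # Single-process gloo group so the sketch runs on CPU without multiple GPUs.
    os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
    os.environ.setdefault('MASTER_PORT', '29500')
    dist.init_process_group('gloo', rank=0, world_size=1)

    # Without find_unused_parameters=True, DDP expects a gradient for every
    # parameter (including lm_head) and fails when only the classification
    # path runs; with it, unused parameters are detected and skipped.
    model = DDP(TwoHeadModel(), find_unused_parameters=True)
    loss = model(torch.randn(4, 8), classification_head_name='clf').sum()
    loss.backward()
    dist.destroy_process_group()
```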
@@ -115,6 +115,7 @@ CUDA_VISIBLE_DEVICES=0 python train.py IMDB-bin/ \
     --max-epoch 10 \
     --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \
     --truncate-sequence \
+    --find-unused-parameters \
     --update-freq 4
 ```
 Above will train with effective batch-size of `32`, tested on one `Nvidia V100 32gb`.
......
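A note on the "effective batch-size of `32`" claim above: with `--update-freq 4` the gradients of four small batches are accumulated before each parameter update, so the effective batch is per-GPU sentences × update-freq × number of GPUs. A back-of-the-envelope check, assuming the truncated part of the command sets 8 sentences per batch (that value is not visible above):

```python
# All numbers are assumptions for illustration; --max-sentences is elided above.
max_sentences = 8      # sentences per GPU per forward pass (assumed)
update_freq = 4        # --update-freq 4 (gradient accumulation steps)
num_gpus = 1           # CUDA_VISIBLE_DEVICES=0
print(max_sentences * update_freq * num_gpus)  # 32, the effective batch size
```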
@@ -42,6 +42,7 @@ CUDA_VISIBLE_DEVICES=0 python train.py RTE-bin/ \
     --lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \
     --fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \
     --max-epoch 10 \
+    --find-unused-parameters \
     --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric;
 ```
......
@@ -30,7 +30,7 @@ class MaskedLmLoss(FairseqCriterion):
         3) logging outputs to display while training
         """
         # compute MLM loss
-        logits = model(**sample['net_input'], last_state_only=True)[0]
+        logits = model(**sample['net_input'], return_all_hiddens=False)[0]
         targets = model.get_targets(sample, [logits])
         loss = F.nll_loss(
             F.log_softmax(
......
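The masked-LM criterion above is cut off mid-expression by the diff view. Roughly, the pattern it implements is a summed negative log-likelihood over the vocabulary; the standalone sketch below uses made-up shapes and an assumed padding index, not the exact fairseq code.

```python
# Sketch of the masked-LM loss pattern; shapes and ignore_index are assumptions.
import torch
import torch.nn.functional as F

batch, seq_len, vocab = 2, 6, 50
logits = torch.randn(batch, seq_len, vocab)        # stands in for model(**net_input, ...)[0]
targets = torch.randint(vocab, (batch, seq_len))   # token ids at masked positions

loss = F.nll_loss(
    F.log_softmax(logits.view(-1, vocab), dim=-1, dtype=torch.float32),
    targets.view(-1),
    reduction='sum',
    ignore_index=1,  # padding index; the real value comes from the dictionary
)
print(loss.item())
```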
@@ -31,18 +31,15 @@ class SentencePredictionCriterion(FairseqCriterion):
         2) the sample size, which is used as the denominator for the gradient
         3) logging outputs to display while training
         """
-        features, extra = model(**sample['net_input'], features_only=True)
-        padding_mask = sample['net_input']['src_tokens'].eq(self.padding_idx)
         assert hasattr(model, 'classification_heads') and \
             'sentence_classification_head' in model.classification_heads, \
             "model must provide sentence classification head for --criterion=sentence_prediction"
-        logits = model.classification_heads['sentence_classification_head'](
-            features,
-            padding_mask=padding_mask,
+        logits, _ = model(
+            **sample['net_input'],
+            features_only=True,
+            classification_head_name='sentence_classification_head',
         )
         targets = model.get_targets(sample, [logits]).view(-1)
         sample_size = targets.numel()
......
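The sentence-prediction criterion is also truncated after `sample_size`; for a single-label classification head it typically finishes with a cross-entropy over the logits returned by the model call above. A hedged, self-contained sketch with dummy tensors, not the exact fairseq code:

```python
# Dummy tensors stand in for the logits/targets produced above.
import torch
import torch.nn.functional as F

logits = torch.randn(4, 2)         # (batch, num_classes) from the classification head
targets = torch.randint(2, (4,))   # gold labels, already flattened with .view(-1)

loss = F.nll_loss(
    F.log_softmax(logits, dim=-1, dtype=torch.float32),
    targets,
    reduction='sum',
)
print(loss.item())
```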
@@ -89,6 +89,16 @@ class RobertaModel(FairseqLanguageModel):
         encoder = RobertaEncoder(args, task.source_dictionary)
         return cls(args, encoder)
 
+    def forward(self, src_tokens, features_only=False, return_all_hiddens=False, classification_head_name=None, **kwargs):
+        assert classification_head_name is None or features_only, \
+            "If passing classification_head_name argument, features_only must be set to True"
+
+        x, extra = self.decoder(src_tokens, features_only, return_all_hiddens, **kwargs)
+
+        if classification_head_name is not None:
+            x = self.classification_heads[classification_head_name](x)
+        return x, extra
+
     def register_classification_head(self, name, num_classes=None, inner_dim=None, **kwargs):
         """Register a classification head."""
         self.classification_heads[name] = RobertaClassificationHead(
......
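With the new `classification_head_name` argument, a registered head runs inside `RobertaModel.forward()`, which is what lets multi-GPU training see its parameters. A usage sketch follows; the checkpoint path and head name (`'my_head'`) are placeholders, and it assumes the standard `from_pretrained` / `register_classification_head` interface.

```python
# Placeholder path and head name; requires a downloaded RoBERTa checkpoint.
from fairseq.models.roberta import RobertaModel

roberta = RobertaModel.from_pretrained('/path/to/roberta.base', checkpoint_file='model.pt')
roberta.register_classification_head('my_head', num_classes=2)

tokens = roberta.encode('Hello world!').unsqueeze(0)  # (1, seq_len) of token ids

# Features only, skipping the LM head:
features, extra = roberta.model(tokens, features_only=True)

# New in this commit: run the registered head inside forward();
# features_only must be True when classification_head_name is passed.
logits, extra = roberta.model(
    tokens,
    features_only=True,
    classification_head_name='my_head',
)
```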