Add doc string for Roberta.encode function

Summary:
Pull Request resolved: https://github.com/pytorch/fairseq/pull/969
Differential Revision: D16642388
Pulled By: myleott
fbshipit-source-id: c5b1655dbddb697822feefa433f33f6bb08253ab

Add doc string for Roberta.encode function
Summary:
Pull Request resolved: https://github.com/pytorch/fairseq/pull/969
Differential Revision: D16642388
Pulled By: myleott
fbshipit-source-id: c5b1655dbddb697822feefa433f33f6bb08253ab
1684e166 · Myle Ott · Facebook Github Bot · c728b864 · 1684e166
Commit 1684e166, authored Aug 04, 2019 by Myle Ott; committed by Facebook Github Bot on Aug 04, 2019
Show whitespace changes
Inline Side-by-side

Showing with 20 additions and 0 deletions

fairseq/models/roberta/hub_interface.py fairseq/models/roberta/hub_interface.py +20 -0

No files found.
--- a/fairseq/models/roberta/hub_interface.py
+++ b/fairseq/models/roberta/hub_interface.py
@@ -33,6 +33,26 @@ class RobertaHubInterface(nn.Module):
        return self._float_tensor.device

    def encode(self, sentence: str, *addl_sentences) -> torch.LongTensor:
+        """
+        BPE-encode a sentence (or multiple sentences).
+
+        Every sequence begins with a beginning-of-sentence (`<s>`) symbol.
+        Every sentence ends with an end-of-sentence (`</s>`) and we use an
+        extra end-of-sentence (`</s>`) as a separator.
+
+        Example (single sentence): `<s> a b c </s>`
+        Example (sentence pair): `<s> d e f </s> </s> 1 2 3 </s>`
+
+        The BPE encoding follows GPT-2. One subtle detail is that the GPT-2 BPE
+        requires leading spaces. For example::
+
+            >>> roberta.encode('Hello world').tolist()
+            [0, 31414, 232, 2]
+            >>> roberta.encode(' world').tolist()
+            [0, 232, 2]
+            >>> roberta.encode('world').tolist()
+            [0, 8331, 2]
+        """
        bpe_sentence = '<s> ' + self.bpe.encode(sentence) + ' </s>'
        for s in addl_sentences:
            bpe_sentence += ' </s> ' + self.bpe.encode(s) + ' </s>'