Commit a3cfd51d authored by Naman Goyal and committed by Facebook Github Bot

added hf bert bpe

Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/829

Differential Revision: D16856693

fbshipit-source-id: 545bbf4815f5c40e72a6ed241312a51dc90e34a1
parent ed27ed8b
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from fairseq.data.encoders import register_bpe


@register_bpe('bert')
class BertBPE(object):

    @staticmethod
    def add_args(parser):
        # fmt: off
        parser.add_argument('--bpe-cased', action='store_true',
                            help='set for cased BPE',
                            default=False)
        parser.add_argument('--bpe-vocab-file', type=str,
                            help='bpe vocab file.')
        # fmt: on

    def __init__(self, args):
        try:
            from pytorch_transformers import BertTokenizer
            from pytorch_transformers.tokenization_utils import clean_up_tokenization
        except ImportError:
            raise ImportError(
                'Please install 1.0.0 version of pytorch_transformers '
                'with: pip install pytorch-transformers'
            )

        if 'bpe_vocab_file' in args and args.bpe_vocab_file:
            # Build the tokenizer from a local vocab file.
            self.bert_tokenizer = BertTokenizer(
                args.bpe_vocab_file,
                do_lower_case=not args.bpe_cased
            )
        else:
            # Fall back to a pretrained vocab matching the casing flag.
            vocab_file_name = 'bert-base-cased' if args.bpe_cased else 'bert-base-uncased'
            self.bert_tokenizer = BertTokenizer.from_pretrained(vocab_file_name)

        self.clean_up_tokenization = clean_up_tokenization

    def encode(self, x: str) -> str:
        # WordPiece-tokenize and join with spaces so fairseq sees one token per "word".
        return ' '.join(self.bert_tokenizer.tokenize(x))

    def decode(self, x: str) -> str:
        # Merge WordPieces back into text and fix spacing around punctuation.
        return self.clean_up_tokenization(
            self.bert_tokenizer.convert_tokens_to_string(x.split(' '))
        )

    def is_beginning_of_word(self, x: str) -> bool:
        # Continuation pieces carry the '##' prefix.
        return not x.startswith('##')
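Not part of the commit, but as a rough illustration of how the new encoder behaves: a minimal sketch that builds the args Namespace by hand instead of going through fairseq's --bpe flag machinery. The example sentence and the commented outputs are illustrative only, and instantiating with bpe_vocab_file=None assumes pytorch-transformers 1.0.0 can download the 'bert-base-uncased' vocab.

    from argparse import Namespace

    # Hypothetical driver: construct the encoder directly with the two flags
    # that add_args() registers on the fairseq parser.
    args = Namespace(bpe_cased=False, bpe_vocab_file=None)
    bpe = BertBPE(args)

    sentence = 'Hello, fairseq meets WordPiece!'
    encoded = bpe.encode(sentence)          # space-joined WordPieces, e.g. 'hello , fair ##se ##q ...'
    print(encoded)
    print(bpe.decode(encoded))              # roughly round-trips back to the input text
    print(bpe.is_beginning_of_word('##q'))  # False: '##' marks a continuation piece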
@@ -127,14 +127,14 @@ class RobertaModel(FairseqLanguageModel):
         return {'self'}

     @classmethod
-    def from_pretrained(cls, model_name_or_path, checkpoint_file='model.pt', data_name_or_path='.', **kwargs):
+    def from_pretrained(cls, model_name_or_path, checkpoint_file='model.pt', data_name_or_path='.', bpe='gpt2', **kwargs):
         from fairseq import hub_utils
         x = hub_utils.from_pretrained(
             model_name_or_path,
             checkpoint_file,
             data_name_or_path,
             archive_map=cls.hub_models(),
-            bpe='gpt2',
+            bpe=bpe,
             load_checkpoint_heads=True,
             **kwargs,
         )
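This hunk threads a bpe keyword through RobertaModel.from_pretrained, so hub users can pick the encoder at load time instead of always getting the previously hard-coded 'gpt2'. A hedged usage sketch follows; the 'checkpoints' directory is a hypothetical local path, and passing bpe='bert' assumes the checkpoint was trained with matching BERT WordPiece preprocessing.

    from fairseq.models.roberta import RobertaModel

    # Default behaviour is unchanged: GPT-2 BPE.
    roberta = RobertaModel.from_pretrained('checkpoints', checkpoint_file='model.pt')

    # New: override the BPE, e.g. to use the BERT encoder registered in this commit.
    bert_roberta = RobertaModel.from_pretrained(
        'checkpoints',
        checkpoint_file='model.pt',
        bpe='bert',
    )
    tokens = bert_roberta.encode('Hello world!')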