Commit 12e013db authored by thomwolf

added wordpiece - updated readme

parent ccce66be

# pytorch-pretrained-BERT
A PyTorch version of Google's pretrained BERT model.

No bells and whistles, just:
- [one class](bert_model.py) with a clean commented version of Google's BERT model that can load the weights pre-trained by Google's authors,
- [another class](data_processor.py) with all you need to pre- and post-process text data for the model (tokenize and encode),
- and [a script](download_weigths.sh) to download Google's pre-trained weights.

Here is how to use these:
```python
from .bert_model import BERT
from .data_processor import DataProcessor
bert_model = BERT(bert_model_path='.')
data_processor = DataProcessor(bert_vocab_path='.')
input_sentence = "We are playing with the BERT model."
tensor_input = data_processor.encode(input_sentence)    # tokenize (WordPiece) and convert to a tensor of indices
tensor_output = bert_model(tensor_input)                 # run the model on the encoded input
output_sentence = data_processor.decode(tensor_output)   # map the output back to text
```
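
The commit message mentions WordPiece, the sub-word tokenization BERT uses and that the data processor applies during `encode`. Below is a minimal sketch of the greedy longest-match-first matching at the heart of WordPiece; the `wordpiece_tokenize` helper and the toy vocabulary are illustrative assumptions, not this repository's API.

```python
# Illustrative sketch of greedy longest-match-first WordPiece matching
# (not the repository's implementation).
def wordpiece_tokenize(word, vocab, unk_token="[UNK]"):
    """Split a single word into WordPiece sub-tokens."""
    pieces = []
    start = 0
    while start < len(word):
        end = len(word)
        current_piece = None
        # shrink the window from the right until a vocabulary entry matches
        while start < end:
            candidate = word[start:end]
            if start > 0:
                candidate = "##" + candidate  # continuation pieces carry the '##' prefix
            if candidate in vocab:
                current_piece = candidate
                break
            end -= 1
        if current_piece is None:
            return [unk_token]  # no sub-token matched: give up on the whole word
        pieces.append(current_piece)
        start = end
    return pieces

vocab = {"play", "##ing", "with", "the", "bert", "model", "."}
print(wordpiece_tokenize("playing", vocab))  # ['play', '##ing']
```

Greedy longest-match keeps frequent words as single tokens while rarer words decompose into `##`-prefixed pieces, so the vocabulary can stay relatively small.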
@@ -13,7 +13,6 @@ from typing import NamedTuple, List
import copy
import io
import json
import logging
import math
import pathlib
import re
@@ -271,12 +270,9 @@ class BERT(torch.nn.Module):
super().__init__()
config = BERTConfig(
embedding_dim,
num_heads,
embedding_dropout_probability,
attention_dropout_probability,
residual_dropout_probability,
activation_function,
embedding_dim,
num_heads,
dropout_probability,
)
# the embedding size is vocab_size + n_special embeddings + n_ctx
@@ -288,7 +284,7 @@ class BERT(torch.nn.Module):
self.num_output_layers = 1 + num_layers
self.embed = torch.nn.Embedding(embedding_size, embedding_dim)
self.drop = torch.nn.Dropout(embedding_dropout_probability)
self.drop = torch.nn.Dropout(dropout_probability)
block = Block(n_ctx, config, scale=True)
self.h = torch.nn.ModuleList([copy.deepcopy(block) for _ in range(num_layers)])
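
The call sites in the two hunks above suggest that this change collapses the separate embedding/attention/residual dropout settings into a single value. A NamedTuple along these lines would fit that call site; this is an assumption about the shape of `BERTConfig`, not the file's actual definition.

```python
from typing import NamedTuple

class BERTConfig(NamedTuple):
    embedding_dim: int
    num_heads: int
    # a single dropout probability replacing the former
    # embedding/attention/residual dropout values
    dropout_probability: float
```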
@@ -332,16 +328,13 @@ class BERT(torch.nn.Module):
names: List[str] = _PARAMETER_NAMES) -> None:
# pylint: disable=dangerous-default-value
logger.info(f"loading weights from {bert_model_path}")
# if `file_path` is a URL, redirect to the cache
with tarfile.open(bert_model_path) as tmp:
num_params_files = len([member for member in tmp.getmembers() if member.name.endswith('.npy')])
shapesfile = tmp.extractfile('model/params_shapes.json')
if shapesfile:
shapes = json.loads(shapesfile.read())
else:
raise ConfigurationError("unable to find model/params_shapes.json in the archive")
raise Exception("unable to find model/params_shapes.json in the archive")
# numpy can't read from a tarfile directly, so we need a workaround
# https://github.com/numpy/numpy/issues/7989#issuecomment-341656702
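
The comments above point at a real constraint: numpy cannot read an array straight out of a tarfile member, so the bytes have to be buffered in memory first. Here is a minimal sketch of that workaround; the archive and member names are assumptions for illustration, not necessarily what the repository ships.

```python
import io
import json
import tarfile

import numpy as np

# Hypothetical archive and member names, mirroring those used in load_weights above.
with tarfile.open("bert_model.tar.gz") as archive:
    shapes = json.loads(archive.extractfile("model/params_shapes.json").read())
    member = archive.extractfile("model/params_0.npy")  # assumed member name
    # numpy can't read from the tarfile member directly (numpy issue #7989),
    # so buffer the raw bytes and let np.load work on the in-memory file object.
    buffer = io.BytesIO(member.read())
    params = np.load(buffer)
    print(params.shape, shapes[:1])
```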
"""
Show how to use HuggingFace's PyTorch implementation of Google's BERT Model.
"""
from .bert_model import BERT
from .prepare_inputs import DataPreprocessor
bert_model = BERT()
bert_model.load_from('.')
data_processor = DataPreprocessor(encoder_file_path='.')
input_sentence = "We are playing with the BERT model."
print("BERT inputs: {}".format(input_sentence))
tensor_input = data_processor.encode(input_sentence)
tensor_output = bert_model(tensor_input)
output_sentence = data_processor.decode(tensor_output)
print("BERT predicted: {}".format(output_sentence))