Commit 12e013db authored by thomwolf

added wordpiece - updated readme

parent ccce66be
# pytorch-pretrained-BERT

A PyTorch version of Google's pretrained BERT model as described in [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805).
No bells and whistles, just:

- [one class](bert_model.py) with a clean, commented version of Google's BERT model that can load the weights pre-trained by Google's authors,
- [another class](data_processor.py) with all you need to pre- and post-process text data for the model (tokenize and encode),
- and [a script](download_weigths.sh) to download Google's pre-trained weights.

Here is how to use these:
```python
from .bert_model import BERT
from .data_processor import DataProcessor

bert_model = BERT(bert_model_path='.')                  # load the pre-trained weights
data_processor = DataProcessor(bert_vocab_path='.')     # load the vocabulary

input_sentence = "We are playing with the BERT model."
tensor_input = data_processor.encode(input_sentence)    # tokenize and encode to tensors
tensor_output = bert_model(tensor_input)                # run the model on the encoded input
output_sentence = data_processor.decode(tensor_output)  # decode the prediction back to text
```
@@ -13,7 +13,6 @@ from typing import NamedTuple, List
 import copy
 import io
 import json
-import logging
 import math
 import pathlib
 import re
@@ -271,12 +270,9 @@ class BERT(torch.nn.Module):
         super().__init__()
         config = BERTConfig(
             embedding_dim,
             num_heads,
-            embedding_dropout_probability,
-            attention_dropout_probability,
-            residual_dropout_probability,
-            activation_function,
+            dropout_probability,
         )
         # the embedding size is vocab_size + n_special embeddings + n_ctx
@@ -288,7 +284,7 @@ class BERT(torch.nn.Module):
         self.num_output_layers = 1 + num_layers
         self.embed = torch.nn.Embedding(embedding_size, embedding_dim)
-        self.drop = torch.nn.Dropout(embedding_dropout_probability)
+        self.drop = torch.nn.Dropout(dropout_probability)
         block = Block(n_ctx, config, scale=True)
         self.h = torch.nn.ModuleList([copy.deepcopy(block) for _ in range(num_layers)])
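The comment above says the embedding size is `vocab_size + n_special embeddings + n_ctx`, which suggests that token, special-token, and position ids are all looked up in the single `self.embed` table. A minimal sketch of that layout, using illustrative sizes rather than the values from this repository:

```python
import torch

# illustrative sizes only; the real values come from the model configuration
vocab_size, n_special, n_ctx, embedding_dim = 30000, 3, 512, 768

# one table holds token, special-token, and position embeddings back to back
embedding_size = vocab_size + n_special + n_ctx
embed = torch.nn.Embedding(embedding_size, embedding_dim)

token_ids = torch.randint(0, vocab_size, (1, n_ctx))        # a batch of encoded tokens
position_ids = torch.arange(vocab_size + n_special,          # positions occupy the
                            embedding_size).unsqueeze(0)     # last n_ctx rows of the table
hidden = embed(token_ids) + embed(position_ids)              # summed token + position vectors
```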
@@ -332,16 +328,13 @@ class BERT(torch.nn.Module):
                      names: List[str] = _PARAMETER_NAMES) -> None:
         # pylint: disable=dangerous-default-value
-        logger.info(f"loading weights from {bert_model_path}")
-        # if `file_path` is a URL, redirect to the cache
         with tarfile.open(bert_model_path) as tmp:
             num_params_files = len([member for member in tmp.getmembers() if member.name.endswith('.npy')])
             shapesfile = tmp.extractfile('model/params_shapes.json')
             if shapesfile:
                 shapes = json.loads(shapesfile.read())
             else:
-                raise ConfigurationError("unable to find model/params_shapes.json in the archive")
+                raise Exception("unable to find model/params_shapes.json in the archive")
             # numpy can't read from a tarfile directly, so we need a workaround
             # https://github.com/numpy/numpy/issues/7989#issuecomment-341656702
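The two comments at the end of this hunk point to a numpy limitation: `np.load` cannot read a tarfile member directly. A minimal sketch of the linked workaround, copying the member into an in-memory buffer first (the archive and member names below are only illustrative):

```python
import io
import tarfile

import numpy as np

def load_npy_from_tar(archive_path: str, member_name: str) -> np.ndarray:
    """Read a single .npy member out of a tar archive via an in-memory buffer."""
    with tarfile.open(archive_path) as tar:
        member = tar.extractfile(member_name)
        if member is None:
            raise Exception("unable to find {} in the archive".format(member_name))
        buffer = io.BytesIO(member.read())  # np.load needs a seekable file-like object
        return np.load(buffer)

# e.g. load_npy_from_tar('bert_model.tar', 'model/params_0.npy')
```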
This diff is collapsed.
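The commit message says a wordpiece tokenizer was added, presumably in the collapsed diff above. Since that code is not visible here, the following is only a toy sketch of greedy longest-match-first WordPiece tokenization with a made-up vocabulary, not the committed implementation:

```python
def wordpiece_tokenize(word, vocab, unk_token="[UNK]"):
    """Greedy longest-match-first split of a single word into sub-word pieces."""
    tokens = []
    start = 0
    while start < len(word):
        end = len(word)
        current_piece = None
        while start < end:                    # shrink the candidate until it is in the vocab
            piece = word[start:end]
            if start > 0:
                piece = "##" + piece          # mark pieces that continue a word
            if piece in vocab:
                current_piece = piece
                break
            end -= 1
        if current_piece is None:             # nothing matched: fall back to the unknown token
            return [unk_token]
        tokens.append(current_piece)
        start = end
    return tokens

toy_vocab = {"play", "##ing", "##ed", "model"}
print(wordpiece_tokenize("playing", toy_vocab))   # ['play', '##ing']
```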
"""
Show how to use HuggingFace's PyTorch implementation of Google's BERT Model.
"""
from .bert_model import BERT
from .prepare_inputs import DataPreprocessor
bert_model = BERT()
bert_model.load_from('.')
data_processor = DataProcessor(encoder_file_path='.')
input_sentence = "We are playing with the BERT model."
print("BERT inputs: {}".format(input_sentence))
tensor_input = data_processor.encode(input_sentence)
tensor_output = bert_model(prepared_input)
output_sentence = data_processor.decode(tensor_output)
print("BERT predicted: {}".format(output_sentence))