Commit 12e013db authored by thomwolf

added wordpiece - updated readme

parent ccce66be

# pytorch-pretrained-BERT
A PyTorch version of Google's pretrained BERT model.

No bells and whistles, just:
- [one class](bert_model.py) with a clean commented version of Google's BERT model that can load the weights pre-trained by Google's authors,
- [another class](data_processor.py) with all you need to pre- and post-process text data for the model (tokenize and encode),
- and [a script](download_weigths.sh) to download Google's pre-trained weights.

Here is how to use these:
```python
from .bert_model import BERT
from .data_processor import DataProcessor
bert_model = BERT(bert_model_path='.')
data_processor = DataProcessor(bert_vocab_path='.')
input_sentence = "We are playing with the BERT model."
tensor_input = data_processor.encode(input_sentence)    # tokenize (WordPiece) and convert to a tensor of indices
tensor_output = bert_model(tensor_input)                 # run the model on the encoded input
output_sentence = data_processor.decode(tensor_output)   # map the output back to text
```
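
The commit message mentions WordPiece, the sub-word tokenization BERT uses and that the data processor applies during `encode`. Below is a minimal sketch of the greedy longest-match-first matching at the heart of WordPiece; the `wordpiece_tokenize` helper and the toy vocabulary are illustrative assumptions, not this repository's API.

```python
# Illustrative sketch of greedy longest-match-first WordPiece matching
# (not the repository's implementation).
def wordpiece_tokenize(word, vocab, unk_token="[UNK]"):
    """Split a single word into WordPiece sub-tokens."""
    pieces = []
    start = 0
    while start < len(word):
        end = len(word)
        current_piece = None
        # shrink the window from the right until a vocabulary entry matches
        while start < end:
            candidate = word[start:end]
            if start > 0:
                candidate = "##" + candidate  # continuation pieces carry the '##' prefix
            if candidate in vocab:
                current_piece = candidate
                break
            end -= 1
        if current_piece is None:
            return [unk_token]  # no sub-token matched: give up on the whole word
        pieces.append(current_piece)
        start = end
    return pieces

vocab = {"play", "##ing", "with", "the", "bert", "model", "."}
print(wordpiece_tokenize("playing", vocab))  # ['play', '##ing']
```

Greedy longest-match keeps frequent words as single tokens while rarer words decompose into `##`-prefixed pieces, so the vocabulary can stay relatively small.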
@@ -13,7 +13,6 @@ from typing import NamedTuple, List
import copy
import io
import json
import logging
import math
import pathlib
import re
@@ -271,12 +270,9 @@ class BERT(torch.nn.Module):
super().__init__()
config = BERTConfig(
embedding_dim,
num_heads,
embedding_dropout_probability,
attention_dropout_probability,
residual_dropout_probability,
activation_function,
embedding_dim,
num_heads,
dropout_probability,
)
# the embedding size is vocab_size + n_special embeddings + n_ctx
@@ -288,7 +284,7 @@ class BERT(torch.nn.Module):
self.num_output_layers = 1 + num_layers
self.embed = torch.nn.Embedding(embedding_size, embedding_dim)
self.drop = torch.nn.Dropout(embedding_dropout_probability)
self.drop = torch.nn.Dropout(dropout_probability)
block = Block(n_ctx, config, scale=True)
self.h = torch.nn.ModuleList([copy.deepcopy(block) for _ in range(num_layers)])
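
The call sites in the two hunks above suggest that this change collapses the separate embedding/attention/residual dropout settings into a single value. A NamedTuple along these lines would fit that call site; this is an assumption about the shape of `BERTConfig`, not the file's actual definition.

```python
from typing import NamedTuple

class BERTConfig(NamedTuple):
    embedding_dim: int
    num_heads: int
    # a single dropout probability replacing the former
    # embedding/attention/residual dropout values
    dropout_probability: float
```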
@@ -332,16 +328,13 @@ class BERT(torch.nn.Module):
names: List[str] = _PARAMETER_NAMES) -> None:
# pylint: disable=dangerous-default-value
logger.info(f"loading weights from {bert_model_path}")
# if `file_path` is a URL, redirect to the cache
with tarfile.open(bert_model_path) as tmp:
num_params_files = len([member for member in tmp.getmembers() if member.name.endswith('.npy')])
shapesfile = tmp.extractfile('model/params_shapes.json')
if shapesfile:
shapes = json.loads(shapesfile.read())
else:
raise ConfigurationError("unable to find model/params_shapes.json in the archive")
raise Exception("unable to find model/params_shapes.json in the archive")
# numpy can't read from a tarfile directly, so we need a workaround
# https://github.com/numpy/numpy/issues/7989#issuecomment-341656702
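
The comments above point at a real constraint: numpy cannot read an array straight out of a tarfile member, so the bytes have to be buffered in memory first. Here is a minimal sketch of that workaround; the archive and member names are assumptions for illustration, not necessarily what the repository ships.

```python
import io
import json
import tarfile

import numpy as np

# Hypothetical archive and member names, mirroring those used in load_weights above.
with tarfile.open("bert_model.tar.gz") as archive:
    shapes = json.loads(archive.extractfile("model/params_shapes.json").read())
    member = archive.extractfile("model/params_0.npy")  # assumed member name
    # numpy can't read from the tarfile member directly (numpy issue #7989),
    # so buffer the raw bytes and let np.load work on the in-memory file object.
    buffer = io.BytesIO(member.read())
    params = np.load(buffer)
    print(params.shape, shapes[:1])
```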
"""
Show how to use HuggingFace's PyTorch implementation of Google's BERT Model.
"""
from .bert_model import BERT
from .prepare_inputs import DataPreprocessor
bert_model = BERT()
bert_model.load_from('.')
data_processor = DataPreprocessor(encoder_file_path='.')
input_sentence = "We are playing with the BERT model."
print("BERT inputs: {}".format(input_sentence))
tensor_input = data_processor.encode(input_sentence)
tensor_output = bert_model(tensor_input)
output_sentence = data_processor.decode(tensor_output)
print("BERT predicted: {}".format(output_sentence))