Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
4a210c9f
Commit
4a210c9f
authored
May 31, 2019
by
VictorSanh
Browse files
Move bert_hubconf to hubconfs
parent
0c5a4fe9
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
248 additions
and
0 deletions
+248
-0
hubconfs/bert_hubconf.py
hubconfs/bert_hubconf.py
+248
-0
No files found.
hubconfs/bert_hubconf.py
0 → 100644
View file @
4a210c9f
from
pytorch_pretrained_bert.tokenization
import
BertTokenizer
from
pytorch_pretrained_bert.modeling
import
(
BertModel
,
BertForNextSentencePrediction
,
BertForMaskedLM
,
BertForMultipleChoice
,
BertForPreTraining
,
BertForQuestionAnswering
,
BertForSequenceClassification
,
BertForTokenClassification
,
)
# Names of the packages these entry points rely on.
# NOTE(review): presumably read by torch.hub when loading this hubconf (the
# docstring examples below all go through torch.hub.load) -- confirm.
dependencies = ["torch", "tqdm", "boto3", "requests", "regex"]
# Parameter documentation shared by the model entry points in this file.
# It is kept in one place and appended to each entry point's docstring by
# the `_append_from_pretrained_docstring` decorator, so the text does not
# have to be repeated for every model.
bert_docstring = """
Params:
pretrained_model_name_or_path: either:
- a str with the name of a pre-trained model to load
. `bert-base-uncased`
. `bert-large-uncased`
. `bert-base-cased`
. `bert-large-cased`
. `bert-base-multilingual-uncased`
. `bert-base-multilingual-cased`
. `bert-base-chinese`
- a path or url to a pretrained model archive containing:
. `bert_config.json` a configuration file for the model
. `pytorch_model.bin` a PyTorch dump of a BertForPreTraining
instance
- a path or url to a pretrained model archive containing:
. `bert_config.json` a configuration file for the model
. `model.chkpt` a TensorFlow checkpoint
from_tf: should we load the weights from a locally saved TensorFlow
checkpoint
cache_dir: an optional path to a folder in which the pre-trained models
will be cached.
state_dict: an optional state dictionnary
(collections.OrderedDict object) to use instead of Google
pre-trained models
*inputs, **kwargs: additional input for the specific Bert class
(ex: num_labels for BertForSequenceClassification)
"""
def
_append_from_pretrained_docstring
(
docstr
):
def
docstring_decorator
(
fn
):
fn
.
__doc__
=
fn
.
__doc__
+
docstr
return
fn
return
docstring_decorator
def bertTokenizer(*args, **kwargs):
    """
    Build a BertTokenizer from a pre-trained or custom vocabulary file.

    All positional and keyword arguments are forwarded unchanged to
    `BertTokenizer.from_pretrained`.

    Args:
    pretrained_model_name_or_path: Path to a pretrained model archive,
                                   or one of the pre-trained vocab configs below.
                                       * bert-base-uncased
                                       * bert-large-uncased
                                       * bert-base-cased
                                       * bert-large-cased
                                       * bert-base-multilingual-uncased
                                       * bert-base-multilingual-cased
                                       * bert-base-chinese
    Keyword args:
    cache_dir: optional path to a directory in which to download and cache
               the pre-trained model weights.
               Default: None
    do_lower_case: Whether to lower case the input.
                   Only has an effect when do_wordpiece_only=False
                   Default: True
    do_basic_tokenize: Whether to do basic tokenization before wordpiece.
                       Default: True
    max_len: An artificial maximum length to truncate tokenized sequences to;
             the effective maximum length is always the minimum of this
             value (if specified) and the underlying BERT model's
             sequence length.
             Default: None
    never_split: List of tokens which will never be split during tokenization.
                 Only has an effect when do_wordpiece_only=False
                 Default: ["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"]

    Example:
        >>> sentence = 'Hello, World!'
        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False, force_reload=False)
        >>> toks = tokenizer.tokenize(sentence)
        >>> toks
        ['Hello', '##,', 'World', '##!']
        >>> ids = tokenizer.convert_tokens_to_ids(toks)
        >>> ids
        [8667, 28136, 1291, 28125]
    """
    return BertTokenizer.from_pretrained(*args, **kwargs)
@_append_from_pretrained_docstring(bert_docstring)
def bertModel(*args, **kwargs):
    """
    Load a BertModel: the basic BERT Transformer, made of a layer of summed
    token, position and sequence embeddings followed by a series of identical
    self-attention blocks (12 for BERT-base, 24 for BERT-large).

    All arguments are forwarded unchanged to `BertModel.from_pretrained`.

    Example:
        # Load the tokenizer
        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False, force_reload=False)
        # Prepare tokenized input
        >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        >>> tokenized_text = tokenizer.tokenize(text)
        >>> tokenized_text
        ['[CLS]', 'Who', 'was', 'Jim', 'He', '##nson', '?', '[SEP]', 'Jim', 'He', '##nson', 'was', 'a', 'puppet', '##eer', '[SEP]']
        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        >>> tokens_tensor = torch.tensor([indexed_tokens])
        >>> tokens_tensor
        tensor([[101, 2627, 1108, 3104, 1124, 15703, 136, 102, 3104, 1124, 15703, 1108, 170, 16797, 8284, 102]])
        >>> segments_tensors = torch.tensor([segments_ids])
        >>> segments_tensors
        tensor([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]])
        # Load bertModel
        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertModel', 'bert-base-cased', force_reload=False)
        >>> model.eval()
        # Predict hidden states features for each layer
        >>> with torch.no_grad():
        ...     encoded_layers, _ = model(tokens_tensor, segments_tensors)
    """
    return BertModel.from_pretrained(*args, **kwargs)
@_append_from_pretrained_docstring(bert_docstring)
def bertForNextSentencePrediction(*args, **kwargs):
    """
    Load a BERT model with a next sentence prediction head.

    The module comprises the BERT model followed by the next sentence
    classification head. All arguments are forwarded unchanged to
    `BertForNextSentencePrediction.from_pretrained`.
    """
    return BertForNextSentencePrediction.from_pretrained(*args, **kwargs)
@_append_from_pretrained_docstring(bert_docstring)
def bertForPreTraining(*args, **kwargs):
    """
    Load a BERT model with the pre-training heads.

    The module comprises the BERT model followed by the two pre-training heads:
        - the masked language modeling head, and
        - the next sentence classification head.

    All arguments are forwarded unchanged to
    `BertForPreTraining.from_pretrained`.
    """
    return BertForPreTraining.from_pretrained(*args, **kwargs)
@_append_from_pretrained_docstring(bert_docstring)
def bertForMaskedLM(*args, **kwargs):
    """
    Load a BertForMaskedLM: the BertModel Transformer followed by the
    (possibly) pre-trained masked language modeling head.

    All arguments are forwarded unchanged to `BertForMaskedLM.from_pretrained`.

    Example:
        # Load the tokenizer
        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False, force_reload=False)
        # Prepare tokenized input and mask one token
        >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        >>> tokenized_text = tokenizer.tokenize(text)
        >>> masked_index = 8
        >>> tokenized_text[masked_index] = '[MASK]'
        >>> tokenized_text
        ['[CLS]', 'Who', 'was', 'Jim', 'He', '##nson', '?', '[SEP]', '[MASK]', 'He', '##nson', 'was', 'a', 'puppet', '##eer', '[SEP]']
        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        >>> tokens_tensor = torch.tensor([indexed_tokens])
        >>> segments_tensors = torch.tensor([segments_ids])
        # Load bertForMaskedLM
        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForMaskedLM', 'bert-base-cased', force_reload=False)
        >>> model.eval()
        # Predict all tokens
        >>> with torch.no_grad():
        ...     predictions = model(tokens_tensor, segments_tensors)
        >>> predicted_index = torch.argmax(predictions[0, masked_index]).item()
        >>> predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
        # NOTE(review): the model is expected to recover the masked token;
        # this output was not re-run -- confirm before relying on it.
        >>> predicted_token
        'Jim'
    """
    return BertForMaskedLM.from_pretrained(*args, **kwargs)
@_append_from_pretrained_docstring(bert_docstring)
def bertForSequenceClassification(*args, **kwargs):
    """
    Load a BertForSequenceClassification: a fine-tuning model made of
    BertModel with a sequence-level (sequence or pair of sequences)
    classifier on top.

    The sequence-level classifier is a linear layer that takes as input the
    last hidden state of the first token in the input sequence
    (see Figures 3a and 3b in the BERT paper).

    All arguments are forwarded unchanged to
    `BertForSequenceClassification.from_pretrained`.

    Args:
    num_labels: the number (>=2) of classes for the classifier.

    Example:
        >>> torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForSequenceClassification', 'bert-base-cased', num_labels=2, force_reload=True)
    """
    return BertForSequenceClassification.from_pretrained(*args, **kwargs)
@_append_from_pretrained_docstring(bert_docstring)
def bertForMultipleChoice(*args, **kwargs):
    """
    Load a BertForMultipleChoice: a fine-tuning model made of BertModel with
    a linear layer on top.

    All arguments are forwarded unchanged to
    `BertForMultipleChoice.from_pretrained`.

    Args:
    num_choices: the number (>=2) of classes for the classifier.

    Example:
        >>> torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForMultipleChoice', 'bert-base-cased', num_choices=2, force_reload=True)
    """
    return BertForMultipleChoice.from_pretrained(*args, **kwargs)
@_append_from_pretrained_docstring(bert_docstring)
def bertForQuestionAnswering(*args, **kwargs):
    """
    Load a BertForQuestionAnswering: a fine-tuning model made of BertModel
    with token-level classifiers on top of the full sequence of last hidden
    states.

    All arguments are forwarded unchanged to
    `BertForQuestionAnswering.from_pretrained`.
    """
    return BertForQuestionAnswering.from_pretrained(*args, **kwargs)
@_append_from_pretrained_docstring(bert_docstring)
def bertForTokenClassification(*args, **kwargs):
    """
    Load a BertForTokenClassification: a fine-tuning model made of BertModel
    with a token-level classifier on top.

    The token-level classifier is a linear layer that takes as input the last
    hidden state of the sequence.

    All arguments are forwarded unchanged to
    `BertForTokenClassification.from_pretrained`.

    Args:
    num_labels: the number (>=2) of classes for the classifier.

    Example:
        >>> torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForTokenClassification', 'bert-base-cased', num_labels=2, force_reload=True)
    """
    return BertForTokenClassification.from_pretrained(*args, **kwargs)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment