Unverified Commit 12b9cc9e authored by Thomas Wolf's avatar Thomas Wolf Committed by GitHub
Browse files

Merge pull request #1110 from huggingface/automodels

Torch.hub now based on AutoModels - Updating AutoModels with AutoModelWithLMHead, Sequence Classification and Question Answering
parents 282c276e bfe93a5a
dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex'] from pytorch_transformers import (
AutoTokenizer, AutoConfig, AutoModel, AutoModelWithLMHead, AutoModelForSequenceClassification, AutoModelForQuestionAnswering
from hubconfs.bert_hubconf import (
bertTokenizer,
bertModel,
bertForNextSentencePrediction,
bertForPreTraining,
bertForMaskedLM,
bertForSequenceClassification,
bertForMultipleChoice,
bertForQuestionAnswering,
bertForTokenClassification
)
from hubconfs.gpt_hubconf import (
openAIGPTTokenizer,
openAIGPTModel,
openAIGPTLMHeadModel,
openAIGPTDoubleHeadsModel
)
from hubconfs.gpt2_hubconf import (
gpt2Tokenizer,
gpt2Model,
gpt2LMHeadModel,
gpt2DoubleHeadsModel
)
from hubconfs.transformer_xl_hubconf import (
transformerXLTokenizer,
transformerXLModel,
transformerXLLMHeadModel
) )
from pytorch_transformers.modeling_utils import add_start_docstrings
dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex', 'sentencepiece', 'sacremoses']
@add_start_docstrings(AutoConfig.__doc__)
def config(*args, **kwargs):
r"""
# Using torch.hub !
import torch
config = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased') # Download configuration from S3 and cache.
config = torch.hub.load('huggingface/pytorch-transformers', 'config', './test/bert_saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
config = torch.hub.load('huggingface/pytorch-transformers', 'config', './test/bert_saved_model/my_configuration.json')
config = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False)
assert config.output_attention == True
config, unused_kwargs = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False, return_unused_kwargs=True)
assert config.output_attention == True
assert unused_kwargs == {'foo': False}
"""
return AutoConfig.from_pretrained(*args, **kwargs)
@add_start_docstrings(AutoTokenizer.__doc__)
def tokenizer(*args, **kwargs):
r"""
# Using torch.hub !
import torch
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-uncased') # Download vocabulary from S3 and cache.
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', './test/bert_saved_model/') # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`
"""
return AutoTokenizer.from_pretrained(*args, **kwargs)
@add_start_docstrings(AutoModel.__doc__)
def model(*args, **kwargs):
r"""
# Using torch.hub !
import torch
model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased') # Download model and configuration from S3 and cache.
model = torch.hub.load('huggingface/pytorch-transformers', 'model', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')`
model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased', output_attention=True) # Update configuration during loading
assert model.config.output_attention == True
# Loading from a TF checkpoint file instead of a PyTorch model (slower)
config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
model = torch.hub.load('huggingface/pytorch-transformers', 'model', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
"""
return AutoModel.from_pretrained(*args, **kwargs)
@add_start_docstrings(AutoModelWithLMHead.__doc__)
def modelWithLMHead(*args, **kwargs):
r"""
# Using torch.hub !
import torch
model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', 'bert-base-uncased') # Download model and configuration from S3 and cache.
model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')`
model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', 'bert-base-uncased', output_attention=True) # Update configuration during loading
assert model.config.output_attention == True
# Loading from a TF checkpoint file instead of a PyTorch model (slower)
config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
"""
return AutoModelWithLMHead.from_pretrained(*args, **kwargs)
@add_start_docstrings(AutoModelForSequenceClassification.__doc__)
def modelForSequenceClassification(*args, **kwargs):
r"""
# Using torch.hub !
import torch
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased') # Download model and configuration from S3 and cache.
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')`
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased', output_attention=True) # Update configuration during loading
assert model.config.output_attention == True
# Loading from a TF checkpoint file instead of a PyTorch model (slower)
config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
"""
return AutoModelForSequenceClassification.from_pretrained(*args, **kwargs)
@add_start_docstrings(AutoModelForQuestionAnswering.__doc__)
def modelForQuestionAnswering(*args, **kwargs):
r"""
# Using torch.hub !
import torch
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', 'bert-base-uncased') # Download model and configuration from S3 and cache.
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', './test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')`
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', 'bert-base-uncased', output_attention=True) # Update configuration during loading
assert model.config.output_attention == True
# Loading from a TF checkpoint file instead of a PyTorch model (slower)
config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
"""
return AutoModelForQuestionAnswering.from_pretrained(*args, **kwargs)
from pytorch_transformers.tokenization_bert import BertTokenizer
from pytorch_transformers.modeling_bert import (
BertModel,
BertForNextSentencePrediction,
BertForMaskedLM,
BertForMultipleChoice,
BertForPreTraining,
BertForQuestionAnswering,
BertForSequenceClassification,
BertForTokenClassification,
)
# A lot of models share the same param doc. Use a decorator
# to save typing
bert_docstring = """
Params:
pretrained_model_name_or_path: either:
- a str with the name of a pre-trained model to load
. `bert-base-uncased`
. `bert-large-uncased`
. `bert-base-cased`
. `bert-large-cased`
. `bert-base-multilingual-uncased`
. `bert-base-multilingual-cased`
. `bert-base-chinese`
. `bert-base-german-cased`
. `bert-large-uncased-whole-word-masking`
. `bert-large-cased-whole-word-masking`
- a path or url to a pretrained model archive containing:
. `bert_config.json` a configuration file for the model
. `pytorch_model.bin` a PyTorch dump of a BertForPreTraining
instance
- a path or url to a pretrained model archive containing:
. `bert_config.json` a configuration file for the model
. `model.chkpt` a TensorFlow checkpoint
from_tf: should we load the weights from a locally saved TensorFlow
checkpoint
cache_dir: an optional path to a folder in which the pre-trained models
will be cached.
state_dict: an optional state dictionary
(collections.OrderedDict object) to use instead of Google
pre-trained models
*inputs, **kwargs: additional input for the specific Bert class
(ex: num_labels for BertForSequenceClassification)
"""
def _append_from_pretrained_docstring(docstr):
def docstring_decorator(fn):
fn.__doc__ = fn.__doc__ + docstr
return fn
return docstring_decorator
def bertTokenizer(*args, **kwargs):
"""
Instantiate a BertTokenizer from a pre-trained/customized vocab file
Args:
pretrained_model_name_or_path: Path to pretrained model archive
or one of pre-trained vocab configs below.
* bert-base-uncased
* bert-large-uncased
* bert-base-cased
* bert-large-cased
* bert-base-multilingual-uncased
* bert-base-multilingual-cased
* bert-base-chinese
Keyword args:
cache_dir: an optional path to a specific directory to download and cache
the pre-trained model weights.
Default: None
do_lower_case: Whether to lower case the input.
Only has an effect when do_wordpiece_only=False
Default: True
do_basic_tokenize: Whether to do basic tokenization before wordpiece.
Default: True
max_len: An artificial maximum length to truncate tokenized sequences to;
Effective maximum length is always the minimum of this
value (if specified) and the underlying BERT model's
sequence length.
Default: None
never_split: List of tokens which will never be split during tokenization.
Only has an effect when do_wordpiece_only=False
Default: ["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"]
Example:
import torch
sentence = 'Hello, World!'
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
toks = tokenizer.tokenize(sentence)
['Hello', '##,', 'World', '##!']
ids = tokenizer.convert_tokens_to_ids(toks)
[8667, 28136, 1291, 28125]
"""
tokenizer = BertTokenizer.from_pretrained(*args, **kwargs)
return tokenizer
@_append_from_pretrained_docstring(bert_docstring)
def bertModel(*args, **kwargs):
"""
BertModel is the basic BERT Transformer model with a layer of summed token,
position and sequence embeddings followed by a series of identical
self-attention blocks (12 for BERT-base, 24 for BERT-large).
Example:
# Load the tokenizer
import torch
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
# Prepare tokenized input
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
tokenized_text = tokenizer.tokenize(text)
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])
# Load bertModel
model = torch.hub.load('huggingface/pytorch-transformers', 'bertModel', 'bert-base-cased')
model.eval()
# Predict hidden states features for each layer
with torch.no_grad():
encoded_layers, _ = model(tokens_tensor, segments_tensors)
"""
model = BertModel.from_pretrained(*args, **kwargs)
return model
@_append_from_pretrained_docstring(bert_docstring)
def bertForNextSentencePrediction(*args, **kwargs):
"""
BERT model with next sentence prediction head.
This module comprises the BERT model followed by the next sentence
classification head.
Example:
# Load the tokenizer
import torch
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
# Prepare tokenized input
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
tokenized_text = tokenizer.tokenize(text)
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])
# Load bertForNextSentencePrediction
model = torch.hub.load('huggingface/pytorch-transformers', 'bertForNextSentencePrediction', 'bert-base-cased')
model.eval()
# Predict the next sentence classification logits
with torch.no_grad():
next_sent_classif_logits = model(tokens_tensor, segments_tensors)
"""
model = BertForNextSentencePrediction.from_pretrained(*args, **kwargs)
return model
@_append_from_pretrained_docstring(bert_docstring)
def bertForPreTraining(*args, **kwargs):
"""
BERT model with pre-training heads.
This module comprises the BERT model followed by the two pre-training heads
- the masked language modeling head, and
- the next sentence classification head.
Example:
# Load the tokenizer
import torch
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
# Prepare tokenized input
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
tokenized_text = tokenizer.tokenize(text)
segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])
# Load bertForPreTraining
model = torch.hub.load('huggingface/pytorch-transformers', 'bertForPreTraining', 'bert-base-cased')
masked_lm_logits_scores, seq_relationship_logits = model(tokens_tensor, segments_tensors)
"""
model = BertForPreTraining.from_pretrained(*args, **kwargs)
return model
@_append_from_pretrained_docstring(bert_docstring)
def bertForMaskedLM(*args, **kwargs):
"""
BertForMaskedLM includes the BertModel Transformer followed by the
(possibly) pre-trained masked language modeling head.
Example:
# Load the tokenizer
import torch
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
# Prepare tokenized input
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
tokenized_text = tokenizer.tokenize(text)
masked_index = 8
tokenized_text[masked_index] = '[MASK]'
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])
# Load bertForMaskedLM
model = torch.hub.load('huggingface/pytorch-transformers', 'bertForMaskedLM', 'bert-base-cased')
model.eval()
# Predict all tokens
with torch.no_grad():
predictions = model(tokens_tensor, segments_tensors)
predicted_index = torch.argmax(predictions[0, masked_index]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
'henson'
"""
model = BertForMaskedLM.from_pretrained(*args, **kwargs)
return model
@_append_from_pretrained_docstring(bert_docstring)
def bertForSequenceClassification(*args, **kwargs):
"""
BertForSequenceClassification is a fine-tuning model that includes
BertModel and a sequence-level (sequence or pair of sequences) classifier
on top of the BertModel. Note that the classification head is only initialized
and has to be trained.
The sequence-level classifier is a linear layer that takes as input the
last hidden state of the first character in the input sequence
(see Figures 3a and 3b in the BERT paper).
Args:
num_labels: the number (>=2) of classes for the classifier.
Example:
# Load the tokenizer
import torch
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
# Prepare tokenized input
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
tokenized_text = tokenizer.tokenize(text)
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])
# Load bertForSequenceClassification
model = torch.hub.load('huggingface/pytorch-transformers', 'bertForSequenceClassification', 'bert-base-cased', num_labels=2)
model.eval()
# Predict the sequence classification logits
with torch.no_grad():
seq_classif_logits = model(tokens_tensor, segments_tensors)
# Or get the sequence classification loss
labels = torch.tensor([1])
seq_classif_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss
"""
model = BertForSequenceClassification.from_pretrained(*args, **kwargs)
return model
@_append_from_pretrained_docstring(bert_docstring)
def bertForMultipleChoice(*args, **kwargs):
"""
BertForMultipleChoice is a fine-tuning model that includes BertModel and a
linear layer on top of the BertModel. Note that the multiple choice head is
only initialized and has to be trained.
Args:
num_choices: the number (>=2) of classes for the classifier.
Example:
# Load the tokenizer
import torch
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
# Prepare tokenized input
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
tokenized_text = tokenizer.tokenize(text)
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
tokens_tensor = torch.tensor([indexed_tokens, indexed_tokens]).unsqueeze(0)
segments_tensors = torch.tensor([segments_ids, segments_ids]).unsqueeze(0)
# Load bertForMultipleChoice
model = torch.hub.load('huggingface/pytorch-transformers', 'bertForMultipleChoice', 'bert-base-cased', num_choices=2)
model.eval()
# Predict the multiple choice logits
with torch.no_grad():
multiple_choice_logits = model(tokens_tensor, segments_tensors)
# Or get the multiple choice loss
labels = torch.tensor([1])
multiple_choice_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss
"""
model = BertForMultipleChoice.from_pretrained(*args, **kwargs)
return model
@_append_from_pretrained_docstring(bert_docstring)
def bertForQuestionAnswering(*args, **kwargs):
"""
BertForQuestionAnswering is a fine-tuning model that includes BertModel
with a token-level classifiers on top of the full sequence of last hidden
states. Note that the classification head is only initialized
and has to be trained.
Example:
# Load the tokenizer
import torch
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
# Prepare tokenized input
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
tokenized_text = tokenizer.tokenize(text)
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])
# Load bertForQuestionAnswering
model = torch.hub.load('huggingface/pytorch-transformers', 'bertForQuestionAnswering', 'bert-base-cased')
model.eval()
# Predict the start and end positions logits
with torch.no_grad():
start_logits, end_logits = model(tokens_tensor, segments_tensors)
# Or get the total loss which is the sum of the CrossEntropy loss for the start and end token positions
start_positions, end_positions = torch.tensor([12]), torch.tensor([14])
# set model.train() before if training this loss
multiple_choice_loss = model(tokens_tensor, segments_tensors, start_positions=start_positions, end_positions=end_positions)
"""
model = BertForQuestionAnswering.from_pretrained(*args, **kwargs)
return model
@_append_from_pretrained_docstring(bert_docstring)
def bertForTokenClassification(*args, **kwargs):
"""
BertForTokenClassification is a fine-tuning model that includes BertModel
and a token-level classifier on top of the BertModel. Note that the classification
head is only initialized and has to be trained.
The token-level classifier is a linear layer that takes as input the last
hidden state of the sequence.
Args:
num_labels: the number (>=2) of classes for the classifier.
Example:
# Load the tokenizer
import torch
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
# Prepare tokenized input
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
tokenized_text = tokenizer.tokenize(text)
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])
# Load bertForTokenClassification
model = torch.hub.load('huggingface/pytorch-transformers', 'bertForTokenClassification', 'bert-base-cased', num_labels=2)
model.eval()
# Predict the token classification logits
with torch.no_grad():
classif_logits = model(tokens_tensor, segments_tensors)
# Or get the token classification loss
labels = torch.tensor([[0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0]])
classif_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss
"""
model = BertForTokenClassification.from_pretrained(*args, **kwargs)
return model
from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer
from pytorch_transformers.modeling_gpt2 import (
GPT2Model,
GPT2LMHeadModel,
GPT2DoubleHeadsModel
)
# A lot of models share the same param doc. Use a decorator
# to save typing
gpt2_docstring = """
Params:
pretrained_model_name_or_path: either:
- a str with the name of a pre-trained model to load selected in the list of:
. `gpt2`, `gpt2-medium`
- a path or url to a pretrained model archive containing:
. `gpt2_config.json` a configuration file for the model
. `pytorch_model.bin` a PyTorch dump of a GPT2Model instance
- a path or url to a pretrained model archive containing:
. `gpt2_config.json` a configuration file for the model
. a TensorFlow checkpoint with trained weights
from_tf: should we load the weights from a locally saved TensorFlow checkpoint
cache_dir: an optional path to a folder in which the pre-trained models will be cached.
state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
*inputs, **kwargs: additional input for the specific GPT-2 class
"""
def _append_from_pretrained_docstring(docstr):
def docstring_decorator(fn):
fn.__doc__ = fn.__doc__ + docstr
return fn
return docstring_decorator
def gpt2Tokenizer(*args, **kwargs):
"""
Instantiate a GPT-2 BPE tokenizer for OpenAI GPT-2 from a pre-trained/customized vocab file.
Peculiarities:
- Byte-level BPE
Args:
pretrained_model_name_or_path: Path to pretrained model archive
or one of pre-trained vocab configs below.
* gpt2
Keyword args:
special_tokens: Special tokens in vocabulary that are not pretrained ([SEP], [CLS]...)
Default: None
max_len: An artificial maximum length to truncate tokenized sequences to;
Effective maximum length is always the minimum of this
value (if specified) and the underlying BERT model's
sequence length.
Default: None
Example:
import torch
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')
text = "Who was Jim Henson ?"
indexed_tokens = tokenizer.encode(tokenized_text)
"""
tokenizer = GPT2Tokenizer.from_pretrained(*args, **kwargs)
return tokenizer
@_append_from_pretrained_docstring(gpt2_docstring)
def gpt2Model(*args, **kwargs):
"""
gpt2Model is the basic OpenAI GPT-2 Transformer model based on
identical stacked masked self-attention blocks and pre-trained
on large scale dataset using language modeling signal.
Example:
# Load the tokenizer
import torch
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')
# Prepare tokenized input
text_1 = "Who was Jim Henson ?"
text_2 = "Jim Henson was a puppeteer"
indexed_tokens_1 = tokenizer.encode(text_1)
indexed_tokens_2 = tokenizer.encode(text_2)
tokens_tensor_1 = torch.tensor([indexed_tokens_1])
tokens_tensor_2 = torch.tensor([indexed_tokens_2])
# Load gpt2Model
model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Model', 'gpt2')
model.eval()
# Predict hidden states features for each layer
# past can be used to reuse precomputed hidden state in a subsequent predictions
with torch.no_grad():
hidden_states_1, past = model(tokens_tensor_1)
hidden_states_2, past = model(tokens_tensor_2, past=past)
"""
model = GPT2Model.from_pretrained(*args, **kwargs)
return model
@_append_from_pretrained_docstring(gpt2_docstring)
def gpt2LMHeadModel(*args, **kwargs):
"""
gpt2LMHeadModel is the OpenAI GPT-2 Transformer model with the
tied (pre-trained) language modeling head on top.
Example:
# Load the tokenizer
import torch
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')
# Prepare tokenized input
text_1 = "Who was Jim Henson ?"
text_2 = "Jim Henson was a puppeteer"
indexed_tokens_1 = tokenizer.encode(text_1)
indexed_tokens_2 = tokenizer.encode(text_2)
tokens_tensor_1 = torch.tensor([indexed_tokens_1])
tokens_tensor_2 = torch.tensor([indexed_tokens_2])
# Load gpt2LMHeadModel
model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2LMHeadModel', 'gpt2')
model.eval()
# Predict hidden states features for each layer
# past can be used to reuse precomputed hidden state in a subsequent predictions
with torch.no_grad():
predictions_1, past = model(tokens_tensor_1)
predictions_2, past = model(tokens_tensor_2, past=past)
# Get the predicted last token
predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
predicted_token = tokenizer.decode([predicted_index])
assert predicted_token == ' who'
"""
model = GPT2LMHeadModel.from_pretrained(*args, **kwargs)
return model
@_append_from_pretrained_docstring(gpt2_docstring)
def gpt2DoubleHeadsModel(*args, **kwargs):
"""
gpt2DoubleHeadsModel is the OpenAI GPT-2 Transformer model with the
tied (pre-trained) language modeling head and a multiple choice
classification head (only initialized, not pre-trained).
Example:
# Load the tokenizer
import torch
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')
# Prepare tokenized input
text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
tokenized_text1 = tokenizer.tokenize(text1)
tokenized_text2 = tokenizer.tokenize(text2)
indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
# Load gpt2DoubleHeadsModel
model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2DoubleHeadsModel', 'gpt2')
model.eval()
# Predict hidden states features for each layer
with torch.no_grad():
lm_logits, multiple_choice_logits, presents = model(tokens_tensor, mc_token_ids)
"""
model = GPT2DoubleHeadsModel.from_pretrained(*args, **kwargs)
return model
from pytorch_transformers.tokenization_openai import OpenAIGPTTokenizer
from pytorch_transformers.modeling_openai import (
OpenAIGPTModel,
OpenAIGPTLMHeadModel,
OpenAIGPTDoubleHeadsModel
)
# Dependecies that are not specified in global hubconf.py
specific_dependencies = ['spacy', 'ftfy']
# A lot of models share the same param doc. Use a decorator
# to save typing
gpt_docstring = """
OpenAI GPT use a single embedding matrix to store the word and special embeddings.
Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]...
Special tokens need to be trained during the fine-tuning if you use them.
The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function.
The embeddings are ordered as follow in the token embeddings matrice:
[0, ----------------------
... -> word embeddings
config.vocab_size - 1, ______________________
config.vocab_size,
... -> special embeddings
config.vocab_size + config.n_special - 1] ______________________
where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:
total_tokens_embeddings = config.vocab_size + config.n_special
You should use the associate indices to index the embeddings.
Params:
pretrained_model_name_or_path: either:
- a str with the name of a pre-trained model to load selected in the list of:
. `openai-gpt`
- a path or url to a pretrained model archive containing:
. `openai_gpt_config.json` a configuration file for the model
. `pytorch_model.bin` a PyTorch dump of a OpenAIGPTModel instance
- a path or url to a pretrained model archive containing:
. `openai-gpt-config.json` a configuration file for the model
. a series of NumPy files containing OpenAI TensorFlow trained weights
from_tf: should we load the weights from a locally saved TensorFlow checkpoint
cache_dir: an optional path to a folder in which the pre-trained models will be cached.
state_dict: an optional state dictionary (collections.OrderedDict object)
to use instead of pre-trained models
*inputs, **kwargs: additional input for the specific OpenAI-GPT class
"""
def _append_from_pretrained_docstring(docstr):
def docstring_decorator(fn):
fn.__doc__ = fn.__doc__ + docstr
return fn
return docstring_decorator
def openAIGPTTokenizer(*args, **kwargs):
"""
Instantiate a BPE tokenizer for OpenAI GPT from a pre-trained/customized vocab file.
Peculiarities:
- lower case all inputs
- uses SpaCy tokenizer ('en' model) and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not.
- argument special_tokens and function set_special_tokens:
can be used to add additional symbols (ex: "__classify__") to a vocabulary.
Args:
pretrained_model_name_or_path: Path to pretrained model archive
or one of pre-trained vocab configs below.
* openai-gpt
Keyword args:
special_tokens: Special tokens in vocabulary that are not pretrained ([SEP], [CLS]...)
Default: None
max_len: An artificial maximum length to truncate tokenized sequences to;
Effective maximum length is always the minimum of this
value (if specified) and the underlying BERT model's
sequence length.
Default: None
Example:
import torch
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')
text = "Who was Jim Henson ? Jim Henson was a puppeteer"
tokenized_text = tokenizer.tokenize(text)
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
[763, 509, 4265, 2298, 945, 257, 4265, 2298, 945, 509, 246, 10148, 39041, 483]
"""
tokenizer = OpenAIGPTTokenizer.from_pretrained(*args, **kwargs)
return tokenizer
@_append_from_pretrained_docstring(gpt_docstring)
def openAIGPTModel(*args, **kwargs):
"""
OpenAIGPTModel is the basic OpenAI GPT Transformer model based on
identical stacked masked self-attention blocks and pre-trained
on large scale dataset using language modeling signal.
Example:
# Load the tokenizer
import torch
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')
# Prepare tokenized input
text = "Who was Jim Henson ? Jim Henson was a puppeteer"
tokenized_text = tokenizer.tokenize(text)
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
tokens_tensor = torch.tensor([indexed_tokens])
# Load openAIGPTModel
model = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTModel', 'openai-gpt')
model.eval()
# Predict hidden states features for each layer
with torch.no_grad():
hidden_states = model(tokens_tensor)
"""
model = OpenAIGPTModel.from_pretrained(*args, **kwargs)
return model
@_append_from_pretrained_docstring(gpt_docstring)
def openAIGPTLMHeadModel(*args, **kwargs):
"""
OpenAIGPTLMHeadModel is the OpenAI GPT Transformer model with the
tied (pre-trained) language modeling head on top.
Example:
# Load the tokenizer
import torch
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')
# Prepare tokenized input
text = "Who was Jim Henson ? Jim Henson was a puppeteer"
tokenized_text = tokenizer.tokenize(text)
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
tokens_tensor = torch.tensor([indexed_tokens])
# Load openAIGPTLMHeadModel
model = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTLMHeadModel', 'openai-gpt')
model.eval()
# Predict hidden states features for each layer
with torch.no_grad():
predictions = model(tokens_tensor)
# Get the predicted last token
predicted_index = torch.argmax(predictions[0, -1, :]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
'.</w>'
"""
model = OpenAIGPTLMHeadModel.from_pretrained(*args, **kwargs)
return model
@_append_from_pretrained_docstring(gpt_docstring)
def openAIGPTDoubleHeadsModel(*args, **kwargs):
"""
OpenAIGPTDoubleHeadsModel is the OpenAI GPT Transformer model with the
tied (pre-trained) language modeling head and a multiple choice
classification head (only initialized, not pre-trained).
Example:
# Load the tokenizer
import torch
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')
# Prepare tokenized input
text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
tokenized_text1 = tokenizer.tokenize(text1)
tokenized_text2 = tokenizer.tokenize(text2)
indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
# Load openAIGPTDoubleHeadsModel
model = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTDoubleHeadsModel', 'openai-gpt')
model.eval()
# Predict hidden states features for each layer
with torch.no_grad():
lm_logits, multiple_choice_logits = model(tokens_tensor, mc_token_ids)
"""
model = OpenAIGPTDoubleHeadsModel.from_pretrained(*args, **kwargs)
return model
from pytorch_transformers.tokenization_transfo_xl import TransfoXLTokenizer
from pytorch_transformers.modeling_transfo_xl import (
TransfoXLModel,
TransfoXLLMHeadModel
)
# A lot of models share the same param doc. Use a decorator
# to save typing
transformer_xl_docstring = """
Transformer XL use a relative positioning (with sinusiodal patterns) and adaptive softmax inputs which means that:
- you don't need to specify positioning embeddings indices
- the tokens in the vocabulary have to be sorted to decreasing frequency.
Params:
pretrained_model_name_or_path: either:
- a str with the name of a pre-trained model to load selected in the list of:
. `transfo-xl-wt103`
- a path or url to a pretrained model archive containing:
. `transfo_xl_config.json` a configuration file for the model
. `pytorch_model.bin` a PyTorch dump of a TransfoXLModel instance
- a path or url to a pretrained model archive containing:
. `transfo_xl_config.json` a configuration file for the model
. `model.chkpt` a TensorFlow checkpoint
from_tf: should we load the weights from a locally saved TensorFlow checkpoint
cache_dir: an optional path to a folder in which the pre-trained models will be cached.
state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
*inputs, **kwargs: additional input for the specific TransformerXL class
"""
def _append_from_pretrained_docstring(docstr):
def docstring_decorator(fn):
fn.__doc__ = fn.__doc__ + docstr
return fn
return docstring_decorator
def transformerXLTokenizer(*args, **kwargs):
"""
Instantiate a Transformer-XL tokenizer adapted from Vocab class in https://github.com/kimiyoung/transformer-xl
Args:
pretrained_model_name_or_path: Path to pretrained model archive
or one of pre-trained vocab configs below.
* transfo-xl-wt103
Example:
import torch
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLTokenizer', 'transfo-xl-wt103')
text = "Who was Jim Henson ?"
tokenized_text = tokenizer.tokenize(tokenized_text)
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
"""
tokenizer = TransfoXLTokenizer.from_pretrained(*args, **kwargs)
return tokenizer
@_append_from_pretrained_docstring(transformer_xl_docstring)
def transformerXLModel(*args, **kwargs):
"""
transformerXLModel is the basic Transformer XL model.
Example:
# Load the tokenizer
import torch
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLTokenizer', 'transfo-xl-wt103')
# Prepare tokenized input
text_1 = "Who was Jim Henson ?"
text_2 = "Jim Henson was a puppeteer"
tokenized_text_1 = tokenizer.tokenize(text_1)
tokenized_text_2 = tokenizer.tokenize(text_2)
indexed_tokens_1 = tokenizer.convert_tokens_to_ids(tokenized_text_1)
indexed_tokens_2 = tokenizer.convert_tokens_to_ids(tokenized_text_2)
tokens_tensor_1 = torch.tensor([indexed_tokens_1])
tokens_tensor_2 = torch.tensor([indexed_tokens_2])
# Load transformerXLModel
model = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLModel', 'transfo-xl-wt103')
model.eval()
# Predict hidden states features for each layer
# We can re-use the memory cells in a subsequent call to attend a longer context
with torch.no_grad():
hidden_states_1, mems_1 = model(tokens_tensor_1)
hidden_states_2, mems_2 = model(tokens_tensor_2, mems=mems_1)
"""
model = TransfoXLModel.from_pretrained(*args, **kwargs)
return model
@_append_from_pretrained_docstring(transformer_xl_docstring)
def transformerXLLMHeadModel(*args, **kwargs):
"""
transformerXLModel is the basic Transformer XL model with the
tied (pre-trained) language modeling head on top.
Example:
# Load the tokenizer
import torch
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLTokenizer', 'transfo-xl-wt103')
# Prepare tokenized input
text_1 = "Who was Jim Henson ?"
text_2 = "Jim Henson was a puppeteer"
tokenized_text_1 = tokenizer.tokenize(text_1)
tokenized_text_2 = tokenizer.tokenize(text_2)
indexed_tokens_1 = tokenizer.convert_tokens_to_ids(tokenized_text_1)
indexed_tokens_2 = tokenizer.convert_tokens_to_ids(tokenized_text_2)
tokens_tensor_1 = torch.tensor([indexed_tokens_1])
tokens_tensor_2 = torch.tensor([indexed_tokens_2])
# Load transformerXLLMHeadModel
model = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLLMHeadModel', 'transfo-xl-wt103')
model.eval()
# Predict hidden states features for each layer
# We can re-use the memory cells in a subsequent call to attend a longer context
with torch.no_grad():
predictions_1, mems_1 = model(tokens_tensor_1)
predictions_2, mems_2 = model(tokens_tensor_2, mems=mems_1)
# Get the predicted last token
predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
assert predicted_token == 'who'
"""
model = TransfoXLLMHeadModel.from_pretrained(*args, **kwargs)
return model
from pytorch_transformers.tokenization_xlm import XLMTokenizer
from pytorch_transformers.modeling_xlm import (
XLMConfig,
XLMModel,
XLMWithLMHeadModel,
XLMForSequenceClassification,
XLMForQuestionAnswering
)
# A lot of models share the same param doc. Use a decorator
# to save typing
xlm_start_docstring = """
Model class adapted from the XLM Transformer model of
"Cross-lingual Language Model Pretraining" by Guillaume Lample, Alexis Conneau
Paper: https://arxiv.org/abs/1901.07291
Original code: https://github.com/facebookresearch/XLM
Example:
# Load the tokenizer
import torch
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlmTokenizer', 'xlm-mlm-en-2048')
# Prepare tokenized input
text_1 = "Who was Jim Henson ?"
text_2 = "Jim Henson was a puppeteer"
indexed_tokens_1 = tokenizer.encode(text_1)
indexed_tokens_2 = tokenizer.encode(text_2)
tokens_tensor_1 = torch.tensor([indexed_tokens_1])
tokens_tensor_2 = torch.tensor([indexed_tokens_2])
"""
# A lot of models share the same param doc. Use a decorator
# to save typing
xlm_end_docstring = """
Params:
pretrained_model_name_or_path: either:
- a str with the name of a pre-trained model to load selected in the list of:
. `xlm-mlm-en-2048`
- a path or url to a pretrained model archive containing:
. `config.json` a configuration file for the model
. `pytorch_model.bin` a PyTorch dump created using the `convert_xlm_checkpoint_to_pytorch` conversion script
cache_dir: an optional path to a folder in which the pre-trained models will be cached.
state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
*inputs, **kwargs: additional input for the specific XLM class
"""
def _begin_with_docstring(docstr):
def docstring_decorator(fn):
fn.__doc__ = fn.__doc__ + docstr
return fn
return docstring_decorator
def _end_with_docstring(docstr):
def docstring_decorator(fn):
fn.__doc__ = fn.__doc__ + docstr
return fn
return docstring_decorator
def xlmTokenizer(*args, **kwargs):
"""
Instantiate a XLM BPE tokenizer for XLM from a pre-trained vocab file.
Args:
pretrained_model_name_or_path: Path to pretrained model archive
or one of pre-trained vocab configs below.
* xlm-mlm-en-2048
Keyword args:
special_tokens: Special tokens in vocabulary that are not pretrained
Default: None
max_len: An artificial maximum length to truncate tokenized sequences to;
Effective maximum length is always the minimum of this
value (if specified) and the underlying model's
sequence length.
Default: None
Example:
import torch
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlmTokenizer', 'xlm-mlm-en-2048')
text = "Who was Jim Henson ?"
indexed_tokens = tokenizer.encode(tokenized_text)
"""
tokenizer = XLMTokenizer.from_pretrained(*args, **kwargs)
return tokenizer
@_begin_with_docstring(xlm_start_docstring)
@_end_with_docstring(xlm_end_docstring)
def xlmModel(*args, **kwargs):
"""
# Load xlmModel
model = torch.hub.load('huggingface/pytorch-transformers', 'xlmModel', 'xlm-mlm-en-2048')
model.eval()
# Predict hidden states features for each layer
with torch.no_grad():
hidden_states_1, mems = model(tokens_tensor_1)
hidden_states_2, mems = model(tokens_tensor_2, past=mems)
"""
model = XLMModel.from_pretrained(*args, **kwargs)
return model
@_begin_with_docstring(xlm_start_docstring)
@_end_with_docstring(xlm_end_docstring)
def xlmLMHeadModel(*args, **kwargs):
"""
# Prepare tokenized input
text_1 = "Who was Jim Henson ?"
text_2 = "Jim Henson was a puppeteer"
indexed_tokens_1 = tokenizer.encode(text_1)
indexed_tokens_2 = tokenizer.encode(text_2)
tokens_tensor_1 = torch.tensor([indexed_tokens_1])
tokens_tensor_2 = torch.tensor([indexed_tokens_2])
# Load xlnetLMHeadModel
model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetLMHeadModel', 'xlm-mlm-en-2048')
model.eval()
# Predict hidden states features for each layer
with torch.no_grad():
predictions_1, mems = model(tokens_tensor_1)
predictions_2, mems = model(tokens_tensor_2, mems=mems)
# Get the predicted last token
predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
predicted_token = tokenizer.decode([predicted_index])
assert predicted_token == ' who'
"""
model = XLMWithLMHeadModel.from_pretrained(*args, **kwargs)
return model
# @_end_with_docstring(xlnet_docstring)
# def xlnetForSequenceClassification(*args, **kwargs):
# """
# xlnetModel is the basic XLNet Transformer model from
# "XLNet: Generalized Autoregressive Pretraining for Language Understanding"
# by Zhilin Yang, Zihang Dai1, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le
# Example:
# # Load the tokenizer
# import torch
# tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlm-mlm-en-2048')
# # Prepare tokenized input
# text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
# text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
# tokenized_text1 = tokenizer.tokenize(text1)
# tokenized_text2 = tokenizer.tokenize(text2)
# indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
# indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
# tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
# mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
# # Load xlnetForSequenceClassification
# model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetForSequenceClassification', 'xlm-mlm-en-2048')
# model.eval()
# # Predict sequence classes logits
# with torch.no_grad():
# lm_logits, mems = model(tokens_tensor)
# """
# model = XLNetForSequenceClassification.from_pretrained(*args, **kwargs)
# return model
from pytorch_transformers.tokenization_xlnet import XLNetTokenizer
from pytorch_transformers.modeling_xlnet import (
XLNetConfig,
XLNetModel,
XLNetLMHeadModel,
# XLNetForSequenceClassification
)
# A lot of models share the same param doc. Use a decorator
# to save typing
xlnet_docstring = """
Params:
pretrained_model_name_or_path: either:
- a str with the name of a pre-trained model to load selected in the list of:
. `xlnet-large-cased`
- a path or url to a pretrained model archive containing:
. `config.json` a configuration file for the model
. `pytorch_model.bin` a PyTorch dump of a XLNetForPreTraining instance
- a path or url to a pretrained model archive containing:
. `xlnet_config.json` a configuration file for the model
. `model.chkpt` a TensorFlow checkpoint
from_tf: should we load the weights from a locally saved TensorFlow checkpoint
cache_dir: an optional path to a folder in which the pre-trained models will be cached.
state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
*inputs, **kwargs: additional input for the specific XLNet class
"""
def _append_from_pretrained_docstring(docstr):
def docstring_decorator(fn):
fn.__doc__ = fn.__doc__ + docstr
return fn
return docstring_decorator
def xlnetTokenizer(*args, **kwargs):
"""
Instantiate a XLNet sentencepiece tokenizer for XLNet from a pre-trained vocab file.
Peculiarities:
- require Google sentencepiece (https://github.com/google/sentencepiece)
Args:
pretrained_model_name_or_path: Path to pretrained model archive
or one of pre-trained vocab configs below.
* xlnet-large-cased
Keyword args:
special_tokens: Special tokens in vocabulary that are not pretrained
Default: None
max_len: An artificial maximum length to truncate tokenized sequences to;
Effective maximum length is always the minimum of this
value (if specified) and the underlying model's
sequence length.
Default: None
Example:
import torch
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
text = "Who was Jim Henson ?"
indexed_tokens = tokenizer.encode(tokenized_text)
"""
tokenizer = XLNetTokenizer.from_pretrained(*args, **kwargs)
return tokenizer
@_append_from_pretrained_docstring(xlnet_docstring)
def xlnetModel(*args, **kwargs):
"""
xlnetModel is the basic XLNet Transformer model from
"XLNet: Generalized Autoregressive Pretraining for Language Understanding"
by Zhilin Yang, Zihang Dai1, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le
Example:
# Load the tokenizer
import torch
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
# Prepare tokenized input
text_1 = "Who was Jim Henson ?"
text_2 = "Jim Henson was a puppeteer"
indexed_tokens_1 = tokenizer.encode(text_1)
indexed_tokens_2 = tokenizer.encode(text_2)
tokens_tensor_1 = torch.tensor([indexed_tokens_1])
tokens_tensor_2 = torch.tensor([indexed_tokens_2])
# Load xlnetModel
model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetModel', 'xlnet-large-cased')
model.eval()
# Predict hidden states features for each layer
with torch.no_grad():
hidden_states_1, mems = model(tokens_tensor_1)
hidden_states_2, mems = model(tokens_tensor_2, past=mems)
"""
model = XLNetModel.from_pretrained(*args, **kwargs)
return model
@_append_from_pretrained_docstring(xlnet_docstring)
def xlnetLMHeadModel(*args, **kwargs):
"""
xlnetModel is the basic XLNet Transformer model from
"XLNet: Generalized Autoregressive Pretraining for Language Understanding"
by Zhilin Yang, Zihang Dai1, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le
with a tied (pre-trained) language modeling head on top.
Example:
# Load the tokenizer
import torch
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
# Prepare tokenized input
text_1 = "Who was Jim Henson ?"
text_2 = "Jim Henson was a puppeteer"
indexed_tokens_1 = tokenizer.encode(text_1)
indexed_tokens_2 = tokenizer.encode(text_2)
tokens_tensor_1 = torch.tensor([indexed_tokens_1])
tokens_tensor_2 = torch.tensor([indexed_tokens_2])
# Load xlnetLMHeadModel
model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetLMHeadModel', 'xlnet-large-cased')
model.eval()
# Predict hidden states features for each layer
with torch.no_grad():
predictions_1, mems = model(tokens_tensor_1)
predictions_2, mems = model(tokens_tensor_2, mems=mems)
# Get the predicted last token
predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
predicted_token = tokenizer.decode([predicted_index])
assert predicted_token == ' who'
"""
model = XLNetLMHeadModel.from_pretrained(*args, **kwargs)
return model
# @_append_from_pretrained_docstring(xlnet_docstring)
# def xlnetForSequenceClassification(*args, **kwargs):
# """
# xlnetModel is the basic XLNet Transformer model from
# "XLNet: Generalized Autoregressive Pretraining for Language Understanding"
# by Zhilin Yang, Zihang Dai1, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le
# Example:
# # Load the tokenizer
# import torch
# tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
# # Prepare tokenized input
# text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
# text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
# tokenized_text1 = tokenizer.tokenize(text1)
# tokenized_text2 = tokenizer.tokenize(text2)
# indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
# indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
# tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
# mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
# # Load xlnetForSequenceClassification
# model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetForSequenceClassification', 'xlnet-large-cased')
# model.eval()
# # Predict sequence classes logits
# with torch.no_grad():
# lm_logits, mems = model(tokens_tensor)
# """
# model = XLNetForSequenceClassification.from_pretrained(*args, **kwargs)
# return model
...@@ -11,7 +11,8 @@ from .tokenization_distilbert import DistilBertTokenizer ...@@ -11,7 +11,8 @@ from .tokenization_distilbert import DistilBertTokenizer
from .tokenization_utils import (PreTrainedTokenizer) from .tokenization_utils import (PreTrainedTokenizer)
from .modeling_auto import (AutoConfig, AutoModel) from .modeling_auto import (AutoConfig, AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering,
AutoModelWithLMHead)
from .modeling_bert import (BertConfig, BertPreTrainedModel, BertModel, BertForPreTraining, from .modeling_bert import (BertConfig, BertPreTrainedModel, BertModel, BertForPreTraining,
BertForMaskedLM, BertForNextSentencePrediction, BertForMaskedLM, BertForNextSentencePrediction,
......
This diff is collapsed.
...@@ -59,6 +59,12 @@ if not six.PY2: ...@@ -59,6 +59,12 @@ if not six.PY2:
fn.__doc__ = ''.join(docstr) + fn.__doc__ fn.__doc__ = ''.join(docstr) + fn.__doc__
return fn return fn
return docstring_decorator return docstring_decorator
def add_end_docstrings(*docstr):
def docstring_decorator(fn):
fn.__doc__ = fn.__doc__ + ''.join(docstr)
return fn
return docstring_decorator
else: else:
# Not possible to update class docstrings on python2 # Not possible to update class docstrings on python2
def add_start_docstrings(*docstr): def add_start_docstrings(*docstr):
...@@ -66,6 +72,11 @@ else: ...@@ -66,6 +72,11 @@ else:
return fn return fn
return docstring_decorator return docstring_decorator
def add_end_docstrings(*docstr):
def docstring_decorator(fn):
return fn
return docstring_decorator
class PretrainedConfig(object): class PretrainedConfig(object):
r""" Base class for all configuration classes. r""" Base class for all configuration classes.
......
...@@ -21,7 +21,11 @@ import shutil ...@@ -21,7 +21,11 @@ import shutil
import pytest import pytest
import logging import logging
from pytorch_transformers import AutoConfig, BertConfig, AutoModel, BertModel from pytorch_transformers import (AutoConfig, BertConfig,
AutoModel, BertModel,
AutoModelWithLMHead, BertForMaskedLM,
AutoModelForSequenceClassification, BertForSequenceClassification,
AutoModelForQuestionAnswering, BertForQuestionAnswering)
from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor) from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor)
...@@ -42,6 +46,42 @@ class AutoModelTest(unittest.TestCase): ...@@ -42,6 +46,42 @@ class AutoModelTest(unittest.TestCase):
for value in loading_info.values(): for value in loading_info.values():
self.assertEqual(len(value), 0) self.assertEqual(len(value), 0)
def test_lmhead_model_from_pretrained(self):
logging.basicConfig(level=logging.INFO)
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
config = AutoConfig.from_pretrained(model_name)
self.assertIsNotNone(config)
self.assertIsInstance(config, BertConfig)
model = AutoModelWithLMHead.from_pretrained(model_name)
model, loading_info = AutoModelWithLMHead.from_pretrained(model_name, output_loading_info=True)
self.assertIsNotNone(model)
self.assertIsInstance(model, BertForMaskedLM)
def test_sequence_classification_model_from_pretrained(self):
logging.basicConfig(level=logging.INFO)
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
config = AutoConfig.from_pretrained(model_name)
self.assertIsNotNone(config)
self.assertIsInstance(config, BertConfig)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model, loading_info = AutoModelForSequenceClassification.from_pretrained(model_name, output_loading_info=True)
self.assertIsNotNone(model)
self.assertIsInstance(model, BertForSequenceClassification)
def test_question_answering_model_from_pretrained(self):
logging.basicConfig(level=logging.INFO)
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
config = AutoConfig.from_pretrained(model_name)
self.assertIsNotNone(config)
self.assertIsInstance(config, BertConfig)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
model, loading_info = AutoModelForQuestionAnswering.from_pretrained(model_name, output_loading_info=True)
self.assertIsNotNone(model)
self.assertIsInstance(model, BertForQuestionAnswering)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -25,6 +25,7 @@ from .tokenization_transfo_xl import TransfoXLTokenizer ...@@ -25,6 +25,7 @@ from .tokenization_transfo_xl import TransfoXLTokenizer
from .tokenization_xlnet import XLNetTokenizer from .tokenization_xlnet import XLNetTokenizer
from .tokenization_xlm import XLMTokenizer from .tokenization_xlm import XLMTokenizer
from .tokenization_roberta import RobertaTokenizer from .tokenization_roberta import RobertaTokenizer
from .tokenization_distilbert import DistilBertTokenizer
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -39,13 +40,14 @@ class AutoTokenizer(object): ...@@ -39,13 +40,14 @@ class AutoTokenizer(object):
The tokenizer class to instantiate is selected as the first pattern matching The tokenizer class to instantiate is selected as the first pattern matching
in the `pretrained_model_name_or_path` string (in the following order): in the `pretrained_model_name_or_path` string (in the following order):
- contains `distilbert`: DistilBertTokenizer (DistilBert model)
- contains `roberta`: RobertaTokenizer (RoBERTa model)
- contains `bert`: BertTokenizer (Bert model) - contains `bert`: BertTokenizer (Bert model)
- contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model) - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
- contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model) - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
- contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model) - contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model)
- contains `xlnet`: XLNetTokenizer (XLNet model) - contains `xlnet`: XLNetTokenizer (XLNet model)
- contains `xlm`: XLMTokenizer (XLM model) - contains `xlm`: XLMTokenizer (XLM model)
- contains `roberta`: RobertaTokenizer (RoBERTa model)
This class cannot be instantiated using `__init__()` (throw an error). This class cannot be instantiated using `__init__()` (throw an error).
""" """
...@@ -60,32 +62,45 @@ class AutoTokenizer(object): ...@@ -60,32 +62,45 @@ class AutoTokenizer(object):
The tokenizer class to instantiate is selected as the first pattern matching The tokenizer class to instantiate is selected as the first pattern matching
in the `pretrained_model_name_or_path` string (in the following order): in the `pretrained_model_name_or_path` string (in the following order):
- contains `distilbert`: DistilBertTokenizer (DistilBert model)
- contains `roberta`: RobertaTokenizer (XLM model)
- contains `bert`: BertTokenizer (Bert model) - contains `bert`: BertTokenizer (Bert model)
- contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model) - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
- contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model) - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
- contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model) - contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model)
- contains `xlnet`: XLNetTokenizer (XLNet model) - contains `xlnet`: XLNetTokenizer (XLNet model)
- contains `xlm`: XLMTokenizer (XLM model) - contains `xlm`: XLMTokenizer (XLM model)
- contains `roberta`: RobertaTokenizer (XLM model)
Params: Params:
**pretrained_model_name_or_path**: either: pretrained_model_name_or_path: either:
- a string with the `shortcut name` of a pre-trained model configuration to load from cache
or download and cache if not already stored in cache (e.g. 'bert-base-uncased'). - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
- a path to a `directory` containing a configuration file saved - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~pytorch_transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
using the `save_pretrained(save_directory)` method. - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``.
- a path or url to a saved configuration `file`.
**cache_dir**: (`optional`) string: cache_dir: (`optional`) string:
Path to a directory in which a downloaded pre-trained model Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used.
configuration should be cached if the standard cache should not be used.
force_download: (`optional`) boolean, default False:
Force to (re-)download the vocabulary files and override the cached versions if they exists.
proxies: (`optional`) dict, default None:
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
The proxies are used on each request.
inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method.
kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~pytorch_transformers.PreTrainedTokenizer` for details.
Examples:: Examples::
config = AutoTokenizer.from_pretrained('bert-base-uncased') # Download vocabulary from S3 and cache. tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased') # Download vocabulary from S3 and cache.
config = AutoTokenizer.from_pretrained('./test/bert_saved_model/') # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')` tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/') # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`
""" """
if 'roberta' in pretrained_model_name_or_path: if 'distilbert' in pretrained_model_name_or_path:
return DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
elif 'roberta' in pretrained_model_name_or_path:
return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
elif 'bert' in pretrained_model_name_or_path: elif 'bert' in pretrained_model_name_or_path:
return BertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) return BertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment