Unverified Commit 4fc9f9ef authored by Thomas Wolf, committed by GitHub

Merge pull request #910 from huggingface/auto_models

Adding AutoTokenizer and AutoModel classes that automatically detect architecture - Clean up tokenizers
parents 3a126e73 d43dc48b
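
The headline feature is name-based dispatch: the new `AutoTokenizer`, `AutoConfig` and `AutoModel` classes inspect the checkpoint name and instantiate the matching architecture. A minimal usage sketch assembled from the docstring examples in this diff (the `encode` call mirrors the BERT examples below):

```python
import torch
from pytorch_transformers import AutoTokenizer, AutoModel

# 'bert-base-uncased' contains 'bert', so the Auto classes dispatch
# to BertTokenizer and BertModel behind the scenes.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')
model.eval()

input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
with torch.no_grad():
    outputs = model(input_ids)
last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
```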
@@ -17,16 +17,16 @@ xlm_start_docstring = """
 Example:
 # Load the tokenizer
->>> import torch
+import torch
->>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlmTokenizer', 'xlm-mlm-en-2048')
+tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlmTokenizer', 'xlm-mlm-en-2048')
 # Prepare tokenized input
->>> text_1 = "Who was Jim Henson ?"
+text_1 = "Who was Jim Henson ?"
->>> text_2 = "Jim Henson was a puppeteer"
+text_2 = "Jim Henson was a puppeteer"
->>> indexed_tokens_1 = tokenizer.encode(text_1)
+indexed_tokens_1 = tokenizer.encode(text_1)
->>> indexed_tokens_2 = tokenizer.encode(text_2)
+indexed_tokens_2 = tokenizer.encode(text_2)
->>> tokens_tensor_1 = torch.tensor([indexed_tokens_1])
+tokens_tensor_1 = torch.tensor([indexed_tokens_1])
->>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
+tokens_tensor_2 = torch.tensor([indexed_tokens_2])
 """
 # A lot of models share the same param doc. Use a decorator
@@ -76,11 +76,11 @@ def xlmTokenizer(*args, **kwargs):
 Default: None
 Example:
->>> import torch
+import torch
->>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlmTokenizer', 'xlm-mlm-en-2048')
+tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlmTokenizer', 'xlm-mlm-en-2048')
->>> text = "Who was Jim Henson ?"
+text = "Who was Jim Henson ?"
->>> indexed_tokens = tokenizer.encode(tokenized_text)
+indexed_tokens = tokenizer.encode(text)
 """
 tokenizer = XLMTokenizer.from_pretrained(*args, **kwargs)
 return tokenizer
@@ -91,11 +91,11 @@ def xlmTokenizer(*args, **kwargs):
 def xlmModel(*args, **kwargs):
 """
 # Load xlmModel
->>> model = torch.hub.load('huggingface/pytorch-transformers', 'xlmModel', 'xlm-mlm-en-2048')
+model = torch.hub.load('huggingface/pytorch-transformers', 'xlmModel', 'xlm-mlm-en-2048')
->>> model.eval()
+model.eval()
 # Predict hidden states features for each layer
->>> with torch.no_grad():
+with torch.no_grad():
     hidden_states_1, mems = model(tokens_tensor_1)
     hidden_states_2, mems = model(tokens_tensor_2, past=mems)
 """
@@ -108,26 +108,26 @@ def xlmModel(*args, **kwargs):
 def xlmLMHeadModel(*args, **kwargs):
 """
 # Prepare tokenized input
->>> text_1 = "Who was Jim Henson ?"
+text_1 = "Who was Jim Henson ?"
->>> text_2 = "Jim Henson was a puppeteer"
+text_2 = "Jim Henson was a puppeteer"
->>> indexed_tokens_1 = tokenizer.encode(text_1)
+indexed_tokens_1 = tokenizer.encode(text_1)
->>> indexed_tokens_2 = tokenizer.encode(text_2)
+indexed_tokens_2 = tokenizer.encode(text_2)
->>> tokens_tensor_1 = torch.tensor([indexed_tokens_1])
+tokens_tensor_1 = torch.tensor([indexed_tokens_1])
->>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
+tokens_tensor_2 = torch.tensor([indexed_tokens_2])
 # Load xlmLMHeadModel
->>> model = torch.hub.load('huggingface/pytorch-transformers', 'xlmLMHeadModel', 'xlm-mlm-en-2048')
+model = torch.hub.load('huggingface/pytorch-transformers', 'xlmLMHeadModel', 'xlm-mlm-en-2048')
->>> model.eval()
+model.eval()
 # Predict hidden states features for each layer
->>> with torch.no_grad():
+with torch.no_grad():
     predictions_1, mems = model(tokens_tensor_1)
     predictions_2, mems = model(tokens_tensor_2, mems=mems)
 # Get the predicted last token
->>> predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
+predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
->>> predicted_token = tokenizer.decode([predicted_index])
+predicted_token = tokenizer.decode([predicted_index])
->>> assert predicted_token == ' who'
+assert predicted_token == ' who'
 """
 model = XLMWithLMHeadModel.from_pretrained(*args, **kwargs)
 return model
@@ -142,25 +142,25 @@ def xlmLMHeadModel(*args, **kwargs):
 # Example:
 # # Load the tokenizer
-# >>> import torch
+# import torch
-# >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlm-mlm-en-2048')
+# tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlm-mlm-en-2048')
 # # Prepare tokenized input
-# >>> text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
+# text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
-# >>> text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
+# text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
-# >>> tokenized_text1 = tokenizer.tokenize(text1)
+# tokenized_text1 = tokenizer.tokenize(text1)
-# >>> tokenized_text2 = tokenizer.tokenize(text2)
+# tokenized_text2 = tokenizer.tokenize(text2)
-# >>> indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
+# indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
-# >>> indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
+# indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
-# >>> tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
+# tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
-# >>> mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
+# mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
 # # Load xlnetForSequenceClassification
-# >>> model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetForSequenceClassification', 'xlm-mlm-en-2048')
+# model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetForSequenceClassification', 'xlm-mlm-en-2048')
-# >>> model.eval()
+# model.eval()
 # # Predict sequence classes logits
-# >>> with torch.no_grad():
+# with torch.no_grad():
 #     lm_logits, mems = model(tokens_tensor)
 # """
 # model = XLNetForSequenceClassification.from_pretrained(*args, **kwargs)
...
@@ -53,11 +53,11 @@ def xlnetTokenizer(*args, **kwargs):
 Default: None
 Example:
->>> import torch
+import torch
->>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
+tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
->>> text = "Who was Jim Henson ?"
+text = "Who was Jim Henson ?"
->>> indexed_tokens = tokenizer.encode(tokenized_text)
+indexed_tokens = tokenizer.encode(text)
 """
 tokenizer = XLNetTokenizer.from_pretrained(*args, **kwargs)
 return tokenizer
@@ -72,23 +72,23 @@ def xlnetModel(*args, **kwargs):
 Example:
 # Load the tokenizer
->>> import torch
+import torch
->>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
+tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
 # Prepare tokenized input
->>> text_1 = "Who was Jim Henson ?"
+text_1 = "Who was Jim Henson ?"
->>> text_2 = "Jim Henson was a puppeteer"
+text_2 = "Jim Henson was a puppeteer"
->>> indexed_tokens_1 = tokenizer.encode(text_1)
+indexed_tokens_1 = tokenizer.encode(text_1)
->>> indexed_tokens_2 = tokenizer.encode(text_2)
+indexed_tokens_2 = tokenizer.encode(text_2)
->>> tokens_tensor_1 = torch.tensor([indexed_tokens_1])
+tokens_tensor_1 = torch.tensor([indexed_tokens_1])
->>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
+tokens_tensor_2 = torch.tensor([indexed_tokens_2])
 # Load xlnetModel
->>> model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetModel', 'xlnet-large-cased')
+model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetModel', 'xlnet-large-cased')
->>> model.eval()
+model.eval()
 # Predict hidden states features for each layer
->>> with torch.no_grad():
+with torch.no_grad():
     hidden_states_1, mems = model(tokens_tensor_1)
     hidden_states_2, mems = model(tokens_tensor_2, mems=mems)
 """
@@ -106,30 +106,30 @@ def xlnetLMHeadModel(*args, **kwargs):
 Example:
 # Load the tokenizer
->>> import torch
+import torch
->>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
+tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
 # Prepare tokenized input
->>> text_1 = "Who was Jim Henson ?"
+text_1 = "Who was Jim Henson ?"
->>> text_2 = "Jim Henson was a puppeteer"
+text_2 = "Jim Henson was a puppeteer"
->>> indexed_tokens_1 = tokenizer.encode(text_1)
+indexed_tokens_1 = tokenizer.encode(text_1)
->>> indexed_tokens_2 = tokenizer.encode(text_2)
+indexed_tokens_2 = tokenizer.encode(text_2)
->>> tokens_tensor_1 = torch.tensor([indexed_tokens_1])
+tokens_tensor_1 = torch.tensor([indexed_tokens_1])
->>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
+tokens_tensor_2 = torch.tensor([indexed_tokens_2])
 # Load xlnetLMHeadModel
->>> model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetLMHeadModel', 'xlnet-large-cased')
+model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetLMHeadModel', 'xlnet-large-cased')
->>> model.eval()
+model.eval()
 # Predict hidden states features for each layer
->>> with torch.no_grad():
+with torch.no_grad():
     predictions_1, mems = model(tokens_tensor_1)
     predictions_2, mems = model(tokens_tensor_2, mems=mems)
 # Get the predicted last token
->>> predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
+predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
->>> predicted_token = tokenizer.decode([predicted_index])
+predicted_token = tokenizer.decode([predicted_index])
->>> assert predicted_token == ' who'
+assert predicted_token == ' who'
 """
 model = XLNetLMHeadModel.from_pretrained(*args, **kwargs)
 return model
@@ -144,25 +144,25 @@ def xlnetLMHeadModel(*args, **kwargs):
 # Example:
 # # Load the tokenizer
-# >>> import torch
+# import torch
-# >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
+# tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
 # # Prepare tokenized input
-# >>> text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
+# text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
-# >>> text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
+# text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
-# >>> tokenized_text1 = tokenizer.tokenize(text1)
+# tokenized_text1 = tokenizer.tokenize(text1)
-# >>> tokenized_text2 = tokenizer.tokenize(text2)
+# tokenized_text2 = tokenizer.tokenize(text2)
-# >>> indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
+# indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
-# >>> indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
+# indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
-# >>> tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
+# tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
-# >>> mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
+# mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
 # # Load xlnetForSequenceClassification
-# >>> model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetForSequenceClassification', 'xlnet-large-cased')
+# model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetForSequenceClassification', 'xlnet-large-cased')
-# >>> model.eval()
+# model.eval()
 # # Predict sequence classes logits
-# >>> with torch.no_grad():
+# with torch.no_grad():
 #     lm_logits, mems = model(tokens_tensor)
 # """
 # model = XLNetForSequenceClassification.from_pretrained(*args, **kwargs)
...
__version__ = "1.0.0" __version__ = "1.0.0"
from .tokenization_auto import AutoTokenizer
from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer
from .tokenization_openai import OpenAIGPTTokenizer from .tokenization_openai import OpenAIGPTTokenizer
from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus) from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
from .tokenization_gpt2 import GPT2Tokenizer from .tokenization_gpt2 import GPT2Tokenizer
from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
from .tokenization_xlm import XLMTokenizer from .tokenization_xlm import XLMTokenizer
from .tokenization_utils import (PreTrainedTokenizer, clean_up_tokenization) from .tokenization_utils import (PreTrainedTokenizer)
from .modeling_auto import (AutoConfig, AutoModel)
from .modeling_bert import (BertConfig, BertPreTrainedModel, BertModel, BertForPreTraining, from .modeling_bert import (BertConfig, BertPreTrainedModel, BertModel, BertForPreTraining,
BertForMaskedLM, BertForNextSentencePrediction, BertForMaskedLM, BertForNextSentencePrediction,
@@ -39,4 +42,4 @@ from .modeling_utils import (WEIGHTS_NAME, CONFIG_NAME, TF_WEIGHTS_NAME,
 from .optimization import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, WarmupCosineSchedule,
                            WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule)
-from .file_utils import (PYTORCH_PRETRAINED_BERT_CACHE, cached_path)
+from .file_utils import (PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE, cached_path)
@@ -20,7 +20,7 @@ import argparse
 import torch
 import numpy as np
 import tensorflow as tf
-from pytorch_pretrained_bert.modeling import BertModel
+from pytorch_transformers.modeling import BertModel
 def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:str):
...
@@ -38,10 +38,13 @@ except ImportError:
 try:
     from pathlib import Path
     PYTORCH_PRETRAINED_BERT_CACHE = Path(
-        os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', default_cache_path))
+        os.getenv('PYTORCH_TRANSFORMERS_CACHE', os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', default_cache_path)))
 except (AttributeError, ImportError):
-    PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
-                                              default_cache_path)
+    PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_TRANSFORMERS_CACHE',
+                                              os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
+                                                        default_cache_path))
+PYTORCH_TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE  # Kept for backward compatibility
 logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
@@ -70,7 +73,7 @@ def filename_to_url(filename, cache_dir=None):
     Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist.
     """
     if cache_dir is None:
-        cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
+        cache_dir = PYTORCH_TRANSFORMERS_CACHE
     if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
         cache_dir = str(cache_dir)
@@ -98,7 +101,7 @@ def cached_path(url_or_filename, cache_dir=None):
     make sure the file exists and then return the path.
     """
     if cache_dir is None:
-        cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
+        cache_dir = PYTORCH_TRANSFORMERS_CACHE
     if sys.version_info[0] == 3 and isinstance(url_or_filename, Path):
         url_or_filename = str(url_or_filename)
     if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
@@ -187,7 +190,7 @@ def get_from_cache(url, cache_dir=None):
     If it's not there, download it. Then return the path to the cached file.
     """
     if cache_dir is None:
-        cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
+        cache_dir = PYTORCH_TRANSFORMERS_CACHE
     if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
         cache_dir = str(cache_dir)
     if sys.version_info[0] == 2 and not isinstance(cache_dir, str):
...
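
The cache logic above now resolves `PYTORCH_TRANSFORMERS_CACHE` first and only falls back to the legacy `PYTORCH_PRETRAINED_BERT_CACHE`. A small sketch of the precedence, with hypothetical paths:

```python
import os

# Hypothetical paths, purely to illustrate the lookup order implemented above.
os.environ['PYTORCH_PRETRAINED_BERT_CACHE'] = '/tmp/legacy_cache'
os.environ['PYTORCH_TRANSFORMERS_CACHE'] = '/tmp/new_cache'

default_cache_path = '/tmp/default_cache'  # stand-in for the library default
cache_dir = os.getenv('PYTORCH_TRANSFORMERS_CACHE',
                      os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', default_cache_path))
assert cache_dir == '/tmp/new_cache'  # the new variable wins when both are set
```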
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Auto Model class. """
from __future__ import absolute_import, division, print_function, unicode_literals
import logging
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss
from torch.nn.parameter import Parameter
from .modeling_bert import BertConfig, BertModel
from .modeling_openai import OpenAIGPTConfig, OpenAIGPTModel
from .modeling_gpt2 import GPT2Config, GPT2Model
from .modeling_transfo_xl import TransfoXLConfig, TransfoXLModel
from .modeling_xlnet import XLNetConfig, XLNetModel
from .modeling_xlm import XLMConfig, XLMModel
from .modeling_utils import PreTrainedModel, SequenceSummary
logger = logging.getLogger(__name__)
class AutoConfig(object):
r""":class:`~pytorch_transformers.AutoConfig` is a generic configuration class
that will be instantiated as one of the configuration classes of the library
when created with the `AutoConfig.from_pretrained(pretrained_model_name_or_path)`
class method.
The `from_pretrained()` method takes care of returning the correct configuration class instance
by pattern matching on the `pretrained_model_name_or_path` string.
The configuration class to instantiate is selected by the first matching pattern
in the `pretrained_model_name_or_path` string (tried in the following order):
- contains `bert`: BertConfig (Bert model)
- contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
- contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
- contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
- contains `xlnet`: XLNetConfig (XLNet model)
- contains `xlm`: XLMConfig (XLM model)
This class cannot be instantiated using `__init__()` (doing so raises an error).
"""
def __init__(self):
raise EnvironmentError("AutoConfig is designed to be instantiated "
"using the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` method.")
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
r""" Instantiate a one of the configuration classes of the library
from a pre-trained model configuration.
The configuration class to instantiate is selected by the first matching pattern
in the `pretrained_model_name_or_path` string (tried in the following order):
- contains `bert`: BertConfig (Bert model)
- contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
- contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
- contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
- contains `xlnet`: XLNetConfig (XLNet model)
- contains `xlm`: XLMConfig (XLM model)
Params:
**pretrained_model_name_or_path**: either:
- a string with the `shortcut name` of a pre-trained model configuration to load from cache
or download and cache if not already stored in cache (e.g. 'bert-base-uncased').
- a path to a `directory` containing a configuration file saved
using the `save_pretrained(save_directory)` method.
- a path or url to a saved configuration `file`.
**cache_dir**: (`optional`) string:
Path to a directory in which a downloaded pre-trained model
configuration should be cached if the standard cache should not be used.
**return_unused_kwargs**: (`optional`) bool:
- If False, then this function returns just the final configuration object.
- If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs`
is a dictionary consisting of the key/value pairs whose keys are not configuration attributes:
i.e. the part of kwargs which has not been used to update `config` and is otherwise ignored.
**kwargs**: (`optional`) dict:
Dictionary of key/value pairs with which to update the configuration object after loading.
- The values in kwargs of any keys which are configuration attributes will be used
to override the loaded values.
- Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled
by the `return_unused_kwargs` keyword parameter.
Examples::
config = AutoConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache.
config = AutoConfig.from_pretrained('./test/bert_saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
config = AutoConfig.from_pretrained('./test/bert_saved_model/my_configuration.json')
config = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False)
assert config.output_attention == True
config, unused_kwargs = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True,
foo=False, return_unused_kwargs=True)
assert config.output_attention == True
assert unused_kwargs == {'foo': False}
"""
if 'bert' in pretrained_model_name_or_path:
return BertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
elif 'openai-gpt' in pretrained_model_name_or_path:
return OpenAIGPTConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
elif 'gpt2' in pretrained_model_name_or_path:
return GPT2Config.from_pretrained(pretrained_model_name_or_path, **kwargs)
elif 'transfo-xl' in pretrained_model_name_or_path:
return TransfoXLConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
elif 'xlnet' in pretrained_model_name_or_path:
return XLNetConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
elif 'xlm' in pretrained_model_name_or_path:
return XLMConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
"'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
"'xlm'".format(pretrained_model_name_or_path))
class AutoModel(object):
r"""
:class:`~pytorch_transformers.AutoModel` is a generic model class
that will be instantiated as one of the base model classes of the library
when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)`
class method.
The `from_pretrained()` method takes care of returning the correct model class instance
by pattern matching on the `pretrained_model_name_or_path` string.
The base model class to instantiate is selected by the first matching pattern
in the `pretrained_model_name_or_path` string (tried in the following order):
- contains `bert`: BertModel (Bert model)
- contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
- contains `gpt2`: GPT2Model (OpenAI GPT-2 model)
- contains `transfo-xl`: TransfoXLModel (Transformer-XL model)
- contains `xlnet`: XLNetModel (XLNet model)
- contains `xlm`: XLMModel (XLM model)
This class cannot be instantiated using `__init__()` (doing so raises an error).
"""
def __init__(self):
raise EnvironmentError("AutoModel is designed to be instantiated "
"using the `AutoModel.from_pretrained(pretrained_model_name_or_path)` method.")
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
r""" Instantiate a one of the base model classes of the library
from a pre-trained model configuration.
The base model class to instantiate is selected as the first pattern matching
in the `pretrained_model_name_or_path` string (in the following order):
- contains `bert`: BertConfig (Bert model)
- contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
- contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
- contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
- contains `xlnet`: XLNetConfig (XLNet model)
- contains `xlm`: XLMConfig (XLM model)
The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
To train the model, you should first set it back in training mode with `model.train()`
Params:
**pretrained_model_name_or_path**: either:
- a string with the `shortcut name` of a pre-trained model to load from cache
or download and cache if not already stored in cache (e.g. 'bert-base-uncased').
- a path to a `directory` containing a configuration file saved
using the `save_pretrained(save_directory)` method.
- a path or url to a tensorflow index checkpoint `file` (e.g. `./tf_model/model.ckpt.index`).
In this case, ``from_tf`` should be set to True and a configuration object should be
provided as `config` argument. This loading option is slower than converting the TensorFlow
checkpoint into a PyTorch model using the provided conversion scripts and loading
the PyTorch model afterwards.
**model_args**: (`optional`) Sequence:
All remaining positional arguments will be passed to the underlying model's __init__ function
**config**: an optional configuration for the model to use instead of an automatically loaded configuration.
Configuration can be automatically loaded when:
- the model is a model provided by the library (loaded with a `shortcut name` of a pre-trained model), or
- the model was saved using the `save_pretrained(save_directory)` method (loaded by supplying the save directory).
**state_dict**: an optional state dictionary for the model to use instead of a state dictionary loaded
from saved weights file.
This option can be used if you want to create a model from a pretrained configuration but load your own weights.
In this case though, you should check if using `save_pretrained(dir)` and `from_pretrained(save_directory)` is not
a simpler option.
**cache_dir**: (`optional`) string:
Path to a directory in which a downloaded pre-trained model
configuration should be cached if the standard cache should not be used.
**output_loading_info**: (`optional`) boolean:
Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages.
**kwargs**: (`optional`) dict:
Dictionary of key/value pairs with which to update the configuration object after loading.
Can be used to override selected configuration parameters. E.g. ``output_attention=True``.
- If a configuration is provided with `config`, **kwargs will be directly passed
to the underlying model's __init__ method.
- If a configuration is not provided, **kwargs will be first passed to the pretrained
model configuration class loading function (`PretrainedConfig.from_pretrained`).
Each key of **kwargs that corresponds to a configuration attribute
will be used to override said attribute with the supplied **kwargs value.
Remaining keys that do not correspond to any configuration attribute will
be passed to the underlying model's __init__ function.
Examples::
model = AutoModel.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache.
model = AutoModel.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')`
model = AutoModel.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading
assert model.config.output_attention == True
# Loading from a TF checkpoint file instead of a PyTorch model (slower)
config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
"""
if 'bert' in pretrained_model_name_or_path:
return BertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'openai-gpt' in pretrained_model_name_or_path:
return OpenAIGPTModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'gpt2' in pretrained_model_name_or_path:
return GPT2Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'transfo-xl' in pretrained_model_name_or_path:
return TransfoXLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlnet' in pretrained_model_name_or_path:
return XLNetModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlm' in pretrained_model_name_or_path:
return XLMModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
"'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
"'xlm'".format(pretrained_model_name_or_path))
@@ -643,12 +643,12 @@ class BertModel(BertPreTrainedModel):
 Examples::
->>> config = BertConfig.from_pretrained('bert-base-uncased')
+config = BertConfig.from_pretrained('bert-base-uncased')
->>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
->>> model = BertModel(config)
+model = BertModel(config)
->>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
->>> outputs = model(input_ids)
+outputs = model(input_ids)
->>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 """
 def __init__(self, config):
@@ -754,13 +754,13 @@ class BertForPreTraining(BertPreTrainedModel):
 Examples::
->>> config = BertConfig.from_pretrained('bert-base-uncased')
+config = BertConfig.from_pretrained('bert-base-uncased')
->>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
->>>
+
->>> model = BertForPreTraining(config)
+model = BertForPreTraining(config)
->>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
->>> outputs = model(input_ids)
+outputs = model(input_ids)
->>> prediction_scores, seq_relationship_scores = outputs[:2]
+prediction_scores, seq_relationship_scores = outputs[:2]
 """
 def __init__(self, config):
@@ -824,13 +824,13 @@ class BertForMaskedLM(BertPreTrainedModel):
 Examples::
->>> config = BertConfig.from_pretrained('bert-base-uncased')
+config = BertConfig.from_pretrained('bert-base-uncased')
->>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
->>>
+
->>> model = BertForMaskedLM(config)
+model = BertForMaskedLM(config)
->>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
->>> outputs = model(input_ids, masked_lm_labels=input_ids)
+outputs = model(input_ids, masked_lm_labels=input_ids)
->>> loss, prediction_scores = outputs[:2]
+loss, prediction_scores = outputs[:2]
 """
 def __init__(self, config):
@@ -891,13 +891,13 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
 Examples::
->>> config = BertConfig.from_pretrained('bert-base-uncased')
+config = BertConfig.from_pretrained('bert-base-uncased')
->>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
->>>
+
->>> model = BertForNextSentencePrediction(config)
+model = BertForNextSentencePrediction(config)
->>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
->>> outputs = model(input_ids)
+outputs = model(input_ids)
->>> seq_relationship_scores = outputs[0]
+seq_relationship_scores = outputs[0]
 """
 def __init__(self, config):
@@ -951,14 +951,14 @@ class BertForSequenceClassification(BertPreTrainedModel):
 Examples::
->>> config = BertConfig.from_pretrained('bert-base-uncased')
+config = BertConfig.from_pretrained('bert-base-uncased')
->>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
->>>
+
->>> model = BertForSequenceClassification(config)
+model = BertForSequenceClassification(config)
->>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
->>> labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
+labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
->>> outputs = model(input_ids, labels=labels)
+outputs = model(input_ids, labels=labels)
->>> loss, logits = outputs[:2]
+loss, logits = outputs[:2]
 """
 def __init__(self, config):
@@ -1057,15 +1057,15 @@ class BertForMultipleChoice(BertPreTrainedModel):
 Examples::
->>> config = BertConfig.from_pretrained('bert-base-uncased')
+config = BertConfig.from_pretrained('bert-base-uncased')
->>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
->>>
+
->>> model = BertForMultipleChoice(config)
+model = BertForMultipleChoice(config)
->>> choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
+choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
->>> input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
+input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
->>> labels = torch.tensor(1).unsqueeze(0)  # Batch size 1
+labels = torch.tensor(1).unsqueeze(0)  # Batch size 1
->>> outputs = model(input_ids, labels=labels)
+outputs = model(input_ids, labels=labels)
->>> loss, classification_scores = outputs[:2]
+loss, classification_scores = outputs[:2]
 """
 def __init__(self, config):
@@ -1127,14 +1127,14 @@ class BertForTokenClassification(BertPreTrainedModel):
 Examples::
->>> config = BertConfig.from_pretrained('bert-base-uncased')
+config = BertConfig.from_pretrained('bert-base-uncased')
->>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
->>>
+
->>> model = BertForTokenClassification(config)
+model = BertForTokenClassification(config)
->>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
->>> labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
+labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
->>> outputs = model(input_ids, labels=labels)
+outputs = model(input_ids, labels=labels)
->>> loss, scores = outputs[:2]
+loss, scores = outputs[:2]
 """
 def __init__(self, config):
@@ -1203,15 +1203,15 @@ class BertForQuestionAnswering(BertPreTrainedModel):
 Examples::
->>> config = BertConfig.from_pretrained('bert-base-uncased')
+config = BertConfig.from_pretrained('bert-base-uncased')
->>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
->>>
+
->>> model = BertForQuestionAnswering(config)
+model = BertForQuestionAnswering(config)
->>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
->>> start_positions = torch.tensor([1])
+start_positions = torch.tensor([1])
->>> end_positions = torch.tensor([3])
+end_positions = torch.tensor([3])
->>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
+outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
->>> loss, start_scores, end_scores = outputs[:2]
+loss, start_scores, end_scores = outputs[:3]
 """
 def __init__(self, config):
...
@@ -137,7 +137,7 @@ class GPT2Config(PretrainedConfig):
 initializer_range=0.02,
 num_labels=1,
-summary_type='token_ids',
+summary_type='cls_index',
 summary_use_proj=True,
 summary_activation=None,
 summary_proj_to_labels=True,
@@ -433,12 +433,12 @@ class GPT2Model(GPT2PreTrainedModel):
 Examples::
->>> config = GPT2Config.from_pretrained('gpt2')
+config = GPT2Config.from_pretrained('gpt2')
->>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
->>> model = GPT2Model(config)
+model = GPT2Model(config)
->>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
->>> outputs = model(input_ids)
+outputs = model(input_ids)
->>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 """
 def __init__(self, config):
@@ -567,12 +567,12 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
 Examples::
->>> config = GPT2Config.from_pretrained('gpt2')
+config = GPT2Config.from_pretrained('gpt2')
->>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
->>> model = GPT2LMHeadModel(config)
+model = GPT2LMHeadModel(config)
->>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
->>> outputs = model(input_ids, labels=input_ids)
+outputs = model(input_ids, labels=input_ids)
->>> loss, logits = outputs[:2]
+loss, logits = outputs[:2]
 """
 def __init__(self, config):
@@ -683,14 +683,14 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
 Examples::
->>> config = GPT2Config.from_pretrained('gpt2')
+config = GPT2Config.from_pretrained('gpt2')
->>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
->>> model = GPT2DoubleHeadsModel(config)
+model = GPT2DoubleHeadsModel(config)
->>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]  # Assume you've added [CLS] to the vocabulary
+choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]  # Assume you've added [CLS] to the vocabulary
->>> input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
+input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
->>> mc_token_ids = torch.tensor([-1, -1]).unsqueeze(0)  # Batch size 1
+mc_token_ids = torch.tensor([-1, -1]).unsqueeze(0)  # Batch size 1
->>> outputs = model(input_ids, mc_token_ids)
+outputs = model(input_ids, mc_token_ids)
->>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
+lm_prediction_scores, mc_prediction_scores = outputs[:2]
 """
 def __init__(self, config):
...
@@ -171,7 +171,7 @@ class OpenAIGPTConfig(PretrainedConfig):
 predict_special_tokens=True,
 num_labels=1,
-summary_type='token_ids',
+summary_type='cls_index',
 summary_use_proj=True,
 summary_activation=None,
 summary_proj_to_labels=True,
@@ -439,12 +439,12 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
 Examples::
->>> config = OpenAIGPTConfig.from_pretrained('openai-gpt')
+config = OpenAIGPTConfig.from_pretrained('openai-gpt')
->>> tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
+tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
->>> model = OpenAIGPTModel(config)
+model = OpenAIGPTModel(config)
->>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
->>> outputs = model(input_ids)
+outputs = model(input_ids)
->>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 """
 def __init__(self, config):
@@ -558,12 +558,12 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
 Examples::
->>> config = OpenAIGPTConfig.from_pretrained('openai-gpt')
+config = OpenAIGPTConfig.from_pretrained('openai-gpt')
->>> tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
+tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
->>> model = OpenAIGPTLMHeadModel(config)
+model = OpenAIGPTLMHeadModel(config)
->>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
->>> outputs = model(input_ids, labels=input_ids)
+outputs = model(input_ids, labels=input_ids)
->>> loss, logits = outputs[:2]
+loss, logits = outputs[:2]
 """
 def __init__(self, config):
@@ -665,14 +665,14 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
 Examples::
->>> config = OpenAIGPTConfig.from_pretrained('openai-gpt')
+config = OpenAIGPTConfig.from_pretrained('openai-gpt')
->>> tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
+tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
->>> model = OpenAIGPTDoubleHeadsModel(config)
+model = OpenAIGPTDoubleHeadsModel(config)
->>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]  # Assume you've added [CLS] to the vocabulary
+choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]  # Assume you've added [CLS] to the vocabulary
->>> input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
+input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
->>> mc_token_ids = torch.tensor([-1, -1]).unsqueeze(0)  # Batch size 1
+mc_token_ids = torch.tensor([-1, -1]).unsqueeze(0)  # Batch size 1
->>> outputs = model(input_ids, mc_token_ids)
+outputs = model(input_ids, mc_token_ids)
->>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
+lm_prediction_scores, mc_prediction_scores = outputs[:2]
 """
 def __init__(self, config):
...
@@ -968,12 +968,12 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
 Examples::
->>> config = TransfoXLConfig.from_pretrained('transfo-xl-wt103')
+config = TransfoXLConfig.from_pretrained('transfo-xl-wt103')
->>> tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
+tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
->>> model = TransfoXLModel(config)
+model = TransfoXLModel(config)
->>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
->>> outputs = model(input_ids)
+outputs = model(input_ids)
->>> last_hidden_states, mems = outputs[:2]
+last_hidden_states, mems = outputs[:2]
 """
 def __init__(self, config):
@@ -1284,12 +1284,12 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
 Examples::
->>> config = TransfoXLConfig.from_pretrained('transfo-xl-wt103')
+config = TransfoXLConfig.from_pretrained('transfo-xl-wt103')
->>> tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
+tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
->>> model = TransfoXLLMHeadModel(config)
+model = TransfoXLLMHeadModel(config)
->>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
->>> outputs = model(input_ids)
+outputs = model(input_ids)
->>> prediction_scores, mems = outputs[:2]
+prediction_scores, mems = outputs[:2]
 """
 def __init__(self, config):
...
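
The Transformer-XL examples return `mems` next to the scores; feeding them back into the next forward pass is what lets the model attend across segments. A minimal sketch built from the docstring examples above:

```python
import torch
from pytorch_transformers import TransfoXLTokenizer, TransfoXLLMHeadModel

tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
model.eval()

tokens_tensor_1 = torch.tensor([tokenizer.encode("Who was Jim Henson ?")])
tokens_tensor_2 = torch.tensor([tokenizer.encode("Jim Henson was a puppeteer")])

with torch.no_grad():
    prediction_scores_1, mems = model(tokens_tensor_1)[:2]            # first segment, no memories yet
    prediction_scores_2, mems = model(tokens_tensor_2, mems=mems)[:2]  # reuse memories from segment 1
```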
@@ -68,8 +68,18 @@ else:
 class PretrainedConfig(object):
-""" Base class for all configuration classes.
+r""" Base class for all configuration classes.
-    Handle a few common parameters and methods for loading/downloading/saving configurations.
+    Handles a few parameters common to all models' configurations as well as methods for loading/downloading/saving configurations.
+
+    Class attributes (overridden by derived classes):
+        - ``pretrained_config_archive_map``: a python ``dict`` with `short-cut-names` (string) as keys and `url` (string) of associated pretrained model configurations as values.
+
+    Parameters:
+        ``finetuning_task``: string, default `None`. Name of the task used to fine-tune the model. This can be used when converting from an original (TensorFlow or PyTorch) checkpoint.
+        ``num_labels``: integer, default `2`. Number of classes to use when the model is a classification model (sequences/tokens).
+        ``output_attentions``: boolean, default `False`. Should the model return attention weights.
+        ``output_hidden_states``: boolean, default `False`. Should the model return all hidden states.
+        ``torchscript``: boolean, default `False`. Whether the model will be used with TorchScript.
 """
 pretrained_config_archive_map = {}
@@ -81,8 +91,8 @@ class PretrainedConfig(object):
 self.torchscript = kwargs.pop('torchscript', False)
 def save_pretrained(self, save_directory):
-""" Save a configuration object to a directory, so that it
+""" Save a configuration object to the directory `save_directory`, so that it
-can be re-loaded using the `from_pretrained(save_directory)` class method.
+can be re-loaded using the :func:`~pytorch_transformers.PretrainedConfig.from_pretrained` class method.
 """
 assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"
@@ -93,41 +103,42 @@ class PretrainedConfig(object):
 @classmethod
 def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
-r""" Instantiate a PretrainedConfig from a pre-trained model configuration.
+r""" Instantiate a :class:`~pytorch_transformers.PretrainedConfig` (or a derived class) from a pre-trained model configuration.
-Params:
+Parameters:
-    **pretrained_model_name_or_path**: either:
+    pretrained_model_name_or_path: either:
-        - a string with the `shortcut name` of a pre-trained model configuration to load from cache
-          or download and cache if not already stored in cache (e.g. 'bert-base-uncased').
-        - a path to a `directory` containing a configuration file saved
-          using the `save_pretrained(save_directory)` method.
-        - a path or url to a saved configuration `file`.
+        - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
+        - a path to a `directory` containing a configuration file saved using the :func:`~pytorch_transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
+        - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
-    **cache_dir**: (`optional`) string:
+    cache_dir: (`optional`) string:
         Path to a directory in which a downloaded pre-trained model
         configuration should be cached if the standard cache should not be used.
+    kwargs: (`optional`) dict: key/value pairs with which to update the configuration object after loading.
+        - The values in kwargs of any keys which are configuration attributes will be used to override the loaded values.
+        - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter.
-    **return_unused_kwargs**: (`optional`) bool:
+    return_unused_kwargs: (`optional`) bool:
         - If False, then this function returns just the final configuration object.
-        - If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs`
-          is a dictionary consisting of the key/value pairs whose keys are not configuration attributes:
-          ie the part of kwargs which has not been used to update `config` and is otherwise ignored.
+        - If True, then this function returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes, i.e. the part of kwargs which has not been used to update `config` and is otherwise ignored.
-    **kwargs**: (`optional`) dict:
-        Dictionary of key/value pairs with which to update the configuration object after loading.
-        - The values in kwargs of any keys which are configuration attributes will be used
-          to override the loaded values.
-        - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled
-          by the `return_unused_kwargs` keyword parameter.
 Examples::
-    >>> config = BertConfig.from_pretrained('bert-base-uncased')  # Download configuration from S3 and cache.
-    >>> config = BertConfig.from_pretrained('./test/saved_model/')  # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
-    >>> config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json')
-    >>> config = BertConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False)
-    >>> assert config.output_attention == True
-    >>> config, unused_kwargs = BertConfig.from_pretrained('bert-base-uncased', output_attention=True,
-    >>>                                                    foo=False, return_unused_kwargs=True)
-    >>> assert config.output_attention == True
-    >>> assert unused_kwargs == {'foo': False}
+    # We can't instantiate the base class `PretrainedConfig` directly, so the examples use a
+    # derived class: BertConfig
+    config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
+    config = BertConfig.from_pretrained('./test/saved_model/')  # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
+    config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json')
+    config = BertConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False)
+    assert config.output_attention == True
+    config, unused_kwargs = BertConfig.from_pretrained('bert-base-uncased', output_attention=True,
+                                                       foo=False, return_unused_kwargs=True)
+    assert config.output_attention == True
+    assert unused_kwargs == {'foo': False}
 """
 cache_dir = kwargs.pop('cache_dir', None)
class PreTrainedModel(nn.Module):
    r""" Base class for all models.

        :class:`~pytorch_transformers.PreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models
        as well as a few methods common to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention modules.

        Class attributes (overridden by derived classes):
            - ``config_class``: a class derived from :class:`~pytorch_transformers.PretrainedConfig` to use as configuration class for this model architecture.
            - ``pretrained_model_archive_map``: a python ``dict`` with `short-cut-names` (string) as keys and `url` (string) of associated pretrained weights as values.
            - ``load_tf_weights``: a python ``method`` for loading a TensorFlow checkpoint in a PyTorch model, taking as arguments:

                - ``model``: an instance of the relevant subclass of :class:`~pytorch_transformers.PreTrainedModel`,
                - ``config``: an instance of the relevant subclass of :class:`~pytorch_transformers.PretrainedConfig`,
                - ``path``: a path (string) to the TensorFlow checkpoint.

            - ``base_model_prefix``: a string indicating the attribute associated to the base model in derived classes of the same architecture adding modules on top of the base model.
    """
    config_class = None
    pretrained_model_archive_map = {}
    load_tf_weights = lambda model, config, path: None
    base_model_prefix = ""
    input_embeddings = None
    def __init__(self, config, *inputs, **kwargs):
        super(PreTrainedModel, self).__init__()
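For reference, each architecture wires these class attributes up in a small base class of its own. A rough sketch following the Bert implementation in this repository (the archive map is the same one imported by the tests further down; `load_tf_weights_in_bert` is the TF loader defined in `modeling_bert`):

    from pytorch_transformers.modeling_bert import (BertConfig, load_tf_weights_in_bert,
                                                    BERT_PRETRAINED_MODEL_ARCHIVE_MAP)

    class BertPreTrainedModel(PreTrainedModel):
        # Point the generic machinery at the Bert-specific pieces.
        config_class = BertConfig
        pretrained_model_archive_map = BERT_PRETRAINED_MODEL_ARCHIVE_MAP
        load_tf_weights = load_tf_weights_in_bert
        base_model_prefix = "bert"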
@@ -282,17 +305,16 @@ class PreTrainedModel(nn.Module):
    def resize_token_embeddings(self, new_num_tokens=None):
        """ Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size.
            Take care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.

            Arguments:

                new_num_tokens: (`optional`) int:
                    New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at the end. Reducing the size will remove vectors from the end.
                    If not provided or None: does nothing and just returns a pointer to the input tokens ``torch.nn.Embedding`` Module of the model.

            Return: ``torch.nn.Embedding``
                Pointer to the input tokens Embedding Module of the model
        """
        base_model = getattr(self, self.base_model_prefix, self)  # get the base model if needed
        model_embeds = base_model._resize_token_embeddings(new_num_tokens)
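A typical call pattern, sketched with the Bert names used elsewhere in this diff (the added token strings are illustrative):

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    num_added = tokenizer.add_tokens(['new_tok1', 'new_tok2'])  # hypothetical new tokens
    model.resize_token_embeddings(len(tokenizer))  # grow the embedding matrix to cover the added tokens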
@@ -311,15 +333,17 @@ class PreTrainedModel(nn.Module):
    def prune_heads(self, heads_to_prune):
        """ Prunes heads of the base model.

            Arguments:

                heads_to_prune: dict with keys being selected layer indices (`int`) and associated values being the list of heads to prune in said layer (list of `int`).
        """
        base_model = getattr(self, self.base_model_prefix, self)  # get the base model if needed
        base_model._prune_heads(heads_to_prune)
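For example, a minimal sketch (the layer and head indices are illustrative):

    model = BertModel.from_pretrained('bert-base-uncased')
    model.prune_heads({0: [0, 2], 2: [1]})  # prune heads 0 and 2 in layer 0, head 1 in layer 2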
    def save_pretrained(self, save_directory):
        """ Save a model and its configuration file to a directory, so that it
            can be re-loaded using the :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` class method.
        """
        assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"
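The round trip is then simply (the directory name is illustrative):

    model.save_pretrained('./my_model_directory/')  # writes the weights and config.json
    model = BertModel.from_pretrained('./my_model_directory/')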
@@ -338,58 +362,53 @@ class PreTrainedModel(nn.Module):
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        r"""Instantiate a pretrained pytorch model from a pre-trained model configuration.

        The model is set in evaluation mode by default using ``model.eval()`` (Dropout modules are deactivated).
        To train the model, you should first set it back in training mode with ``model.train()``.

        Parameters:

            pretrained_model_name_or_path: either:

                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
                - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.

            model_args: (`optional`) Sequence of positional arguments:
                All remaining positional arguments will be passed to the underlying model's ``__init__`` method.

            config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
                Configuration for the model to use instead of an automatically loaded configuration. The configuration can be automatically loaded when:

                - the model is a model provided by the library (loaded with the `shortcut name` string of a pretrained model), or
                - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory, or
                - the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.

            state_dict: (`optional`) dict:
                An optional state dictionary for the model to use instead of a state dictionary loaded from a saved weights file.
                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
                In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.

            cache_dir: (`optional`) string:
                Path to a directory in which a downloaded pre-trained model
                configuration should be cached if the standard cache should not be used.

            output_loading_info: (`optional`) boolean:
                Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages.

            kwargs: (`optional`) Remaining dictionary of keyword arguments:
                Can be used to update the configuration object (after it has been loaded) and to initialize the model (e.g. ``output_attention=True``). These behave differently depending on whether a ``config`` is provided or automatically loaded:

                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done).
                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
        Examples::

            model = BertModel.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
            model = BertModel.from_pretrained('./test/saved_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
            model = BertModel.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
            assert model.config.output_attention == True
            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
            config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json')
            model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config)

        """
        config = kwargs.pop('config', None)
@@ -760,7 +779,7 @@ class SequenceSummary(nn.Module):
            - 'last' => [default] take the last token hidden state (like XLNet)
            - 'first' => take the first token hidden state (like Bert)
            - 'mean' => take the mean of all tokens hidden states
            - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
            - 'attn' => Not implemented now, use multi-head attention

        summary_use_proj: Add a projection after the vector extraction
        summary_proj_to_labels: If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
@@ -772,7 +791,7 @@ class SequenceSummary(nn.Module):
        super(SequenceSummary, self).__init__()

        self.summary_type = config.summary_type if hasattr(config, 'summary_type') else 'last'
        if self.summary_type == 'attn':
            # We should use a standard multi-head attention module with absolute positional embedding for that.
            # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
            # We can probably just use the multi-head attention module of PyTorch >=1.1.0
@@ -798,11 +817,11 @@ class SequenceSummary(nn.Module):
        if hasattr(config, 'summary_last_dropout') and config.summary_last_dropout > 0:
            self.last_dropout = nn.Dropout(config.summary_last_dropout)

    def forward(self, hidden_states, cls_index=None):
        """ hidden_states: float Tensor in shape [bsz, seq_len, hidden_size], the hidden-states of the last layer.
            cls_index: [optional] position of the classification token if summary_type == 'cls_index',
                shape (bsz,) or more generally (bsz, ...) where ... are optional leading dimensions of hidden_states.
                if summary_type == 'cls_index' and cls_index is None:
                    we take the last token of the sequence as classification token
        """
        if self.summary_type == 'last':
@@ -811,14 +830,14 @@ class SequenceSummary(nn.Module):
            output = hidden_states[:, 0]
        elif self.summary_type == 'mean':
            output = hidden_states.mean(dim=1)
        elif self.summary_type == 'cls_index':
            if cls_index is None:
                cls_index = torch.full_like(hidden_states[..., :1, :], hidden_states.shape[-2]-1, dtype=torch.long)
            else:
                cls_index = cls_index.unsqueeze(-1).unsqueeze(-1)
                cls_index = cls_index.expand((-1,) * (cls_index.dim()-1) + (hidden_states.size(-1),))
            # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states
            output = hidden_states.gather(-2, cls_index).squeeze(-2)  # shape (bsz, XX, hidden_size)
        elif self.summary_type == 'attn':
            raise NotImplementedError
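To see what the 'cls_index' branch computes, here is a standalone sketch of the same gather trick with toy shapes:

    import torch

    hidden_states = torch.randn(2, 5, 4)        # (bsz=2, seq_len=5, hidden_size=4)
    cls_index = torch.tensor([3, 1])            # position of the classification token per example
    idx = cls_index.unsqueeze(-1).unsqueeze(-1)         # (2, 1, 1)
    idx = idx.expand(-1, 1, hidden_states.size(-1))     # (2, 1, 4)
    output = hidden_states.gather(-2, idx).squeeze(-2)  # (2, 4): one hidden vector per example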
@@ -472,12 +472,12 @@ class XLMModel(XLMPreTrainedModel):
    Examples::

        config = XLMConfig.from_pretrained('xlm-mlm-en-2048')
        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')

        model = XLMModel(config)
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

    """
    ATTRIBUTES = ['encoder', 'eos_index', 'pad_index',  # 'with_output',
@@ -745,12 +745,12 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
    Examples::

        config = XLMConfig.from_pretrained('xlm-mlm-en-2048')
        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')

        model = XLMWithLMHeadModel(config)
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

    """
    def __init__(self, config):
@@ -805,14 +805,14 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
    Examples::

        config = XLMConfig.from_pretrained('xlm-mlm-en-2048')
        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')

        model = XLMForSequenceClassification(config)
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)
        loss, logits = outputs[:2]

    """
    def __init__(self, config):
@@ -885,15 +885,15 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
    Examples::

        config = XLMConfig.from_pretrained('xlm-mlm-en-2048')
        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')

        model = XLMForQuestionAnswering(config)
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
        start_positions = torch.tensor([1])
        end_positions = torch.tensor([3])
        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
        loss, start_scores, end_scores = outputs[:3]

    """
    def __init__(self, config):
@@ -712,12 +712,12 @@ class XLNetModel(XLNetPreTrainedModel):
    Examples::

        config = XLNetConfig.from_pretrained('xlnet-large-cased')
        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')

        model = XLNetModel(config)
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

    """
    def __init__(self, config):
@@ -1019,17 +1019,17 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
    Examples::

        config = XLNetConfig.from_pretrained('xlnet-large-cased')
        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')

        model = XLNetLMHeadModel(config)
        # We show how to setup inputs to predict a next token using a bi-directional context.
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very <mask>")).unsqueeze(0)  # We will predict the masked token
        perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float)
        perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
        target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float)  # Shape [1, 1, seq_length] => let's predict one token
        target_mapping[0, 0, -1] = 1.0  # Our first (and only) prediction will be the last token of the sequence (the masked token)
        outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping)
        next_token_logits = outputs[0]  # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]

    """
    def __init__(self, config):
@@ -1100,14 +1100,14 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
    Examples::

        config = XLNetConfig.from_pretrained('xlnet-large-cased')
        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')

        model = XLNetForSequenceClassification(config)
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)
        loss, logits = outputs[:2]

    """
    def __init__(self, config):
@@ -1200,15 +1200,15 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
    Examples::

        config = XLNetConfig.from_pretrained('xlnet-large-cased')
        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')

        model = XLNetForQuestionAnswering(config)
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
        start_positions = torch.tensor([1])
        end_positions = torch.tensor([3])
        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
        loss, start_scores, end_scores = outputs[:3]

    """
    def __init__(self, config):
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import shutil
import pytest
import logging
from pytorch_transformers import AutoConfig, BertConfig, AutoModel, BertModel
from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor)
class AutoModelTest(unittest.TestCase):
def test_model_from_pretrained(self):
logging.basicConfig(level=logging.INFO)
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
config = AutoConfig.from_pretrained(model_name)
self.assertIsNotNone(config)
self.assertIsInstance(config, BertConfig)
model = AutoModel.from_pretrained(model_name)
model, loading_info = AutoModel.from_pretrained(model_name, output_loading_info=True)
self.assertIsNotNone(model)
self.assertIsInstance(model, BertModel)
for value in loading_info.values():
self.assertEqual(len(value), 0)
if __name__ == "__main__":
unittest.main()
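Outside the test harness, the Auto classes added in this PR are used directly; a minimal sketch mirroring the assertions above:

    from pytorch_transformers import AutoConfig, AutoModel

    # The architecture (here Bert) is detected from the shortcut name.
    config = AutoConfig.from_pretrained('bert-base-uncased')
    model = AutoModel.from_pretrained('bert-base-uncased')

    # As in the test, output_loading_info=True also returns a loading report;
    # every value is an empty list when the checkpoint matches the architecture.
    model, loading_info = AutoModel.from_pretrained('bert-base-uncased', output_loading_info=True)
    assert all(len(v) == 0 for v in loading_info.values())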
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import shutil
import pytest
import logging
from pytorch_transformers import AutoTokenizer, BertTokenizer, GPT2Tokenizer
from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
from pytorch_transformers.modeling_gpt2 import GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
class AutoTokenizerTest(unittest.TestCase):
def test_tokenizer_from_pretrained(self):
logging.basicConfig(level=logging.INFO)
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
self.assertIsNotNone(tokenizer)
self.assertIsInstance(tokenizer, BertTokenizer)
self.assertGreater(len(tokenizer), 0)
for model_name in list(GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
self.assertIsNotNone(tokenizer)
self.assertIsInstance(tokenizer, GPT2Tokenizer)
self.assertGreater(len(tokenizer), 0)
if __name__ == "__main__":
unittest.main()
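The tokenizer side works the same way; a minimal sketch of what the test above exercises:

    from pytorch_transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')  # returns a BertTokenizer
    ids = tokenizer.encode("Who was Jim Henson ?")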
@@ -24,30 +24,37 @@ from pytorch_transformers.tokenization_bert import (BasicTokenizer,
                                                     _is_control, _is_punctuation,
                                                     _is_whitespace, VOCAB_FILES_NAMES)
from .tokenization_tests_commons import CommonTestCases


class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):

    tokenizer_class = BertTokenizer

    def setUp(self):
        super(BertTokenizationTest, self).setUp()

        vocab_tokens = [
            "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
            "##ing", ",", "low", "lowest",
        ]
        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
        with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer:
            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

    def get_tokenizer(self):
        return BertTokenizer.from_pretrained(self.tmpdirname)

    def get_input_output_texts(self):
        input_text = u"UNwant\u00E9d,running"
        output_text = u"unwanted, running"
        return input_text, output_text

    def test_full_tokenizer(self):
        tokenizer = BertTokenizer(self.vocab_file)

        tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
        self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
    def test_chinese(self):
        tokenizer = BasicTokenizer()
@@ -20,42 +20,49 @@ import json
from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer, VOCAB_FILES_NAMES

from .tokenization_tests_commons import CommonTestCases


class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester):

    tokenizer_class = GPT2Tokenizer

    def setUp(self):
        super(GPT2TokenizationTest, self).setUp()

        # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
        vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
                 "lo", "low", "er",
                 "low", "lowest", "newer", "wider", "<unk>"]
        vocab_tokens = dict(zip(vocab, range(len(vocab))))
        merges = ["#version: 0.2", "l o", "lo w", "e r", ""]
        self.special_tokens_map = {"unk_token": "<unk>"}

        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
        with open(self.vocab_file, "w") as fp:
            fp.write(json.dumps(vocab_tokens))
        with open(self.merges_file, "w") as fp:
            fp.write("\n".join(merges))

    def get_tokenizer(self):
        return GPT2Tokenizer.from_pretrained(self.tmpdirname, **self.special_tokens_map)

    def get_input_output_texts(self):
        input_text = u"lower newer"
        output_text = u"lower<unk>newer"
        return input_text, output_text

    def test_full_tokenizer(self):
        tokenizer = GPT2Tokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
        text = "lower"
        bpe_tokens = ["low", "er"]
        tokens = tokenizer.tokenize(text)
        self.assertListEqual(tokens, bpe_tokens)

        input_tokens = tokens + [tokenizer.unk_token]
        input_bpe_tokens = [13, 12, 17]
        self.assertListEqual(
            tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
if __name__ == '__main__':
@@ -20,13 +20,17 @@ import json
from pytorch_transformers.tokenization_openai import OpenAIGPTTokenizer, VOCAB_FILES_NAMES

from .tokenization_tests_commons import CommonTestCases


class OpenAIGPTTokenizationTest(CommonTestCases.CommonTokenizerTester):

    tokenizer_class = OpenAIGPTTokenizer

    def setUp(self):
        super(OpenAIGPTTokenizationTest, self).setUp()

        # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
        vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
                 "w</w>", "r</w>", "t</w>",
                 "lo", "low", "er</w>",
@@ -34,30 +38,34 @@ class OpenAIGPTTokenizationTest(unittest.TestCase):
        vocab_tokens = dict(zip(vocab, range(len(vocab))))
        merges = ["#version: 0.2", "l o", "lo w", "e r</w>", ""]

        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
        with open(self.vocab_file, "w") as fp:
            fp.write(json.dumps(vocab_tokens))
        with open(self.merges_file, "w") as fp:
            fp.write("\n".join(merges))

    def get_tokenizer(self):
        return OpenAIGPTTokenizer.from_pretrained(self.tmpdirname)

    def get_input_output_texts(self):
        input_text = u"lower newer"
        output_text = u"lower newer"
        return input_text, output_text

    def test_full_tokenizer(self):
        tokenizer = OpenAIGPTTokenizer(self.vocab_file, self.merges_file)

        text = "lower"
        bpe_tokens = ["low", "er</w>"]
        tokens = tokenizer.tokenize(text)
        self.assertListEqual(tokens, bpe_tokens)

        input_tokens = tokens + ["<unk>"]
        input_bpe_tokens = [14, 15, 20]
        self.assertListEqual(
            tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
if __name__ == '__main__':
@@ -19,6 +19,7 @@ import sys
from io import open
import tempfile
import shutil
import unittest
if sys.version_info[0] == 2:
    import cPickle as pickle
@@ -36,113 +37,124 @@ else:
    unicode = str
class CommonTestCases:

    class CommonTokenizerTester(unittest.TestCase):

        tokenizer_class = None

        def setUp(self):
            self.tmpdirname = tempfile.mkdtemp()

        def tearDown(self):
            shutil.rmtree(self.tmpdirname)

        def get_tokenizer(self):
            raise NotImplementedError

        def get_input_output_texts(self):
            raise NotImplementedError

        def test_save_and_load_tokenizer(self):
            tokenizer = self.get_tokenizer()

            before_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")

            with TemporaryDirectory() as tmpdirname:
                tokenizer.save_pretrained(tmpdirname)
                tokenizer = tokenizer.from_pretrained(tmpdirname)

            after_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")
            self.assertListEqual(before_tokens, after_tokens)

        def test_pickle_tokenizer(self):
            tokenizer = self.get_tokenizer()
            self.assertIsNotNone(tokenizer)

            text = u"Munich and Berlin are nice cities"
            subwords = tokenizer.tokenize(text)

            with TemporaryDirectory() as tmpdirname:
                filename = os.path.join(tmpdirname, u"tokenizer.bin")
                pickle.dump(tokenizer, open(filename, "wb"))
                tokenizer_new = pickle.load(open(filename, "rb"))

            subwords_loaded = tokenizer_new.tokenize(text)

            self.assertListEqual(subwords, subwords_loaded)

        def test_add_tokens_tokenizer(self):
            tokenizer = self.get_tokenizer()

            vocab_size = tokenizer.vocab_size
            all_size = len(tokenizer)

            self.assertNotEqual(vocab_size, 0)
            self.assertEqual(vocab_size, all_size)

            new_toks = ["aaaaabbbbbb", "cccccccccdddddddd"]
            added_toks = tokenizer.add_tokens(new_toks)
            vocab_size_2 = tokenizer.vocab_size
            all_size_2 = len(tokenizer)

            self.assertNotEqual(vocab_size_2, 0)
            self.assertEqual(vocab_size, vocab_size_2)
            self.assertEqual(added_toks, len(new_toks))
            self.assertEqual(all_size_2, all_size + len(new_toks))

            tokens = tokenizer.encode("aaaaabbbbbb low cccccccccdddddddd l")
            self.assertGreaterEqual(len(tokens), 4)
            self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
            self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)

            new_toks_2 = {'eos_token': ">>>>|||<||<<|<<",
                          'pad_token': "<<<<<|||>|>>>>|>"}
            added_toks_2 = tokenizer.add_special_tokens(new_toks_2)
            vocab_size_3 = tokenizer.vocab_size
            all_size_3 = len(tokenizer)

            self.assertNotEqual(vocab_size_3, 0)
            self.assertEqual(vocab_size, vocab_size_3)
            self.assertEqual(added_toks_2, len(new_toks_2))
            self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))

            tokens = tokenizer.encode(">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l")

            self.assertGreaterEqual(len(tokens), 6)
            self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
            self.assertGreater(tokens[0], tokens[1])
            self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
            self.assertGreater(tokens[-2], tokens[-3])
            self.assertEqual(tokens[0], tokenizer.convert_tokens_to_ids(tokenizer.eos_token))
            self.assertEqual(tokens[-2], tokenizer.convert_tokens_to_ids(tokenizer.pad_token))

        def test_required_methods_tokenizer(self):
            tokenizer = self.get_tokenizer()
            input_text, output_text = self.get_input_output_texts()

            tokens = tokenizer.tokenize(input_text)
            ids = tokenizer.convert_tokens_to_ids(tokens)
            ids_2 = tokenizer.encode(input_text)
            self.assertListEqual(ids, ids_2)

            tokens_2 = tokenizer.convert_ids_to_tokens(ids)
            text_2 = tokenizer.decode(ids)

            self.assertEqual(text_2, output_text)

            self.assertNotEqual(len(tokens_2), 0)
            self.assertIsInstance(text_2, (str, unicode))

        def test_pretrained_model_lists(self):
            weights_list = list(self.tokenizer_class.max_model_input_sizes.keys())
            weights_lists_2 = []
            for file_id, map_list in self.tokenizer_class.pretrained_vocab_files_map.items():
                weights_lists_2.append(list(map_list.keys()))

            for weights_list_2 in weights_lists_2:
                self.assertListEqual(weights_list, weights_list_2)
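New tokenizer tests plug into this harness by overriding the two hooks; a minimal sketch with hypothetical names:

    class MyTokenizationTest(CommonTestCases.CommonTokenizerTester):

        tokenizer_class = MyTokenizer  # hypothetical tokenizer under test

        def setUp(self):
            super(MyTokenizationTest, self).setUp()
            # write the vocab/merges fixture files into self.tmpdirname here

        def get_tokenizer(self):
            return MyTokenizer.from_pretrained(self.tmpdirname)

        def get_input_output_texts(self):
            return u"lower newer", u"lower newer"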
@@ -20,32 +20,39 @@ from io import open
from pytorch_transformers.tokenization_transfo_xl import TransfoXLTokenizer, VOCAB_FILES_NAMES

from .tokenization_tests_commons import CommonTestCases


class TransfoXLTokenizationTest(CommonTestCases.CommonTokenizerTester):

    tokenizer_class = TransfoXLTokenizer

    def setUp(self):
        super(TransfoXLTokenizationTest, self).setUp()

        vocab_tokens = [
            "<unk>", "[CLS]", "[SEP]", "want", "unwanted", "wa", "un",
            "running", ",", "low", "l",
        ]
        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
        with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer:
            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

    def get_tokenizer(self):
        return TransfoXLTokenizer.from_pretrained(self.tmpdirname, lower_case=True)

    def get_input_output_texts(self):
        input_text = u"<unk> UNwanted , running"
        output_text = u"<unk> unwanted, running"
        return input_text, output_text

    def test_full_tokenizer(self):
        tokenizer = TransfoXLTokenizer(vocab_file=self.vocab_file, lower_case=True)

        tokens = tokenizer.tokenize(u"<unk> UNwanted , running")
        self.assertListEqual(tokens, ["<unk>", "unwanted", ",", "running"])

        self.assertListEqual(
            tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7])
    def test_full_tokenizer_lower(self):
        tokenizer = TransfoXLTokenizer(lower_case=True)