Commit 1ab25c49 authored by thomwolf

Merge branch 'master' into pr/2115

parents df396112 18601c3b
@@ -18,21 +18,42 @@ from __future__ import absolute_import, division, print_function, unicode_literals
 import logging

-from .configuration_bert import BertConfig
-from .configuration_openai import OpenAIGPTConfig
-from .configuration_gpt2 import GPT2Config
-from .configuration_transfo_xl import TransfoXLConfig
-from .configuration_xlnet import XLNetConfig
-from .configuration_xlm import XLMConfig
-from .configuration_roberta import RobertaConfig
-from .configuration_distilbert import DistilBertConfig
-from .configuration_ctrl import CTRLConfig
-from .configuration_camembert import CamembertConfig
-from .configuration_albert import AlbertConfig
+from .configuration_bert import BertConfig, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_openai import OpenAIGPTConfig, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_transfo_xl import TransfoXLConfig, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_gpt2 import GPT2Config, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_ctrl import CTRLConfig, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_xlnet import XLNetConfig, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_albert import AlbertConfig, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_camembert import CamembertConfig, CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_t5 import T5Config, T5_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_xlm_roberta import XLMRobertaConfig, XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP

 logger = logging.getLogger(__name__)
+ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict((key, value)
+    for pretrained_map in [
+        BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        T5_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
+    ]
+    for key, value, in pretrained_map.items())
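The generator expression above simply merges every per-model archive map into one lookup table, with later maps winning on duplicate keys. A minimal illustrative sketch of the equivalent logic (the toy maps below are made up and not part of the commit):

    # Hypothetical stand-ins for the real *_PRETRAINED_CONFIG_ARCHIVE_MAP dicts.
    bert_map = {'bert-base-uncased': 'https://example.com/bert-base-uncased-config.json'}
    gpt2_map = {'gpt2': 'https://example.com/gpt2-config.json'}

    all_map = {}
    for pretrained_map in [bert_map, gpt2_map]:
        all_map.update(pretrained_map)  # same result as the dict comprehension above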
 class AutoConfig(object):
     r""":class:`~transformers.AutoConfig` is a generic configuration class
         that will be instantiated as one of the configuration classes of the library
@@ -47,6 +68,7 @@ class AutoConfig(object):
             - contains `distilbert`: DistilBertConfig (DistilBERT model)
             - contains `albert`: AlbertConfig (ALBERT model)
             - contains `camembert`: CamembertConfig (CamemBERT model)
+            - contains `xlm-roberta`: XLMRobertaConfig (XLM-RoBERTa model)
             - contains `roberta`: RobertaConfig (RoBERTa model)
             - contains `bert`: BertConfig (Bert model)
             - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
@@ -61,6 +83,34 @@ class AutoConfig(object):
         raise EnvironmentError("AutoConfig is designed to be instantiated "
             "using the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` method.")
+    @classmethod
+    def for_model(cls, model_type, *args, **kwargs):
+        if 'distilbert' in model_type:
+            return DistilBertConfig(*args, **kwargs)
+        elif 'roberta' in model_type:
+            return RobertaConfig(*args, **kwargs)
+        elif 'bert' in model_type:
+            return BertConfig(*args, **kwargs)
+        elif 'openai-gpt' in model_type:
+            return OpenAIGPTConfig(*args, **kwargs)
+        elif 'gpt2' in model_type:
+            return GPT2Config(*args, **kwargs)
+        elif 'transfo-xl' in model_type:
+            return TransfoXLConfig(*args, **kwargs)
+        elif 'xlnet' in model_type:
+            return XLNetConfig(*args, **kwargs)
+        elif 'xlm' in model_type:
+            return XLMConfig(*args, **kwargs)
+        elif 'ctrl' in model_type:
+            return CTRLConfig(*args, **kwargs)
+        elif 'albert' in model_type:
+            return AlbertConfig(*args, **kwargs)
+        elif 'camembert' in model_type:
+            return CamembertConfig(*args, **kwargs)
+        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
+                         "'distilbert', 'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
+                         "'xlm', 'roberta', 'ctrl', 'camembert', 'albert'".format(model_type))
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
         r""" Instantiate a one of the configuration classes of the library
@@ -68,9 +118,11 @@ class AutoConfig(object):
         The configuration class to instantiate is selected as the first pattern matching
         in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `t5`: T5Config (T5 model)
             - contains `distilbert`: DistilBertConfig (DistilBERT model)
             - contains `albert`: AlbertConfig (ALBERT model)
             - contains `camembert`: CamembertConfig (CamemBERT model)
+            - contains `xlm-roberta`: XLMRobertaConfig (XLM-RoBERTa model)
             - contains `roberta`: RobertaConfig (RoBERTa model)
             - contains `bert`: BertConfig (Bert model)
             - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
@@ -83,6 +135,7 @@ class AutoConfig(object):
             pretrained_model_name_or_path: either:
                 - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                 - a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
                 - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
@@ -123,12 +176,16 @@ class AutoConfig(object):
             assert unused_kwargs == {'foo': False}
         """
-        if 'distilbert' in pretrained_model_name_or_path:
+        if 't5' in pretrained_model_name_or_path:
+            return T5Config.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        elif 'distilbert' in pretrained_model_name_or_path:
             return DistilBertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
         elif 'albert' in pretrained_model_name_or_path:
             return AlbertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
         elif 'camembert' in pretrained_model_name_or_path:
             return CamembertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        elif 'xlm-roberta' in pretrained_model_name_or_path:
+            return XLMRobertaConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
         elif 'roberta' in pretrained_model_name_or_path:
             return RobertaConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
         elif 'bert' in pretrained_model_name_or_path:
@@ -147,4 +204,4 @@ class AutoConfig(object):
             return CTRLConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
         raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                          "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
-                         "'xlm', 'roberta', 'distilbert', 'camembert', 'ctrl', 'albert'".format(pretrained_model_name_or_path))
+                         "'xlm-roberta', 'xlm', 'roberta', 'distilbert', 'camembert', 'ctrl', 'albert'".format(pretrained_model_name_or_path))
@@ -42,6 +42,12 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
     'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json",
     'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json",
     'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json",
+    'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-config.json",
+    'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-config.json",
+    'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-config.json",
+    'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-config.json",
+    'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/config.json",
+    'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/config.json",
 }
@@ -52,7 +58,7 @@ class BertConfig(PretrainedConfig):
         Arguments:
-            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`.
+            vocab_size: Vocabulary size of `inputs_ids` in `BertModel`.
             hidden_size: Size of the encoder layers and the pooler layer.
             num_hidden_layers: Number of hidden layers in the Transformer encoder.
             num_attention_heads: Number of attention heads for each attention layer in
@@ -77,7 +83,7 @@ class BertConfig(PretrainedConfig):
     pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP

     def __init__(self,
-                 vocab_size_or_config_json_file=30522,
+                 vocab_size=30522,
                  hidden_size=768,
                  num_hidden_layers=12,
                  num_attention_heads=12,
@@ -91,25 +97,15 @@ class BertConfig(PretrainedConfig):
                  layer_norm_eps=1e-12,
                  **kwargs):
         super(BertConfig, self).__init__(**kwargs)
-        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
-                        and isinstance(vocab_size_or_config_json_file, unicode)):
-            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
-                json_config = json.loads(reader.read())
-            for key, value in json_config.items():
-                self.__dict__[key] = value
-        elif isinstance(vocab_size_or_config_json_file, int):
-            self.vocab_size = vocab_size_or_config_json_file
-            self.hidden_size = hidden_size
-            self.num_hidden_layers = num_hidden_layers
-            self.num_attention_heads = num_attention_heads
-            self.hidden_act = hidden_act
-            self.intermediate_size = intermediate_size
-            self.hidden_dropout_prob = hidden_dropout_prob
-            self.attention_probs_dropout_prob = attention_probs_dropout_prob
-            self.max_position_embeddings = max_position_embeddings
-            self.type_vocab_size = type_vocab_size
-            self.initializer_range = initializer_range
-            self.layer_norm_eps = layer_norm_eps
-        else:
-            raise ValueError("First argument must be either a vocabulary size (int)"
-                             " or the path to a pretrained model config file (str)")
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
@@ -31,7 +31,7 @@ class CTRLConfig(PretrainedConfig):
     """Configuration class to store the configuration of a `CTRLModel`.
         Args:
-            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
+            vocab_size: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
             n_positions: Number of positional embeddings.
             n_ctx: Size of the causal mask (usually same as n_positions).
             dff: Size of the inner dimension of the FFN.
@@ -52,7 +52,7 @@ class CTRLConfig(PretrainedConfig):
     def __init__(
         self,
-        vocab_size_or_config_json_file=246534,
+        vocab_size=246534,
         n_positions=256,
         n_ctx=256,
         n_embd=1280,
@@ -64,8 +64,6 @@ class CTRLConfig(PretrainedConfig):
         attn_pdrop=0.1,
         layer_norm_epsilon=1e-6,
         initializer_range=0.02,
-        num_labels=1,
         summary_type='cls_index',
         summary_use_proj=True,
         summary_activation=None,
@@ -76,7 +74,7 @@ class CTRLConfig(PretrainedConfig):
         """Constructs CTRLConfig.
         Args:
-            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
+            vocab_size: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
             n_positions: Number of positional embeddings.
             n_ctx: Size of the causal mask (usually same as n_positions).
             dff: Size of the inner dimension of the FFN.
@@ -94,8 +92,7 @@ class CTRLConfig(PretrainedConfig):
             initializing all weight matrices.
         """
         super(CTRLConfig, self).__init__(**kwargs)
-        self.vocab_size = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1
+        self.vocab_size = vocab_size
         self.n_ctx = n_ctx
         self.n_positions = n_positions
         self.n_embd = n_embd
@@ -108,23 +105,11 @@ class CTRLConfig(PretrainedConfig):
         self.layer_norm_epsilon = layer_norm_epsilon
         self.initializer_range = initializer_range
-        self.num_labels = num_labels
         self.summary_type = summary_type
         self.summary_use_proj = summary_use_proj
         self.summary_activation = summary_activation
         self.summary_first_dropout = summary_first_dropout
         self.summary_proj_to_labels = summary_proj_to_labels
-        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
-                        and isinstance(vocab_size_or_config_json_file, unicode)):
-            with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
-                json_config = json.loads(reader.read())
-            for key, value in json_config.items():
-                self.__dict__[key] = value
-        elif not isinstance(vocab_size_or_config_json_file, int):
-            raise ValueError(
-                "First argument must be either a vocabulary size (int)"
-                "or the path to a pretrained model config file (str)"
-            )

     @property
     def max_position_embeddings(self):
...
@@ -37,7 +37,7 @@ class DistilBertConfig(PretrainedConfig):
     pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP

     def __init__(self,
-                 vocab_size_or_config_json_file=30522,
+                 vocab_size=30522,
                  max_position_embeddings=512,
                  sinusoidal_pos_embds=False,
                  n_layers=6,
@@ -53,31 +53,21 @@ class DistilBertConfig(PretrainedConfig):
                  seq_classif_dropout=0.2,
                  **kwargs):
         super(DistilBertConfig, self).__init__(**kwargs)
-        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
-                        and isinstance(vocab_size_or_config_json_file, unicode)):
-            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
-                json_config = json.loads(reader.read())
-            for key, value in json_config.items():
-                self.__dict__[key] = value
-        elif isinstance(vocab_size_or_config_json_file, int):
-            self.vocab_size = vocab_size_or_config_json_file
-            self.max_position_embeddings = max_position_embeddings
-            self.sinusoidal_pos_embds = sinusoidal_pos_embds
-            self.n_layers = n_layers
-            self.n_heads = n_heads
-            self.dim = dim
-            self.hidden_dim = hidden_dim
-            self.dropout = dropout
-            self.attention_dropout = attention_dropout
-            self.activation = activation
-            self.initializer_range = initializer_range
-            self.tie_weights_ = tie_weights_
-            self.qa_dropout = qa_dropout
-            self.seq_classif_dropout = seq_classif_dropout
-        else:
-            raise ValueError("First argument must be either a vocabulary size (int)"
-                             " or the path to a pretrained model config file (str)")
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.sinusoidal_pos_embds = sinusoidal_pos_embds
+        self.n_layers = n_layers
+        self.n_heads = n_heads
+        self.dim = dim
+        self.hidden_dim = hidden_dim
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.activation = activation
+        self.initializer_range = initializer_range
+        self.tie_weights_ = tie_weights_
+        self.qa_dropout = qa_dropout
+        self.seq_classif_dropout = seq_classif_dropout

     @property
     def hidden_size(self):
         return self.dim
...
@@ -36,7 +36,7 @@ class GPT2Config(PretrainedConfig):
     """Configuration class to store the configuration of a `GPT2Model`.
         Args:
-            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
+            vocab_size: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
             n_positions: Number of positional embeddings.
             n_ctx: Size of the causal mask (usually same as n_positions).
             n_embd: Dimensionality of the embeddings and hidden states.
@@ -56,7 +56,7 @@ class GPT2Config(PretrainedConfig):
     def __init__(
         self,
-        vocab_size_or_config_json_file=50257,
+        vocab_size=50257,
         n_positions=1024,
         n_ctx=1024,
         n_embd=768,
@@ -67,8 +67,6 @@ class GPT2Config(PretrainedConfig):
         attn_pdrop=0.1,
         layer_norm_epsilon=1e-5,
         initializer_range=0.02,
-        num_labels=1,
         summary_type='cls_index',
         summary_use_proj=True,
         summary_activation=None,
@@ -79,7 +77,7 @@ class GPT2Config(PretrainedConfig):
         """Constructs GPT2Config.
         Args:
-            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
+            vocab_size: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
             n_positions: Number of positional embeddings.
             n_ctx: Size of the causal mask (usually same as n_positions).
             n_embd: Dimensionality of the embeddings and hidden states.
@@ -96,37 +94,22 @@ class GPT2Config(PretrainedConfig):
             initializing all weight matrices.
         """
         super(GPT2Config, self).__init__(**kwargs)
-        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
-                        and isinstance(vocab_size_or_config_json_file, unicode)):
-            with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
-                json_config = json.loads(reader.read())
-            for key, value in json_config.items():
-                self.__dict__[key] = value
-        elif isinstance(vocab_size_or_config_json_file, int):
-            self.vocab_size = vocab_size_or_config_json_file
-            self.n_ctx = n_ctx
-            self.n_positions = n_positions
-            self.n_embd = n_embd
-            self.n_layer = n_layer
-            self.n_head = n_head
-            self.resid_pdrop = resid_pdrop
-            self.embd_pdrop = embd_pdrop
-            self.attn_pdrop = attn_pdrop
-            self.layer_norm_epsilon = layer_norm_epsilon
-            self.initializer_range = initializer_range
-            self.num_labels = num_labels
-            self.summary_type = summary_type
-            self.summary_use_proj = summary_use_proj
-            self.summary_activation = summary_activation
-            self.summary_first_dropout = summary_first_dropout
-            self.summary_proj_to_labels = summary_proj_to_labels
-        else:
-            raise ValueError(
-                "First argument must be either a vocabulary size (int)"
-                "or the path to a pretrained model config file (str)"
-            )
+        self.vocab_size = vocab_size
+        self.n_ctx = n_ctx
+        self.n_positions = n_positions
+        self.n_embd = n_embd
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.resid_pdrop = resid_pdrop
+        self.embd_pdrop = embd_pdrop
+        self.attn_pdrop = attn_pdrop
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+        self.summary_type = summary_type
+        self.summary_use_proj = summary_use_proj
+        self.summary_activation = summary_activation
+        self.summary_first_dropout = summary_first_dropout
+        self.summary_proj_to_labels = summary_proj_to_labels

     @property
     def max_position_embeddings(self):
...
@@ -35,7 +35,7 @@ class OpenAIGPTConfig(PretrainedConfig):
     Configuration class to store the configuration of a `OpenAIGPTModel`.
         Args:
-            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file.
+            vocab_size: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file.
             n_positions: Number of positional embeddings.
             n_ctx: Size of the causal mask (usually same as n_positions).
             n_embd: Dimensionality of the embeddings and hidden states.
@@ -58,7 +58,7 @@ class OpenAIGPTConfig(PretrainedConfig):
     def __init__(
         self,
-        vocab_size_or_config_json_file=40478,
+        vocab_size=40478,
         n_positions=512,
         n_ctx=512,
         n_embd=768,
@@ -71,8 +71,6 @@ class OpenAIGPTConfig(PretrainedConfig):
         layer_norm_epsilon=1e-5,
         initializer_range=0.02,
         predict_special_tokens=True,
-        num_labels=1,
         summary_type='cls_index',
         summary_use_proj=True,
         summary_activation=None,
@@ -83,39 +81,24 @@ class OpenAIGPTConfig(PretrainedConfig):
         """Constructs OpenAIGPTConfig.
         """
         super(OpenAIGPTConfig, self).__init__(**kwargs)
-        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
-                        and isinstance(vocab_size_or_config_json_file, unicode)):
-            with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
-                json_config = json.loads(reader.read())
-            for key, value in json_config.items():
-                self.__dict__[key] = value
-        elif isinstance(vocab_size_or_config_json_file, int):
-            self.vocab_size = vocab_size_or_config_json_file
-            self.n_ctx = n_ctx
-            self.n_positions = n_positions
-            self.n_embd = n_embd
-            self.n_layer = n_layer
-            self.n_head = n_head
-            self.afn = afn
-            self.resid_pdrop = resid_pdrop
-            self.embd_pdrop = embd_pdrop
-            self.attn_pdrop = attn_pdrop
-            self.layer_norm_epsilon = layer_norm_epsilon
-            self.initializer_range = initializer_range
-            self.predict_special_tokens = predict_special_tokens
-            self.num_labels = num_labels
-            self.summary_type = summary_type
-            self.summary_use_proj = summary_use_proj
-            self.summary_activation = summary_activation
-            self.summary_first_dropout = summary_first_dropout
-            self.summary_proj_to_labels = summary_proj_to_labels
-        else:
-            raise ValueError(
-                "First argument must be either a vocabulary size (int)"
-                "or the path to a pretrained model config file (str)"
            )
+        self.vocab_size = vocab_size
+        self.n_ctx = n_ctx
+        self.n_positions = n_positions
+        self.n_embd = n_embd
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.afn = afn
+        self.resid_pdrop = resid_pdrop
+        self.embd_pdrop = embd_pdrop
+        self.attn_pdrop = attn_pdrop
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+        self.predict_special_tokens = predict_special_tokens
+        self.summary_type = summary_type
+        self.summary_use_proj = summary_use_proj
+        self.summary_activation = summary_activation
+        self.summary_first_dropout = summary_first_dropout
+        self.summary_proj_to_labels = summary_proj_to_labels

     @property
     def max_position_embeddings(self):
...
# coding=utf-8
# Copyright 2010, The T5 Authors and HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" T5 model configuration """
from __future__ import absolute_import, division, print_function, unicode_literals
import json
import logging
import sys
import six
from io import open
from .configuration_utils import PretrainedConfig
logger = logging.getLogger(__name__)
T5_PRETRAINED_CONFIG_ARCHIVE_MAP = {
't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-config.json",
't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-config.json",
't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-config.json",
't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-config.json",
't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-config.json",
}
class T5Config(PretrainedConfig):
r"""
:class:`~transformers.T5Config` is the configuration class to store the configuration of a
`T5Model`.
Arguments:
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `T5Model`.
hidden_size: Size of the encoder layers and the pooler layer.
num_hidden_layers: Number of hidden layers in the Transformer encoder.
num_attention_heads: Number of attention heads for each attention layer in
the Transformer encoder.
intermediate_size: The size of the "intermediate" (i.e., feed-forward)
layer in the Transformer encoder.
hidden_act: The non-linear activation function (function or string) in the
encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported.
hidden_dropout_prob: The dropout probabilitiy for all fully connected
layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob: The dropout ratio for the attention
probabilities.
max_position_embeddings: The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
type_vocab_size: The vocabulary size of the `token_type_ids` passed into
`T5Model`.
initializer_factor: A factor for initializing all weight matrices (should be kept to 1.0, used for initialization testing).
layer_norm_eps: The epsilon used by LayerNorm.
"""
pretrained_config_archive_map = T5_PRETRAINED_CONFIG_ARCHIVE_MAP
def __init__(self,
vocab_size=32128,
n_positions=512,
d_model=512,
d_kv=64,
d_ff=2048,
num_layers=6,
num_heads=8,
relative_attention_num_buckets=32,
dropout_rate=0.1,
layer_norm_epsilon=1e-6,
initializer_factor=1.0,
**kwargs):
super(T5Config, self).__init__(**kwargs)
self.vocab_size = vocab_size
self.n_positions = n_positions
self.d_model = d_model
self.d_kv = d_kv
self.d_ff = d_ff
self.num_layers = num_layers
self.num_heads = num_heads
self.relative_attention_num_buckets = relative_attention_num_buckets
self.dropout_rate = dropout_rate
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_factor = initializer_factor
@property
def max_position_embeddings(self):
return self.n_positions
@property
def hidden_size(self):
return self.d_model
@property
def num_attention_heads(self):
return self.num_heads
@property
def num_hidden_layers(self):
return self.num_layers
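A small, hedged sketch of how the new T5Config exposes the shared property names used elsewhere in the library (the values are the defaults defined above):

    config = T5Config()          # assuming `from transformers import T5Config`
    print(config.d_model)        # 512
    print(config.hidden_size)    # 512 -- alias for d_model via the property above
    print(config.num_attention_heads, config.num_hidden_layers)  # 8 6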
@@ -34,7 +34,7 @@ class TransfoXLConfig(PretrainedConfig):
     """Configuration class to store the configuration of a `TransfoXLModel`.
         Args:
-            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file.
+            vocab_size: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file.
             cutoffs: cutoffs for the adaptive softmax
             d_model: Dimensionality of the model's hidden states.
             d_embed: Dimensionality of the embeddings
@@ -68,7 +68,7 @@ class TransfoXLConfig(PretrainedConfig):
     pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP

     def __init__(self,
-                 vocab_size_or_config_json_file=267735,
+                 vocab_size=267735,
                  cutoffs=[20000, 40000, 200000],
                  d_model=1024,
                  d_embed=1024,
@@ -100,7 +100,7 @@ class TransfoXLConfig(PretrainedConfig):
         """Constructs TransfoXLConfig.
         """
         super(TransfoXLConfig, self).__init__(**kwargs)
-        self.n_token = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1
+        self.vocab_size = vocab_size
         self.cutoffs = []
         self.cutoffs.extend(cutoffs)
         self.tie_weight = tie_weight
@@ -133,27 +133,17 @@ class TransfoXLConfig(PretrainedConfig):
         self.init_std = init_std
         self.layer_norm_epsilon = layer_norm_epsilon
-        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
-                        and isinstance(vocab_size_or_config_json_file, unicode)):
-            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
-                json_config = json.loads(reader.read())
-            for key, value in json_config.items():
-                self.__dict__[key] = value
-        elif not isinstance(vocab_size_or_config_json_file, int):
-            raise ValueError("First argument must be either a vocabulary size (int)"
-                             " or the path to a pretrained model config file (str)")

     @property
     def max_position_embeddings(self):
         return self.tgt_len + self.ext_len + self.mem_len

     @property
-    def vocab_size(self):
-        return self.n_token
+    def n_token(self):  # Backward compatibility
+        return self.vocab_size

-    @vocab_size.setter
-    def vocab_size(self, value):
-        self.n_token = value
+    @n_token.setter
+    def n_token(self, value):  # Backward compatibility
+        self.vocab_size = value
     @property
     def hidden_size(self):
...
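The n_token property pair in TransfoXLConfig above keeps the old attribute name working after the rename; a hedged sketch of what the alias buys existing code (XLM's n_words and XLNet's n_token below follow the same pattern):

    from transformers import TransfoXLConfig  # assuming the usual top-level export

    config = TransfoXLConfig(vocab_size=1000)
    print(config.n_token)     # 1000 -- old attribute name still readable
    config.n_token = 2000     # writing through the alias updates vocab_size
    print(config.vocab_size)  # 2000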
@@ -24,7 +24,7 @@ import logging
 import os
 from io import open

-from .file_utils import cached_path, CONFIG_NAME
+from .file_utils import CONFIG_NAME, cached_path, is_remote_url, hf_bucket_url

 logger = logging.getLogger(__name__)
@@ -49,16 +49,47 @@ class PretrainedConfig(object):
     pretrained_config_archive_map = {}

     def __init__(self, **kwargs):
-        self.finetuning_task = kwargs.pop('finetuning_task', None)
-        self.num_labels = kwargs.pop('num_labels', 2)
+        # Attributes with defaults
         self.output_attentions = kwargs.pop('output_attentions', False)
         self.output_hidden_states = kwargs.pop('output_hidden_states', False)
         self.output_past = kwargs.pop('output_past', True)  # Not used by all models
         self.torchscript = kwargs.pop('torchscript', False)  # Only used by PyTorch models
         self.use_bfloat16 = kwargs.pop('use_bfloat16', False)
         self.pruned_heads = kwargs.pop('pruned_heads', {})
+        # Is decoder is used in encoder-decoder models to differentiate encoder from decoder
         self.is_decoder = kwargs.pop('is_decoder', False)

+        # Parameters for sequence generation
+        self.max_length = kwargs.pop('max_length', 20)
+        self.do_sample = kwargs.pop('do_sample', False)
+        self.num_beams = kwargs.pop('num_beams', 1)
+        self.temperature = kwargs.pop('temperature', 1.0)
+        self.top_k = kwargs.pop('top_k', 50)
+        self.top_p = kwargs.pop('top_p', 1.0)
+        self.repetition_penalty = kwargs.pop('repetition_penalty', 1.0)
+        self.bos_token_id = kwargs.pop('bos_token_id', 0)
+        self.pad_token_id = kwargs.pop('pad_token_id', 0)
+        self.eos_token_ids = kwargs.pop('eos_token_ids', 0)
+        self.length_penalty = kwargs.pop('length_penalty', 1.)
+        self.num_return_sequences = kwargs.pop('num_return_sequences', 1)

+        # Fine-tuning task arguments
+        self.finetuning_task = kwargs.pop('finetuning_task', None)
+        self.num_labels = kwargs.pop('num_labels', 2)
+        self.id2label = kwargs.pop('id2label', {i: 'LABEL_{}'.format(i) for i in range(self.num_labels)})
+        self.id2label = dict((int(key), value) for key, value in self.id2label.items())
+        self.label2id = kwargs.pop('label2id', dict(zip(self.id2label.values(), self.id2label.keys())))
+        self.label2id = dict((key, int(value)) for key, value in self.label2id.items())

+        # Additional attributes without default values
+        for key, value in kwargs.items():
+            try:
+                setattr(self, key, value)
+            except AttributeError as err:
+                logger.error("Can't set {} with value {} for {}".format(key, value, self))
+                raise err
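A hedged sketch of the new default handling: label maps are derived from num_labels unless given explicitly, and any unrecognized keyword ends up as a plain attribute (any concrete config class forwards its extra kwargs here):

    config = BertConfig(num_labels=3, my_custom_flag=True)  # my_custom_flag is a made-up kwarg
    print(config.id2label)             # {0: 'LABEL_0', 1: 'LABEL_1', 2: 'LABEL_2'}
    print(config.label2id['LABEL_2'])  # 2
    print(config.my_custom_flag)       # True -- stored by the generic setattr loop
    print(config.max_length, config.num_beams)  # 20 1 -- new generation defaults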
     def save_pretrained(self, save_directory):
         """ Save a configuration object to the directory `save_directory`, so that it
             can be re-loaded using the :func:`~transformers.PretrainedConfig.from_pretrained` class method.
@@ -79,6 +110,7 @@ class PretrainedConfig(object):
             pretrained_model_name_or_path: either:
                 - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                 - a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
                 - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
@@ -131,12 +163,18 @@ class PretrainedConfig(object):
             config_file = cls.pretrained_config_archive_map[pretrained_model_name_or_path]
         elif os.path.isdir(pretrained_model_name_or_path):
             config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
-        else:
+        elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
             config_file = pretrained_model_name_or_path
-        # redirect to the cache, if necessary
+        else:
+            config_file = hf_bucket_url(pretrained_model_name_or_path, postfix=CONFIG_NAME)

         try:
+            # Load from URL or cache if already cached
             resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download,
                                                proxies=proxies, resume_download=resume_download)
+            # Load config
+            config = cls.from_json_file(resolved_config_file)
         except EnvironmentError:
             if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
                 msg = "Couldn't reach server at '{}' to download pretrained model configuration file.".format(
@@ -150,15 +188,18 @@ class PretrainedConfig(object):
                     config_file, CONFIG_NAME)
             raise EnvironmentError(msg)

+        except json.JSONDecodeError:
+            msg = "Couldn't reach server at '{}' to download configuration file or " \
+                  "configuration file is not a valid JSON file. " \
+                  "Please check network or file content here: {}.".format(config_file, resolved_config_file)
+            raise EnvironmentError(msg)

         if resolved_config_file == config_file:
             logger.info("loading configuration file {}".format(config_file))
         else:
             logger.info("loading configuration file {} from cache at {}".format(
                 config_file, resolved_config_file))

-        # Load config
-        config = cls.from_json_file(resolved_config_file)
         if hasattr(config, 'pruned_heads'):
             config.pruned_heads = dict((int(key), value) for key, value in config.pruned_heads.items())
@@ -180,17 +221,15 @@ class PretrainedConfig(object):
     @classmethod
     def from_dict(cls, json_object):
         """Constructs a `Config` from a Python dictionary of parameters."""
-        config = cls(vocab_size_or_config_json_file=-1)
-        for key, value in json_object.items():
-            setattr(config, key, value)
-        return config
+        return cls(**json_object)

     @classmethod
     def from_json_file(cls, json_file):
-        """Constructs a `BertConfig` from a json file of parameters."""
+        """Constructs a `Config` from a json file of parameters."""
         with open(json_file, "r", encoding='utf-8') as reader:
             text = reader.read()
-        return cls.from_dict(json.loads(text))
+        dict_obj = json.loads(text)
+        return cls(**dict_obj)
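With the constructor no longer overloaded, building a config from a plain dict or JSON file is a direct keyword expansion; a hedged round-trip sketch (the file path is temporary and only illustrative):

    import json
    import tempfile
    from transformers import BertConfig  # any concrete config class behaves the same way

    params = {'vocab_size': 100, 'hidden_size': 32}
    config = BertConfig.from_dict(params)      # equivalent to BertConfig(**params)

    with tempfile.NamedTemporaryFile('w', suffix='.json', delete=False) as f:
        json.dump(params, f)
    restored = BertConfig.from_json_file(f.name)
    assert restored.hidden_size == config.hidden_size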
     def __eq__(self, other):
         return self.__dict__ == other.__dict__
...
@@ -42,7 +42,7 @@ class XLMConfig(PretrainedConfig):
     """Configuration class to store the configuration of a `XLMModel`.
         Args:
-            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `XLMModel`.
+            vocab_size: Vocabulary size of `inputs_ids` in `XLMModel`.
             d_model: Size of the encoder layers and the pooler layer.
             n_layer: Number of hidden layers in the Transformer encoder.
             n_head: Number of attention heads for each attention layer in
@@ -81,7 +81,7 @@ class XLMConfig(PretrainedConfig):
     pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP

     def __init__(self,
-                 vocab_size_or_config_json_file=30145,
+                 vocab_size=30145,
                  emb_dim=2048,
                  n_layers=12,
                  n_heads=16,
@@ -103,9 +103,6 @@ class XLMConfig(PretrainedConfig):
                  unk_index=3,
                  mask_index=5,
                  is_encoder=True,
-                 finetuning_task=None,
-                 num_labels=2,
                  summary_type='first',
                  summary_use_proj=True,
                  summary_activation=None,
@@ -113,60 +110,54 @@ class XLMConfig(PretrainedConfig):
                  summary_first_dropout=0.1,
                  start_n_top=5,
                  end_n_top=5,
+                 mask_token_id=0,
+                 lang_id=0,
                  **kwargs):
         """Constructs XLMConfig.
         """
         super(XLMConfig, self).__init__(**kwargs)
-        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
-                        and isinstance(vocab_size_or_config_json_file, unicode)):
-            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
-                json_config = json.loads(reader.read())
-            for key, value in json_config.items():
-                self.__dict__[key] = value
-        elif isinstance(vocab_size_or_config_json_file, int):
-            self.n_words = vocab_size_or_config_json_file
-            self.emb_dim = emb_dim
-            self.n_layers = n_layers
-            self.n_heads = n_heads
-            self.dropout = dropout
-            self.attention_dropout = attention_dropout
-            self.gelu_activation = gelu_activation
-            self.sinusoidal_embeddings = sinusoidal_embeddings
-            self.causal = causal
-            self.asm = asm
-            self.n_langs = n_langs
-            self.use_lang_emb = use_lang_emb
-            self.layer_norm_eps = layer_norm_eps
-            self.bos_index = bos_index
-            self.eos_index = eos_index
-            self.pad_index = pad_index
-            self.unk_index = unk_index
-            self.mask_index = mask_index
-            self.is_encoder = is_encoder
-            self.max_position_embeddings = max_position_embeddings
-            self.embed_init_std = embed_init_std
-            self.init_std = init_std
-            self.finetuning_task = finetuning_task
-            self.num_labels = num_labels
-            self.summary_type = summary_type
-            self.summary_use_proj = summary_use_proj
-            self.summary_activation = summary_activation
-            self.summary_proj_to_labels = summary_proj_to_labels
-            self.summary_first_dropout = summary_first_dropout
-            self.start_n_top = start_n_top
-            self.end_n_top = end_n_top
-        else:
-            raise ValueError("First argument must be either a vocabulary size (int)"
-                             " or the path to a pretrained model config file (str)")
+        self.vocab_size = vocab_size
+        self.emb_dim = emb_dim
+        self.n_layers = n_layers
+        self.n_heads = n_heads
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.gelu_activation = gelu_activation
+        self.sinusoidal_embeddings = sinusoidal_embeddings
+        self.causal = causal
+        self.asm = asm
+        self.n_langs = n_langs
+        self.use_lang_emb = use_lang_emb
+        self.layer_norm_eps = layer_norm_eps
+        self.bos_index = bos_index
+        self.eos_index = eos_index
+        self.pad_index = pad_index
+        self.unk_index = unk_index
+        self.mask_index = mask_index
+        self.is_encoder = is_encoder
+        self.max_position_embeddings = max_position_embeddings
+        self.embed_init_std = embed_init_std
+        self.init_std = init_std
+        self.summary_type = summary_type
+        self.summary_use_proj = summary_use_proj
+        self.summary_activation = summary_activation
+        self.summary_proj_to_labels = summary_proj_to_labels
+        self.summary_first_dropout = summary_first_dropout
+        self.start_n_top = start_n_top
+        self.end_n_top = end_n_top
+        self.mask_token_id = mask_token_id
+        self.lang_id = lang_id

+        if "n_words" in kwargs:
+            self.n_words = kwargs["n_words"]
     @property
-    def vocab_size(self):
-        return self.n_words
+    def n_words(self):  # For backward compatibility
+        return self.vocab_size

-    @vocab_size.setter
-    def vocab_size(self, value):
-        self.n_words = value
+    @n_words.setter
+    def n_words(self, value):  # For backward compatibility
+        self.vocab_size = value

     @property
     def hidden_size(self):
...
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" XLM-RoBERTa configuration """
from __future__ import (absolute_import, division, print_function,
unicode_literals)
import logging
from .configuration_roberta import RobertaConfig
logger = logging.getLogger(__name__)
XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
'xlm-roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-config.json",
'xlm-roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-config.json",
'xlm-roberta-large-finetuned-conll02-dutch': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-config.json",
'xlm-roberta-large-finetuned-conll02-spanish': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-config.json",
'xlm-roberta-large-finetuned-conll03-english': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-config.json",
'xlm-roberta-large-finetuned-conll03-german': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-config.json",
}
class XLMRobertaConfig(RobertaConfig):
pretrained_config_archive_map = XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
@@ -35,7 +35,7 @@ class XLNetConfig(PretrainedConfig):
     """Configuration class to store the configuration of a ``XLNetModel``.
         Args:
-            vocab_size_or_config_json_file: Vocabulary size of ``inputs_ids`` in ``XLNetModel``.
+            vocab_size: Vocabulary size of ``inputs_ids`` in ``XLNetModel``.
             d_model: Size of the encoder layers and the pooler layer.
             n_layer: Number of hidden layers in the Transformer encoder.
             n_head: Number of attention heads for each attention layer in
@@ -72,28 +72,22 @@ class XLNetConfig(PretrainedConfig):
     pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP

     def __init__(self,
-                 vocab_size_or_config_json_file=32000,
+                 vocab_size=32000,
                  d_model=1024,
                  n_layer=24,
                  n_head=16,
                  d_inner=4096,
-                 max_position_embeddings=512,
                  ff_activation="gelu",
                  untie_r=True,
                  attn_type="bi",
                  initializer_range=0.02,
                  layer_norm_eps=1e-12,
                  dropout=0.1,
                  mem_len=None,
                  reuse_len=None,
                  bi_data=False,
                  clamp_len=-1,
                  same_length=False,
-                 finetuning_task=None,
-                 num_labels=2,
                  summary_type='last',
                  summary_use_proj=True,
                  summary_activation='tanh',
@@ -104,58 +98,45 @@ class XLNetConfig(PretrainedConfig):
         """Constructs XLNetConfig.
         """
         super(XLNetConfig, self).__init__(**kwargs)
-        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
-                        and isinstance(vocab_size_or_config_json_file, unicode)):
-            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
-                json_config = json.loads(reader.read())
-            for key, value in json_config.items():
-                setattr(config, key, value)
-        elif isinstance(vocab_size_or_config_json_file, int):
-            self.n_token = vocab_size_or_config_json_file
-            self.d_model = d_model
-            self.n_layer = n_layer
-            self.n_head = n_head
-            assert d_model % n_head == 0
-            self.d_head = d_model // n_head
-            self.ff_activation = ff_activation
-            self.d_inner = d_inner
-            self.untie_r = untie_r
-            self.attn_type = attn_type
-            self.initializer_range = initializer_range
-            self.layer_norm_eps = layer_norm_eps
-            self.dropout = dropout
-            self.mem_len = mem_len
-            self.reuse_len = reuse_len
-            self.bi_data = bi_data
-            self.clamp_len = clamp_len
-            self.same_length = same_length
-            self.finetuning_task = finetuning_task
-            self.num_labels = num_labels
-            self.summary_type = summary_type
-            self.summary_use_proj = summary_use_proj
-            self.summary_activation = summary_activation
-            self.summary_last_dropout = summary_last_dropout
-            self.start_n_top = start_n_top
-            self.end_n_top = end_n_top
-        else:
-            raise ValueError("First argument must be either a vocabulary size (int)"
-                             " or the path to a pretrained model config file (str)")
+        self.vocab_size = vocab_size
+        self.d_model = d_model
+        self.n_layer = n_layer
+        self.n_head = n_head
+        assert d_model % n_head == 0
+        self.d_head = d_model // n_head
+        self.ff_activation = ff_activation
+        self.d_inner = d_inner
+        self.untie_r = untie_r
+        self.attn_type = attn_type
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.dropout = dropout
+        self.mem_len = mem_len
+        self.reuse_len = reuse_len
+        self.bi_data = bi_data
+        self.clamp_len = clamp_len
+        self.same_length = same_length
+        self.summary_type = summary_type
+        self.summary_use_proj = summary_use_proj
+        self.summary_activation = summary_activation
+        self.summary_last_dropout = summary_last_dropout
+        self.start_n_top = start_n_top
+        self.end_n_top = end_n_top
     @property
     def max_position_embeddings(self):
         return -1

     @property
-    def vocab_size(self):
-        return self.n_token
+    def n_token(self):  # Backward compatibility
+        return self.vocab_size

-    @vocab_size.setter
-    def vocab_size(self, value):
-        self.n_token = value
+    @n_token.setter
+    def n_token(self, value):  # Backward compatibility
+        self.vocab_size = value

     @property
     def hidden_size(self):
...
@@ -32,9 +32,10 @@ from transformers import (load_pytorch_checkpoint_in_tf2_model,
                           TransfoXLConfig, TFTransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,
                           OpenAIGPTConfig, TFOpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
                           RobertaConfig, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
-                          DistilBertConfig, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+                          DistilBertConfig, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering, TFDistilBertForSequenceClassification, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
                           CTRLConfig, TFCTRLLMHeadModel, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP,
-                          AlbertConfig, TFAlbertForMaskedLM, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP)
+                          AlbertConfig, TFAlbertForMaskedLM, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+                          T5Config, TFT5WithLMHeadModel, T5_PRETRAINED_CONFIG_ARCHIVE_MAP)

 if is_torch_available():
     import torch
@@ -46,9 +47,10 @@ if is_torch_available():
                               TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
                               OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
                               RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
-                              DistilBertForMaskedLM, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+                              DistilBertForMaskedLM, DistilBertForQuestionAnswering, DistilBertForSequenceClassification, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
                               CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
-                              AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
+                              AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+                              T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP)
 else:
     (BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
     GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
@@ -57,9 +59,10 @@ else:
     TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
     OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
     RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
-    DistilBertForMaskedLM, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+    DistilBertForMaskedLM, DistilBertForSequenceClassification, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
     CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
-    AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP) = (
+    AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+    T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP) = (
         None, None, None, None,
         None, None,
         None, None,
@@ -67,7 +70,8 @@ else:
         None, None,
         None, None,
         None, None, None,
-        None, None, None,
+        None, None, None, None,
+        None, None,
         None, None,
         None, None)
@@ -89,8 +93,10 @@ MODEL_CLASSES = {
     'roberta-large-mnli': (RobertaConfig, TFRobertaForSequenceClassification, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP),
'distilbert': (DistilBertConfig, TFDistilBertForMaskedLM, DistilBertForMaskedLM, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP), 'distilbert': (DistilBertConfig, TFDistilBertForMaskedLM, DistilBertForMaskedLM, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
'distilbert-base-uncased-distilled-squad': (DistilBertConfig, TFDistilBertForQuestionAnswering, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP), 'distilbert-base-uncased-distilled-squad': (DistilBertConfig, TFDistilBertForQuestionAnswering, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
'distilbert-base-uncased-distilled-squad': (DistilBertConfig, TFDistilBertForQuestionAnswering, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
'ctrl': (CTRLConfig, TFCTRLLMHeadModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP), 'ctrl': (CTRLConfig, TFCTRLLMHeadModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP),
'albert': (AlbertConfig, TFAlbertForMaskedLM, AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP) 'albert': (AlbertConfig, TFAlbertForMaskedLM, AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
't5': (T5Config, TFT5WithLMHeadModel, T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP, T5_PRETRAINED_CONFIG_ARCHIVE_MAP),
} }
def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file, tf_dump_path, compare_with_pt_model=False, use_cached_models=True): def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file, tf_dump_path, compare_with_pt_model=False, use_cached_models=True):
...@@ -115,23 +121,21 @@ def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file ...@@ -115,23 +121,21 @@ def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file
tf_model = load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path) tf_model = load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path)
if compare_with_pt_model: if compare_with_pt_model:
inputs_list = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]] tfo = tf_model(tf_model.dummy_inputs, training=False) # build the network
tf_inputs = tf.constant(inputs_list)
tfo = tf_model(tf_inputs, training=False) # build the network
pt_model = pt_model_class.from_pretrained(None, state_dict = torch.load(pytorch_checkpoint_path, map_location='cpu')
pt_model = pt_model_class.from_pretrained(pretrained_model_name_or_path=None,
config=config, config=config,
state_dict=torch.load(pytorch_checkpoint_path, state_dict=state_dict)
map_location='cpu'))
pt_inputs = torch.tensor(inputs_list)
with torch.no_grad(): with torch.no_grad():
pto = pt_model(pt_inputs) pto = pt_model(**pt_model.dummy_inputs)
np_pt = pto[0].detach().numpy() np_pt = pto[0].numpy()
np_tf = tfo[0].numpy() np_tf = tfo[0].numpy()
diff = np.amax(np.abs(np_pt - np_tf)) diff = np.amax(np.abs(np_pt - np_tf))
print("Max absolute difference between models outputs {}".format(diff)) print("Max absolute difference between models outputs {}".format(diff))
assert diff <= 2e-2, "Error, model absolute difference is >2e-2" assert diff <= 2e-2, "Error, model absolute difference is >2e-2: {}".format(diff)
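The comparison path above runs both models on their dummy inputs and checks the maximum absolute difference of the first output. A small sketch of that check with toy numpy arrays standing in for the real model outputs:

import numpy as np

# Stand-ins for pto[0] (PyTorch) and tfo[0] (TensorFlow) converted to numpy.
np_pt = np.array([[0.10, -0.20], [0.30, 0.40]])
np_tf = np.array([[0.11, -0.21], [0.29, 0.41]])

diff = np.amax(np.abs(np_pt - np_tf))
print("Max absolute difference between models outputs {}".format(diff))
assert diff <= 2e-2, "Error, model absolute difference is >2e-2: {}".format(diff)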
# Save pytorch-model # Save pytorch-model
print("Save TensorFlow model to {}".format(tf_dump_path)) print("Save TensorFlow model to {}".format(tf_dump_path))
...@@ -139,7 +143,7 @@ def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file ...@@ -139,7 +143,7 @@ def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file
def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortcut_names_or_path=None, config_shortcut_names_or_path=None, def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortcut_names_or_path=None, config_shortcut_names_or_path=None,
compare_with_pt_model=False, use_cached_models=False, only_convert_finetuned_models=False): compare_with_pt_model=False, use_cached_models=False, remove_cached_files=False, only_convert_finetuned_models=False):
assert os.path.isdir(args.tf_dump_path), "--tf_dump_path should be a directory" assert os.path.isdir(args.tf_dump_path), "--tf_dump_path should be a directory"
if args_model_type is None: if args_model_type is None:
...@@ -187,13 +191,15 @@ def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortc ...@@ -187,13 +191,15 @@ def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortc
if os.path.isfile(model_shortcut_name): if os.path.isfile(model_shortcut_name):
model_shortcut_name = 'converted_model' model_shortcut_name = 'converted_model'
convert_pt_checkpoint_to_tf(model_type=model_type, convert_pt_checkpoint_to_tf(model_type=model_type,
pytorch_checkpoint_path=model_file, pytorch_checkpoint_path=model_file,
config_file=config_file, config_file=config_file,
tf_dump_path=os.path.join(tf_dump_path, model_shortcut_name + '-tf_model.h5'), tf_dump_path=os.path.join(tf_dump_path, model_shortcut_name + '-tf_model.h5'),
compare_with_pt_model=compare_with_pt_model) compare_with_pt_model=compare_with_pt_model)
os.remove(config_file) if remove_cached_files:
os.remove(model_file) os.remove(config_file)
os.remove(model_file)
if __name__ == "__main__": if __name__ == "__main__":
...@@ -226,6 +232,9 @@ if __name__ == "__main__": ...@@ -226,6 +232,9 @@ if __name__ == "__main__":
parser.add_argument("--use_cached_models", parser.add_argument("--use_cached_models",
action='store_true', action='store_true',
help = "Use cached models if possible instead of updating to latest checkpoint versions.") help = "Use cached models if possible instead of updating to latest checkpoint versions.")
parser.add_argument("--remove_cached_files",
action='store_true',
help = "Remove pytorch models after conversion (save memory when converting in batches).")
parser.add_argument("--only_convert_finetuned_models", parser.add_argument("--only_convert_finetuned_models",
action='store_true', action='store_true',
help = "Only convert finetuned models.") help = "Only convert finetuned models.")
...@@ -245,4 +254,5 @@ if __name__ == "__main__": ...@@ -245,4 +254,5 @@ if __name__ == "__main__":
config_shortcut_names_or_path=[args.config_file] if args.config_file is not None else None, config_shortcut_names_or_path=[args.config_file] if args.config_file is not None else None,
compare_with_pt_model=args.compare_with_pt_model, compare_with_pt_model=args.compare_with_pt_model,
use_cached_models=args.use_cached_models, use_cached_models=args.use_cached_models,
remove_cached_files=args.remove_cached_files,
only_convert_finetuned_models=args.only_convert_finetuned_models) only_convert_finetuned_models=args.only_convert_finetuned_models)
...@@ -20,6 +20,13 @@ import argparse ...@@ -20,6 +20,13 @@ import argparse
import logging import logging
import numpy as np import numpy as np
import torch import torch
import pathlib
import fairseq
from packaging import version
if version.parse(fairseq.__version__) < version.parse("0.9.0"):
raise Exception("requires fairseq >= 0.9.0")
from fairseq.models.roberta import RobertaModel as FairseqRobertaModel from fairseq.models.roberta import RobertaModel as FairseqRobertaModel
from fairseq.modules import TransformerSentenceEncoderLayer from fairseq.modules import TransformerSentenceEncoderLayer
...@@ -45,8 +52,9 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_ ...@@ -45,8 +52,9 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
""" """
roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path) roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path)
roberta.eval() # disable dropout roberta.eval() # disable dropout
roberta_sent_encoder = roberta.model.decoder.sentence_encoder
config = BertConfig( config = BertConfig(
vocab_size_or_config_json_file=50265, vocab_size=roberta_sent_encoder.embed_tokens.num_embeddings,
hidden_size=roberta.args.encoder_embed_dim, hidden_size=roberta.args.encoder_embed_dim,
num_hidden_layers=roberta.args.encoder_layers, num_hidden_layers=roberta.args.encoder_layers,
num_attention_heads=roberta.args.encoder_attention_heads, num_attention_heads=roberta.args.encoder_attention_heads,
...@@ -64,7 +72,6 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_ ...@@ -64,7 +72,6 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
# Now let's copy all the weights. # Now let's copy all the weights.
# Embeddings # Embeddings
roberta_sent_encoder = roberta.model.decoder.sentence_encoder
model.roberta.embeddings.word_embeddings.weight = roberta_sent_encoder.embed_tokens.weight model.roberta.embeddings.word_embeddings.weight = roberta_sent_encoder.embed_tokens.weight
model.roberta.embeddings.position_embeddings.weight = roberta_sent_encoder.embed_positions.weight model.roberta.embeddings.position_embeddings.weight = roberta_sent_encoder.embed_positions.weight
model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(model.roberta.embeddings.token_type_embeddings.weight) # just zero them out b/c RoBERTa doesn't use them. model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(model.roberta.embeddings.token_type_embeddings.weight) # just zero them out b/c RoBERTa doesn't use them.
...@@ -79,15 +86,18 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_ ...@@ -79,15 +86,18 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
### self attention ### self attention
self_attn: BertSelfAttention = layer.attention.self self_attn: BertSelfAttention = layer.attention.self
assert( assert(
roberta_layer.self_attn.in_proj_weight.shape == torch.Size((3 * config.hidden_size, config.hidden_size)) roberta_layer.self_attn.k_proj.weight.data.shape == \
roberta_layer.self_attn.q_proj.weight.data.shape == \
roberta_layer.self_attn.v_proj.weight.data.shape == \
torch.Size((config.hidden_size, config.hidden_size))
) )
# we use three distinct linear layers so we split the source layer here.
self_attn.query.weight.data = roberta_layer.self_attn.in_proj_weight[:config.hidden_size, :] self_attn.query.weight.data = roberta_layer.self_attn.q_proj.weight
self_attn.query.bias.data = roberta_layer.self_attn.in_proj_bias[:config.hidden_size] self_attn.query.bias.data = roberta_layer.self_attn.q_proj.bias
self_attn.key.weight.data = roberta_layer.self_attn.in_proj_weight[config.hidden_size:2*config.hidden_size, :] self_attn.key.weight.data = roberta_layer.self_attn.k_proj.weight
self_attn.key.bias.data = roberta_layer.self_attn.in_proj_bias[config.hidden_size:2*config.hidden_size] self_attn.key.bias.data = roberta_layer.self_attn.k_proj.bias
self_attn.value.weight.data = roberta_layer.self_attn.in_proj_weight[2*config.hidden_size:, :] self_attn.value.weight.data = roberta_layer.self_attn.v_proj.weight
self_attn.value.bias.data = roberta_layer.self_attn.in_proj_bias[2*config.hidden_size:] self_attn.value.bias.data = roberta_layer.self_attn.v_proj.bias
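The new mapping copies fairseq's separate q/k/v projections straight into the BERT-style query/key/value layers instead of slicing a fused `in_proj_weight`. A standalone sketch of that copy with plain nn.Linear stand-ins (hypothetical sizes, not the actual models):

import torch
import torch.nn as nn

hidden_size = 8
# Stand-ins: fairseq-style separate projections and BERT-style self-attention layers.
src = nn.ModuleDict({'q_proj': nn.Linear(hidden_size, hidden_size),
                     'k_proj': nn.Linear(hidden_size, hidden_size),
                     'v_proj': nn.Linear(hidden_size, hidden_size)})
dst = nn.ModuleDict({'query': nn.Linear(hidden_size, hidden_size),
                     'key': nn.Linear(hidden_size, hidden_size),
                     'value': nn.Linear(hidden_size, hidden_size)})

# Same shape check as in the conversion script, then a one-to-one weight/bias copy.
assert src['q_proj'].weight.data.shape == torch.Size((hidden_size, hidden_size))
for bert_name, fairseq_name in [('query', 'q_proj'), ('key', 'k_proj'), ('value', 'v_proj')]:
    dst[bert_name].weight.data = src[fairseq_name].weight
    dst[bert_name].bias.data = src[fairseq_name].bias

assert torch.equal(dst['query'].weight, src['q_proj'].weight)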
### self-attention output ### self-attention output
self_output: BertSelfOutput = layer.attention.output self_output: BertSelfOutput = layer.attention.output
...@@ -151,6 +161,7 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_ ...@@ -151,6 +161,7 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
if not success: if not success:
raise Exception("Something went wRoNg") raise Exception("Something went wRoNg")
pathlib.Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True)
print(f"Saving model to {pytorch_dump_folder_path}") print(f"Saving model to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path) model.save_pretrained(pytorch_dump_folder_path)
......
# coding=utf-8
# Copyright 2018 The T5 authors and HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert T5 checkpoint."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import torch
from transformers import T5Config, T5Model, load_tf_weights_in_t5
import logging
logging.basicConfig(level=logging.INFO)
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path):
# Initialise PyTorch model
config = T5Config.from_json_file(config_file)
print("Building PyTorch model from configuration: {}".format(str(config)))
model = T5Model(config)
# Load weights from tf checkpoint
load_tf_weights_in_t5(model, config, tf_checkpoint_path)
# Save pytorch-model
print("Save PyTorch model to {}".format(pytorch_dump_path))
torch.save(model.state_dict(), pytorch_dump_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
## Required parameters
parser.add_argument("--tf_checkpoint_path",
default = None,
type = str,
required = True,
help = "Path to the TensorFlow checkpoint path.")
parser.add_argument("--config_file",
default = None,
type = str,
required = True,
help = "The config json file corresponding to the pre-trained T5 model. \n"
"This specifies the model architecture.")
parser.add_argument("--pytorch_dump_path",
default = None,
type = str,
required = True,
help = "Path to the output PyTorch model.")
args = parser.parse_args()
convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path,
args.config_file,
args.pytorch_dump_path)
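Besides the CLI entry point, the conversion function above can be driven directly from Python; the paths below are placeholders, not files shipped with the repository:

# Placeholder paths; substitute a real T5 TF checkpoint and its config.json.
tf_checkpoint_path = "/tmp/t5/model.ckpt"
config_file = "/tmp/t5/config.json"
pytorch_dump_path = "/tmp/t5/pytorch_model.bin"

convert_tf_checkpoint_to_pytorch(tf_checkpoint_path,
                                 config_file,
                                 pytorch_dump_path)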
from .processors import InputExample, InputFeatures, DataProcessor from .processors import InputExample, InputFeatures, DataProcessor, SquadFeatures, SingleSentenceClassificationProcessor
from .processors import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features from .processors import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
from .processors import squad_convert_examples_to_features, SquadExample, SquadV1Processor, SquadV2Processor
from .processors import xnli_output_modes, xnli_processors, xnli_tasks_num_labels from .processors import xnli_output_modes, xnli_processors, xnli_tasks_num_labels
from .metrics import is_sklearn_available from .metrics import is_sklearn_available
......
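These re-exports give a single entry point for SQuAD preprocessing. A hedged usage sketch — the data directory, file layout and keyword arguments are assumptions; check the actual signature of `squad_convert_examples_to_features` in `squad.py` before relying on them:

from transformers import BertTokenizer, SquadV2Processor, squad_convert_examples_to_features

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
processor = SquadV2Processor()
# Assumes dev-v2.0.json lives in ./squad_data; adjust to your layout.
examples = processor.get_dev_examples("./squad_data")

features = squad_convert_examples_to_features(
    examples=examples,
    tokenizer=tokenizer,
    max_seq_length=384,
    doc_stride=128,
    max_query_length=64,
    is_training=False,
)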
""" Very heavily inspired by the official evaluation script for SQuAD version 2.0 which was
modified by XLNet authors to update `find_best_threshold` scripts for SQuAD V2.0
In addition to basic functionality, we also compute additional statistics and
plot precision-recall curves if an additional na_prob.json file is provided.
This file is expected to map question ID's to the model's predicted probability
that a question is unanswerable.
"""
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Load SQuAD dataset. """
from __future__ import absolute_import, division, print_function
import json import json
import logging import logging
...@@ -24,481 +14,372 @@ import math ...@@ -24,481 +14,372 @@ import math
import collections import collections
from io import open from io import open
from tqdm import tqdm from tqdm import tqdm
import string
import re
from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize
# Required by XLNet evaluation method to compute optimal threshold (see write_predictions_extended() method)
from utils_squad_evaluate import find_all_best_thresh_v2, make_qid_to_has_ans, get_raw_scores
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
class SquadExample(object): def normalize_answer(s):
"""Lower text and remove punctuation, articles and extra whitespace."""
def remove_articles(text):
regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
return re.sub(regex, ' ', text)
def white_space_fix(text):
return ' '.join(text.split())
def remove_punc(text):
exclude = set(string.punctuation)
return ''.join(ch for ch in text if ch not in exclude)
def lower(text):
return text.lower()
return white_space_fix(remove_articles(remove_punc(lower(s))))
def get_tokens(s):
if not s:
return []
return normalize_answer(s).split()
def compute_exact(a_gold, a_pred):
return int(normalize_answer(a_gold) == normalize_answer(a_pred))
def compute_f1(a_gold, a_pred):
gold_toks = get_tokens(a_gold)
pred_toks = get_tokens(a_pred)
common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
num_same = sum(common.values())
if len(gold_toks) == 0 or len(pred_toks) == 0:
# If either is no-answer, then F1 is 1 if they agree, 0 otherwise
return int(gold_toks == pred_toks)
if num_same == 0:
return 0
precision = 1.0 * num_same / len(pred_toks)
recall = 1.0 * num_same / len(gold_toks)
f1 = (2 * precision * recall) / (precision + recall)
return f1
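A quick sanity check of the two metrics defined above, on toy strings rather than SQuAD data (assumes the functions above are in scope):

# normalize_answer strips articles, punctuation and case, so these count as an exact match.
assert compute_exact("The Eiffel Tower", "eiffel tower") == 1

# One of the two gold tokens overlaps the single predicted token:
# precision = 1/1, recall = 1/2, F1 = 2 * 1 * 0.5 / 1.5 = 2/3.
assert abs(compute_f1("eiffel tower", "tower") - 2.0 / 3.0) < 1e-9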
def get_raw_scores(examples, preds):
""" """
A single training/test example for the Squad dataset. Computes the exact and f1 scores from the examples and the model predictions
For examples without an answer, the start and end position are -1.
""" """
exact_scores = {}
f1_scores = {}
for example in examples:
qas_id = example.qas_id
gold_answers = [answer['text'] for answer in example.answers if normalize_answer(answer['text'])]
if not gold_answers:
# For unanswerable questions, only correct answer is empty string
gold_answers = ['']
if qas_id not in preds:
print('Missing prediction for %s' % qas_id)
continue
prediction = preds[qas_id]
exact_scores[qas_id] = max(compute_exact(a, prediction) for a in gold_answers)
f1_scores[qas_id] = max(compute_f1(a, prediction) for a in gold_answers)
return exact_scores, f1_scores
def __init__(self,
qas_id, def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh):
question_text, new_scores = {}
doc_tokens, for qid, s in scores.items():
orig_answer_text=None, pred_na = na_probs[qid] > na_prob_thresh
start_position=None, if pred_na:
end_position=None, new_scores[qid] = float(not qid_to_has_ans[qid])
is_impossible=None): else:
self.qas_id = qas_id new_scores[qid] = s
self.question_text = question_text return new_scores
self.doc_tokens = doc_tokens
self.orig_answer_text = orig_answer_text
self.start_position = start_position def make_eval_dict(exact_scores, f1_scores, qid_list=None):
self.end_position = end_position if not qid_list:
self.is_impossible = is_impossible total = len(exact_scores)
return collections.OrderedDict([
def __str__(self): ('exact', 100.0 * sum(exact_scores.values()) / total),
return self.__repr__() ('f1', 100.0 * sum(f1_scores.values()) / total),
('total', total),
def __repr__(self): ])
s = "" else:
s += "qas_id: %s" % (self.qas_id) total = len(qid_list)
s += ", question_text: %s" % ( return collections.OrderedDict([
self.question_text) ('exact', 100.0 * sum(exact_scores[k] for k in qid_list) / total),
s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) ('f1', 100.0 * sum(f1_scores[k] for k in qid_list) / total),
if self.start_position: ('total', total),
s += ", start_position: %d" % (self.start_position) ])
if self.end_position:
s += ", end_position: %d" % (self.end_position)
if self.is_impossible: def merge_eval(main_eval, new_eval, prefix):
s += ", is_impossible: %r" % (self.is_impossible) for k in new_eval:
return s main_eval['%s_%s' % (prefix, k)] = new_eval[k]
class InputFeatures(object): def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans):
"""A single set of features of data.""" num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
cur_score = num_no_ans
def __init__(self, best_score = cur_score
unique_id, best_thresh = 0.0
example_index, qid_list = sorted(na_probs, key=lambda k: na_probs[k])
doc_span_index, for i, qid in enumerate(qid_list):
tokens, if qid not in scores:
token_to_orig_map, continue
token_is_max_context, if qid_to_has_ans[qid]:
input_ids, diff = scores[qid]
input_mask, else:
segment_ids, if preds[qid]:
cls_index, diff = -1
p_mask,
paragraph_len,
start_position=None,
end_position=None,
is_impossible=None):
self.unique_id = unique_id
self.example_index = example_index
self.doc_span_index = doc_span_index
self.tokens = tokens
self.token_to_orig_map = token_to_orig_map
self.token_is_max_context = token_is_max_context
self.input_ids = input_ids
self.input_mask = input_mask
self.segment_ids = segment_ids
self.cls_index = cls_index
self.p_mask = p_mask
self.paragraph_len = paragraph_len
self.start_position = start_position
self.end_position = end_position
self.is_impossible = is_impossible
def read_squad_examples(input_file, is_training, version_2_with_negative):
"""Read a SQuAD json file into a list of SquadExample."""
with open(input_file, "r", encoding='utf-8') as reader:
input_data = json.load(reader)["data"]
def is_whitespace(c):
if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
return True
return False
examples = []
for entry in input_data:
for paragraph in entry["paragraphs"]:
paragraph_text = paragraph["context"]
doc_tokens = []
char_to_word_offset = []
prev_is_whitespace = True
for c in paragraph_text:
if is_whitespace(c):
prev_is_whitespace = True
else:
if prev_is_whitespace:
doc_tokens.append(c)
else:
doc_tokens[-1] += c
prev_is_whitespace = False
char_to_word_offset.append(len(doc_tokens) - 1)
for qa in paragraph["qas"]:
qas_id = qa["id"]
question_text = qa["question"]
start_position = None
end_position = None
orig_answer_text = None
is_impossible = False
if is_training:
if version_2_with_negative:
is_impossible = qa["is_impossible"]
if (len(qa["answers"]) != 1) and (not is_impossible):
raise ValueError(
"For training, each question should have exactly 1 answer.")
if not is_impossible:
answer = qa["answers"][0]
orig_answer_text = answer["text"]
answer_offset = answer["answer_start"]
answer_length = len(orig_answer_text)
start_position = char_to_word_offset[answer_offset]
end_position = char_to_word_offset[answer_offset + answer_length - 1]
# Only add answers where the text can be exactly recovered from the
# document. If this CAN'T happen it's likely due to weird Unicode
# stuff so we will just skip the example.
#
# Note that this means for training mode, every example is NOT
# guaranteed to be preserved.
actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
cleaned_answer_text = " ".join(
whitespace_tokenize(orig_answer_text))
if actual_text.find(cleaned_answer_text) == -1:
logger.warning("Could not find answer: '%s' vs. '%s'",
actual_text, cleaned_answer_text)
continue
else:
start_position = -1
end_position = -1
orig_answer_text = ""
example = SquadExample(
qas_id=qas_id,
question_text=question_text,
doc_tokens=doc_tokens,
orig_answer_text=orig_answer_text,
start_position=start_position,
end_position=end_position,
is_impossible=is_impossible)
examples.append(example)
return examples
def convert_examples_to_features(examples, tokenizer, max_seq_length,
doc_stride, max_query_length, is_training,
cls_token_at_end=False,
cls_token='[CLS]', sep_token='[SEP]', pad_token=0,
sequence_a_segment_id=0, sequence_b_segment_id=1,
cls_token_segment_id=0, pad_token_segment_id=0,
mask_padding_with_zero=True,
sequence_a_is_doc=False):
"""Loads a data file into a list of `InputBatch`s."""
unique_id = 1000000000
# cnt_pos, cnt_neg = 0, 0
# max_N, max_M = 1024, 1024
# f = np.zeros((max_N, max_M), dtype=np.float32)
features = []
for (example_index, example) in enumerate(tqdm(examples)):
# if example_index % 100 == 0:
# logger.info('Converting %s/%s pos %s neg %s', example_index, len(examples), cnt_pos, cnt_neg)
query_tokens = tokenizer.tokenize(example.question_text)
if len(query_tokens) > max_query_length:
query_tokens = query_tokens[0:max_query_length]
tok_to_orig_index = []
orig_to_tok_index = []
all_doc_tokens = []
for (i, token) in enumerate(example.doc_tokens):
orig_to_tok_index.append(len(all_doc_tokens))
sub_tokens = tokenizer.tokenize(token)
for sub_token in sub_tokens:
tok_to_orig_index.append(i)
all_doc_tokens.append(sub_token)
tok_start_position = None
tok_end_position = None
if is_training and example.is_impossible:
tok_start_position = -1
tok_end_position = -1
if is_training and not example.is_impossible:
tok_start_position = orig_to_tok_index[example.start_position]
if example.end_position < len(example.doc_tokens) - 1:
tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
else: else:
tok_end_position = len(all_doc_tokens) - 1 diff = 0
(tok_start_position, tok_end_position) = _improve_answer_span( cur_score += diff
all_doc_tokens, tok_start_position, tok_end_position, tokenizer, if cur_score > best_score:
example.orig_answer_text) best_score = cur_score
best_thresh = na_probs[qid]
# The -3 accounts for [CLS], [SEP] and [SEP]
max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 has_ans_score, has_ans_cnt = 0, 0
assert max_tokens_for_doc > 0 for qid in qid_list:
if not qid_to_has_ans[qid]:
# We can have documents that are longer than the maximum sequence length. continue
# To deal with this we do a sliding window approach, where we take chunks has_ans_cnt += 1
# of the up to our max length with a stride of `doc_stride`.
_DocSpan = collections.namedtuple( # pylint: disable=invalid-name if qid not in scores:
"DocSpan", ["start", "length"]) continue
doc_spans = [] has_ans_score += scores[qid]
start_offset = 0
while start_offset < len(all_doc_tokens): return 100.0 * best_score / len(scores), best_thresh, 1.0 * has_ans_score / has_ans_cnt
length = len(all_doc_tokens) - start_offset
if length > max_tokens_for_doc:
length = max_tokens_for_doc def find_all_best_thresh_v2(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
doc_spans.append(_DocSpan(start=start_offset, length=length)) best_exact, exact_thresh, has_ans_exact = find_best_thresh_v2(
if start_offset + length == len(all_doc_tokens): preds, exact_raw, na_probs, qid_to_has_ans)
break best_f1, f1_thresh, has_ans_f1 = find_best_thresh_v2(
start_offset += min(length, doc_stride) preds, f1_raw, na_probs, qid_to_has_ans)
main_eval['best_exact'] = best_exact
for (doc_span_index, doc_span) in enumerate(doc_spans): main_eval['best_exact_thresh'] = exact_thresh
tokens = [] main_eval['best_f1'] = best_f1
token_to_orig_map = {} main_eval['best_f1_thresh'] = f1_thresh
token_is_max_context = {} main_eval['has_ans_exact'] = has_ans_exact
segment_ids = [] main_eval['has_ans_f1'] = has_ans_f1
# p_mask: mask with 1 for tokens that cannot be in the answer (0 for tokens which can be in an answer)
# Original TF implem also keeps the classification token (set to 0) (not sure why...) def find_best_thresh(preds, scores, na_probs, qid_to_has_ans):
p_mask = [] num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
cur_score = num_no_ans
# CLS token at the beginning best_score = cur_score
if not cls_token_at_end: best_thresh = 0.0
tokens.append(cls_token) qid_list = sorted(na_probs, key=lambda k: na_probs[k])
segment_ids.append(cls_token_segment_id) for _, qid in enumerate(qid_list):
p_mask.append(0) if qid not in scores:
cls_index = 0 continue
if qid_to_has_ans[qid]:
# XLNet: P SEP Q SEP CLS diff = scores[qid]
# Others: CLS Q SEP P SEP else:
if not sequence_a_is_doc: if preds[qid]:
# Query diff = -1
tokens += query_tokens else:
segment_ids += [sequence_a_segment_id] * len(query_tokens) diff = 0
p_mask += [1] * len(query_tokens) cur_score += diff
if cur_score > best_score:
# SEP token best_score = cur_score
tokens.append(sep_token) best_thresh = na_probs[qid]
segment_ids.append(sequence_a_segment_id) return 100.0 * best_score / len(scores), best_thresh
p_mask.append(1)
# Paragraph def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
for i in range(doc_span.length): best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans)
split_token_index = doc_span.start + i best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans)
token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
main_eval['best_exact'] = best_exact
is_max_context = _check_is_max_context(doc_spans, doc_span_index, main_eval['best_exact_thresh'] = exact_thresh
split_token_index) main_eval['best_f1'] = best_f1
token_is_max_context[len(tokens)] = is_max_context main_eval['best_f1_thresh'] = f1_thresh
tokens.append(all_doc_tokens[split_token_index])
if not sequence_a_is_doc:
segment_ids.append(sequence_b_segment_id) def squad_evaluate(examples, preds, no_answer_probs=None, no_answer_probability_threshold=1.0):
else: qas_id_to_has_answer = {example.qas_id: bool(example.answers) for example in examples}
segment_ids.append(sequence_a_segment_id) has_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if has_answer]
p_mask.append(0) no_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if not has_answer]
paragraph_len = doc_span.length
if no_answer_probs is None:
if sequence_a_is_doc: no_answer_probs = {k: 0.0 for k in preds}
# SEP token
tokens.append(sep_token) exact, f1 = get_raw_scores(examples, preds)
segment_ids.append(sequence_a_segment_id)
p_mask.append(1) exact_threshold = apply_no_ans_threshold(exact, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold)
f1_threshold = apply_no_ans_threshold(f1, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold)
tokens += query_tokens
segment_ids += [sequence_b_segment_id] * len(query_tokens) evaluation = make_eval_dict(exact_threshold, f1_threshold)
p_mask += [1] * len(query_tokens)
if has_answer_qids:
# SEP token has_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=has_answer_qids)
tokens.append(sep_token) merge_eval(evaluation, has_ans_eval, 'HasAns')
segment_ids.append(sequence_b_segment_id)
p_mask.append(1) if no_answer_qids:
no_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=no_answer_qids)
# CLS token at the end merge_eval(evaluation, no_ans_eval, 'NoAns')
if cls_token_at_end:
tokens.append(cls_token) if no_answer_probs:
segment_ids.append(cls_token_segment_id) find_all_best_thresh(evaluation, preds, exact, f1, no_answer_probs, qas_id_to_has_answer)
p_mask.append(0)
cls_index = len(tokens) - 1 # Index of classification token return evaluation
input_ids = tokenizer.convert_tokens_to_ids(tokens)
def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
# The mask has 1 for real tokens and 0 for padding tokens. Only real """Project the tokenized prediction back to the original text."""
# tokens are attended to.
input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) # When we created the data, we kept track of the alignment between original
# (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
# Zero-pad up to the sequence length. # now `orig_text` contains the span of our original text corresponding to the
while len(input_ids) < max_seq_length: # span that we predicted.
input_ids.append(pad_token)
input_mask.append(0 if mask_padding_with_zero else 1)
segment_ids.append(pad_token_segment_id)
p_mask.append(1)
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length
span_is_impossible = example.is_impossible
start_position = None
end_position = None
if is_training and not span_is_impossible:
# For training, if our document chunk does not contain an annotation
# we throw it out, since there is nothing to predict.
doc_start = doc_span.start
doc_end = doc_span.start + doc_span.length - 1
out_of_span = False
if not (tok_start_position >= doc_start and
tok_end_position <= doc_end):
out_of_span = True
if out_of_span:
start_position = 0
end_position = 0
span_is_impossible = True
else:
if sequence_a_is_doc:
doc_offset = 0
else:
doc_offset = len(query_tokens) + 2
start_position = tok_start_position - doc_start + doc_offset
end_position = tok_end_position - doc_start + doc_offset
if is_training and span_is_impossible:
start_position = cls_index
end_position = cls_index
if example_index < 20:
logger.info("*** Example ***")
logger.info("unique_id: %s" % (unique_id))
logger.info("example_index: %s" % (example_index))
logger.info("doc_span_index: %s" % (doc_span_index))
logger.info("tokens: %s" % " ".join(tokens))
logger.info("token_to_orig_map: %s" % " ".join([
"%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()]))
logger.info("token_is_max_context: %s" % " ".join([
"%d:%s" % (x, y) for (x, y) in token_is_max_context.items()
]))
logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
logger.info(
"input_mask: %s" % " ".join([str(x) for x in input_mask]))
logger.info(
"segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
if is_training and span_is_impossible:
logger.info("impossible example")
if is_training and not span_is_impossible:
answer_text = " ".join(tokens[start_position:(end_position + 1)])
logger.info("start_position: %d" % (start_position))
logger.info("end_position: %d" % (end_position))
logger.info(
"answer: %s" % (answer_text))
features.append(
InputFeatures(
unique_id=unique_id,
example_index=example_index,
doc_span_index=doc_span_index,
tokens=tokens,
token_to_orig_map=token_to_orig_map,
token_is_max_context=token_is_max_context,
input_ids=input_ids,
input_mask=input_mask,
segment_ids=segment_ids,
cls_index=cls_index,
p_mask=p_mask,
paragraph_len=paragraph_len,
start_position=start_position,
end_position=end_position,
is_impossible=span_is_impossible))
unique_id += 1
return features
def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
orig_answer_text):
"""Returns tokenized answer spans that better match the annotated answer."""
# The SQuAD annotations are character based. We first project them to
# whitespace-tokenized words. But then after WordPiece tokenization, we can
# often find a "better match". For example:
#
# Question: What year was John Smith born?
# Context: The leader was John Smith (1895-1943).
# Answer: 1895
# #
# The original whitespace-tokenized answer will be "(1895-1943).". However # However, `orig_text` may contain extra characters that we don't want in
# after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match # our prediction.
# the exact answer, 1895.
# #
# However, this is not always possible. Consider the following: # For example, let's say:
# pred_text = steve smith
# orig_text = Steve Smith's
# #
# Question: What country is the top exporter of electronics? # We don't want to return `orig_text` because it contains the extra "'s".
# Context: The Japanese electronics industry is the largest in the world.
# Answer: Japan
# #
# In this case, the annotator chose "Japan" as a character sub-span of # We don't want to return `pred_text` because it's already been normalized
# the word "Japanese". Since our WordPiece tokenizer does not split # (the SQuAD eval script also does punctuation stripping/lower casing but
# "Japanese", we just use "Japanese" as the annotation. This is fairly rare # our tokenizer does additional normalization like stripping accent
# in SQuAD, but does happen. # characters).
tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
for new_start in range(input_start, input_end + 1):
for new_end in range(input_end, new_start - 1, -1):
text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
if text_span == tok_answer_text:
return (new_start, new_end)
return (input_start, input_end)
def _check_is_max_context(doc_spans, cur_span_index, position):
"""Check if this is the 'max context' doc span for the token."""
# Because of the sliding window approach taken to scoring documents, a single
# token can appear in multiple documents. E.g.
# Doc: the man went to the store and bought a gallon of milk
# Span A: the man went to the
# Span B: to the store and bought
# Span C: and bought a gallon of
# ...
# #
# Now the word 'bought' will have two scores from spans B and C. We only # What we really want to return is "Steve Smith".
# want to consider the score with "maximum context", which we define as
# the *minimum* of its left and right context (the *sum* of left and
# right context will always be the same, of course).
# #
# In the example the maximum context for 'bought' would be span C since # Therefore, we have to apply a semi-complicated alignment heuristic between
# it has 1 left context and 3 right context, while span B has 4 left context # `pred_text` and `orig_text` to get a character-to-character alignment. This
# and 0 right context. # can fail in certain cases in which case we just return `orig_text`.
best_score = None
best_span_index = None def _strip_spaces(text):
for (span_index, doc_span) in enumerate(doc_spans): ns_chars = []
end = doc_span.start + doc_span.length - 1 ns_to_s_map = collections.OrderedDict()
if position < doc_span.start: for (i, c) in enumerate(text):
continue if c == " ":
if position > end: continue
continue ns_to_s_map[len(ns_chars)] = i
num_left_context = position - doc_span.start ns_chars.append(c)
num_right_context = end - position ns_text = "".join(ns_chars)
score = min(num_left_context, num_right_context) + 0.01 * doc_span.length return (ns_text, ns_to_s_map)
if best_score is None or score > best_score:
best_score = score # We first tokenize `orig_text`, strip whitespace from the result
best_span_index = span_index # and `pred_text`, and check if they are the same length. If they are
# NOT the same length, the heuristic has failed. If they are the same
# length, we assume the characters are one-to-one aligned.
tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
tok_text = " ".join(tokenizer.tokenize(orig_text))
start_position = tok_text.find(pred_text)
if start_position == -1:
if verbose_logging:
logger.info(
"Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
return orig_text
end_position = start_position + len(pred_text) - 1
(orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
(tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
if len(orig_ns_text) != len(tok_ns_text):
if verbose_logging:
logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
orig_ns_text, tok_ns_text)
return orig_text
# We then project the characters in `pred_text` back to `orig_text` using
# the character-to-character alignment.
tok_s_to_ns_map = {}
for (i, tok_index) in tok_ns_to_s_map.items():
tok_s_to_ns_map[tok_index] = i
orig_start_position = None
if start_position in tok_s_to_ns_map:
ns_start_position = tok_s_to_ns_map[start_position]
if ns_start_position in orig_ns_to_s_map:
orig_start_position = orig_ns_to_s_map[ns_start_position]
if orig_start_position is None:
if verbose_logging:
logger.info("Couldn't map start position")
return orig_text
orig_end_position = None
if end_position in tok_s_to_ns_map:
ns_end_position = tok_s_to_ns_map[end_position]
if ns_end_position in orig_ns_to_s_map:
orig_end_position = orig_ns_to_s_map[ns_end_position]
return cur_span_index == best_span_index if orig_end_position is None:
if verbose_logging:
logger.info("Couldn't map end position")
return orig_text
output_text = orig_text[orig_start_position:(orig_end_position + 1)]
return output_text
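Worked through the "Steve Smith" example from the comments above, the heuristic recovers the original casing and drops the trailing "'s" — a sanity check, assuming the function is in scope:

# pred_text is the normalized model output, orig_text the raw document span.
recovered = get_final_text("steve smith", "Steve Smith's", do_lower_case=True)
assert recovered == "Steve Smith"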
RawResult = collections.namedtuple("RawResult",
["unique_id", "start_logits", "end_logits"])
def write_predictions(all_examples, all_features, all_results, n_best_size, def _get_best_indexes(logits, n_best_size):
max_answer_length, do_lower_case, output_prediction_file, """Get the n-best logits from a list."""
output_nbest_file, output_null_log_odds_file, verbose_logging, index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)
version_2_with_negative, null_score_diff_threshold):
best_indexes = []
for i in range(len(index_and_score)):
if i >= n_best_size:
break
best_indexes.append(index_and_score[i][0])
return best_indexes
def _compute_softmax(scores):
"""Compute softmax probability over raw logits."""
if not scores:
return []
max_score = None
for score in scores:
if max_score is None or score > max_score:
max_score = score
exp_scores = []
total_sum = 0.0
for score in scores:
x = math.exp(score - max_score)
exp_scores.append(x)
total_sum += x
probs = []
for score in exp_scores:
probs.append(score / total_sum)
return probs
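The two helpers above behave like a plain top-k selection and a numerically stable softmax. A small check on toy logits (assumes the functions above are in scope):

logits = [2.0, 0.5, 1.0]

# Indexes of the two largest logits, best first.
assert _get_best_indexes(logits, n_best_size=2) == [0, 2]

# Softmax sums to one and preserves the ordering of the logits.
probs = _compute_softmax(logits)
assert abs(sum(probs) - 1.0) < 1e-9
assert probs[0] > probs[2] > probs[1]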
def compute_predictions_logits(
all_examples,
all_features,
all_results,
n_best_size,
max_answer_length,
do_lower_case,
output_prediction_file,
output_nbest_file,
output_null_log_odds_file,
verbose_logging,
version_2_with_negative,
null_score_diff_threshold,
tokenizer,
):
"""Write final predictions to the json file and log-odds of null if needed.""" """Write final predictions to the json file and log-odds of null if needed."""
logger.info("Writing predictions to: %s" % (output_prediction_file)) logger.info("Writing predictions to: %s" % (output_prediction_file))
logger.info("Writing nbest to: %s" % (output_nbest_file)) logger.info("Writing nbest to: %s" % (output_nbest_file))
...@@ -594,11 +475,14 @@ def write_predictions(all_examples, all_features, all_results, n_best_size, ...@@ -594,11 +475,14 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
orig_doc_start = feature.token_to_orig_map[pred.start_index] orig_doc_start = feature.token_to_orig_map[pred.start_index]
orig_doc_end = feature.token_to_orig_map[pred.end_index] orig_doc_end = feature.token_to_orig_map[pred.end_index]
orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)] orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
tok_text = " ".join(tok_tokens)
# De-tokenize WordPieces that have been split off. tok_text = tokenizer.convert_tokens_to_string(tok_tokens)
tok_text = tok_text.replace(" ##", "")
tok_text = tok_text.replace("##", "") # tok_text = " ".join(tok_tokens)
#
# # De-tokenize WordPieces that have been split off.
# tok_text = tok_text.replace(" ##", "")
# tok_text = tok_text.replace("##", "")
# Clean whitespace # Clean whitespace
tok_text = tok_text.strip() tok_text = tok_text.strip()
...@@ -627,12 +511,12 @@ def write_predictions(all_examples, all_features, all_results, n_best_size, ...@@ -627,12 +511,12 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
text="", text="",
start_logit=null_start_logit, start_logit=null_start_logit,
end_logit=null_end_logit)) end_logit=null_end_logit))
# In very rare edge cases we could only have single null prediction. # In very rare edge cases we could only have single null prediction.
# So we just create a nonce prediction in this case to avoid failure. # So we just create a nonce prediction in this case to avoid failure.
if len(nbest)==1: if len(nbest) == 1:
nbest.insert(0, nbest.insert(0,
_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
# In very rare edge cases we could have no valid predictions. So we # In very rare edge cases we could have no valid predictions. So we
# just create a nonce prediction in this case to avoid failure. # just create a nonce prediction in this case to avoid failure.
...@@ -689,18 +573,21 @@ def write_predictions(all_examples, all_features, all_results, n_best_size, ...@@ -689,18 +573,21 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
return all_predictions return all_predictions
# For XLNet (and XLM which uses the same head) def compute_predictions_log_probs(
RawResultExtended = collections.namedtuple("RawResultExtended", all_examples,
["unique_id", "start_top_log_probs", "start_top_index", all_features,
"end_top_log_probs", "end_top_index", "cls_logits"]) all_results,
n_best_size,
max_answer_length,
def write_predictions_extended(all_examples, all_features, all_results, n_best_size, output_prediction_file,
max_answer_length, output_prediction_file, output_nbest_file,
output_nbest_file, output_null_log_odds_file,
output_null_log_odds_file, orig_data_file, start_n_top,
start_n_top, end_n_top, version_2_with_negative, end_n_top,
tokenizer, verbose_logging): version_2_with_negative,
tokenizer,
verbose_logging
):
""" XLNet write prediction logic (more complex than Bert's). """ XLNet write prediction logic (more complex than Bert's).
Write final predictions to the json file and log-odds of null if needed. Write final predictions to the json file and log-odds of null if needed.
...@@ -709,7 +596,7 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s ...@@ -709,7 +596,7 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s
_PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name
"PrelimPrediction", "PrelimPrediction",
["feature_index", "start_index", "end_index", ["feature_index", "start_index", "end_index",
"start_log_prob", "end_log_prob"]) "start_log_prob", "end_log_prob"])
_NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name
"NbestPrediction", ["text", "start_log_prob", "end_log_prob"]) "NbestPrediction", ["text", "start_log_prob", "end_log_prob"])
...@@ -746,12 +633,12 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s ...@@ -746,12 +633,12 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s
for i in range(start_n_top): for i in range(start_n_top):
for j in range(end_n_top): for j in range(end_n_top):
start_log_prob = result.start_top_log_probs[i] start_log_prob = result.start_logits[i]
start_index = result.start_top_index[i] start_index = result.start_top_index[i]
j_index = i * end_n_top + j j_index = i * end_n_top + j
end_log_prob = result.end_top_log_probs[j_index] end_log_prob = result.end_logits[j_index]
end_index = result.end_top_index[j_index] end_index = result.end_top_index[j_index]
# We could hypothetically create invalid predictions, e.g., predict # We could hypothetically create invalid predictions, e.g., predict
...@@ -792,7 +679,7 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s ...@@ -792,7 +679,7 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s
# XLNet un-tokenizer # XLNet un-tokenizer
# Let's keep it simple for now and see if we need all this later. # Let's keep it simple for now and see if we need all this later.
# #
# tok_start_to_orig_index = feature.tok_start_to_orig_index # tok_start_to_orig_index = feature.tok_start_to_orig_index
# tok_end_to_orig_index = feature.tok_end_to_orig_index # tok_end_to_orig_index = feature.tok_end_to_orig_index
# start_orig_pos = tok_start_to_orig_index[pred.start_index] # start_orig_pos = tok_start_to_orig_index[pred.start_index]
...@@ -812,7 +699,12 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s ...@@ -812,7 +699,12 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s
tok_text = " ".join(tok_text.split()) tok_text = " ".join(tok_text.split())
orig_text = " ".join(orig_tokens) orig_text = " ".join(orig_tokens)
final_text = get_final_text(tok_text, orig_text, tokenizer.do_lower_case, if hasattr(tokenizer, "do_lower_case"):
do_lower_case = tokenizer.do_lower_case
else:
do_lower_case = tokenizer.do_lowercase_and_remove_accent
final_text = get_final_text(tok_text, orig_text, do_lower_case,
verbose_logging) verbose_logging)
if final_text in seen_predictions: if final_text in seen_predictions:
...@@ -872,146 +764,4 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s ...@@ -872,146 +764,4 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s
with open(output_null_log_odds_file, "w") as writer: with open(output_null_log_odds_file, "w") as writer:
writer.write(json.dumps(scores_diff_json, indent=4) + "\n") writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
with open(orig_data_file, "r", encoding='utf-8') as reader: return all_predictions
orig_data = json.load(reader)["data"]
qid_to_has_ans = make_qid_to_has_ans(orig_data)
has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]
exact_raw, f1_raw = get_raw_scores(orig_data, all_predictions)
out_eval = {}
find_all_best_thresh_v2(out_eval, all_predictions, exact_raw, f1_raw, scores_diff_json, qid_to_has_ans)
return out_eval
def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
"""Project the tokenized prediction back to the original text."""
# When we created the data, we kept track of the alignment between original
# (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
# now `orig_text` contains the span of our original text corresponding to the
# span that we predicted.
#
# However, `orig_text` may contain extra characters that we don't want in
# our prediction.
#
# For example, let's say:
# pred_text = steve smith
# orig_text = Steve Smith's
#
# We don't want to return `orig_text` because it contains the extra "'s".
#
# We don't want to return `pred_text` because it's already been normalized
# (the SQuAD eval script also does punctuation stripping/lower casing but
# our tokenizer does additional normalization like stripping accent
# characters).
#
# What we really want to return is "Steve Smith".
#
# Therefore, we have to apply a semi-complicated alignment heuristic between
# `pred_text` and `orig_text` to get a character-to-character alignment. This
# can fail in certain cases in which case we just return `orig_text`.
def _strip_spaces(text):
ns_chars = []
ns_to_s_map = collections.OrderedDict()
for (i, c) in enumerate(text):
if c == " ":
continue
ns_to_s_map[len(ns_chars)] = i
ns_chars.append(c)
ns_text = "".join(ns_chars)
return (ns_text, ns_to_s_map)
# We first tokenize `orig_text`, strip whitespace from the result
# and `pred_text`, and check if they are the same length. If they are
# NOT the same length, the heuristic has failed. If they are the same
# length, we assume the characters are one-to-one aligned.
tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
tok_text = " ".join(tokenizer.tokenize(orig_text))
start_position = tok_text.find(pred_text)
if start_position == -1:
if verbose_logging:
logger.info(
"Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
return orig_text
end_position = start_position + len(pred_text) - 1
(orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
(tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
if len(orig_ns_text) != len(tok_ns_text):
if verbose_logging:
logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
orig_ns_text, tok_ns_text)
return orig_text
# We then project the characters in `pred_text` back to `orig_text` using
# the character-to-character alignment.
tok_s_to_ns_map = {}
for (i, tok_index) in tok_ns_to_s_map.items():
tok_s_to_ns_map[tok_index] = i
orig_start_position = None
if start_position in tok_s_to_ns_map:
ns_start_position = tok_s_to_ns_map[start_position]
if ns_start_position in orig_ns_to_s_map:
orig_start_position = orig_ns_to_s_map[ns_start_position]
if orig_start_position is None:
if verbose_logging:
logger.info("Couldn't map start position")
return orig_text
orig_end_position = None
if end_position in tok_s_to_ns_map:
ns_end_position = tok_s_to_ns_map[end_position]
if ns_end_position in orig_ns_to_s_map:
orig_end_position = orig_ns_to_s_map[ns_end_position]
if orig_end_position is None:
if verbose_logging:
logger.info("Couldn't map end position")
return orig_text
output_text = orig_text[orig_start_position:(orig_end_position + 1)]
return output_text
def _get_best_indexes(logits, n_best_size):
"""Get the n-best logits from a list."""
index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)
best_indexes = []
for i in range(len(index_and_score)):
if i >= n_best_size:
break
best_indexes.append(index_and_score[i][0])
return best_indexes
def _compute_softmax(scores):
"""Compute softmax probability over raw logits."""
if not scores:
return []
max_score = None
for score in scores:
if max_score is None or score > max_score:
max_score = score
exp_scores = []
total_sum = 0.0
for score in scores:
x = math.exp(score - max_score)
exp_scores.append(x)
total_sum += x
probs = []
for score in exp_scores:
probs.append(score / total_sum)
return probs
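# Illustrative sketch (not part of the original file): how the two helpers above combine
# to rank candidate spans. The logits are made-up values.
def _example_rank_logits():
    logits = [0.1, 2.3, -1.0, 4.2]
    best = _get_best_indexes(logits, n_best_size=2)        # -> [3, 1]
    probs = _compute_softmax([logits[i] for i in best])    # probabilities summing to 1.0
    return best, probs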
from .utils import InputExample, InputFeatures, DataProcessor, SingleSentenceClassificationProcessor
from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
from .squad import squad_convert_examples_to_features, SquadFeatures, SquadExample, SquadV1Processor, SquadV2Processor
from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels
\ No newline at end of file
...@@ -133,7 +133,7 @@ def glue_convert_examples_to_features(examples, tokenizer,
if is_tf_available() and is_tf_dataset:
def gen():
for ex in features:
yield ({'input_ids': ex.input_ids,
'attention_mask': ex.attention_mask,
'token_type_ids': ex.token_type_ids},
ex.label)
...
from tqdm import tqdm
import collections
import logging
import os
import json
import numpy as np
from multiprocessing import Pool
from multiprocessing import cpu_count
from functools import partial
from ...tokenization_bert import BasicTokenizer, whitespace_tokenize
from .utils import DataProcessor, InputExample, InputFeatures
from ...file_utils import is_tf_available, is_torch_available
if is_torch_available():
import torch
from torch.utils.data import TensorDataset
if is_tf_available():
import tensorflow as tf
logger = logging.getLogger(__name__)
def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text):
"""Returns tokenized answer spans that better match the annotated answer."""
tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
for new_start in range(input_start, input_end + 1):
for new_end in range(input_end, new_start - 1, -1):
text_span = " ".join(doc_tokens[new_start : (new_end + 1)])
if text_span == tok_answer_text:
return (new_start, new_end)
return (input_start, input_end)
def _check_is_max_context(doc_spans, cur_span_index, position):
"""Check if this is the 'max context' doc span for the token."""
best_score = None
best_span_index = None
for (span_index, doc_span) in enumerate(doc_spans):
end = doc_span.start + doc_span.length - 1
if position < doc_span.start:
continue
if position > end:
continue
num_left_context = position - doc_span.start
num_right_context = end - position
score = min(num_left_context, num_right_context) + 0.01 * doc_span.length
if best_score is None or score > best_score:
best_score = score
best_span_index = span_index
return cur_span_index == best_span_index
def _new_check_is_max_context(doc_spans, cur_span_index, position):
"""Check if this is the 'max context' doc span for the token."""
# if len(doc_spans) == 1:
# return True
best_score = None
best_span_index = None
for (span_index, doc_span) in enumerate(doc_spans):
end = doc_span["start"] + doc_span["length"] - 1
if position < doc_span["start"]:
continue
if position > end:
continue
num_left_context = position - doc_span["start"]
num_right_context = end - position
score = min(num_left_context, num_right_context) + 0.01 * doc_span["length"]
if best_score is None or score > best_score:
best_score = score
best_span_index = span_index
return cur_span_index == best_span_index
def _is_whitespace(c):
if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
return True
return False
def squad_convert_example_to_features(example, max_seq_length,
doc_stride, max_query_length, is_training):
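# Note: the tokenizer is not passed as an argument; this function relies on the
# module-level `tokenizer` set by squad_convert_example_to_features_init, which is
# used as the initializer of the multiprocessing Pool further below.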
features = []
if is_training and not example.is_impossible:
# Get start and end position
start_position = example.start_position
end_position = example.end_position
# If the answer cannot be found in the text, then skip this example.
actual_text = " ".join(example.doc_tokens[start_position:(end_position + 1)])
cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text))
if actual_text.find(cleaned_answer_text) == -1:
logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text)
return []
tok_to_orig_index = []
orig_to_tok_index = []
all_doc_tokens = []
for (i, token) in enumerate(example.doc_tokens):
orig_to_tok_index.append(len(all_doc_tokens))
sub_tokens = tokenizer.tokenize(token)
for sub_token in sub_tokens:
tok_to_orig_index.append(i)
all_doc_tokens.append(sub_token)
if is_training and not example.is_impossible:
tok_start_position = orig_to_tok_index[example.start_position]
if example.end_position < len(example.doc_tokens) - 1:
tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
else:
tok_end_position = len(all_doc_tokens) - 1
(tok_start_position, tok_end_position) = _improve_answer_span(
all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text
)
spans = []
truncated_query = tokenizer.encode(example.question_text, add_special_tokens=False, max_length=max_query_length)
sequence_added_tokens = tokenizer.max_len - tokenizer.max_len_single_sentence + 1 \
if 'roberta' in str(type(tokenizer)) else tokenizer.max_len - tokenizer.max_len_single_sentence
sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair
span_doc_tokens = all_doc_tokens
while len(spans) * doc_stride < len(all_doc_tokens):
encoded_dict = tokenizer.encode_plus(
truncated_query if tokenizer.padding_side == "right" else span_doc_tokens,
span_doc_tokens if tokenizer.padding_side == "right" else truncated_query,
max_length=max_seq_length,
return_overflowing_tokens=True,
pad_to_max_length=True,
stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,
truncation_strategy='only_second' if tokenizer.padding_side == "right" else 'only_first'
)
paragraph_len = min(len(all_doc_tokens) - len(spans) * doc_stride,
max_seq_length - len(truncated_query) - sequence_pair_added_tokens)
if tokenizer.pad_token_id in encoded_dict['input_ids']:
non_padded_ids = encoded_dict['input_ids'][:encoded_dict['input_ids'].index(tokenizer.pad_token_id)]
else:
non_padded_ids = encoded_dict['input_ids']
tokens = tokenizer.convert_ids_to_tokens(non_padded_ids)
token_to_orig_map = {}
for i in range(paragraph_len):
index = len(truncated_query) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i
token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i]
encoded_dict["paragraph_len"] = paragraph_len
encoded_dict["tokens"] = tokens
encoded_dict["token_to_orig_map"] = token_to_orig_map
encoded_dict["truncated_query_with_special_tokens_length"] = len(truncated_query) + sequence_added_tokens
encoded_dict["token_is_max_context"] = {}
encoded_dict["start"] = len(spans) * doc_stride
encoded_dict["length"] = paragraph_len
spans.append(encoded_dict)
if "overflowing_tokens" not in encoded_dict:
break
span_doc_tokens = encoded_dict["overflowing_tokens"]
for doc_span_index in range(len(spans)):
for j in range(spans[doc_span_index]["paragraph_len"]):
is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j)
index = j if tokenizer.padding_side == "left" else spans[doc_span_index][
"truncated_query_with_special_tokens_length"] + j
spans[doc_span_index]["token_is_max_context"][index] = is_max_context
for span in spans:
# Identify the position of the CLS token
cls_index = span['input_ids'].index(tokenizer.cls_token_id)
# p_mask: mask with 1 for tokens that cannot be in the answer (0 for tokens that can be in an answer)
# The original TF implementation also keeps the classification token (set to 0) (not sure why...)
p_mask = np.array(span['token_type_ids'])
p_mask = np.minimum(p_mask, 1)
if tokenizer.padding_side == "right":
# Limit positive values to one
p_mask = 1 - p_mask
p_mask[np.where(np.array(span["input_ids"]) == tokenizer.sep_token_id)[0]] = 1
# Set the CLS index to '0'
p_mask[cls_index] = 0
span_is_impossible = example.is_impossible
start_position = 0
end_position = 0
if is_training and not span_is_impossible:
# For training, if our document chunk does not contain an annotation
# we throw it out, since there is nothing to predict.
doc_start = span["start"]
doc_end = span["start"] + span["length"] - 1
out_of_span = False
if not (tok_start_position >= doc_start and tok_end_position <= doc_end):
out_of_span = True
if out_of_span:
start_position = cls_index
end_position = cls_index
span_is_impossible = True
else:
if tokenizer.padding_side == "left":
doc_offset = 0
else:
doc_offset = len(truncated_query) + sequence_added_tokens
start_position = tok_start_position - doc_start + doc_offset
end_position = tok_end_position - doc_start + doc_offset
features.append(SquadFeatures(
span['input_ids'],
span['attention_mask'],
span['token_type_ids'],
cls_index,
p_mask.tolist(),
example_index=0, # Cannot set unique_id and example_index here; they are set later, after the multiprocessing step.
unique_id=0,
paragraph_len=span['paragraph_len'],
token_is_max_context=span["token_is_max_context"],
tokens=span["tokens"],
token_to_orig_map=span["token_to_orig_map"],
start_position=start_position,
end_position=end_position
))
return features
def squad_convert_example_to_features_init(tokenizer_for_convert):
global tokenizer
tokenizer = tokenizer_for_convert
def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
doc_stride, max_query_length, is_training,
return_dataset=False, threads=1):
"""
Converts a list of examples into a list of features that can be directly given as input to a model.
It is model-dependant and takes advantage of many of the tokenizer's features to create the model's inputs.
Args:
examples: list of :class:`~transformers.data.processors.squad.SquadExample`
tokenizer: an instance of a child of :class:`~transformers.PreTrainedTokenizer`
max_seq_length: The maximum sequence length of the inputs.
doc_stride: The stride used when the context is too large and is split across several features.
max_query_length: The maximum length of the query.
is_training: whether the features are created for model training (True) or for evaluation (False).
return_dataset: Default False. Either 'pt' or 'tf'.
if 'pt': returns a torch.data.TensorDataset,
if 'tf': returns a tf.data.Dataset
threads: the number of processes to use for multiprocessing.
Returns:
list of :class:`~transformers.data.processors.squad.SquadFeatures`
Example::
processor = SquadV2Processor()
examples = processor.get_dev_examples(data_dir)
features = squad_convert_examples_to_features(
examples=examples,
tokenizer=tokenizer,
max_seq_length=args.max_seq_length,
doc_stride=args.doc_stride,
max_query_length=args.max_query_length,
is_training=not evaluate,
)
"""
# Defining helper methods
features = []
threads = min(threads, cpu_count())
with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p:
annotate_ = partial(squad_convert_example_to_features, max_seq_length=max_seq_length,
doc_stride=doc_stride, max_query_length=max_query_length, is_training=is_training)
features = list(tqdm(p.imap(annotate_, examples, chunksize=32), total=len(examples), desc='convert squad examples to features'))
new_features = []
unique_id = 1000000000
example_index = 0
for example_features in tqdm(features, total=len(features), desc='add example index and unique id'):
if not example_features:
continue
for example_feature in example_features:
example_feature.example_index = example_index
example_feature.unique_id = unique_id
new_features.append(example_feature)
unique_id += 1
example_index += 1
features = new_features
del new_features
if return_dataset == 'pt':
if not is_torch_available():
raise ImportError("Pytorch must be installed to return a pytorch dataset.")
# Convert to Tensors and build dataset
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_attention_masks = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
if not is_training:
all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
dataset = TensorDataset(
all_input_ids, all_attention_masks, all_token_type_ids, all_example_index, all_cls_index, all_p_mask
)
else:
all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
dataset = TensorDataset(
all_input_ids,
all_attention_masks,
all_token_type_ids,
all_start_positions,
all_end_positions,
all_cls_index,
all_p_mask,
)
return features, dataset
elif return_dataset == "tf":
if not is_tf_available():
raise ImportError("TensorFlow must be installed to return a TensorFlow dataset.")
def gen():
for ex in features:
yield (
{
"input_ids": ex.input_ids,
"attention_mask": ex.attention_mask,
"token_type_ids": ex.token_type_ids,
}, {
"start_position": ex.start_position,
"end_position": ex.end_position,
"cls_index": ex.cls_index,
"p_mask": ex.p_mask,
}
)
return tf.data.Dataset.from_generator(
gen,
(
{"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32},
{"start_position": tf.int64, "end_position": tf.int64, "cls_index": tf.int64, "p_mask": tf.int32},
),
(
{
"input_ids": tf.TensorShape([None]),
"attention_mask": tf.TensorShape([None]),
"token_type_ids": tf.TensorShape([None]),
},
{
"start_position": tf.TensorShape([]),
"end_position": tf.TensorShape([]),
"cls_index": tf.TensorShape([]),
"p_mask": tf.TensorShape([None]),
},
),
)
return features
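# Illustrative sketch (not part of the original file): converting examples and wrapping
# the resulting TensorDataset in a DataLoader. The sequence/stride lengths are arbitrary
# example values; `examples` and `tokenizer` are assumed to come from a SquadProcessor
# and a pretrained tokenizer as in the docstring above.
def _example_build_pt_dataset(examples, tokenizer):
    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=384,
        doc_stride=128,
        max_query_length=64,
        is_training=True,
        return_dataset='pt',
    )
    return torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=True)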
class SquadProcessor(DataProcessor):
"""
Processor for the SQuAD data set.
Overridden by SquadV1Processor and SquadV2Processor, used by SQuAD versions 1.1 and 2.0 respectively.
"""
train_file = None
dev_file = None
def _get_example_from_tensor_dict(self, tensor_dict, evaluate=False):
if not evaluate:
answer = tensor_dict["answers"]["text"][0].numpy().decode("utf-8")
answer_start = tensor_dict["answers"]["answer_start"][0].numpy()
answers = []
else:
answers = [
{"answer_start": start.numpy(), "text": text.numpy().decode("utf-8")}
for start, text in zip(tensor_dict["answers"]["answer_start"], tensor_dict["answers"]["text"])
]
answer = None
answer_start = None
return SquadExample(
qas_id=tensor_dict["id"].numpy().decode("utf-8"),
question_text=tensor_dict["question"].numpy().decode("utf-8"),
context_text=tensor_dict["context"].numpy().decode("utf-8"),
answer_text=answer,
start_position_character=answer_start,
title=tensor_dict["title"].numpy().decode("utf-8"),
answers=answers,
)
def get_examples_from_dataset(self, dataset, evaluate=False):
"""
Creates a list of :class:`~transformers.data.processors.squad.SquadExample` using a TFDS dataset.
Args:
dataset: The tfds dataset loaded from `tensorflow_datasets.load("squad")`
evaluate: boolean specifying if in evaluation mode or in training mode
Returns:
List of SquadExample
Examples::
import tensorflow_datasets as tfds
dataset = tfds.load("squad")
training_examples = get_examples_from_dataset(dataset, evaluate=False)
evaluation_examples = get_examples_from_dataset(dataset, evaluate=True)
"""
if evaluate:
dataset = dataset["validation"]
else:
dataset = dataset["train"]
examples = []
for tensor_dict in tqdm(dataset):
examples.append(self._get_example_from_tensor_dict(tensor_dict, evaluate=evaluate))
return examples
def get_train_examples(self, data_dir, filename=None):
"""
Returns the training examples from the data directory.
Args:
data_dir: Directory containing the data files used for training and evaluating.
filename: None by default, specify this if the training file has a different name than the original one
which is `train-v1.1.json` and `train-v2.0.json` for squad versions 1.1 and 2.0 respectively.
"""
if data_dir is None:
data_dir = ""
if self.train_file is None:
raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")
with open(
os.path.join(data_dir, self.train_file if filename is None else filename), "r", encoding="utf-8"
) as reader:
input_data = json.load(reader)["data"]
return self._create_examples(input_data, "train")
def get_dev_examples(self, data_dir, filename=None):
"""
Returns the evaluation examples from the data directory.
Args:
data_dir: Directory containing the data files used for training and evaluating.
filename: None by default, specify this if the evaluation file has a different name than the original one,
which is `dev-v1.1.json` and `dev-v2.0.json` for squad versions 1.1 and 2.0 respectively.
"""
if data_dir is None:
data_dir = ""
if self.dev_file is None:
raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")
with open(
os.path.join(data_dir, self.dev_file if filename is None else filename), "r", encoding="utf-8"
) as reader:
input_data = json.load(reader)["data"]
return self._create_examples(input_data, "dev")
def _create_examples(self, input_data, set_type):
is_training = set_type == "train"
examples = []
for entry in tqdm(input_data):
title = entry["title"]
for paragraph in entry["paragraphs"]:
context_text = paragraph["context"]
for qa in paragraph["qas"]:
qas_id = qa["id"]
question_text = qa["question"]
start_position_character = None
answer_text = None
answers = []
if "is_impossible" in qa:
is_impossible = qa["is_impossible"]
else:
is_impossible = False
if not is_impossible:
if is_training:
answer = qa["answers"][0]
answer_text = answer["text"]
start_position_character = answer["answer_start"]
else:
answers = qa["answers"]
example = SquadExample(
qas_id=qas_id,
question_text=question_text,
context_text=context_text,
answer_text=answer_text,
start_position_character=start_position_character,
title=title,
is_impossible=is_impossible,
answers=answers,
)
examples.append(example)
return examples
class SquadV1Processor(SquadProcessor):
train_file = "train-v1.1.json"
dev_file = "dev-v1.1.json"
class SquadV2Processor(SquadProcessor):
train_file = "train-v2.0.json"
dev_file = "dev-v2.0.json"
class SquadExample(object):
"""
A single training/test example for the Squad dataset, as loaded from disk.
Args:
qas_id: The example's unique identifier
question_text: The question string
context_text: The context string
answer_text: The answer string
start_position_character: The character position of the start of the answer
title: The title of the example
answers: None by default, this is used during evaluation. Holds answers as well as their start positions.
is_impossible: False by default, set to True if the example has no possible answer.
"""
def __init__(
self,
qas_id,
question_text,
context_text,
answer_text,
start_position_character,
title,
answers=[],
is_impossible=False,
):
self.qas_id = qas_id
self.question_text = question_text
self.context_text = context_text
self.answer_text = answer_text
self.title = title
self.is_impossible = is_impossible
self.answers = answers
self.start_position, self.end_position = 0, 0
doc_tokens = []
char_to_word_offset = []
prev_is_whitespace = True
# Split on whitespace so that different tokens may be attributed to their original position.
for c in self.context_text:
if _is_whitespace(c):
prev_is_whitespace = True
else:
if prev_is_whitespace:
doc_tokens.append(c)
else:
doc_tokens[-1] += c
prev_is_whitespace = False
char_to_word_offset.append(len(doc_tokens) - 1)
self.doc_tokens = doc_tokens
self.char_to_word_offset = char_to_word_offset
# Start and end positions are only populated when an answer span is provided and the question is answerable; otherwise they stay at 0.
if start_position_character is not None and not is_impossible:
self.start_position = char_to_word_offset[start_position_character]
self.end_position = char_to_word_offset[
min(start_position_character + len(answer_text) - 1, len(char_to_word_offset) - 1)
]
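# Illustrative sketch (not part of the original file): constructing a SquadExample by
# hand. The values are made up; real examples normally come from a SquadProcessor.
def _example_squad_example():
    example = SquadExample(
        qas_id="example-0",
        question_text="Where does the cat sit?",
        context_text="The cat sits on the mat.",
        answer_text="on the mat",
        start_position_character=13,
        title="Cats",
    )
    # doc_tokens is the whitespace-split context; start/end positions index into it.
    return example.doc_tokens, example.start_position, example.end_position  # (..., 3, 5)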
class SquadFeatures(object):
"""
Single squad example features to be fed to a model.
Those features are model-specific and can be crafted from :class:`~transformers.data.processors.squad.SquadExample`
using the :method:`~transformers.data.processors.squad.squad_convert_examples_to_features` method.
Args:
input_ids: Indices of input sequence tokens in the vocabulary.
attention_mask: Mask to avoid performing attention on padding token indices.
token_type_ids: Segment token indices to indicate first and second portions of the inputs.
cls_index: the index of the CLS token.
p_mask: Mask identifying tokens that can be answers vs. tokens that cannot.
Mask with 1 for tokens that cannot be in the answer and 0 for tokens that can be in an answer.
example_index: the index of the example
unique_id: The unique Feature identifier
paragraph_len: The length of the context
token_is_max_context: List of booleans identifying which tokens have their maximum context in this feature object.
If a token does not have their maximum context in this feature object, it means that another feature object
has more information related to that token and should be prioritized over this feature for that token.
tokens: list of tokens corresponding to the input ids
token_to_orig_map: mapping between the tokens and the original text, needed in order to identify the answer.
start_position: start of the answer token index
end_position: end of the answer token index
"""
def __init__(
self,
input_ids,
attention_mask,
token_type_ids,
cls_index,
p_mask,
example_index,
unique_id,
paragraph_len,
token_is_max_context,
tokens,
token_to_orig_map,
start_position,
end_position,
):
self.input_ids = input_ids
self.attention_mask = attention_mask
self.token_type_ids = token_type_ids
self.cls_index = cls_index
self.p_mask = p_mask
self.example_index = example_index
self.unique_id = unique_id
self.paragraph_len = paragraph_len
self.token_is_max_context = token_is_max_context
self.tokens = tokens
self.token_to_orig_map = token_to_orig_map
self.start_position = start_position
self.end_position = end_position
class SquadResult(object):
"""
Constructs a SquadResult which can be used to evaluate a model's output on the SQuAD dataset.
Args:
unique_id: The unique identifier corresponding to that example.
start_logits: The logits corresponding to the start of the answer
end_logits: The logits corresponding to the end of the answer
"""
def __init__(self, unique_id, start_logits, end_logits, start_top_index=None, end_top_index=None, cls_logits=None):
self.start_logits = start_logits
self.end_logits = end_logits
self.unique_id = unique_id
if start_top_index:
self.start_top_index = start_top_index
self.end_top_index = end_top_index
self.cls_logits = cls_logits
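# Illustrative sketch (not part of the original file): wrapping a model's raw outputs
# into SquadResult objects, one per feature, keyed by the feature's unique_id. The
# start/end logits are assumed to be per-token lists produced by a QA model.
def _example_squad_results(unique_ids, batch_start_logits, batch_end_logits):
    return [SquadResult(uid, start, end)
            for uid, start, end in zip(unique_ids, batch_start_logits, batch_end_logits)]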