Unverified Commit 0c9c8472 authored by Susnato Dhar's avatar Susnato Dhar Committed by GitHub
Browse files

Add Ernie-M Model to huggingface (#21349)

* config and tokenization(fast too) changed and ErnieEncoder added

* Slow Tokenization Added

* Tokenizer(slow) is now working and Fast Tokenizer removed

* Added Config code

* Added Base Model and utils

* ErnieMModel is now working

* All added except tests

* All tests passed except ErnieUIEM

* All tests passed

* all fixes done

* all fixes done

* fixed MAP

* fixed check_code_quality

* fixed Build PR Documentation issue

* Added changes(comments) and also updated to the latest upstream/main

* Added fixup

* Added # Copied comments

* Added fixup

* Added more comments and some nits

* Added fixup

* Fixed README_hd.md

* Added more fixes

* ErnieMTokenizer (being sentencepiece) protected and other docs edited

* Added code_quality fix

* Fixed for

* Added more fix

* modified AZ

* ernie-m tokenization test added!

* attention mask part fixed(with 0->self.config.pad_token_id)

* applied make fixup
parent 40ca1336
# coding=utf-8
# Copyright 2023 Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" ErnieM model configuration"""
# Adapted from original paddlenlp repository.(https://github.com/PaddlePaddle/PaddleNLP/blob/develop/paddlenlp/transformers/ernie_m/configuration.py)
from __future__ import annotations
from typing import Dict
from ...configuration_utils import PretrainedConfig
ERNIE_M_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"susnato/ernie-m-base_pytorch": "https://huggingface.co/susnato/ernie-m-base_pytorch/blob/main/config.json",
"susnato/ernie-m-large_pytorch": "https://huggingface.co/susnato/ernie-m-large_pytorch/blob/main/config.json",
}
class ErnieMConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`ErnieMModel`]. It is used to instantiate a
Ernie-M model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the `Ernie-M`
[susnato/ernie-m-base_pytorch](https://huggingface.co/susnato/ernie-m-base_pytorch) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 250002):
Vocabulary size of `inputs_ids` in [`ErnieMModel`]. Also is the vocab size of token embedding matrix.
Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling
[`ErnieMModel`].
hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the embedding layer, encoder layers and pooler layer.
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the feed-forward (ff) layer in the encoder. Input tensors to feed-forward layers are
firstly projected from hidden_size to intermediate_size, and then projected back to hidden_size. Typically
intermediate_size is larger than hidden_size.
hidden_act (`str`, *optional*, defaults to `"gelu"`):
The non-linear activation function in the feed-forward layer. `"gelu"`, `"relu"` and any other torch
supported activation functions are supported.
hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings and encoder.
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probability used in `MultiHeadAttention` in all encoder layers to drop some attention target.
act_dropout (`float`, *optional*, defaults to 0.0):
This dropout probability is used in `ErnieMEncoderLayer` after activation.
max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum value of the dimensionality of position encoding, which dictates the maximum supported length
of an input sequence.
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
The epsilon used by the layer normalization layers.
classifier_dropout (`float`, *optional*):
The dropout ratio for the classification head.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the normal initializer for initializing all weight matrices.
pad_token_id(`int`, *optional*, defaults to 1):
The index of padding token in the token vocabulary.
A normal_initializer initializes weight matrices as normal distributions. See
`ErnieMPretrainedModel._init_weights()` for how weights are initialized in `ErnieMModel`.
"""
model_type = "ernie_m"
attribute_map: Dict[str, str] = {"dropout": "classifier_dropout", "num_classes": "num_labels"}
def __init__(
self,
vocab_size: int = 250002,
hidden_size: int = 768,
num_hidden_layers: int = 12,
num_attention_heads: int = 12,
intermediate_size: int = 3072,
hidden_act: str = "gelu",
hidden_dropout_prob: float = 0.1,
attention_probs_dropout_prob: float = 0.1,
max_position_embeddings: int = 514,
initializer_range: float = 0.02,
pad_token_id: int = 1,
layer_norm_eps: float = 1e-05,
classifier_dropout=None,
is_decoder=False,
act_dropout=0.0,
**kwargs,
):
super().__init__(pad_token_id=pad_token_id, **kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.classifier_dropout = classifier_dropout
self.is_decoder = is_decoder
self.act_dropout = act_dropout
# coding=utf-8
# Copyright 2023 Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch ErnieM model."""
import math
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn, tensor
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
BaseModelOutputWithPoolingAndCrossAttentions,
MultipleChoiceModelOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_ernie_m import ErnieMConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "susnato/ernie-m-base_pytorch"
_CONFIG_FOR_DOC = "ErnieMConfig"
_TOKENIZER_FOR_DOC = "ErnieMTokenizer"
ERNIE_M_PRETRAINED_MODEL_ARCHIVE_LIST = [
"susnato/ernie-m-base_pytorch",
"susnato/ernie-m-large_pytorch",
# See all ErnieM models at https://huggingface.co/models?filter=ernie_m
]
# Adapted from paddlenlp.transformers.ernie_m.modeling.ErnieEmbeddings
class ErnieMEmbeddings(nn.Module):
"""Construct the embeddings from word and position embeddings."""
def __init__(self, config):
super().__init__()
self.hidden_size = config.hidden_size
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
self.position_embeddings = nn.Embedding(
config.max_position_embeddings, config.hidden_size, padding_idx=config.pad_token_id
)
self.layer_norm = nn.LayerNorm(normalized_shape=config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(p=config.hidden_dropout_prob)
self.padding_idx = config.pad_token_id
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.LongTensor] = None,
past_key_values_length: int = 0,
) -> torch.Tensor:
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
if position_ids is None:
input_shape = inputs_embeds.size()[:-1]
ones = torch.ones(input_shape, dtype=torch.int64)
seq_length = torch.cumsum(ones, dim=1)
position_ids = seq_length - ones
if past_key_values_length > 0:
position_ids = position_ids + past_key_values_length
# to mimic paddlenlp implementation
position_ids += 2
position_ids = position_ids.to(next(self.position_embeddings.parameters()).device)
position_embeddings = self.position_embeddings(position_ids)
embeddings = inputs_embeds + position_embeddings
embeddings = self.layer_norm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->ErnieM,self.value->self.v_proj,self.key->self.k_proj,self.query->self.q_proj
class ErnieMSelfAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.q_proj = nn.Linear(config.hidden_size, self.all_head_size)
self.k_proj = nn.Linear(config.hidden_size, self.all_head_size)
self.v_proj = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.position_embedding_type = position_embedding_type or getattr(
config, "position_embedding_type", "absolute"
)
if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
self.max_position_embeddings = config.max_position_embeddings
self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
self.is_decoder = config.is_decoder
def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(new_x_shape)
return x.permute(0, 2, 1, 3)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
mixed_query_layer = self.q_proj(hidden_states)
# If this is instantiated as a cross-attention module, the keys
# and values come from an encoder; the attention mask needs to be
# such that the encoder's padding tokens are not attended to.
is_cross_attention = encoder_hidden_states is not None
if is_cross_attention and past_key_value is not None:
# reuse k,v, cross_attentions
key_layer = past_key_value[0]
value_layer = past_key_value[1]
attention_mask = encoder_attention_mask
elif is_cross_attention:
key_layer = self.transpose_for_scores(self.k_proj(encoder_hidden_states))
value_layer = self.transpose_for_scores(self.v_proj(encoder_hidden_states))
attention_mask = encoder_attention_mask
elif past_key_value is not None:
key_layer = self.transpose_for_scores(self.k_proj(hidden_states))
value_layer = self.transpose_for_scores(self.v_proj(hidden_states))
key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
else:
key_layer = self.transpose_for_scores(self.k_proj(hidden_states))
value_layer = self.transpose_for_scores(self.v_proj(hidden_states))
query_layer = self.transpose_for_scores(mixed_query_layer)
use_cache = past_key_value is not None
if self.is_decoder:
# if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
# Further calls to cross_attention layer can then reuse all cross-attention
# key/value_states (first "if" case)
# if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
# all previous decoder key/value_states. Further calls to uni-directional self-attention
# can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
# if encoder bi-directional self-attention `past_key_value` is always `None`
past_key_value = (key_layer, value_layer)
# Take the dot product between "query" and "key" to get the raw attention scores.
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
query_length, key_length = query_layer.shape[2], key_layer.shape[2]
if use_cache:
position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view(
-1, 1
)
else:
position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
distance = position_ids_l - position_ids_r
positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility
if self.position_embedding_type == "relative_key":
relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
attention_scores = attention_scores + relative_position_scores
elif self.position_embedding_type == "relative_key_query":
relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
if attention_mask is not None:
# Apply the attention mask is (precomputed for all layers in ErnieMModel forward() function)
attention_scores = attention_scores + attention_mask
# Normalize the attention scores to probabilities.
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs = self.dropout(attention_probs)
# Mask heads if we want to
if head_mask is not None:
attention_probs = attention_probs * head_mask
context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(new_context_layer_shape)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
if self.is_decoder:
outputs = outputs + (past_key_value,)
return outputs
class ErnieMAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
self.self_attn = ErnieMSelfAttention(config, position_embedding_type=position_embedding_type)
self.out_proj = nn.Linear(config.hidden_size, config.hidden_size)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self_attn.num_attention_heads, self.self_attn.attention_head_size, self.pruned_heads
)
# Prune linear layers
self.self_attn.q_proj = prune_linear_layer(self.self_attn.q_proj, index)
self.self_attn.k_proj = prune_linear_layer(self.self_attn.k_proj, index)
self.self_attn.v_proj = prune_linear_layer(self.self_attn.v_proj, index)
self.out_proj = prune_linear_layer(self.out_proj, index, dim=1)
# Update hyper params and store pruned heads
self.self_attn.num_attention_heads = self.self_attn.num_attention_heads - len(heads)
self.self_attn.all_head_size = self.self_attn.attention_head_size * self.self_attn.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
self_outputs = self.self_attn(
hidden_states,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
attention_output = self.out_proj(self_outputs[0])
outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
return outputs
class ErnieMEncoderLayer(nn.Module):
def __init__(self, config):
super().__init__()
# to mimic paddlenlp implementation
dropout = 0.1 if config.hidden_dropout_prob is None else config.hidden_dropout_prob
act_dropout = config.hidden_dropout_prob if config.act_dropout is None else config.act_dropout
self.self_attn = ErnieMAttention(config)
self.linear1 = nn.Linear(config.hidden_size, config.intermediate_size)
self.dropout = nn.Dropout(act_dropout)
self.linear2 = nn.Linear(config.intermediate_size, config.hidden_size)
self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout1 = nn.Dropout(dropout)
self.dropout2 = nn.Dropout(dropout)
if isinstance(config.hidden_act, str):
self.activation = ACT2FN[config.hidden_act]
else:
self.activation = config.hidden_act
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = True,
):
residual = hidden_states
if output_attentions:
hidden_states, attention_opt_weights = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
head_mask=head_mask,
past_key_value=past_key_value,
output_attentions=output_attentions,
)
else:
hidden_states = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
head_mask=head_mask,
past_key_value=past_key_value,
output_attentions=output_attentions,
)
hidden_states = residual + self.dropout1(hidden_states)
hidden_states = self.norm1(hidden_states)
residual = hidden_states
hidden_states = self.linear1(hidden_states)
hidden_states = self.activation(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.linear2(hidden_states)
hidden_states = residual + self.dropout2(hidden_states)
hidden_states = self.norm2(hidden_states)
if output_attentions:
return hidden_states, attention_opt_weights
else:
return hidden_states
class ErnieMEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.layers = nn.ModuleList([ErnieMEncoderLayer(config) for _ in range(config.num_hidden_layers)])
def forward(
self,
input_embeds: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True,
) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
hidden_states = () if output_hidden_states else None
attentions = () if output_attentions else None
output = input_embeds
if output_hidden_states:
hidden_states = hidden_states + (output,)
for i, layer in enumerate(self.layers):
layer_head_mask = head_mask[i] if head_mask is not None else None
past_key_value = past_key_values[i] if past_key_values is not None else None
output, opt_attn_weights = layer(
hidden_states=output,
attention_mask=attention_mask,
head_mask=layer_head_mask,
past_key_value=past_key_value,
)
if output_hidden_states:
hidden_states = hidden_states + (output,)
if output_attentions:
attentions = attentions + (opt_attn_weights,)
last_hidden_state = output
if not return_dict:
return tuple(v for v in [last_hidden_state, hidden_states, attentions] if v is not None)
return BaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=last_hidden_state, hidden_states=hidden_states, attentions=attentions
)
# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->ErnieM
class ErnieMPooler(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
# We "pool" the model by simply taking the hidden state corresponding
# to the first token.
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output
class ErnieMPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = ErnieMConfig
base_model_prefix = "ernie_m"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = [r"position_ids"]
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, nn.Linear):
# Slightly different from the TF version which uses truncated_normal for initialization
# cf https://github.com/pytorch/pytorch/pull/5617
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
def _set_gradient_checkpointing(self, module, value=False):
if isinstance(module, ErnieMEncoder):
module.gradient_checkpointing = value
ERNIE_M_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`ErnieMConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
ERNIE_M_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`ErnieMTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
model's internal embedding lookup matrix.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare ErnieM Model transformer outputting raw hidden-states without any specific head on top.",
ERNIE_M_START_DOCSTRING,
)
class ErnieMModel(ErnieMPreTrainedModel):
def __init__(self, config, add_pooling_layer=True):
super(ErnieMModel, self).__init__(config)
self.initializer_range = config.initializer_range
self.embeddings = ErnieMEmbeddings(config)
self.encoder = ErnieMEncoder(config)
self.pooler = ErnieMPooler(config) if add_pooling_layer else None
self.post_init()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layers[layer].self_attn.prune_heads(heads)
@add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
processor_class=_TOKENIZER_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPastAndCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[tensor] = None,
position_ids: Optional[tensor] = None,
attention_mask: Optional[tensor] = None,
head_mask: Optional[tensor] = None,
inputs_embeds: Optional[tensor] = None,
past_key_values: Optional[Tuple[Tuple[tensor]]] = None,
use_cache: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
output_attentions: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time.")
# init the default bool value
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.return_dict
head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
past_key_values_length = 0
if past_key_values is not None:
past_key_values_length = past_key_values[0][0].shape[2]
# Adapted from paddlenlp.transformers.ernie_m.ErnieMModel
if attention_mask is None:
attention_mask = (input_ids == self.config.pad_token_id).to(torch.float32)
attention_mask *= torch.finfo(attention_mask.dtype).min
if past_key_values is not None:
batch_size = past_key_values[0][0].shape[0]
past_mask = torch.zeros([batch_size, 1, 1, past_key_values_length], dtype=attention_mask.dtype)
attention_mask = torch.concat([past_mask, attention_mask], dim=-1)
# For 2D attention_mask from tokenizer
elif attention_mask.ndim == 2:
attention_mask = attention_mask.to(torch.float32)
attention_mask = 1.0 - attention_mask
attention_mask *= torch.finfo(attention_mask.dtype).min
extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(1)
embedding_output = self.embeddings(
input_ids=input_ids,
position_ids=position_ids,
inputs_embeds=inputs_embeds,
past_key_values_length=past_key_values_length,
)
encoder_outputs = self.encoder(
embedding_output,
attention_mask=extended_attention_mask,
head_mask=head_mask,
past_key_values=past_key_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
if not return_dict:
sequence_output = encoder_outputs[0]
pooler_output = self.pooler(sequence_output) if self.pooler is not None else None
return (sequence_output, pooler_output) + encoder_outputs[1:]
sequence_output = encoder_outputs["last_hidden_state"]
pooler_output = self.pooler(sequence_output) if self.pooler is not None else None
hidden_states = None if not output_hidden_states else encoder_outputs["hidden_states"]
attentions = None if not output_attentions else encoder_outputs["attentions"]
return BaseModelOutputWithPoolingAndCrossAttentions(
last_hidden_state=sequence_output,
pooler_output=pooler_output,
hidden_states=hidden_states,
attentions=attentions,
)
@add_start_docstrings(
"""ErnieM Model transformer with a sequence classification/regression head on top (a linear layer on top of
the pooled output) e.g. for GLUE tasks.""",
ERNIE_M_START_DOCSTRING,
)
class ErnieMForSequenceClassification(ErnieMPreTrainedModel):
# Copied from transformers.models.bert.modeling_bert.BertForSequenceClassification.__init__ with Bert->ErnieM,bert->ernie_m
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.config = config
self.ernie_m = ErnieMModel(config)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
processor_class=_TOKENIZER_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.Tensor]] = None,
use_cache: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
output_attentions: Optional[bool] = None,
return_dict: Optional[bool] = True,
labels: Optional[torch.Tensor] = None,
):
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.ernie_m(
input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
past_key_values=past_key_values,
output_hidden_states=output_hidden_states,
output_attentions=output_attentions,
return_dict=return_dict,
)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""ErnieM Model with a multiple choice classification head on top (a linear layer on top of
the pooled output and a softmax) e.g. for RocStories/SWAG tasks.""",
ERNIE_M_START_DOCSTRING,
)
class ErnieMForMultipleChoice(ErnieMPreTrainedModel):
# Copied from transformers.models.bert.modeling_bert.BertForMultipleChoice.__init__ with Bert->ErnieM,bert->ernie_m
def __init__(self, config):
super().__init__(config)
self.ernie_m = ErnieMModel(config)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, 1)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = True,
):
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
`input_ids` above)
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
inputs_embeds = (
inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
if inputs_embeds is not None
else None
)
outputs = self.ernie_m(
input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
reshaped_logits = logits.view(-1, num_choices)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
if not return_dict:
output = (reshaped_logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return MultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""ErnieM Model with a token classification head on top (a linear layer on top of
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks.""",
ERNIE_M_START_DOCSTRING,
)
class ErnieMForTokenClassification(ErnieMPreTrainedModel):
# Copied from transformers.models.bert.modeling_bert.BertForTokenClassification.__init__ with Bert->ErnieM,bert->ernie_m
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.ernie_m = ErnieMModel(config, add_pooling_layer=False)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
processor_class=_TOKENIZER_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.Tensor]] = None,
output_hidden_states: Optional[bool] = None,
output_attentions: Optional[bool] = None,
return_dict: Optional[bool] = True,
labels: Optional[torch.Tensor] = None,
):
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.ernie_m(
input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
past_key_values=past_key_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""ErnieM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).""",
ERNIE_M_START_DOCSTRING,
)
class ErnieMForQuestionAnswering(ErnieMPreTrainedModel):
# Copied from transformers.models.bert.modeling_bert.BertForQuestionAnswering.__init__ with Bert->ErnieM,bert->ernie_m
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.ernie_m = ErnieMModel(config, add_pooling_layer=False)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
processor_class=_TOKENIZER_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
start_positions: Optional[torch.Tensor] = None,
end_positions: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = True,
):
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.ernie_m(
input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1).contiguous()
end_logits = end_logits.squeeze(-1).contiguous()
total_loss = None
if start_positions is not None and end_positions is not None:
# If we are on multi-GPU, split add a dimension
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
# sometimes the start/end positions are outside our model inputs, we ignore these terms
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""ErnieMForInformationExtraction is a Ernie-M Model with two linear layer on top of the hidden-states output to
compute `start_prob` and `end_prob`, designed for Universal Information Extraction.""",
ERNIE_M_START_DOCSTRING,
)
# Copied from paddlenlp.transformers.ernie_m.modeling.UIEM
class ErnieMForInformationExtraction(ErnieMPreTrainedModel):
def __init__(self, config):
super(ErnieMForInformationExtraction, self).__init__(config)
self.ernie_m = ErnieMModel(config)
self.linear_start = nn.Linear(config.hidden_size, 1)
self.linear_end = nn.Linear(config.hidden_size, 1)
self.sigmoid = nn.Sigmoid()
self.post_init()
@add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
start_positions: Optional[torch.Tensor] = None,
end_positions: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = True,
):
r"""
start_positions (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for position (index) for computing the start_positions loss. Position outside of the sequence are
not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) for computing the end_positions loss. Position outside of the sequence are not
taken into account for computing the loss.
"""
result = self.ernie_m(
input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
if return_dict:
sequence_output = result.last_hidden_state
elif not return_dict:
sequence_output = result[0]
start_logits = self.linear_start(sequence_output)
start_logits = start_logits.squeeze(-1)
end_logits = self.linear_end(sequence_output)
end_logits = end_logits.squeeze(-1)
total_loss = None
if start_positions is not None and end_positions is not None:
# If we are on multi-GPU, split add a dimension
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
# sometimes the start/end positions are outside our model inputs, we ignore these terms
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
loss_fct = BCEWithLogitsLoss()
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
if not return_dict:
return tuple(
i
for i in [total_loss, start_logits, end_logits, result.hidden_states, result.attentions]
if i is not None
)
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=result.hidden_states,
attentions=result.attentions,
)
# coding=utf-8
# Copyright 2023 Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for Ernie-M."""
import io
import os
import unicodedata
from typing import Any, Dict, List, Optional, Tuple
import sentencepiece as spm
from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging
logger = logging.get_logger(__name__)
SPIECE_UNDERLINE = "▁"
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "sentencepiece_model_ckpt": "sentencepiece.bpe.model"}
RESOURCE_FILES_NAMES = {
"sentencepiece_model_file": "sentencepiece.bpe.model",
"vocab_file": "vocab.txt",
}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"ernie-m-base": "https://huggingface.co/susnato/ernie-m-base_pytorch/blob/main/vocab.txt",
"ernie-m-large": "https://huggingface.co/susnato/ernie-m-base_pytorch/blob/main/vocab.txt",
},
"sentencepiece_model_file": {
"ernie-m-base": "https://huggingface.co/susnato/ernie-m-base_pytorch/blob/main/sentencepiece.bpe.model",
"ernie-m-large": "https://huggingface.co/susnato/ernie-m-base_pytorch/blob/main/sentencepiece.bpe.model",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"ernie-m-base": 514,
"ernie-m-large": 514,
}
PRETRAINED_INIT_CONFIGURATION = {
"ernie-m-base": {"do_lower_case": False},
"ernie-m-large": {"do_lower_case": False},
}
# Adapted from paddlenlp.transformers.ernie_m.tokenizer.ErnieMTokenizer
class ErnieMTokenizer(PreTrainedTokenizer):
r"""
Constructs a Ernie-M tokenizer. It uses the `sentencepiece` tools to cut the words to sub-words.
Args:
sentencepiece_model_file (`str`):
The file path of sentencepiece model.
vocab_file (`str`, *optional*):
The file path of the vocabulary.
do_lower_case (`str`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
unk_token (`str`, *optional*, defaults to `"[UNK]"`):
A special token representing the `unknown (out-of-vocabulary)` token. An unknown token is set to be
`unk_token` inorder to be converted to an ID.
sep_token (`str`, *optional*, defaults to `"[SEP]"`):
A special token separating two different sentences in the same input.
pad_token (`str`, *optional*, defaults to `"[PAD]"`):
A special token used to make arrays of tokens the same size for batching purposes.
cls_token (`str`, *optional*, defaults to `"[CLS]"`):
A special token used for sequence classification. It is the last token of the sequence when built with
special tokens.
mask_token (`str`, *optional*, defaults to `"[MASK]"`):
A special token representing a masked token. This is the token used in the masked language modeling task
which the model tries to predict the original unmasked ones.
"""
# Ernie-M model doesn't have token_type embedding.
model_input_names: List[str] = ["input_ids"]
vocab_files_names = VOCAB_FILES_NAMES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
resource_files_names = RESOURCE_FILES_NAMES
def __init__(
self,
sentencepiece_model_ckpt,
vocab_file=None,
do_lower_case=False,
encoding="utf8",
unk_token="[UNK]",
sep_token="[SEP]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs,
) -> None:
# Mask token behave like a normal word, i.e. include the space before it and
# is included in the raw text, there should be a match in a non-normalized sentence.
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
super().__init__(
do_lower_case=do_lower_case,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
vocab_file=vocab_file,
encoding=encoding,
sp_model_kwargs=self.sp_model_kwargs,
**kwargs,
)
self.do_lower_case = do_lower_case
self.sentencepiece_model_ckpt = sentencepiece_model_ckpt
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(sentencepiece_model_ckpt)
# to mimic paddlenlp.transformers.ernie_m.tokenizer.ErnieMTokenizer functioning
if vocab_file is not None:
self.vocab = self.load_vocab(filepath=vocab_file)
else:
self.vocab = dict((self.sp_model.id_to_piece(id), id) for id in range(self.sp_model.get_piece_size()))
self.reverse_vocab = dict((v, k) for k, v in self.vocab.items())
def get_offset_mapping(self, text):
if text is None:
return None
split_tokens = self.tokenize(text)
normalized_text, char_mapping = "", []
for i, ch in enumerate(text):
if ch in self.SP_CHAR_MAPPING:
ch = self.SP_CHAR_MAPPING.get(ch)
else:
ch = unicodedata.normalize("NFKC", ch)
if self.is_whitespace(ch):
continue
normalized_text += ch
char_mapping.extend([i] * len(ch))
text, token_mapping, offset = normalized_text, [], 0
if self.do_lower_case:
text = text.lower()
for token in split_tokens:
if token[:1] == "▁":
token = token[1:]
start = text[offset:].index(token) + offset
end = start + len(token)
token_mapping.append((char_mapping[start], char_mapping[end - 1] + 1))
offset = end
return token_mapping
@property
def vocab_size(self):
return len(self.vocab)
def get_vocab(self):
return dict(self.vocab, **self.added_tokens_encoder)
def __getstate__(self):
state = self.__dict__.copy()
state["sp_model"] = None
return state
def __setstate__(self, d):
self.__dict__ = d
# for backward compatibility
if not hasattr(self, "sp_model_kwargs"):
self.sp_model_kwargs = {}
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(self.sentencepiece_model_ckpt)
def clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
return "".join((self.SP_CHAR_MAPPING.get(c, c) for c in text))
def _tokenize(self, text, enable_sampling=False, nbest_size=64, alpha=0.1):
"""Tokenize a string."""
if self.sp_model_kwargs.get("enable_sampling") is True:
enable_sampling = True
if self.sp_model_kwargs.get("alpha") is not None:
alpha = self.sp_model_kwargs.get("alpha")
if self.sp_model_kwargs.get("nbest_size") is not None:
nbest_size = self.sp_model_kwargs.get("nbest_size")
if not enable_sampling:
pieces = self.sp_model.EncodeAsPieces(text)
else:
pieces = self.sp_model.SampleEncodeAsPieces(text, nbest_size, alpha)
new_pieces = []
for pi, piece in enumerate(pieces):
if piece == SPIECE_UNDERLINE:
if not pieces[pi + 1].startswith(SPIECE_UNDERLINE) and pi != 0:
new_pieces.append(SPIECE_UNDERLINE)
continue
else:
continue
lst_i = 0
for i, chunk in enumerate(piece):
if chunk == SPIECE_UNDERLINE:
continue
if self.is_ch_char(chunk) or self.is_punct(chunk):
if i > lst_i and piece[lst_i:i] != SPIECE_UNDERLINE:
new_pieces.append(piece[lst_i:i])
new_pieces.append(chunk)
lst_i = i + 1
elif chunk.isdigit() and i > 0 and not piece[i - 1].isdigit():
if i > lst_i and piece[lst_i:i] != SPIECE_UNDERLINE:
new_pieces.append(piece[lst_i:i])
lst_i = i
elif not chunk.isdigit() and i > 0 and piece[i - 1].isdigit():
if i > lst_i and piece[lst_i:i] != SPIECE_UNDERLINE:
new_pieces.append(piece[lst_i:i])
lst_i = i
if len(piece) > lst_i:
new_pieces.append(piece[lst_i:])
return new_pieces
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (strings for sub-words) in a single string."""
out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
return out_string
def convert_ids_to_string(self, ids):
"""
Converts a sequence of tokens (strings for sub-words) in a single string.
"""
tokens = self.convert_ids_to_tokens(ids)
out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
return out_string
# to mimic paddlenlp.transformers.ernie_m.tokenizer.ErnieMTokenizer functioning
def _convert_token_to_id(self, token):
return self.vocab.get(token, self.vocab.get(self.unk_token))
# to mimic paddlenlp.transformers.ernie_m.tokenizer.ErnieMTokenizer functioning
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.reverse_vocab.get(index, self.unk_token)
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
r"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. An ErnieM sequence has the following format:
- single sequence: `[CLS] X [SEP]`
- pair of sequences: `[CLS] A [SEP] [SEP] B [SEP]`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of input_id with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
_cls = [self.cls_token_id]
_sep = [self.sep_token_id]
return _cls + token_ids_0 + _sep + _sep + token_ids_1 + _sep
def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None):
r"""
Build offset map from a pair of offset map by concatenating and adding offsets of special tokens. An Ernie-M
offset_mapping has the following format:
- single sequence: `(0,0) X (0,0)`
- pair of sequences: `(0,0) A (0,0) (0,0) B (0,0)`
Args:
offset_mapping_ids_0 (`List[tuple]`):
List of char offsets to which the special tokens will be added.
offset_mapping_ids_1 (`List[tuple]`, *optional*):
Optional second list of wordpiece offsets for offset mapping pairs.
Returns:
`List[tuple]`: List of wordpiece offsets with the appropriate offsets of special tokens.
"""
if offset_mapping_1 is None:
return [(0, 0)] + offset_mapping_0 + [(0, 0)]
return [(0, 0)] + offset_mapping_0 + [(0, 0), (0, 0)] + offset_mapping_1 + [(0, 0)]
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
r"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `encode` method.
Args:
token_ids_0 (`List[int]`):
List of ids of the first sequence.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`str`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`:
The list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
if token_ids_1 is not None:
raise ValueError(
"You should not supply a second sequence if the provided sequence of "
"ids is already formatted with special tokens for the model."
)
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
if token_ids_1 is not None:
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create the token type IDs corresponding to the sequences passed. [What are token type
IDs?](../glossary#token-type-ids) Should be overridden in a subclass if the model has a special way of
building: those.
Args:
token_ids_0 (`List[int]`):
The first tokenized sequence.
token_ids_1 (`List[int]`, *optional*):
The second tokenized sequence.
Returns:
`List[int]`: The token type ids.
"""
# called when `add_special_tokens` is True, so align with `build_inputs_with_special_tokens` method
if token_ids_1 is None:
# [CLS] X [SEP]
return (len(token_ids_0) + 2) * [0]
# [CLS] A [SEP] [SEP] B [SEP]
return [0] * (len(token_ids_0) + 1) + [1] * (len(token_ids_1) + 3)
def is_ch_char(self, char):
"""
is_ch_char
"""
if "\u4e00" <= char <= "\u9fff":
return True
return False
def is_alpha(self, char):
"""
is_alpha
"""
if ("a" <= char <= "z") or ("A" <= char <= "Z"):
return True
return False
def is_punct(self, char):
"""
is_punct
"""
if char in ",;:.?!~,;:。?!《》【】":
return True
return False
def is_whitespace(self, char):
"""
is whitespace
"""
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
if len(char) == 1:
cat = unicodedata.category(char)
if cat == "Zs":
return True
return False
def load_vocab(self, filepath):
token_to_idx = {}
with io.open(filepath, "r", encoding="utf-8") as f:
for index, line in enumerate(f):
token = line.rstrip("\n")
token_to_idx[token] = int(index)
return token_to_idx
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
index = 0
if os.path.isdir(save_directory):
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
else:
vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
with open(vocab_file, "w", encoding="utf-8") as writer:
for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
" Please check that the vocabulary is not corrupted!"
)
index = token_index
writer.write(token + "\n")
index += 1
tokenizer_model_file = os.path.join(save_directory, "sentencepiece.bpe.model")
with open(tokenizer_model_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
return (vocab_file,)
......@@ -2415,6 +2415,58 @@ class ErniePreTrainedModel(metaclass=DummyObject):
requires_backends(self, ["torch"])
ERNIE_M_PRETRAINED_MODEL_ARCHIVE_LIST = None
class ErnieMForInformationExtraction(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class ErnieMForMultipleChoice(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class ErnieMForQuestionAnswering(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class ErnieMForSequenceClassification(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class ErnieMForTokenClassification(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class ErnieMModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class ErnieMPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
ESM_PRETRAINED_MODEL_ARCHIVE_LIST = None
......
......@@ -58,6 +58,13 @@ class DebertaV2Tokenizer(metaclass=DummyObject):
requires_backends(self, ["sentencepiece"])
class ErnieMTokenizer(metaclass=DummyObject):
_backends = ["sentencepiece"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["sentencepiece"])
class FNetTokenizer(metaclass=DummyObject):
_backends = ["sentencepiece"]
......
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. and Baidu team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Testing suite for the PyTorch ErnieM model. """
import unittest
from transformers import ErnieMConfig, is_torch_available
from transformers.testing_utils import require_torch, slow, torch_device
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask
if is_torch_available():
import torch
from transformers import (
ErnieMForInformationExtraction,
ErnieMForMultipleChoice,
ErnieMForQuestionAnswering,
ErnieMForSequenceClassification,
ErnieMForTokenClassification,
ErnieMModel,
)
from transformers.models.ernie_m.modeling_ernie_m import ERNIE_M_PRETRAINED_MODEL_ARCHIVE_LIST
class ErnieMModelTester:
def __init__(
self,
parent,
batch_size=13,
seq_length=7,
is_training=True,
use_input_mask=True,
use_labels=True,
vocab_size=99,
hidden_size=32,
num_hidden_layers=5,
num_attention_heads=4,
intermediate_size=37,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=16,
type_sequence_label_size=2,
initializer_range=0.02,
num_labels=3,
num_choices=4,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_input_mask = use_input_mask
self.use_labels = use_labels
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.type_sequence_label_size = type_sequence_label_size
self.initializer_range = initializer_range
self.num_labels = num_labels
self.num_choices = num_choices
self.scope = scope
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
input_mask = None
if self.use_input_mask:
input_mask = random_attention_mask([self.batch_size, self.seq_length])
sequence_labels = None
token_labels = None
choice_labels = None
if self.use_labels:
sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
choice_labels = ids_tensor([self.batch_size], self.num_choices)
config = self.get_config()
return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
def prepare_config_and_inputs_for_uiem(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
input_mask = None
if self.use_input_mask:
input_mask = random_attention_mask([self.batch_size, self.seq_length])
config = self.get_config()
return config, input_ids, input_mask
def get_config(self):
return ErnieMConfig(
vocab_size=self.vocab_size,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
intermediate_size=self.intermediate_size,
hidden_act=self.hidden_act,
hidden_dropout_prob=self.hidden_dropout_prob,
attention_probs_dropout_prob=self.attention_probs_dropout_prob,
max_position_embeddings=self.max_position_embeddings,
type_vocab_size=self.type_vocab_size,
initializer_range=self.initializer_range,
)
def create_and_check_model(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = ErnieMModel(config=config)
model.to(torch_device)
model.eval()
result = model(input_ids, return_dict=True)
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
def create_and_check_for_question_answering(
self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
):
model = ErnieMForQuestionAnswering(config=config)
model.to(torch_device)
model.eval()
result = model(
input_ids,
attention_mask=input_mask,
start_positions=sequence_labels,
end_positions=sequence_labels,
)
self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
def create_and_check_for_information_extraction(
self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
):
model = ErnieMForInformationExtraction(config=config)
model.to(torch_device)
model.eval()
sequence_labels = torch.ones_like(input_ids, dtype=torch.float32)
result = model(
input_ids,
attention_mask=input_mask,
start_positions=sequence_labels,
end_positions=sequence_labels,
)
self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
def create_and_check_for_sequence_classification(
self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
):
config.num_labels = self.num_labels
model = ErnieMForSequenceClassification(config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, labels=sequence_labels)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
def create_and_check_for_token_classification(
self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
):
config.num_labels = self.num_labels
model = ErnieMForTokenClassification(config=config)
model.to(torch_device)
model.eval()
input_ids.to(torch_device)
input_mask.to(torch_device)
token_labels.to(torch_device)
result = model(input_ids, attention_mask=input_mask, labels=token_labels)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
def create_and_check_for_multiple_choice(
self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
):
config.num_choices = self.num_choices
model = ErnieMForMultipleChoice(config=config)
model.to(torch_device)
model.eval()
multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
result = model(
multiple_choice_inputs_ids,
attention_mask=multiple_choice_input_mask,
labels=choice_labels,
)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(
config,
input_ids,
input_mask,
sequence_labels,
token_labels,
choice_labels,
) = config_and_inputs
inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
return config, inputs_dict
@require_torch
class ErnieMModelTest(ModelTesterMixin, unittest.TestCase):
all_model_classes = (
(
ErnieMModel,
ErnieMForMultipleChoice,
ErnieMForQuestionAnswering,
ErnieMForSequenceClassification,
ErnieMForTokenClassification,
)
if is_torch_available()
else ()
)
all_generative_model_classes = ()
test_torchscript = False
def setUp(self):
self.model_tester = ErnieMModelTester(self)
self.config_tester = ConfigTester(self, config_class=ErnieMConfig, hidden_size=37)
def test_config(self):
self.config_tester.run_common_tests()
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
def test_model_various_embeddings(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
for type in ["absolute", "relative_key", "relative_key_query"]:
config_and_inputs[0].position_embedding_type = type
self.model_tester.create_and_check_model(*config_and_inputs)
def test_for_multiple_choice(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs)
def test_for_question_answering(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
def test_for_sequence_classification(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs)
def test_for_information_extraction(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_information_extraction(*config_and_inputs)
def test_for_token_classification(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_token_classification(*config_and_inputs)
@slow
def test_model_from_pretrained(self):
for model_name in ERNIE_M_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
model = ErnieMModel.from_pretrained(model_name)
self.assertIsNotNone(model)
@require_torch
class ErnieMModelIntegrationTest(unittest.TestCase):
@slow
def test_inference_model(self):
model = ErnieMModel.from_pretrained("susnato/ernie-m-base_pytorch")
model.eval()
input_ids = torch.tensor([[0, 1, 2, 3, 4, 5]])
output = model(input_ids)[0]
# TODO Replace vocab size
hidden_size = 768
expected_shape = torch.Size((1, 6, hidden_size))
self.assertEqual(output.shape, expected_shape)
expected_slice = torch.tensor(
[[[-0.0012, 0.1245, -0.0214], [-0.0742, 0.0244, -0.0771], [-0.0333, 0.1164, -0.1554]]]
)
self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3))
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. and Baidu team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Testing suite for the PyTorch ErnieM model. """
import unittest
from transformers import ErnieMTokenizer
from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, slow
from ...test_tokenization_common import TokenizerTesterMixin
SAMPLE_VOCAB = get_tests_dir("fixtures/spiece.model")
@require_sentencepiece
@require_tokenizers
class ErnieMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = ErnieMTokenizer
test_seq2seq = False
test_sentencepiece = True
test_rust_tokenizer = False
test_sentencepiece_ignore_case = False
def setUp(self):
super().setUp()
# We have a SentencePiece fixture for testing
tokenizer = ErnieMTokenizer(SAMPLE_VOCAB, unk_token="<unk>", pad_token="<pad>")
tokenizer.save_pretrained(self.tmpdirname)
def get_input_output_texts(self, tokenizer):
input_text = "this is a test"
output_text = "this is a test"
return input_text, output_text
def test_convert_token_and_id(self):
"""Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
token = "<pad>"
token_id = 0
self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id)
self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token)
def test_get_vocab(self):
vocab_keys = list(self.get_tokenizer().get_vocab().keys())
self.assertEqual(vocab_keys[0], "<pad>")
self.assertEqual(vocab_keys[1], "<unk>")
self.assertEqual(vocab_keys[-1], "▁eloquent")
self.assertEqual(len(vocab_keys), 30_000)
def test_vocab_size(self):
self.assertEqual(self.get_tokenizer().vocab_size, 30_000)
def test_rust_and_python_full_tokenizers(self):
if not self.test_rust_tokenizer:
return
tokenizer = self.get_tokenizer()
rust_tokenizer = self.get_rust_tokenizer()
sequence = "I was born in 92000, and this is falsé."
tokens = tokenizer.tokenize(sequence)
rust_tokens = rust_tokenizer.tokenize(sequence)
self.assertListEqual(tokens, rust_tokens)
ids = tokenizer.encode(sequence, add_special_tokens=False)
rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
self.assertListEqual(ids, rust_ids)
rust_tokenizer = self.get_rust_tokenizer()
ids = tokenizer.encode(sequence)
rust_ids = rust_tokenizer.encode(sequence)
self.assertListEqual(ids, rust_ids)
def test_full_tokenizer(self):
tokenizer = ErnieMTokenizer(SAMPLE_VOCAB, do_lower_case=True, unk_token="<unk>", pad_token="<pad>")
tokens = tokenizer.tokenize("This is a test")
self.assertListEqual(tokens, ["▁this", "▁is", "▁a", "▁test"])
self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [48, 25, 21, 1289])
tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
# ErnieMTokenizer(paddlenlp implementation) outputs '9' instead of '_9' so to mimic that '_9' is changed to '9'
self.assertListEqual(
tokens, ["▁i", "▁was", "▁born", "▁in", "9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "é", "."]
)
ids = tokenizer.convert_tokens_to_ids(tokens)
self.assertListEqual(ids, [31, 23, 386, 19, 518, 3050, 15, 17, 48, 25, 8256, 18, 1, 9])
back_tokens = tokenizer.convert_ids_to_tokens(ids)
self.assertListEqual(
back_tokens,
["▁i", "▁was", "▁born", "▁in", "9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "<unk>", "."],
)
def test_sequence_builders(self):
tokenizer = ErnieMTokenizer(SAMPLE_VOCAB, unk_token="<unk>", pad_token="<pad>")
text = tokenizer.encode("sequence builders")
text_2 = tokenizer.encode("multi-sequence build")
encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id]
assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + [
tokenizer.sep_token_id
] + text_2 + [tokenizer.sep_token_id]
@slow
def test_tokenizer_integration(self):
# fmt: off
expected_encoding = {'input_ids': [[0, 11062, 82772, 7, 15, 82772, 538, 51529, 237, 17198, 1290, 206, 9, 215175, 1314, 136, 17198, 1290, 206, 9, 56359, 42, 122009, 9, 16466, 16, 87344, 4537, 9, 4717, 78381, 6, 159958, 7, 15, 24480, 618, 4, 527, 22693, 9, 304, 4, 2777, 24480, 9874, 4, 43523, 594, 4, 803, 18392, 33189, 18, 4, 43523, 24447, 5, 5, 5, 16, 100, 24955, 83658, 9626, 144057, 15, 839, 22335, 16, 136, 24955, 83658, 83479, 15, 39102, 724, 16, 678, 645, 6460, 1328, 4589, 42, 122009, 115774, 23, 3559, 1328, 46876, 7, 136, 53894, 1940, 42227, 41159, 17721, 823, 425, 4, 27512, 98722, 206, 136, 5531, 4970, 919, 17336, 5, 2], [0, 20080, 618, 83, 82775, 47, 479, 9, 1517, 73, 53894, 333, 80581, 110117, 18811, 5256, 1295, 51, 152526, 297, 7986, 390, 124416, 538, 35431, 214, 98, 15044, 25737, 136, 7108, 43701, 23, 756, 135355, 7, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 581, 63773, 119455, 6, 147797, 88203, 7, 645, 70, 21, 3285, 10269, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}
# fmt: on
self.tokenizer_integration_test_util(
expected_encoding=expected_encoding,
model_name="susnato/ernie-m-base_pytorch",
sequences=[
"Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides "
"general-purpose architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet...) for Natural "
"Language Understanding (NLU) and Natural Language Generation (NLG) with over32+ pretrained "
"models in100+ languages and deep interoperability between Jax, PyTorch and TensorFlow.",
"BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly "
"conditioning on both left and right context in all layers.",
"The quick brown fox jumps over the lazy dog.",
],
)
......@@ -56,6 +56,7 @@ IGNORE_NON_TESTED = PRIVATE_MODELS.copy() + [
"Blip2QFormerModel", # Building part of bigger (tested) model.
"DetaEncoder", # Building part of bigger (tested) model.
"DetaDecoder", # Building part of bigger (tested) model.
"ErnieMForInformationExtraction",
"GraphormerEncoder", # Building part of bigger (tested) model.
"GraphormerDecoderHead", # Building part of bigger (tested) model.
"CLIPSegDecoder", # Building part of bigger (tested) model.
......@@ -175,6 +176,7 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
"Blip2ForConditionalGeneration",
"Blip2QFormerModel",
"Blip2VisionModel",
"ErnieMForInformationExtraction",
"GitVisionModel",
"GraphormerModel",
"GraphormerForGraphClassification",
......
......@@ -80,6 +80,8 @@ src/transformers/models/electra/configuration_electra.py
src/transformers/models/electra/modeling_electra.py
src/transformers/models/electra/modeling_tf_electra.py
src/transformers/models/ernie/configuration_ernie.py
src/transformers/models/ernie_m/configuration_ernie_m.py
src/transformers/models/ernie_m/modeling_ernie_m.py
src/transformers/models/flava/configuration_flava.py
src/transformers/models/fnet/configuration_fnet.py
src/transformers/models/fsmt/configuration_fsmt.py
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment