"megatron/vscode:/vscode.git/clone" did not exist on "803ae5ee660b66c0da06ad8174d48699d49af8bc"
Unverified Commit 2422cda0 authored by Weizhen's avatar Weizhen Committed by GitHub
Browse files

ProphetNet (#7157)



* add new model prophetnet

prophetnet modified

modify codes as suggested v1

add prophetnet test files

* still bugs, because of changed output formats of encoder and decoder

* move prophetnet into the latest version

* clean integration tests

* clean tokenizers

* add xlm config to init

* correct typo in init

* further refactoring

* continue refactor

* save parallel

* add decoder_attention_mask

* fix use_cache vs. past_key_values

* fix common tests

* change decoder output logits

* fix xlm tests

* make common tests pass

* change model architecture

* add tokenizer tests

* finalize model structure

* no weight mapping

* correct n-gram stream attention mask as discussed with qweizhen

* remove unused import

* fix index.rst

* fix tests

* delete unnecessary code

* add fast integration test

* rename weights

* final weight remapping

* save intermediate

* Descriptions for Prophetnet Config File

* finish all models

* finish new model outputs

* delete unnecessary files

* refactor encoder layer

* add dummy docs

* code quality

* fix tests

* add model pages to doctree

* further refactor

* more refactor, more tests

* finish code refactor and tests

* remove unnecessary files

* further clean up

* add docstring template

* finish tokenizer doc

* finish prophetnet

* fix copies

* fix typos

* fix tf tests

* fix fp16

* fix tf test 2nd try

* fix code quality

* add test for each model

* merge new tests to branch

* Update model_cards/microsoft/prophetnet-large-uncased-cnndm/README.md
Co-authored-by: Sam Shleifer <sshleifer@gmail.com>

* Update model_cards/microsoft/prophetnet-large-uncased-cnndm/README.md
Co-authored-by: Sam Shleifer <sshleifer@gmail.com>

* Update src/transformers/modeling_prophetnet.py
Co-authored-by: Sam Shleifer <sshleifer@gmail.com>

* Update utils/check_repo.py
Co-authored-by: Sam Shleifer <sshleifer@gmail.com>

* apply Sam's and Sylvain's comments

* make style

* remove unnecessary code

* Update README.md
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update README.md
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/configuration_prophetnet.py
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* implement Lysandre's comments

* correct docs

* fix isort

* fix tokenizers

* fix copies
Co-authored-by: weizhen <weizhen@mail.ustc.edu.cn>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
......@@ -490,12 +490,17 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
self._tie_or_clone_weights(output_embeddings, self.get_input_embeddings())
if self.config.is_encoder_decoder and self.config.tie_encoder_decoder:
if hasattr(self, self.base_model_prefix):
self = getattr(self, self.base_model_prefix)
self._tie_encoder_decoder_weights(self.encoder, self.decoder, self.base_model_prefix)
@staticmethod
def _tie_encoder_decoder_weights(encoder: nn.Module, decoder: nn.Module, base_model_prefix: str):
uninitialized_encoder_weights: List[str] = []
assert decoder.__class__ == encoder.__class__, f"{decoder.__class__} and {encoder.__class__} have to be equal."
if decoder.__class__ != encoder.__class__:
logger.info(
f"{decoder.__class__} and {encoder.__class__} are not equal. In this case make sure that all encoder weights are correctly initialized."
)
def tie_encoder_to_decoder_recursively(
decoder_pointer: nn.Module,
......@@ -528,7 +533,9 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
if name.isdigit():
encoder_name = str(int(name) + encoder_layer_pos)
decoder_name = name
if not isinstance(decoder_modules[decoder_name], type(encoder_modules[encoder_name])):
if not isinstance(decoder_modules[decoder_name], type(encoder_modules[encoder_name])) and len(
encoder_modules
) != len(decoder_modules):
# this can happen if the name corresponds to the position in a module list of layers
# in this case the decoder has added a cross-attention layer that the encoder does not have
# thus skip this step and subtract one layer pos from the encoder
......
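The hunk above relaxes encoder/decoder weight tying so that a decoder carrying cross-attention layers can still be tied to an encoder that lacks them. A minimal sketch of how the flag is meant to be used (hypothetical usage, not part of this diff)::

>>> from transformers import ProphetNetConfig, ProphetNetModel
>>> config = ProphetNetConfig.from_pretrained('microsoft/prophetnet-large-uncased')
>>> config.tie_encoder_decoder = True  # clone encoder weights from the decoder where module names match
>>> model = ProphetNetModel.from_pretrained('microsoft/prophetnet-large-uncased', config=config)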
# coding=utf-8
# Copyright 2020 The Microsoft Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch XLM-ProphetNet model."""
from .configuration_xlm_prophetnet import XLMProphetNetConfig
from .modeling_prophetnet import (
ProphetNetDecoder,
ProphetNetEncoder,
ProphetNetForCausalLM,
ProphetNetForConditionalGeneration,
ProphetNetModel,
)
from .utils import logging
logger = logging.get_logger(__name__)
_TOKENIZER_FOR_DOC = "XLMProphetNetTokenizer"
XLM_PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
"microsoft/xprophetnet-large-wiki100-cased",
# See all ProphetNet models at https://huggingface.co/models?filter=xprophetnet
]
class XLMProphetNetEncoder(ProphetNetEncoder):
r"""
This class overrides :class:`~transformers.ProphetNetEncoder`. Please check the
superclass for the appropriate documentation alongside usage examples.
Example::
>>> from transformers import XLMProphetNetTokenizer, XLMProphetNetEncoder
>>> import torch
>>> tokenizer = XLMProphetNetTokenizer.from_pretrained('microsoft/xprophetnet-large-wiki100-cased')
>>> model = XLMProphetNetEncoder.from_pretrained('patrickvonplaten/xprophetnet-large-uncased-standalone', return_dict=True)
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)
>>> last_hidden_states = outputs.last_hidden_state
"""
config_class = XLMProphetNetConfig
class XLMProphetNetDecoder(ProphetNetDecoder):
r"""
This class overrides :class:`~transformers.ProphetNetDecoder`. Please check the
superclass for the appropriate documentation alongside usage examples.
Example::
>>> from transformers import XLMProphetNetTokenizer, XLMProphetNetDecoder
>>> import torch
>>> tokenizer = XLMProphetNetTokenizer.from_pretrained('microsoft/xprophetnet-large-wiki100-cased')
>>> model = XLMProphetNetDecoder.from_pretrained('patrickvonplaten/xprophetnet-large-uncased-standalone', add_cross_attention=False, return_dict=True)
>>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)
>>> last_hidden_states = outputs.last_hidden_state
"""
config_class = XLMProphetNetConfig
class XLMProphetNetModel(ProphetNetModel):
r"""
This class overrides :class:`~transformers.ProphetNetModel`. Please check the
superclass for the appropriate documentation alongside usage examples.
Example::
>>> from transformers import XLMProphetNetTokenizer, XLMProphetNetModel
>>> tokenizer = XLMProphetNetTokenizer.from_pretrained('microsoft/xprophetnet-large-wiki100-cased')
>>> model = XLMProphetNetModel.from_pretrained('microsoft/xprophetnet-large-wiki100-cased')
>>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids # Batch size 1
>>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1
>>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids, return_dict=True)
>>> last_hidden_states = outputs.last_hidden_state # main stream hidden states
>>> last_hidden_states_ngram = outputs.last_hidden_state_ngram # predict hidden states
"""
config_class = XLMProphetNetConfig
class XLMProphetNetForConditionalGeneration(ProphetNetForConditionalGeneration):
r"""
This class overrides :class:`~transformers.ProphetNetForConditionalGeneration`. Please check the
superclass for the appropriate documentation alongside usage examples.
Example::
>>> from transformers import XLMProphetNetTokenizer, XLMProphetNetForConditionalGeneration
>>> tokenizer = XLMProphetNetTokenizer.from_pretrained('microsoft/xprophetnet-large-wiki100-cased')
>>> model = XLMProphetNetForConditionalGeneration.from_pretrained('microsoft/xprophetnet-large-wiki100-cased')
>>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids # Batch size 1
>>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1
>>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids, return_dict=True)
>>> logits_next_token = outputs.logits # logits to predict next token as usual
>>> logits_ngram_next_tokens = outputs.logits_ngram # logits to predict 2nd, 3rd, ... next tokens
"""
config_class = XLMProphetNetConfig
class XLMProphetNetForCausalLM(ProphetNetForCausalLM):
r"""
This class overrides :class:`~transformers.ProphetNetForCausalLM`. Please check the
superclass for the appropriate documentation alongside usage examples.
Example::
>>> from transformers import XLMProphetNetTokenizer, XLMProphetNetForCausalLM
>>> import torch
>>> tokenizer = XLMProphetNetTokenizer.from_pretrained('microsoft/xprophetnet-large-wiki100-cased')
>>> model = XLMProphetNetForCausalLM.from_pretrained('patrickvonplaten/xprophetnet-decoder-clm-large-uncased', return_dict=True)
>>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)
>>> logits = outputs.logits
>>> # Model can also be used with EncoderDecoder framework
>>> from transformers import BertTokenizer, EncoderDecoderModel
>>> import torch
>>> tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
>>> model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-large-uncased", "patrickvonplaten/xprophetnet-decoder-clm-large-uncased")
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(input_ids=inputs["input_ids"], labels=inputs["input_ids"])
>>> loss = outputs.loss
"""
config_class = XLMProphetNetConfig
......@@ -43,6 +43,7 @@ from .configuration_auto import (
MobileBertConfig,
OpenAIGPTConfig,
PegasusConfig,
ProphetNetConfig,
RagConfig,
ReformerConfig,
RetriBertConfig,
......@@ -51,6 +52,7 @@ from .configuration_auto import (
T5Config,
TransfoXLConfig,
XLMConfig,
XLMProphetNetConfig,
XLMRobertaConfig,
XLNetConfig,
replace_list_option_in_docstrings,
......@@ -77,6 +79,7 @@ from .tokenization_lxmert import LxmertTokenizer
from .tokenization_mobilebert import MobileBertTokenizer
from .tokenization_openai import OpenAIGPTTokenizer
from .tokenization_phobert import PhobertTokenizer
from .tokenization_prophetnet import ProphetNetTokenizer
from .tokenization_rag import RagTokenizer
from .tokenization_retribert import RetriBertTokenizer
from .tokenization_roberta import RobertaTokenizer
......@@ -95,6 +98,7 @@ if is_sentencepiece_available():
from .tokenization_pegasus import PegasusTokenizer
from .tokenization_reformer import ReformerTokenizer
from .tokenization_t5 import T5Tokenizer
from .tokenization_xlm_prophetnet import XLMProphetNetTokenizer
from .tokenization_xlm_roberta import XLMRobertaTokenizer
from .tokenization_xlnet import XLNetTokenizer
else:
......@@ -199,6 +203,8 @@ TOKENIZER_MAPPING = OrderedDict(
(BertGenerationConfig, (BertGenerationTokenizer, None)),
(DebertaConfig, (DebertaTokenizer, None)),
(RagConfig, (RagTokenizer, None)),
(XLMProphetNetConfig, (XLMProphetNetTokenizer, None)),
(ProphetNetConfig, (ProphetNetTokenizer, None)),
]
)
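Note that TOKENIZER_MAPPING is matched in order per config class, which is why :class:`XLMProphetNetConfig` is registered before :class:`ProphetNetConfig`; if the former subclasses the latter, reversing the order would make XLM checkpoints resolve to the plain ProphetNet tokenizer. A sketch of the intended resolution (assuming standard ``AutoTokenizer`` behavior)::

>>> from transformers import AutoTokenizer
>>> tokenizer = AutoTokenizer.from_pretrained('microsoft/xprophetnet-large-wiki100-cased')  # XLMProphetNetTokenizer
>>> tokenizer = AutoTokenizer.from_pretrained('microsoft/prophetnet-large-uncased')  # ProphetNetTokenizer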
......
......@@ -14,7 +14,6 @@
# limitations under the License.
""" Tokenization class for model DeBERTa."""
import logging
import os
import pathlib
import random
......@@ -28,6 +27,7 @@ import tqdm
import requests
from .tokenization_utils import PreTrainedTokenizer
from .utils import logging
try:
......@@ -36,7 +36,7 @@ except ImportError:
raise ImportError("Please install regex with: pip install regex")
logger = logging.getLogger(__name__)
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "bpe_encoder.bin"}
......
......@@ -16,7 +16,6 @@
import json
import logging
import os
import re
import unicodedata
......@@ -27,9 +26,10 @@ import sacremoses as sm
from .file_utils import add_start_docstrings
from .tokenization_utils import BatchEncoding, PreTrainedTokenizer
from .tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING
from .utils import logging
logger = logging.getLogger(__name__)
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {
"src_vocab_file": "vocab-src.json",
......
# coding=utf-8
# Copyright 2020 The Microsoft Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import os
from typing import List, Optional, Tuple
from .tokenization_bert import BasicTokenizer, WordpieceTokenizer
from .tokenization_utils import PreTrainedTokenizer
from .utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "prophetnet.tokenizer"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"microsoft/prophetnet-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/microsoft/prophetnet-large-uncased/prophetnet.tokenizer",
}
}
PRETRAINED_INIT_CONFIGURATION = {
"microsoft/prophetnet-large-uncased": {"do_lower_case": True},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"microsoft/prophetnet-large-uncased": 512,
}
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
with open(vocab_file, "r", encoding="utf-8") as reader:
tokens = reader.readlines()
for index, token in enumerate(tokens):
token = token.rstrip("\n")
vocab[token] = index
return vocab
class ProphetNetTokenizer(PreTrainedTokenizer):
r"""
Construct a ProphetNetTokenizer. Based on WordPiece.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (:obj:`str`):
File containing the vocabulary.
do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to lowercase the input when tokenizing.
do_basic_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to do basic tokenization before WordPiece.
never_split (:obj:`Iterable`, `optional`):
Collection of tokens which will never be split during tokenization. Only has an effect when
:obj:`do_basic_tokenize=True`
unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
for sequence classification or for a text and a question for question answering.
It is also used as the last token of a sequence built with special tokens.
x_sep_token (:obj:`str`, `optional`, defaults to :obj:`"[X_SEP]"`):
Special second separator token, which can be generated by :class:`~transformers.ProphetNetForConditionalGeneration`.
It is used, for instance, to separate bullet-point-like sentences in summarization.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`):
The token used for padding, for example when batching sequences of different lengths.
cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole
sequence instead of per-token classification). It is the first token of the sequence when built with
special tokens.
mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to tokenize Chinese characters.
This should likely be deactivated for Japanese (see this `issue
<https://github.com/huggingface/transformers/issues/328>`__).
strip_accents (:obj:`bool`, `optional`):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for :obj:`lowercase` (as in the original BERT).
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(
self,
vocab_file,
do_lower_case=True,
do_basic_tokenize=True,
never_split=None,
unk_token="[UNK]",
sep_token="[SEP]",
x_sep_token="[X_SEP]",
pad_token="[PAD]",
mask_token="[MASK]",
tokenize_chinese_chars=True,
strip_accents=None,
**kwargs
):
super().__init__(
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
mask_token=mask_token,
x_sep_token=x_sep_token,
**kwargs,
)
self.unique_no_split_tokens.append(x_sep_token)
if not os.path.isfile(vocab_file):
raise ValueError(
"Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
"model use `tokenizer = ProphetNetTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)
)
self.vocab = load_vocab(vocab_file)
self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
self.do_basic_tokenize = do_basic_tokenize
if do_basic_tokenize:
self.basic_tokenizer = BasicTokenizer(
do_lower_case=do_lower_case,
never_split=never_split,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
@property
def vocab_size(self):
return len(self.vocab)
def get_vocab(self):
return dict(self.vocab, **self.added_tokens_encoder)
def _tokenize(self, text):
split_tokens = []
if self.do_basic_tokenize:
for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
# If the token is part of the never_split set
if token in self.basic_tokenizer.never_split:
split_tokens.append(token)
else:
split_tokens += self.wordpiece_tokenizer.tokenize(token)
else:
split_tokens = self.wordpiece_tokenizer.tokenize(text)
return split_tokens
def _convert_token_to_id(self, token):
""" Converts a token (str) in an id using the vocab. """
return self.vocab.get(token, self.vocab.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.ids_to_tokens.get(index, self.unk_token)
def convert_tokens_to_string(self, tokens):
""" Converts a sequence of tokens (string) in a single string. """
out_string = " ".join(tokens).replace(" ##", "").strip()
return out_string
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` method.
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
if token_ids_1 is not None:
raise ValueError(
"You should not supply a second sequence if the provided sequence of "
"ids is already formated with special tokens for the model."
)
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
if token_ids_1 is None:
return ([0] * len(token_ids_0)) + [1]
return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
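# e.g., with hypothetical ids: get_special_tokens_mask([5, 6]) -> [0, 0, 1]
# and get_special_tokens_mask([5, 6], [7, 8]) -> [0, 0, 1, 0, 0, 1]; only the appended [SEP]s count as special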
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task.
A ProphetNet sequence pair mask has the following format:
::
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
sequence(s).
"""
sep = [self.sep_token_id]
if token_ids_1 is None:
return len(token_ids_0 + sep) * [0]
return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
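# e.g., with hypothetical ids: create_token_type_ids_from_sequences([5, 6], [7, 8]) -> [0, 0, 0, 1, 1, 1]
# three 0s cover token_ids_0 plus its [SEP], three 1s cover token_ids_1 plus its [SEP]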
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
index = 0
if os.path.isdir(save_directory):
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
else:
vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
with open(vocab_file, "w", encoding="utf-8") as writer:
for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
"Saving vocabulary to {}: vocabulary indices are not consecutive."
" Please check that the vocabulary is not corrupted!".format(vocab_file)
)
index = token_index
writer.write(token + "\n")
index += 1
return (vocab_file,)
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks
by concatenating and adding special tokens.
A ProphetNet sequence has the following format:
- single sequence: ``X [SEP]``
- pair of sequences: ``A [SEP] B [SEP]``
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
"""
if token_ids_1 is None:
return token_ids_0 + [self.sep_token_id]
sep = [self.sep_token_id]
return token_ids_0 + sep + token_ids_1 + sep
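A minimal usage sketch for the tokenizer above (the integer ids are illustrative, not taken from the real vocab)::

>>> from transformers import ProphetNetTokenizer
>>> tokenizer = ProphetNetTokenizer.from_pretrained('microsoft/prophetnet-large-uncased')
>>> tokenizer.build_inputs_with_special_tokens([5, 6])  # [5, 6, tokenizer.sep_token_id]
>>> tokenizer.build_inputs_with_special_tokens([5, 6], [7, 8])  # [5, 6, sep, 7, 8, sep]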
# coding=utf-8
# Copyright 2020 The Microsoft Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import os
from shutil import copyfile
from typing import List, Optional, Tuple
from .tokenization_utils import PreTrainedTokenizer
from .utils import logging
logger = logging.get_logger(__name__)
SPIECE_UNDERLINE = "▁"
VOCAB_FILES_NAMES = {"vocab_file": "prophetnet.tokenizer"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"microsoft/xprophetnet-large-wiki100-cased": "https://cdn.huggingface.co/microsoft/xprophetnet-large-wiki100-cased/prophetnet.tokenizer",
}
}
PRETRAINED_INIT_CONFIGURATION = {
"microsoft/xprophetnet-large-wiki100-cased": {"do_lower_case": False},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"microsoft/xprophetnet-large-wiki100-cased": 512,
}
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
with open(vocab_file, "r", encoding="utf-8") as reader:
tokens = reader.readlines()
for index, token in enumerate(tokens):
token = token.rstrip("\n")
vocab[token] = index
return vocab
class XLMProphetNetTokenizer(PreTrainedTokenizer):
"""
Adapted from :class:`~transformers.RobertaTokenizer` and :class:`~transformers.XLNetTokenizer`. Based on
`SentencePiece <https://github.com/google/sentencepiece>`__.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the beginning
of sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The end of sequence token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the end
of sequence. The token used is the :obj:`sep_token`.
sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
for sequence classification or for a text and a question for question answering.
It is also used as the last token of a sequence built with special tokens.
cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
The classifier token which is used when doing sequence classification (classification of the whole
sequence instead of per-token classification). It is the first token of the sequence when built with
special tokens.
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer.
Attributes:
sp_model (:obj:`SentencePieceProcessor`):
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["attention_mask"]
def __init__(
self,
vocab_file,
bos_token="[SEP]",
eos_token="[SEP]",
sep_token="[SEP]",
unk_token="[UNK]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
**kwargs
):
super().__init__(
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
mask_token=mask_token,
**kwargs,
)
try:
import sentencepiece as spm
except ImportError:
logger.warning(
"You need to install SentencePiece to use XLMRobertaTokenizer: https://github.com/google/sentencepiece"
"pip install sentencepiece"
)
raise
self.sp_model = spm.SentencePieceProcessor()
self.sp_model.Load(str(vocab_file))
self.vocab_file = vocab_file
# Original fairseq vocab and spm vocab must be "aligned":
# Vocab | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9
# -------- | ------- | ------- | ------ | ------- | --- | --- | --- | ----- | ----- | ----
# fairseq | '<s>' | '<pad>' | '</s>' | '<unk>' | ',' | '.' | '▁' | 's' | '▁de' | '-'
# spm | '<unk>' | '<s>' | '</s>' | ',' | '.' | '▁' | 's' | '▁de' | '-' | '▁a'
# put special tokens and [unused] tokens into the vocab
self.fairseq_tokens_to_ids = {"[PAD]": 0, "[CLS]": 1, "[SEP]": 2, "[UNK]": 3, "[MASK]": 4}
for i in range(10):
tok = "[unused{}]".format(i)
self.fairseq_tokens_to_ids[tok] = 5 + i
# The first "real" token "," has position 15 in the embedding vocab and position 3 in the spm vocab
self.fairseq_offset = 12
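# i.e. ids 0-4 hold the 5 special tokens and ids 5-14 the 10 [unused] tokens, so the first
# real spm piece (",", spm id 3) must land at embedding id 15, hence the offset 15 - 3 = 12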
self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
for k in self.fairseq_tokens_to_ids.keys():
self.unique_no_split_tokens.append(k)
def __getstate__(self):
state = self.__dict__.copy()
state["sp_model"] = None
return state
def __setstate__(self, d):
self.__dict__ = d
try:
import sentencepiece as spm
except ImportError:
logger.warning(
"You need to install SentencePiece to use XLMRobertaTokenizer: https://github.com/google/sentencepiece"
"pip install sentencepiece"
)
raise
self.sp_model = spm.SentencePieceProcessor()
self.sp_model.Load(self.vocab_file)
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` method.
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
if token_ids_1 is not None:
raise ValueError(
"You should not supply a second sequence if the provided sequence of "
"ids is already formated with special tokens for the model."
)
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
if token_ids_1 is None:
return ([0] * len(token_ids_0)) + [1]
return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task.
XLMProphetNet does not make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
if token_ids_1 is None:
return len(token_ids_0 + sep) * [0]
return len(token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
@property
def vocab_size(self):
return len(self.sp_model) + self.fairseq_offset
def get_vocab(self):
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab
def _tokenize(self, text):
return self.sp_model.EncodeAsPieces(text)
def _convert_token_to_id(self, token):
""" Converts a token (str) in an id using the vocab. """
if token in self.fairseq_tokens_to_ids:
return self.fairseq_tokens_to_ids[token]
spm_id = self.sp_model.PieceToId(token)
# Need to return unknown token if the SP model returned 0
return spm_id + self.fairseq_offset if spm_id else self.unk_token_id
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
if index in self.fairseq_ids_to_tokens:
return self.fairseq_ids_to_tokens[index]
return self.sp_model.IdToPiece(index - self.fairseq_offset)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (strings for sub-words) in a single string."""
out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
return out_string
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
copyfile(self.vocab_file, out_vocab_file)
return (out_vocab_file,)
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks
by concatenating and adding special tokens.
An XLMProphetNet sequence has the following format:
- single sequence: ``X [SEP]``
- pair of sequences: ``A [SEP] B [SEP]``
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
"""
if token_ids_1 is None:
return token_ids_0 + [self.sep_token_id]
sep = [self.sep_token_id]
return token_ids_0 + sep + token_ids_1 + sep
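A short sketch of the id mapping implemented above (the ',' example follows the alignment comment in ``__init__``)::

>>> from transformers import XLMProphetNetTokenizer
>>> tokenizer = XLMProphetNetTokenizer.from_pretrained('microsoft/xprophetnet-large-wiki100-cased')
>>> tokenizer._convert_token_to_id('[PAD]')  # 0, looked up in fairseq_tokens_to_ids
>>> tokenizer._convert_token_to_id(',')  # sp_model id + fairseq_offset, i.e. 3 + 12 = 15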
......@@ -22,12 +22,13 @@ from typing import List, Optional, Tuple
import sentencepiece as spm
from .tokenization_utils import PreTrainedTokenizer
from .tokenization_xlnet import SPIECE_UNDERLINE
from .utils import logging
logger = logging.get_logger(__name__)
SPIECE_UNDERLINE = "▁"
VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}
PRETRAINED_VOCAB_FILES_MAP = {
......
......@@ -1305,6 +1305,51 @@ class PegasusForConditionalGeneration:
requires_pytorch(self)
PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST = None
class ProphetNetDecoder:
def __init__(self, *args, **kwargs):
requires_pytorch(self)
class ProphetNetEncoder:
def __init__(self, *args, **kwargs):
requires_pytorch(self)
class ProphetNetForCausalLM:
def __init__(self, *args, **kwargs):
requires_pytorch(self)
class ProphetNetForConditionalGeneration:
def __init__(self, *args, **kwargs):
requires_pytorch(self)
@classmethod
def from_pretrained(self, *args, **kwargs):
requires_pytorch(self)
class ProphetNetModel:
def __init__(self, *args, **kwargs):
requires_pytorch(self)
@classmethod
def from_pretrained(self, *args, **kwargs):
requires_pytorch(self)
class ProphetNetPreTrainedModel:
def __init__(self, *args, **kwargs):
requires_pytorch(self)
@classmethod
def from_pretrained(self, *args, **kwargs):
requires_pytorch(self)
class RagModel:
def __init__(self, *args, **kwargs):
requires_pytorch(self)
......@@ -1706,6 +1751,42 @@ class XLMWithLMHeadModel:
requires_pytorch(self)
XLM_PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST = None
class XLMProphetNetDecoder:
def __init__(self, *args, **kwargs):
requires_pytorch(self)
class XLMProphetNetEncoder:
def __init__(self, *args, **kwargs):
requires_pytorch(self)
class XLMProphetNetForCausalLM:
def __init__(self, *args, **kwargs):
requires_pytorch(self)
class XLMProphetNetForConditionalGeneration:
def __init__(self, *args, **kwargs):
requires_pytorch(self)
@classmethod
def from_pretrained(self, *args, **kwargs):
requires_pytorch(self)
class XLMProphetNetModel:
def __init__(self, *args, **kwargs):
requires_pytorch(self)
@classmethod
def from_pretrained(self, *args, **kwargs):
requires_pytorch(self)
XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = None
......
......@@ -74,6 +74,15 @@ class T5Tokenizer:
requires_sentencepiece(self)
class XLMProphetNetTokenizer:
def __init__(self, *args, **kwargs):
requires_sentencepiece(self)
@classmethod
def from_pretrained(self, *args, **kwargs):
requires_sentencepiece(self)
class XLMRobertaTokenizer:
def __init__(self, *args, **kwargs):
requires_sentencepiece(self)
......
......@@ -112,6 +112,7 @@ class ModelTesterMixin:
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
out_2 = outputs[0].cpu().numpy()
out_2[np.isnan(out_2)] = 0
......@@ -152,6 +153,7 @@ class ModelTesterMixin:
with torch.no_grad():
first = model(**self._prepare_for_class(inputs_dict, model_class))[0]
second = model(**self._prepare_for_class(inputs_dict, model_class))[0]
out_1 = first.cpu().numpy()
out_2 = second.cpu().numpy()
out_1 = out_1[~np.isnan(out_1)]
......@@ -183,10 +185,12 @@ class ModelTesterMixin:
def test_attention_outputs(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.return_dict = True
seq_len = getattr(self.model_tester, "seq_length", None)
decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len)
encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len)
decoder_key_length = getattr(self.model_tester, "key_length", decoder_seq_length)
decoder_key_length = getattr(self.model_tester, "decoder_key_length", decoder_seq_length)
encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)
chunk_length = getattr(self.model_tester, "chunk_length", None)
if chunk_length is not None and hasattr(self.model_tester, "num_hashes"):
......@@ -211,7 +215,7 @@ class ModelTesterMixin:
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class), return_dict=True)
attentions = outputs[-1]
attentions = outputs["attentions"] if "attentions" in outputs.keys() else outputs[-1]
self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
if chunk_length is not None:
......@@ -227,8 +231,14 @@ class ModelTesterMixin:
out_len = len(outputs)
if self.is_encoder_decoder:
correct_outlen = 4
decoder_attention_idx = 1
correct_outlen = (
self.model_tester.base_model_out_len if hasattr(self.model_tester, "base_model_out_len") else 4
)
decoder_attention_idx = (
self.model_tester.decoder_attention_idx
if hasattr(self.model_tester, "decoder_attention_idx")
else 1
)
# loss is at first position
if "labels" in inputs_dict:
......@@ -238,6 +248,7 @@ class ModelTesterMixin:
if model_class in MODEL_FOR_QUESTION_ANSWERING_MAPPING.values():
correct_outlen += 1 # start_logits and end_logits instead of only 1 output
decoder_attention_idx += 1
self.assertEqual(out_len, correct_outlen)
decoder_attentions = outputs[decoder_attention_idx]
......@@ -256,9 +267,16 @@ class ModelTesterMixin:
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs))
self_attentions = outputs[-1]
if hasattr(self.model_tester, "num_hidden_states_types"):
added_hidden_states = self.model_tester.num_hidden_states_types
elif self.is_encoder_decoder:
added_hidden_states = 2
else:
added_hidden_states = 1
self.assertEqual(out_len + added_hidden_states, len(outputs))
self_attentions = outputs["attentions"] if "attentions" in outputs else outputs[-1]
self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
if chunk_length is not None:
self.assertListEqual(
......@@ -571,8 +589,8 @@ class ModelTesterMixin:
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
hidden_states = outputs[-1]
outputs = model(**self._prepare_for_class(inputs_dict, model_class), return_dict=True)
hidden_states = outputs["hidden_states"] if "hidden_states" in outputs else outputs[-1]
expected_num_layers = getattr(
self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
......@@ -822,6 +840,7 @@ class ModelTesterMixin:
model.eval()
inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class))
if not self.is_encoder_decoder:
input_ids = inputs["input_ids"]
del inputs["input_ids"]
......@@ -839,7 +858,7 @@ class ModelTesterMixin:
inputs["decoder_inputs_embeds"] = wte(decoder_input_ids)
with torch.no_grad():
model(**inputs)
model(**inputs)[0]
def test_lm_head_model_random_no_beam_search_generate(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
......
......@@ -24,6 +24,7 @@ from .test_modeling_bert import BertModelTester
from .test_modeling_bert_generation import BertGenerationEncoderTester
from .test_modeling_common import ids_tensor
from .test_modeling_gpt2 import GPT2ModelTester
from .test_modeling_prophetnet import ProphetNetStandaloneDecoderModelTester
from .test_modeling_roberta import RobertaModelTester
......@@ -41,9 +42,11 @@ if is_torch_available():
EncoderDecoderConfig,
EncoderDecoderModel,
GPT2LMHeadModel,
ProphetNetForCausalLM,
RobertaForCausalLM,
RobertaModel,
)
from transformers.modeling_outputs import BaseModelOutput
@require_torch
......@@ -82,10 +85,15 @@ class EncoderDecoderMixin:
decoder_input_ids=decoder_input_ids,
attention_mask=attention_mask,
decoder_attention_mask=decoder_attention_mask,
return_dict=True,
)
self.assertEqual(outputs_encoder_decoder[0].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,)))
self.assertEqual(outputs_encoder_decoder[1].shape, (input_ids.shape + (config.hidden_size,)))
self.assertEqual(
outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,))
)
self.assertEqual(
outputs_encoder_decoder["encoder_last_hidden_state"].shape, (input_ids.shape + (config.hidden_size,))
)
def check_encoder_decoder_model(
self,
......@@ -109,20 +117,30 @@ class EncoderDecoderMixin:
decoder_input_ids=decoder_input_ids,
attention_mask=attention_mask,
decoder_attention_mask=decoder_attention_mask,
return_dict=True,
)
self.assertEqual(
outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,))
)
self.assertEqual(
outputs_encoder_decoder["encoder_last_hidden_state"].shape, (input_ids.shape + (config.hidden_size,))
)
self.assertEqual(outputs_encoder_decoder[0].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,)))
self.assertEqual(outputs_encoder_decoder[1].shape, (input_ids.shape + (config.hidden_size,)))
encoder_outputs = (encoder_hidden_states,)
encoder_outputs = BaseModelOutput(last_hidden_state=encoder_hidden_states)
outputs_encoder_decoder = enc_dec_model(
encoder_outputs=encoder_outputs,
decoder_input_ids=decoder_input_ids,
attention_mask=attention_mask,
decoder_attention_mask=decoder_attention_mask,
return_dict=True,
)
self.assertEqual(outputs_encoder_decoder[0].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,)))
self.assertEqual(outputs_encoder_decoder[1].shape, (input_ids.shape + (config.hidden_size,)))
self.assertEqual(
outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,))
)
self.assertEqual(
outputs_encoder_decoder["encoder_last_hidden_state"].shape, (input_ids.shape + (config.hidden_size,))
)
def check_encoder_decoder_model_from_pretrained(
self,
......@@ -145,10 +163,15 @@ class EncoderDecoderMixin:
decoder_input_ids=decoder_input_ids,
attention_mask=attention_mask,
decoder_attention_mask=decoder_attention_mask,
return_dict=True,
)
self.assertEqual(outputs_encoder_decoder[0].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,)))
self.assertEqual(outputs_encoder_decoder[1].shape, (input_ids.shape + (config.hidden_size,)))
self.assertEqual(
outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,))
)
self.assertEqual(
outputs_encoder_decoder["encoder_last_hidden_state"].shape, (input_ids.shape + (config.hidden_size,))
)
def check_save_and_load(
self,
......@@ -255,14 +278,19 @@ class EncoderDecoderMixin:
attention_mask=attention_mask,
decoder_attention_mask=decoder_attention_mask,
labels=labels,
return_dict=True,
)
mlm_loss = outputs_encoder_decoder[0]
loss = outputs_encoder_decoder["loss"]
# check that backprop works
mlm_loss.backward()
loss.backward()
self.assertEqual(outputs_encoder_decoder[1].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,)))
self.assertEqual(outputs_encoder_decoder[2].shape, (input_ids.shape + (config.hidden_size,)))
self.assertEqual(
outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,))
)
self.assertEqual(
outputs_encoder_decoder["encoder_last_hidden_state"].shape, (input_ids.shape + (config.hidden_size,))
)
def check_encoder_decoder_model_generate(self, input_ids, config, decoder_config, **kwargs):
encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
......@@ -425,6 +453,7 @@ class EncoderDecoderMixin:
self.assertLessEqual(max_diff, 1e-5)
@require_torch
class BertEncoderDecoderModelTest(EncoderDecoderMixin, unittest.TestCase):
def get_pretrained_model(self):
return EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-cased", "bert-base-cased")
......@@ -493,6 +522,7 @@ class BertEncoderDecoderModelTest(EncoderDecoderMixin, unittest.TestCase):
self.assertEqual(summary, EXPECTED_SUMMARY)
@require_torch
class BertGenerationEncoderDecoderModelTest(EncoderDecoderMixin, unittest.TestCase):
def get_pretrained_model(self):
return EncoderDecoderModel.from_encoder_decoder_pretrained(
......@@ -554,6 +584,7 @@ class BertGenerationEncoderDecoderModelTest(EncoderDecoderMixin, unittest.TestCa
self.assertEqual(summary, EXPECTED_SUMMARY)
@require_torch
class RoBertaEncoderDecoderModelTest(EncoderDecoderMixin, unittest.TestCase):
def get_encoder_decoder_model(self, config, decoder_config):
encoder_model = RobertaModel(config)
......@@ -606,6 +637,7 @@ class RoBertaEncoderDecoderModelTest(EncoderDecoderMixin, unittest.TestCase):
return EncoderDecoderModel.from_encoder_decoder_pretrained("roberta-base", "roberta-base")
@require_torch
class GPT2EncoderDecoderModelTest(EncoderDecoderMixin, unittest.TestCase):
def get_encoder_decoder_model(self, config, decoder_config):
encoder_model = BertModel(config)
......@@ -663,3 +695,59 @@ class GPT2EncoderDecoderModelTest(EncoderDecoderMixin, unittest.TestCase):
def test_encoder_decoder_model_shared_weights(self):
pass
@require_torch
class ProphetNetEncoderDecoderModelTest(EncoderDecoderMixin, unittest.TestCase):
def get_encoder_decoder_model(self, config, decoder_config):
encoder_model = BertModel(config)
decoder_model = ProphetNetForCausalLM(decoder_config)
return encoder_model, decoder_model
def prepare_config_and_inputs(self):
model_tester_encoder = BertModelTester(self, batch_size=13)
model_tester_decoder = ProphetNetStandaloneDecoderModelTester(
self, batch_size=13, hidden_size=32, max_position_embeddings=512
)
encoder_config_and_inputs = model_tester_encoder.prepare_config_and_inputs()
decoder_config_and_inputs = model_tester_decoder.prepare_config_and_inputs_for_decoder()
(
config,
input_ids,
token_type_ids,
input_mask,
sequence_labels,
token_labels,
choice_labels,
) = encoder_config_and_inputs
(
decoder_config,
decoder_input_ids,
decoder_attention_mask,
encoder_hidden_states,
encoder_attention_mask,
lm_labels,
) = decoder_config_and_inputs
# make sure that cross attention layers are added
decoder_config.add_cross_attention = True
# disable cache for now
decoder_config.use_cache = False
return {
"config": config,
"input_ids": input_ids,
"attention_mask": input_mask,
"decoder_config": decoder_config,
"decoder_input_ids": decoder_input_ids,
"decoder_attention_mask": decoder_attention_mask,
"encoder_hidden_states": encoder_hidden_states,
"labels": lm_labels,
}
def get_pretrained_model(self):
return EncoderDecoderModel.from_encoder_decoder_pretrained(
"bert-large-uncased", "patrickvonplaten/prophetnet-decoder-clm-large-uncased"
)
def test_encoder_decoder_model_shared_weights(self):
pass
# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team, The Microsoft Research team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import tempfile
import unittest
from transformers import is_torch_available
from transformers.testing_utils import require_torch, slow, torch_device
from .test_configuration_common import ConfigTester
from .test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
if is_torch_available():
import torch
from transformers import (
ProphetNetConfig,
ProphetNetDecoder,
ProphetNetEncoder,
ProphetNetForCausalLM,
ProphetNetForConditionalGeneration,
ProphetNetModel,
ProphetNetTokenizer,
)
class ProphetNetModelTester:
def __init__(
self,
parent,
vocab_size=99,
batch_size=13,
hidden_size=16,
encoder_seq_length=7,
decoder_seq_length=9,
# For common tests
is_training=True,
use_attention_mask=True,
use_labels=True,
decoder_start_token_id=0,
encoder_ffn_dim=32,
num_encoder_layers=4,
num_encoder_attention_heads=4,
decoder_ffn_dim=32,
num_decoder_layers=4,
num_decoder_attention_heads=4,
max_position_embeddings=30,
is_encoder_decoder=True,
pad_token_id=0,
bos_token_id=1,
eos_token_id=2,
ngram=2,
num_buckets=32,
relative_max_distance=128,
disable_ngram_loss=False,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.encoder_seq_length = encoder_seq_length
self.decoder_seq_length = decoder_seq_length
# For common tests
self.seq_length = self.decoder_seq_length
self.is_training = is_training
self.use_attention_mask = use_attention_mask
self.use_labels = use_labels
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_decoder_layers
self.num_encoder_layers = num_encoder_layers
self.num_decoder_layers = num_decoder_layers
self.decoder_ffn_dim = decoder_ffn_dim
self.encoder_ffn_dim = encoder_ffn_dim
self.num_attention_heads = num_decoder_attention_heads
self.num_encoder_attention_heads = num_encoder_attention_heads
self.num_decoder_attention_heads = num_decoder_attention_heads
self.eos_token_id = eos_token_id
self.bos_token_id = bos_token_id
self.pad_token_id = pad_token_id
self.decoder_start_token_id = decoder_start_token_id
self.ngram = ngram
self.num_buckets = num_buckets
self.relative_max_distance = relative_max_distance
self.disable_ngram_loss = disable_ngram_loss
self.max_position_embeddings = max_position_embeddings
self.is_encoder_decoder = is_encoder_decoder
self.scope = None
self.decoder_key_length = decoder_seq_length
self.base_model_out_len = 7
self.num_hidden_states_types = 3 # encoder, decoder_main, decoder_ngram
self.decoder_attention_idx = 2
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size)
decoder_input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
attention_mask = None
decoder_attention_mask = None
if self.use_attention_mask:
attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2)
decoder_attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2)
lm_labels = None
if self.use_labels:
lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
config = ProphetNetConfig(
vocab_size=self.vocab_size,
hidden_size=self.hidden_size,
num_encoder_layers=self.num_encoder_layers,
num_decoder_layers=self.num_decoder_layers,
decoder_ffn_dim=self.decoder_ffn_dim,
encoder_ffn_dim=self.encoder_ffn_dim,
num_encoder_attention_heads=self.num_encoder_attention_heads,
num_decoder_attention_heads=self.num_decoder_attention_heads,
eos_token_id=self.eos_token_id,
bos_token_id=self.bos_token_id,
pad_token_id=self.pad_token_id,
decoder_start_token_id=self.decoder_start_token_id,
ngram=self.ngram,
num_buckets=self.num_buckets,
relative_max_distance=self.relative_max_distance,
disable_ngram_loss=self.disable_ngram_loss,
max_position_embeddings=self.max_position_embeddings,
is_encoder_decoder=self.is_encoder_decoder,
return_dict=True,
)
return (
config,
input_ids,
decoder_input_ids,
attention_mask,
decoder_attention_mask,
lm_labels,
)
def prepare_config_and_inputs_for_decoder(self):
(
config,
input_ids,
decoder_input_ids,
attention_mask,
decoder_attention_mask,
lm_labels,
) = self.prepare_config_and_inputs()
encoder_hidden_states = floats_tensor([self.batch_size, self.encoder_seq_length, self.hidden_size])
encoder_attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2)
return (
config,
decoder_input_ids,
decoder_attention_mask,
encoder_hidden_states,
encoder_attention_mask,
lm_labels,
)
def check_prepare_lm_labels_via_shift_left(
self,
config,
input_ids,
decoder_input_ids,
attention_mask,
decoder_attention_mask,
lm_labels,
):
model = ProphetNetModel(config=config)
model.to(torch_device)
model.eval()
# make sure that lm_labels are correctly padded from the right
lm_labels.masked_fill_((lm_labels == self.decoder_start_token_id), self.eos_token_id)
# add causal pad token mask
triangular_mask = torch.tril(lm_labels.new_ones(lm_labels.shape)).logical_not()
lm_labels.masked_fill_(triangular_mask, self.pad_token_id)
decoder_input_ids = model._shift_right(lm_labels)
for i, (decoder_input_ids_slice, lm_labels_slice) in enumerate(zip(decoder_input_ids, lm_labels)):
# first item
self.parent.assertEqual(decoder_input_ids_slice[0].item(), self.decoder_start_token_id)
if i < decoder_input_ids_slice.shape[-1]:
if i < decoder_input_ids.shape[-1] - 1:
# items before diagonal
self.parent.assertListEqual(
decoder_input_ids_slice[1 : i + 1].tolist(), lm_labels_slice[:i].tolist()
)
# pad items after diagonal
if i < decoder_input_ids.shape[-1] - 2:
self.parent.assertListEqual(
decoder_input_ids_slice[i + 2 :].tolist(), lm_labels_slice[i + 1 : -1].tolist()
)
else:
# all items after square
self.parent.assertListEqual(decoder_input_ids_slice[1:].tolist(), lm_labels_slice[:-1].tolist())
def create_and_check_model(
self,
config,
input_ids,
decoder_input_ids,
attention_mask,
decoder_attention_mask,
lm_labels,
):
model = ProphetNetModel(config=config)
model.to(torch_device)
model.eval()
result = model(
input_ids=input_ids,
decoder_input_ids=decoder_input_ids,
attention_mask=attention_mask,
decoder_attention_mask=decoder_attention_mask,
)
result = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
decoder_output = result.last_hidden_state
decoder_past = result.past_key_values
encoder_output = result.encoder_last_hidden_state
self.parent.assertEqual(encoder_output.size(), (self.batch_size, self.encoder_seq_length, self.hidden_size))
self.parent.assertEqual(decoder_output.size(), (self.batch_size, self.decoder_seq_length, self.hidden_size))
# There should be `num_layers` key value embeddings stored in decoder_past
self.parent.assertEqual(len(decoder_past), config.num_decoder_layers)
# There should be a self attn key, a self attn value, a cross attn key and a cross attn value stored in each decoder_past tuple
self.parent.assertEqual(len(decoder_past[0]), 2) # cross-attention + uni-directional self-attention
def create_and_check_with_lm_head(
self,
config,
input_ids,
decoder_input_ids,
attention_mask,
decoder_attention_mask,
lm_labels,
):
model = ProphetNetForConditionalGeneration(config=config).to(torch_device).eval()
outputs = model(
input_ids=input_ids,
decoder_input_ids=decoder_input_ids,
decoder_attention_mask=decoder_attention_mask,
labels=lm_labels,
)
self.parent.assertEqual(len(outputs), 5)
self.parent.assertEqual(outputs["logits"].size(), (self.batch_size, self.decoder_seq_length, self.vocab_size))
self.parent.assertEqual(outputs["loss"].size(), ())
def create_and_check_causal_lm_decoder(
self,
config,
input_ids,
decoder_input_ids,
attention_mask,
decoder_attention_mask,
lm_labels,
):
model = ProphetNetForCausalLM(config=config).to(torch_device).eval()
outputs = model(
input_ids=decoder_input_ids,
attention_mask=decoder_attention_mask,
labels=lm_labels,
)
self.parent.assertEqual(len(outputs), 4)
self.parent.assertEqual(outputs["logits"].size(), (self.batch_size, self.decoder_seq_length, self.vocab_size))
self.parent.assertEqual(outputs["loss"].size(), ())
def create_and_check_generate_with_past_key_value_states(
self,
config,
input_ids,
decoder_input_ids,
attention_mask,
decoder_attention_mask,
lm_labels,
):
model = ProphetNetForConditionalGeneration(config=config).to(torch_device).eval()
torch.manual_seed(0)
output_without_past_cache = model.generate(
input_ids[:1], num_beams=2, max_length=5, do_sample=True, use_cache=False
)
torch.manual_seed(0)
output_with_past_cache = model.generate(input_ids[:1], num_beams=2, max_length=5, do_sample=True)
self.parent.assertTrue(torch.all(output_with_past_cache == output_without_past_cache))
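# torch.manual_seed(0) is reset before each generate() call, so the sampled
# beams are directly comparable: caching past key values must only reuse
# computation, never change which tokens get sampled.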
def create_and_check_model_fp16_forward(
self,
config,
input_ids,
decoder_input_ids,
attention_mask,
decoder_attention_mask,
lm_labels,
):
model = ProphetNetModel(config=config).to(torch_device).half().eval()
output = model(input_ids, decoder_input_ids=input_ids, attention_mask=attention_mask)["last_hidden_state"]
self.parent.assertFalse(torch.isnan(output).any().item())
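# Half precision is only expected to work on CUDA devices (the test driving
# this check is skipped on CPU); a NaN in the fp16 output would point to an
# overflow somewhere in the forward pass.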
def create_and_check_encoder_decoder_shared_weights(
self,
config,
input_ids,
decoder_input_ids,
attention_mask,
decoder_attention_mask,
lm_labels,
):
for model_class in [ProphetNetModel, ProphetNetForConditionalGeneration]:
torch.manual_seed(0)
model = model_class(config=config).to(torch_device).eval()
# load state dict copies weights but does not tie them
if model_class == ProphetNetForConditionalGeneration:
model.prophetnet.encoder.load_state_dict(model.prophetnet.decoder.state_dict(), strict=False)
else:
model.encoder.load_state_dict(model.decoder.state_dict(), strict=False)
torch.manual_seed(0)
tied_config = copy.deepcopy(config)
tied_config.tie_encoder_decoder = True
tied_model = model_class(config=tied_config).to(torch_device).eval()
model_result = model(
input_ids=input_ids,
decoder_input_ids=decoder_input_ids,
attention_mask=attention_mask,
decoder_attention_mask=decoder_attention_mask,
return_dict=True,
)
tied_model_result = tied_model(
input_ids=input_ids,
decoder_input_ids=decoder_input_ids,
attention_mask=attention_mask,
decoder_attention_mask=decoder_attention_mask,
return_dict=True,
)
# check that the tied model has fewer parameters
self.parent.assertLess(
sum(p.numel() for p in tied_model.parameters()), sum(p.numel() for p in model.parameters())
)
random_slice_idx = ids_tensor((1,), model_result[0].shape[-1]).item()
# check that outputs are equal
self.parent.assertTrue(
torch.allclose(
model_result[0][0, :, random_slice_idx], tied_model_result[0][0, :, random_slice_idx], atol=1e-4
)
)
# check that outputs after saving and loading are equal
with tempfile.TemporaryDirectory() as tmpdirname:
tied_model.save_pretrained(tmpdirname)
tied_model = model_class.from_pretrained(tmpdirname)
tied_model.to(torch_device)
tied_model.eval()
# check that the tied model has fewer parameters
self.parent.assertLess(
sum(p.numel() for p in tied_model.parameters()), sum(p.numel() for p in model.parameters())
)
random_slice_idx = ids_tensor((1,), model_result[0].shape[-1]).item()
tied_model_result = tied_model(
input_ids=input_ids,
decoder_input_ids=decoder_input_ids,
attention_mask=attention_mask,
decoder_attention_mask=decoder_attention_mask,
)
# check that outputs are equal
self.parent.assertTrue(
torch.allclose(
model_result[0][0, :, random_slice_idx],
tied_model_result[0][0, :, random_slice_idx],
atol=1e-4,
)
)
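# What tie_encoder_decoder does, in short: encoder and decoder modules share
# their parameters, so shared weights are counted once (hence the strict
# parameter-count inequality above) while the outputs stay numerically close
# to the untied model whose decoder weights were copied into the encoder.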
def check_fast_integration(
self,
config,
*args,
):
input_ids = torch.tensor([[7, 4, 78, 0, 24, 52, 43]], device=torch_device, dtype=torch.long)
decoder_input_ids = torch.tensor([[12, 62, 25, 11, 47, 15, 14]], device=torch_device, dtype=torch.long)
attention_mask = torch.tensor([[1, 1, 1, 0, 1, 0, 0]], device=torch_device, dtype=torch.long)
decoder_attention_mask = torch.tensor([[1, 1, 1, 0, 0, 1, 0]], device=torch_device, dtype=torch.long)
lm_labels = torch.tensor([[62, 25, 11, 47, 15, 14, 24]], device=torch_device, dtype=torch.long)
torch.manual_seed(0)
config.ngram = 4
model = ProphetNetForConditionalGeneration(config=config)
model.to(torch_device)
model.eval()
with torch.no_grad():
result = model(
input_ids=input_ids,
decoder_input_ids=decoder_input_ids,
attention_mask=attention_mask,
decoder_attention_mask=decoder_attention_mask,
labels=lm_labels,
return_dict=True,
)
self.parent.assertTrue(torch.allclose(result.loss, torch.tensor(128.2925, device=torch_device), atol=1e-3))
expected_logit_slice = torch.tensor(
[-0.1565, 0.0418, 0.1207, 0.0030, 0.0665, 0.0467, 0.0412], device=torch_device
)
self.parent.assertTrue(torch.allclose(result.logits[0, :, 1], expected_logit_slice, atol=1e-3))
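# With config.ngram = 4 the model is trained to predict the next 4 tokens at
# each position, so the loss checked above aggregates all n-gram prediction
# streams; disable_ngram_loss would keep only the next-token term.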
def check_model_with_attn_mask(self, config, input_ids, decoder_input_ids, *args):
model = ProphetNetModel(config=config)
model.to(torch_device)
model.eval()
outputs_no_mask = model(
input_ids=input_ids[:, :5], decoder_input_ids=decoder_input_ids[:, :5], return_dict=True
)
attention_mask = torch.ones_like(input_ids)
decoder_attention_mask = torch.ones_like(decoder_input_ids)
attention_mask[:, 5:] = 0
outputs_with_mask = model(
input_ids=input_ids,
attention_mask=attention_mask,
decoder_input_ids=decoder_input_ids,
decoder_attention_mask=decoder_attention_mask,
return_dict=True,
)
# check encoder
self.parent.assertTrue(
torch.allclose(
outputs_no_mask.encoder_last_hidden_state[0, :, 0],
outputs_with_mask.encoder_last_hidden_state[0, :5, 0],
atol=1e-3,
)
)
# check decoder
# main stream
self.parent.assertTrue(
torch.allclose(
outputs_no_mask.last_hidden_state[0, :, 0], outputs_with_mask.last_hidden_state[0, :5, 0], atol=1e-3
)
)
# predict stream
self.parent.assertTrue(
torch.allclose(
outputs_no_mask.last_hidden_state_ngram[0, :5, 0],
outputs_with_mask.last_hidden_state_ngram[0, :5, 0],
atol=1e-3,
)
)
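# ProphetNet decoders return two streams: last_hidden_state for the usual
# next-token ("main") stream and last_hidden_state_ngram for the future-token
# ("predict") stream. The checks above assert that masking the padded tail
# reproduces the unpadded outputs on the first five positions of both streams.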
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(
config,
input_ids,
decoder_input_ids,
attention_mask,
decoder_attention_mask,
lm_labels,
) = config_and_inputs
inputs_dict = {
"input_ids": input_ids,
"attention_mask": attention_mask,
"decoder_input_ids": decoder_input_ids,
"decoder_attention_mask": decoder_attention_mask,
"use_cache": False,
}
return config, inputs_dict
class ProphetNetStandaloneDecoderModelTester:
def __init__(
self,
parent,
vocab_size=99,
batch_size=13,
hidden_size=16,
encoder_seq_length=7,
decoder_seq_length=7,
# For common tests
is_training=True,
is_decoder=True,
use_attention_mask=True,
add_cross_attention=False,
use_cache=False,
use_labels=True,
decoder_start_token_id=0,
encoder_ffn_dim=32,
num_encoder_layers=4,
num_encoder_attention_heads=4,
decoder_ffn_dim=32,
num_decoder_layers=4,
num_decoder_attention_heads=4,
max_position_embeddings=30,
is_encoder_decoder=False,
pad_token_id=0,
bos_token_id=1,
eos_token_id=2,
ngram=2,
return_dict=True,
num_buckets=32,
relative_max_distance=128,
disable_ngram_loss=False,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.encoder_seq_length = encoder_seq_length
self.decoder_seq_length = decoder_seq_length
# For common tests
self.seq_length = self.decoder_seq_length
self.is_training = is_training
self.use_attention_mask = use_attention_mask
self.use_labels = use_labels
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_decoder_layers
self.num_encoder_layers = num_encoder_layers
self.num_decoder_layers = num_decoder_layers
self.decoder_ffn_dim = decoder_ffn_dim
self.encoder_ffn_dim = encoder_ffn_dim
self.num_attention_heads = num_decoder_attention_heads
self.num_encoder_attention_heads = num_encoder_attention_heads
self.num_decoder_attention_heads = num_decoder_attention_heads
self.eos_token_id = eos_token_id
self.bos_token_id = bos_token_id
self.pad_token_id = pad_token_id
self.decoder_start_token_id = decoder_start_token_id
self.ngram = ngram
self.num_buckets = num_buckets
self.relative_max_distance = relative_max_distance
self.use_cache = use_cache
self.disable_ngram_loss = disable_ngram_loss
self.max_position_embeddings = max_position_embeddings
self.add_cross_attention = add_cross_attention
self.is_encoder_decoder = is_encoder_decoder
self.return_dict = return_dict
self.scope = None
self.decoder_key_length = decoder_seq_length
self.base_model_out_len = 2
self.num_hidden_states_types = 2 # decoder_main, decoder_ngram
self.decoder_attention_idx = 1
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size)
attention_mask = None
if self.use_attention_mask:
attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2)
lm_labels = None
if self.use_labels:
lm_labels = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size)
config = ProphetNetConfig(
vocab_size=self.vocab_size,
hidden_size=self.hidden_size,
num_encoder_layers=self.num_encoder_layers,
num_decoder_layers=self.num_decoder_layers,
decoder_ffn_dim=self.decoder_ffn_dim,
encoder_ffn_dim=self.encoder_ffn_dim,
num_encoder_attention_heads=self.num_encoder_attention_heads,
num_decoder_attention_heads=self.num_decoder_attention_heads,
eos_token_id=self.eos_token_id,
bos_token_id=self.bos_token_id,
use_cache=self.use_cache,
pad_token_id=self.pad_token_id,
decoder_start_token_id=self.decoder_start_token_id,
ngram=self.ngram,
num_buckets=self.num_buckets,
relative_max_distance=self.relative_max_distance,
disable_ngram_loss=self.disable_ngram_loss,
max_position_embeddings=self.max_position_embeddings,
add_cross_attention=self.add_cross_attention,
is_encoder_decoder=self.is_encoder_decoder,
return_dict=self.return_dict,
)
return (
config,
input_ids,
attention_mask,
lm_labels,
)
def prepare_config_and_inputs_for_decoder(self):
(
config,
input_ids,
attention_mask,
lm_labels,
) = self.prepare_config_and_inputs()
encoder_hidden_states = floats_tensor([self.batch_size, self.encoder_seq_length, self.hidden_size])
encoder_attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2)
return (
config,
input_ids,
attention_mask,
encoder_hidden_states,
encoder_attention_mask,
lm_labels,
)
def create_and_check_decoder_model_past(
self,
config,
input_ids,
attention_mask,
lm_labels,
):
config.use_cache = True
model = ProphetNetDecoder(config=config).to(torch_device).eval()
# first forward pass
outputs = model(input_ids, use_cache=True)
outputs_use_cache_conf = model(input_ids)
outputs_no_past = model(input_ids, use_cache=False)
self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf))
self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1)
past_key_values = outputs["past_key_values"]
# create hypothetical next token and extend next_input_ids with it
next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
# append to next_input_ids
next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
output_from_no_past = model(next_input_ids)["last_hidden_state"]
output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"]
# select random slice
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx].detach()
output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
# test that outputs are equal for slice
assert torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)
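# The caching contract verified above, as a minimal sketch:
#
#   past = model(input_ids, use_cache=True)["past_key_values"]
#   step = model(next_tokens, past_key_values=past)["last_hidden_state"]
#
# step[:, 0] must match a full forward pass over the concatenated sequence at
# its final position, up to small numerical noise (atol=1e-3 here).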
def create_and_check_decoder_model_attention_mask_past(
self,
config,
input_ids,
attention_mask,
lm_labels,
):
model = ProphetNetDecoder(config=config).to(torch_device).eval()
# create attention mask
attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)
half_seq_length = input_ids.shape[-1] // 2
attn_mask[:, half_seq_length:] = 0
# first forward pass
past_key_values = model(input_ids, attention_mask=attn_mask, use_cache=True)["past_key_values"]
# create hypothetical next token and extend next_input_ids with it
next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
# change a random masked slice from input_ids
random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1
random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1)
input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens
# append to next input_ids and attn_mask
next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
attn_mask = torch.cat(
[attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)],
dim=1,
)
# get two different outputs
output_from_no_past = model(next_input_ids)["last_hidden_state"]
output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"]
# select random slice
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx].detach()
output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
# test that outputs are equal for slice
assert torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-2)
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(
config,
input_ids,
attention_mask,
lm_labels,
) = config_and_inputs
inputs_dict = {
"input_ids": input_ids,
"attention_mask": attention_mask,
}
return config, inputs_dict
class ProphetNetStandaloneEncoderModelTester:
def __init__(
self,
parent,
vocab_size=99,
batch_size=13,
hidden_size=16,
encoder_seq_length=7,
decoder_seq_length=7,
# For common tests
is_training=True,
is_decoder=False,
use_attention_mask=True,
add_cross_attention=False,
use_cache=False,
use_labels=True,
decoder_start_token_id=0,
encoder_ffn_dim=32,
num_encoder_layers=4,
num_encoder_attention_heads=4,
decoder_ffn_dim=32,
num_decoder_layers=4,
num_decoder_attention_heads=4,
max_position_embeddings=30,
is_encoder_decoder=False,
pad_token_id=0,
bos_token_id=1,
eos_token_id=2,
return_dict=True,
num_buckets=32,
relative_max_distance=128,
disable_ngram_loss=False,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.encoder_seq_length = encoder_seq_length
self.decoder_seq_length = decoder_seq_length
# For common tests
self.seq_length = self.decoder_seq_length
self.is_training = is_training
self.use_attention_mask = use_attention_mask
self.use_labels = use_labels
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_decoder_layers
self.num_encoder_layers = num_encoder_layers
self.num_decoder_layers = num_decoder_layers
self.decoder_ffn_dim = decoder_ffn_dim
self.encoder_ffn_dim = encoder_ffn_dim
self.num_attention_heads = num_decoder_attention_heads
self.num_encoder_attention_heads = num_encoder_attention_heads
self.num_decoder_attention_heads = num_decoder_attention_heads
self.eos_token_id = eos_token_id
self.bos_token_id = bos_token_id
self.pad_token_id = pad_token_id
self.decoder_start_token_id = decoder_start_token_id
self.num_buckets = num_buckets
self.relative_max_distance = relative_max_distance
self.use_cache = use_cache
self.disable_ngram_loss = disable_ngram_loss
self.max_position_embeddings = max_position_embeddings
self.add_cross_attention = add_cross_attention
self.is_encoder_decoder = is_encoder_decoder
self.return_dict = return_dict
self.scope = None
self.decoder_key_length = decoder_seq_length
self.base_model_out_len = 1
self.num_hidden_states_types = 1
self.decoder_attention_idx = 1
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size)
attention_mask = None
if self.use_attention_mask:
attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2)
config = ProphetNetConfig(
vocab_size=self.vocab_size,
hidden_size=self.hidden_size,
num_encoder_layers=self.num_encoder_layers,
num_decoder_layers=self.num_decoder_layers,
decoder_ffn_dim=self.decoder_ffn_dim,
encoder_ffn_dim=self.encoder_ffn_dim,
num_encoder_attention_heads=self.num_encoder_attention_heads,
num_decoder_attention_heads=self.num_decoder_attention_heads,
eos_token_id=self.eos_token_id,
bos_token_id=self.bos_token_id,
use_cache=self.use_cache,
pad_token_id=self.pad_token_id,
decoder_start_token_id=self.decoder_start_token_id,
num_buckets=self.num_buckets,
relative_max_distance=self.relative_max_distance,
disable_ngram_loss=self.disable_ngram_loss,
max_position_embeddings=self.max_position_embeddings,
add_cross_attention=self.add_cross_attention,
is_encoder_decoder=self.is_encoder_decoder,
return_dict=self.return_dict,
)
return (
config,
input_ids,
attention_mask,
)
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(
config,
input_ids,
attention_mask,
) = config_and_inputs
inputs_dict = {
"input_ids": input_ids,
"attention_mask": attention_mask,
}
return config, inputs_dict
@require_torch
class ProphetNetModelTest(ModelTesterMixin, unittest.TestCase):
all_model_classes = (ProphetNetModel, ProphetNetForConditionalGeneration) if is_torch_available() else ()
all_generative_model_classes = (ProphetNetForConditionalGeneration,) if is_torch_available() else ()
test_pruning = False
test_torchscript = False
test_resize_embeddings = False
test_headmasking = False
is_encoder_decoder = True
def setUp(self):
self.model_tester = ProphetNetModelTester(self)
self.config_tester = ConfigTester(self, config_class=ProphetNetConfig)
def test_config(self):
self.config_tester.run_common_tests()
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
def test_lm_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_with_lm_head(*config_and_inputs)
def test_only_decoder_causal_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_causal_lm_decoder(*config_and_inputs)
def test_fast_integration(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.check_fast_integration(*config_and_inputs)
def test_shared_weights(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_encoder_decoder_shared_weights(*config_and_inputs)
def test_shift_labels_via_shift_left(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.check_prepare_lm_labels_via_shift_left(*config_and_inputs)
def test_decoder_model_generate(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_generate_with_past_key_value_states(*config_and_inputs)
def test_attn_mask_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.check_model_with_attn_mask(*config_and_inputs)
@unittest.skipIf(torch_device == "cpu", "Cant do half precision")
def test_fp16_forward(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model_fp16_forward(*config_and_inputs)
@require_torch
class ProphetNetStandaloneDecoderModelTest(ModelTesterMixin, unittest.TestCase):
all_model_classes = (ProphetNetDecoder, ProphetNetForCausalLM) if is_torch_available() else ()
all_generative_model_classes = (ProphetNetForCausalLM,) if is_torch_available() else ()
test_pruning = False
test_torchscript = False
test_resize_embeddings = False
test_headmasking = False
is_encoder_decoder = False
def setUp(self):
self.model_tester = ProphetNetStandaloneDecoderModelTester(self)
self.config_tester = ConfigTester(self, config_class=ProphetNetConfig)
def test_config(self):
self.config_tester.run_common_tests()
def test_decoder_model_past(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_decoder_model_past(*config_and_inputs)
def test_decoder_model_attn_mask_past(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs)
@require_torch
class ProphetNetStandaloneEncoderModelTest(ModelTesterMixin, unittest.TestCase):
all_model_classes = (ProphetNetEncoder,) if is_torch_available() else ()
test_pruning = False
test_torchscript = False
test_resize_embeddings = False
test_headmasking = False
is_encoder_decoder = False
def setUp(self):
self.model_tester = ProphetNetStandaloneEncoderModelTester(self)
self.config_tester = ConfigTester(self, config_class=ProphetNetConfig)
def test_config(self):
self.config_tester.run_common_tests()
@require_torch
class ProphetNetModelIntegrationTest(unittest.TestCase):
@slow
def test_pretrained_checkpoint_hidden_states(self):
model = ProphetNetForConditionalGeneration.from_pretrained("microsoft/prophetnet-large-uncased")
model.to(torch_device)
# encoder-decoder outputs
encoder_ids = torch.tensor(
[[2871, 102, 2048, 3176, 2780, 1997, 2871, 26727, 2169, 2097, 12673, 1996, 8457, 2006, 2049, 8240, 2859, 2799, 1012, 2023, 6512, 2038, 2174, 13977, 2195, 25962, 1012, 102]]
).to(torch_device)
decoder_prev_ids = torch.tensor([[102, 2129, 2116, 2372, 2024, 2006, 2169, 1997, 2122, 2048, 2780, 1029]]).to(
torch_device
)
output = model(
input_ids=encoder_ids,
attention_mask=None,
encoder_outputs=None,
decoder_input_ids=decoder_prev_ids,
return_dict=True,
)
output_predicted_logits = output[0]
expected_shape = torch.Size((1, 12, 30522))
self.assertEqual(output_predicted_logits.shape, expected_shape)
expected_slice = torch.tensor(
[[[-7.6213, -7.9008, -7.9979], [-7.6834, -7.8467, -8.2187], [-7.5326, -7.4762, -8.1914]]]
).to(torch_device)
self.assertTrue(torch.allclose(output_predicted_logits[:, :3, :3], expected_slice, atol=1e-4))
# encoder outputs
encoder_outputs = model.prophetnet.encoder(encoder_ids)[0]
expected_encoder_outputs_slice = torch.tensor(
[[[-0.2526, -0.1951, -0.2185], [-0.8923, 0.2992, -0.4623], [-0.4585, 0.0165, -0.6652]]]
).to(torch_device)
expected_shape_encoder = torch.Size((1, 28, 1024))
self.assertEqual(encoder_outputs.shape, expected_shape_encoder)
self.assertTrue(torch.allclose(encoder_outputs[:, :3, :3], expected_encoder_outputs_slice, atol=1e-4))
# decoder outputs
decoder_outputs = model.prophetnet.decoder(
decoder_prev_ids, encoder_hidden_states=encoder_outputs, return_dict=True
)
predicting_streams = decoder_outputs[1].view(1, model.config.ngram, 12, -1)
predicting_streams_logits = model.lm_head(predicting_streams)
next_first_stream_logits = predicting_streams_logits[:, 0]
self.assertTrue(torch.allclose(next_first_stream_logits[:, :3, :3], expected_slice, atol=1e-4))
@slow
def test_cnndm_inference(self):
model = ProphetNetForConditionalGeneration.from_pretrained("microsoft/prophetnet-large-uncased-cnndm")
model.config.max_length = 512
model.to(torch_device)
tokenizer = ProphetNetTokenizer.from_pretrained("microsoft/prophetnet-large-uncased-cnndm")
ARTICLE_TO_SUMMARIZE = "USTC was founded in Beijing by the Chinese Academy of Sciences (CAS) in September 1958. The Director of CAS, Mr. Guo Moruo was appointed the first president of USTC. USTC's founding mission was to develop a high-level science and technology workforce, as deemed critical for development of China's economy, defense, and science and technology education. The establishment was hailed as \"A Major Event in the History of Chinese Education and Science.\" CAS has supported USTC by combining most of its institutes with the departments of the university. USTC is listed in the top 16 national key universities, becoming the youngest national key university.".lower()
input_ids = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=511, return_tensors="pt").input_ids
input_ids = input_ids.to(torch_device)
summary_ids = model.generate(
input_ids, num_beams=4, length_penalty=1.0, no_repeat_ngram_size=3, early_stopping=True
)
EXPECTED_SUMMARIZE_512 = "us ##tc was founded by the chinese academy of sciences ( cas ) in 1958 . [X_SEP] us ##tc is listed in the top 16 national key universities ."
generated_titles = [
" ".join(tokenizer.convert_ids_to_tokens(g, skip_special_tokens=True)) for g in summary_ids
]
self.assertListEqual(
[EXPECTED_SUMMARIZE_512],
generated_titles,
)
input_ids = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=99, return_tensors="pt").input_ids
input_ids = input_ids.to(torch_device)
# only 98 article tokens are actually used: max_length=99 includes the trailing [SEP] token.
summary_ids = model.generate(
input_ids, num_beams=4, length_penalty=1.0, no_repeat_ngram_size=3, early_stopping=True
)
EXPECTED_SUMMARIZE_100 = (
r"us ##tc was founded in beijing by the chinese academy of sciences ( cas ) in 1958 . [X_SEP] us ##tc "
"'"
' s founding mission was to develop a high - level science and technology workforce . [X_SEP] establishment hailed as " a major event in the history of chinese education and science "'
)
generated_titles = [
" ".join(tokenizer.convert_ids_to_tokens(g, skip_special_tokens=True)) for g in summary_ids
]
self.assertListEqual(
[EXPECTED_SUMMARIZE_100],
generated_titles,
)
......@@ -43,7 +43,6 @@ class T5ModelTester:
encoder_seq_length=7,
decoder_seq_length=9,
# For common tests
seq_length=7,
is_training=True,
use_attention_mask=True,
use_labels=True,
......
# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team, The Microsoft Research team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from transformers import is_torch_available
from transformers.testing_utils import require_torch, slow, torch_device
if is_torch_available():
import torch
from transformers import XLMProphetNetForConditionalGeneration, XLMProphetNetTokenizer
@require_torch
class XLMProphetNetModelIntegrationTest(unittest.TestCase):
@slow
def test_pretrained_checkpoint_hidden_states(self):
model = XLMProphetNetForConditionalGeneration.from_pretrained("microsoft/xprophetnet-large-wiki100-cased")
model.to(torch_device)
# encoder-decoder outputs
encoder_ids = torch.tensor([[17, 96208, 103471, 2]]).to(torch_device)
decoder_prev_ids = torch.tensor(
[[2, 250, 9953, 34, 69489, 1620, 32, 118424, 624, 210, 105, 2913, 1032, 351]]
).to(torch_device)
output = model(
input_ids=encoder_ids, attention_mask=None, encoder_outputs=None, decoder_input_ids=decoder_prev_ids
)
output_predicted_logits = output[0]
expected_shape = torch.Size((1, 14, 250012))
self.assertEqual(output_predicted_logits.shape, expected_shape)
expected_slice = torch.tensor(
[[[-6.6042, -8.3838, 12.4717], [-6.4426, -8.1994, 12.4542], [-6.0851, -7.8209, 12.9493]]]
).to(torch_device)
self.assertTrue(torch.allclose(output_predicted_logits[:, :3, :3], expected_slice, atol=1e-4))
# encoder outputs
encoder_outputs = model.prophetnet.encoder(encoder_ids)[0]
expected_encoder_outputs_slice = torch.tensor(
[[[-1.4260, -0.7628, 0.8453], [-1.4719, -0.1391, 0.7807], [-1.7678, 0.0114, 0.4646]]]
).to(torch_device)
expected_shape_encoder = torch.Size((1, 4, 1024))
self.assertEqual(encoder_outputs.shape, expected_shape_encoder)
self.assertTrue(torch.allclose(encoder_outputs[:, :3, :3], expected_encoder_outputs_slice, atol=1e-4))
# decoder outputs
decoder_outputs = model.prophetnet.decoder(
decoder_prev_ids,
encoder_hidden_states=encoder_outputs,
)
predicting_streams = decoder_outputs[1].view(1, model.config.ngram, 14, -1)
predicting_streams_logits = model.lm_head(predicting_streams)
next_first_stream_logits = predicting_streams_logits[:, 0]
self.assertTrue(torch.allclose(next_first_stream_logits[:, :3, :3], expected_slice, atol=1e-4))
@slow
def test_ntg_hidden_states(self):
model = XLMProphetNetForConditionalGeneration.from_pretrained(
"microsoft/xprophetnet-large-wiki100-cased-xglue-ntg"
)
model.to(torch_device)
encoder_ids = torch.tensor([[17, 96208, 103471, 2]]).to(torch_device)
decoder_prev_ids = torch.tensor(
[[2, 250, 9953, 34, 69489, 1620, 32, 118424, 624, 210, 105, 2913, 1032, 351]]
).to(torch_device)
output = model(
input_ids=encoder_ids, attention_mask=None, encoder_outputs=None, decoder_input_ids=decoder_prev_ids
)
output_predicted_logits = output[0]
expected_shape = torch.Size((1, 14, 250012))
self.assertEqual(output_predicted_logits.shape, expected_shape)
# compare the actual values for a slice.
expected_slice = torch.tensor(
[[[-8.8815, -9.2996, -4.4506], [-6.7202, -7.8944, -0.9402], [-8.6890, -7.4528, -1.9437]]]
).to(torch_device)
self.assertTrue(torch.allclose(output_predicted_logits[:, :3, :3], expected_slice, atol=1e-4))
@slow
def test_xprophetnet_ntg_inference(self):
model = XLMProphetNetForConditionalGeneration.from_pretrained(
"microsoft/xprophetnet-large-wiki100-cased-xglue-ntg"
)
model.to(torch_device)
model.config.max_length = 512
tokenizer = XLMProphetNetTokenizer.from_pretrained("microsoft/xprophetnet-large-wiki100-cased-xglue-ntg")
EN_SENTENCE = "Microsoft Corporation intends to officially end free support for the Windows 7 operating system after January 14, 2020, according to the official portal of the organization. From that day, users of this system will not be able to receive security updates, which could make their computers vulnerable to cyber attacks."
RU_SENTENCE = "орпорация Microsoft намерена официально прекратить бесплатную поддержку операционной системы Windows 7 после 14 января 2020 года, сообщается на официальном портале организации . С указанного дня пользователи этой системы не смогут получать обновления безопасности, из-за чего их компьютеры могут стать уязвимыми к кибератакам."
ZH_SENTENCE = (
"根据该组织的官方门户网站,微软公司打算在2020年1月14日之后正式终止对Windows 7操作系统的免费支持。从那时起,该系统的用户将无法接收安全更新,这可能会使他们的计算机容易受到网络攻击。"
)
input_ids = tokenizer(
[EN_SENTENCE, RU_SENTENCE, ZH_SENTENCE], padding=True, max_length=255, return_tensors="pt"
).input_ids
input_ids = input_ids.to(torch_device)
summary_ids = model.generate(
input_ids, num_beams=10, length_penalty=1.0, no_repeat_ngram_size=3, early_stopping=True
)
generated_titles = [tokenizer.decode(g, skip_special_tokens=True) for g in summary_ids]
EXPECTED_TITLE_EN = "Microsoft to end Windows 7 free support after January 14, 2020"
EXPECTED_TITLE_RU = "Microsoft намерена прекратить бесплатную поддержку Windows 7 после 14 января 2020 года"
EXPECTED_TITLE_ZH = "微软打算终止对Windows 7操作系统的免费支持"
self.assertListEqual(
[EXPECTED_TITLE_EN, EXPECTED_TITLE_RU, EXPECTED_TITLE_ZH],
generated_titles,
)
summary_ids_beam1 = model.generate(
input_ids, num_beams=1, length_penalty=1.0, no_repeat_ngram_size=3, early_stopping=True
)
generated_titles_beam1_tok = [
tokenizer.convert_ids_to_tokens(g, skip_special_tokens=True) for g in summary_ids_beam1
]
EXPECTED_TITLE_EN_BEAM1_TOK = "▁Microsoft ▁to ▁end ▁free ▁support ▁for ▁Windows ▁7".split(" ")
EXPECTED_TITLE_RU_BEAM1_TOK = "▁Microsoft ▁намерен а ▁прекрати ть ▁бес плат ную ▁поддержку ▁Windows ▁7 ▁после ▁14 ▁января ▁2020 ▁года".split(
" "
)
EXPECTED_TITLE_ZH_BEAM1_TOK = "微软 公司 打算 终止 对 Windows ▁7 操作 系统的 免费 支持".split(" ")
self.assertListEqual(
[EXPECTED_TITLE_EN_BEAM1_TOK, EXPECTED_TITLE_RU_BEAM1_TOK, EXPECTED_TITLE_ZH_BEAM1_TOK],
generated_titles_beam1_tok,
)
# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team, The Microsoft Research team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import unittest
from transformers.testing_utils import slow
from transformers.tokenization_bert import (
BasicTokenizer,
WordpieceTokenizer,
_is_control,
_is_punctuation,
_is_whitespace,
)
from transformers.tokenization_prophetnet import VOCAB_FILES_NAMES, ProphetNetTokenizer
from .test_tokenization_common import TokenizerTesterMixin
class ProphetNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = ProphetNetTokenizer
test_rust_tokenizer = False
def setUp(self):
super().setUp()
vocab_tokens = [
"[UNK]",
"[CLS]",
"[SEP]",
"[PAD]",
"[MASK]",
"want",
"##want",
"##ed",
"wa",
"un",
"runn",
"##ing",
",",
"low",
"lowest",
]
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
def get_input_output_texts(self, tokenizer):
input_text = "UNwant\u00E9d,running"
output_text = "unwanted, running"
return input_text, output_text
def test_full_tokenizer(self):
tokenizer = self.tokenizer_class(self.vocab_file)
tokens = tokenizer.tokenize("UNwant\u00E9d,running")
self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [9, 6, 7, 12, 10, 11])
def test_chinese(self):
tokenizer = BasicTokenizer()
self.assertListEqual(tokenizer.tokenize("ah\u535A\u63A8zz"), ["ah", "\u535A", "\u63A8", "zz"])
def test_basic_tokenizer_lower(self):
tokenizer = BasicTokenizer(do_lower_case=True)
self.assertListEqual(
tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), ["hello", "!", "how", "are", "you", "?"]
)
self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"])
def test_basic_tokenizer_lower_strip_accents_false(self):
tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=False)
self.assertListEqual(
tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hällo", "!", "how", "are", "you", "?"]
)
self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["h\u00E9llo"])
def test_basic_tokenizer_lower_strip_accents_true(self):
tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=True)
self.assertListEqual(
tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hallo", "!", "how", "are", "you", "?"]
)
self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"])
def test_basic_tokenizer_lower_strip_accents_default(self):
tokenizer = BasicTokenizer(do_lower_case=True)
self.assertListEqual(
tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hallo", "!", "how", "are", "you", "?"]
)
self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"])
def test_basic_tokenizer_no_lower(self):
tokenizer = BasicTokenizer(do_lower_case=False)
self.assertListEqual(
tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), ["HeLLo", "!", "how", "Are", "yoU", "?"]
)
def test_basic_tokenizer_no_lower_strip_accents_false(self):
tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=False)
self.assertListEqual(
tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["HäLLo", "!", "how", "Are", "yoU", "?"]
)
def test_basic_tokenizer_no_lower_strip_accents_true(self):
tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=True)
self.assertListEqual(
tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["HaLLo", "!", "how", "Are", "yoU", "?"]
)
def test_basic_tokenizer_respects_never_split_tokens(self):
tokenizer = BasicTokenizer(do_lower_case=False, never_split=["[UNK]"])
self.assertListEqual(
tokenizer.tokenize(" \tHeLLo!how \n Are yoU? [UNK]"), ["HeLLo", "!", "how", "Are", "yoU", "?", "[UNK]"]
)
def test_wordpiece_tokenizer(self):
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"]
vocab = {}
for (i, token) in enumerate(vocab_tokens):
vocab[token] = i
tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]")
self.assertListEqual(tokenizer.tokenize(""), [])
self.assertListEqual(tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"])
self.assertListEqual(tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
def test_is_whitespace(self):
self.assertTrue(_is_whitespace(" "))
self.assertTrue(_is_whitespace("\t"))
self.assertTrue(_is_whitespace("\r"))
self.assertTrue(_is_whitespace("\n"))
self.assertTrue(_is_whitespace("\u00A0"))
self.assertFalse(_is_whitespace("A"))
self.assertFalse(_is_whitespace("-"))
def test_is_control(self):
self.assertTrue(_is_control("\u0005"))
self.assertFalse(_is_control("A"))
self.assertFalse(_is_control(" "))
self.assertFalse(_is_control("\t"))
self.assertFalse(_is_control("\r"))
def test_is_punctuation(self):
self.assertTrue(_is_punctuation("-"))
self.assertTrue(_is_punctuation("$"))
self.assertTrue(_is_punctuation("`"))
self.assertTrue(_is_punctuation("."))
self.assertFalse(_is_punctuation("A"))
self.assertFalse(_is_punctuation(" "))
@slow
def test_sequence_builders(self):
tokenizer = self.tokenizer_class.from_pretrained("microsoft/prophetnet-large-uncased")
text = tokenizer.encode("sequence builders", add_special_tokens=False)
text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)
encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
assert encoded_sentence == text + [102]
assert encoded_pair == text + [102] + text_2 + [102]
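# Here 102 is the id of ProphetNet's [SEP] token: single sequences are encoded
# as "tokens [SEP]" and pairs as "tokens_a [SEP] tokens_b [SEP]", with no
# leading [CLS] -- which is precisely what the two assertions spell out.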
# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team, The Microsoft Research team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import unittest
from transformers.file_utils import cached_property
from transformers.testing_utils import slow
from transformers.tokenization_xlm_prophetnet import SPIECE_UNDERLINE, XLMProphetNetTokenizer
from .test_tokenization_common import TokenizerTesterMixin
SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model")
class XLMProphetNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = XLMProphetNetTokenizer
test_rust_tokenizer = False
def setUp(self):
super().setUp()
# We have a SentencePiece fixture for testing
tokenizer = XLMProphetNetTokenizer(SAMPLE_VOCAB, keep_accents=True)
tokenizer.save_pretrained(self.tmpdirname)
def test_full_tokenizer(self):
tokenizer = XLMProphetNetTokenizer(SAMPLE_VOCAB, keep_accents=True)
tokens = tokenizer.tokenize("This is a test")
self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"])
self.assertListEqual(
tokenizer.convert_tokens_to_ids(tokens),
[value + tokenizer.fairseq_offset for value in [285, 46, 10, 170, 382]],
)
tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
self.assertListEqual(
tokens,
[
SPIECE_UNDERLINE + "I",
SPIECE_UNDERLINE + "was",
SPIECE_UNDERLINE + "b",
"or",
"n",
SPIECE_UNDERLINE + "in",
SPIECE_UNDERLINE + "",
"9",
"2",
"0",
"0",
"0",
",",
SPIECE_UNDERLINE + "and",
SPIECE_UNDERLINE + "this",
SPIECE_UNDERLINE + "is",
SPIECE_UNDERLINE + "f",
"al",
"s",
"é",
".",
],
)
ids = tokenizer.convert_tokens_to_ids(tokens)
self.assertListEqual(
ids,
[
value + tokenizer.fairseq_offset
for value in [8, 21, 84, 55, 24, 19, 7, -9, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, -9, 4]
],
)
back_tokens = tokenizer.convert_ids_to_tokens(ids)
self.assertListEqual(
back_tokens,
[
SPIECE_UNDERLINE + "I",
SPIECE_UNDERLINE + "was",
SPIECE_UNDERLINE + "b",
"or",
"n",
SPIECE_UNDERLINE + "in",
SPIECE_UNDERLINE + "",
"[UNK]",
"2",
"0",
"0",
"0",
",",
SPIECE_UNDERLINE + "and",
SPIECE_UNDERLINE + "this",
SPIECE_UNDERLINE + "is",
SPIECE_UNDERLINE + "f",
"al",
"s",
"[UNK]",
".",
],
)
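# Two details exercised above: token ids are shifted by
# tokenizer.fairseq_offset to leave room for fairseq's special tokens at the
# start of the vocabulary, and the sentinel value -9 in the expected ids is
# chosen so that adding the offset lands on the unknown-token id, which is why
# "9" and "é" round-trip through convert_ids_to_tokens as "[UNK]".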
@cached_property
def big_tokenizer(self):
return XLMProphetNetTokenizer.from_pretrained("microsoft/xprophetnet-large-wiki100-cased")
@slow
def test_tokenization_base_easy_symbols(self):
symbols = "Hello World!"
original_tokenizer_encodings = [35389, 6672, 49, 2]
self.assertListEqual(original_tokenizer_encodings, self.big_tokenizer.encode(symbols))
......@@ -43,6 +43,7 @@ TEST_FILES_WITH_NO_COMMON_TESTS = [
"test_modeling_camembert.py",
"test_modeling_tf_camembert.py",
"test_modeling_tf_xlm_roberta.py",
"test_modeling_xlm_prophetnet.py",
"test_modeling_xlm_roberta.py",
"test_modeling_pegasus.py",
"test_modeling_mbart.py",
......@@ -61,6 +62,7 @@ IGNORE_NON_DOCUMENTED = [
MODEL_NAME_TO_DOC_FILE = {
"openai": "gpt.rst",
"transfo_xl": "transformerxl.rst",
"xlm_prophetnet": "xlmprophetnet.rst",
"xlm_roberta": "xlmroberta.rst",
"bert_generation": "bertgeneration.rst",
}
......@@ -243,6 +245,9 @@ def _get_model_name(module):
# Special case for transfo_xl
if splits[-1] == "xl":
return "_".join(splits[-2:])
# Special case for xlm_prophetnet
if splits[-1] == "prophetnet" and splits[-2] == "xlm":
return "_".join(splits[-2:])
# Special case for xlm_roberta
if splits[-1] == "roberta" and splits[-2] == "xlm":
return "_".join(splits[-2:])
......