"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "7c63e6fc8c34dcf8b0121eaee776f41ccf3b1137"
Unverified commit 854260ca, authored by Matt and committed by GitHub

TF/Numpy variants for all DataCollator classes (#13105)



* Adding a TF variant of the DataCollatorForTokenClassification to get feedback

* Added a Numpy variant and a post_init check to fail early if a missing import is found

* Fixed call to Numpy variant

* Added a couple more of the collators

* Update src/transformers/data/data_collator.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Fixes, style pass, finished DataCollatorForSeq2Seq

* Added all the LanguageModeling DataCollators, except SOP and PermutationLanguageModeling

* Adding DataCollatorForPermutationLanguageModeling

* Style pass

* Add missing `__call__` for PLM

* Remove `post_init` checks for frameworks because the imports inside them were making us fail code quality checks

* Remove unused imports

* First attempt at some TF tests

* A second attempt to make any of those tests actually work

* TF tests, round three

* TF tests, round four

* TF tests, round five

* TF tests, all enabled!

* Style pass

* Merging tests into `test_data_collator.py`

* Merging tests into `test_data_collator.py`

* Fixing up test imports

* Fixing up test imports

* Trying shuffling the conditionals around

* Commenting out non-functional old tests

* Completed all tests for all three frameworks

* Style pass

* Fixed test typo

* Style pass

* Move standard `__call__` method to mixin

* Rearranged imports for `test_data_collator`

* Fix data collator typo "torch" -> "pt"

* Fixed the most embarrassingly obvious bug

* Update src/transformers/data/data_collator.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Renaming mixin

* Updating docs
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Dalton Walker <dalton_walker@icloud.com>
Co-authored-by: Andrew Romans <andrew.romans@hotmail.com>
parent 74b3344f
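
A quick sketch of the API this commit introduces: one collator object, three output frameworks selected by `return_tensors`. The checkpoint name is only an example and the token ids are toy values:

from transformers import BertTokenizerFast, DataCollatorForTokenClassification

tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")
collator = DataCollatorForTokenClassification(tokenizer, return_tensors="np")

features = [
    {"input_ids": [101, 1996, 4937, 102], "labels": [0, 1, 2, 0]},
    {"input_ids": [101, 7592, 102], "labels": [0, 3, 0]},
]
batch = collator(features)  # NumPy arrays; "pt" and "tf" dispatch the same way
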
@@ -54,18 +54,18 @@ DataCollatorForLanguageModeling
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.data.data_collator.DataCollatorForLanguageModeling
-:members: mask_tokens
+:members: numpy_mask_tokens, tf_mask_tokens, torch_mask_tokens

DataCollatorForWholeWordMask
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.data.data_collator.DataCollatorForWholeWordMask
-:members: mask_tokens
+:members: numpy_mask_tokens, tf_mask_tokens, torch_mask_tokens

DataCollatorForPermutationLanguageModeling
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.data.data_collator.DataCollatorForPermutationLanguageModeling
-:members: mask_tokens
+:members: numpy_mask_tokens, tf_mask_tokens, torch_mask_tokens
@@ -81,6 +81,17 @@ _import_structure = {
"xnli_processors",
"xnli_tasks_num_labels",
],
"data.data_collator": [
"DataCollator",
"DataCollatorForLanguageModeling",
"DataCollatorForPermutationLanguageModeling",
"DataCollatorForSeq2Seq",
"DataCollatorForSOP",
"DataCollatorForTokenClassification",
"DataCollatorForWholeWordMask",
"DataCollatorWithPadding",
"default_data_collator",
],
"feature_extraction_sequence_utils": ["BatchFeature", "SequenceFeatureExtractor"], "feature_extraction_sequence_utils": ["BatchFeature", "SequenceFeatureExtractor"],
"file_utils": [ "file_utils": [
"CONFIG_NAME", "CONFIG_NAME",
@@ -460,17 +471,6 @@ else:
if is_torch_available():
_import_structure["benchmark.benchmark"] = ["PyTorchBenchmark"]
_import_structure["benchmark.benchmark_args"] = ["PyTorchBenchmarkArguments"]
-_import_structure["data.data_collator"] = [
-"DataCollator",
-"DataCollatorForLanguageModeling",
-"DataCollatorForPermutationLanguageModeling",
-"DataCollatorForSeq2Seq",
-"DataCollatorForSOP",
-"DataCollatorForTokenClassification",
-"DataCollatorForWholeWordMask",
-"DataCollatorWithPadding",
-"default_data_collator",
-]
_import_structure["data.datasets"] = [
"GlueDataset",
"GlueDataTrainingArguments",
@@ -1830,6 +1830,17 @@ if TYPE_CHECKING:
xnli_processors,
xnli_tasks_num_labels,
)
from .data.data_collator import (
DataCollator,
DataCollatorForLanguageModeling,
DataCollatorForPermutationLanguageModeling,
DataCollatorForSeq2Seq,
DataCollatorForSOP,
DataCollatorForTokenClassification,
DataCollatorForWholeWordMask,
DataCollatorWithPadding,
default_data_collator,
)
# Feature Extractor
from .feature_extraction_utils import BatchFeature, SequenceFeatureExtractor
@@ -2174,17 +2185,6 @@ if TYPE_CHECKING:
# Benchmarks
from .benchmark.benchmark import PyTorchBenchmark
from .benchmark.benchmark_args import PyTorchBenchmarkArguments
-from .data.data_collator import (
-DataCollator,
-DataCollatorForLanguageModeling,
-DataCollatorForPermutationLanguageModeling,
-DataCollatorForSeq2Seq,
-DataCollatorForSOP,
-DataCollatorForTokenClassification,
-DataCollatorForWholeWordMask,
-DataCollatorWithPadding,
-default_data_collator,
-)
from .data.datasets import (
GlueDataset,
GlueDataTrainingArguments,
@@ -17,11 +17,7 @@ import warnings
from dataclasses import dataclass
from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union

-import torch
-from torch.nn.utils.rnn import pad_sequence

from ..file_utils import PaddingStrategy
-from ..modeling_utils import PreTrainedModel
from ..models.bert import BertTokenizer, BertTokenizerFast
from ..tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase

@@ -30,12 +26,26 @@ InputDataClass = NewType("InputDataClass", Any)
"""
A DataCollator is a function that takes a list of samples from a Dataset and collate them into a batch, as a dictionary
-of Tensors.
+of PyTorch/TensorFlow tensors or NumPy arrays.
"""

-DataCollator = NewType("DataCollator", Callable[[List[InputDataClass]], Dict[str, torch.Tensor]])
+DataCollator = NewType("DataCollator", Callable[[List[InputDataClass]], Dict[str, Any]])
class DataCollatorMixin:
def __call__(self, features, return_tensors=None):
if return_tensors is None:
return_tensors = self.return_tensors
if return_tensors == "tf":
return self.tf_call(features)
elif return_tensors == "pt":
return self.torch_call(features)
elif return_tensors == "np":
return self.numpy_call(features)
else:
raise ValueError(f"Framework '{return_tensors}' not recognized!")
-def default_data_collator(features: List[InputDataClass]) -> Dict[str, torch.Tensor]:
+def default_data_collator(features: List[InputDataClass], return_tensors="pt") -> Dict[str, Any]:
"""
Very simple data collator that simply collates batches of dict-like objects and performs special handling for
potential keys named:

@@ -51,9 +61,25 @@ def default_data_collator(features: List[InputDataClass]) -> Dict[str, torch.Ten
# have the same attributes.
# So we will look at the first element as a proxy for what attributes exist
# on the whole batch.
if return_tensors == "pt":
return torch_default_data_collator(features)
elif return_tensors == "tf":
return tf_default_data_collator(features)
elif return_tensors == "np":
return numpy_default_data_collator(features)
@dataclass
class DefaultDataCollator(DataCollatorMixin):
return_tensors: str = "pt"
def torch_default_data_collator(features: List[InputDataClass]) -> Dict[str, Any]:
import torch
if not isinstance(features[0], (dict, BatchEncoding)):
features = [vars(f) for f in features]
first = features[0]
batch = {}

@@ -83,6 +109,81 @@ def default_data_collator(features: List[InputDataClass]) -> Dict[str, torch.Ten
return batch
def tf_default_data_collator(features: List[InputDataClass]) -> Dict[str, Any]:
import numpy as np
import tensorflow as tf
if not isinstance(features[0], (dict, BatchEncoding)):
features = [vars(f) for f in features]
first = features[0]
batch = {}
# Special handling for labels.
# Ensure that tensor is created with the correct type
# (it should be automatically the case, but let's make sure of it.)
if "label" in first and first["label"] is not None:
if isinstance(first["label"], tf.Tensor):
dtype = tf.int64 if first["label"].dtype.is_integer() else tf.float32
elif isinstance(first["label"], np.ndarray):
dtype = tf.int64 if np.issubdtype(first["label"].dtype, np.integer) else tf.float32
elif isinstance(first["label"], (tuple, list)):
dtype = tf.int64 if isinstance(first["label"][0], int) else tf.float32
else:
dtype = tf.int64 if isinstance(first["label"], int) else tf.float32
batch["labels"] = tf.convert_to_tensor([f["label"] for f in features], dtype=dtype)
elif "label_ids" in first and first["label_ids"] is not None:
if isinstance(first["label_ids"], tf.Tensor):
batch["labels"] = tf.stack([f["label_ids"] for f in features])
else:
dtype = tf.int64 if type(first["label_ids"][0]) is int else tf.float32
batch["labels"] = tf.convert_to_tensor([f["label_ids"] for f in features], dtype=dtype)
# Handling of all other possible keys.
# Again, we will use the first element to figure out which key/values are not None for this model.
for k, v in first.items():
if k not in ("label", "label_ids") and v is not None and not isinstance(v, str):
if isinstance(v, (tf.Tensor, np.ndarray)):
batch[k] = tf.stack([f[k] for f in features])
else:
batch[k] = tf.convert_to_tensor([f[k] for f in features])
return batch
def numpy_default_data_collator(features: List[InputDataClass]) -> Dict[str, Any]:
import numpy as np
if not isinstance(features[0], (dict, BatchEncoding)):
features = [vars(f) for f in features]
first = features[0]
batch = {}
# Special handling for labels.
# Ensure that tensor is created with the correct type
# (it should be automatically the case, but let's make sure of it.)
if "label" in first and first["label"] is not None:
label = first["label"].item() if isinstance(first["label"], np.ndarray) else first["label"]
dtype = np.int64 if isinstance(label, int) else np.float32
batch["labels"] = np.array([f["label"] for f in features], dtype=dtype)
elif "label_ids" in first and first["label_ids"] is not None:
if isinstance(first["label_ids"], np.ndarray):
batch["labels"] = np.stack([f["label_ids"] for f in features])
else:
dtype = np.int64 if type(first["label_ids"][0]) is int else np.float32
batch["labels"] = np.array([f["label_ids"] for f in features], dtype=dtype)
# Handling of all other possible keys.
# Again, we will use the first element to figure out which key/values are not None for this model.
for k, v in first.items():
if k not in ("label", "label_ids") and v is not None and not isinstance(v, str):
if isinstance(v, np.ndarray):
batch[k] = np.stack([f[k] for f in features])
else:
batch[k] = np.array([f[k] for f in features])
return batch
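
The dtype inference above can be checked with a toy batch (values are arbitrary):

import numpy as np

feats = [{"label": 0.5, "input_ids": [1, 2]}, {"label": 1.0, "input_ids": [3, 4]}]
batch = numpy_default_data_collator(feats)
assert batch["labels"].dtype == np.float32  # float label -> float32
assert batch["input_ids"].shape == (2, 2)
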
@dataclass
class DataCollatorWithPadding:
"""

@@ -114,14 +215,15 @@ class DataCollatorWithPadding:
padding: Union[bool, str, PaddingStrategy] = True
max_length: Optional[int] = None
pad_to_multiple_of: Optional[int] = None
return_tensors: str = "pt"
-def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
batch = self.tokenizer.pad(
features,
padding=self.padding,
max_length=self.max_length,
pad_to_multiple_of=self.pad_to_multiple_of,
-return_tensors="pt",
+return_tensors=self.return_tensors,
)
if "label" in batch:
batch["labels"] = batch["label"]

@@ -133,7 +235,7 @@ class DataCollatorWithPadding:
@dataclass
-class DataCollatorForTokenClassification:
+class DataCollatorForTokenClassification(DataCollatorMixin):
"""
Data collator that will dynamically pad the inputs received, as well as the labels.

@@ -166,8 +268,11 @@ class DataCollatorForTokenClassification:
max_length: Optional[int] = None
pad_to_multiple_of: Optional[int] = None
label_pad_token_id: int = -100
return_tensors: str = "pt"
def torch_call(self, features):
import torch
-def __call__(self, features):
label_name = "label" if "label" in features[0].keys() else "labels" label_name = "label" if "label" in features[0].keys() else "labels"
labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None
batch = self.tokenizer.pad( batch = self.tokenizer.pad(
...@@ -196,15 +301,74 @@ class DataCollatorForTokenClassification: ...@@ -196,15 +301,74 @@ class DataCollatorForTokenClassification:
batch = {k: torch.tensor(v, dtype=torch.int64) for k, v in batch.items()} batch = {k: torch.tensor(v, dtype=torch.int64) for k, v in batch.items()}
return batch return batch
def tf_call(self, features):
import tensorflow as tf
label_name = "label" if "label" in features[0].keys() else "labels"
labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None
batch = self.tokenizer.pad(
features,
padding=self.padding,
max_length=self.max_length,
pad_to_multiple_of=self.pad_to_multiple_of,
# Conversion to tensors will fail if we have labels as they are not of the same length yet.
return_tensors="tf" if labels is None else None,
)
if labels is None:
return batch
sequence_length = tf.convert_to_tensor(batch["input_ids"]).shape[1]
padding_side = self.tokenizer.padding_side
if padding_side == "right":
batch["labels"] = [label + [self.label_pad_token_id] * (sequence_length - len(label)) for label in labels]
else:
batch["labels"] = [[self.label_pad_token_id] * (sequence_length - len(label)) + label for label in labels]
batch = {k: tf.convert_to_tensor(v, dtype=tf.int64) for k, v in batch.items()}
return batch
def numpy_call(self, features):
import numpy as np
label_name = "label" if "label" in features[0].keys() else "labels"
labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None
batch = self.tokenizer.pad(
features,
padding=self.padding,
max_length=self.max_length,
pad_to_multiple_of=self.pad_to_multiple_of,
# Conversion to tensors will fail if we have labels as they are not of the same length yet.
return_tensors="np" if labels is None else None,
)
if labels is None:
return batch
sequence_length = np.array(batch["input_ids"]).shape[1]
padding_side = self.tokenizer.padding_side
if padding_side == "right":
batch["labels"] = [label + [self.label_pad_token_id] * (sequence_length - len(label)) for label in labels]
else:
batch["labels"] = [[self.label_pad_token_id] * (sequence_length - len(label)) + label for label in labels]
batch = {k: np.array(v, dtype=np.int64) for k, v in batch.items()}
return batch
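
The label-padding step is identical in all three branches; as a standalone sketch (toy values, right padding):

labels = [[0, 1, 2], [3]]
sequence_length = 4
label_pad_token_id = -100
padded = [label + [label_pad_token_id] * (sequence_length - len(label)) for label in labels]
# padded == [[0, 1, 2, -100], [3, -100, -100, -100]]
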
-def _collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int] = None):
+def _torch_collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int] = None):
"""Collate `examples` into a batch, using the information in `tokenizer` for padding if necessary."""
import numpy as np
import torch
# Tensorize if necessary.
-if isinstance(examples[0], (list, tuple)):
+if isinstance(examples[0], (list, tuple, np.ndarray)):
examples = [torch.tensor(e, dtype=torch.long) for e in examples]

-# Check if padding is necessary.
length_of_first = examples[0].size(0)

+# Check if padding is necessary.
are_tensors_same_length = all(x.size(0) == length_of_first for x in examples)
if are_tensors_same_length and (pad_to_multiple_of is None or length_of_first % pad_to_multiple_of == 0):
return torch.stack(examples, dim=0)
@@ -229,8 +393,85 @@ def _collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int] = None
return result

-def tolist(x: Union[List[Any], torch.Tensor]):
-return x.tolist() if isinstance(x, torch.Tensor) else x

def _tf_collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int] = None):
import numpy as np
import tensorflow as tf
"""Collate `examples` into a batch, using the information in `tokenizer` for padding if necessary."""
# Tensorize if necessary.
if isinstance(examples[0], (list, tuple)):
examples = [tf.convert_to_tensor(e, dtype=tf.int64) for e in examples]
# Check if padding is necessary.
length_of_first = len(examples[0])
are_tensors_same_length = all(len(x) == length_of_first for x in examples)
if are_tensors_same_length and (pad_to_multiple_of is None or length_of_first % pad_to_multiple_of == 0):
return tf.stack(examples, axis=0)
# If yes, check if we have a `pad_token`.
if tokenizer._pad_token is None:
raise ValueError(
"You are attempting to pad samples but the tokenizer you are using"
f" ({tokenizer.__class__.__name__}) does not have a pad token."
)
# Creating the full tensor and filling it with our data.
max_length = max(len(x) for x in examples)
if pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
# result = examples[0].new_full([len(examples), max_length], tokenizer.pad_token_id)
result = []
rank = tf.rank(examples[0])
paddings = np.zeros((rank, 2), dtype=np.int32)
for example in examples:
if tokenizer.padding_side == "right":
paddings[0, 1] = max_length - len(example)
else:
paddings[0, 0] = max_length - len(example)
result.append(tf.pad(example, paddings, constant_values=tokenizer.pad_token_id))
return tf.stack(result, axis=0)
def _numpy_collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int] = None):
import numpy as np
"""Collate `examples` into a batch, using the information in `tokenizer` for padding if necessary."""
# Tensorize if necessary.
if isinstance(examples[0], (list, tuple)):
examples = [np.array(e, dtype=np.int64) for e in examples]
# Check if padding is necessary.
length_of_first = len(examples[0])
are_tensors_same_length = all(len(x) == length_of_first for x in examples)
if are_tensors_same_length and (pad_to_multiple_of is None or length_of_first % pad_to_multiple_of == 0):
return np.stack(examples, axis=0)
# If yes, check if we have a `pad_token`.
if tokenizer._pad_token is None:
raise ValueError(
"You are attempting to pad samples but the tokenizer you are using"
f" ({tokenizer.__class__.__name__}) does not have a pad token."
)
# Creating the full tensor and filling it with our data.
max_length = max(len(x) for x in examples)
if pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
result = np.full(shape=(len(examples), max_length), fill_value=tokenizer.pad_token_id, dtype=examples[0].dtype)
for i, example in enumerate(examples):
if tokenizer.padding_side == "right":
result[i, : example.shape[0]] = example
else:
result[i, -example.shape[0] :] = example
return result
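
The pad_to_multiple_of rounding shared by the three collate helpers is plain ceiling arithmetic; a toy check:

max_length, pad_to_multiple_of = 13, 8
if max_length % pad_to_multiple_of != 0:
    max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
assert max_length == 16  # rounded up to the next multiple of 8
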
def tolist(x):
if isinstance(x, list):
return x
elif hasattr(x, "numpy"): # Checks for TF tensors without needing the import
x = x.numpy()
return x.tolist()
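
This tolist relies on duck typing, so no framework import is needed; a quick sketch:

import numpy as np

tolist([1, 2, 3])            # lists pass through unchanged
tolist(np.array([1, 2, 3]))  # -> [1, 2, 3]; tf/torch tensors first go through .numpy()
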
@dataclass

@@ -268,13 +509,16 @@ class DataCollatorForSeq2Seq:
"""

tokenizer: PreTrainedTokenizerBase
-model: Optional[PreTrainedModel] = None
+model: Optional[Any] = None
padding: Union[bool, str, PaddingStrategy] = True
max_length: Optional[int] = None
pad_to_multiple_of: Optional[int] = None
label_pad_token_id: int = -100
return_tensors: str = "pt"
-def __call__(self, features):
+def __call__(self, features, return_tensors=None):
if return_tensors is None:
return_tensors = self.return_tensors
labels = [feature["labels"] for feature in features] if "labels" in features[0].keys() else None labels = [feature["labels"] for feature in features] if "labels" in features[0].keys() else None
# We have to pad the labels before calling `tokenizer.pad` as this method won't pad them and needs them of the # We have to pad the labels before calling `tokenizer.pad` as this method won't pad them and needs them of the
# same length to return tensors. # same length to return tensors.
...@@ -292,7 +536,7 @@ class DataCollatorForSeq2Seq: ...@@ -292,7 +536,7 @@ class DataCollatorForSeq2Seq:
padding=self.padding, padding=self.padding,
max_length=self.max_length, max_length=self.max_length,
pad_to_multiple_of=self.pad_to_multiple_of, pad_to_multiple_of=self.pad_to_multiple_of,
return_tensors="pt", return_tensors=return_tensors,
) )
# prepare decoder_input_ids # prepare decoder_input_ids
@@ -304,7 +548,7 @@ class DataCollatorForSeq2Seq:
@dataclass
-class DataCollatorForLanguageModeling:
+class DataCollatorForLanguageModeling(DataCollatorMixin):
"""
Data collator used for language modeling. Inputs are dynamically padded to the maximum length of a batch if they
are not all of the same length.
@@ -333,6 +577,8 @@ class DataCollatorForLanguageModeling:
mlm: bool = True
mlm_probability: float = 0.15
pad_to_multiple_of: Optional[int] = None
tf_experimental_compile: bool = False
return_tensors: str = "pt"
def __post_init__(self):
if self.mlm and self.tokenizer.mask_token is None:

@@ -340,20 +586,98 @@ class DataCollatorForLanguageModeling:
"This tokenizer does not have a mask token which is necessary for masked language modeling. "
"You should pass `mlm=False` to train on causal language modeling instead."
)
if self.tf_experimental_compile:
import tensorflow as tf
self.tf_mask_tokens = tf.function(self.tf_mask_tokens, jit_compile=True)
@staticmethod
def tf_bernoulli(shape, probability):
import tensorflow as tf
prob_matrix = tf.fill(shape, probability)
return tf.cast(prob_matrix - tf.random.uniform(shape, 0, 1) >= 0, tf.bool)
def tf_mask_tokens(
self, inputs: Any, vocab_size, mask_token_id, special_tokens_mask: Optional[Any] = None
) -> Tuple[Any, Any]:
"""
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
"""
import tensorflow as tf
input_shape = tf.shape(inputs)
# 1 for a special token, 0 for a normal token in the special tokens mask
# We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)
masked_indices = self.tf_bernoulli(input_shape, self.mlm_probability) & ~special_tokens_mask
# Replace unmasked indices with -100 in the labels since we only compute loss on masked tokens
labels = tf.where(masked_indices, inputs, -100)
# 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
indices_replaced = self.tf_bernoulli(input_shape, 0.8) & masked_indices
inputs = tf.where(indices_replaced, mask_token_id, inputs)
# 10% of the time, we replace masked input tokens with random word
indices_random = self.tf_bernoulli(input_shape, 0.1) & masked_indices & ~indices_replaced
random_words = tf.random.uniform(input_shape, maxval=vocab_size, dtype=tf.int64)
inputs = tf.where(indices_random, random_words, inputs)
# The rest of the time (10% of the time) we keep the masked input tokens unchanged
return inputs, labels
def tf_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
import tensorflow as tf
# Handle dict or lists with proper padding and conversion to tensor.
if isinstance(examples[0], (dict, BatchEncoding)):
batch = self.tokenizer.pad(examples, return_tensors="tf", pad_to_multiple_of=self.pad_to_multiple_of)
else:
batch = {
"input_ids": _tf_collate_batch(examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
}
# If special token mask has been preprocessed, pop it from the dict.
special_tokens_mask = batch.pop("special_tokens_mask", None)
if self.mlm:
if special_tokens_mask is None:
special_tokens_mask = [
self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True)
for val in batch["input_ids"].numpy().tolist()
]
# Cannot directly create as bool
special_tokens_mask = tf.cast(tf.convert_to_tensor(special_tokens_mask, dtype=tf.int64), tf.bool)
else:
special_tokens_mask = tf.cast(special_tokens_mask, tf.bool)
batch["input_ids"], batch["labels"] = self.tf_mask_tokens(
tf.cast(batch["input_ids"], tf.int64),
special_tokens_mask=special_tokens_mask,
mask_token_id=self.tokenizer.mask_token_id,
vocab_size=len(self.tokenizer),
)
else:
labels = batch["input_ids"]
if self.tokenizer.pad_token_id is not None:
# Replace self.tokenizer.pad_token_id with -100
labels = tf.where(labels == self.tokenizer.pad_token_id, -100, labels)
else:
labels = tf.identity(labels) # Makes a copy, just in case
batch["labels"] = labels
return batch
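
End to end, the TF path can be exercised like this (a sketch; the checkpoint name is only an example):

from transformers import BertTokenizerFast, DataCollatorForLanguageModeling

tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")
collator = DataCollatorForLanguageModeling(tokenizer, mlm_probability=0.15, return_tensors="tf")

enc = tokenizer(["Hello world!", "A slightly longer example sentence."])
features = [{"input_ids": ids} for ids in enc["input_ids"]]
batch = collator(features)  # tf.Tensors: masked "input_ids" plus "labels" (-100 on unmasked positions)
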
-def __call__(
-self, examples: List[Union[List[int], torch.Tensor, Dict[str, torch.Tensor]]]
-) -> Dict[str, torch.Tensor]:
+def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
# Handle dict or lists with proper padding and conversion to tensor.
if isinstance(examples[0], (dict, BatchEncoding)):
batch = self.tokenizer.pad(examples, return_tensors="pt", pad_to_multiple_of=self.pad_to_multiple_of)
else:
-batch = {"input_ids": _collate_batch(examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)}
+batch = {
"input_ids": _torch_collate_batch(examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
}
# If special token mask has been preprocessed, pop it from the dict.
special_tokens_mask = batch.pop("special_tokens_mask", None)
if self.mlm:
-batch["input_ids"], batch["labels"] = self.mask_tokens(
+batch["input_ids"], batch["labels"] = self.torch_mask_tokens(
batch["input_ids"], special_tokens_mask=special_tokens_mask
)
else:

@@ -363,12 +687,12 @@ class DataCollatorForLanguageModeling:
batch["labels"] = labels
return batch

-def mask_tokens(
-self, inputs: torch.Tensor, special_tokens_mask: Optional[torch.Tensor] = None
-) -> Tuple[torch.Tensor, torch.Tensor]:
+def torch_mask_tokens(self, inputs: Any, special_tokens_mask: Optional[Any] = None) -> Tuple[Any, Any]:
"""
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
"""
import torch
labels = inputs.clone()
# We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)
probability_matrix = torch.full(labels.shape, self.mlm_probability)

@@ -396,6 +720,69 @@ class DataCollatorForLanguageModeling:
# The rest of the time (10% of the time) we keep the masked input tokens unchanged
return inputs, labels
def numpy_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
import numpy as np
# Handle dict or lists with proper padding and conversion to tensor.
if isinstance(examples[0], (dict, BatchEncoding)):
batch = self.tokenizer.pad(examples, return_tensors="np", pad_to_multiple_of=self.pad_to_multiple_of)
else:
batch = {
"input_ids": _numpy_collate_batch(examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
}
# If special token mask has been preprocessed, pop it from the dict.
special_tokens_mask = batch.pop("special_tokens_mask", None)
if self.mlm:
batch["input_ids"], batch["labels"] = self.numpy_mask_tokens(
batch["input_ids"], special_tokens_mask=special_tokens_mask
)
else:
labels = np.copy(batch["input_ids"])
if self.tokenizer.pad_token_id is not None:
labels[labels == self.tokenizer.pad_token_id] = -100
batch["labels"] = labels
return batch
def numpy_mask_tokens(self, inputs: Any, special_tokens_mask: Optional[Any] = None) -> Tuple[Any, Any]:
"""
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
"""
import numpy as np
labels = np.copy(inputs)
# We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)
probability_matrix = np.full(labels.shape, self.mlm_probability)
if special_tokens_mask is None:
special_tokens_mask = [
self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
]
special_tokens_mask = np.array(special_tokens_mask, dtype=np.bool)
else:
special_tokens_mask = special_tokens_mask.astype(np.bool)
probability_matrix[special_tokens_mask] = 0
# Numpy doesn't have bernoulli, so we use a binomial with 1 trial
masked_indices = np.random.binomial(1, probability_matrix, size=probability_matrix.shape).astype(np.bool)
labels[~masked_indices] = -100 # We only compute loss on masked tokens
# 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
indices_replaced = np.random.binomial(1, 0.8, size=labels.shape).astype(np.bool) & masked_indices
inputs[indices_replaced] = self.tokenizer.mask_token_id
# 10% of the time, we replace masked input tokens with random word
# indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
indices_random = (
np.random.binomial(1, 0.5, size=labels.shape).astype(np.bool) & masked_indices & ~indices_replaced
)
random_words = np.random.randint(
low=0, high=len(self.tokenizer), size=np.count_nonzero(indices_random), dtype=np.int64
)
inputs[indices_random] = random_words
# The rest of the time (10% of the time) we keep the masked input tokens unchanged
return inputs, labels
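
The binomial trick above reproduces the torch 80/10/10 split: for a masked position, P(mask token) = 0.8, P(random word) = (1 - 0.8) * 0.5 = 0.1, leaving 0.1 unchanged. A quick numeric check:

import numpy as np

replaced = np.random.binomial(1, 0.8, size=1_000_000).astype(bool)
random_w = np.random.binomial(1, 0.5, size=1_000_000).astype(bool) & ~replaced
print(replaced.mean(), random_w.mean())  # ~0.80 and ~0.10
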
@dataclass
class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):

@@ -413,16 +800,70 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
:class:`.DataCollatorForLanguageModeling`.
"""

-def __call__(
-self, examples: List[Union[List[int], torch.Tensor, Dict[str, torch.Tensor]]]
-) -> Dict[str, torch.Tensor]:
+def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
if isinstance(examples[0], (dict, BatchEncoding)):
input_ids = [e["input_ids"] for e in examples]
else:
input_ids = examples
examples = [{"input_ids": e} for e in examples]
batch_input = _torch_collate_batch(input_ids, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
mask_labels = []
for e in examples:
ref_tokens = []
for id in tolist(e["input_ids"]):
token = self.tokenizer._convert_id_to_token(id)
ref_tokens.append(token)
# For Chinese tokens, we need extra info to mark sub-words, e.g. [喜,欢] -> [喜,##欢]
if "chinese_ref" in e:
ref_pos = tolist(e["chinese_ref"])
len_seq = len(e["input_ids"])
for i in range(len_seq):
if i in ref_pos:
ref_tokens[i] = "##" + ref_tokens[i]
mask_labels.append(self._whole_word_mask(ref_tokens))
batch_mask = _torch_collate_batch(mask_labels, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
inputs, labels = self.torch_mask_tokens(batch_input, batch_mask)
return {"input_ids": inputs, "labels": labels}
def tf_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
if isinstance(examples[0], (dict, BatchEncoding)):
input_ids = [e["input_ids"] for e in examples]
else:
input_ids = examples
examples = [{"input_ids": e} for e in examples]
batch_input = _tf_collate_batch(input_ids, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
mask_labels = []
for e in examples:
ref_tokens = []
for id in tolist(e["input_ids"]):
token = self.tokenizer._convert_id_to_token(id)
ref_tokens.append(token)
# For Chinese tokens, we need extra info to mark sub-words, e.g. [喜,欢] -> [喜,##欢]
if "chinese_ref" in e:
ref_pos = tolist(e["chinese_ref"])
len_seq = len(e["input_ids"])
for i in range(len_seq):
if i in ref_pos:
ref_tokens[i] = "##" + ref_tokens[i]
mask_labels.append(self._whole_word_mask(ref_tokens))
batch_mask = _tf_collate_batch(mask_labels, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
inputs, labels = self.tf_mask_tokens(batch_input, batch_mask)
return {"input_ids": inputs, "labels": labels}
def np_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
if isinstance(examples[0], (dict, BatchEncoding)):
input_ids = [e["input_ids"] for e in examples]
else:
input_ids = examples
examples = [{"input_ids": e} for e in examples]

-batch_input = _collate_batch(input_ids, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
+batch_input = _numpy_collate_batch(input_ids, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)

mask_labels = []
for e in examples:
@@ -439,8 +880,8 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
if i in ref_pos:
ref_tokens[i] = "##" + ref_tokens[i]
mask_labels.append(self._whole_word_mask(ref_tokens))
-batch_mask = _collate_batch(mask_labels, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
-inputs, labels = self.mask_tokens(batch_input, batch_mask)
+batch_mask = _numpy_collate_batch(mask_labels, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
+inputs, labels = self.numpy_mask_tokens(batch_input, batch_mask)
return {"input_ids": inputs, "labels": labels}

def _whole_word_mask(self, input_tokens: List[str], max_predictions=512):
@@ -489,11 +930,12 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
mask_labels = [1 if i in covered_indexes else 0 for i in range(len(input_tokens))]
return mask_labels

-def mask_tokens(self, inputs: torch.Tensor, mask_labels: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+def torch_mask_tokens(self, inputs: Any, mask_labels: Any) -> Tuple[Any, Any]:
"""
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Set
'mask_labels' means we use whole word mask (wwm), we directly mask idxs according to it's ref.
"""
import torch
if self.tokenizer.mask_token is None:
raise ValueError(

@@ -527,6 +969,90 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
# The rest of the time (10% of the time) we keep the masked input tokens unchanged
return inputs, labels
def tf_mask_tokens(self, inputs: Any, mask_labels: Any) -> Tuple[Any, Any]:
"""
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Set
'mask_labels' means we use whole word mask (wwm), we directly mask idxs according to it's ref.
"""
import tensorflow as tf
input_shape = tf.shape(inputs)
if self.tokenizer.mask_token is None:
raise ValueError(
"This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer."
)
labels = tf.identity(inputs)  # tf tensors have no .clone(); tf.identity makes a copy
# We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
masked_indices = tf.cast(mask_labels, tf.bool)
special_tokens_mask = [
self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.numpy().tolist()
]
masked_indices = masked_indices & ~tf.convert_to_tensor(special_tokens_mask, dtype=tf.bool)
if self.tokenizer._pad_token is not None:
padding_mask = inputs == self.tokenizer.pad_token_id
masked_indices = masked_indices & ~padding_mask
# Replace unmasked indices with -100 in the labels since we only compute loss on masked tokens
labels = tf.where(masked_indices, inputs, -100)
# 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
indices_replaced = self.tf_bernoulli(input_shape, 0.8) & masked_indices
inputs = tf.where(indices_replaced, self.tokenizer.mask_token_id, inputs)
# 10% of the time, we replace masked input tokens with random word
indices_random = self.tf_bernoulli(input_shape, 0.1) & masked_indices & ~indices_replaced
random_words = tf.random.uniform(input_shape, maxval=len(self.tokenizer), dtype=tf.int64)
inputs = tf.where(indices_random, random_words, inputs)
# The rest of the time (10% of the time) we keep the masked input tokens unchanged
return inputs, labels
def numpy_mask_tokens(self, inputs: Any, mask_labels: Any) -> Tuple[Any, Any]:
"""
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Set
'mask_labels' means we use whole word mask (wwm), we directly mask idxs according to it's ref.
"""
import numpy as np
if self.tokenizer.mask_token is None:
raise ValueError(
"This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer."
)
labels = np.copy(inputs)
# We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
masked_indices = mask_labels.astype(np.bool)
special_tokens_mask = [
self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
]
masked_indices[np.array(special_tokens_mask, dtype=np.bool)] = 0
if self.tokenizer._pad_token is not None:
padding_mask = labels == self.tokenizer.pad_token_id
masked_indices[padding_mask] = 0
labels[~masked_indices] = -100 # We only compute loss on masked tokens
# 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
indices_replaced = np.random.binomial(1, 0.8, size=labels.shape).astype(np.bool) & masked_indices
inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)
# 10% of the time, we replace masked input tokens with random word
# indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
indices_random = (
np.random.binomial(1, 0.5, size=labels.shape).astype(np.bool) & masked_indices & ~indices_replaced
)
random_words = np.random.randint(
low=0, high=len(self.tokenizer), size=np.count_nonzero(indices_random), dtype=np.int64
)
inputs[indices_random] = random_words[indices_random]
# The rest of the time (10% of the time) we keep the masked input tokens unchanged
return inputs, labels
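
For contrast with per-token masking, a toy view of what _whole_word_mask produces: sub-word pieces marked with "##" are masked together with their word (WordPiece-style tokens, values illustrative):

ref_tokens = ["[CLS]", "un", "##believ", "##able", "plot", "[SEP]"]
# "un ##believ ##able" forms a single candidate, so a draw that selects it yields
mask_labels = [0, 1, 1, 1, 0, 0]  # one whole word masked, special tokens untouched
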
@dataclass
class DataCollatorForSOP(DataCollatorForLanguageModeling):

@@ -544,9 +1070,12 @@ class DataCollatorForSOP(DataCollatorForLanguageModeling):
FutureWarning,
)

-def __call__(self, examples: List[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]:
+def __call__(self, examples: List[Dict[str, Any]]) -> Dict[str, Any]:
import torch
from torch.nn.utils.rnn import pad_sequence
input_ids = [example["input_ids"] for example in examples] input_ids = [example["input_ids"] for example in examples]
input_ids = _collate_batch(input_ids, self.tokenizer) input_ids = _torch_collate_batch(input_ids, self.tokenizer)
input_ids, labels, attention_mask = self.mask_tokens(input_ids) input_ids, labels, attention_mask = self.mask_tokens(input_ids)
token_type_ids = [example["token_type_ids"] for example in examples] token_type_ids = [example["token_type_ids"] for example in examples]
...@@ -564,11 +1093,13 @@ class DataCollatorForSOP(DataCollatorForLanguageModeling): ...@@ -564,11 +1093,13 @@ class DataCollatorForSOP(DataCollatorForLanguageModeling):
"sentence_order_label": sentence_order_label, "sentence_order_label": sentence_order_label,
} }
def mask_tokens(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: def mask_tokens(self, inputs: Any) -> Tuple[Any, Any, Any]:
""" """
Prepare masked tokens inputs/labels/attention_mask for masked language modeling: 80% MASK, 10% random, 10% Prepare masked tokens inputs/labels/attention_mask for masked language modeling: 80% MASK, 10% random, 10%
original. N-gram not applied yet. original. N-gram not applied yet.
""" """
import torch
if self.tokenizer.mask_token is None:
raise ValueError(
"This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer."

@@ -606,7 +1137,7 @@ class DataCollatorForSOP(DataCollatorForLanguageModeling):
@dataclass
-class DataCollatorForPermutationLanguageModeling:
+class DataCollatorForPermutationLanguageModeling(DataCollatorMixin):
"""
Data collator used for permutation language modeling.
@@ -617,17 +1148,30 @@ class DataCollatorForPermutationLanguageModeling:
tokenizer: PreTrainedTokenizerBase
plm_probability: float = 1 / 6
max_span_length: int = 5  # maximum length of a span of masked tokens
return_tensors: str = "pt"
def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
if isinstance(examples[0], (dict, BatchEncoding)):
examples = [e["input_ids"] for e in examples]
batch = _torch_collate_batch(examples, self.tokenizer)
inputs, perm_mask, target_mapping, labels = self.torch_mask_tokens(batch)
return {"input_ids": inputs, "perm_mask": perm_mask, "target_mapping": target_mapping, "labels": labels}
def tf_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
if isinstance(examples[0], (dict, BatchEncoding)):
examples = [e["input_ids"] for e in examples]
batch = _tf_collate_batch(examples, self.tokenizer)
inputs, perm_mask, target_mapping, labels = self.tf_mask_tokens(batch)
return {"input_ids": inputs, "perm_mask": perm_mask, "target_mapping": target_mapping, "labels": labels}
-def __call__(
-self, examples: List[Union[List[int], torch.Tensor, Dict[str, torch.Tensor]]]
-) -> Dict[str, torch.Tensor]:
+def numpy_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
if isinstance(examples[0], (dict, BatchEncoding)):
examples = [e["input_ids"] for e in examples]
-batch = _collate_batch(examples, self.tokenizer)
-inputs, perm_mask, target_mapping, labels = self.mask_tokens(batch)
+batch = _numpy_collate_batch(examples, self.tokenizer)
+inputs, perm_mask, target_mapping, labels = self.numpy_mask_tokens(batch)
return {"input_ids": inputs, "perm_mask": perm_mask, "target_mapping": target_mapping, "labels": labels}

-def mask_tokens(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+def torch_mask_tokens(self, inputs: Any) -> Tuple[Any, Any, Any, Any]:
"""
The masked tokens to be predicted for a particular sequence are determined by the following algorithm:
@@ -641,6 +1185,7 @@ class DataCollatorForPermutationLanguageModeling:
4. Set ``cur_len = cur_len + context_length``. If ``cur_len < max_len`` (i.e. there are tokens remaining in
the sequence to be processed), repeat from Step 1.
"""
import torch
if self.tokenizer.mask_token is None:
raise ValueError(

@@ -723,3 +1268,212 @@ class DataCollatorForPermutationLanguageModeling:
) & masked_indices[i]
return inputs.long(), perm_mask, target_mapping, labels.long()
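
A note on the span-sampling loop shared by all three mask_tokens variants: the cursor advances by context_length = span_length / plm_probability, so on average a plm_probability fraction of tokens ends up masked. With the defaults:

span_length = 3                                       # drawn from [1, max_span_length]
plm_probability = 1 / 6
context_length = int(span_length / plm_probability)  # 18
# 3 masked tokens per 18-token context -> ~1/6 of the sequence masked
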
def tf_mask_tokens(self, inputs: Any) -> Tuple[Any, Any, Any, Any]:
"""
The masked tokens to be predicted for a particular sequence are determined by the following algorithm:
0. Start from the beginning of the sequence by setting ``cur_len = 0`` (number of tokens processed so far).
1. Sample a ``span_length`` from the interval ``[1, max_span_length]`` (length of span of tokens to be
masked)
2. Reserve a context of length ``context_length = span_length / plm_probability`` to surround span to be
masked
3. Sample a starting point ``start_index`` from the interval ``[cur_len, cur_len + context_length -
span_length]`` and mask tokens ``start_index:start_index + span_length``
4. Set ``cur_len = cur_len + context_length``. If ``cur_len < max_len`` (i.e. there are tokens remaining in
the sequence to be processed), repeat from Step 1.
"""
from random import randint
import numpy as np
import tensorflow as tf
if self.tokenizer.mask_token is None:
raise ValueError(
"This tokenizer does not have a mask token which is necessary for permutation language modeling. Please add a mask token if you want to use this tokenizer."
)
if tf.shape(inputs)[1] % 2 != 0:
raise ValueError(
"This collator requires that sequence lengths be even to create a leakage-free perm_mask. Please see relevant comments in source code for details."
)
labels = tf.identity(inputs)
# Creating the mask and target_mapping tensors
masked_indices = np.full(labels.shape.as_list(), 0, dtype=np.bool)
labels_shape = tf.shape(labels)
target_mapping = np.zeros((labels_shape[0], labels_shape[1], labels_shape[1]), dtype=np.float32)
for i in range(len(labels)):
# Start from the beginning of the sequence by setting `cur_len = 0` (number of tokens processed so far).
cur_len = 0
max_len = tf.shape(labels)[1]
while cur_len < max_len:
# Sample a `span_length` from the interval `[1, max_span_length]` (length of span of tokens to be masked)
span_length = randint(1, self.max_span_length)  # stdlib randint is inclusive on both ends
# Reserve a context of length `context_length = span_length / plm_probability` to surround the span to be masked
context_length = int(span_length / self.plm_probability)
# Sample a starting point `start_index` from the interval `[cur_len, cur_len + context_length - span_length]` and mask tokens `start_index:start_index + span_length`
start_index = cur_len + randint(0, context_length - span_length)  # stdlib randint is inclusive on both ends
masked_indices[i, start_index : start_index + span_length] = 1
# Set `cur_len = cur_len + context_length`
cur_len += context_length
# Since we're replacing non-masked tokens with -100 in the labels tensor instead of skipping them altogether,
# the i-th predict corresponds to the i-th token.
target_mapping[i] = np.eye(labels_shape[1])
masked_indices = tf.cast(tf.convert_to_tensor(masked_indices), dtype=tf.bool)
target_mapping = tf.convert_to_tensor(target_mapping)
special_tokens_mask = tf.convert_to_tensor(
[
self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True)
for val in labels.numpy().tolist()
],
)
special_tokens_mask = tf.cast(special_tokens_mask, dtype=tf.bool)
masked_indices = masked_indices & ~special_tokens_mask
if self.tokenizer._pad_token is not None:
padding_mask = labels == self.tokenizer.pad_token_id
masked_indices = masked_indices & ~padding_mask
# Mask indicating non-functional tokens, where functional tokens are [SEP], [CLS], padding, etc.
non_func_mask = ~(padding_mask | special_tokens_mask)
inputs = tf.where(masked_indices, self.tokenizer.mask_token_id, inputs)
labels = tf.where(masked_indices, labels, -100) # We only compute loss on masked tokens
perm_mask = []
for i in range(len(labels)):
# Generate permutation indices i.e. sample a random factorisation order for the sequence. This will
# determine which tokens a given token can attend to (encoded in `perm_mask`).
# Note: Length of token sequence being permuted has to be less than or equal to reused sequence length
# (see documentation for `mems`), otherwise information may leak through due to reuse. In this implementation,
# we assume that reused length is half of sequence length and permutation length is equal to reused length.
# This requires that the sequence length be even.
# Create a linear factorisation order
# tf.range is the equivalent of torch.arange
perm_index = tf.range(labels_shape[1])
# Split this into two halves, assuming that half the sequence is reused each time
perm_index = tf.transpose(tf.reshape(perm_index, (-1, labels_shape[1] // 2)))
# Permute the two halves such that they do not cross over
perm_index = tf.random.shuffle(perm_index) # Shuffles along the first dimension
# Flatten this out into the desired permuted factorisation order
perm_index = tf.reshape(tf.transpose(perm_index), (-1,))
# Set the permutation indices of non-masked (non-functional) tokens to the
# smallest index (-1) so that:
# (1) They can be seen by all other positions
# (2) They cannot see masked positions, so there won't be information leak
perm_index = tf.where(~masked_indices[i] & non_func_mask[i], -1, perm_index)
# The logic for whether the i-th token can attend on the j-th token based on the factorisation order:
# 0 (can attend): If perm_index[i] > perm_index[j] or j is neither masked nor a functional token
# 1 (cannot attend): If perm_index[i] <= perm_index[j] and j is either masked or a functional token
perm_mask.append(
(tf.reshape(perm_index, (labels_shape[1], 1)) <= tf.reshape(perm_index, (1, labels_shape[1])))
& masked_indices[i]
)
perm_mask = tf.stack(perm_mask, axis=0)
return tf.cast(inputs, tf.int64), tf.cast(perm_mask, tf.float32), target_mapping, tf.cast(labels, tf.int64)
def numpy_mask_tokens(self, inputs: Any) -> Tuple[Any, Any, Any, Any]:
"""
The masked tokens to be predicted for a particular sequence are determined by the following algorithm:
0. Start from the beginning of the sequence by setting ``cur_len = 0`` (number of tokens processed so far).
1. Sample a ``span_length`` from the interval ``[1, max_span_length]`` (length of span of tokens to be
masked)
2. Reserve a context of length ``context_length = span_length / plm_probability`` to surround span to be
masked
3. Sample a starting point ``start_index`` from the interval ``[cur_len, cur_len + context_length -
span_length]`` and mask tokens ``start_index:start_index + span_length``
4. Set ``cur_len = cur_len + context_length``. If ``cur_len < max_len`` (i.e. there are tokens remaining in
the sequence to be processed), repeat from Step 1.
"""
from random import randint
import numpy as np
if self.tokenizer.mask_token is None:
raise ValueError(
"This tokenizer does not have a mask token which is necessary for permutation language modeling. Please add a mask token if you want to use this tokenizer."
)
if inputs.shape[1] % 2 != 0:
raise ValueError(
"This collator requires that sequence lengths be even to create a leakage-free perm_mask. Please see relevant comments in source code for details."
)
labels = np.copy(inputs)
# Creating the mask and target_mapping tensors
        masked_indices = np.full(labels.shape, False, dtype=bool)  # the np.bool alias is removed in NumPy >= 1.24
target_mapping = np.zeros((labels.shape[0], labels.shape[1], labels.shape[1]), dtype=np.float32)
for i in range(labels.shape[0]):
# Start from the beginning of the sequence by setting `cur_len = 0` (number of tokens processed so far).
cur_len = 0
max_len = labels.shape[1]
while cur_len < max_len:
# Sample a `span_length` from the interval `[1, max_span_length]` (length of span of tokens to be masked)
                span_length = randint(1, self.max_span_length)  # random.randint includes both endpoints
# Reserve a context of length `context_length = span_length / plm_probability` to surround the span to be masked
context_length = int(span_length / self.plm_probability)
# Sample a starting point `start_index` from the interval `[cur_len, cur_len + context_length - span_length]` and mask tokens `start_index:start_index + span_length`
                start_index = cur_len + randint(0, context_length - span_length)  # inclusive upper bound
                masked_indices[i, start_index : start_index + span_length] = True
# Set `cur_len = cur_len + context_length`
cur_len += context_length
# Since we're replacing non-masked tokens with -100 in the labels tensor instead of skipping them altogether,
        # the i-th prediction corresponds to the i-th token.
target_mapping[i] = np.eye(labels.shape[1])
special_tokens_mask = np.array(
[self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()],
            dtype=bool,
)
        masked_indices[special_tokens_mask] = False
        if self.tokenizer._pad_token is not None:
            padding_mask = labels == self.tokenizer.pad_token_id
            masked_indices[padding_mask] = False
        else:
            padding_mask = np.zeros(labels.shape, dtype=bool)  # keep `padding_mask` defined for `non_func_mask` below
# Mask indicating non-functional tokens, where functional tokens are [SEP], [CLS], padding, etc.
non_func_mask = ~(padding_mask | special_tokens_mask)
inputs[masked_indices] = self.tokenizer.mask_token_id
labels[~masked_indices] = -100 # We only compute loss on masked tokens
perm_mask = np.zeros((labels.shape[0], labels.shape[1], labels.shape[1]), dtype=np.float32)
for i in range(labels.shape[0]):
            # Generate permutation indices, i.e. sample a random factorisation order for the sequence. This will
            # determine which tokens a given token can attend to (encoded in `perm_mask`).
            # Note: the length of the token sequence being permuted has to be less than or equal to the reused
            # sequence length (see the documentation for `mems`), otherwise information may leak through due to
            # reuse. In this implementation, we assume the reused length is half the sequence length and the
            # permutation length equals the reused length.
# This requires that the sequence length be even.
# Create a linear factorisation order
perm_index = np.arange(labels.shape[1])
# Split this into two halves, assuming that half the sequence is reused each time
perm_index = perm_index.reshape((-1, labels.shape[1] // 2)).T
# Permute the two halves such that they do not cross over
np.random.shuffle(perm_index)
# Flatten this out into the desired permuted factorisation order
perm_index = perm_index.T.flatten()
# Set the permutation indices of non-masked (non-functional) tokens to the
# smallest index (-1) so that:
# (1) They can be seen by all other positions
            # (2) They cannot see masked positions, so there is no information leak
perm_index[~masked_indices[i] & non_func_mask[i]] = -1
            # The logic for whether the i-th token can attend to the j-th token, based on the factorisation order:
# 0 (can attend): If perm_index[i] > perm_index[j] or j is neither masked nor a functional token
# 1 (cannot attend): If perm_index[i] <= perm_index[j] and j is either masked or a functional token
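            # Worked example (illustrative, sequence length 4): suppose positions 1 and 3 are
            # masked and perm_index = [-1, 0, -1, 1]. Row 0 (non-masked, index -1) cannot
            # attend to column 1, since -1 <= 0 and position 1 is masked; row 3 (index 1) can
            # attend to column 1, since 1 > 0. Every masked position is also blocked from
            # attending to itself, because perm_index[i] <= perm_index[i] always holds.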
perm_mask[i] = (
perm_index.reshape((labels.shape[1], 1)) <= perm_index.reshape((1, labels.shape[1]))
) & masked_indices[i]
return inputs.astype(np.int64), perm_mask, target_mapping, labels.astype(np.int64)
...
@@ -12,62 +12,6 @@ class PyTorchBenchmarkArguments:
         requires_backends(self, ["torch"])


-class DataCollator:
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch"])
-
-
-class DataCollatorForLanguageModeling:
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch"])
-
-
-class DataCollatorForPermutationLanguageModeling:
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch"])
-
-
-class DataCollatorForSeq2Seq:
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch"])
-
-
-class DataCollatorForSOP:
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch"])
-
-
-class DataCollatorForTokenClassification:
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch"])
-
-
-class DataCollatorForWholeWordMask:
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch"])
-
-
-class DataCollatorWithPadding:
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch"])
-
-
-def default_data_collator(*args, **kwargs):
-    requires_backends(default_data_collator, ["torch"])
-
-
 class GlueDataset:
     def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
...
@@ -17,20 +17,27 @@ import shutil
 import tempfile
 import unittest

-from transformers import BertTokenizer, is_torch_available, set_seed
-from transformers.testing_utils import require_torch
+import numpy as np
+
+from transformers import (
+    BertTokenizer,
+    DataCollatorForLanguageModeling,
+    DataCollatorForPermutationLanguageModeling,
+    DataCollatorForTokenClassification,
+    DataCollatorWithPadding,
+    default_data_collator,
+    is_tf_available,
+    is_torch_available,
+    set_seed,
+)
+from transformers.testing_utils import require_tf, require_torch

 if is_torch_available():
     import torch

-    from transformers import (
-        DataCollatorForLanguageModeling,
-        DataCollatorForPermutationLanguageModeling,
-        DataCollatorForTokenClassification,
-        DataCollatorWithPadding,
-        default_data_collator,
-    )
+if is_tf_available():
+    import tensorflow as tf


 @require_torch
@@ -61,14 +68,14 @@ class DataCollatorIntegrationTest(unittest.TestCase):
         self.assertEqual(batch["inputs"].shape, torch.Size([8, 6]))

         # Features can already be tensors
-        features = [{"label": i, "inputs": torch.randint(10, [10])} for i in range(8)]
+        features = [{"label": i, "inputs": np.random.randint(0, 10, [10])} for i in range(8)]
         batch = default_data_collator(features)
         self.assertTrue(batch["labels"].equal(torch.tensor(list(range(8)))))
         self.assertEqual(batch["labels"].dtype, torch.long)
         self.assertEqual(batch["inputs"].shape, torch.Size([8, 10]))

         # Labels can already be tensors
-        features = [{"label": torch.tensor(i), "inputs": torch.randint(10, [10])} for i in range(8)]
+        features = [{"label": torch.tensor(i), "inputs": np.random.randint(0, 10, [10])} for i in range(8)]
         batch = default_data_collator(features)
         self.assertEqual(batch["labels"].dtype, torch.long)
         self.assertTrue(batch["labels"].equal(torch.tensor(list(range(8)))))
@@ -238,7 +245,7 @@ class DataCollatorIntegrationTest(unittest.TestCase):
         self.assertEqual(batch["target_mapping"].shape, torch.Size((2, 10, 10)))
         self.assertEqual(batch["labels"].shape, torch.Size((2, 10)))

-        example = [torch.randint(5, [5])]
+        example = [np.random.randint(0, 5, [5])]
         with self.assertRaises(ValueError):
             # Expect error due to odd sequence length
             data_collator(example)
@@ -290,3 +297,529 @@ class DataCollatorIntegrationTest(unittest.TestCase):
         self.assertEqual(batch["token_type_ids"].shape, torch.Size((2, 8)))
         self.assertEqual(batch["labels"].shape, torch.Size((2, 8)))
         self.assertEqual(batch["sentence_order_label"].shape, torch.Size((2,)))
@require_tf
class TFDataCollatorIntegrationTest(unittest.TestCase):
def setUp(self):
super().setUp()
self.tmpdirname = tempfile.mkdtemp()
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
self.vocab_file = os.path.join(self.tmpdirname, "vocab.txt")
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
def tearDown(self):
shutil.rmtree(self.tmpdirname)
def test_default_with_dict(self):
features = [{"label": i, "inputs": [0, 1, 2, 3, 4, 5]} for i in range(8)]
batch = default_data_collator(features, return_tensors="tf")
self.assertEqual(batch["labels"].numpy().tolist(), list(range(8)))
self.assertEqual(batch["labels"].dtype, tf.int64)
self.assertEqual(batch["inputs"].shape.as_list(), [8, 6])
# With label_ids
features = [{"label_ids": [0, 1, 2], "inputs": [0, 1, 2, 3, 4, 5]} for i in range(8)]
batch = default_data_collator(features, return_tensors="tf")
self.assertEqual(batch["labels"].numpy().tolist(), ([[0, 1, 2]] * 8))
self.assertEqual(batch["labels"].dtype, tf.int64)
self.assertEqual(batch["inputs"].shape.as_list(), [8, 6])
# Features can already be tensors
features = [{"label": i, "inputs": np.random.randint(0, 10, [10])} for i in range(8)]
batch = default_data_collator(features, return_tensors="tf")
self.assertEqual(batch["labels"].numpy().tolist(), (list(range(8))))
self.assertEqual(batch["labels"].dtype, tf.int64)
self.assertEqual(batch["inputs"].shape.as_list(), [8, 10])
# Labels can already be tensors
features = [{"label": np.array(i), "inputs": np.random.randint(0, 10, [10])} for i in range(8)]
batch = default_data_collator(features, return_tensors="tf")
self.assertEqual(batch["labels"].dtype, tf.int64)
self.assertEqual(batch["labels"].numpy().tolist(), list(range(8)))
self.assertEqual(batch["labels"].dtype, tf.int64)
self.assertEqual(batch["inputs"].shape.as_list(), [8, 10])
def test_default_classification_and_regression(self):
data_collator = default_data_collator
features = [{"input_ids": [0, 1, 2, 3, 4], "label": i} for i in range(4)]
batch = data_collator(features, return_tensors="tf")
self.assertEqual(batch["labels"].dtype, tf.int64)
features = [{"input_ids": [0, 1, 2, 3, 4], "label": float(i)} for i in range(4)]
batch = data_collator(features, return_tensors="tf")
self.assertEqual(batch["labels"].dtype, tf.float32)
def test_default_with_no_labels(self):
features = [{"label": None, "inputs": [0, 1, 2, 3, 4, 5]} for i in range(8)]
batch = default_data_collator(features, return_tensors="tf")
self.assertTrue("labels" not in batch)
self.assertEqual(batch["inputs"].shape.as_list(), [8, 6])
# With label_ids
features = [{"label_ids": None, "inputs": [0, 1, 2, 3, 4, 5]} for i in range(8)]
batch = default_data_collator(features, return_tensors="tf")
self.assertTrue("labels" not in batch)
self.assertEqual(batch["inputs"].shape.as_list(), [8, 6])
def test_data_collator_with_padding(self):
tokenizer = BertTokenizer(self.vocab_file)
features = [{"input_ids": [0, 1, 2]}, {"input_ids": [0, 1, 2, 3, 4, 5]}]
data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf")
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 6])
self.assertEqual(batch["input_ids"][0].numpy().tolist(), [0, 1, 2] + [tokenizer.pad_token_id] * 3)
data_collator = DataCollatorWithPadding(tokenizer, padding="max_length", max_length=10, return_tensors="tf")
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 10])
data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8, return_tensors="tf")
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape, [2, 8])
def test_data_collator_for_token_classification(self):
tokenizer = BertTokenizer(self.vocab_file)
features = [
{"input_ids": [0, 1, 2], "labels": [0, 1, 2]},
{"input_ids": [0, 1, 2, 3, 4, 5], "labels": [0, 1, 2, 3, 4, 5]},
]
data_collator = DataCollatorForTokenClassification(tokenizer, return_tensors="tf")
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 6])
self.assertEqual(batch["input_ids"][0].numpy().tolist(), [0, 1, 2] + [tokenizer.pad_token_id] * 3)
self.assertEqual(batch["labels"].shape.as_list(), [2, 6])
self.assertEqual(batch["labels"][0].numpy().tolist(), [0, 1, 2] + [-100] * 3)
data_collator = DataCollatorForTokenClassification(
tokenizer, padding="max_length", max_length=10, return_tensors="tf"
)
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 10])
self.assertEqual(batch["labels"].shape.as_list(), [2, 10])
data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8, return_tensors="tf")
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 8])
self.assertEqual(batch["labels"].shape.as_list(), [2, 8])
data_collator = DataCollatorForTokenClassification(tokenizer, label_pad_token_id=-1, return_tensors="tf")
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 6])
self.assertEqual(batch["input_ids"][0].numpy().tolist(), [0, 1, 2] + [tokenizer.pad_token_id] * 3)
self.assertEqual(batch["labels"].shape.as_list(), [2, 6])
self.assertEqual(batch["labels"][0].numpy().tolist(), [0, 1, 2] + [-1] * 3)
def _test_no_pad_and_pad(self, no_pad_features, pad_features):
tokenizer = BertTokenizer(self.vocab_file)
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, return_tensors="tf")
batch = data_collator(no_pad_features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 10])
self.assertEqual(batch["labels"].shape.as_list(), [2, 10])
batch = data_collator(pad_features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 10])
self.assertEqual(batch["labels"].shape.as_list(), [2, 10])
data_collator = DataCollatorForLanguageModeling(
tokenizer, mlm=False, pad_to_multiple_of=8, return_tensors="tf"
)
batch = data_collator(no_pad_features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 16])
self.assertEqual(batch["labels"].shape.as_list(), [2, 16])
batch = data_collator(pad_features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 16])
self.assertEqual(batch["labels"].shape.as_list(), [2, 16])
tokenizer._pad_token = None
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, return_tensors="tf")
with self.assertRaises(ValueError):
# Expect error due to padding token missing
data_collator(pad_features)
set_seed(42) # For reproducibility
tokenizer = BertTokenizer(self.vocab_file)
data_collator = DataCollatorForLanguageModeling(tokenizer, return_tensors="tf")
batch = data_collator(no_pad_features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 10])
self.assertEqual(batch["labels"].shape.as_list(), [2, 10])
masked_tokens = batch["input_ids"] == tokenizer.mask_token_id
self.assertTrue(tf.reduce_any(masked_tokens))
# self.assertTrue(all(x == -100 for x in batch["labels"].numpy()[~masked_tokens.numpy()].tolist()))
batch = data_collator(pad_features, return_tensors="tf")
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 10])
self.assertEqual(batch["labels"].shape.as_list(), [2, 10])
masked_tokens = batch["input_ids"] == tokenizer.mask_token_id
self.assertTrue(tf.reduce_any(masked_tokens))
# self.assertTrue(all(x == -100 for x in batch["labels"].numpy()[~masked_tokens.numpy()].tolist()))
data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8, return_tensors="tf")
batch = data_collator(no_pad_features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 16])
self.assertEqual(batch["labels"].shape.as_list(), [2, 16])
masked_tokens = batch["input_ids"] == tokenizer.mask_token_id
self.assertTrue(tf.reduce_any(masked_tokens))
# self.assertTrue(all(x == -100 for x in batch["labels"].numpy()[~masked_tokens.numpy()].tolist()))
batch = data_collator(pad_features, return_tensors="tf")
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 16])
self.assertEqual(batch["labels"].shape.as_list(), [2, 16])
masked_tokens = batch["input_ids"] == tokenizer.mask_token_id
self.assertTrue(tf.reduce_any(masked_tokens))
# self.assertTrue(all(x == -100 for x in batch["labels"].numpy()[~masked_tokens.numpy()].tolist()))
def test_data_collator_for_language_modeling(self):
no_pad_features = [{"input_ids": list(range(10))}, {"input_ids": list(range(10))}]
pad_features = [{"input_ids": list(range(5))}, {"input_ids": list(range(10))}]
self._test_no_pad_and_pad(no_pad_features, pad_features)
no_pad_features = [list(range(10)), list(range(10))]
pad_features = [list(range(5)), list(range(10))]
self._test_no_pad_and_pad(no_pad_features, pad_features)
def test_plm(self):
tokenizer = BertTokenizer(self.vocab_file)
no_pad_features = [{"input_ids": list(range(10))}, {"input_ids": list(range(10))}]
pad_features = [{"input_ids": list(range(5))}, {"input_ids": list(range(10))}]
data_collator = DataCollatorForPermutationLanguageModeling(tokenizer, return_tensors="tf")
batch = data_collator(pad_features)
self.assertIsInstance(batch, dict)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 10])
self.assertEqual(batch["perm_mask"].shape.as_list(), [2, 10, 10])
self.assertEqual(batch["target_mapping"].shape.as_list(), [2, 10, 10])
self.assertEqual(batch["labels"].shape.as_list(), [2, 10])
batch = data_collator(no_pad_features)
self.assertIsInstance(batch, dict)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 10])
self.assertEqual(batch["perm_mask"].shape.as_list(), [2, 10, 10])
self.assertEqual(batch["target_mapping"].shape.as_list(), [2, 10, 10])
self.assertEqual(batch["labels"].shape.as_list(), [2, 10])
example = [np.random.randint(0, 5, [5])]
with self.assertRaises(ValueError):
# Expect error due to odd sequence length
data_collator(example)
def test_nsp(self):
tokenizer = BertTokenizer(self.vocab_file)
features = [
{"input_ids": [0, 1, 2, 3, 4], "token_type_ids": [0, 1, 2, 3, 4], "next_sentence_label": i}
for i in range(2)
]
data_collator = DataCollatorForLanguageModeling(tokenizer, return_tensors="tf")
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 5])
self.assertEqual(batch["token_type_ids"].shape.as_list(), [2, 5])
self.assertEqual(batch["labels"].shape.as_list(), [2, 5])
self.assertEqual(batch["next_sentence_label"].shape.as_list(), [2])
data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8, return_tensors="tf")
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 8])
self.assertEqual(batch["token_type_ids"].shape.as_list(), [2, 8])
self.assertEqual(batch["labels"].shape.as_list(), [2, 8])
self.assertEqual(batch["next_sentence_label"].shape.as_list(), [2])
def test_sop(self):
tokenizer = BertTokenizer(self.vocab_file)
features = [
{
"input_ids": tf.convert_to_tensor([0, 1, 2, 3, 4]),
"token_type_ids": tf.convert_to_tensor([0, 1, 2, 3, 4]),
"sentence_order_label": i,
}
for i in range(2)
]
data_collator = DataCollatorForLanguageModeling(tokenizer, return_tensors="tf")
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 5])
self.assertEqual(batch["token_type_ids"].shape.as_list(), [2, 5])
self.assertEqual(batch["labels"].shape.as_list(), [2, 5])
self.assertEqual(batch["sentence_order_label"].shape.as_list(), [2])
data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8, return_tensors="tf")
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape.as_list(), [2, 8])
self.assertEqual(batch["token_type_ids"].shape.as_list(), [2, 8])
self.assertEqual(batch["labels"].shape.as_list(), [2, 8])
self.assertEqual(batch["sentence_order_label"].shape.as_list(), [2])
class NumpyDataCollatorIntegrationTest(unittest.TestCase):
def setUp(self):
self.tmpdirname = tempfile.mkdtemp()
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
self.vocab_file = os.path.join(self.tmpdirname, "vocab.txt")
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
def tearDown(self):
shutil.rmtree(self.tmpdirname)
def test_default_with_dict(self):
features = [{"label": i, "inputs": [0, 1, 2, 3, 4, 5]} for i in range(8)]
batch = default_data_collator(features, return_tensors="np")
self.assertEqual(batch["labels"].tolist(), list(range(8)))
self.assertEqual(batch["labels"].dtype, np.int64)
self.assertEqual(batch["inputs"].shape, (8, 6))
# With label_ids
features = [{"label_ids": [0, 1, 2], "inputs": [0, 1, 2, 3, 4, 5]} for i in range(8)]
batch = default_data_collator(features, return_tensors="np")
self.assertEqual(batch["labels"].tolist(), [[0, 1, 2]] * 8)
self.assertEqual(batch["labels"].dtype, np.int64)
self.assertEqual(batch["inputs"].shape, (8, 6))
# Features can already be tensors
features = [{"label": i, "inputs": np.random.randint(0, 10, [10])} for i in range(8)]
batch = default_data_collator(features, return_tensors="np")
self.assertEqual(batch["labels"].tolist(), list(range(8)))
self.assertEqual(batch["labels"].dtype, np.int64)
self.assertEqual(batch["inputs"].shape, (8, 10))
# Labels can already be tensors
features = [{"label": np.array(i), "inputs": np.random.randint(0, 10, [10])} for i in range(8)]
batch = default_data_collator(features, return_tensors="np")
self.assertEqual(batch["labels"].dtype, np.int64)
self.assertEqual(batch["labels"].tolist(), (list(range(8))))
self.assertEqual(batch["labels"].dtype, np.int64)
self.assertEqual(batch["inputs"].shape, (8, 10))
def test_default_classification_and_regression(self):
data_collator = default_data_collator
features = [{"input_ids": [0, 1, 2, 3, 4], "label": i} for i in range(4)]
batch = data_collator(features, return_tensors="np")
self.assertEqual(batch["labels"].dtype, np.int64)
features = [{"input_ids": [0, 1, 2, 3, 4], "label": float(i)} for i in range(4)]
batch = data_collator(features, return_tensors="np")
self.assertEqual(batch["labels"].dtype, np.float32)
def test_default_with_no_labels(self):
features = [{"label": None, "inputs": [0, 1, 2, 3, 4, 5]} for i in range(8)]
batch = default_data_collator(features, return_tensors="np")
self.assertTrue("labels" not in batch)
self.assertEqual(batch["inputs"].shape, (8, 6))
# With label_ids
features = [{"label_ids": None, "inputs": [0, 1, 2, 3, 4, 5]} for i in range(8)]
batch = default_data_collator(features, return_tensors="np")
self.assertTrue("labels" not in batch)
self.assertEqual(batch["inputs"].shape, (8, 6))
def test_data_collator_with_padding(self):
tokenizer = BertTokenizer(self.vocab_file)
features = [{"input_ids": [0, 1, 2]}, {"input_ids": [0, 1, 2, 3, 4, 5]}]
data_collator = DataCollatorWithPadding(tokenizer, return_tensors="np")
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape, (2, 6))
self.assertEqual(batch["input_ids"][0].tolist(), [0, 1, 2] + [tokenizer.pad_token_id] * 3)
data_collator = DataCollatorWithPadding(tokenizer, padding="max_length", max_length=10, return_tensors="np")
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape, (2, 10))
data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8, return_tensors="np")
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape, (2, 8))
def test_data_collator_for_token_classification(self):
tokenizer = BertTokenizer(self.vocab_file)
features = [
{"input_ids": [0, 1, 2], "labels": [0, 1, 2]},
{"input_ids": [0, 1, 2, 3, 4, 5], "labels": [0, 1, 2, 3, 4, 5]},
]
data_collator = DataCollatorForTokenClassification(tokenizer, return_tensors="np")
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape, (2, 6))
self.assertEqual(batch["input_ids"][0].tolist(), [0, 1, 2] + [tokenizer.pad_token_id] * 3)
self.assertEqual(batch["labels"].shape, (2, 6))
self.assertEqual(batch["labels"][0].tolist(), [0, 1, 2] + [-100] * 3)
data_collator = DataCollatorForTokenClassification(
tokenizer, padding="max_length", max_length=10, return_tensors="np"
)
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape, (2, 10))
self.assertEqual(batch["labels"].shape, (2, 10))
data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8, return_tensors="np")
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape, (2, 8))
self.assertEqual(batch["labels"].shape, (2, 8))
data_collator = DataCollatorForTokenClassification(tokenizer, label_pad_token_id=-1, return_tensors="np")
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape, (2, 6))
self.assertEqual(batch["input_ids"][0].tolist(), [0, 1, 2] + [tokenizer.pad_token_id] * 3)
self.assertEqual(batch["labels"].shape, (2, 6))
self.assertEqual(batch["labels"][0].tolist(), [0, 1, 2] + [-1] * 3)
def _test_no_pad_and_pad(self, no_pad_features, pad_features):
tokenizer = BertTokenizer(self.vocab_file)
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, return_tensors="np")
batch = data_collator(no_pad_features)
self.assertEqual(batch["input_ids"].shape, (2, 10))
self.assertEqual(batch["labels"].shape, (2, 10))
batch = data_collator(pad_features, return_tensors="np")
self.assertEqual(batch["input_ids"].shape, (2, 10))
self.assertEqual(batch["labels"].shape, (2, 10))
data_collator = DataCollatorForLanguageModeling(
tokenizer, mlm=False, pad_to_multiple_of=8, return_tensors="np"
)
batch = data_collator(no_pad_features)
self.assertEqual(batch["input_ids"].shape, (2, 16))
self.assertEqual(batch["labels"].shape, (2, 16))
batch = data_collator(pad_features, return_tensors="np")
self.assertEqual(batch["input_ids"].shape, (2, 16))
self.assertEqual(batch["labels"].shape, (2, 16))
tokenizer._pad_token = None
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, return_tensors="np")
with self.assertRaises(ValueError):
# Expect error due to padding token missing
data_collator(pad_features)
set_seed(42) # For reproducibility
tokenizer = BertTokenizer(self.vocab_file)
data_collator = DataCollatorForLanguageModeling(tokenizer, return_tensors="np")
batch = data_collator(no_pad_features)
self.assertEqual(batch["input_ids"].shape, (2, 10))
self.assertEqual(batch["labels"].shape, (2, 10))
masked_tokens = batch["input_ids"] == tokenizer.mask_token_id
self.assertTrue(np.any(masked_tokens))
# self.assertTrue(all(x == -100 for x in batch["labels"][~masked_tokens].tolist()))
batch = data_collator(pad_features)
self.assertEqual(batch["input_ids"].shape, (2, 10))
self.assertEqual(batch["labels"].shape, (2, 10))
masked_tokens = batch["input_ids"] == tokenizer.mask_token_id
self.assertTrue(np.any(masked_tokens))
# self.assertTrue(all(x == -100 for x in batch["labels"][~masked_tokens].tolist()))
data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8, return_tensors="np")
batch = data_collator(no_pad_features)
self.assertEqual(batch["input_ids"].shape, (2, 16))
self.assertEqual(batch["labels"].shape, (2, 16))
masked_tokens = batch["input_ids"] == tokenizer.mask_token_id
self.assertTrue(np.any(masked_tokens))
# self.assertTrue(all(x == -100 for x in batch["labels"][~masked_tokens].tolist()))
batch = data_collator(pad_features)
self.assertEqual(batch["input_ids"].shape, (2, 16))
self.assertEqual(batch["labels"].shape, (2, 16))
masked_tokens = batch["input_ids"] == tokenizer.mask_token_id
self.assertTrue(np.any(masked_tokens))
# self.assertTrue(all(x == -100 for x in batch["labels"][~masked_tokens].tolist()))
def test_data_collator_for_language_modeling(self):
no_pad_features = [{"input_ids": list(range(10))}, {"input_ids": list(range(10))}]
pad_features = [{"input_ids": list(range(5))}, {"input_ids": list(range(10))}]
self._test_no_pad_and_pad(no_pad_features, pad_features)
no_pad_features = [list(range(10)), list(range(10))]
pad_features = [list(range(5)), list(range(10))]
self._test_no_pad_and_pad(no_pad_features, pad_features)
def test_plm(self):
tokenizer = BertTokenizer(self.vocab_file)
no_pad_features = [{"input_ids": list(range(10))}, {"input_ids": list(range(10))}]
pad_features = [{"input_ids": list(range(5))}, {"input_ids": list(range(10))}]
data_collator = DataCollatorForPermutationLanguageModeling(tokenizer, return_tensors="np")
batch = data_collator(pad_features)
self.assertIsInstance(batch, dict)
self.assertEqual(batch["input_ids"].shape, (2, 10))
self.assertEqual(batch["perm_mask"].shape, (2, 10, 10))
self.assertEqual(batch["target_mapping"].shape, (2, 10, 10))
self.assertEqual(batch["labels"].shape, (2, 10))
batch = data_collator(no_pad_features)
self.assertIsInstance(batch, dict)
self.assertEqual(batch["input_ids"].shape, (2, 10))
self.assertEqual(batch["perm_mask"].shape, (2, 10, 10))
self.assertEqual(batch["target_mapping"].shape, (2, 10, 10))
self.assertEqual(batch["labels"].shape, (2, 10))
example = [np.random.randint(0, 5, [5])]
with self.assertRaises(ValueError):
# Expect error due to odd sequence length
data_collator(example)
def test_nsp(self):
tokenizer = BertTokenizer(self.vocab_file)
features = [
{"input_ids": [0, 1, 2, 3, 4], "token_type_ids": [0, 1, 2, 3, 4], "next_sentence_label": i}
for i in range(2)
]
data_collator = DataCollatorForLanguageModeling(tokenizer, return_tensors="np")
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape, (2, 5))
self.assertEqual(batch["token_type_ids"].shape, (2, 5))
self.assertEqual(batch["labels"].shape, (2, 5))
self.assertEqual(batch["next_sentence_label"].shape, (2,))
data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8, return_tensors="np")
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape, (2, 8))
self.assertEqual(batch["token_type_ids"].shape, (2, 8))
self.assertEqual(batch["labels"].shape, (2, 8))
self.assertEqual(batch["next_sentence_label"].shape, (2,))
def test_sop(self):
tokenizer = BertTokenizer(self.vocab_file)
features = [
{
"input_ids": np.array([0, 1, 2, 3, 4]),
"token_type_ids": np.array([0, 1, 2, 3, 4]),
"sentence_order_label": i,
}
for i in range(2)
]
data_collator = DataCollatorForLanguageModeling(tokenizer, return_tensors="np")
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape, (2, 5))
self.assertEqual(batch["token_type_ids"].shape, (2, 5))
self.assertEqual(batch["labels"].shape, (2, 5))
self.assertEqual(batch["sentence_order_label"].shape, (2,))
data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8, return_tensors="np")
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape, (2, 8))
self.assertEqual(batch["token_type_ids"].shape, (2, 8))
self.assertEqual(batch["labels"].shape, (2, 8))
self.assertEqual(batch["sentence_order_label"].shape, (2,))