Unverified Commit e5f45227 authored by Sylvain Gugger's avatar Sylvain Gugger Committed by GitHub
Browse files

Trainer automatically drops unused columns in nlp datasets (#6449)

* Add a classmethod to easily build a Trainer from nlp dataset and metric

* Fix docstrings

* Split train/eval

* Formatting

* Log dropped columns + docs

* Authorize callable activations

* Poc for auto activation

* Be framework-agnostic

* Formatting

* Remove class method

* Remove unnecessary code
parent 5bf4465e
...@@ -82,6 +82,7 @@ from .file_utils import ( ...@@ -82,6 +82,7 @@ from .file_utils import (
add_start_docstrings, add_start_docstrings,
cached_path, cached_path,
is_apex_available, is_apex_available,
is_nlp_available,
is_psutil_available, is_psutil_available,
is_py3nvml_available, is_py3nvml_available,
is_tf_available, is_tf_available,
......
...@@ -64,6 +64,14 @@ except (ImportError, AssertionError): ...@@ -64,6 +64,14 @@ except (ImportError, AssertionError):
_tf_available = False # pylint: disable=invalid-name _tf_available = False # pylint: disable=invalid-name
# Probe for the optional `nlp` package once, at import time. The import is only
# attempted to record whether it succeeds; the result is stored in `_nlp_available`.
try:
    import nlp # noqa: F401
    _nlp_available = True
except ImportError:
    # `nlp` is not installed: record that instead of failing.
    _nlp_available = False
try: try:
from torch.hub import _get_torch_home from torch.hub import _get_torch_home
...@@ -144,6 +152,10 @@ def is_torch_tpu_available(): ...@@ -144,6 +152,10 @@ def is_torch_tpu_available():
return _torch_tpu_available return _torch_tpu_available
def is_nlp_available() -> bool:
    """Return the module-level ``_nlp_available`` flag (whether the optional ``nlp`` package is usable)."""
    return _nlp_available
def is_psutil_available(): def is_psutil_available():
return _psutil_available return _psutil_available
......
import inspect
import logging import logging
import math import math
import os import os
...@@ -19,7 +20,7 @@ from torch.utils.data.sampler import RandomSampler, Sampler, SequentialSampler ...@@ -19,7 +20,7 @@ from torch.utils.data.sampler import RandomSampler, Sampler, SequentialSampler
from tqdm.auto import tqdm, trange from tqdm.auto import tqdm, trange
from .data.data_collator import DataCollator, default_data_collator from .data.data_collator import DataCollator, default_data_collator
from .file_utils import is_torch_tpu_available from .file_utils import is_nlp_available, is_torch_tpu_available
from .integrations import is_comet_available, is_tensorboard_available, is_wandb_available from .integrations import is_comet_available, is_tensorboard_available, is_wandb_available
from .modeling_utils import PreTrainedModel from .modeling_utils import PreTrainedModel
from .optimization import AdamW, get_linear_schedule_with_warmup from .optimization import AdamW, get_linear_schedule_with_warmup
...@@ -41,6 +42,8 @@ else: ...@@ -41,6 +42,8 @@ else:
_use_native_amp = True _use_native_amp = True
from torch.cuda.amp import autocast from torch.cuda.amp import autocast
if is_nlp_available():
import nlp
if is_torch_tpu_available(): if is_torch_tpu_available():
import torch_xla.core.xla_model as xm import torch_xla.core.xla_model as xm
...@@ -140,19 +143,19 @@ class Trainer: ...@@ -140,19 +143,19 @@ class Trainer:
model (:class:`~transformers.PreTrainedModel`): model (:class:`~transformers.PreTrainedModel`):
The model to train, evaluate or use for predictions. The model to train, evaluate or use for predictions.
args (:class:`~transformers.TrainingArguments`): args (:class:`~transformers.TrainingArguments`):
The arguments to tweak training. The arguments to tweak for training.
data_collator (:obj:`DataCollator`, `optional`, defaults to :func:`~transformers.default_data_collator`): data_collator (:obj:`DataCollator`, `optional`, defaults to :func:`~transformers.default_data_collator`):
The function to use to from a batch from a list of elements of :obj:`train_dataset` or The function to use to form a batch from a list of elements of :obj:`train_dataset` or
:obj:`eval_dataset`. :obj:`eval_dataset`.
train_dataset (:obj:`torch.utils.data.dataset.Dataset`, `optional`): train_dataset (:obj:`torch.utils.data.dataset.Dataset`, `optional`):
The dataset to use for training. The dataset to use for training. If it is an :obj:`nlp.Dataset`, columns not accepted by the
``model.forward()`` method are automatically removed.
eval_dataset (:obj:`torch.utils.data.dataset.Dataset`, `optional`): eval_dataset (:obj:`torch.utils.data.dataset.Dataset`, `optional`):
The dataset to use for evaluation. The dataset to use for evaluation. If it is an :obj:`nlp.Dataset`, columns not accepted by the
``model.forward()`` method are automatically removed.
compute_metrics (:obj:`Callable[[EvalPrediction], Dict]`, `optional`): compute_metrics (:obj:`Callable[[EvalPrediction], Dict]`, `optional`):
The function that will be used to compute metrics at evaluation. Must take a The function that will be used to compute metrics at evaluation. Must take a
:class:`~transformers.EvalPrediction` and return a dictionary string to metric values. :class:`~transformers.EvalPrediction` and return a dictionary string to metric values.
prediction_loss_only (:obj:`bool`, `optional`, defaults to `False`):
When performing evaluation and predictions, only returns the loss.
tb_writer (:obj:`SummaryWriter`, `optional`): tb_writer (:obj:`SummaryWriter`, `optional`):
Object to write to TensorBoard. Object to write to TensorBoard.
optimizers (:obj:`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR`, `optional`): optimizers (:obj:`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR`, `optional`):
...@@ -228,11 +231,32 @@ class Trainer: ...@@ -228,11 +231,32 @@ class Trainer:
), ),
FutureWarning, FutureWarning,
) )
if is_nlp_available():
if isinstance(train_dataset, nlp.Dataset):
self._remove_unused_columns(self.train_dataset, description="training")
if isinstance(eval_dataset, nlp.Dataset):
self._remove_unused_columns(self.eval_dataset, description="evaluation")
self.global_step = None self.global_step = None
self.epoch = None self.epoch = None
if self.args.fp16 and _use_native_amp: if self.args.fp16 and _use_native_amp:
self.scaler = torch.cuda.amp.GradScaler() self.scaler = torch.cuda.amp.GradScaler()
def _remove_unused_columns(self, dataset: "nlp.Dataset", description: Optional[str] = None):
    """
    Restrict ``dataset`` to the columns that ``self.model.forward`` accepts.

    The dataset is modified in place through ``dataset.set_format``; nothing is returned. Columns named
    ``label`` or ``label_ids`` are always kept in addition to the ``forward`` arguments.

    Args:
        dataset (:obj:`nlp.Dataset`):
            The dataset to filter. Only its ``column_names`` attribute and ``set_format`` method are used.
        description (:obj:`str`, `optional`):
            Name of the split (for instance ``"training"``), only used in the log message.
    """
    # Inspect model forward signature to keep only the arguments it accepts.
    signature = inspect.signature(self.model.forward)
    signature_columns = list(signature.parameters.keys())
    # Labels may be named label or label_ids, the default data collator handles that.
    signature_columns += ["label", "label_ids"]
    accepted = set(signature_columns)
    columns = [k for k in signature_columns if k in dataset.column_names]
    # Walk the dataset's own columns (instead of a set difference) so the order
    # reported in the log is deterministic.
    ignored_columns = [k for k in dataset.column_names if k not in accepted]
    # Only log when something was actually dropped; otherwise the message would
    # claim columns were ignored and then list none.
    if ignored_columns:
        dset_description = "" if description is None else f"in the {description} set "
        logger.info(
            f"The following columns {dset_description}don't have a corresponding argument in "
            f"`{self.model.__class__.__name__}.forward` and have been ignored: {', '.join(ignored_columns)}."
        )
    dataset.set_format(columns=columns)
def _get_train_sampler(self) -> Optional[torch.utils.data.sampler.Sampler]: def _get_train_sampler(self) -> Optional[torch.utils.data.sampler.Sampler]:
if isinstance(self.train_dataset, torch.utils.data.IterableDataset): if isinstance(self.train_dataset, torch.utils.data.IterableDataset):
return None return None
...@@ -287,11 +311,13 @@ class Trainer: ...@@ -287,11 +311,13 @@ class Trainer:
Args: Args:
eval_dataset (:obj:`torch.utils.data.dataset.Dataset`, `optional`): eval_dataset (:obj:`torch.utils.data.dataset.Dataset`, `optional`):
If provided, will override :obj:`self.eval_dataset`. If provided, will override :obj:`self.eval_dataset`. If it is an :obj:`nlp.Dataset`, columns not
accepted by the ``model.forward()`` method are automatically removed.
""" """
if eval_dataset is None and self.eval_dataset is None: if eval_dataset is None and self.eval_dataset is None:
raise ValueError("Trainer: evaluation requires an eval_dataset.") raise ValueError("Trainer: evaluation requires an eval_dataset.")
elif eval_dataset is not None and is_nlp_available() and isinstance(eval_dataset, nlp.Dataset):
self._remove_unused_columns(eval_dataset, description="evaluation")
eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset
eval_sampler = self._get_eval_sampler(eval_dataset) eval_sampler = self._get_eval_sampler(eval_dataset)
...@@ -314,8 +340,11 @@ class Trainer: ...@@ -314,8 +340,11 @@ class Trainer:
Args: Args:
eval_dataset (:obj:`torch.utils.data.dataset.Dataset`, `optional`): eval_dataset (:obj:`torch.utils.data.dataset.Dataset`, `optional`):
The test dataset to use. The test dataset to use. If it is an :obj:`nlp.Dataset`, columns not accepted by the
``model.forward()`` method are automatically removed.
""" """
if is_nlp_available() and isinstance(test_dataset, nlp.Dataset):
self._remove_unused_columns(test_dataset, description="test")
test_sampler = self._get_eval_sampler(test_dataset) test_sampler = self._get_eval_sampler(test_dataset)
# We use the same batch_size as for eval. # We use the same batch_size as for eval.
...@@ -903,7 +932,8 @@ class Trainer: ...@@ -903,7 +932,8 @@ class Trainer:
Args: Args:
eval_dataset (:obj:`Dataset`, `optional`): eval_dataset (:obj:`Dataset`, `optional`):
Pass a dataset if you wish to override :obj:`self.eval_dataset`. Pass a dataset if you wish to override :obj:`self.eval_dataset`. If it is an :obj:`nlp.Dataset`,
columns not accepted by the ``model.forward()`` method are automatically removed.
Returns: Returns:
A dictionary containing the evaluation loss and the potential metrics computed from the predictions. A dictionary containing the evaluation loss and the potential metrics computed from the predictions.
...@@ -929,7 +959,8 @@ class Trainer: ...@@ -929,7 +959,8 @@ class Trainer:
Args: Args:
test_dataset (:obj:`Dataset`): test_dataset (:obj:`Dataset`):
Dataset to run the predictions on. Dataset to run the predictions on. If it is an :obj:`nlp.Dataset`, columns not accepted by the
``model.forward()`` method are automatically removed.
Returns: Returns:
`NamedTuple`: `NamedTuple`:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment