"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "1073a2bde5d608f9891d6da6df7b63921dca1b71"
Unverified Commit 87716a6d authored by Sylvain Gugger's avatar Sylvain Gugger Committed by GitHub
Browse files

Documentation for the Trainer API (#5383)

* Documentation for the Trainer API

* Address review comments

* Address comments
parent c4d4e8bd
...@@ -173,6 +173,7 @@ conversion utilities for the following models: ...@@ -173,6 +173,7 @@ conversion utilities for the following models:
main_classes/pipelines main_classes/pipelines
main_classes/optimizer_schedules main_classes/optimizer_schedules
main_classes/processors main_classes/processors
main_classes/trainer
model_doc/auto model_doc/auto
model_doc/encoderdecoder model_doc/encoderdecoder
model_doc/bert model_doc/bert
......
Trainer
----------
The :class:`~transformers.Trainer` and :class:`~transformers.TFTrainer` classes provide an API for feature-complete
training in most standard use cases. It's used in most of the :doc:`example scripts <../examples>`.
Before instantiating your :class:`~transformers.Trainer`/:class:`~transformers.TFTrainer`, create a
:class:`~transformers.TrainingArguments`/:class:`~transformers.TFTrainingArguments` to access all the points of
customization during training.
The API supports distributed training on multiple GPUs/TPUs, mixed precision through `NVIDIA Apex
<https://github.com/NVIDIA/apex>`__ for PyTorch and :obj:`tf.keras.mixed_precision` for TensorFlow.
``Trainer``
~~~~~~~~~~~
.. autoclass:: transformers.Trainer
:members:
``TFTrainer``
~~~~~~~~~~~~~
.. autoclass:: transformers.TFTrainer
:members:
``TrainingArguments``
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TrainingArguments
:members:
``TFTrainingArguments``
~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFTrainingArguments
:members:
Utilities
~~~~~~~~~
.. autoclass:: transformers.EvalPrediction
.. autofunction:: transformers.set_seed
.. autofunction:: transformers.torch_distributed_zero_first
...@@ -397,7 +397,7 @@ if is_torch_available(): ...@@ -397,7 +397,7 @@ if is_torch_available():
) )
# Trainer # Trainer
from .trainer import Trainer, set_seed, torch_distributed_zero_first, EvalPrediction from .trainer import Trainer, set_seed, torch_distributed_zero_first
from .data.data_collator import default_data_collator, DataCollator, DataCollatorForLanguageModeling from .data.data_collator import default_data_collator, DataCollator, DataCollatorForLanguageModeling
from .data.datasets import GlueDataset, TextDataset, LineByLineTextDataset, GlueDataTrainingArguments from .data.datasets import GlueDataset, TextDataset, LineByLineTextDataset, GlueDataTrainingArguments
......
...@@ -61,6 +61,12 @@ logger = logging.getLogger(__name__) ...@@ -61,6 +61,12 @@ logger = logging.getLogger(__name__)
def set_seed(seed: int): def set_seed(seed: int):
"""
Helper function for reproducible behavior to set the seed in ``random``, ``numpy`` and ``torch``.
Args:
seed (:obj:`int`): The seed to set.
"""
random.seed(seed) random.seed(seed)
np.random.seed(seed) np.random.seed(seed)
torch.manual_seed(seed) torch.manual_seed(seed)
...@@ -72,6 +78,9 @@ def set_seed(seed: int): ...@@ -72,6 +78,9 @@ def set_seed(seed: int):
def torch_distributed_zero_first(local_rank: int): def torch_distributed_zero_first(local_rank: int):
""" """
Decorator to make all processes in distributed training wait for each local_master to do something. Decorator to make all processes in distributed training wait for each local_master to do something.
Args:
local_rank (:obj:`int`): The rank of the local process.
""" """
if local_rank not in [-1, 0]: if local_rank not in [-1, 0]:
torch.distributed.barrier() torch.distributed.barrier()
...@@ -133,7 +142,31 @@ def get_tpu_sampler(dataset: Dataset): ...@@ -133,7 +142,31 @@ def get_tpu_sampler(dataset: Dataset):
class Trainer: class Trainer:
""" """
Trainer is a simple but feature-complete training and eval loop for PyTorch, Trainer is a simple but feature-complete training and eval loop for PyTorch,
optimized for Transformers. optimized for 🤗 Transformers.
Args:
model (:class:`~transformers.PreTrainedModel`):
The model to train, evaluate or use for predictions.
args (:class:`~transformers.TrainingArguments`):
The arguments to tweak training.
data_collator (:obj:`DataCollator`, `optional`, defaults to :func:`~transformers.default_data_collator`):
The function to use to form a batch from a list of elements of :obj:`train_dataset` or
:obj:`eval_dataset`.
train_dataset (:obj:`Dataset`, `optional`):
The dataset to use for training.
eval_dataset (:obj:`Dataset`, `optional`):
The dataset to use for evaluation.
compute_metrics (:obj:`Callable[[EvalPrediction], Dict]`, `optional`):
The function that will be used to compute metrics at evaluation. Must take a
:class:`~transformers.EvalPrediction` and return a dictionary mapping strings to metric values.
prediction_loss_only (:obj:`bool`, `optional`, defaults to `False`):
When performing evaluation and predictions, only returns the loss.
tb_writer (:obj:`SummaryWriter`, `optional`):
Object to write to TensorBoard.
optimizers (:obj:`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, `optional`):
A tuple containing the optimizer and the scheduler to use. Will default to an instance of
:class:`~transformers.AdamW` on your model and a scheduler given by
:func:`~transformers.get_linear_schedule_with_warmup` controlled by :obj:`args`.
""" """
model: PreTrainedModel model: PreTrainedModel
...@@ -160,14 +193,6 @@ class Trainer: ...@@ -160,14 +193,6 @@ class Trainer:
tb_writer: Optional["SummaryWriter"] = None, tb_writer: Optional["SummaryWriter"] = None,
optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = None, optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = None,
): ):
"""
Trainer is a simple but feature-complete training and eval loop for PyTorch,
optimized for Transformers.
Args:
prediction_loss_only:
(Optional) in evaluation and prediction, only return the loss
"""
self.model = model.to(args.device) self.model = model.to(args.device)
self.args = args self.args = args
self.data_collator = data_collator if data_collator is not None else default_data_collator self.data_collator = data_collator if data_collator is not None else default_data_collator
...@@ -210,6 +235,9 @@ class Trainer: ...@@ -210,6 +235,9 @@ class Trainer:
) )
def get_train_dataloader(self) -> DataLoader: def get_train_dataloader(self) -> DataLoader:
"""
Returns the training :class:`~torch.utils.data.DataLoader`.
"""
if self.train_dataset is None: if self.train_dataset is None:
raise ValueError("Trainer: training requires a train_dataset.") raise ValueError("Trainer: training requires a train_dataset.")
if is_torch_tpu_available(): if is_torch_tpu_available():
...@@ -232,6 +260,13 @@ class Trainer: ...@@ -232,6 +260,13 @@ class Trainer:
return data_loader return data_loader
def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoader: def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoader:
"""
Returns the evaluation :class:`~torch.utils.data.DataLoader`.
Args:
eval_dataset (:obj:`Dataset`, `optional`):
If provided, will override `self.eval_dataset`.
"""
if eval_dataset is None and self.eval_dataset is None: if eval_dataset is None and self.eval_dataset is None:
raise ValueError("Trainer: evaluation requires an eval_dataset.") raise ValueError("Trainer: evaluation requires an eval_dataset.")
...@@ -257,6 +292,12 @@ class Trainer: ...@@ -257,6 +292,12 @@ class Trainer:
return data_loader return data_loader
def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader: def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader:
"""
Returns the test :class:`~torch.utils.data.DataLoader`.
Args:
test_dataset (:obj:`Dataset`): The test dataset to use.
"""
# We use the same batch_size as for eval. # We use the same batch_size as for eval.
if is_torch_tpu_available(): if is_torch_tpu_available():
sampler = SequentialDistributedSampler( sampler = SequentialDistributedSampler(
...@@ -283,9 +324,8 @@ class Trainer: ...@@ -283,9 +324,8 @@ class Trainer:
""" """
Setup the optimizer and the learning rate scheduler. Setup the optimizer and the learning rate scheduler.
We provide a reasonable default that works well. We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
If you want to use something else, you can pass a tuple in the Trainer's init, Trainer's init through :obj:`optimizers`, or override this method in a subclass.
or override this method in a subclass.
""" """
if self.optimizers is not None: if self.optimizers is not None:
return self.optimizers return self.optimizers
...@@ -336,7 +376,7 @@ class Trainer: ...@@ -336,7 +376,7 @@ class Trainer:
def num_examples(self, dataloader: DataLoader) -> int: def num_examples(self, dataloader: DataLoader) -> int:
""" """
Helper to get num of examples from a DataLoader, by accessing its Dataset. Helper to get number of samples in a :class:`~torch.utils.data.DataLoader` by accessing its Dataset.
""" """
return len(dataloader.dataset) return len(dataloader.dataset)
...@@ -345,9 +385,9 @@ class Trainer: ...@@ -345,9 +385,9 @@ class Trainer:
Main training entry point. Main training entry point.
Args: Args:
model_path: model_path (:obj:`str`, `optional`):
(Optional) Local path to model if model to train has been instantiated from a local path Local path to the model if the model to train has been instantiated from a local path. If present,
If present, we will try reloading the optimizer/scheduler states from there. training will resume from the optimizer/scheduler states loaded here.
""" """
train_dataloader = self.get_train_dataloader() train_dataloader = self.get_train_dataloader()
if self.args.max_steps > 0: if self.args.max_steps > 0:
...@@ -611,8 +651,7 @@ class Trainer: ...@@ -611,8 +651,7 @@ class Trainer:
def save_model(self, output_dir: Optional[str] = None): def save_model(self, output_dir: Optional[str] = None):
""" """
Saving best-practices: if you use default names for the model, Will save the model, so you can reload it using :obj:`from_pretrained()`.
you can reload it using from_pretrained().
Will only save from the world_master process (unless in TPUs). Will only save from the world_master process (unless in TPUs).
""" """
...@@ -683,22 +722,18 @@ class Trainer: ...@@ -683,22 +722,18 @@ class Trainer:
logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(checkpoint)) logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(checkpoint))
shutil.rmtree(checkpoint) shutil.rmtree(checkpoint)
def evaluate( def evaluate(self, eval_dataset: Optional[Dataset] = None) -> Dict[str, float]:
self, eval_dataset: Optional[Dataset] = None, prediction_loss_only: Optional[bool] = None,
) -> Dict[str, float]:
""" """
Run evaluation and return metrics. Run evaluation and returns metrics.
The calling script will be responsible for providing a method to compute metrics, as they are The calling script will be responsible for providing a method to compute metrics, as they are
task-dependent. task-dependent (pass it to the init :obj:`compute_metrics` argument).
Args: Args:
eval_dataset: (Optional) Pass a dataset if you wish to override eval_dataset (:obj:`Dataset`, `optional`):
the one on the instance. Pass a dataset if you wish to override :obj:`self.eval_dataset`.
Returns: Returns:
A dict containing: A dictionary containing the evaluation loss and the potential metrics computed from the predictions.
- the eval loss
- the potential metrics computed from the predictions
""" """
eval_dataloader = self.get_eval_dataloader(eval_dataset) eval_dataloader = self.get_eval_dataloader(eval_dataset)
...@@ -714,10 +749,22 @@ class Trainer: ...@@ -714,10 +749,22 @@ class Trainer:
def predict(self, test_dataset: Dataset) -> PredictionOutput: def predict(self, test_dataset: Dataset) -> PredictionOutput:
""" """
Run prediction and return predictions and potential metrics. Run prediction and returns predictions and potential metrics.
Depending on the dataset and your use case, your test dataset may contain labels. Depending on the dataset and your use case, your test dataset may contain labels.
In that case, this method will also return metrics, like in evaluate(). In that case, this method will also return metrics, like in :obj:`evaluate()`.
Args:
test_dataset (:obj:`Dataset`):
Dataset to run the predictions on.
Returns:
`NamedTuple`:
predictions (:obj:`np.ndarray`):
The predictions on :obj:`test_dataset`.
label_ids (:obj:`np.ndarray`, `optional`):
The labels (if the dataset contained some).
metrics (:obj:`Dict[str, float]`, `optional`):
The potential dictionary of metrics (if the dataset contained labels).
""" """
test_dataloader = self.get_test_dataloader(test_dataset) test_dataloader = self.get_test_dataloader(test_dataset)
......
...@@ -29,6 +29,34 @@ def set_seed(seed: int): ...@@ -29,6 +29,34 @@ def set_seed(seed: int):
class TFTrainer: class TFTrainer:
"""
TFTrainer is a simple but feature-complete training and eval loop for TensorFlow,
optimized for 🤗 Transformers.
Args:
model (:class:`~transformers.TFPreTrainedModel`):
The model to train, evaluate or use for predictions.
args (:class:`~transformers.TFTrainingArguments`):
The arguments to tweak training.
train_dataset (:class:`~tf.data.Dataset`, `optional`):
The dataset to use for training.
eval_dataset (:class:`~tf.data.Dataset`, `optional`):
The dataset to use for evaluation.
compute_metrics (:obj:`Callable[[EvalPrediction], Dict]`, `optional`):
The function that will be used to compute metrics at evaluation. Must take a
:class:`~transformers.EvalPrediction` and return a dictionary mapping strings to metric values.
prediction_loss_only (:obj:`bool`, `optional`, defaults to `False`):
When performing evaluation and predictions, only returns the loss.
tb_writer (:obj:`tf.summary.SummaryWriter`, `optional`):
Object to write to TensorBoard.
optimizers (:obj:`Tuple[tf.keras.optimizers.Optimizer, tf.keras.optimizers.schedules.LearningRateSchedule]`, `optional`):
A tuple containing the optimizer and the scheduler to use. The optimizer will default to an instance of
:class:`tf.keras.optimizers.Adam` if :obj:`args.weight_decay_rate` is 0 else an instance of
:class:`~transformers.AdamWeightDecay`. The scheduler will default to an instance of
:class:`tf.keras.optimizers.schedules.PolynomialDecay` if :obj:`args.num_warmup_steps` is 0 else
an instance of :class:`~transformers.WarmUp`.
"""
model: TFPreTrainedModel model: TFPreTrainedModel
args: TFTrainingArguments args: TFTrainingArguments
train_dataset: Optional[tf.data.Dataset] train_dataset: Optional[tf.data.Dataset]
...@@ -78,6 +106,9 @@ class TFTrainer: ...@@ -78,6 +106,9 @@ class TFTrainer:
set_seed(self.args.seed) set_seed(self.args.seed)
def get_train_tfdataset(self) -> tf.data.Dataset: def get_train_tfdataset(self) -> tf.data.Dataset:
"""
Returns the training :class:`~tf.data.Dataset`.
"""
if self.train_dataset is None: if self.train_dataset is None:
raise ValueError("Trainer: training requires a train_dataset.") raise ValueError("Trainer: training requires a train_dataset.")
...@@ -101,6 +132,13 @@ class TFTrainer: ...@@ -101,6 +132,13 @@ class TFTrainer:
return self.args.strategy.experimental_distribute_dataset(ds) return self.args.strategy.experimental_distribute_dataset(ds)
def get_eval_tfdataset(self, eval_dataset: Optional[tf.data.Dataset] = None) -> tf.data.Dataset: def get_eval_tfdataset(self, eval_dataset: Optional[tf.data.Dataset] = None) -> tf.data.Dataset:
"""
Returns the evaluation :class:`~tf.data.Dataset`.
Args:
eval_dataset (:class:`~tf.data.Dataset`, `optional`):
If provided, will override `self.eval_dataset`.
"""
if eval_dataset is None and self.eval_dataset is None: if eval_dataset is None and self.eval_dataset is None:
raise ValueError("Trainer: evaluation requires an eval_dataset.") raise ValueError("Trainer: evaluation requires an eval_dataset.")
...@@ -114,6 +152,12 @@ class TFTrainer: ...@@ -114,6 +152,12 @@ class TFTrainer:
return self.args.strategy.experimental_distribute_dataset(ds) return self.args.strategy.experimental_distribute_dataset(ds)
def get_test_tfdataset(self, test_dataset: tf.data.Dataset) -> tf.data.Dataset: def get_test_tfdataset(self, test_dataset: tf.data.Dataset) -> tf.data.Dataset:
"""
Returns a test :class:`~tf.data.Dataset`.
Args:
test_dataset (:class:`~tf.data.Dataset`): The dataset to use.
"""
ds = test_dataset.batch(self.args.eval_batch_size, drop_remainder=self.args.dataloader_drop_last) ds = test_dataset.batch(self.args.eval_batch_size, drop_remainder=self.args.dataloader_drop_last)
return self.args.strategy.experimental_distribute_dataset(ds) return self.args.strategy.experimental_distribute_dataset(ds)
...@@ -124,9 +168,8 @@ class TFTrainer: ...@@ -124,9 +168,8 @@ class TFTrainer:
""" """
Setup the optimizer and the learning rate scheduler. Setup the optimizer and the learning rate scheduler.
We provide a reasonable default that works well. We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
If you want to use something else, you can pass a tuple in the Trainer's init, TFTrainer's init through :obj:`optimizers`, or override this method in a subclass.
or override this method in a subclass.
""" """
if self.optimizers is not None: if self.optimizers is not None:
return self.optimizers return self.optimizers
...@@ -263,11 +306,18 @@ class TFTrainer: ...@@ -263,11 +306,18 @@ class TFTrainer:
logger.info(output) logger.info(output)
def evaluate( def evaluate(self, eval_dataset: Optional[tf.data.Dataset] = None) -> Dict[str, float]:
self, eval_dataset: Optional[tf.data.Dataset] = None, prediction_loss_only: Optional[bool] = None
) -> Dict[str, float]:
""" """
Prediction/evaluation loop, shared by `evaluate()` and `predict()`. Run evaluation and returns metrics.
The calling script will be responsible for providing a method to compute metrics, as they are
task-dependent (pass it to the init :obj:`compute_metrics` argument).
Args:
eval_dataset (:class:`~tf.data.Dataset`, `optional`):
Pass a dataset if you wish to override :obj:`self.eval_dataset`.
Returns:
A dictionary containing the evaluation loss and the potential metrics computed from the predictions.
""" """
eval_ds = self.get_eval_tfdataset(eval_dataset) eval_ds = self.get_eval_tfdataset(eval_dataset)
...@@ -478,12 +528,22 @@ class TFTrainer: ...@@ -478,12 +528,22 @@ class TFTrainer:
def predict(self, test_dataset: tf.data.Dataset) -> PredictionOutput: def predict(self, test_dataset: tf.data.Dataset) -> PredictionOutput:
""" """
Run prediction and return predictions and potential metrics. Run prediction and returns predictions and potential metrics.
Depending on the dataset and your use case, your test dataset may contain labels. Depending on the dataset and your use case, your test dataset may contain labels.
In that case, this method will also return metrics, like in evaluate(). In that case, this method will also return metrics, like in :obj:`evaluate()`.
Args: Args:
test_dataset: something similar to a PT Dataset. This is just test_dataset (:class:`~tf.data.Dataset`):
temporary before to have a framework-agnostic approach for datasets. Dataset to run the predictions on.
Returns:
`NamedTuple`:
predictions (:obj:`np.ndarray`):
The predictions on :obj:`test_dataset`.
label_ids (:obj:`np.ndarray`, `optional`):
The labels (if the dataset contained some).
metrics (:obj:`Dict[str, float]`, `optional`):
The potential dictionary of metrics (if the dataset contained labels).
""" """
test_ds = self.get_test_tfdataset(test_dataset) test_ds = self.get_test_tfdataset(test_dataset)
...@@ -491,7 +551,7 @@ class TFTrainer: ...@@ -491,7 +551,7 @@ class TFTrainer:
def save_model(self, output_dir: Optional[str] = None): def save_model(self, output_dir: Optional[str] = None):
""" """
Save the pretrained model. Will save the model, so you can reload it using :obj:`from_pretrained()`.
""" """
output_dir = output_dir if output_dir is not None else self.args.output_dir output_dir = output_dir if output_dir is not None else self.args.output_dir
......
...@@ -23,8 +23,11 @@ def is_wandb_available(): ...@@ -23,8 +23,11 @@ def is_wandb_available():
class EvalPrediction(NamedTuple): class EvalPrediction(NamedTuple):
""" """
Evaluation output (always contains labels), to be used Evaluation output (always contains labels), to be used to compute metrics.
to compute metrics.
Parameters:
predictions (:obj:`np.ndarray`): Predictions of the model.
label_ids (:obj:`np.ndarray`): Targets to be matched.
""" """
predictions: np.ndarray predictions: np.ndarray
......
...@@ -35,9 +35,73 @@ class TrainingArguments: ...@@ -35,9 +35,73 @@ class TrainingArguments:
TrainingArguments is the subset of the arguments we use in our example scripts TrainingArguments is the subset of the arguments we use in our example scripts
**which relate to the training loop itself**. **which relate to the training loop itself**.
Using `HfArgumentParser` we can turn this class Using :class:`~transformers.HfArgumentParser` we can turn this class
into argparse arguments to be able to specify them on into argparse arguments to be able to specify them on the command line.
the command line.
Parameters:
output_dir (:obj:`str`):
The output directory where the model predictions and checkpoints will be written.
overwrite_output_dir (:obj:`bool`, `optional`, defaults to :obj:`False`):
If :obj:`True`, overwrite the content of the output directory. Use this to continue training if
:obj:`output_dir` points to a checkpoint directory.
do_train (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to run training or not.
do_eval (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to run evaluation on the dev set or not.
do_predict (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to run predictions on the test set or not.
evaluate_during_training (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to run evaluation during training at each logging step or not.
per_device_train_batch_size (:obj:`int`, `optional`, defaults to 8):
The batch size per GPU/TPU core/CPU for training.
per_device_eval_batch_size (:obj:`int`, `optional`, defaults to 8):
The batch size per GPU/TPU core/CPU for evaluation.
gradient_accumulation_steps: (:obj:`int`, `optional`, defaults to 1):
Number of updates steps to accumulate the gradients for, before performing a backward/update pass.
learning_rate (:obj:`float`, `optional`, defaults to 5e-5):
The initial learning rate for Adam.
weight_decay (:obj:`float`, `optional`, defaults to 0):
The weight decay to apply (if not zero).
adam_epsilon (:obj:`float`, `optional`, defaults to 1e-8):
Epsilon for the Adam optimizer.
max_grad_norm (:obj:`float`, `optional`, defaults to 1.0):
Maximum gradient norm (for gradient clipping).
num_train_epochs(:obj:`float`, `optional`, defaults to 3.0):
Total number of training epochs to perform.
max_steps (:obj:`int`, `optional`, defaults to -1):
If set to a positive number, the total number of training steps to perform. Overrides
:obj:`num_train_epochs`.
warmup_steps (:obj:`int`, `optional`, defaults to 0):
Number of steps used for a linear warmup from 0 to :obj:`learning_rate`.
logging_dir (:obj:`str`, `optional`):
Tensorboard log directory. Will default to `runs/**CURRENT_DATETIME_HOSTNAME**`.
logging_first_step (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to log and evaluate the first :obj:`global_step` or not.
logging_steps (:obj:`int`, `optional`, defaults to 500):
Number of update steps between two logs.
save_steps (:obj:`int`, `optional`, defaults to 500):
Number of updates steps before two checkpoint saves.
save_total_limit (:obj:`int`, `optional`):
If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in
:obj:`output_dir`.
no_cuda (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to avoid using CUDA even when it is available.
seed (:obj:`int`, `optional`, defaults to 42):
Random seed for initialization.
fp16 (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to use 16-bit (mixed) precision training (through NVIDIA apex) instead of 32-bit training.
fp16_opt_level (:obj:`str`, `optional`, defaults to 'O1'):
For :obj:`fp16` training, apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. See details
on the `apex documentation <https://nvidia.github.io/apex/amp.html>`__.
local_rank (:obj:`int`, `optional`, defaults to -1):
During distributed training, the rank of the process.
tpu_num_cores (:obj:`int`, `optional`):
When training on TPU, the number of TPU cores (automatically passed by launcher script).
tpu_metrics_debug (:obj:`bool`, `optional`, defaults to :obj:`False`):
When training on TPU, whether to print debug metrics or not.
dataloader_drop_last (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to drop the last incomplete batch (if the length of the dataset is not divisible by the batch size)
or not.
""" """
output_dir: str = field( output_dir: str = field(
...@@ -141,6 +205,9 @@ class TrainingArguments: ...@@ -141,6 +205,9 @@ class TrainingArguments:
@property @property
def train_batch_size(self) -> int: def train_batch_size(self) -> int:
"""
The actual batch size for training (may differ from :obj:`per_gpu_train_batch_size` in distributed training).
"""
if self.per_gpu_train_batch_size: if self.per_gpu_train_batch_size:
logger.warning( logger.warning(
"Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future " "Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future "
...@@ -151,6 +218,9 @@ class TrainingArguments: ...@@ -151,6 +218,9 @@ class TrainingArguments:
@property @property
def eval_batch_size(self) -> int: def eval_batch_size(self) -> int:
"""
The actual batch size for evaluation (may differ from :obj:`per_gpu_eval_batch_size` in distributed training).
"""
if self.per_gpu_eval_batch_size: if self.per_gpu_eval_batch_size:
logger.warning( logger.warning(
"Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future " "Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future "
...@@ -193,11 +263,21 @@ class TrainingArguments: ...@@ -193,11 +263,21 @@ class TrainingArguments:
@property @property
@torch_required @torch_required
def device(self) -> "torch.device": def device(self) -> "torch.device":
"""
The device used by this process.
"""
return self._setup_devices[0] return self._setup_devices[0]
@property @property
@torch_required @torch_required
def n_gpu(self): def n_gpu(self):
"""
The number of GPUs used by this process.
Note:
This will only be greater than one when you have multiple GPUs available but are not using distributed
training. For distributed training, it will always be 1.
"""
return self._setup_devices[1] return self._setup_devices[1]
def to_json_string(self): def to_json_string(self):
......
...@@ -14,6 +14,85 @@ if is_tf_available(): ...@@ -14,6 +14,85 @@ if is_tf_available():
@dataclass @dataclass
class TFTrainingArguments(TrainingArguments): class TFTrainingArguments(TrainingArguments):
"""
TrainingArguments is the subset of the arguments we use in our example scripts
**which relate to the training loop itself**.
Using :class:`~transformers.HfArgumentParser` we can turn this class
into argparse arguments to be able to specify them on the command line.
Parameters:
output_dir (:obj:`str`):
The output directory where the model predictions and checkpoints will be written.
overwrite_output_dir (:obj:`bool`, `optional`, defaults to :obj:`False`):
If :obj:`True`, overwrite the content of the output directory. Use this to continue training if
:obj:`output_dir` points to a checkpoint directory.
do_train (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to run training or not.
do_eval (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to run evaluation on the dev set or not.
do_predict (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to run predictions on the test set or not.
evaluate_during_training (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to run evaluation during training at each logging step or not.
per_device_train_batch_size (:obj:`int`, `optional`, defaults to 8):
The batch size per GPU/TPU core/CPU for training.
per_device_eval_batch_size (:obj:`int`, `optional`, defaults to 8):
The batch size per GPU/TPU core/CPU for evaluation.
gradient_accumulation_steps: (:obj:`int`, `optional`, defaults to 1):
Number of updates steps to accumulate the gradients for, before performing a backward/update pass.
learning_rate (:obj:`float`, `optional`, defaults to 5e-5):
The initial learning rate for Adam.
weight_decay (:obj:`float`, `optional`, defaults to 0):
The weight decay to apply (if not zero).
adam_epsilon (:obj:`float`, `optional`, defaults to 1e-8):
Epsilon for the Adam optimizer.
max_grad_norm (:obj:`float`, `optional`, defaults to 1.0):
Maximum gradient norm (for gradient clipping).
num_train_epochs(:obj:`float`, `optional`, defaults to 3.0):
Total number of training epochs to perform.
max_steps (:obj:`int`, `optional`, defaults to -1):
If set to a positive number, the total number of training steps to perform. Overrides
:obj:`num_train_epochs`.
warmup_steps (:obj:`int`, `optional`, defaults to 0):
Number of steps used for a linear warmup from 0 to :obj:`learning_rate`.
logging_dir (:obj:`str`, `optional`):
Tensorboard log directory. Will default to `runs/**CURRENT_DATETIME_HOSTNAME**`.
logging_first_step (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to log and evaluate the first :obj:`global_step` or not.
logging_steps (:obj:`int`, `optional`, defaults to 500):
Number of update steps between two logs.
save_steps (:obj:`int`, `optional`, defaults to 500):
Number of updates steps before two checkpoint saves.
save_total_limit (:obj:`int`, `optional`):
If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in
:obj:`output_dir`.
no_cuda (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to avoid using CUDA even when it is available.
seed (:obj:`int`, `optional`, defaults to 42):
Random seed for initialization.
fp16 (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to use 16-bit (mixed) precision training (through NVIDIA apex) instead of 32-bit training.
fp16_opt_level (:obj:`str`, `optional`, defaults to 'O1'):
For :obj:`fp16` training, apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. See details
on the `apex documentation <https://nvidia.github.io/apex/amp.html>`__.
local_rank (:obj:`int`, `optional`, defaults to -1):
During distributed training, the rank of the process.
tpu_num_cores (:obj:`int`, `optional`):
When training on TPU, the number of TPU cores (automatically passed by launcher script).
tpu_metrics_debug (:obj:`bool`, `optional`, defaults to :obj:`False`):
When training on TPU, whether to print debug metrics or not.
dataloader_drop_last (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to drop the last incomplete batch (if the length of the dataset is not divisible by the batch size)
or not.
tpu_name (:obj:`str`, `optional`):
The name of the TPU the process is running on.
eval_steps (:obj:`int`, `optional`, defaults to 1000):
Number of update steps before two evaluations.
debug (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to activate the trace to record computation graphs and profiling information or not.
"""
tpu_name: str = field( tpu_name: str = field(
default=None, metadata={"help": "Name of TPU"}, default=None, metadata={"help": "Name of TPU"},
) )
...@@ -59,9 +138,15 @@ class TFTrainingArguments(TrainingArguments): ...@@ -59,9 +138,15 @@ class TFTrainingArguments(TrainingArguments):
@property @property
@tf_required @tf_required
def strategy(self) -> "tf.distribute.Strategy": def strategy(self) -> "tf.distribute.Strategy":
"""
The strategy used for distributed training.
"""
return self._setup_strategy return self._setup_strategy
@property @property
@tf_required @tf_required
def n_gpu(self) -> int: def n_gpu(self) -> int:
"""
The number of replicas (GPUs or TPU cores) used in this training.
"""
return self._setup_strategy.num_replicas_in_sync return self._setup_strategy.num_replicas_in_sync
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment