import dataclasses
import json
import os
import warnings
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Dict, List, Optional, Tuple

from .file_utils import cached_property, is_torch_available, is_torch_tpu_available, torch_required
from .trainer_utils import EvaluationStrategy
from .utils import logging


if is_torch_available():
    import torch

if is_torch_tpu_available():
    import torch_xla.core.xla_model as xm


logger = logging.get_logger(__name__)


def default_logdir() -> str:
    """
    Same default as PyTorch
    """
    import socket
    from datetime import datetime

    current_time = datetime.now().strftime("%b%d_%H-%M-%S")
    return os.path.join("runs", current_time + "_" + socket.gethostname())
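
# For example, on a host named "myhost" (hypothetical), a call made at 14:33:22 on Oct 5
# would return "runs/Oct05_14-33-22_myhost".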


@dataclass
class TrainingArguments:
    """
    TrainingArguments is the subset of the arguments we use in our example scripts
    **which relate to the training loop itself**.

    Using :class:`~transformers.HfArgumentParser` we can turn this class
    into argparse arguments to be able to specify them on the command line.
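
    A minimal usage sketch (the argument values below are purely illustrative)::

        from transformers import HfArgumentParser, TrainingArguments

        parser = HfArgumentParser(TrainingArguments)
        # parse_args_into_dataclasses returns one instance per dataclass passed to the parser
        training_args, = parser.parse_args_into_dataclasses(
            args=["--output_dir", "./out", "--num_train_epochs", "3"]
        )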

    Parameters:
        output_dir (:obj:`str`):
            The output directory where the model predictions and checkpoints will be written.
        overwrite_output_dir (:obj:`bool`, `optional`, defaults to :obj:`False`):
            If :obj:`True`, overwrite the content of the output directory. Use this to continue training if
            :obj:`output_dir` points to a checkpoint directory.
        do_train (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether to run training or not.
        do_eval (:obj:`bool`, `optional`):
            Whether to run evaluation on the dev set or not. Will default to :obj:`True` if
            :obj:`evaluation_strategy` is different from :obj:`"no"`.
        do_predict (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether to run predictions on the test set or not.
        evaluation_strategy (:obj:`str` or :class:`~transformers.trainer_utils.EvaluationStrategy`, `optional`, defaults to :obj:`"no"`):
            The evaluation strategy to adopt during training. Possible values are:

                * :obj:`"no"`: No evaluation is done during training.
                * :obj:`"steps"`: Evaluation is done (and logged) every :obj:`eval_steps`.
                * :obj:`"epoch"`: Evaluation is done at the end of each epoch.

        prediction_loss_only (:obj:`bool`, `optional`, defaults to :obj:`False`):
            When performing evaluation and predictions, only returns the loss.
        per_device_train_batch_size (:obj:`int`, `optional`, defaults to 8):
            The batch size per GPU/TPU core/CPU for training.
        per_device_eval_batch_size (:obj:`int`, `optional`, defaults to 8):
            The batch size per GPU/TPU core/CPU for evaluation.
        gradient_accumulation_steps (:obj:`int`, `optional`, defaults to 1):
            Number of update steps to accumulate the gradients for, before performing a backward/update pass.

            .. warning::

                When using gradient accumulation, one step is counted as one step with a backward pass. Therefore,
                logging, evaluation and saving will be conducted every ``gradient_accumulation_steps * xxx_steps``
                training examples (for instance, with ``gradient_accumulation_steps=2`` and ``logging_steps=500``,
                a log entry is written every 1,000 training examples).

        learning_rate (:obj:`float`, `optional`, defaults to 5e-5):
            The initial learning rate for Adam.
        weight_decay (:obj:`float`, `optional`, defaults to 0):
            The weight decay to apply (if not zero).
        adam_epsilon (:obj:`float`, `optional`, defaults to 1e-8):
            Epsilon for the Adam optimizer.
        max_grad_norm (:obj:`float`, `optional`, defaults to 1.0):
            Maximum gradient norm (for gradient clipping).
        num_train_epochs (:obj:`float`, `optional`, defaults to 3.0):
            Total number of training epochs to perform (if not an integer, the decimal part is treated as the
            fraction of the last epoch to perform before stopping training).
        max_steps (:obj:`int`, `optional`, defaults to -1):
            If set to a positive number, the total number of training steps to perform. Overrides
            :obj:`num_train_epochs`.
        warmup_steps (:obj:`int`, `optional`, defaults to 0):
            Number of steps used for a linear warmup from 0 to :obj:`learning_rate`.
        logging_dir (:obj:`str`, `optional`):
            Tensorboard log directory. Will default to `runs/**CURRENT_DATETIME_HOSTNAME**`.
        logging_first_step (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether to log and evaluate the first :obj:`global_step` or not.
        logging_steps (:obj:`int`, `optional`, defaults to 500):
            Number of update steps between two logs.
        save_steps (:obj:`int`, `optional`, defaults to 500):
            Number of update steps between two checkpoint saves.
        save_total_limit (:obj:`int`, `optional`):
            If a value is passed, will limit the total number of checkpoints. Deletes the older checkpoints in
            :obj:`output_dir`.
        no_cuda (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether to avoid using CUDA even when it is available.
        seed (:obj:`int`, `optional`, defaults to 42):
            Random seed for initialization.
        fp16 (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether to use 16-bit (mixed) precision training (through NVIDIA apex) instead of 32-bit training.
        fp16_opt_level (:obj:`str`, `optional`, defaults to 'O1'):
            For :obj:`fp16` training, apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. See details
            on the `apex documentation <https://nvidia.github.io/apex/amp.html>`__.
        local_rank (:obj:`int`, `optional`, defaults to -1):
            During distributed training, the rank of the process.
        tpu_num_cores (:obj:`int`, `optional`):
            When training on TPU, the number of TPU cores (automatically passed by launcher script).
        debug (:obj:`bool`, `optional`, defaults to :obj:`False`):
            When training on TPU, whether to print debug metrics or not.
        dataloader_drop_last (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether to drop the last incomplete batch (if the length of the dataset is not divisible by the batch size)
            or not.
        eval_steps (:obj:`int`, `optional`):
            Number of update steps between two evaluations if :obj:`evaluation_strategy="steps"`. Will default to the
            same value as :obj:`logging_steps` if not set.
        dataloader_num_workers (:obj:`int`, `optional`, defaults to 0):
            Number of subprocesses to use for data loading (PyTorch only). 0 means that the data will be loaded in the main process.
        past_index (:obj:`int`, `optional`, defaults to -1):
            Some models like :doc:`TransformerXL <../model_doc/transformerxl>` or :doc:`XLNet <../model_doc/xlnet>` can
            make use of the past hidden states for their predictions. If this argument is set to a positive int, the
            ``Trainer`` will use the corresponding output (usually index 2) as the past state and feed it to the model
            at the next training step under the keyword argument ``mems``.
        run_name (:obj:`str`, `optional`):
            A descriptor for the run. Notably used for wandb logging.
        disable_tqdm (:obj:`bool`, `optional`):
            Whether or not to disable the tqdm progress bars. Will default to :obj:`True` if the logging level is set
            to warn or lower (default), :obj:`False` otherwise.
        remove_unused_columns (:obj:`bool`, `optional`, defaults to :obj:`True`):
            If using `nlp.Dataset` datasets, whether or not to automatically remove the columns unused by the model
            forward method.

            (Note: this behavior is not implemented for :class:`~transformers.TFTrainer` yet.)
        label_names (:obj:`List[str]`, `optional`):
            The list of keys in your dictionary of inputs that correspond to the labels.

            Will eventually default to :obj:`["labels"]` except if the model used is one of the
            :obj:`XxxForQuestionAnswering` in which case it will default to
            :obj:`["start_positions", "end_positions"]`.
        load_best_model_at_end (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to load the best model found during training at the end of training.

            .. note::

                When set to :obj:`True`, the parameter :obj:`save_steps` will be ignored and the model will be saved
                after each evaluation.
        metric_for_best_model (:obj:`str`, `optional`):
            Use in conjunction with :obj:`load_best_model_at_end` to specify the metric to use to compare two different
            models. Must be the name of a metric returned by the evaluation with or without the prefix :obj:`"eval_"`.
            Will default to :obj:`"loss"` if unspecified and :obj:`load_best_model_at_end=True` (to use the evaluation
            loss).

            If you set this value, :obj:`greater_is_better` will default to :obj:`True`. Don't forget to set it to
            :obj:`False` if your metric is better when lower.
        greater_is_better (:obj:`bool`, `optional`):
            Use in conjunction with :obj:`load_best_model_at_end` and :obj:`metric_for_best_model` to specify if better
            models should have a greater metric or not. Will default to:

            - :obj:`True` if :obj:`metric_for_best_model` is set to a value that isn't :obj:`"loss"` or
              :obj:`"eval_loss"`.
            - :obj:`False` if :obj:`metric_for_best_model` is not set, or set to :obj:`"loss"` or :obj:`"eval_loss"`.
    """

    output_dir: str = field(
        metadata={"help": "The output directory where the model predictions and checkpoints will be written."}
    )
    overwrite_output_dir: bool = field(
        default=False,
        metadata={
            "help": (
                "Overwrite the content of the output directory."
                "Use this to continue training if output_dir points to a checkpoint directory."
            )
        },
    )

    do_train: bool = field(default=False, metadata={"help": "Whether to run training."})
    do_eval: Optional[bool] = field(default=None, metadata={"help": "Whether to run eval on the dev set."})
    do_predict: bool = field(default=False, metadata={"help": "Whether to run predictions on the test set."})
    evaluate_during_training: Optional[bool] = field(
        default=None,
        metadata={"help": "Deprecated, the use of `--evaluation_strategy` is preferred. Run evaluation during training at each logging step."},
    )
    evaluation_strategy: EvaluationStrategy = field(
        default="no",
        metadata={"help": "Run evaluation during training at each logging step."},
    )
    prediction_loss_only: bool = field(
        default=False,
        metadata={"help": "When performing evaluation and predictions, only returns the loss."},
    )

    per_device_train_batch_size: int = field(
        default=8, metadata={"help": "Batch size per GPU/TPU core/CPU for training."}
    )
    per_device_eval_batch_size: int = field(
        default=8, metadata={"help": "Batch size per GPU/TPU core/CPU for evaluation."}
    )

    per_gpu_train_batch_size: Optional[int] = field(
        default=None,
        metadata={
            "help": "Deprecated, the use of `--per_device_train_batch_size` is preferred. "
            "Batch size per GPU/TPU core/CPU for training."
        },
    )
    per_gpu_eval_batch_size: Optional[int] = field(
        default=None,
        metadata={
            "help": "Deprecated, the use of `--per_device_eval_batch_size` is preferred."
            "Batch size per GPU/TPU core/CPU for evaluation."
        },
    )

    gradient_accumulation_steps: int = field(
        default=1,
        metadata={"help": "Number of updates steps to accumulate before performing a backward/update pass."},
    )

    learning_rate: float = field(default=5e-5, metadata={"help": "The initial learning rate for Adam."})
    weight_decay: float = field(default=0.0, metadata={"help": "The weight decay to apply (if not zero)."})
    adam_beta1: float = field(default=0.9, metadata={"help": "Beta1 for Adam optimizer"})
    adam_beta2: float = field(default=0.999, metadata={"help": "Beta2 for Adam optimizer"})
    adam_epsilon: float = field(default=1e-8, metadata={"help": "Epsilon for Adam optimizer."})
    max_grad_norm: float = field(default=1.0, metadata={"help": "Max gradient norm."})

    num_train_epochs: float = field(default=3.0, metadata={"help": "Total number of training epochs to perform."})
    max_steps: int = field(
        default=-1,
        metadata={"help": "If > 0: set total number of training steps to perform. Override num_train_epochs."},
    )
    warmup_steps: int = field(default=0, metadata={"help": "Linear warmup over warmup_steps."})

    logging_dir: Optional[str] = field(default_factory=default_logdir, metadata={"help": "Tensorboard log dir."})
    logging_first_step: bool = field(default=False, metadata={"help": "Log and eval the first global_step"})
    logging_steps: int = field(default=500, metadata={"help": "Log every X updates steps."})
    save_steps: int = field(default=500, metadata={"help": "Save checkpoint every X updates steps."})
    save_total_limit: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "Limit the total amount of checkpoints."
                "Deletes the older checkpoints in the output_dir. Default is unlimited checkpoints"
            )
        },
    )
    no_cuda: bool = field(default=False, metadata={"help": "Do not use CUDA even when it is available"})
    seed: int = field(default=42, metadata={"help": "random seed for initialization"})

    fp16: bool = field(
        default=False,
        metadata={"help": "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"},
    )
    fp16_opt_level: str = field(
        default="O1",
        metadata={
            "help": (
                "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
                "See details at https://nvidia.github.io/apex/amp.html"
            )
        },
    )
    local_rank: int = field(default=-1, metadata={"help": "For distributed training: local_rank"})

    tpu_num_cores: Optional[int] = field(
        default=None, metadata={"help": "TPU: Number of TPU cores (automatically passed by launcher script)"}
    )
    tpu_metrics_debug: bool = field(
        default=False,
        metadata={"help": "Deprecated, the use of `--debug` is preferred. TPU: Whether to print debug metrics"},
    )
    debug: bool = field(default=False, metadata={"help": "Whether to print debug metrics on TPU"})

    dataloader_drop_last: bool = field(
        default=False, metadata={"help": "Drop the last incomplete batch if it is not divisible by the batch size."}
    )
    eval_steps: Optional[int] = field(default=None, metadata={"help": "Run an evaluation every X steps."})
    dataloader_num_workers: int = field(
        default=0,
        metadata={
            "help": "Number of subprocesses to use for data loading (PyTorch only). 0 means that the data will be loaded in the main process."
        },
    )

    past_index: int = field(
        default=-1,
        metadata={"help": "If >=0, uses the corresponding part of the output as the past state for next step."},
    )

    run_name: Optional[str] = field(
        default=None, metadata={"help": "An optional descriptor for the run. Notably used for wandb logging."}
    )
    disable_tqdm: Optional[bool] = field(
        default=None, metadata={"help": "Whether or not to disable the tqdm progress bars."}
    )

    remove_unused_columns: Optional[bool] = field(
        default=True, metadata={"help": "Remove columns not required by the model when using an nlp.Dataset."}
    )
    label_names: Optional[List[str]] = field(
        default=None, metadata={"help": "The list of keys in your dictionary of inputs that correspond to the labels."}
    )

    load_best_model_at_end: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether or not to load the best model found during training at the end of training."},
    )
    metric_for_best_model: Optional[str] = field(
        default=None, metadata={"help": "The metric to use to compare two different models."}
    )
    greater_is_better: Optional[bool] = field(
        default=None, metadata={"help": "Whether the `metric_for_best_model` should be maximized or not."}
    )

    def __post_init__(self):
        if self.disable_tqdm is None:
            self.disable_tqdm = logger.getEffectiveLevel() > logging.WARN
        if self.evaluate_during_training is True:
            self.evaluation_strategy = EvaluationStrategy.STEPS
            warnings.warn(
                "The `evaluate_during_training` argument is deprecated in favor of `evaluation_strategy` (which has more options)",
                FutureWarning,
            )
        self.evaluation_strategy = EvaluationStrategy(self.evaluation_strategy)
        if self.do_eval is False and self.evaluation_strategy != EvaluationStrategy.NO:
            self.do_eval = True
        if self.eval_steps is None:
            self.eval_steps = self.logging_steps

        if self.load_best_model_at_end and self.metric_for_best_model is None:
            self.metric_for_best_model = "loss"
        if self.greater_is_better is None and self.metric_for_best_model is not None:
            self.greater_is_better = self.metric_for_best_model not in ["loss", "eval_loss"]
        if self.run_name is None:
            self.run_name = self.output_dir
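
    # Illustrative sketch of the defaulting performed above (values assume the field defaults):
    #
    #     args = TrainingArguments(output_dir="out", evaluation_strategy="steps")
    #     args.evaluation_strategy  # EvaluationStrategy.STEPS, coerced from the string
    #     args.eval_steps           # 500, inherited from logging_steps
    #     args.run_name             # "out", falls back to output_dir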

    @property
    def train_batch_size(self) -> int:
        """
        The actual batch size for training (may differ from :obj:`per_gpu_train_batch_size` in distributed training).
        """
        if self.per_gpu_train_batch_size:
            logger.warning(
                "Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future "
                "version. Using `--per_device_train_batch_size` is preferred."
            )
        per_device_batch_size = self.per_gpu_train_batch_size or self.per_device_train_batch_size
        return per_device_batch_size * max(1, self.n_gpu)
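
    # For example, with per_device_train_batch_size=8 on a machine where n_gpu == 4 (a single
    # process using nn.DataParallel), train_batch_size evaluates to 8 * 4 = 32. Under
    # distributed training each process sees n_gpu == 1 and keeps the per-device value.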

    @property
    def eval_batch_size(self) -> int:
        """
        The actual batch size for evaluation (may differ from :obj:`per_gpu_eval_batch_size` in distributed training).
        """
        if self.per_gpu_eval_batch_size:
            logger.warning(
                "Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future "
                "version. Using `--per_device_eval_batch_size` is preferred."
            )
        per_device_batch_size = self.per_gpu_eval_batch_size or self.per_device_eval_batch_size
        return per_device_batch_size * max(1, self.n_gpu)

    @cached_property
    @torch_required
    def _setup_devices(self) -> Tuple["torch.device", int]:
        logger.info("PyTorch: setting up devices")
        if self.no_cuda:
            device = torch.device("cpu")
            n_gpu = 0
        elif is_torch_tpu_available():
            device = xm.xla_device()
            n_gpu = 0
        elif self.local_rank == -1:
            # if n_gpu is > 1 we'll use nn.DataParallel.
            # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0`
            # Explicitly set CUDA to the first (index 0) CUDA device, otherwise `set_device` will
            # trigger an error that a device index is missing. Index 0 takes into account the
            # GPUs available in the environment, so `CUDA_VISIBLE_DEVICES=1,2` with `cuda:0`
            # will use the first GPU in that env, i.e. GPU#1
            device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
            n_gpu = torch.cuda.device_count()
        else:
            # Here, we'll use torch.distributed.
            # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
            torch.distributed.init_process_group(backend="nccl")
            device = torch.device("cuda", self.local_rank)
            n_gpu = 1

        if device.type == "cuda":
            torch.cuda.set_device(device)

        return device, n_gpu
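
    # Illustrative summary of the branches above:
    #   no_cuda=True                         -> ("cpu", 0)
    #   TPU available                        -> (xla device, 0)
    #   local_rank == -1 with 2 visible GPUs -> ("cuda:0", 2)  # single process, nn.DataParallel
    #   local_rank >= 0 (distributed)        -> ("cuda", local_rank) with n_gpu == 1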

    @property
    @torch_required
    def device(self) -> "torch.device":
        """
        The device used by this process.
        """
        return self._setup_devices[0]

    @property
    @torch_required
    def n_gpu(self):
        """
        The number of GPUs used by this process.

        Note:
            This will only be greater than one when you have multiple GPUs available but are not using distributed
            training. For distributed training, it will always be 1.
        """
        return self._setup_devices[1]

    def to_dict(self):
        """
        Serializes this instance while replacing `Enum` members by their values (for JSON serialization support).
        """
        d = dataclasses.asdict(self)
        for k, v in d.items():
            if isinstance(v, Enum):
                d[k] = v.value
        return d
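
    # Note: because evaluation_strategy is an Enum, to_dict() stores e.g. "steps" rather than
    # EvaluationStrategy.STEPS, which keeps the resulting dict JSON-serializable for
    # to_json_string().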

    def to_json_string(self):
        """
        Serializes this instance to a JSON string.
        """
        return json.dumps(self.to_dict(), indent=2)

    def to_sanitized_dict(self) -> Dict[str, Any]:
        """
        Sanitized serialization to use with TensorBoard's hparams
        """
        d = self.to_dict()
        d = {**d, **{"train_batch_size": self.train_batch_size, "eval_batch_size": self.eval_batch_size}}

        valid_types = [bool, int, float, str]
        if is_torch_available():
            valid_types.append(torch.Tensor)

        return {k: v if type(v) in valid_types else str(v) for k, v in d.items()}
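

# A minimal usage sketch for to_sanitized_dict(), assuming the optional torch.utils.tensorboard
# dependency is installed:
#
#     from torch.utils.tensorboard import SummaryWriter
#
#     args = TrainingArguments(output_dir="out")
#     writer = SummaryWriter(log_dir=args.logging_dir)
#     writer.add_hparams(args.to_sanitized_dict(), metric_dict={})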