import dataclasses
import json
import os
import warnings
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Dict, List, Optional, Tuple

from .file_utils import cached_property, is_torch_available, is_torch_tpu_available, torch_required
from .trainer_utils import EvaluationStrategy
from .utils import logging


if is_torch_available():
    import torch

if is_torch_tpu_available():
    import torch_xla.core.xla_model as xm


logger = logging.get_logger(__name__)


def default_logdir() -> str:
    """
    Same default as PyTorch
    """
    import socket
    from datetime import datetime

    current_time = datetime.now().strftime("%b%d_%H-%M-%S")
    return os.path.join("runs", current_time + "_" + socket.gethostname())


@dataclass
class TrainingArguments:
    """
    TrainingArguments is the subset of the arguments we use in our example scripts **which relate to the training loop
    itself**.

    Using :class:`~transformers.HfArgumentParser` we can turn this class into argparse arguments to be able to specify
    them on the command line.
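
    A minimal usage sketch (the surrounding script is illustrative; :class:`~transformers.HfArgumentParser` and its
    :obj:`parse_args_into_dataclasses` method are what turn the fields below into command-line options)::

        from transformers import HfArgumentParser, TrainingArguments

        parser = HfArgumentParser(TrainingArguments)
        # e.g. invoked as: python train.py --output_dir out --per_device_train_batch_size 16
        training_args = parser.parse_args_into_dataclasses()[0]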

    Parameters:
        output_dir (:obj:`str`):
            The output directory where the model predictions and checkpoints will be written.
        overwrite_output_dir (:obj:`bool`, `optional`, defaults to :obj:`False`):
            If :obj:`True`, overwrite the content of the output directory. Use this to continue training if
            :obj:`output_dir` points to a checkpoint directory.
        do_train (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether to run training or not.
        do_eval (:obj:`bool`, `optional`):
            Whether to run evaluation on the dev set or not. Will default to :obj:`True` if :obj:`evaluation_strategy`
            is different from :obj:`"no"`.
        do_predict (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether to run predictions on the test set or not.
        evaluation_strategy (:obj:`str` or :class:`~transformers.trainer_utils.EvaluationStrategy`, `optional`, defaults to :obj:`"no"`):
            The evaluation strategy to adopt during training. Possible values are:

                * :obj:`"no"`: No evaluation is done during training.
                * :obj:`"steps"`: Evaluation is done (and logged) every :obj:`eval_steps`.
                * :obj:`"epoch"`: Evaluation is done at the end of each epoch.
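
            For example (illustrative values), passing :obj:`evaluation_strategy="steps"` together with
            :obj:`eval_steps=500` runs an evaluation every 500 update steps.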

        prediction_loss_only (:obj:`bool`, `optional`, defaults to :obj:`False`):
            When performing evaluation and predictions, only returns the loss.
        per_device_train_batch_size (:obj:`int`, `optional`, defaults to 8):
            The batch size per GPU/TPU core/CPU for training.
        per_device_eval_batch_size (:obj:`int`, `optional`, defaults to 8):
            The batch size per GPU/TPU core/CPU for evaluation.
        gradient_accumulation_steps (:obj:`int`, `optional`, defaults to 1):
            Number of update steps to accumulate the gradients for, before performing a backward/update pass.

            .. warning::

                When using gradient accumulation, one step is counted as one step with backward pass. Therefore,
                logging, evaluation, save will be conducted every ``gradient_accumulation_steps * xxx_step`` training
                examples.
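
            As an illustrative calculation, with :obj:`per_device_train_batch_size=8`, 2 GPUs and
            :obj:`gradient_accumulation_steps=4`, each optimizer update corresponds to ``8 * 2 * 4 = 64`` training
            examples.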
        eval_accumulation_steps (:obj:`int`, `optional`):
            Number of predictions steps to accumulate the output tensors for, before moving the results to the CPU. If
            left unset, the whole predictions are accumulated on GPU/TPU before being moved to the CPU (faster but
            requires more memory).
        learning_rate (:obj:`float`, `optional`, defaults to 5e-5):
            The initial learning rate for Adam.
        weight_decay (:obj:`float`, `optional`, defaults to 0):
            The weight decay to apply (if not zero).
        adam_epsilon (:obj:`float`, `optional`, defaults to 1e-8):
            Epsilon for the Adam optimizer.
        max_grad_norm (:obj:`float`, `optional`, defaults to 1.0):
            Maximum gradient norm (for gradient clipping).
        num_train_epochs (:obj:`float`, `optional`, defaults to 3.0):
            Total number of training epochs to perform (if not an integer, will perform the decimal part of the last
            epoch before stopping training).
        max_steps (:obj:`int`, `optional`, defaults to -1):
            If set to a positive number, the total number of training steps to perform. Overrides
            :obj:`num_train_epochs`.
        warmup_steps (:obj:`int`, `optional`, defaults to 0):
            Number of steps used for a linear warmup from 0 to :obj:`learning_rate`.
        logging_dir (:obj:`str`, `optional`):
            Tensorboard log directory. Will default to `runs/**CURRENT_DATETIME_HOSTNAME**`.
        logging_first_step (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether to log and evaluate the first :obj:`global_step` or not.
        logging_steps (:obj:`int`, `optional`, defaults to 500):
            Number of update steps between two logs.
        save_steps (:obj:`int`, `optional`, defaults to 500):
            Number of update steps between two checkpoint saves.
        save_total_limit (:obj:`int`, `optional`):
            If a value is passed, will limit the total number of checkpoints. Deletes the older checkpoints in
            :obj:`output_dir`.
        no_cuda (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether to avoid using CUDA even when it is available.
        seed (:obj:`int`, `optional`, defaults to 42):
            Random seed for initialization.
        fp16 (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether to use 16-bit (mixed) precision training (through NVIDIA apex) instead of 32-bit training.
        fp16_opt_level (:obj:`str`, `optional`, defaults to 'O1'):
            For :obj:`fp16` training, apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. See details
            on the `apex documentation <https://nvidia.github.io/apex/amp.html>`__.
        local_rank (:obj:`int`, `optional`, defaults to -1):
            During distributed training, the rank of the process.
        tpu_num_cores (:obj:`int`, `optional`):
            When training on TPU, the number of TPU cores (automatically passed by launcher script).
        debug (:obj:`bool`, `optional`, defaults to :obj:`False`):
            When training on TPU, whether to print debug metrics or not.
        dataloader_drop_last (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether to drop the last incomplete batch (if the length of the dataset is not divisible by the batch size)
            or not.
        eval_steps (:obj:`int`, `optional`):
            Number of update steps between two evaluations if :obj:`evaluation_strategy="steps"`. Will default to the
            same value as :obj:`logging_steps` if not set.
        dataloader_num_workers (:obj:`int`, `optional`, defaults to 0):
            Number of subprocesses to use for data loading (PyTorch only). 0 means that the data will be loaded in the
            main process.
        past_index (:obj:`int`, `optional`, defaults to -1):
            Some models like :doc:`TransformerXL <../model_doc/transformerxl>` or :doc:`XLNet <../model_doc/xlnet>` can
            make use of the past hidden states for their predictions. If this argument is set to a positive int, the
            ``Trainer`` will use the corresponding output (usually index 2) as the past state and feed it to the model
            at the next training step under the keyword argument ``mems``.
        run_name (:obj:`str`, `optional`):
            A descriptor for the run. Notably used for wandb logging.
        disable_tqdm (:obj:`bool`, `optional`):
            Whether or not to disable the tqdm progress bars. Will default to :obj:`True` if the logging level is set
            above ``warn`` (i.e. less verbose output is requested), :obj:`False` otherwise.
        remove_unused_columns (:obj:`bool`, `optional`, defaults to :obj:`True`):
            If using `nlp.Dataset` datasets, whether or not to automatically remove the columns unused by the model
            forward method.

            (Note that this behavior is not implemented for :class:`~transformers.TFTrainer` yet.)
        label_names (:obj:`List[str]`, `optional`):
            The list of keys in your dictionary of inputs that correspond to the labels.

            Will eventually default to :obj:`["labels"]` except if the model used is one of the
            :obj:`XxxForQuestionAnswering` in which case it will default to :obj:`["start_positions",
            "end_positions"]`.
        load_best_model_at_end (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to load the best model found during training at the end of training.

            .. note::

                When set to :obj:`True`, the parameter :obj:`save_steps` will be ignored and the model will be saved
                after each evaluation.
        metric_for_best_model (:obj:`str`, `optional`):
            Use in conjunction with :obj:`load_best_model_at_end` to specify the metric to use to compare two different
            models. Must be the name of a metric returned by the evaluation with or without the prefix :obj:`"eval_"`.
            Will default to :obj:`"loss"` if unspecified and :obj:`load_best_model_at_end=True` (to use the evaluation
            loss).

            If you set this value, :obj:`greater_is_better` will default to :obj:`True`. Don't forget to set it to
            :obj:`False` if your metric is better when lower.
        greater_is_better (:obj:`bool`, `optional`):
            Use in conjunction with :obj:`load_best_model_at_end` and :obj:`metric_for_best_model` to specify if better
            models should have a greater metric or not. Will default to:

            - :obj:`True` if :obj:`metric_for_best_model` is set to a value that isn't :obj:`"loss"` or
              :obj:`"eval_loss"`.
            - :obj:`False` if :obj:`metric_for_best_model` is not set, or set to :obj:`"loss"` or :obj:`"eval_loss"`.
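
            For instance (the metric name is illustrative), setting :obj:`load_best_model_at_end=True` together with
            :obj:`metric_for_best_model="eval_accuracy"` tracks the best checkpoint by ``eval_accuracy``, and
            :obj:`greater_is_better` then defaults to :obj:`True`.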
    """

    output_dir: str = field(
        metadata={"help": "The output directory where the model predictions and checkpoints will be written."}
    )
    overwrite_output_dir: bool = field(
        default=False,
        metadata={
            "help": (
                "Overwrite the content of the output directory. "
                "Use this to continue training if output_dir points to a checkpoint directory."
            )
        },
    )

    do_train: bool = field(default=False, metadata={"help": "Whether to run training."})
    do_eval: bool = field(default=None, metadata={"help": "Whether to run eval on the dev set."})
    do_predict: bool = field(default=False, metadata={"help": "Whether to run predictions on the test set."})
    evaluate_during_training: bool = field(
        default=False,
        metadata={"help": "Run evaluation during training at each logging step."},
    )
    evaluation_strategy: EvaluationStrategy = field(
        default="no",
        metadata={"help": "The evaluation strategy to use."},
    )
    prediction_loss_only: bool = field(
        default=False,
        metadata={"help": "When performing evaluation and predictions, only returns the loss."},
    )

    per_device_train_batch_size: int = field(
        default=8, metadata={"help": "Batch size per GPU/TPU core/CPU for training."}
    )
    per_device_eval_batch_size: int = field(
        default=8, metadata={"help": "Batch size per GPU/TPU core/CPU for evaluation."}
    )

    per_gpu_train_batch_size: Optional[int] = field(
        default=None,
        metadata={
            "help": "Deprecated, the use of `--per_device_train_batch_size` is preferred. "
            "Batch size per GPU/TPU core/CPU for training."
        },
    )
    per_gpu_eval_batch_size: Optional[int] = field(
        default=None,
        metadata={
            "help": "Deprecated, the use of `--per_device_eval_batch_size` is preferred. "
            "Batch size per GPU/TPU core/CPU for evaluation."
        },
    )

    gradient_accumulation_steps: int = field(
        default=1,
        metadata={"help": "Number of updates steps to accumulate before performing a backward/update pass."},
    )
    eval_accumulation_steps: Optional[int] = field(
        default=None,
        metadata={"help": "Number of predictions steps to accumulate before moving the tensors to the CPU."},
    )

    learning_rate: float = field(default=5e-5, metadata={"help": "The initial learning rate for Adam."})
    weight_decay: float = field(default=0.0, metadata={"help": "The weight decay to apply (if not zero)."})
    adam_beta1: float = field(default=0.9, metadata={"help": "Beta1 for Adam optimizer"})
    adam_beta2: float = field(default=0.999, metadata={"help": "Beta2 for Adam optimizer"})
    adam_epsilon: float = field(default=1e-8, metadata={"help": "Epsilon for Adam optimizer."})
    max_grad_norm: float = field(default=1.0, metadata={"help": "Max gradient norm."})

    num_train_epochs: float = field(default=3.0, metadata={"help": "Total number of training epochs to perform."})
    max_steps: int = field(
        default=-1,
        metadata={"help": "If > 0: set total number of training steps to perform. Override num_train_epochs."},
    )
    warmup_steps: int = field(default=0, metadata={"help": "Linear warmup over warmup_steps."})

    logging_dir: Optional[str] = field(default_factory=default_logdir, metadata={"help": "Tensorboard log dir."})
    logging_first_step: bool = field(default=False, metadata={"help": "Log and eval the first global_step"})
    logging_steps: int = field(default=500, metadata={"help": "Log every X update steps."})
    save_steps: int = field(default=500, metadata={"help": "Save checkpoint every X update steps."})
    save_total_limit: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "Limit the total number of checkpoints. "
                "Deletes the older checkpoints in the output_dir. Default is unlimited checkpoints"
            )
        },
    )
    no_cuda: bool = field(default=False, metadata={"help": "Do not use CUDA even when it is available"})
    seed: int = field(default=42, metadata={"help": "random seed for initialization"})

    fp16: bool = field(
        default=False,
        metadata={"help": "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"},
    )
    fp16_opt_level: str = field(
        default="O1",
        metadata={
            "help": (
                "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
                "See details at https://nvidia.github.io/apex/amp.html"
            )
        },
    )
    local_rank: int = field(default=-1, metadata={"help": "For distributed training: local_rank"})

    tpu_num_cores: Optional[int] = field(
        default=None, metadata={"help": "TPU: Number of TPU cores (automatically passed by launcher script)"}
    )
    tpu_metrics_debug: bool = field(
        default=False,
        metadata={"help": "Deprecated, the use of `--debug` is preferred. TPU: Whether to print debug metrics"},
    )
    debug: bool = field(default=False, metadata={"help": "Whether to print debug metrics on TPU"})

    dataloader_drop_last: bool = field(
        default=False, metadata={"help": "Drop the last incomplete batch if it is not divisible by the batch size."}
    )
    eval_steps: int = field(default=None, metadata={"help": "Run an evaluation every X steps."})
    dataloader_num_workers: int = field(
        default=0,
        metadata={
            "help": "Number of subprocesses to use for data loading (PyTorch only). 0 means that the data will be loaded in the main process."
        },
    )

    past_index: int = field(
        default=-1,
        metadata={"help": "If >=0, uses the corresponding part of the output as the past state for next step."},
    )

    run_name: Optional[str] = field(
        default=None, metadata={"help": "An optional descriptor for the run. Notably used for wandb logging."}
    )
    disable_tqdm: Optional[bool] = field(
        default=None, metadata={"help": "Whether or not to disable the tqdm progress bars."}
    )

    remove_unused_columns: Optional[bool] = field(
        default=True, metadata={"help": "Remove columns not required by the model when using an nlp.Dataset."}
    )
    label_names: Optional[List[str]] = field(
        default=None, metadata={"help": "The list of keys in your dictionary of inputs that correspond to the labels."}
    )

    load_best_model_at_end: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether or not to load the best model found during training at the end of training."},
    )
    metric_for_best_model: Optional[str] = field(
        default=None, metadata={"help": "The metric to use to compare two different models."}
    )
    greater_is_better: Optional[bool] = field(
        default=None, metadata={"help": "Whether the `metric_for_best_model` should be maximized or not."}
    )

    def __post_init__(self):
        if self.disable_tqdm is None:
            self.disable_tqdm = logger.getEffectiveLevel() > logging.WARN
        if self.evaluate_during_training is True:
            self.evaluation_strategy = EvaluationStrategy.STEPS
            warnings.warn(
                "The `evaluate_during_training` argument is deprecated in favor of `evaluation_strategy` (which has more options)",
                FutureWarning,
            )
        self.evaluation_strategy = EvaluationStrategy(self.evaluation_strategy)
        if self.do_eval is False and self.evaluation_strategy != EvaluationStrategy.NO:
            self.do_eval = True
        if self.eval_steps is None:
            self.eval_steps = self.logging_steps

        if self.load_best_model_at_end and self.metric_for_best_model is None:
            self.metric_for_best_model = "loss"
        if self.greater_is_better is None and self.metric_for_best_model is not None:
            self.greater_is_better = self.metric_for_best_model not in ["loss", "eval_loss"]
        if self.run_name is None:
            self.run_name = self.output_dir

        if is_torch_available() and self.device.type != "cuda" and self.fp16:
            raise ValueError("AMP (`--fp16`) can only be used on CUDA devices.")

    @property
    def train_batch_size(self) -> int:
        """
        The actual batch size for training (may differ from :obj:`per_gpu_train_batch_size` in distributed training).
        """
        if self.per_gpu_train_batch_size:
            logger.warning(
                "Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future "
                "version. Using `--per_device_train_batch_size` is preferred."
            )
        per_device_batch_size = self.per_gpu_train_batch_size or self.per_device_train_batch_size
        return per_device_batch_size * max(1, self.n_gpu)
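
    # Illustrative example (not part of the API): with per_device_train_batch_size=8 on a machine with 4 visible GPUs
    # and no distributed launcher (local_rank == -1), train_batch_size evaluates to 8 * 4 = 32; in distributed mode
    # each process reports n_gpu == 1, so it stays 8 per process.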

    @property
    def eval_batch_size(self) -> int:
        """
        The actual batch size for evaluation (may differ from :obj:`per_gpu_eval_batch_size` in distributed training).
        """
        if self.per_gpu_eval_batch_size:
            logger.warning(
                "Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future "
                "version. Using `--per_device_eval_batch_size` is preferred."
            )
        per_device_batch_size = self.per_gpu_eval_batch_size or self.per_device_eval_batch_size
        return per_device_batch_size * max(1, self.n_gpu)

    @cached_property
    @torch_required
    def _setup_devices(self) -> Tuple["torch.device", int]:
        logger.info("PyTorch: setting up devices")
        if self.no_cuda:
            device = torch.device("cpu")
            n_gpu = 0
        elif is_torch_tpu_available():
            device = xm.xla_device()
            n_gpu = 0
        elif self.local_rank == -1:
            # if n_gpu is > 1 we'll use nn.DataParallel.
            # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0`
            # Explicitly set CUDA to the first (index 0) CUDA device, otherwise `set_device` will
            # trigger an error that a device index is missing. Index 0 takes into account the
            # GPUs available in the environment, so `CUDA_VISIBLE_DEVICES=1,2` with `cuda:0`
            # will use the first GPU in that env, i.e. GPU#1
            device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
            n_gpu = torch.cuda.device_count()
        else:
            # Here, we'll use torch.distributed.
            # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
            torch.distributed.init_process_group(backend="nccl")
            device = torch.device("cuda", self.local_rank)
            n_gpu = 1

        if device.type == "cuda":
            torch.cuda.set_device(device)

        return device, n_gpu

    @property
    @torch_required
    def device(self) -> "torch.device":
        """
        The device used by this process.
        """
        return self._setup_devices[0]

    @property
    @torch_required
    def n_gpu(self):
        """
        The number of GPUs used by this process.

        Note:
            This will only be greater than one when you have multiple GPUs available but are not using distributed
            training. For distributed training, it will always be 1.
        """
        return self._setup_devices[1]

    def to_dict(self):
        """
        Serializes this instance while replacing `Enum` by their values (for JSON serialization support).
        """
        d = dataclasses.asdict(self)
        for k, v in d.items():
            if isinstance(v, Enum):
                d[k] = v.value
        return d
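
    # Illustrative example (not part of the API): TrainingArguments(output_dir="out").to_dict()["evaluation_strategy"]
    # is the plain string "no" rather than an EvaluationStrategy member, which is what keeps the result
    # JSON-serializable.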

    def to_json_string(self):
        """
        Serializes this instance to a JSON string.
        """
        return json.dumps(self.to_dict(), indent=2)

    def to_sanitized_dict(self) -> Dict[str, Any]:
        """
        Sanitized serialization to use with TensorBoard's hparams
        """
        d = self.to_dict()
        d = {**d, **{"train_batch_size": self.train_batch_size, "eval_batch_size": self.eval_batch_size}}

        valid_types = [bool, int, float, str]
        if is_torch_available():
            valid_types.append(torch.Tensor)

        return {k: v if type(v) in valid_types else str(v) for k, v in d.items()}