test_trainer.py 127 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# coding=utf-8
# Copyright 2018 the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

16
import dataclasses
17
import gc
18
import json
19
import math
20
import os
21
import random
Sylvain Gugger's avatar
Sylvain Gugger committed
22
import re
23
import subprocess
24
import sys
25
import tempfile
Julien Chaumond's avatar
Julien Chaumond committed
26
import unittest
27
from itertools import product
28
from pathlib import Path
29
from typing import Dict, List
30
from unittest.mock import Mock, patch
Julien Chaumond's avatar
Julien Chaumond committed
31

Sylvain Gugger's avatar
Sylvain Gugger committed
32
import numpy as np
33
from huggingface_hub import HfFolder, delete_repo, list_repo_commits, list_repo_files
34
from parameterized import parameterized
Sylvain Gugger's avatar
Sylvain Gugger committed
35
from requests.exceptions import HTTPError
36

37
38
39
40
from transformers import (
    AutoTokenizer,
    IntervalStrategy,
    PretrainedConfig,
41
    TrainerCallback,
42
    TrainingArguments,
43
    get_polynomial_decay_schedule_with_warmup,
44
45
46
    is_torch_available,
    logging,
)
47
from transformers.hyperparameter_search import ALL_HYPERPARAMETER_SEARCH_BACKENDS
48
from transformers.testing_utils import (
Sylvain Gugger's avatar
Sylvain Gugger committed
49
    ENDPOINT_STAGING,
50
    TOKEN,
Sylvain Gugger's avatar
Sylvain Gugger committed
51
    USER,
52
    CaptureLogger,
53
    TestCasePlus,
54
    backend_device_count,
55
    execute_subprocess_async,
56
    get_gpu_count,
57
    get_tests_dir,
Sylvain Gugger's avatar
Sylvain Gugger committed
58
    is_staging_test,
Yih-Dar's avatar
Yih-Dar committed
59
    require_accelerate,
60
    require_intel_extension_for_pytorch,
61
    require_optuna,
62
    require_ray,
63
    require_safetensors,
64
    require_sentencepiece,
65
    require_sigopt,
66
    require_tensorboard,
67
68
    require_tokenizers,
    require_torch,
69
70
    require_torch_accelerator,
    require_torch_bf16,
71
    require_torch_gpu,
72
73
    require_torch_multi_accelerator,
    require_torch_non_multi_accelerator,
74
    require_torch_non_multi_gpu,
75
    require_torch_tensorrt_fx,
76
    require_torch_tf32,
77
    require_torch_up_to_2_accelerators,
78
    require_torchdynamo,
79
    require_wandb,
80
    slow,
81
    torch_device,
82
)
83
84
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, HPSearchBackend, get_last_checkpoint
85
from transformers.training_args import OptimizerNames
86
from transformers.utils import (
87
88
    SAFE_WEIGHTS_INDEX_NAME,
    SAFE_WEIGHTS_NAME,
89
90
91
92
    WEIGHTS_INDEX_NAME,
    WEIGHTS_NAME,
    is_apex_available,
    is_bitsandbytes_available,
93
    is_safetensors_available,
94
95
    is_torchdistx_available,
)
96
from transformers.utils.hp_naming import TrialShortNamer
Julien Chaumond's avatar
Julien Chaumond committed
97
98
99
100


if is_torch_available():
    import torch
101
    from torch import nn
102
103
    from torch.utils.data import IterableDataset

104
    import transformers.optimization
Julien Chaumond's avatar
Julien Chaumond committed
105
    from transformers import (
106
        AutoModelForCausalLM,
Julien Chaumond's avatar
Julien Chaumond committed
107
        AutoModelForSequenceClassification,
108
        EarlyStoppingCallback,
Julien Chaumond's avatar
Julien Chaumond committed
109
110
        GlueDataset,
        GlueDataTrainingArguments,
111
112
        GPT2Config,
        GPT2LMHeadModel,
113
        LineByLineTextDataset,
114
        PreTrainedModel,
115
        Trainer,
116
        TrainerState,
Julien Chaumond's avatar
Julien Chaumond committed
117
    )
118
    from transformers.modeling_utils import unwrap_model
Julien Chaumond's avatar
Julien Chaumond committed
119

120
121
122
    if is_safetensors_available():
        import safetensors.torch

Julien Chaumond's avatar
Julien Chaumond committed
123

124
# Location of the sample text fixture, built from the directory returned by `get_tests_dir()`.
PATH_SAMPLE_TEXT = f"{get_tests_dir()}/fixtures/sample_text.txt"
Julien Chaumond's avatar
Julien Chaumond committed
125
126


Sylvain Gugger's avatar
Sylvain Gugger committed
127
class RegressionDataset:
    """
    Synthetic 1-D regression dataset where every target is `a * x + b` plus Gaussian noise.

    Args:
        a: Slope of the underlying line.
        b: Intercept of the underlying line.
        length: Number of samples.
        seed: Seed passed to `np.random.seed` (this reseeds NumPy's global generator).
        label_names: Names of the target columns; defaults to `["labels"]`. One independently-noised target
            array is generated per name.
    """

    def __init__(self, a=2, b=3, length=64, seed=42, label_names=None):
        np.random.seed(seed)
        self.label_names = ["labels"] if label_names is None else label_names
        self.length = length
        self.x = np.random.normal(size=(length,)).astype(np.float32)
        # One noisy target per label name; the noise is drawn in label order, then everything is cast to float32.
        self.ys = [
            (a * self.x + b + np.random.normal(scale=0.1, size=(length,))).astype(np.float32)
            for _ in self.label_names
        ]

    def __len__(self):
        return self.length

    def __getitem__(self, i):
        # Targets come first (keyed by label name), then the input under "input_x".
        result = {name: y[i] for name, y in zip(self.label_names, self.ys)}
        result["input_x"] = self.x[i]
        return result
Sylvain Gugger's avatar
Sylvain Gugger committed
143
144


145
146
147
148
@dataclasses.dataclass
class RegressionTrainingArguments(TrainingArguments):
    # `TrainingArguments` extended with the regression coefficients `a` and `b` and a `keep_report_to`
    # switch controlling whether `report_to` is left as configured.
    a: float = 0.0
    b: float = 0.0
    keep_report_to: bool = False

    def __post_init__(self):
        super().__post_init__()
        # save resources not dealing with reporting unless specified (also avoids the warning when it's not set)
        # can be explicitly disabled via `keep_report_to`
        if not self.keep_report_to:
            self.report_to = []
157

158

159
160
161
162
163
164
165
166
167
168
169
170
class RepeatDataset:
    """Dataset that returns the same object `x` as both `input_ids` and `labels` for every index."""

    def __init__(self, x, length=64):
        self.x, self.length = x, length

    def __len__(self):
        return self.length

    def __getitem__(self, i):
        # The index is ignored: both keys map to the very same stored object.
        return dict.fromkeys(("input_ids", "labels"), self.x)


171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
class DynamicShapesDataset:
    """
    Dataset of variable-length random vectors in which each run of `batch_size` consecutive samples shares
    one size.

    Args:
        length: Number of samples.
        seed: Seed passed to `np.random.seed` (this reseeds NumPy's global generator).
        batch_size: Number of consecutive samples that share the same vector size.
    """

    def __init__(self, length=64, seed=42, batch_size=8):
        self.length = length
        np.random.seed(seed)
        # Ceil-divide so a trailing partial batch also gets a size; flooring here left the last
        # `length % batch_size` indices without data even though `__len__` reported them.
        num_batches = -(-length // batch_size)
        sizes = np.random.randint(1, 20, (num_batches,))
        # For easy batching, we make every batch_size consecutive samples the same size.
        per_sample_sizes = sizes.repeat(batch_size)[:length]
        self.xs = [np.random.normal(size=(s,)) for s in per_sample_sizes]
        self.ys = [np.random.normal(size=(s,)) for s in per_sample_sizes]

    def __len__(self):
        return self.length

    def __getitem__(self, i):
        return {"input_x": self.xs[i], "labels": self.ys[i]}


Sylvain Gugger's avatar
Sylvain Gugger committed
187
188
189
190
191
192
193
194
class AlmostAccuracy:
    """Metric callable: fraction of predictions lying within `thresh` (absolute difference) of their label."""

    def __init__(self, thresh=0.25):
        self.thresh = thresh

    def __call__(self, eval_pred):
        preds, targets = eval_pred
        abs_error = np.abs(preds - targets)
        hits = abs_error <= self.thresh
        accuracy = hits.astype(np.float32).mean().item()
        return {"accuracy": accuracy}
195

Julien Chaumond's avatar
Julien Chaumond committed
196

197
class RegressionModelConfig(PretrainedConfig):
    """
    Configuration for the regression test models defined in this file.

    Args:
        a: Initial value stored for the `a` coefficient.
        b: Initial value stored for the `b` coefficient.
        double_output: Flag stored on the config; the regression models read it to decide whether to return
            their output twice.
        random_torch: Flag stored on the config; read by the random regression model.
        **kwargs: Forwarded to `PretrainedConfig.__init__`.
    """

    def __init__(self, a=0, b=0, double_output=False, random_torch=True, **kwargs):
        super().__init__(**kwargs)
        self.a = a
        self.b = b
        self.double_output = double_output
        self.random_torch = random_torch
        # Always 1: the gradient-checkpointing test model sizes its linear layers from this value.
        self.hidden_size = 1
205
206


207
208
209
if is_torch_available():

    class SampleIterableDataset(IterableDataset):
        """Iterable view over a `RegressionDataset`; every call to `__iter__` yields all samples in order."""

        def __init__(self, a=2, b=3, length=64, seed=42, label_names=None):
            self.dataset = RegressionDataset(a=a, b=b, length=length, seed=seed, label_names=label_names)

        def __iter__(self):
            for i in range(len(self.dataset)):
                yield self.dataset[i]
216

217
218
219
220
221
222
223
224
225
226
    class FiniteIterableDataset(SampleIterableDataset):
        """
        Variant of `SampleIterableDataset` whose read position is stored on the instance, so iteration resumes
        where the previous one stopped and yields nothing more once the underlying dataset has been consumed.
        """

        def __init__(self, a=2, b=3, length=64, seed=42, label_names=None):
            super().__init__(a=a, b=b, length=length, seed=seed, label_names=label_names)
            self.current_sample = 0

        def __iter__(self):
            while True:
                if not self.current_sample < len(self.dataset):
                    return
                yield self.dataset[self.current_sample]
                # Only advance once the consumer asks for the next item.
                self.current_sample += 1

227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
    class MultiLoader:
        """Chains several loaders: length is the sum of their lengths, iteration goes through them in order."""

        def __init__(self, loaders):
            self.loaders = loaders

        def __len__(self):
            total = 0
            for loader in self.loaders:
                total += len(loader)
            return total

        def __iter__(self):
            for loader in self.loaders:
                for batch in loader:
                    yield batch

    class CustomDataloaderTrainer(Trainer):
        """`Trainer` whose train and eval dataloaders are each the parent's dataloader built twice and chained."""

        def get_train_dataloader(self):
            first = super().get_train_dataloader()
            second = super().get_train_dataloader()
            return MultiLoader([first, second])

        def get_eval_dataloader(self, eval_dataset):
            first = super().get_eval_dataloader(eval_dataset)
            second = super().get_eval_dataloader(eval_dataset)
            return MultiLoader([first, second])

247
    class RegressionModel(nn.Module):
        """
        Plain `nn.Module` computing `y = input_x * a + b` with learnable scalars `a` and `b`.

        Args:
            a: Initial value of the slope parameter.
            b: Initial value of the intercept parameter.
            double_output: If `True`, the prediction is returned twice in the output tuple.
        """

        def __init__(self, a=0, b=0, double_output=False):
            super().__init__()
            self.a = nn.Parameter(torch.tensor(a).float())
            self.b = nn.Parameter(torch.tensor(b).float())
            self.double_output = double_output
            self.config = None

        def forward(self, input_x, labels=None, **kwargs):
            """
            Returns `(y,)` (or `(y, y)` with `double_output`) when `labels` is `None`; otherwise the same tuple
            prefixed with the MSE loss between `y` and `labels`. Extra keyword arguments are ignored.
            """
            y = input_x * self.a + self.b
            if labels is None:
                return (y, y) if self.double_output else (y,)
            loss = nn.functional.mse_loss(y, labels)
            return (loss, y, y) if self.double_output else (loss, y)
Sylvain Gugger's avatar
Sylvain Gugger committed
261

262
    class RegressionDictModel(nn.Module):
        """
        Same linear model as `y = input_x * a + b`, but `forward` returns a dict instead of a tuple.

        Args:
            a: Initial value of the slope parameter.
            b: Initial value of the intercept parameter.
        """

        def __init__(self, a=0, b=0):
            super().__init__()
            self.a = nn.Parameter(torch.tensor(a).float())
            self.b = nn.Parameter(torch.tensor(b).float())
            self.config = None

        def forward(self, input_x, labels=None, **kwargs):
            """
            Returns `{"output": y}`, with an additional `"loss"` entry (MSE against `labels`) when `labels` is
            given. Extra keyword arguments are ignored.
            """
            y = input_x * self.a + self.b
            result = {"output": y}
            if labels is not None:
                result["loss"] = nn.functional.mse_loss(y, labels)
            return result

276
277
278
279
280
281
    class RegressionPreTrainedModel(PreTrainedModel):
        """`PreTrainedModel` computing `y = input_x * a + b`, with `a`, `b` and `double_output` read from the config."""

        config_class = RegressionModelConfig
        base_model_prefix = "regression"

        def __init__(self, config):
            super().__init__(config)
            self.a = nn.Parameter(torch.tensor(config.a).float())
            self.b = nn.Parameter(torch.tensor(config.b).float())
            self.double_output = config.double_output

        def forward(self, input_x, labels=None, **kwargs):
            """
            Returns `(y,)` (or `(y, y)` with `double_output`) when `labels` is `None`; otherwise the same tuple
            prefixed with the MSE loss between `y` and `labels`. Extra keyword arguments are ignored.
            """
            y = input_x * self.a + self.b
            if labels is None:
                return (y, y) if self.double_output else (y,)
            loss = nn.functional.mse_loss(y, labels)
            return (loss, y, y) if self.double_output else (loss, y)

293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
    class RegressionPreTrainedModelWithGradientCheckpointing(PreTrainedModel):
        """
        Small stack of linear layers followed by a linear head, declared as supporting gradient checkpointing.

        When the module is in training mode and `self.gradient_checkpointing` is set, each layer is run through
        `self._gradient_checkpointing_func` instead of being called directly.
        """

        config_class = RegressionModelConfig
        base_model_prefix = "regression"
        supports_gradient_checkpointing = True

        def __init__(self, config):
            super().__init__(config)
            self.layers = nn.ModuleList([nn.Linear(config.hidden_size, config.hidden_size) for _ in range(4)])
            self.head = nn.Linear(config.hidden_size, 1)
            self.gradient_checkpointing = False
            self.double_output = config.double_output

        def forward(self, input_x, labels=None, **kwargs):
            """
            Returns `(logits,)` (or `(logits, logits)` with `double_output`) when `labels` is `None`; otherwise
            the same tuple prefixed with the MSE loss between `logits` and `labels`.
            """
            y = input_x.unsqueeze(0)

            for layer in self.layers:
                if self.training and self.gradient_checkpointing:
                    outputs = self._gradient_checkpointing_func(layer.__call__, y)
                else:
                    outputs = layer(y)

                y = outputs * 3

            logits = self.head(y)

            if labels is None:
                return (logits, logits) if self.double_output else (logits,)

            loss = nn.functional.mse_loss(logits, labels)

            # Return the head output as the prediction, matching the no-label branch and the tensor the
            # loss was computed on (the pre-head activation `y` used to be returned here).
            return (loss, logits, logits) if self.double_output else (loss, logits)

325
326
327
328
329
330
    class RegressionRandomPreTrainedModel(PreTrainedModel):
        """
        Regression model whose output is perturbed by random numbers drawn from torch (optional), NumPy and
        Python's `random` module on every forward pass.
        """

        config_class = RegressionModelConfig
        base_model_prefix = "regression"

        def __init__(self, config):
            super().__init__(config)
            self.a = nn.Parameter(torch.tensor(config.a).float())
            self.b = nn.Parameter(torch.tensor(config.b).float())
            self.random_torch = config.random_torch

        def forward(self, input_x, labels=None, **kwargs):
            """
            Returns `(y,)` when `labels` is `None`, else `(loss, y)` with the MSE loss between `y` and `labels`.
            """
            y = input_x * self.a + self.b
            # Draw from the three generators in a fixed order: torch (only if enabled), then NumPy, then `random`.
            if self.random_torch:
                torch_rand = torch.randn(1).squeeze()
            np_rand = np.random.rand()
            rand_rand = random.random()

            if self.random_torch:
                y += 0.05 * torch_rand
            y += 0.05 * torch.tensor(np_rand + rand_rand)

            if labels is None:
                return (y,)
            loss = nn.functional.mse_loss(y, labels)
            return (loss, y)

351
    class TstLayer(nn.Module):
        """
        Two linear + ReLU sub-layers with layer norms, a residual connection and an extra standalone bias.

        Args:
            hidden_size: Size of the feature dimension used by every sub-module.
        """

        def __init__(self, hidden_size):
            super().__init__()
            self.linear1 = nn.Linear(hidden_size, hidden_size)
            self.ln1 = nn.LayerNorm(hidden_size)
            self.linear2 = nn.Linear(hidden_size, hidden_size)
            self.ln2 = nn.LayerNorm(hidden_size)
            self.bias = nn.Parameter(torch.zeros(hidden_size))

        def forward(self, x):
            h = self.ln1(nn.functional.relu(self.linear1(x)))
            # Feed the first sub-layer's output into the second one. `linear2` used to be applied to `x`,
            # which discarded `h` and left `linear1`/`ln1` without any effect on the result.
            h = nn.functional.relu(self.linear2(h))
            return self.ln2(x + h + self.bias)

365
366
367
    def get_regression_trainer(
        a=0, b=0, double_output=False, train_len=64, eval_len=64, pretrained=True, keep_report_to=False, **kwargs
    ):
        """
        Build a `Trainer` around one of the regression test models and two `RegressionDataset`s.

        Args:
            a: Initial slope, given to the model and to the training arguments.
            b: Initial intercept, given to the model and to the training arguments.
            double_output: Forwarded to the model (or its config).
            train_len: Length of the training dataset.
            eval_len: Length of the evaluation dataset.
            pretrained: If `True`, use a `PreTrainedModel`-based regression model, else the plain `RegressionModel`.
            keep_report_to: Forwarded to `RegressionTrainingArguments`.
            **kwargs: `model_init`, `compute_metrics`, `data_collator`, `optimizers`, `output_dir` and
                `preprocess_logits_for_metrics` are consumed here; everything else is forwarded to
                `RegressionTrainingArguments`.

        Returns:
            The configured `Trainer`.
        """
        # These two are only read, not popped: they are still forwarded to the training arguments below.
        label_names = kwargs.get("label_names", None)
        gradient_checkpointing = kwargs.get("gradient_checkpointing", False)
        train_dataset = RegressionDataset(length=train_len, label_names=label_names)
        eval_dataset = RegressionDataset(length=eval_len, label_names=label_names)

        model_init = kwargs.pop("model_init", None)
        if model_init is not None:
            # The trainer will build the model itself from `model_init`.
            model = None
        elif pretrained:
            config = RegressionModelConfig(a=a, b=b, double_output=double_output)
            # We infer the correct model class if one uses gradient_checkpointing or not
            target_cls = (
                RegressionPreTrainedModel
                if not gradient_checkpointing
                else RegressionPreTrainedModelWithGradientCheckpointing
            )
            model = target_cls(config)
        else:
            model = RegressionModel(a=a, b=b, double_output=double_output)

        compute_metrics = kwargs.pop("compute_metrics", None)
        data_collator = kwargs.pop("data_collator", None)
        optimizers = kwargs.pop("optimizers", (None, None))
        output_dir = kwargs.pop("output_dir", "./regression")
        preprocess_logits_for_metrics = kwargs.pop("preprocess_logits_for_metrics", None)

        args = RegressionTrainingArguments(output_dir, a=a, b=b, keep_report_to=keep_report_to, **kwargs)
        return Trainer(
            model,
            args,
            data_collator=data_collator,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=compute_metrics,
            optimizers=optimizers,
            model_init=model_init,
            preprocess_logits_for_metrics=preprocess_logits_for_metrics,
        )

408

409
class TrainerIntegrationCommon:
    """
    Mixin with checkpoint-related assertions and helpers shared by the trainer integration test classes.

    It relies on the `assert*` methods of the test case class it is mixed into.
    """

    def check_saved_checkpoints(self, output_dir, freq, total, is_pretrained=True, safe_weights=True):
        # Asserts that `output_dir` holds a `checkpoint-{step}` folder with all expected files for every
        # `step` in `range(freq, total, freq)`.
        weights_file = WEIGHTS_NAME if not safe_weights else SAFE_WEIGHTS_NAME
        file_list = [weights_file, "training_args.bin", "optimizer.pt", "scheduler.pt", "trainer_state.json"]
        if is_pretrained:
            file_list.append("config.json")
        for step in range(freq, total, freq):
            checkpoint = os.path.join(output_dir, f"checkpoint-{step}")
            self.assertTrue(os.path.isdir(checkpoint))
            for filename in file_list:
                self.assertTrue(os.path.isfile(os.path.join(checkpoint, filename)))

    def check_best_model_has_been_loaded(
        self, output_dir, freq, total, trainer, metric, greater_is_better=False, is_pretrained=True, safe_weights=True
    ):
        # Reads the log history from the last checkpoint, finds the checkpoint with the best `metric` value,
        # and asserts that `trainer.model` has that checkpoint's `a`/`b` and evaluates to the best value.
        checkpoint = os.path.join(output_dir, f"checkpoint-{(total // freq) * freq}")
        log_history = TrainerState.load_from_json(os.path.join(checkpoint, "trainer_state.json")).log_history

        values = [d[metric] for d in log_history]
        best_value = max(values) if greater_is_better else min(values)
        # The i-th log entry (0-based) is mapped to `checkpoint-{(i + 1) * freq}`.
        best_checkpoint = (values.index(best_value) + 1) * freq
        checkpoint = os.path.join(output_dir, f"checkpoint-{best_checkpoint}")
        if is_pretrained:
            best_model = RegressionPreTrainedModel.from_pretrained(checkpoint)
            best_model.to(trainer.args.device)
        else:
            best_model = RegressionModel()
            if not safe_weights:
                state_dict = torch.load(os.path.join(checkpoint, WEIGHTS_NAME))
            else:
                state_dict = safetensors.torch.load_file(os.path.join(checkpoint, SAFE_WEIGHTS_NAME))
            best_model.load_state_dict(state_dict)
            best_model.to(trainer.args.device)
        self.assertTrue(torch.allclose(best_model.a, trainer.model.a))
        self.assertTrue(torch.allclose(best_model.b, trainer.model.b))

        metrics = trainer.evaluate()
        self.assertEqual(metrics[metric], best_value)

    def check_trainer_state_are_the_same(self, trainer_state, trainer_state1):
        # Asserts that two trainer-state dicts are equal, ignoring time/throughput/train-loss entries in
        # their log histories. Neither input dict (nor its log entries) is modified.
        # We'll pop things so operate on copies.
        state = trainer_state.copy()
        state1 = trainer_state1.copy()
        # Log history may contain different logs for the time metrics (after resuming a training).
        log_history = state.pop("log_history", None)
        log_history1 = state1.pop("log_history", None)
        self.assertEqual(state, state1)
        skip_log_keys = ["train_runtime", "train_samples_per_second", "train_steps_per_second", "train_loss"]
        for log, log1 in zip(log_history, log_history1):
            # Compare filtered copies: the copies above are shallow, so popping keys from the log entries
            # would alter the caller's data.
            filtered = {key: value for key, value in log.items() if key not in skip_log_keys}
            filtered1 = {key: value for key, value in log1.items() if key not in skip_log_keys}
            self.assertEqual(filtered, filtered1)

    def convert_to_sharded_checkpoint(self, folder, save_safe=True, load_safe=True):
        # Converts a checkpoint of a regression model to a sharded checkpoint.
        # `load_safe` selects the format of the existing weights file, `save_safe` the format of the shards
        # and index written back. The original single weights file is deleted.
        if load_safe:
            loader = safetensors.torch.load_file
            weights_file = os.path.join(folder, SAFE_WEIGHTS_NAME)
        else:
            loader = torch.load
            weights_file = os.path.join(folder, WEIGHTS_NAME)

        if save_safe:
            extension = "safetensors"
            saver = safetensors.torch.save_file
            index_file = os.path.join(folder, SAFE_WEIGHTS_INDEX_NAME)
            shard_name = SAFE_WEIGHTS_NAME
        else:
            extension = "bin"
            saver = torch.save
            index_file = os.path.join(folder, WEIGHTS_INDEX_NAME)
            shard_name = WEIGHTS_NAME

        state_dict = loader(weights_file)

        os.remove(weights_file)
        keys = list(state_dict.keys())

        # One shard per parameter.
        shard_files = [
            shard_name.replace(f".{extension}", f"-{idx+1:05d}-of-{len(keys):05d}.{extension}")
            for idx in range(len(keys))
        ]
        index = {"metadata": {}, "weight_map": {key: shard_files[i] for i, key in enumerate(keys)}}

        with open(index_file, "w", encoding="utf-8") as f:
            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
            f.write(content)

        for param_name, shard_file in zip(keys, shard_files):
            saver({param_name: state_dict[param_name]}, os.path.join(folder, shard_file))
500

501
502
503
504

@require_torch
@require_sentencepiece
@require_tokenizers
class TrainerIntegrationPrerunTest(TestCasePlus, TrainerIntegrationCommon):
    """
    Only tests that want to tap into the auto-pre-run 2 trainings:
    - self.default_trained_model
    - self.alternate_trained_model
    directly, or via check_trained_model
    """

    def setUp(self):
        super().setUp()
        args = TrainingArguments("..")
        self.n_epochs = args.num_train_epochs
        self.batch_size = args.train_batch_size
        # Reference run with the default seed.
        trainer = get_regression_trainer(learning_rate=0.1)
        trainer.train()
        self.default_trained_model = (trainer.model.a, trainer.model.b)

        # Reference run with an alternate seed.
        trainer = get_regression_trainer(learning_rate=0.1, seed=314)
        trainer.train()
        self.alternate_trained_model = (trainer.model.a, trainer.model.b)

    def check_trained_model(self, model, alternate_seed=False):
        # Checks a training seeded with learning_rate = 0.1
        (a, b) = self.alternate_trained_model if alternate_seed else self.default_trained_model
        self.assertTrue(torch.allclose(model.a, a))
        self.assertTrue(torch.allclose(model.b, b))

    def test_reproducible_training(self):
        # Checks that training worked, model trained and seed made a reproducible training.
        trainer = get_regression_trainer(learning_rate=0.1)
        trainer.train()
        self.check_trained_model(trainer.model)

        # Checks that a different seed gets different (reproducible) results.
        trainer = get_regression_trainer(learning_rate=0.1, seed=314)
        trainer.train()
        self.check_trained_model(trainer.model, alternate_seed=True)

    def test_trainer_with_datasets(self):
        import datasets

        np.random.seed(42)
        x = np.random.normal(size=(64,)).astype(np.float32)
        y = 2.0 * x + 3.0 + np.random.normal(scale=0.1, size=(64,))
        train_dataset = datasets.Dataset.from_dict({"input_x": x, "label": y})

        # Base training. Should have the same results as test_reproducible_training
        model = RegressionModel()
        args = TrainingArguments("./regression", learning_rate=0.1)
        trainer = Trainer(model, args, train_dataset=train_dataset)
        trainer.train()
        self.check_trained_model(trainer.model)

        # Can return tensors.
        train_dataset.set_format(type="torch", dtype=torch.float32)
        model = RegressionModel()
        trainer = Trainer(model, args, train_dataset=train_dataset)
        trainer.train()
        self.check_trained_model(trainer.model)

        # Adding one column not used by the model should have no impact
        z = np.random.normal(size=(64,)).astype(np.float32)
        train_dataset = datasets.Dataset.from_dict({"input_x": x, "label": y, "extra": z})
        model = RegressionModel()
        trainer = Trainer(model, args, train_dataset=train_dataset)
        trainer.train()
        self.check_trained_model(trainer.model)

    def test_model_init(self):
        train_dataset = RegressionDataset()
        args = TrainingArguments("./regression", learning_rate=0.1)
        trainer = Trainer(args=args, train_dataset=train_dataset, model_init=lambda: RegressionModel())
        trainer.train()
        self.check_trained_model(trainer.model)

        # Re-training should restart from scratch, thus lead the same results.
        trainer.train()
        self.check_trained_model(trainer.model)

        # Re-training should restart from scratch, thus lead the same results and new seed should be used.
        trainer.args.seed = 314
        trainer.train()
        self.check_trained_model(trainer.model, alternate_seed=True)

    def test_gradient_accumulation(self):
        # Training with half the batch size but accumulation steps as 2 should give the same results.
        trainer = get_regression_trainer(
            gradient_accumulation_steps=2, per_device_train_batch_size=4, learning_rate=0.1
        )
        trainer.train()
        self.check_trained_model(trainer.model)

    def test_gradient_checkpointing(self):
        trainer = get_regression_trainer(
            per_device_train_batch_size=1,
            learning_rate=0.1,
            gradient_checkpointing=True,
            gradient_checkpointing_kwargs={"use_reentrant": False},
        )
        # Snapshot the parameters before training so the update can be detected afterwards.
        previous_params = {k: v.detach().clone() for k, v in trainer.model.named_parameters()}

        trainer.train()

        # Check if model weights have been updated
        for k, v in trainer.model.named_parameters():
            self.assertFalse(
                torch.allclose(previous_params[k], v, rtol=1e-4, atol=1e-4),
                f"Model weights for {k} have not been updated",
            )

    def test_training_loss(self):
        n_gpus = max(1, backend_device_count(torch_device))

        # With even logs
        trainer = get_regression_trainer(logging_steps=64 / (8 * n_gpus))
        trainer.train()
        log_history = trainer.state.log_history

        losses = [log["loss"] for log in log_history if "loss" in log]
        train_loss = log_history[-1]["train_loss"]
        self.assertAlmostEqual(sum(losses) / len(losses), train_loss, places=4)

        # With uneven logs
        trainer = get_regression_trainer(logging_steps=5)
        trainer.train()
        log_history = trainer.state.log_history

        # Training loss should be the same as before
        new_train_loss = log_history[-1]["train_loss"]
        self.assertAlmostEqual(train_loss, new_train_loss, places=4)

    def test_custom_optimizer(self):
        train_dataset = RegressionDataset()
        args = TrainingArguments("./regression")
        model = RegressionModel()
        optimizer = torch.optim.SGD(model.parameters(), lr=1.0)
        lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda x: 1.0)
        trainer = Trainer(model, args, train_dataset=train_dataset, optimizers=(optimizer, lr_scheduler))
        trainer.train()

        # The custom optimizer must lead somewhere else than the default run, with its lr left at 1.0.
        (a, b) = self.default_trained_model
        self.assertFalse(torch.allclose(trainer.model.a, a))
        self.assertFalse(torch.allclose(trainer.model.b, b))
        self.assertEqual(trainer.optimizer.state_dict()["param_groups"][0]["lr"], 1.0)

    def test_lr_scheduler_kwargs(self):
        # test scheduler kwargs passed via TrainingArguments
        train_dataset = RegressionDataset()
        model = RegressionModel()
        num_steps, num_warmup_steps = 10, 2
        extra_kwargs = {"power": 5.0, "lr_end": 1e-5}  # Non-default arguments
        args = TrainingArguments(
            "./regression",
            lr_scheduler_type="polynomial",
            lr_scheduler_kwargs=extra_kwargs,
            learning_rate=0.2,
            warmup_steps=num_warmup_steps,
        )
        trainer = Trainer(model, args, train_dataset=train_dataset)
        trainer.create_optimizer_and_scheduler(num_training_steps=num_steps)

        # Checking that the scheduler was created
        self.assertIsNotNone(trainer.lr_scheduler)

        # Checking that the correct args were passed
        sched1 = trainer.lr_scheduler
        sched2 = get_polynomial_decay_schedule_with_warmup(
            trainer.optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_steps, **extra_kwargs
        )
        self.assertEqual(sched1.lr_lambdas[0].args, sched2.lr_lambdas[0].args)
        self.assertEqual(sched1.lr_lambdas[0].keywords, sched2.lr_lambdas[0].keywords)

    def test_reduce_lr_on_plateau_args(self):
        # test passed arguments for a custom ReduceLROnPlateau scheduler
        train_dataset = RegressionDataset(length=64)
        eval_dataset = RegressionDataset(length=64)
        args = TrainingArguments(
            "./regression",
            evaluation_strategy="epoch",
            metric_for_best_model="eval_loss",
        )
        model = RegressionModel()
        optimizer = torch.optim.SGD(model.parameters(), lr=1.0)
        lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.2, patience=5, cooldown=2)
        trainer = Trainer(
            model, args, train_dataset=train_dataset, eval_dataset=eval_dataset, optimizers=(optimizer, lr_scheduler)
        )
        trainer.train()

        self.assertIsInstance(trainer.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau)
        self.assertEqual(trainer.lr_scheduler.factor, 0.2)
        self.assertEqual(trainer.lr_scheduler.patience, 5)
        self.assertEqual(trainer.lr_scheduler.cooldown, 2)

    def test_reduce_lr_on_plateau(self):
        # test the ReduceLROnPlateau scheduler

        class TrainerWithLRLogs(Trainer):
            def log(self, logs):
                # the LR is computed after metrics and does not exist for the first epoch
                if hasattr(self.lr_scheduler, "_last_lr"):
                    logs["learning_rate"] = self.lr_scheduler._last_lr[0]
                super().log(logs)

        train_dataset = RegressionDataset(length=64)
        eval_dataset = RegressionDataset(length=64)

        args = TrainingArguments(
            "./regression",
            lr_scheduler_type="reduce_lr_on_plateau",
            evaluation_strategy="epoch",
            metric_for_best_model="eval_loss",
            num_train_epochs=10,
            learning_rate=0.2,
        )
        model = RegressionModel()
        trainer = TrainerWithLRLogs(model, args, train_dataset=train_dataset, eval_dataset=eval_dataset)
        trainer.train()

        self.assertIsInstance(trainer.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau)
        patience = trainer.lr_scheduler.patience

        # Skip the first log entry, then replay the plateau bookkeeping over the remaining ones.
        logs = trainer.state.log_history[1:]
        best_loss = logs[0]["eval_loss"]
        bad_epochs = 0
        for i, log in enumerate(logs[:-1]):  # Compare learning rate to next epoch's
            loss = log["eval_loss"]
            just_decreased = False
            if loss > best_loss:
                bad_epochs += 1
                if bad_epochs > patience:
                    self.assertLess(logs[i + 1]["learning_rate"], log["learning_rate"])
                    just_decreased = True
                    bad_epochs = 0
            else:
                best_loss = loss
                bad_epochs = 0
            if not just_decreased:
                self.assertEqual(logs[i + 1]["learning_rate"], log["learning_rate"])

    def test_adafactor_lr_none(self):
        # test the special case where lr=None, since Trainer can't not have lr_scheduler

        from transformers.optimization import Adafactor, AdafactorSchedule

        train_dataset = RegressionDataset()
        args = TrainingArguments("./regression")
        model = RegressionModel()
        optimizer = Adafactor(model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None)
        lr_scheduler = AdafactorSchedule(optimizer)
        trainer = Trainer(model, args, train_dataset=train_dataset, optimizers=(optimizer, lr_scheduler))
        trainer.train()

        (a, b) = self.default_trained_model
        self.assertFalse(torch.allclose(trainer.model.a, a))
        self.assertFalse(torch.allclose(trainer.model.b, b))
        self.assertGreater(trainer.optimizer.state_dict()["param_groups"][0]["lr"], 0)

    @require_torch_accelerator
    @require_torch_bf16
    def test_mixed_bf16(self):
        # very basic test
        trainer = get_regression_trainer(learning_rate=0.1, bf16=True)
        trainer.train()
        self.check_trained_model(trainer.model)

        # --bf16 --half_precision_backend apex can't be used together
        with self.assertRaises(ValueError):
            trainer = get_regression_trainer(learning_rate=0.1, bf16=True, half_precision_backend="apex")

        # will add more specific tests once there are some bugs to fix

    @require_torch_gpu
    @require_torch_tf32
    def test_tf32(self):
        # very basic test
        trainer = get_regression_trainer(learning_rate=0.1, tf32=True)
        trainer.train()
        self.check_trained_model(trainer.model)

785
786
787
788
789
790
791

@require_torch
@require_sentencepiece
@require_tokenizers
class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
    def setUp(self):
        # Cache the default number of epochs and train batch size of `TrainingArguments`; several tests use
        # them to compute the expected number of optimisation steps.
        super().setUp()
        default_args = TrainingArguments("..")
        self.n_epochs = default_args.num_train_epochs
        self.batch_size = default_args.train_batch_size

796
797
798
799
800
801
802
803
804
805
806
807
808
    def test_trainer_works_with_dict(self):
        # Edge case: Apex with mode O2 changes our models to return dicts. This test checks that training,
        # evaluation and prediction all run to completion with a model that returns a dict
        # (`RegressionDictModel`). Only the absence of errors is checked; the results are discarded.
        train_dataset = RegressionDataset()
        eval_dataset = RegressionDataset()
        model = RegressionDictModel()
        args = TrainingArguments("./regression")
        trainer = Trainer(model, args, train_dataset=train_dataset, eval_dataset=eval_dataset)
        trainer.train()
        _ = trainer.evaluate()
        _ = trainer.predict(eval_dataset)

    def test_evaluation_with_keys_to_drop(self):
        # Build a tiny GPT-2 from a config and a dataset that repeats one random sequence.
        config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4)
        tiny_gpt2 = GPT2LMHeadModel(config)
        x = torch.randint(0, 100, (128,))
        eval_dataset = RepeatDataset(x)
        args = TrainingArguments("./test")
        trainer = Trainer(tiny_gpt2, args, eval_dataset=eval_dataset)
        # By default the past_key_values are removed, so the predictions are a single array
        result = trainer.predict(eval_dataset)
        self.assertIsInstance(result.predictions, np.ndarray)
        # We can still get them by setting ignore_keys to []: predictions then become a 2-tuple
        result = trainer.predict(eval_dataset, ignore_keys=[])
        self.assertIsInstance(result.predictions, tuple)
        self.assertEqual(len(result.predictions), 2)

823
824
825
    def test_training_arguments_are_left_untouched(self):
        # After a full training run, the trainer's arguments must still match freshly built ones.
        trainer = get_regression_trainer()
        trainer.train()
        args = TrainingArguments("./regression", report_to=[])
        dict1, dict2 = args.to_dict(), trainer.args.to_dict()
        for key in dict1:
            # Logging dir can be slightly different as they default to something with the time.
            if key == "logging_dir":
                continue
            # Name the key in the failure message so a mismatch can be located immediately.
            self.assertEqual(dict1[key], dict2[key], f"TrainingArguments field {key!r} was modified by training")
832

Sylvain Gugger's avatar
Sylvain Gugger committed
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
    def test_number_of_steps_in_training(self):
        # Regular training has n_epochs * len(train_dl) steps
        # (64 is presumably the default length of the regression train set - TODO confirm against
        # `get_regression_trainer`.)
        trainer = get_regression_trainer(learning_rate=0.1)
        train_output = trainer.train()
        self.assertEqual(train_output.global_step, self.n_epochs * 64 / self.batch_size)

        # Check passing num_train_epochs works (and a float version too):
        trainer = get_regression_trainer(learning_rate=0.1, num_train_epochs=1.5)
        train_output = trainer.train()
        self.assertEqual(train_output.global_step, int(1.5 * 64 / self.batch_size))

        # If we pass a max_steps, num_train_epochs is ignored
        trainer = get_regression_trainer(learning_rate=0.1, max_steps=10)
        train_output = trainer.train()
        self.assertEqual(train_output.global_step, 10)

849
    @require_torch_bf16
850
851
852
853
    @require_intel_extension_for_pytorch
    def test_number_of_steps_in_training_with_ipex(self):
        # Same step-count checks as the regular test, run on CPU with IPEX, with and without bf16.
        for mix_bf16 in [True, False]:
            ipex_kwargs = {"use_ipex": True, "bf16": mix_bf16, "use_cpu": True}

            # Regular training has n_epochs * len(train_dl) steps
            trainer = get_regression_trainer(learning_rate=0.1, **ipex_kwargs)
            global_step = trainer.train().global_step
            self.assertEqual(global_step, self.n_epochs * 64 / trainer.args.train_batch_size)

            # Check passing num_train_epochs works (and a float version too):
            trainer = get_regression_trainer(learning_rate=0.1, num_train_epochs=1.5, **ipex_kwargs)
            global_step = trainer.train().global_step
            self.assertEqual(global_step, int(1.5 * 64 / trainer.args.train_batch_size))

            # If we pass a max_steps, num_train_epochs is ignored
            trainer = get_regression_trainer(learning_rate=0.1, max_steps=10, **ipex_kwargs)
            global_step = trainer.train().global_step
            self.assertEqual(global_step, 10)

872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
    def test_neftune(self):
        # Checks that NEFTune noise is applied to the input embeddings once activated, and that it is no
        # longer applied (no forward hook left on the embeddings) after `trainer.train()` returns.
        config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4)
        tiny_gpt2 = GPT2LMHeadModel(config)
        x = torch.randint(0, 100, (128,))
        train_dataset = RepeatDataset(x)

        # Trainer with NEFTune enabled through `neftune_noise_alpha`
        args = TrainingArguments(
            "./test", learning_rate=1e-9, logging_steps=5, logging_nan_inf_filter=False, neftune_noise_alpha=0.4
        )
        trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)

        trainer.model = trainer._activate_neftune(trainer.model)

        dummy_input = torch.LongTensor([[1, 0, 1]]).to(torch_device)

        # Two lookups of the same ids must differ while the noise is active
        emb1 = trainer.model.get_input_embeddings()(dummy_input)
        emb2 = trainer.model.get_input_embeddings()(dummy_input)

        self.assertFalse(torch.allclose(emb1, emb2), "Neftune noise is not applied!")

        # redefine the model
        tiny_gpt2 = GPT2LMHeadModel(config)
        # Fresh trainer with the same NEFTune settings, this time going through a full training run
        args = TrainingArguments(
            "./test", learning_rate=1e-9, logging_steps=5, logging_nan_inf_filter=False, neftune_noise_alpha=0.4
        )
        trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)

        # Check that it trains without errors
        trainer.train()

        # Make sure forward pass works fine
        _ = trainer.model(dummy_input)
        # No forward hook may be left on the embeddings once training is over
        self.assertEqual(len(trainer.model.get_input_embeddings()._forward_hooks), 0)

        trainer.model.eval()

        # Check that we get identical embeddings just in case
        emb1 = trainer.model.get_input_embeddings()(dummy_input)
        emb2 = trainer.model.get_input_embeddings()(dummy_input)

        self.assertTrue(torch.allclose(emb1, emb2), "Neftune noise is still applied!")

916
    def test_logging_inf_nan_filter(self):
        # With an absurdly large learning rate the logged loss becomes nan/inf; `logging_nan_inf_filter`
        # must keep such values out of the logged losses.
        config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4)
        tiny_gpt2 = GPT2LMHeadModel(config)
        x = torch.randint(0, 100, (128,))
        train_dataset = RepeatDataset(x)

        def has_nan_or_inf_loss(log_history):
            # The last entry of the history is skipped (it is not indexed for a "loss" key here).
            logged_losses = [entry["loss"] for entry in log_history[:-1]]
            found_nan = any(math.isnan(value) for value in logged_losses)
            return found_nan or any(math.isinf(value) for value in logged_losses)

        # Trainer without inf/nan filter
        args = TrainingArguments("./test", learning_rate=1e9, logging_steps=5, logging_nan_inf_filter=False)
        trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
        trainer.train()
        log_history_no_filter = trainer.state.log_history

        # Trainer with inf/nan filter
        args = TrainingArguments("./test", learning_rate=1e9, logging_steps=5, logging_nan_inf_filter=True)
        trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
        trainer.train()
        log_history_filter = trainer.state.log_history

        self.assertTrue(has_nan_or_inf_loss(log_history_no_filter))
        self.assertFalse(has_nan_or_inf_loss(log_history_filter))

Sylvain Gugger's avatar
Sylvain Gugger committed
941
    def test_train_and_eval_dataloaders(self):
        # The total batch size of a dataloader is the per-device batch size times the number of devices.
        n_gpu = max(1, backend_device_count(torch_device))
        trainer = get_regression_trainer(learning_rate=0.1, per_device_train_batch_size=16)
        self.assertEqual(trainer.get_train_dataloader().total_batch_size, 16 * n_gpu)
        trainer = get_regression_trainer(learning_rate=0.1, per_device_eval_batch_size=16)
        self.assertEqual(trainer.get_eval_dataloader().total_batch_size, 16 * n_gpu)

        # Check drop_last works
        # Without drop_last, the trailing incomplete batch is kept (hence the "+ 1").
        trainer = get_regression_trainer(
            train_len=66, eval_len=74, learning_rate=0.1, per_device_train_batch_size=16, per_device_eval_batch_size=32
        )
        train_batches = len(trainer.get_train_dataloader())
        self.assertEqual(train_batches, 66 // (16 * n_gpu) + 1)
        eval_batches = len(trainer.get_eval_dataloader())
        self.assertEqual(eval_batches, 74 // (32 * n_gpu) + 1)

        # With drop_last, only the full batches remain.
        trainer = get_regression_trainer(
            train_len=66,
            eval_len=74,
            learning_rate=0.1,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=32,
            dataloader_drop_last=True,
        )
        train_batches = len(trainer.get_train_dataloader())
        self.assertEqual(train_batches, 66 // (16 * n_gpu))
        eval_batches = len(trainer.get_eval_dataloader())
        self.assertEqual(eval_batches, 74 // (32 * n_gpu))

        # Check passing a new dataset for evaluation works
        new_eval_dataset = RegressionDataset(length=128)
        new_eval_batches = len(trainer.get_eval_dataloader(new_eval_dataset))
        self.assertEqual(new_eval_batches, 128 // (32 * n_gpu))
Sylvain Gugger's avatar
Sylvain Gugger committed
969

970
971
972
973
974
975
976
977
978
    # Tests that we do not require the dataloader to have a `.dataset` attribute.
    def test_dataloader_without_dataset(self):
        # The same dataset is used for both training and evaluation; the test only checks that `train()` and
        # `evaluate()` run to completion with `CustomDataloaderTrainer` (defined outside this view).
        train_dataset = RegressionDataset(length=128)
        trainer = CustomDataloaderTrainer(
            model=RegressionModel(), train_dataset=train_dataset, eval_dataset=train_dataset
        )
        trainer.train()
        trainer.evaluate()

979
    @require_torch_multi_accelerator
980
981
982
983
984
    def test_data_is_not_parallelized_when_model_is_parallel(self):
        # Make the Trainer believe it's a parallelized model
        model = RegressionModel()
        model.is_parallelizable = True
        model.model_parallel = True

        args = TrainingArguments("./regression", per_device_train_batch_size=16, per_device_eval_batch_size=16)
        trainer = Trainer(model, args, train_dataset=RegressionDataset(), eval_dataset=RegressionDataset())

        # Check the Trainer was fooled
        self.assertTrue(trainer.is_model_parallel)
        self.assertEqual(trainer.args.n_gpu, 1)

        # The batch size of the training and evaluation dataloaders should be 16, not 16 * n_gpu
        train_dataloader = trainer.get_train_dataloader()
        self.assertEqual(train_dataloader.total_batch_size, 16)
        self.assertEqual(len(trainer.get_train_dataloader()), 64 // 16)
        eval_dataloader = trainer.get_eval_dataloader()
        self.assertEqual(eval_dataloader.total_batch_size, 16)
        self.assertEqual(len(trainer.get_eval_dataloader()), 64 // 16)

Sylvain Gugger's avatar
Sylvain Gugger committed
997
998
999
1000
    def test_evaluate(self):
        # Each scenario is (extra trainer kwargs, constant added to the predictions before computing the
        # expected accuracy):
        #   1. default evaluation set
        #   2. a number of elements that is not a round multiple of the batch size
        #   3. logits preprocessed (shifted by 1) before being handed to the metric
        scenarios = [
            ({}, 0),
            ({"eval_len": 66}, 0),
            ({"preprocess_logits_for_metrics": lambda logits, labels: logits + 1}, 1),
        ]
        for extra_kwargs, logits_shift in scenarios:
            trainer = get_regression_trainer(a=1.5, b=2.5, compute_metrics=AlmostAccuracy(), **extra_kwargs)
            results = trainer.evaluate()

            x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0]
            pred = 1.5 * x + 2.5
            # The loss is always computed on the raw predictions, the accuracy on the (possibly shifted) ones.
            expected_loss = ((pred - y) ** 2).mean()
            self.assertAlmostEqual(results["eval_loss"], expected_loss)
            expected_acc = AlmostAccuracy()((pred + logits_shift, y))["accuracy"]
            self.assertAlmostEqual(results["eval_accuracy"], expected_acc)

1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
    def test_evaluate_with_jit(self):
        # Same scenarios as the plain evaluation test, with `jit_mode_eval=True`. Each entry is
        # (extra trainer kwargs, constant added to the predictions before computing the expected accuracy):
        #   1. default evaluation set
        #   2. a number of elements that is not a round multiple of the batch size
        #   3. logits preprocessed (shifted by 1) before being handed to the metric
        scenarios = [
            ({}, 0),
            ({"eval_len": 66}, 0),
            ({"preprocess_logits_for_metrics": lambda logits, labels: logits + 1}, 1),
        ]
        for extra_kwargs, logits_shift in scenarios:
            trainer = get_regression_trainer(
                a=1.5, b=2.5, compute_metrics=AlmostAccuracy(), jit_mode_eval=True, **extra_kwargs
            )
            results = trainer.evaluate()

            x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0]
            pred = 1.5 * x + 2.5
            expected_loss = ((pred - y) ** 2).mean()
            self.assertAlmostEqual(results["eval_loss"], expected_loss)
            expected_acc = AlmostAccuracy()((pred + logits_shift, y))["accuracy"]
            self.assertAlmostEqual(results["eval_accuracy"], expected_acc)

1076
    @require_torch_bf16
1077
1078
1079
1080
    @require_intel_extension_for_pytorch
    def test_evaluate_with_ipex(self):
        # Same scenarios as the plain evaluation test, run on CPU with IPEX, with and without bf16. Each
        # entry is (extra trainer kwargs, constant added to the predictions before computing the accuracy):
        #   1. default evaluation set
        #   2. a number of elements that is not a round multiple of the batch size
        #   3. logits preprocessed (shifted by 1) before being handed to the metric
        scenarios = [
            ({}, 0),
            ({"eval_len": 66}, 0),
            ({"preprocess_logits_for_metrics": lambda logits, labels: logits + 1}, 1),
        ]
        for mix_bf16 in [True, False]:
            for extra_kwargs, logits_shift in scenarios:
                trainer = get_regression_trainer(
                    a=1.5,
                    b=2.5,
                    use_ipex=True,
                    compute_metrics=AlmostAccuracy(),
                    bf16=mix_bf16,
                    use_cpu=True,
                    **extra_kwargs,
                )
                results = trainer.evaluate()

                x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0]
                pred = 1.5 * x + 2.5
                expected_loss = ((pred - y) ** 2).mean()
                self.assertAlmostEqual(results["eval_loss"], expected_loss)
                expected_acc = AlmostAccuracy()((pred + logits_shift, y))["accuracy"]
                self.assertAlmostEqual(results["eval_accuracy"], expected_acc)

Sylvain Gugger's avatar
Sylvain Gugger committed
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
    def test_predict(self):
        # Single output: default evaluation set, then a number of elements that is not a round multiple of
        # the batch size.
        for extra_kwargs in ({}, {"eval_len": 66}):
            trainer = get_regression_trainer(a=1.5, b=2.5, **extra_kwargs)
            preds = trainer.predict(trainer.eval_dataset).predictions
            x = trainer.eval_dataset.x
            self.assertTrue(np.allclose(preds, 1.5 * x + 2.5))

        # With more than one output of the model
        trainer = get_regression_trainer(a=1.5, b=2.5, double_output=True)
        preds = trainer.predict(trainer.eval_dataset).predictions
        x = trainer.eval_dataset.x
        self.assertEqual(len(preds), 2)
        for output in (preds[0], preds[1]):
            self.assertTrue(np.allclose(output, 1.5 * x + 2.5))

        # With more than one output/label of the model
        trainer = get_regression_trainer(a=1.5, b=2.5, double_output=True, label_names=["labels", "labels_2"])
        outputs = trainer.predict(trainer.eval_dataset)
        preds, labels = outputs.predictions, outputs.label_ids
        x = trainer.eval_dataset.x
        self.assertEqual(len(preds), 2)
        for output in (preds[0], preds[1]):
            self.assertTrue(np.allclose(output, 1.5 * x + 2.5))
        # The returned labels must be exactly the labels of the dataset, in the same order
        for index in (0, 1):
            self.assertTrue(np.array_equal(labels[index], trainer.eval_dataset.ys[index]))

    def test_predict_with_jit(self):
        # Single output with `jit_mode_eval=True`: default evaluation set, then a number of elements that is
        # not a round multiple of the batch size.
        for extra_kwargs in ({}, {"eval_len": 66}):
            trainer = get_regression_trainer(a=1.5, b=2.5, jit_mode_eval=True, **extra_kwargs)
            preds = trainer.predict(trainer.eval_dataset).predictions
            x = trainer.eval_dataset.x
            self.assertTrue(np.allclose(preds, 1.5 * x + 2.5))

        # With more than one output of the model
        trainer = get_regression_trainer(a=1.5, b=2.5, double_output=True, jit_mode_eval=True)
        preds = trainer.predict(trainer.eval_dataset).predictions
        x = trainer.eval_dataset.x
        self.assertEqual(len(preds), 2)
        for output in (preds[0], preds[1]):
            self.assertTrue(np.allclose(output, 1.5 * x + 2.5))

        # With more than one output/label of the model
        trainer = get_regression_trainer(
            a=1.5, b=2.5, double_output=True, label_names=["labels", "labels_2"], jit_mode_eval=True
        )
        outputs = trainer.predict(trainer.eval_dataset)
        preds, labels = outputs.predictions, outputs.label_ids
        x = trainer.eval_dataset.x
        self.assertEqual(len(preds), 2)
        for output in (preds[0], preds[1]):
            self.assertTrue(np.allclose(output, 1.5 * x + 2.5))
        # The returned labels must be exactly the labels of the dataset, in the same order
        for index in (0, 1):
            self.assertTrue(np.array_equal(labels[index], trainer.eval_dataset.ys[index]))

1196
    @require_torch_bf16
1197
1198
1199
    @require_intel_extension_for_pytorch
    def test_predict_with_ipex(self):
        # Same prediction checks as the plain test, run on CPU with IPEX, with and without bf16.
        for mix_bf16 in [True, False]:
            ipex_kwargs = {"use_ipex": True, "bf16": mix_bf16, "use_cpu": True}

            # Single output: default evaluation set, then a number of elements that is not a round multiple
            # of the batch size.
            for extra_kwargs in ({}, {"eval_len": 66}):
                trainer = get_regression_trainer(a=1.5, b=2.5, **extra_kwargs, **ipex_kwargs)
                preds = trainer.predict(trainer.eval_dataset).predictions
                x = trainer.eval_dataset.x
                self.assertTrue(np.allclose(preds, 1.5 * x + 2.5))

            # With more than one output of the model
            trainer = get_regression_trainer(a=1.5, b=2.5, double_output=True, **ipex_kwargs)
            preds = trainer.predict(trainer.eval_dataset).predictions
            x = trainer.eval_dataset.x
            self.assertEqual(len(preds), 2)
            for output in (preds[0], preds[1]):
                self.assertTrue(np.allclose(output, 1.5 * x + 2.5))

            # With more than one output/label of the model
            trainer = get_regression_trainer(
                a=1.5, b=2.5, double_output=True, label_names=["labels", "labels_2"], **ipex_kwargs
            )
            outputs = trainer.predict(trainer.eval_dataset)
            preds, labels = outputs.predictions, outputs.label_ids
            x = trainer.eval_dataset.x
            self.assertEqual(len(preds), 2)
            for output in (preds[0], preds[1]):
                self.assertTrue(np.allclose(output, 1.5 * x + 2.5))
            # The returned labels must be exactly the labels of the dataset, in the same order
            for index in (0, 1):
                self.assertTrue(np.array_equal(labels[index], trainer.eval_dataset.ys[index]))

1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
    def test_dynamic_shapes(self):
        # Evaluation and prediction on a dataset whose items have varying lengths: gathered labels and
        # predictions must match the expected values on their real length and be -100 beyond it.
        eval_dataset = DynamicShapesDataset(batch_size=self.batch_size)
        model = RegressionModel(a=2, b=1)

        # First with the default arguments, then the same tests with eval accumulation
        for extra_args in ({}, {"eval_accumulation_steps": 2}):
            args = TrainingArguments("./regression", **extra_args)
            trainer = Trainer(model, args, eval_dataset=eval_dataset)

            # Check evaluation can run to completion
            _ = trainer.evaluate()

            # Check predictions
            preds = trainer.predict(eval_dataset)
            for expected, seen in zip(eval_dataset.ys, preds.label_ids):
                real_len = expected.shape[0]
                self.assertTrue(np.array_equal(expected, seen[:real_len]))
                self.assertTrue(np.all(seen[real_len:] == -100))

            for expected, seen in zip(eval_dataset.xs, preds.predictions):
                real_len = expected.shape[0]
                # The model was built with a=2, b=1, hence 2 * x + 1
                self.assertTrue(np.array_equal(2 * expected + 1, seen[:real_len]))
                self.assertTrue(np.all(seen[real_len:] == -100))

1277
    def test_log_level(self):
        # testing only --log_level (--log_level_replica requires multiple gpus and DDP and is tested elsewhere)
        logger = logging.get_logger()
        log_info_string = "Running training"

        # test with the default log_level - should be the same as before and thus we test depending on is_info
        is_info = logging.get_verbosity() <= 20
        with CaptureLogger(logger) as cl:
            trainer = get_regression_trainer()
            trainer.train()
        check_default = self.assertIn if is_info else self.assertNotIn
        check_default(log_info_string, cl.out)

        # test with low log_level - lower than info
        with CaptureLogger(logger) as cl:
            trainer = get_regression_trainer(log_level="debug")
            trainer.train()
        self.assertIn(log_info_string, cl.out)

        # test with high log_level - should be quiet
        with CaptureLogger(logger) as cl:
            trainer = get_regression_trainer(log_level="error")
            trainer.train()
        self.assertNotIn(log_info_string, cl.out)

1304
1305
1306
1307
1308
1309
1310
1311
    def test_save_checkpoints(self):
        # A checkpoint is saved every 5 steps; the shared helper checks the saved checkpoints up to the
        # expected total number of training steps.
        with tempfile.TemporaryDirectory() as output_dir:
            trainer = get_regression_trainer(output_dir=output_dir, save_steps=5)
            trainer.train()
            total_steps = int(self.n_epochs * 64 / self.batch_size)
            self.check_saved_checkpoints(output_dir, 5, total_steps)

        # With a regular model that is not a PreTrainedModel
        with tempfile.TemporaryDirectory() as output_dir:
            trainer = get_regression_trainer(output_dir=output_dir, save_steps=5, pretrained=False)
            trainer.train()
            total_steps = int(self.n_epochs * 64 / self.batch_size)
            self.check_saved_checkpoints(output_dir, 5, total_steps, False)

1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
    def test_save_checkpoints_is_atomic(self):
        # A checkpoint whose saving fails part-way must not be left behind as a usable checkpoint.
        class UnsaveableTokenizer(PreTrainedTokenizerBase):
            def save_pretrained(self, *args, **kwargs):
                raise OSError("simulated file write error")

        with tempfile.TemporaryDirectory() as tmpdir:
            trainer = get_regression_trainer(output_dir=tmpdir, save_steps=5)
            # Attach unsaveable tokenizer to partially fail checkpointing
            trainer.tokenizer = UnsaveableTokenizer()
            with self.assertRaises(OSError):
                trainer.train()
            # Use a unittest assertion rather than a bare `assert`, which is stripped under `python -O`
            self.assertIsNone(get_last_checkpoint(tmpdir))

1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
    @require_safetensors
    def test_safe_checkpoints(self):
        # Same checks as the plain checkpoint-saving test, run once with `save_safetensors=True` and once with
        # `save_safetensors=False`; the flag is forwarded to the helper as `safe_weights`.
        for save_safetensors in [True, False]:
            with tempfile.TemporaryDirectory() as tmpdir:
                trainer = get_regression_trainer(output_dir=tmpdir, save_steps=5, save_safetensors=save_safetensors)
                trainer.train()
                self.check_saved_checkpoints(
                    tmpdir, 5, int(self.n_epochs * 64 / self.batch_size), safe_weights=save_safetensors
                )

            # With a regular model that is not a PreTrainedModel
            with tempfile.TemporaryDirectory() as tmpdir:
                trainer = get_regression_trainer(
                    output_dir=tmpdir, save_steps=5, pretrained=False, save_safetensors=save_safetensors
                )
                trainer.train()
                self.check_saved_checkpoints(
                    tmpdir, 5, int(self.n_epochs * 64 / self.batch_size), False, safe_weights=save_safetensors
                )

1349
    @require_torch_multi_accelerator
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
    def test_run_seq2seq_double_train_wrap_once(self):
        # test that we don't wrap the model more than once
        # since wrapping primarily happens on multi-gpu setup we want multiple gpus to test for
        # example DataParallel(DataParallel(model))

        trainer = get_regression_trainer()
        trainer.train()
        model_wrapped_before = trainer.model_wrapped
        # A second call to train() must keep the very same wrapped object (identity check, not equality)
        trainer.train()
        model_wrapped_after = trainer.model_wrapped
        self.assertIs(model_wrapped_before, model_wrapped_after, "should be not wrapped twice")

1362
    @require_torch_up_to_2_accelerators
    def test_can_resume_training(self):
        # Trains a regression model to completion, then restarts from intermediate checkpoints and checks that
        # the final weights and the trainer state match the uninterrupted run. Finally checks the error
        # messages raised when the requested checkpoint cannot be found.
        #
        # This test will fail for more than 2 GPUs since the batch size will get bigger and with the number of
        # save_steps, the checkpoint will resume training at epoch 2 or more (so the data seen by the model
        # won't be the same since the training dataloader is shuffled).

        common_kwargs = {"train_len": 128, "save_steps": 5, "learning_rate": 0.1}
        scenarios = [
            # Model built with the defaults of `get_regression_trainer`, logging every 5 steps
            {"logging_steps": 5},
            # With a regular model that is not a PreTrainedModel
            {"pretrained": False},
        ]

        for scenario_kwargs in scenarios:
            with tempfile.TemporaryDirectory() as tmpdir:
                kwargs = {"output_dir": tmpdir, **common_kwargs, **scenario_kwargs}

                # Reference run: train from scratch without interruption
                trainer = get_regression_trainer(**kwargs)
                trainer.train()
                (a, b) = trainer.model.a.item(), trainer.model.b.item()
                state = dataclasses.asdict(trainer.state)

                # First an early checkpoint, then a later one to check that it also works when we span over
                # one epoch
                for step in (5, 15):
                    checkpoint = os.path.join(tmpdir, f"checkpoint-{step}")

                    # Reinitialize trainer and load model
                    trainer = get_regression_trainer(**kwargs)
                    trainer.train(resume_from_checkpoint=checkpoint)

                    (a1, b1) = trainer.model.a.item(), trainer.model.b.item()
                    state1 = dataclasses.asdict(trainer.state)
                    self.assertEqual(a, a1)
                    self.assertEqual(b, b1)
                    self.check_trainer_state_are_the_same(state, state1)

        # Now check failures

        # 1. fail to find a bogus checkpoint
        # (`checkpoint` still holds the last path used above; the "-bogus" suffix makes it a path that was
        # never written)
        trainer = get_regression_trainer()
        with self.assertRaises(Exception) as context:
            trainer.train(resume_from_checkpoint=f"{checkpoint}-bogus")
        self.assertIn("Can't find a valid checkpoint at", str(context.exception))

        # 2. fail to find any checkpoint - due a fresh output_dir
        output_dir2 = self.get_auto_remove_tmp_dir()
        trainer = get_regression_trainer(output_dir=output_dir2)
        with self.assertRaises(Exception) as context:
            trainer.train(resume_from_checkpoint=True)
        self.assertIn("No valid checkpoint found in output directory", str(context.exception))

1461
    def test_resume_training_with_randomness(self):
        # Checks that a run resumed from a checkpoint ends with (almost) the same weights as an uninterrupted
        # run, both when checkpoints are saved every few steps and when they are saved every epoch.
        #
        # For more than 1 GPUs, since the randomness is introduced in the model and with DataParallel (which is used
        # in this test for more than 2 GPUs), the calls to the torch RNG will happen in a random order (sometimes
        # GPU 0 will call first and sometimes GPU 1).
        cuda_available = torch.cuda.is_available()
        random_torch = not cuda_available or torch.cuda.device_count() <= 1

        if cuda_available:
            torch.backends.cudnn.deterministic = True
        train_dataset = RegressionDataset(length=128)
        eval_dataset = RegressionDataset()
        datasets = {"train_dataset": train_dataset, "eval_dataset": eval_dataset}

        with self.subTest("Test every step"):
            config = RegressionModelConfig(a=0, b=2, random_torch=random_torch)
            reference_model = RegressionRandomPreTrainedModel(config)

            tmp_dir = self.get_auto_remove_tmp_dir()
            args = RegressionTrainingArguments(tmp_dir, save_steps=5, learning_rate=0.1)

            # Uninterrupted reference run
            reference_trainer = Trainer(reference_model, args, **datasets)
            reference_trainer.train()
            a = reference_trainer.model.a.item()
            b = reference_trainer.model.b.item()

            # Fresh model, resumed from an intermediate checkpoint
            resumed_model = RegressionRandomPreTrainedModel(config)
            resumed_trainer = Trainer(resumed_model, args, **datasets)
            resumed_trainer.train(resume_from_checkpoint=os.path.join(tmp_dir, "checkpoint-15"))
            a1 = resumed_trainer.model.a.item()
            b1 = resumed_trainer.model.b.item()

            self.assertAlmostEqual(a, a1, delta=1e-5)
            self.assertAlmostEqual(b, b1, delta=1e-5)

        with self.subTest("Test every epoch"):
            config = RegressionModelConfig(a=0, b=2, random_torch=random_torch)
            reference_model = RegressionRandomPreTrainedModel(config)

            tmp_dir = self.get_auto_remove_tmp_dir()
            args = RegressionTrainingArguments(tmp_dir, save_strategy="epoch", learning_rate=0.1)

            # Uninterrupted reference run
            reference_trainer = Trainer(reference_model, args, **datasets)
            reference_trainer.train()
            a = reference_trainer.model.a.item()
            b = reference_trainer.model.b.item()

            resumed_model = RegressionRandomPreTrainedModel(config)
            resumed_trainer = Trainer(resumed_model, args, **datasets)

            checkpoints = [entry for entry in os.listdir(tmp_dir) if entry.startswith("checkpoint-")]
            # There should be one checkpoint per epoch.
            self.assertEqual(len(checkpoints), 3)
            # Resume from the checkpoint with the smallest step number
            checkpoint_dir = min(checkpoints, key=lambda name: int(name.replace("checkpoint-", "")))

            resumed_trainer.train(resume_from_checkpoint=os.path.join(tmp_dir, checkpoint_dir))
            a1 = resumed_trainer.model.a.item()
            b1 = resumed_trainer.model.b.item()

            self.assertAlmostEqual(a, a1, delta=1e-5)
            self.assertAlmostEqual(b, b1, delta=1e-5)
1515

1516
    @slow
    @require_accelerate
    @require_torch_non_multi_accelerator
    def test_auto_batch_size_finder(self):
        # Runs the `run_glue.py` example with a per-device batch size of 4096: the run is expected to raise a
        # RuntimeError with `--auto_find_batch_size 0` and to complete with `--auto_find_batch_size 1`.
        if torch.cuda.is_available():
            torch.backends.cudnn.deterministic = True

        SRC_DIR = os.path.abspath(
            os.path.join(os.path.dirname(__file__), "..", "..", "examples", "pytorch", "text-classification")
        )
        sys.path.append(SRC_DIR)
        import run_glue

        def build_argv(output_dir, auto_find_batch_size):
            # Command line for `run_glue.py`; only the output directory and the auto-find flag vary.
            return f"""
                run_glue.py
                --model_name_or_path distilbert-base-uncased
                --task_name mrpc
                --do_train
                --do_eval
                --max_seq_len 128
                --per_device_train_batch_size 4096
                --learning_rate 2e-5
                --num_train_epochs 1
                --output_dir {output_dir}
                --auto_find_batch_size {auto_find_batch_size}
                """.split()

        with tempfile.TemporaryDirectory() as tmpdir:
            testargs = build_argv(tmpdir, 0)
            with self.assertRaises(RuntimeError):
                with patch.object(sys, "argv", testargs):
                    run_glue.main()

        # The second run gets its own temporary directory so that whatever it writes is cleaned up, instead
        # of pointing `--output_dir` at the already-deleted directory of the first run.
        with tempfile.TemporaryDirectory() as tmpdir:
            testargs = build_argv(tmpdir, 1)
            with patch.object(sys, "argv", testargs):
                run_glue.main()

1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
    def test_auto_batch_size_with_resume_from_checkpoint(self):
        # Checks that the train batch size reached with `auto_find_batch_size` in a first run is used again
        # by a new Trainer that resumes from the checkpoints of that run.
        train_dataset = RegressionDataset(length=128)

        config = RegressionModelConfig(a=0, b=2)
        model = RegressionRandomPreTrainedModel(config)

        tmp_dir = self.get_auto_remove_tmp_dir()

        class MockCudaOOMCallback(TrainerCallback):
            def on_step_end(self, args, state, control, **kwargs):
                # simulate OOM on the first step
                if state.train_batch_size < 16:
                    return
                raise RuntimeError("CUDA out of memory.")

        training_kwargs = {
            "do_train": True,
            "max_steps": 2,
            "save_steps": 1,
            "per_device_train_batch_size": 16,
            "auto_find_batch_size": True,
        }
        args = RegressionTrainingArguments(tmp_dir, **training_kwargs)

        first_trainer = Trainer(model, args, train_dataset=train_dataset, callbacks=[MockCudaOOMCallback()])
        first_trainer.train()
        # After `auto_find_batch_size` is ran we should now be at 8
        self.assertEqual(first_trainer._train_batch_size, 8)

        # We can then make a new Trainer
        second_trainer = Trainer(model, args, train_dataset=train_dataset)
        # Check we are at 16 to start
        initial_batch_size = 16 * max(second_trainer.args.n_gpu, 1)
        self.assertEqual(second_trainer._train_batch_size, initial_batch_size)
        second_trainer.train(resume_from_checkpoint=True)
        # We should be back to 8 again, picking up based upon the last ran Trainer
        self.assertEqual(second_trainer._train_batch_size, 8)

1586
    # regression for this issue: https://github.com/huggingface/transformers/issues/12970
    def test_training_with_resume_from_checkpoint_false(self):
        # Training with an explicit `resume_from_checkpoint=False` must simply run; there is nothing to assert
        # beyond the call not raising.
        output_dir = self.get_auto_remove_tmp_dir_for_resume_false() if False else None
        datasets = {
            "train_dataset": RegressionDataset(length=128),
            "eval_dataset": RegressionDataset(),
        }

        model = RegressionRandomPreTrainedModel(RegressionModelConfig(a=0, b=2))

        output_dir = self.get_auto_remove_tmp_dir()
        training_args = RegressionTrainingArguments(output_dir, save_steps=5, learning_rate=0.1)
        trainer = Trainer(model, training_args, **datasets)

        trainer.train(resume_from_checkpoint=False)

1600
    @require_torch_up_to_2_accelerators
    def test_resume_training_with_shard_checkpoint(self):
        # Same check as a regular resume test, except that the checkpoint is first rewritten by
        # `convert_to_sharded_checkpoint` before training resumes from it.
        #
        # This test will fail for more than 2 GPUs since the batch size will get bigger and with the number of
        # save_steps, the checkpoint will resume training at epoch 2 or more (so the data seen by the model
        # won't be the same since the training dataloader is shuffled).

        with tempfile.TemporaryDirectory() as tmpdir:
            trainer_kwargs = {"output_dir": tmpdir, "train_len": 128, "save_steps": 5, "learning_rate": 0.1}

            # Uninterrupted reference run
            reference_trainer = get_regression_trainer(**trainer_kwargs)
            reference_trainer.train()
            a = reference_trainer.model.a.item()
            b = reference_trainer.model.b.item()
            state = dataclasses.asdict(reference_trainer.state)

            checkpoint = os.path.join(tmpdir, "checkpoint-5")
            self.convert_to_sharded_checkpoint(checkpoint)

            # Reinitialize trainer
            resumed_trainer = get_regression_trainer(**trainer_kwargs)
            resumed_trainer.train(resume_from_checkpoint=checkpoint)

            a1 = resumed_trainer.model.a.item()
            b1 = resumed_trainer.model.b.item()
            state1 = dataclasses.asdict(resumed_trainer.state)
            self.assertEqual(a, a1)
            self.assertEqual(b, b1)
            self.check_trainer_state_are_the_same(state, state1)

1625
    @require_safetensors
    @require_torch_up_to_2_accelerators
    def test_resume_training_with_safe_checkpoint(self):
        # Resumes training for every combination of `save_safetensors` used by the initial run and by the
        # resumed run, converting the checkpoint in between.
        #
        # This test will fail for more than 2 GPUs since the batch size will get bigger and with the number of
        # save_steps, the checkpoint will resume training at epoch 2 or more (so the data seen by the model
        # won't be the same since the training dataloader is shuffled).

        shared_kwargs = {"train_len": 128, "save_steps": 5, "learning_rate": 0.1}

        for initial_safe, loaded_safe in product([False, True], [False, True]):
            with tempfile.TemporaryDirectory() as tmpdir:
                # Uninterrupted reference run, saved with the initial format
                reference_trainer = get_regression_trainer(
                    output_dir=tmpdir, save_safetensors=initial_safe, **shared_kwargs
                )
                reference_trainer.train()
                a = reference_trainer.model.a.item()
                b = reference_trainer.model.b.item()
                state = dataclasses.asdict(reference_trainer.state)

                checkpoint = os.path.join(tmpdir, "checkpoint-5")
                self.convert_to_sharded_checkpoint(checkpoint, load_safe=initial_safe, save_safe=loaded_safe)

                # Reinitialize trainer
                resumed_trainer = get_regression_trainer(
                    output_dir=tmpdir, save_safetensors=loaded_safe, **shared_kwargs
                )
                resumed_trainer.train(resume_from_checkpoint=checkpoint)

                a1 = resumed_trainer.model.a.item()
                b1 = resumed_trainer.model.b.item()
                state1 = dataclasses.asdict(resumed_trainer.state)
                self.assertEqual(a, a1)
                self.assertEqual(b, b1)
                self.check_trainer_state_are_the_same(state, state1)

1661
    @require_torch_up_to_2_accelerators
    def test_resume_training_with_gradient_accumulation(self):
        # Resume check with `gradient_accumulation_steps=2`: the resumed run must end with the same weights
        # and trainer state as the uninterrupted run.
        #
        # This test will fail for more than 2 GPUs since the batch size will get bigger and with the number of
        # save_steps, the checkpoint will resume training at epoch 2 or more (so the data seen by the model
        # won't be the same since the training dataloader is shuffled).

        with tempfile.TemporaryDirectory() as tmpdir:
            trainer_kwargs = {
                "output_dir": tmpdir,
                "train_len": 128,
                "gradient_accumulation_steps": 2,
                "per_device_train_batch_size": 4,
                "save_steps": 5,
                "learning_rate": 0.1,
            }

            # Uninterrupted reference run
            reference_trainer = get_regression_trainer(**trainer_kwargs)
            reference_trainer.train()
            a = reference_trainer.model.a.item()
            b = reference_trainer.model.b.item()
            state = dataclasses.asdict(reference_trainer.state)

            checkpoint = os.path.join(tmpdir, "checkpoint-5")

            # Reinitialize trainer
            resumed_trainer = get_regression_trainer(**trainer_kwargs)
            resumed_trainer.train(resume_from_checkpoint=checkpoint)

            a1 = resumed_trainer.model.a.item()
            b1 = resumed_trainer.model.b.item()
            state1 = dataclasses.asdict(resumed_trainer.state)
            self.assertEqual(a, a1)
            self.assertEqual(b, b1)
            self.check_trainer_state_are_the_same(state, state1)

1699
    @require_torch_up_to_2_accelerators
    def test_resume_training_with_frozen_params(self):
        # Resume check with parameter `a` frozen (`requires_grad_(False)`) in both the reference run and the
        # resumed run; `a` must still be frozen after resuming.
        #
        # This test will fail for more than 2 GPUs since the batch size will get bigger and with the number of
        # save_steps, the checkpoint will resume training at epoch 2 or more (so the data seen by the model
        # won't be the same since the training dataloader is shuffled).

        with tempfile.TemporaryDirectory() as tmpdir:
            trainer_kwargs = {
                "output_dir": tmpdir,
                "train_len": 128,
                "per_device_train_batch_size": 4,
                "save_steps": 5,
                "learning_rate": 0.1,
            }

            # Uninterrupted reference run
            reference_trainer = get_regression_trainer(**trainer_kwargs)
            reference_trainer.model.a.requires_grad_(False)
            reference_trainer.train()
            a = reference_trainer.model.a.item()
            b = reference_trainer.model.b.item()
            state = dataclasses.asdict(reference_trainer.state)

            checkpoint = os.path.join(tmpdir, "checkpoint-5")

            # Reinitialize trainer
            resumed_trainer = get_regression_trainer(**trainer_kwargs)
            resumed_trainer.model.a.requires_grad_(False)

            resumed_trainer.train(resume_from_checkpoint=checkpoint)

            self.assertFalse(resumed_trainer.model.a.requires_grad)
            a1 = resumed_trainer.model.a.item()
            b1 = resumed_trainer.model.b.item()
            state1 = dataclasses.asdict(resumed_trainer.state)
            self.assertEqual(a, a1)
            self.assertEqual(b, b1)
            self.check_trainer_state_are_the_same(state, state1)
1738

1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
    def test_load_best_model_at_end(self):
        # Trains with `load_best_model_at_end=True` in four configurations (best eval loss, best accuracy with
        # step-based saving, best accuracy with epoch-based saving, non-PreTrainedModel) and checks the saved
        # checkpoints and the model loaded at the end of training.
        total = int(self.n_epochs * 64 / self.batch_size)
        # Evaluate and save every 5 steps
        every_five_steps = {"eval_steps": 5, "evaluation_strategy": "steps", "save_steps": 5}

        # Best model selected on the evaluation loss
        with tempfile.TemporaryDirectory() as tmpdir:
            trainer = get_regression_trainer(
                a=1.5,
                b=2.5,
                output_dir=tmpdir,
                learning_rate=0.1,
                load_best_model_at_end=True,
                **every_five_steps,
            )
            self.assertFalse(trainer.args.greater_is_better)
            trainer.train()
            self.check_saved_checkpoints(tmpdir, 5, total)
            self.check_best_model_has_been_loaded(tmpdir, 5, total, trainer, "eval_loss")

        # Best model selected on accuracy, step-based saving
        with tempfile.TemporaryDirectory() as tmpdir:
            trainer = get_regression_trainer(
                a=1.5,
                b=2.5,
                output_dir=tmpdir,
                learning_rate=0.1,
                load_best_model_at_end=True,
                metric_for_best_model="accuracy",
                compute_metrics=AlmostAccuracy(),
                **every_five_steps,
            )
            self.assertTrue(trainer.args.greater_is_better)
            trainer.train()
            self.check_saved_checkpoints(tmpdir, 5, total)
            self.check_best_model_has_been_loaded(tmpdir, 5, total, trainer, "eval_accuracy", greater_is_better=True)

        # Best model selected on accuracy, epoch-based saving
        with tempfile.TemporaryDirectory() as tmpdir:
            trainer = get_regression_trainer(
                a=1.5,
                b=2.5,
                output_dir=tmpdir,
                learning_rate=0.1,
                evaluation_strategy="epoch",
                save_strategy="epoch",
                load_best_model_at_end=True,
                metric_for_best_model="accuracy",
                compute_metrics=AlmostAccuracy(),
            )
            self.assertTrue(trainer.args.greater_is_better)
            trainer.train()
            steps_per_epoch = 64 // self.batch_size
            self.check_saved_checkpoints(tmpdir, steps_per_epoch, total)
            self.check_best_model_has_been_loaded(
                tmpdir, steps_per_epoch, total, trainer, "eval_accuracy", greater_is_better=True
            )

        # Test this works with a non PreTrainedModel
        with tempfile.TemporaryDirectory() as tmpdir:
            trainer = get_regression_trainer(
                output_dir=tmpdir,
                learning_rate=0.1,
                load_best_model_at_end=True,
                pretrained=False,
                **every_five_steps,
            )
            self.assertFalse(trainer.args.greater_is_better)
            trainer.train()
            self.check_saved_checkpoints(tmpdir, 5, total, is_pretrained=False)
            self.check_best_model_has_been_loaded(tmpdir, 5, total, trainer, "eval_loss", is_pretrained=False)

1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
    @require_safetensors
    def test_load_best_model_from_safetensors(self):
        # Runs the best-model-at-end check for every combination of `save_safetensors` and `pretrained`.
        total = int(self.n_epochs * 64 / self.batch_size)
        for save_safetensors in [False, True]:
            for pretrained in [False, True]:
                # Keyword arguments shared by both checkpoint checks below
                check_kwargs = {"is_pretrained": pretrained, "safe_weights": save_safetensors}
                with tempfile.TemporaryDirectory() as tmpdir:
                    trainer = get_regression_trainer(
                        a=1.5,
                        b=2.5,
                        output_dir=tmpdir,
                        learning_rate=0.1,
                        eval_steps=5,
                        evaluation_strategy="steps",
                        save_steps=5,
                        load_best_model_at_end=True,
                        save_safetensors=save_safetensors,
                        pretrained=pretrained,
                    )
                    self.assertFalse(trainer.args.greater_is_better)
                    trainer.train()
                    self.check_saved_checkpoints(tmpdir, 5, total, **check_kwargs)
                    self.check_best_model_has_been_loaded(tmpdir, 5, total, trainer, "eval_loss", **check_kwargs)

1834
    @slow
    def test_trainer_eval_mrpc(self):
        # Evaluates a fine-tuned MRPC checkpoint on the MRPC fixture samples and expects an eval loss below 0.2.
        checkpoint_name = "bert-base-cased-finetuned-mrpc"
        tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
        model = AutoModelForSequenceClassification.from_pretrained(checkpoint_name)

        fixture_dir = f"{get_tests_dir()}/fixtures/tests_samples/MRPC"
        data_args = GlueDataTrainingArguments(task_name="mrpc", data_dir=fixture_dir, overwrite_cache=True)
        dev_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="dev")

        cpu_args = TrainingArguments(output_dir="./examples", use_cpu=True)
        metrics = Trainer(model=model, args=cpu_args, eval_dataset=dev_dataset).evaluate()
        self.assertLess(metrics["eval_loss"], 0.2)
Julien Chaumond's avatar
Julien Chaumond committed
1848

1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
    @slow
    def test_trainer_eval_multiple(self):
        # Evaluates on a dict of two named datasets and checks that a loss is reported under each name.
        checkpoint_name = "gpt2"
        tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
        model = AutoModelForCausalLM.from_pretrained(checkpoint_name)
        dataset = LineByLineTextDataset(
            tokenizer=tokenizer,
            file_path=PATH_SAMPLE_TEXT,
            block_size=tokenizer.max_len_single_sentence,
        )
        # Use the inputs themselves as labels
        for sample in dataset.examples:
            sample["labels"] = sample["input_ids"]

        eval_args = TrainingArguments(output_dir="./examples", use_cpu=True, per_device_eval_batch_size=1)
        # The same dataset is registered under both names
        named_datasets = dict.fromkeys(("data1", "data2"), dataset)
        trainer = Trainer(model=model, args=eval_args, eval_dataset=named_datasets)

        metrics = trainer.evaluate()
        self.assertIn("eval_data1_loss", metrics)
        self.assertIn("eval_data2_loss", metrics)

1878
    @slow
    def test_trainer_eval_lm(self):
        # Builds a line-by-line dataset from the sample text and checks that it contains 31 examples.
        tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")
        dataset_kwargs = {
            "tokenizer": tokenizer,
            "file_path": PATH_SAMPLE_TEXT,
            "block_size": tokenizer.max_len_single_sentence,
        }
        dataset = LineByLineTextDataset(**dataset_kwargs)
        self.assertEqual(len(dataset), 31)
1888

1889
    def test_training_iterable_dataset(self):
        # Trains for 4 steps on an iterable dataset and checks the step count and the kind of dataloader
        # and sampler the trainer builds for it.
        model = RegressionPreTrainedModel(RegressionModelConfig())
        # Adding one column not used by the model should have no impact
        iterable_train_set = SampleIterableDataset(label_names=["labels", "extra"])

        training_args = RegressionTrainingArguments(output_dir="./examples", max_steps=4)
        trainer = Trainer(model=model, args=training_args, train_dataset=iterable_train_set)
        trainer.train()
        self.assertEqual(trainer.state.global_step, 4)

        train_loader = trainer.get_train_dataloader()
        self.assertIsInstance(train_loader, torch.utils.data.DataLoader)
        self.assertIsInstance(train_loader.sampler, torch.utils.data.dataloader._InfiniteConstantSampler)

1904
1905
1906
    def test_evaluation_iterable_dataset(self):
        # Evaluates a model with a=1.5, b=2.5 on iterable datasets and compares the reported loss and accuracy
        # with values recomputed from the dataset as `1.5 * x + 2.5`.
        model = RegressionPreTrainedModel(RegressionModelConfig(a=1.5, b=2.5))
        # Adding one column not used by the model should have no impact
        eval_dataset = SampleIterableDataset(label_names=["labels", "extra"])

        args = RegressionTrainingArguments(output_dir="./examples")
        trainer = Trainer(model=model, args=args, eval_dataset=eval_dataset, compute_metrics=AlmostAccuracy())

        def check_metrics(metrics, iterable_dataset):
            # Recompute the expected metrics from the data wrapped by the iterable dataset.
            inputs, targets = iterable_dataset.dataset.x, iterable_dataset.dataset.ys[0]
            predictions = 1.5 * inputs + 2.5
            expected_loss = ((predictions - targets) ** 2).mean()
            self.assertAlmostEqual(metrics["eval_loss"], expected_loss)
            expected_acc = AlmostAccuracy()((predictions, targets))["accuracy"]
            self.assertAlmostEqual(metrics["eval_accuracy"], expected_acc)

        results = trainer.evaluate()
        check_metrics(results, trainer.eval_dataset)

        # With a number of elements not a round multiple of the batch size
        eval_dataset = SampleIterableDataset(length=66)
        results = trainer.evaluate(eval_dataset)
        check_metrics(results, eval_dataset)

    def test_predict_iterable_dataset(self):
        # Predicts with a model with a=1.5, b=2.5 on iterable datasets and compares the predictions with
        # `1.5 * x + 2.5` computed from the dataset inputs.
        model = RegressionPreTrainedModel(RegressionModelConfig(a=1.5, b=2.5))
        eval_dataset = SampleIterableDataset()

        args = RegressionTrainingArguments(output_dir="./examples")
        trainer = Trainer(model=model, args=args, eval_dataset=eval_dataset, compute_metrics=AlmostAccuracy())

        predictions = trainer.predict(trainer.eval_dataset).predictions
        inputs = eval_dataset.dataset.x
        self.assertTrue(np.allclose(predictions, 1.5 * inputs + 2.5))

        # With a number of elements not a round multiple of the batch size
        # Adding one column not used by the model should have no impact
        test_dataset = SampleIterableDataset(length=66, label_names=["labels", "extra"])
        predictions = trainer.predict(test_dataset).predictions
        inputs = test_dataset.dataset.x
        self.assertTrue(np.allclose(predictions, 1.5 * inputs + 2.5))
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964

    def test_num_train_epochs_in_training(self):
        # Settings shared by both runs: 64 samples in batches of 16 with 5 gradient accumulation steps.
        small_epoch_kwargs = {"train_len": 64, "per_device_train_batch_size": 16, "gradient_accumulation_steps": 5}

        # len(train_dl) < gradient_accumulation_steps shouldn't give ``ZeroDivisionError`` when ``max_steps`` is given.
        # It should give 1 update step for each epoch.
        bounded_trainer = get_regression_trainer(max_steps=3, **small_epoch_kwargs)
        bounded_output = bounded_trainer.train()
        self.assertEqual(bounded_output.global_step, 3)

        # Even ``max_steps`` is not specified, we still expect 1 update step for each epoch if
        # len(train_dl) < gradient_accumulation_steps.
        unbounded_trainer = get_regression_trainer(**small_epoch_kwargs)
        unbounded_output = unbounded_trainer.train()
        self.assertEqual(unbounded_output.global_step, int(self.n_epochs))
Marcin Zab艂ocki's avatar
Marcin Zab艂ocki committed
1965

1966
1967
    def test_early_stopping_callback(self):
        # early stopping stops training before num_training_epochs
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(
                output_dir=tmp_dir,
                num_train_epochs=20,
                gradient_accumulation_steps=1,
                per_device_train_batch_size=16,
                load_best_model_at_end=True,
                evaluation_strategy=IntervalStrategy.EPOCH,
                save_strategy=IntervalStrategy.EPOCH,
                compute_metrics=AlmostAccuracy(),
                metric_for_best_model="accuracy",
            )
            trainer.add_callback(EarlyStoppingCallback(1, 0.0001))
            train_output = trainer.train()
            # 20 epochs * 64 samples / batch size 16 is the step count of a run that is never stopped early
            self.assertLess(train_output.global_step, 20 * 64 / 16)

        # Invalid inputs to trainer with early stopping callback result in assertion error
        # (this trainer is built without `load_best_model_at_end`)
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(
                output_dir=tmp_dir,
                num_train_epochs=20,
                gradient_accumulation_steps=1,
                per_device_train_batch_size=16,
                evaluation_strategy=IntervalStrategy.EPOCH,
                compute_metrics=AlmostAccuracy(),
                metric_for_best_model="accuracy",
            )
            trainer.add_callback(EarlyStoppingCallback(1))
            self.assertEqual(trainer.state.global_step, 0)
            # `assertRaises` makes the test fail when no AssertionError is raised; a bare try/except would
            # silently pass in that case.
            with self.assertRaises(AssertionError):
                trainer.train()
            # No training step must have been taken before the error
            self.assertEqual(trainer.state.global_step, 0)
2001

Marcin Zab艂ocki's avatar
Marcin Zab艂ocki committed
2002
2003
2004
2005
    def test_flos_extraction(self):
        # Checks that the underlying model and its `total_flos` config value can be reached through a plain
        # model and through a DataParallel wrapper, and that training records `total_flos` as a float.
        trainer = get_regression_trainer(learning_rate=0.1)

        def assert_flos_extraction(trainer, wrapped_model_to_check):
            # Unwrap once and reuse the result for both checks
            unwrapped = unwrap_model(wrapped_model_to_check)
            self.assertEqual(trainer.model, unwrapped)
            # A config without `total_flos` counts as 0
            self.assertGreaterEqual(getattr(unwrapped.config, "total_flos", 0), 0)

        # with plain model
        assert_flos_extraction(trainer, trainer.model)

        # with enforced DataParallel
        assert_flos_extraction(trainer, nn.DataParallel(trainer.model))

        trainer.train()
        # assertIsInstance reports the actual type on failure, unlike assertTrue(isinstance(...))
        self.assertIsInstance(trainer.state.total_flos, float)

2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
    def check_checkpoint_deletion(self, trainer, output_dir, expected):
        # Creates checkpoint folders for steps 5 to 25, lets the trainer rotate them, and checks that the step
        # numbers of the folders left in `output_dir` are exactly those in `expected`.

        # Make fake checkpoints
        for step in (5, 10, 15, 20, 25):
            fake_checkpoint = os.path.join(output_dir, f"{PREFIX_CHECKPOINT_DIR}-{step}")
            os.makedirs(fake_checkpoint, exist_ok=True)
        trainer._rotate_checkpoints(output_dir=output_dir)

        # Collect the step number of every checkpoint folder that survived the rotation
        step_pattern = f".*{PREFIX_CHECKPOINT_DIR}-([0-9]+)"
        remaining_steps = set()
        for checkpoint_path in Path(output_dir).glob(f"{PREFIX_CHECKPOINT_DIR}-*"):
            remaining_steps.add(int(re.match(step_pattern, str(checkpoint_path)).group(1)))
        self.assertSetEqual(remaining_steps, set(expected))

    def test_checkpoint_rotation(self):
        """
        Run `check_checkpoint_deletion` for several `save_total_limit` / `load_best_model_at_end` combinations
        and check which of the fake checkpoints (steps 5 to 25) are expected to remain in each case.
        """
        with tempfile.TemporaryDirectory() as tmp_dir:
            # Without best model at end
            trainer = get_regression_trainer(output_dir=tmp_dir, save_total_limit=2)
            self.check_checkpoint_deletion(trainer, tmp_dir, [20, 25])

            # With best model at end
            trainer = get_regression_trainer(
                output_dir=tmp_dir, evaluation_strategy="steps", load_best_model_at_end=True, save_total_limit=2
            )
            trainer.state.best_model_checkpoint = os.path.join(tmp_dir, "checkpoint-5")
            self.check_checkpoint_deletion(trainer, tmp_dir, [5, 25])

            # Edge case: we don't always honor save_total_limit=1 if load_best_model_at_end=True to be able to resume
            # from checkpoint
            trainer = get_regression_trainer(
                output_dir=tmp_dir, evaluation_strategy="steps", load_best_model_at_end=True, save_total_limit=1
            )
            trainer.state.best_model_checkpoint = os.path.join(tmp_dir, "checkpoint-25")
            self.check_checkpoint_deletion(trainer, tmp_dir, [25])

            trainer.state.best_model_checkpoint = os.path.join(tmp_dir, "checkpoint-5")
            self.check_checkpoint_deletion(trainer, tmp_dir, [5, 25])

2051
2052
2053
2054
    def check_mem_metrics(self, trainer, check_func):
        """
        Run train, evaluate and predict on `trainer` and apply `check_func(key, metrics)` to every memory
        metric key of each stage.

        `check_func` is an assertion taking `(key, metrics)`, e.g. `self.assertIn` or `self.assertNotIn`.
        The `*_gpu_*` keys are only checked when `backend_device_count(torch_device)` reports at least one device.
        """
        has_accelerator = backend_device_count(torch_device) > 0

        metrics = trainer.train().metrics
        check_func("init_mem_cpu_alloc_delta", metrics)
        check_func("train_mem_cpu_alloc_delta", metrics)
        if has_accelerator:
            check_func("init_mem_gpu_alloc_delta", metrics)
            check_func("train_mem_gpu_alloc_delta", metrics)

        metrics = trainer.evaluate()
        check_func("eval_mem_cpu_alloc_delta", metrics)
        if has_accelerator:
            check_func("eval_mem_gpu_alloc_delta", metrics)

        metrics = trainer.predict(RegressionDataset()).metrics
        check_func("test_mem_cpu_alloc_delta", metrics)
        if has_accelerator:
            check_func("test_mem_gpu_alloc_delta", metrics)

    def test_mem_metrics(self):
        """Memory metrics must be reported when `skip_memory_metrics=False` and absent when it is `True`."""
        # with mem metrics enabled
        trainer = get_regression_trainer(skip_memory_metrics=False)
        self.check_mem_metrics(trainer, self.assertIn)

        # with mem metrics disabled
        trainer = get_regression_trainer(skip_memory_metrics=True)
        self.check_mem_metrics(trainer, self.assertNotIn)

2078
    @require_torch_accelerator
    def test_fp16_full_eval(self):
        """
        Compare the accelerator memory deltas reported by `evaluate()` with and without `fp16_full_eval`:
        the fp32 run is expected to allocate the model at init time, the fp16 run at eval time and at about
        half the size.
        """
        # this is a sensitive test so let's keep debugging printouts in place for quick diagnosis.
        # it's using pretty large safety margins, but small enough to detect broken functionality.
        debug = 0
        n_gpus = backend_device_count(torch_device)

        bs = 8
        eval_len = 16 * n_gpus
        # make the params somewhat big so that there will be enough RAM consumed to be able to
        # measure things. We should get about 64KB for a+b in fp32
        a = torch.ones(1000, bs) + 0.001
        b = torch.ones(1000, bs) - 0.001

        # 1. with fp16_full_eval disabled
        trainer = get_regression_trainer(a=a, b=b, eval_len=eval_len, skip_memory_metrics=False)
        metrics = trainer.evaluate()
        # drop the fp32 trainer before the second measurement
        del trainer
        gc.collect()

        fp32_init = metrics["init_mem_gpu_alloc_delta"]
        fp32_eval = metrics["eval_mem_gpu_alloc_delta"]

        if debug:
            print(f"fp32_init {fp32_init}")
            print(f"fp32_eval {fp32_eval}")

        # here we expect the model to be preloaded in trainer.__init__ and consume around 64K gpu ram.
        # perfect world: fp32_init == 64<<10
        self.assertGreater(fp32_init, 59_000)
        # after eval should be no extra memory allocated - with a small margin (other than the peak
        # memory consumption for the forward calculation that gets recovered)
        # perfect world: fp32_eval == close to zero
        self.assertLess(fp32_eval, 5_000)

        # 2. with fp16_full_eval enabled
        trainer = get_regression_trainer(a=a, b=b, eval_len=eval_len, fp16_full_eval=True, skip_memory_metrics=False)
        metrics = trainer.evaluate()
        fp16_init = metrics["init_mem_gpu_alloc_delta"]
        fp16_eval = metrics["eval_mem_gpu_alloc_delta"]

        if debug:
            print(f"fp16_init {fp16_init}")
            print(f"fp16_eval {fp16_eval}")

        # here we expect the model to not be preloaded in trainer.__init__, so with a small margin it should be close to 0
        # perfect world: fp16_init == close to zero
        self.assertLess(fp16_init, 5_000)
        # here we put the model on device in eval and only `half()` of it, i.e. about 32K,(again we ignore the peak margin which gets returned back)
        # perfect world: fp16_eval == 32<<10
        self.assertGreater(fp16_eval, 27_000)

        # 3. relative comparison fp32 vs full fp16
        # fp16_eval should be about half of fp32_init
        # perfect world: fp32_init/2 == fp16_eval
        self.assertAlmostEqual(fp16_eval, fp32_init / 2, delta=5_000)

2135
2136
    @require_torch_non_multi_gpu
    @require_torchdynamo
    @require_torch_tensorrt_fx
    def test_torchdynamo_full_eval(self):
        """
        Evaluate the same regression setup without TorchDynamo and with the "eager", "nvfuser" and "fx2trt"
        `torchdynamo` settings, and check that each one reports the same `eval_loss` as the baseline.
        """
        import torchdynamo

        # torchdynamo at the moment doesn't support DP/DDP, therefore require a single gpu
        n_gpus = get_gpu_count()

        bs = 8
        eval_len = 16 * n_gpus
        # make the params somewhat big so that there will be enough RAM consumed to be able to
        # measure things. We should get about 64KB for a+b in fp32
        a = torch.ones(1000, bs) + 0.001
        b = torch.ones(1000, bs) - 0.001

        # 1. Default - without TorchDynamo
        trainer = get_regression_trainer(a=a, b=b, eval_len=eval_len)
        metrics = trainer.evaluate()
        original_eval_loss = metrics["eval_loss"]
        del trainer

        # 2. TorchDynamo eager
        trainer = get_regression_trainer(a=a, b=b, eval_len=eval_len, torchdynamo="eager")
        metrics = trainer.evaluate()
        self.assertAlmostEqual(metrics["eval_loss"], original_eval_loss)
        del trainer
        torchdynamo.reset()

        # 3. TorchDynamo nvfuser
        trainer = get_regression_trainer(a=a, b=b, eval_len=eval_len, torchdynamo="nvfuser")
        metrics = trainer.evaluate()
        self.assertAlmostEqual(metrics["eval_loss"], original_eval_loss)
        torchdynamo.reset()

        # 4. TorchDynamo fx2trt
        trainer = get_regression_trainer(a=a, b=b, eval_len=eval_len, torchdynamo="fx2trt")
        metrics = trainer.evaluate()
        self.assertAlmostEqual(metrics["eval_loss"], original_eval_loss)
        torchdynamo.reset()
2175

2176
    @unittest.skip("torch 2.0.0 gives `ModuleNotFoundError: No module named 'torchdynamo'`.")
    @require_torch_non_multi_gpu
    @require_torchdynamo
    def test_torchdynamo_memory(self):
        """
        Run one training step of a chain of `torch.cos` calls with and without `torchdynamo="nvfuser"` and
        check that the loss is unchanged while the eager peak memory is more than twice the nvfuser one.
        """
        # torchdynamo at the moment doesn't support DP/DDP, therefore require a single gpu
        import torchdynamo

        class CustomTrainer(Trainer):
            def compute_loss(self, model, inputs, return_outputs=False):
                x = inputs["x"]
                output = model(x)
                if self.args.n_gpu == 1:
                    return output.mean()
                return output

        class MyModule(torch.nn.Module):
            """Simple module that does aggressive fusion"""

            def __init__(self):
                super().__init__()

            def forward(self, x):
                for _ in range(20):
                    x = torch.cos(x)
                return x

        mod = MyModule()

        # 1. without TorchDynamo (eager baseline)
        a = torch.ones(1024, 1024, device="cuda", requires_grad=True)
        a.grad = None
        trainer = CustomTrainer(model=mod)
        # warmup
        for _ in range(10):
            orig_loss = trainer.training_step(mod, {"x": a})

        # resets
        gc.collect()
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()

        orig_loss = trainer.training_step(mod, {"x": a})
        orig_peak_mem = torch.cuda.max_memory_allocated()
        torchdynamo.reset()
        del trainer

        # 2. TorchDynamo nvfuser
        a = torch.ones(1024, 1024, device="cuda", requires_grad=True)
        a.grad = None
        args = TrainingArguments(output_dir="None", torchdynamo="nvfuser")
        trainer = CustomTrainer(model=mod, args=args)
        # warmup
        for _ in range(10):
            loss = trainer.training_step(mod, {"x": a})

        # resets
        gc.collect()
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()

        loss = trainer.training_step(mod, {"x": a})
        peak_mem = torch.cuda.max_memory_allocated()
        torchdynamo.reset()
        del trainer

        # Functional check
        self.assertAlmostEqual(loss, orig_loss)

        # AOT Autograd recomputation and nvfuser recomputation optimization
        # aggressively fuses the operations and reduce the memory footprint.
        self.assertGreater(orig_peak_mem, peak_mem * 2)

2248
2249
    @require_torch_accelerator
    @require_torch_bf16
    def test_bf16_full_eval(self):
        """
        Compare the accelerator memory deltas reported by `evaluate()` with and without `bf16_full_eval`:
        the fp32 run is expected to allocate the model at init time, the bf16 run at eval time and at about
        half the size.
        """
        # note: most of the logic is the same as test_fp16_full_eval

        # this is a sensitive test so let's keep debugging printouts in place for quick diagnosis.
        # it's using pretty large safety margins, but small enough to detect broken functionality.
        debug = 0
        n_gpus = backend_device_count(torch_device)

        bs = 8
        eval_len = 16 * n_gpus
        # make the params somewhat big so that there will be enough RAM consumed to be able to
        # measure things. We should get about 64KB for a+b in fp32
        a = torch.ones(1000, bs) + 0.001
        b = torch.ones(1000, bs) - 0.001

        # 1. with bf16_full_eval disabled
        trainer = get_regression_trainer(a=a, b=b, eval_len=eval_len, skip_memory_metrics=False)
        metrics = trainer.evaluate()
        # drop the fp32 trainer before the second measurement
        del trainer
        gc.collect()

        fp32_init = metrics["init_mem_gpu_alloc_delta"]
        fp32_eval = metrics["eval_mem_gpu_alloc_delta"]

        if debug:
            print(f"fp32_init {fp32_init}")
            print(f"fp32_eval {fp32_eval}")

        # here we expect the model to be preloaded in trainer.__init__ and consume around 64K gpu ram.
        # perfect world: fp32_init == 64<<10
        self.assertGreater(fp32_init, 59_000)
        # after eval should be no extra memory allocated - with a small margin (other than the peak
        # memory consumption for the forward calculation that gets recovered)
        # perfect world: fp32_eval == close to zero
        self.assertLess(fp32_eval, 5_000)

        # 2. with bf16_full_eval enabled
        trainer = get_regression_trainer(a=a, b=b, eval_len=eval_len, bf16_full_eval=True, skip_memory_metrics=False)
        metrics = trainer.evaluate()
        bf16_init = metrics["init_mem_gpu_alloc_delta"]
        bf16_eval = metrics["eval_mem_gpu_alloc_delta"]

        if debug:
            print(f"bf16_init {bf16_init}")
            print(f"bf16_eval {bf16_eval}")

        # here we expect the model to not be preloaded in trainer.__init__, so with a small margin it should be close to 0
        # perfect world: bf16_init == close to zero
        self.assertLess(bf16_init, 5_000)
        # here we put the model on device in eval and only a half-size copy of it, i.e. about 32K,(again we ignore the peak margin which gets returned back)
        # perfect world: bf16_eval == 32<<10
        self.assertGreater(bf16_eval, 27_000)

        # 3. relative comparison fp32 vs full bf16
        # bf16_eval should be about half of fp32_init
        # perfect world: fp32_init/2 == bf16_eval
        self.assertAlmostEqual(bf16_eval, fp32_init / 2, delta=5_000)

2308
    def test_no_wd_param_group(self):
        """
        Check how the optimizer splits parameters: param group 0 must hold exactly the linear weights listed
        in `wd_names`, and param group 1 every other parameter of the model.
        """
        model = nn.Sequential(TstLayer(128), nn.ModuleList([TstLayer(128), TstLayer(128)]))
        trainer = Trainer(model=model)
        trainer.create_optimizer_and_scheduler(10)
        wd_names = ['0.linear1.weight', '0.linear2.weight', '1.0.linear1.weight', '1.0.linear2.weight', '1.1.linear1.weight', '1.1.linear2.weight']  # fmt: skip
        wd_params = [p for n, p in model.named_parameters() if n in wd_names]
        no_wd_params = [p for n, p in model.named_parameters() if n not in wd_names]
        self.assertListEqual(trainer.optimizer.param_groups[0]["params"], wd_params)
        self.assertListEqual(trainer.optimizer.param_groups[1]["params"], no_wd_params)

2318
    @slow
    @require_torch_multi_accelerator
    def test_end_to_end_example(self):
        """Launch the `run_translation.py` example through `accelerate launch` and expect it to finish cleanly."""
        # Tests that `translation.py` will run without issues
        script_path = os.path.abspath(
            os.path.join(
                os.path.dirname(__file__), "..", "..", "examples", "pytorch", "translation", "run_translation.py"
            )
        )

        with tempfile.TemporaryDirectory() as tmpdir:
            command = [
                "accelerate",
                "launch",
                script_path,
                "--model_name_or_path",
                "t5-small",
                "--per_device_train_batch_size",
                "1",
                "--output_dir",
                tmpdir,
                "--overwrite_output_dir",
                "--do_train",
                "--max_train_samples",
                "64",
                "--num_train_epochs",
                "1",
                "--dataset_name",
                "wmt16",
                "--dataset_config",
                "ro-en",
                "--source_lang",
                "en",
                "--target_lang",
                "ro",
                "--do_predict",
                "--max_predict_samples",
                "64",
                "--predict_with_generate",
                "--ddp_timeout",
                "60",
            ]
            execute_subprocess_async(command)
            # successful return here == success - any errors would have caused an error or a timeout in the sub-call

2363

Sylvain Gugger's avatar
Sylvain Gugger committed
2364
2365
2366
2367
2368
@require_torch
@is_staging_test
class TrainerIntegrationWithHubTester(unittest.TestCase):
    """Staging-hub tests for the `push_to_hub` features of the trainer."""

    @classmethod
    def setUpClass(cls):
        """Store the staging token on the class and save it through `HfFolder`."""
        cls._token = TOKEN
        HfFolder.save_token(TOKEN)

    @classmethod
    def tearDownClass(cls):
        """Delete every repo the tests may have created; failures to delete are ignored on purpose."""
        for model in ["test-trainer", "test-trainer-epoch", "test-trainer-step", "test-trainer-tensorboard"]:
            try:
                delete_repo(token=cls._token, repo_id=model)
            except HTTPError:
                # best-effort cleanup: the repo may never have been created
                pass

        try:
            delete_repo(token=cls._token, repo_id="valid_org/test-trainer-org")
        except HTTPError:
            pass

    def test_push_to_hub(self):
        """Push a regression model under the user namespace and reload it from the hub."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(
                output_dir=os.path.join(tmp_dir, "test-trainer"),
                push_to_hub=True,
                hub_token=self._token,
            )
            url = trainer.push_to_hub()

            # Extract repo_name from the url
            re_search = re.search(ENDPOINT_STAGING + r"/([^/]+/[^/]+)/", url)
            self.assertIsNotNone(re_search)
            repo_name = re_search.groups()[0]

            self.assertEqual(repo_name, f"{USER}/test-trainer")

            model = RegressionPreTrainedModel.from_pretrained(repo_name)
            self.assertEqual(model.a.item(), trainer.model.a.item())
            self.assertEqual(model.b.item(), trainer.model.b.item())

    def test_push_to_hub_in_organization(self):
        """Push a regression model to `valid_org/test-trainer-org` and reload it from the hub."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(output_dir=tmp_dir)
            trainer.save_model()
            trainer = get_regression_trainer(
                output_dir=os.path.join(tmp_dir, "test-trainer-org"),
                push_to_hub=True,
                hub_model_id="valid_org/test-trainer-org",
                hub_token=self._token,
            )
            url = trainer.push_to_hub()

            # Extract repo_name from the url
            re_search = re.search(ENDPOINT_STAGING + r"/([^/]+/[^/]+)/", url)
            self.assertIsNotNone(re_search)
            repo_name = re_search.groups()[0]
            self.assertEqual(repo_name, "valid_org/test-trainer-org")

            model = RegressionPreTrainedModel.from_pretrained("valid_org/test-trainer-org")
            self.assertEqual(model.a.item(), trainer.model.a.item())
            self.assertEqual(model.b.item(), trainer.model.b.item())

    def get_commit_history(self, repo):
        """Run `git log` inside `repo` and return the stripped text of every second blank-line-separated chunk."""
        commit_logs = subprocess.run(
            "git log".split(),
            stderr=subprocess.PIPE,
            stdout=subprocess.PIPE,
            check=True,
            encoding="utf-8",
            cwd=repo,
        ).stdout
        # presumably the odd chunks are the commit messages (headers and messages alternate) - TODO confirm
        commits = commit_logs.split("\n\n")[1::2]
        return [commit.strip() for commit in commits]

    def test_push_to_hub_with_saves_each_epoch(self):
        """With `save_strategy="epoch"`, the repo must hold the initial commit plus one commit for epochs 1 to 3."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(
                output_dir=os.path.join(tmp_dir, "test-trainer-epoch"),
                push_to_hub=True,
                hub_token=self._token,
                # To avoid any flakiness if the training goes faster than the uploads.
                hub_always_push=True,
                save_strategy="epoch",
            )
            trainer.train()

        commits = list_repo_commits(f"{USER}/test-trainer-epoch", token=self._token)
        commits = [c.title for c in commits]
        self.assertIn("initial commit", commits)
        for i in range(1, 4):
            self.assertIn(f"Training in progress, epoch {i}", commits)

    def test_push_to_hub_with_saves_each_n_steps(self):
        """With `save_steps=5`, the repo must hold the initial commit plus one commit per multiple of 5 steps."""
        num_gpus = max(1, backend_device_count(torch_device))
        # NOTE(review): the test silently passes on more than 2 devices - reason not visible here, confirm.
        if num_gpus > 2:
            return

        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(
                output_dir=os.path.join(tmp_dir, "test-trainer-step"),
                push_to_hub=True,
                hub_token=self._token,
                # To avoid any flakiness if the training goes faster than the uploads.
                hub_always_push=True,
                save_strategy="steps",
                save_steps=5,
            )
            trainer.train()

        commits = list_repo_commits(f"{USER}/test-trainer-step", token=self._token)
        commits = [c.title for c in commits]
        self.assertIn("initial commit", commits)

        # max_steps depend on the number of available GPUs
        max_steps = math.ceil(trainer.args.num_train_epochs * len(trainer.get_train_dataloader()))
        for i in range(5, max_steps, 5):
            self.assertIn(f"Training in progress, step {i}", commits)

    @require_tensorboard
    def test_push_to_hub_with_tensorboard_logs(self):
        """After training with tensorboard reporting and pushing, a tfevents file under `runs` must be in the repo."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(
                output_dir=os.path.join(tmp_dir, "test-trainer-tensorboard"),
                hub_token=self._token,
                save_strategy="epoch",
                report_to=["tensorboard"],
                keep_report_to=True,
            )
            trainer.train()
            # Push the runs via `push_to_hub()`
            trainer.push_to_hub()

        files = list_repo_files(f"{USER}/test-trainer-tensorboard", token=self._token)
        found_log = any("runs" in f and "events.out.tfevents" in f for f in files)

        # a unittest assertion (unlike a bare `assert`) is not stripped when Python runs with -O
        self.assertTrue(found_log, "No tensorboard log found in repo")

Sylvain Gugger's avatar
Sylvain Gugger committed
2505

2506
2507
@require_torch
@require_optuna
class TrainerHyperParameterOptunaIntegrationTest(unittest.TestCase):
    """Runs a small hyperparameter search with the default backend, drawing the model values from the trial."""

    def setUp(self):
        args = TrainingArguments("..")
        self.n_epochs = args.num_train_epochs
        self.batch_size = args.train_batch_size

    def test_hyperparameter_search(self):
        """Run a 4-trial "minimize" search; the test passes when the search completes without raising."""

        class MyTrialShortNamer(TrialShortNamer):
            DEFAULTS = {"a": 0, "b": 0}

        def hp_space(trial):
            # nothing is sampled here: `a` and `b` are suggested inside `model_init`
            return {}

        def model_init(trial):
            if trial is not None:
                a = trial.suggest_int("a", -4, 4)
                b = trial.suggest_int("b", -4, 4)
            else:
                a = 0
                b = 0
            config = RegressionModelConfig(a=a, b=b, double_output=False)

            return RegressionPreTrainedModel(config)

        def hp_name(trial):
            return MyTrialShortNamer.shortname(trial.params)

        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(
                output_dir=tmp_dir,
                learning_rate=0.1,
                logging_steps=1,
                evaluation_strategy=IntervalStrategy.EPOCH,
                save_strategy=IntervalStrategy.EPOCH,
                num_train_epochs=4,
                disable_tqdm=True,
                load_best_model_at_end=True,
                logging_dir="runs",
                run_name="test",
                model_init=model_init,
            )
            trainer.hyperparameter_search(direction="minimize", hp_space=hp_space, hp_name=hp_name, n_trials=4)
2550
2551


2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
@require_torch
@require_optuna
class TrainerHyperParameterMultiObjectOptunaIntegrationTest(unittest.TestCase):
    """Runs a small two-objective hyperparameter search (eval loss and eval accuracy)."""

    def setUp(self):
        default_args = TrainingArguments("..")
        self.n_epochs = default_args.num_train_epochs
        self.batch_size = default_args.train_batch_size

    def test_hyperparameter_search(self):
        """Run a 4-trial search minimizing `eval_loss` and maximizing `eval_accuracy`; it must not raise."""

        class MyTrialShortNamer(TrialShortNamer):
            DEFAULTS = {"a": 0, "b": 0}

        def hp_space(trial):
            # nothing is sampled here: `a` and `b` are suggested inside `model_init`
            return {}

        def model_init(trial):
            a = b = 0
            if trial is not None:
                a = trial.suggest_int("a", -4, 4)
                b = trial.suggest_int("b", -4, 4)
            return RegressionPreTrainedModel(RegressionModelConfig(a=a, b=b, double_output=False))

        def hp_name(trial):
            return MyTrialShortNamer.shortname(trial.params)

        def compute_objective(metrics: Dict[str, float]) -> List[float]:
            loss = metrics["eval_loss"]
            accuracy = metrics["eval_accuracy"]
            return loss, accuracy

        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(
                output_dir=tmp_dir,
                learning_rate=0.1,
                logging_steps=1,
                evaluation_strategy=IntervalStrategy.EPOCH,
                save_strategy=IntervalStrategy.EPOCH,
                num_train_epochs=10,
                disable_tqdm=True,
                load_best_model_at_end=True,
                logging_dir="runs",
                run_name="test",
                model_init=model_init,
                compute_metrics=AlmostAccuracy(),
            )
            search_kwargs = {
                "direction": ["minimize", "maximize"],
                "hp_space": hp_space,
                "hp_name": hp_name,
                "n_trials": 4,
                "compute_objective": compute_objective,
            }
            trainer.hyperparameter_search(**search_kwargs)


2608
2609
2610
2611
@require_torch
@require_ray
class TrainerHyperParameterRayIntegrationTest(unittest.TestCase):
    """Runs a small hyperparameter search with the "ray" backend, directly and through a Ray client server."""

    def setUp(self):
        args = TrainingArguments("..")
        self.n_epochs = args.num_train_epochs
        self.batch_size = args.train_batch_size

    def ray_hyperparameter_search(self):
        """Shared body of the tests: run a 4-trial "minimize" search with `backend="ray"`."""

        class MyTrialShortNamer(TrialShortNamer):
            DEFAULTS = {"a": 0, "b": 0}

        def hp_space(trial):
            from ray import tune

            return {
                "a": tune.randint(-4, 4),
                "b": tune.randint(-4, 4),
            }

        def model_init(config):
            if config is None:
                a = 0
                b = 0
            else:
                a = config["a"]
                b = config["b"]
            model_config = RegressionModelConfig(a=a, b=b, double_output=False)

            return RegressionPreTrainedModel(model_config)

        def hp_name(params):
            return MyTrialShortNamer.shortname(params)

        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(
                output_dir=tmp_dir,
                learning_rate=0.1,
                logging_steps=1,
                evaluation_strategy=IntervalStrategy.EPOCH,
                save_strategy=IntervalStrategy.EPOCH,
                num_train_epochs=4,
                disable_tqdm=True,
                load_best_model_at_end=True,
                logging_dir="runs",
                run_name="test",
                model_init=model_init,
            )
            trainer.hyperparameter_search(
                direction="minimize", hp_space=hp_space, hp_name=hp_name, backend="ray", n_trials=4
            )

    def test_hyperparameter_search(self):
        self.ray_hyperparameter_search()

    def test_hyperparameter_search_ray_client(self):
        import ray
        from ray.util.client.ray_client_helpers import ray_start_client_server

        with ray_start_client_server():
            # a unittest assertion (unlike a bare `assert`) is not stripped when Python runs with -O
            self.assertTrue(ray.util.client.ray.is_connected())
            self.ray_hyperparameter_search()
2670
2671


2672
@slow
@require_torch
@require_sigopt
class TrainerHyperParameterSigOptIntegrationTest(unittest.TestCase):
    """Runs a small hyperparameter search with the "sigopt" backend."""

    def setUp(self):
        args = TrainingArguments("..")
        self.n_epochs = args.num_train_epochs
        self.batch_size = args.train_batch_size

    def test_hyperparameter_search(self):
        """Run a 4-trial "minimize" search over integer `a` and `b` in [-4, 4]; it must not raise."""

        class MyTrialShortNamer(TrialShortNamer):
            DEFAULTS = {"a": 0, "b": 0}

        def hp_space(trial):
            return [
                {"bounds": {"min": -4, "max": 4}, "name": "a", "type": "int"},
                {"bounds": {"min": -4, "max": 4}, "name": "b", "type": "int"},
            ]

        def model_init(trial):
            if trial is not None:
                a = trial.assignments["a"]
                b = trial.assignments["b"]
            else:
                a = 0
                b = 0
            config = RegressionModelConfig(a=a, b=b, double_output=False)

            return RegressionPreTrainedModel(config)

        def hp_name(trial):
            return MyTrialShortNamer.shortname(trial.assignments)

        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(
                output_dir=tmp_dir,
                learning_rate=0.1,
                logging_steps=1,
                evaluation_strategy=IntervalStrategy.EPOCH,
                save_strategy=IntervalStrategy.EPOCH,
                num_train_epochs=4,
                disable_tqdm=True,
                load_best_model_at_end=True,
                logging_dir="runs",
                run_name="test",
                model_init=model_init,
            )
            trainer.hyperparameter_search(
                direction="minimize", hp_space=hp_space, hp_name=hp_name, backend="sigopt", n_trials=4
            )
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731


# Cases for `TrainerOptimizerChoiceTest.test_optim_supported`. Each entry is a tuple of
# (training arguments, expected optimizer class, expected optimizer kwargs). Entries that rely on an
# optional backend are only added when that backend reports itself as available.
optim_test_params = []
if is_torch_available():
    # Expected kwargs are read from the class-level defaults of `TrainingArguments`.
    default_adam_kwargs = dict(
        betas=(TrainingArguments.adam_beta1, TrainingArguments.adam_beta2),
        eps=TrainingArguments.adam_epsilon,
        lr=TrainingArguments.learning_rate,
    )

    default_lion_kwargs = dict(
        betas=(TrainingArguments.adam_beta1, TrainingArguments.adam_beta2),
        lr=TrainingArguments.learning_rate,
    )

    default_anyprecision_kwargs = dict(
        use_kahan_summation=False,
        momentum_dtype=torch.float32,
        variance_dtype=torch.float32,
        compensation_buffer_dtype=torch.bfloat16,
    )

    # Optimizers that need nothing beyond torch / transformers.
    optim_test_params = [
        (
            TrainingArguments(optim=OptimizerNames.ADAMW_HF, output_dir="None"),
            transformers.optimization.AdamW,
            default_adam_kwargs,
        ),
        (
            # Same optimizer, selected through the enum's string value.
            TrainingArguments(optim=OptimizerNames.ADAMW_HF.value, output_dir="None"),
            transformers.optimization.AdamW,
            default_adam_kwargs,
        ),
        (
            TrainingArguments(optim=OptimizerNames.ADAMW_TORCH, output_dir="None"),
            torch.optim.AdamW,
            default_adam_kwargs,
        ),
        (
            TrainingArguments(optim=OptimizerNames.ADAFACTOR, output_dir="None"),
            transformers.optimization.Adafactor,
            dict(scale_parameter=False, relative_step=False, lr=TrainingArguments.learning_rate),
        ),
    ]

    if is_apex_available():
        import apex

        optim_test_params.append(
            (
                TrainingArguments(optim=OptimizerNames.ADAMW_APEX_FUSED, output_dir="None"),
                apex.optimizers.FusedAdam,
                default_adam_kwargs,
            )
        )

    if is_bitsandbytes_available():
        import bitsandbytes as bnb

        # AdamW variants, all expected to resolve to `bnb.optim.AdamW`.
        optim_test_params.extend(
            (TrainingArguments(optim=optim_name, output_dir="None"), bnb.optim.AdamW, default_adam_kwargs)
            for optim_name in (
                OptimizerNames.ADAMW_BNB,
                OptimizerNames.ADAMW_8BIT,
                OptimizerNames.PAGED_ADAMW,
                OptimizerNames.PAGED_ADAMW_8BIT,
            )
        )

        # Lion variants, all expected to resolve to `bnb.optim.Lion`.
        optim_test_params.extend(
            (TrainingArguments(optim=optim_name, output_dir="None"), bnb.optim.Lion, default_lion_kwargs)
            for optim_name in (
                OptimizerNames.LION,
                OptimizerNames.LION_8BIT,
                OptimizerNames.PAGED_LION_8BIT,
            )
        )

    if is_torchdistx_available():
        import torchdistx

        optim_test_params.append(
            (
                TrainingArguments(optim=OptimizerNames.ADAMW_ANYPRECISION, output_dir="None"),
                torchdistx.optimizers.AnyPrecisionAdamW,
                {**default_adam_kwargs, **default_anyprecision_kwargs},
            )
        )

2852
2853
2854

@require_torch
class TrainerOptimizerChoiceTest(unittest.TestCase):
    """Checks that each `--optim` choice resolves to the expected optimizer class and kwargs.

    `Trainer.get_optimizer_cls_and_kwargs` only has to return the optimizer class, it never
    instantiates it. The optional backends (apex, bitsandbytes, torchdistx) can therefore be replaced
    by mocks registered in `sys.modules`, which lets these tests run without the backends installed.
    """

    def check_optim_and_kwargs(self, training_args: TrainingArguments, expected_cls, expected_kwargs):
        """Assert that `training_args` resolves to `expected_cls` with (at least) `expected_kwargs`.

        Only the keys listed in `expected_kwargs` are compared; any additional kwargs returned by the
        trainer are ignored.
        """
        actual_cls, optim_kwargs = Trainer.get_optimizer_cls_and_kwargs(training_args)
        self.assertEqual(expected_cls, actual_cls)
        self.assertIsNotNone(optim_kwargs)

        for p, v in expected_kwargs.items():
            self.assertIn(p, optim_kwargs)
            actual_v = optim_kwargs[p]
            self.assertEqual(actual_v, v, f"Failed check for {p}. Expected {v}, but got {actual_v}.")

    def _check_with_mocked_backend(self, optim_name, package, submodule, cls_name, expected_kwargs):
        """Check `optim_name` while `package` and `package.submodule` are mocks in `sys.modules`.

        The expected optimizer class is the mock attribute `package.submodule.cls_name`.
        """
        backend = Mock()
        mocked_submodule = getattr(backend, submodule)
        mocked_cls = getattr(mocked_submodule, cls_name)
        modules = {
            package: backend,
            f"{package}.{submodule}": mocked_submodule,
            f"{package}.{submodule}.{cls_name}": mocked_cls,
        }
        with patch.dict("sys.modules", modules):
            self.check_optim_and_kwargs(
                TrainingArguments(optim=optim_name, output_dir="None"),
                mocked_cls,
                expected_kwargs,
            )

    def _check_raises_without_module(self, optim_name, module_name):
        """Assert that resolving `optim_name` raises `ValueError` when `module_name` cannot be imported."""
        args = TrainingArguments(optim=optim_name, output_dir="None")

        # A `None` entry in `sys.modules` makes importing that module fail even if it is installed.
        with patch.dict("sys.modules", {module_name: None}):
            with self.assertRaises(ValueError):
                Trainer.get_optimizer_cls_and_kwargs(args)

    @parameterized.expand(optim_test_params, skip_on_empty=True)
    def test_optim_supported(self, training_args: TrainingArguments, expected_cls, expected_kwargs):
        # exercises all the valid --optim options
        self.check_optim_and_kwargs(training_args, expected_cls, expected_kwargs)

        # Also make sure a trainer built from the same arguments can actually train.
        trainer = get_regression_trainer(**training_args.to_dict())
        trainer.train()

    def test_fused_adam(self):
        # Pretend that apex is installed and that apex.optimizers.FusedAdam exists, so the test runs
        # without requiring an apex installation.
        self._check_with_mocked_backend(
            OptimizerNames.ADAMW_APEX_FUSED, "apex", "optimizers", "FusedAdam", default_adam_kwargs
        )

    def test_fused_adam_no_apex(self):
        # Pretend that apex.optimizers cannot be imported, even if apex is installed.
        self._check_raises_without_module(OptimizerNames.ADAMW_APEX_FUSED, "apex.optimizers")

    def test_bnb_adam8bit(self):
        # Pretend that bitsandbytes is installed and that bnb.optim.AdamW exists, so the test runs
        # without requiring a bnb installation.
        self._check_with_mocked_backend(
            OptimizerNames.ADAMW_BNB, "bitsandbytes", "optim", "AdamW", default_adam_kwargs
        )

    def test_bnb_paged_adam8bit_alias(self):
        self._check_with_mocked_backend(
            OptimizerNames.ADAMW_8BIT, "bitsandbytes", "optim", "AdamW", default_adam_kwargs
        )

    def test_bnb_paged_adam(self):
        self._check_with_mocked_backend(
            OptimizerNames.PAGED_ADAMW, "bitsandbytes", "optim", "AdamW", default_adam_kwargs
        )

    def test_bnb_paged_adam8bit(self):
        self._check_with_mocked_backend(
            OptimizerNames.PAGED_ADAMW_8BIT, "bitsandbytes", "optim", "AdamW", default_adam_kwargs
        )

    def test_bnb_lion(self):
        self._check_with_mocked_backend(
            OptimizerNames.LION, "bitsandbytes", "optim", "Lion", default_lion_kwargs
        )

    def test_bnb_lion8bit(self):
        self._check_with_mocked_backend(
            OptimizerNames.LION_8BIT, "bitsandbytes", "optim", "Lion", default_lion_kwargs
        )

    def test_bnb_paged_lion8bit(self):
        self._check_with_mocked_backend(
            OptimizerNames.PAGED_LION_8BIT, "bitsandbytes", "optim", "Lion", default_lion_kwargs
        )

    def test_bnb_paged_lion(self):
        self._check_with_mocked_backend(
            OptimizerNames.PAGED_LION, "bitsandbytes", "optim", "Lion", default_lion_kwargs
        )

    def test_bnb_adam8bit_no_bnb(self):
        # Pretend that bitsandbytes.optim cannot be imported, even if bnb is installed.
        self._check_raises_without_module(OptimizerNames.ADAMW_BNB, "bitsandbytes.optim")

    def test_bnb_paged_adam_no_bnb(self):
        # Pretend that bitsandbytes.optim cannot be imported, even if bnb is installed.
        self._check_raises_without_module(OptimizerNames.PAGED_ADAMW, "bitsandbytes.optim")

    def test_bnb_paged_adam8bit_no_bnb(self):
        # Pretend that bitsandbytes.optim cannot be imported, even if bnb is installed.
        self._check_raises_without_module(OptimizerNames.PAGED_ADAMW_8BIT, "bitsandbytes.optim")

    def test_bnb_paged_lion_no_bnb(self):
        # Pretend that bitsandbytes.optim cannot be imported, even if bnb is installed.
        self._check_raises_without_module(OptimizerNames.PAGED_LION, "bitsandbytes.optim")

    def test_bnb_paged_lion8bit_no_bnb(self):
        # Pretend that bitsandbytes.optim cannot be imported, even if bnb is installed.
        self._check_raises_without_module(OptimizerNames.PAGED_LION_8BIT, "bitsandbytes.optim")

    def test_anyprecision_adamw(self):
        # Pretend that torchdistx is installed and that torchdistx.optimizers.AnyPrecisionAdamW exists,
        # so the test runs without requiring a torchdistx installation.
        self._check_with_mocked_backend(
            OptimizerNames.ADAMW_ANYPRECISION,
            "torchdistx",
            "optimizers",
            "AnyPrecisionAdamW",
            dict(default_adam_kwargs, **default_anyprecision_kwargs),
        )

    def test_no_torchdistx_anyprecision_adamw(self):
        # Pretend that torchdistx.optimizers cannot be imported, even if torchdistx is installed.
        self._check_raises_without_module(OptimizerNames.ADAMW_ANYPRECISION, "torchdistx.optimizers")

3088
3089
3090
3091
3092

@require_torch
@require_wandb
class TrainerHyperParameterWandbIntegrationTest(unittest.TestCase):
    """Integration test running `Trainer.hyperparameter_search` with the `"wandb"` backend."""

    def setUp(self):
        # Keep the epoch count and train batch size of a bare `TrainingArguments` on the test case.
        default_args = TrainingArguments("..")
        self.n_epochs = default_args.num_train_epochs
        self.batch_size = default_args.train_batch_size

    def test_hyperparameter_search(self):
        # Runs a 4-trial random sweep over the regression model parameters `a` and `b`.
        class MyTrialShortNamer(TrialShortNamer):
            DEFAULTS = {"a": 0, "b": 0}

        def hp_space(trial):
            swept_parameters = {
                "a": {"distribution": "uniform", "min": 1e-6, "max": 1e-4},
                "b": {"distribution": "int_uniform", "min": 1, "max": 6},
            }
            return {"method": "random", "metric": {}, "parameters": swept_parameters}

        def model_init(config):
            # Without a config, fall back to a = b = 0.
            a, b = (0, 0) if config is None else (config["a"], config["b"])

            return RegressionPreTrainedModel(RegressionModelConfig(a=a, b=b, double_output=False))

        def hp_name(params):
            return MyTrialShortNamer.shortname(params)

        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer_kwargs = {
                "output_dir": tmp_dir,
                "learning_rate": 0.1,
                "logging_steps": 1,
                "evaluation_strategy": IntervalStrategy.EPOCH,
                "save_strategy": IntervalStrategy.EPOCH,
                "num_train_epochs": 4,
                "disable_tqdm": True,
                "load_best_model_at_end": True,
                "logging_dir": "runs",
                "run_name": "test",
                "model_init": model_init,
            }
            trainer = get_regression_trainer(**trainer_kwargs)
            trainer.hyperparameter_search(
                direction="minimize", hp_space=hp_space, hp_name=hp_name, backend="wandb", n_trials=4, anonymous="must"
            )
3142
3143
3144
3145
3146
3147
3148
3149


class HyperParameterSearchBackendsTest(unittest.TestCase):
    """Keeps the backend registry aligned with the `HPSearchBackend` enum."""

    def test_hyperparameter_search_backends(self):
        # The registry keys must equal the enum members, in the same order.
        registered_backends = list(ALL_HYPERPARAMETER_SEARCH_BACKENDS.keys())
        declared_backends = list(HPSearchBackend)
        self.assertEqual(registered_backends, declared_backends)