# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
import os
import re
import sys
from pathlib import Path
from typing import Tuple
from unittest.mock import patch

from parameterized import parameterized

from transformers.testing_utils import (
    CaptureStderr,
    ExtendSysPath,
    TestCasePlus,
    backend_device_count,
    execute_subprocess_async,
    get_torch_dist_unique_port,
    require_apex,
    require_bitsandbytes,
    require_torch,
    require_torch_gpu,
    require_torch_multi_accelerator,
    require_torch_non_multi_accelerator,
    slow,
    torch_device,
)
from transformers.trainer_callback import TrainerState
from transformers.trainer_utils import set_seed


bindir = os.path.abspath(os.path.dirname(__file__))
with ExtendSysPath(f"{bindir}/../../examples/pytorch/translation"):
    from run_translation import main  # noqa


set_seed(42)
MARIAN_MODEL = "sshleifer/student_marian_en_ro_6_1"
MBART_TINY = "sshleifer/tiny-mbart"


@require_torch
class TestTrainerExt(TestCasePlus):
    def run_seq2seq_quick(
        self,
        distributed=False,
        extra_args_str=None,
        predict_with_generate=True,
        do_train=True,
        do_eval=True,
        do_predict=True,
        n_gpus_to_use=None,
    ):
        output_dir = self.run_trainer(
            eval_steps=1,
            max_len=12,
            model_name=MBART_TINY,
            num_train_epochs=1,
            distributed=distributed,
            extra_args_str=extra_args_str,
            predict_with_generate=predict_with_generate,
            do_train=do_train,
            do_eval=do_eval,
            do_predict=do_predict,
            n_gpus_to_use=n_gpus_to_use,
        )
        logs = TrainerState.load_from_json(os.path.join(output_dir, "trainer_state.json")).log_history
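        # log_history holds one dict per logging/eval event, e.g. {"eval_loss": ..., "eval_bleu": ...}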

        if not do_eval:
            return

        eval_metrics = [log for log in logs if "eval_loss" in log.keys()]

        first_step_stats = eval_metrics[0]
        if predict_with_generate:
            assert "eval_bleu" in first_step_stats

            last_step_stats = eval_metrics[-1]
            assert isinstance(last_step_stats["eval_bleu"], float)
            assert not math.isnan(float(last_step_stats["eval_loss"])), "eval_loss must not be `nan`"

    @require_torch_non_multi_accelerator
    def test_run_seq2seq_no_dist(self):
        self.run_seq2seq_quick()

    # verify that the trainer can handle non-distributed with n_gpu > 1
    @require_torch_multi_accelerator
    def test_run_seq2seq_dp(self):
        self.run_seq2seq_quick(distributed=False)

    # verify that the trainer can handle distributed with n_gpu > 1
    @require_torch_multi_accelerator
    def test_run_seq2seq_ddp(self):
        self.run_seq2seq_quick(distributed=True)

    @require_apex
    @require_torch_gpu
    def test_run_seq2seq_apex(self):
        # XXX: apex breaks the trainer if it's run twice, e.g. run_seq2seq.main() called twice from the
        # same program, and it breaks other tests that run from the same pytest worker. Therefore, until
        # this is sorted out, it must be run only in an external program - that is, distributed=True in
        # this test and only under one or more gpus - if we want cpu we will need to make a special test
        #
        # specifically, the problem was traced to self.optimizer.step() - if it's run a 2nd time via a
        # 2nd main() call it botches the future eval.
        #
        self.run_seq2seq_quick(distributed=True, extra_args_str="--fp16 --fp16_backend=apex")
        # test a 2nd time - was getting `eval_loss: nan` here
        # to reproduce the problem set distributed=False
        self.run_seq2seq_quick(distributed=True, extra_args_str="--fp16 --fp16_backend=apex")

    @parameterized.expand(["base", "low", "high", "mixed"])
    @require_torch_multi_accelerator
    def test_trainer_log_level_replica(self, experiment_id):
        # each sub-test is slow-ish, so run them as separate sub-tests to avoid a CI timeout
        experiments = {
            # test with the default log_level - should be info and thus log info once
            "base": {"extra_args_str": "", "n_matches": 1},
            # test with low log_level and log_level_replica - should be noisy on all processes
            # now the info string should appear twice on 2 processes
            "low": {"extra_args_str": "--log_level debug --log_level_replica debug", "n_matches": 2},
            # test with high log_level and low log_level_replica
            # now the info string should appear once only on the replica
            "high": {"extra_args_str": "--log_level error --log_level_replica debug", "n_matches": 1},
            # test with high log_level and log_level_replica - should be quiet on all processes
            "mixed": {"extra_args_str": "--log_level error --log_level_replica error", "n_matches": 0},
        }
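
        # note: on each rank TrainingArguments.get_process_log_level() resolves --log_level for the main
        # process and --log_level_replica for the replicas, so the info line is emitted once per process
        # whose effective verbosity is INFO or lower - hence the 0, 1 or 2 expected matches above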

        data = experiments[experiment_id]
        kwargs = {
            "distributed": True,
            "predict_with_generate": False,
            "do_eval": False,
            "do_predict": False,
            "n_gpus_to_use": 2,
        }
        log_info_string = "Running training"
        with CaptureStderr() as cl:
            self.run_seq2seq_quick(**kwargs, extra_args_str=data["extra_args_str"])
        n_matches = len(re.findall(log_info_string, cl.err))
        self.assertEqual(n_matches, data["n_matches"])

    @slow
    def test_run_seq2seq(self):
        output_dir = self.run_trainer(
            eval_steps=2,
            max_len=128,
            model_name=MARIAN_MODEL,
            learning_rate=3e-4,
            num_train_epochs=10,
            distributed=False,
        )

        # Check metrics
        logs = TrainerState.load_from_json(os.path.join(output_dir, "trainer_state.json")).log_history
        eval_metrics = [log for log in logs if "eval_loss" in log.keys()]
        first_step_stats = eval_metrics[0]
        last_step_stats = eval_metrics[-1]

        assert first_step_stats["eval_loss"] > last_step_stats["eval_loss"], "model learned nothing"
        assert isinstance(last_step_stats["eval_bleu"], float)

        # test if do_predict saves generations and metrics
        contents = os.listdir(output_dir)
        contents = {os.path.basename(p) for p in contents}
        assert "generated_predictions.txt" in contents
        assert "predict_results.json" in contents

    @slow
    @require_bitsandbytes
    def test_run_seq2seq_bnb(self):
        from transformers.training_args import OptimizerNames

        def train_and_return_metrics(optim: str) -> Tuple[int, int, float]:
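            # --skip_memory_metrics 0 re-enables the Trainer's memory tracking (it is skipped by default),
            # which produces the train_mem_gpu_* entries read below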
            extra_args = "--skip_memory_metrics 0"

            output_dir = self.run_trainer(
                max_len=128,
                model_name=MARIAN_MODEL,
                learning_rate=3e-4,
                num_train_epochs=1,
                optim=optim,
                distributed=True,  # force run in a new process
                extra_args_str=extra_args,
                do_eval=False,
                do_predict=False,
                n_gpus_to_use=1,  # to allow deterministic fixed memory usage
            )

            # Check metrics
            logs = TrainerState.load_from_json(Path(output_dir, "trainer_state.json")).log_history
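            # memory deltas reported in trainer_state.json are in bytes; 2**20 converts them to MiB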
            gpu_peak_mem_mb = int(logs[0]["train_mem_gpu_peaked_delta"] / 2**20)
            gpu_alloc_mem_mb = int(logs[0]["train_mem_gpu_alloc_delta"] / 2**20)

            loss = logs[0]["train_loss"]
            return gpu_peak_mem_mb, gpu_alloc_mem_mb, loss

        gpu_peak_mem_orig, gpu_alloc_mem_orig, loss_orig = train_and_return_metrics(OptimizerNames.ADAMW_TORCH.value)
        gpu_peak_mem_bnb, gpu_alloc_mem_bnb, loss_bnb = train_and_return_metrics(OptimizerNames.ADAMW_BNB.value)

        gpu_alloc_mem_diff = gpu_alloc_mem_orig - gpu_alloc_mem_bnb

        gpu_total_mem_orig = gpu_peak_mem_orig + gpu_alloc_mem_orig
        gpu_total_mem_bnb = gpu_peak_mem_bnb + gpu_alloc_mem_bnb
        gpu_total_mem_diff = gpu_total_mem_orig - gpu_total_mem_bnb

        # sshleifer/student_marian_en_ro_6_1 has 54M parameters, 29M of which belong to `nn.Embedding`,
        # which doesn't get quantized and remains in fp32. Therefore we only have 25M parameters quantized
        # in 2 bytes and the diff in optim memory usage is derived as follows:
        #
        # - normal 25*8=~200MB (8 bytes per param)
        # - bnb    25*2= ~50MB (2 bytes per param)
        #
        # Thus we should expect ~150MB total memory saved.
        #
        # Peak memory should be the same - the total should differ by about that same margin.
        #
        # After leaving a small margin to accommodate differences between gpus, let's check
        # that we have at least 120MB in savings.
        expected_savings = 120
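
        # a rough sketch of the derivation above (assumed sizes, not measured values):
        # n_quantized = 25_000_000                    # params actually quantized by bnb
        # adamw_torch_mb = n_quantized * 8 / 2**20    # two fp32 moments per param: ~190MB
        # adamw_bnb_mb = n_quantized * 2 / 2**20      # two int8 moments per param:  ~48MB
        # savings_mb = adamw_torch_mb - adamw_bnb_mb  # ~143MB, hence the 120MB threshold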

        # uncomment the following if this test starts failing - requires py38 for a new print feature
        # gpu_peak_mem_diff = gpu_peak_mem_orig - gpu_peak_mem_bnb
        # print(f"{gpu_alloc_mem_orig=}MB {gpu_peak_mem_orig=}MB {gpu_alloc_mem_orig+gpu_peak_mem_orig=}MB")
        # print(f" {gpu_alloc_mem_bnb=}MB  {gpu_peak_mem_bnb=}MB  {gpu_alloc_mem_bnb+gpu_peak_mem_bnb=}MB")
        # print(f"{gpu_alloc_mem_diff=}MB")
        # print(f"{gpu_peak_mem_diff=}MB")
        # print(f"{gpu_total_mem_orig=}MB, {gpu_total_mem_bnb=}MB")
        # print(f"{gpu_total_mem_diff=}MB")

        self.assertGreater(
            gpu_alloc_mem_diff,
            expected_savings,
            "should use ~150MB less alloc gpu memory with BNB, compared to without it for this model but got"
            f" a difference of {gpu_alloc_mem_diff}MB, with gpu_alloc_mem_orig={gpu_alloc_mem_orig}MB and"
            f" gpu_alloc_mem_bnb={gpu_alloc_mem_bnb}MB",
        )

        self.assertGreater(
            gpu_total_mem_diff,
            expected_savings,
            "should use ~150MB less total gpu memory with BNB, compared to without it for this model but got"
            f" a difference of {gpu_total_mem_diff}MB, with gpu_total_mem_orig={gpu_total_mem_orig}MB and"
            f" gpu_total_mem_bnb={gpu_total_mem_bnb}MB",
        )

        self.assertEqual(
            loss_orig, loss_bnb, f"loss should be the same, but got loss_orig={loss_orig}, loss_bnb={loss_bnb}"
        )

    def run_trainer(
        self,
        max_len: int,
        model_name: str,
        num_train_epochs: int,
        learning_rate: float = 3e-3,
        optim: str = "adafactor",
        distributed: bool = False,
        extra_args_str: str = None,
        eval_steps: int = 0,
        predict_with_generate: bool = True,
        do_train: bool = True,
        do_eval: bool = True,
        do_predict: bool = True,
        n_gpus_to_use: int = None,
    ):
        data_dir = self.test_file_dir / "../fixtures/tests_samples/wmt_en_ro"
        output_dir = self.get_auto_remove_tmp_dir()
        args_train = f"""
            --model_name_or_path {model_name}
            --train_file {data_dir}/train.json
            --validation_file {data_dir}/val.json
            --test_file {data_dir}/test.json
            --output_dir {output_dir}
            --overwrite_output_dir
            --max_train_samples 8
            --max_source_length {max_len}
            --max_target_length {max_len}
            --do_train
            --num_train_epochs {str(num_train_epochs)}
            --per_device_train_batch_size 4
            --learning_rate {learning_rate}
            --warmup_steps 8
            --logging_steps 0
            --logging_strategy no
            --save_steps {str(eval_steps)}
            --group_by_length
            --label_smoothing_factor 0.1
            --target_lang ro_RO
            --source_lang en_XX
        """.split()

        args_eval = f"""
            --do_eval
            --per_device_eval_batch_size 4
            --max_eval_samples 8
            --val_max_target_length {max_len}
            --evaluation_strategy steps
            --eval_steps {str(eval_steps)}
        """.split()

        args_predict = """
            --do_predict
        """.split()

        args = []
        if do_train:
            args += args_train

        if do_eval:
            args += args_eval

        if do_predict:
            args += args_predict

        if predict_with_generate:
            args += "--predict_with_generate".split()

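        # the example script still accepts the legacy `--adafactor` flag; any other optimizer is passed
        # through the generic `--optim <name>` argument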
        if do_train:
            if optim == "adafactor":
                args += "--adafactor".split()
            else:
                args += f"--optim {optim}".split()

        if extra_args_str is not None:
            args += extra_args_str.split()

        if distributed:
            if n_gpus_to_use is None:
                n_gpus_to_use = backend_device_count(torch_device)
            master_port = get_torch_dist_unique_port()
            distributed_args = f"""
                -m torch.distributed.run
                --nproc_per_node={n_gpus_to_use}
                --master_port={master_port}
                {self.examples_dir_str}/pytorch/translation/run_translation.py
            """.split()
            cmd = [sys.executable] + distributed_args + args
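            # the assembled command looks roughly like this (illustrative values):
            #   python -m torch.distributed.run --nproc_per_node=2 --master_port=29500 \
            #       <examples_dir>/pytorch/translation/run_translation.py --model_name_or_path ...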
            # keep for quick debug
            # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
            execute_subprocess_async(cmd, env=self.get_env())
        else:
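            # run in-process by patching sys.argv - faster, and output stays in this pytest worker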
            testargs = ["run_translation.py"] + args
            with patch.object(sys, "argv", testargs):
                main()

        return output_dir