# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
import os
import re
import sys
import unittest
from pathlib import Path
from typing import Tuple
from unittest.mock import patch

from parameterized import parameterized

from transformers.testing_utils import (
    CaptureStderr,
    ExtendSysPath,
    TestCasePlus,
    execute_subprocess_async,
    get_gpu_count,
    get_torch_dist_unique_port,
    require_apex,
    require_bitsandbytes,
    require_fairscale,
    require_torch,
    require_torch_gpu,
    require_torch_multi_gpu,
    require_torch_non_multi_gpu,
    slow,
)
from transformers.trainer_callback import TrainerState
from transformers.trainer_utils import set_seed


bindir = os.path.abspath(os.path.dirname(__file__))
with ExtendSysPath(f"{bindir}/../../examples/pytorch/translation"):
    from run_translation import main  # noqa


set_seed(42)
MARIAN_MODEL = "sshleifer/student_marian_en_ro_6_1"
MBART_TINY = "sshleifer/tiny-mbart"


@require_torch
class TestTrainerExt(TestCasePlus):
    def run_seq2seq_quick(
        self,
        distributed=False,
        extra_args_str=None,
        predict_with_generate=True,
        do_train=True,
        do_eval=True,
        do_predict=True,
    ):
        output_dir = self.run_trainer(
            eval_steps=1,
            max_len=12,
            model_name=MBART_TINY,
            num_train_epochs=1,
            distributed=distributed,
            extra_args_str=extra_args_str,
            predict_with_generate=predict_with_generate,
            do_train=do_train,
            do_eval=do_eval,
            do_predict=do_predict,
        )
        logs = TrainerState.load_from_json(os.path.join(output_dir, "trainer_state.json")).log_history
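        # a rough sketch of what an eval entry in log_history looks like (illustrative
        # shape only - the exact keys depend on the run and on predict_with_generate):
        #   {"epoch": 1.0, "step": 1, "eval_loss": 4.1, "eval_bleu": 23.7}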

        if not do_eval:
            return

        eval_metrics = [log for log in logs if "eval_loss" in log.keys()]

        first_step_stats = eval_metrics[0]
        if predict_with_generate:
            assert "eval_bleu" in first_step_stats

            last_step_stats = eval_metrics[-1]
            assert isinstance(last_step_stats["eval_bleu"], float)
            assert not math.isnan(float(last_step_stats["eval_loss"])), "eval_loss must not be `nan`"

    @require_torch_non_multi_gpu
    def test_run_seq2seq_no_dist(self):
        self.run_seq2seq_quick()

    # verify that the trainer can handle non-distributed with n_gpu > 1
    @require_torch_multi_gpu
    def test_run_seq2seq_dp(self):
        self.run_seq2seq_quick(distributed=False)

    # verify that the trainer can handle distributed with n_gpu > 1
    @require_torch_multi_gpu
    def test_run_seq2seq_ddp(self):
        self.run_seq2seq_quick(distributed=True)

    # test --sharded_ddp w/o --fp16
    @unittest.skip("Requires an update of the env running those tests")
    @require_torch_multi_gpu
    @require_fairscale
    def test_run_seq2seq_sharded_ddp(self):
        self.run_seq2seq_quick(distributed=True, extra_args_str="--sharded_ddp simple")

    # test --sharded_ddp w/ --fp16
    @unittest.skip("Requires an update of the env running those tests")
    @require_torch_multi_gpu
    @require_fairscale
    def test_run_seq2seq_sharded_ddp_fp16(self):
        self.run_seq2seq_quick(distributed=True, extra_args_str="--sharded_ddp simple --fp16")

    # test --sharded_ddp zero_dp_2 w/o --fp16
    @unittest.skip("Requires an update of the env running those tests")
    @require_torch_multi_gpu
    @require_fairscale
    def test_run_seq2seq_fully_sharded_ddp(self):
        self.run_seq2seq_quick(distributed=True, extra_args_str="--sharded_ddp zero_dp_2", predict_with_generate=False)

    # test --sharded_ddp zero_dp_2 w/ --fp16
    @unittest.skip("Requires an update of the env running those tests")
    @require_torch_multi_gpu
    @require_fairscale
    def test_run_seq2seq_fully_sharded_ddp_fp16(self):
        self.run_seq2seq_quick(
            distributed=True, extra_args_str="--sharded_ddp zero_dp_2 --fp16", predict_with_generate=False
        )

    @require_apex
    @require_torch_gpu
    def test_run_seq2seq_apex(self):
        # XXX: apex breaks the trainer if run_seq2seq.main() is called twice from the same
        # program, and it also breaks other tests that run from the same pytest worker.
        # Until this is sorted out it must be run in an external program, hence
        # distributed=True in this test, and only on one or more gpus - if we ever want
        # cpu we will need a special test.
        #
        # specifically, the problem was traced to self.optimizer.step() - when it runs a
        # 2nd time via a 2nd main() call it botches the subsequent eval.
        #
        self.run_seq2seq_quick(distributed=True, extra_args_str="--fp16 --fp16_backend=apex")
        # test a 2nd time - this used to produce `eval_loss: nan`
        # to reproduce the problem set distributed=False
        self.run_seq2seq_quick(distributed=True, extra_args_str="--fp16 --fp16_backend=apex")

    @parameterized.expand(["base", "low", "high", "mixed"])
    @require_torch_multi_gpu
    def test_trainer_log_level_replica(self, experiment_id):
        # as each sub-test is slow-ish, split into multiple sub-tests to avoid CI timeout
        experiments = {
            # test with the default log_level - should be info and thus log info once
            "base": {"extra_args_str": "", "n_matches": 1},
            # test with low log_level and log_level_replica - should be noisy on all processes
            # now the info string should appear twice on 2 processes
            "low": {"extra_args_str": "--log_level debug --log_level_replica debug", "n_matches": 2},
            # test with high log_level and low log_level_replica
            # now the info string should appear once only on the replica
            "high": {"extra_args_str": "--log_level error --log_level_replica debug", "n_matches": 1},
            # test with high log_level and log_level_replica - should be quiet on all processes
            "mixed": {"extra_args_str": "--log_level error --log_level_replica error", "n_matches": 0},
        }

        data = experiments[experiment_id]
        kwargs = {"distributed": True, "predict_with_generate": False, "do_eval": False, "do_predict": False}
        log_info_string = "Running training"
        with CaptureStderr() as cl:
            self.run_seq2seq_quick(**kwargs, extra_args_str=data["extra_args_str"])
        n_matches = len(re.findall(log_info_string, cl.err))
        self.assertEqual(n_matches, data["n_matches"])

    @slow
    def test_run_seq2seq(self):
        output_dir = self.run_trainer(
            eval_steps=2,
            max_len=128,
            model_name=MARIAN_MODEL,
            learning_rate=3e-4,
            num_train_epochs=10,
            distributed=False,
        )

        # Check metrics
        logs = TrainerState.load_from_json(os.path.join(output_dir, "trainer_state.json")).log_history
        eval_metrics = [log for log in logs if "eval_loss" in log.keys()]
        first_step_stats = eval_metrics[0]
        last_step_stats = eval_metrics[-1]

        assert first_step_stats["eval_loss"] > last_step_stats["eval_loss"], "model learned nothing"
        assert isinstance(last_step_stats["eval_bleu"], float)

        # test if do_predict saves generations and metrics
        contents = os.listdir(output_dir)
        contents = {os.path.basename(p) for p in contents}
        assert "generated_predictions.txt" in contents
        assert "predict_results.json" in contents

    @slow
    @require_bitsandbytes
    def test_run_seq2seq_bnb(self):
        from transformers.training_args import OptimizerNames

        def train_and_return_metrics(optim: str) -> Tuple[int, int, float]:
            extra_args = "--skip_memory_metrics 0"

            output_dir = self.run_trainer(
                max_len=128,
                model_name=MARIAN_MODEL,
                learning_rate=3e-4,
                num_train_epochs=1,
                optim=optim,
                distributed=True,  # force run in a new process
                extra_args_str=extra_args,
                do_eval=False,
                do_predict=False,
                n_gpus_to_use=1,  # to allow deterministic fixed memory usage
            )

            # Check metrics
            logs = TrainerState.load_from_json(Path(output_dir, "trainer_state.json")).log_history
            gpu_peak_mem_mb = int(logs[0]["train_mem_gpu_peaked_delta"] / 2**20)
            gpu_alloc_mem_mb = int(logs[0]["train_mem_gpu_alloc_delta"] / 2**20)

            loss = logs[0]["train_loss"]
            return gpu_peak_mem_mb, gpu_alloc_mem_mb, loss

        gpu_peak_mem_orig, gpu_alloc_mem_orig, loss_orig = train_and_return_metrics(OptimizerNames.ADAMW_TORCH.value)
        gpu_peak_mem_bnb, gpu_alloc_mem_bnb, loss_bnb = train_and_return_metrics(OptimizerNames.ADAMW_BNB.value)

        gpu_alloc_mem_diff = gpu_alloc_mem_orig - gpu_alloc_mem_bnb

        gpu_total_mem_orig = gpu_peak_mem_orig + gpu_alloc_mem_orig
        gpu_total_mem_bnb = gpu_peak_mem_bnb + gpu_alloc_mem_bnb
        gpu_total_mem_diff = gpu_total_mem_orig - gpu_total_mem_bnb

        # sshleifer/student_marian_en_ro_6_1 has 54M parameters, 29M of which are `nn.Embedding`,
        # which doesn't get quantized and remains in fp32. Therefore we only have 25M parameters
        # quantized in 2 bytes, and the diff in optim memory usage is derived as follows:
        #
        # - normal 25*8=~200MB (8 bytes per param)
        # - bnb    25*2= ~50MB (2 bytes per param)
        #
        # Thus we should expect ~150MB total memory saved.
        #
        # Peak memory should be the same - the total should differ by about that same margin.
        #
        # After leaving a small margin to accommodate differences between gpus let's check
        # that we have at least 120MB in savings
        expected_savings = 120
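        # the arithmetic above as a quick sketch (illustrative only; the parameter counts
        # are the approximate figures quoted above, not values read from the model):
        #   quantized_params = 54e6 - 29e6                 # ~25M non-embedding params
        #   adamw_torch_mb = quantized_params * 8 / 2**20  # ~190MB of optimizer state
        #   adamw_bnb_mb = quantized_params * 2 / 2**20    # ~48MB of optimizer state
        #   savings_mb = adamw_torch_mb - adamw_bnb_mb     # ~143MB, hence the 120MB bar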

        # uncomment the following if this test starts failing - requires py38 for a new print feature
        # gpu_peak_mem_diff = gpu_peak_mem_orig - gpu_peak_mem_bnb
        # print(f"{gpu_alloc_mem_orig=}MB {gpu_peak_mem_orig=}MB {gpu_alloc_mem_orig+gpu_peak_mem_orig=}MB")
        # print(f" {gpu_alloc_mem_bnb=}MB  {gpu_peak_mem_bnb=}MB  {gpu_alloc_mem_bnb+gpu_peak_mem_bnb=}MB")
        # print(f"{gpu_alloc_mem_diff=}MB")
        # print(f"{gpu_peak_mem_diff=}MB")
        # print(f"{gpu_total_mem_orig=}MB, {gpu_total_mem_bnb=}MB")
        # print(f"{gpu_total_mem_diff=}MB, {gpu_total_mem_diff=}MB")

        self.assertGreater(
            gpu_alloc_mem_diff,
            expected_savings,
            "should use ~150MB less alloc gpu memory with BNB, compared to without it for this model but got"
            f" a difference of {gpu_alloc_mem_diff}MB, with gpu_alloc_mem_orig={gpu_alloc_mem_orig}MB and"
            f" gpu_alloc_mem_bnb={gpu_alloc_mem_bnb}MB",
        )

        self.assertGreater(
            gpu_total_mem_diff,
            expected_savings,
            "should use ~150MB less total gpu memory with BNB, compared to without it for this model but got"
            f" a difference of {gpu_total_mem_diff}MB, with gpu_total_mem_orig={gpu_total_mem_orig}MB and"
            f" gpu_total_mem_bnb={gpu_total_mem_bnb}MB",
        )

        self.assertEqual(
            loss_orig, loss_bnb, f"loss should be the same, but got loss_orig={loss_orig}, loss_bnb={loss_bnb}"
        )

    def run_trainer(
        self,
        max_len: int,
        model_name: str,
        num_train_epochs: int,
        learning_rate: float = 3e-3,
        optim: str = "adafactor",
        distributed: bool = False,
        extra_args_str: str = None,
        eval_steps: int = 0,
        predict_with_generate: bool = True,
        do_train: bool = True,
        do_eval: bool = True,
        do_predict: bool = True,
        n_gpus_to_use: int = None,
    ):
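        # assembles a run_translation.py command line from the options above and runs it
        # either in-process (by patching sys.argv) or as a distributed subprocess,
        # returning the output dir that holds trainer_state.json and the metrics files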
        data_dir = self.test_file_dir / "../fixtures/tests_samples/wmt_en_ro"
        output_dir = self.get_auto_remove_tmp_dir()
        args_train = f"""
            --model_name_or_path {model_name}
            --train_file {data_dir}/train.json
            --validation_file {data_dir}/val.json
            --test_file {data_dir}/test.json
            --output_dir {output_dir}
            --overwrite_output_dir
            --max_train_samples 8
            --max_source_length {max_len}
            --max_target_length {max_len}
            --do_train
            --num_train_epochs {str(num_train_epochs)}
            --per_device_train_batch_size 4
            --learning_rate {learning_rate}
            --warmup_steps 8
            --logging_steps 0
            --logging_strategy no
            --save_steps {str(eval_steps)}
            --group_by_length
            --label_smoothing_factor 0.1
            --target_lang ro_RO
            --source_lang en_XX
        """.split()

        args_eval = f"""
            --do_eval
            --per_device_eval_batch_size 4
            --max_eval_samples 8
            --val_max_target_length {max_len}
            --evaluation_strategy steps
            --eval_steps {str(eval_steps)}
        """.split()

        args_predict = """
            --do_predict
        """.split()

        args = []
        if do_train:
            args += args_train

        if do_eval:
            args += args_eval

        if do_predict:
            args += args_predict

        if predict_with_generate:
            args += "--predict_with_generate".split()

        if do_train:
            if optim == "adafactor":
                args += "--adafactor".split()
            else:
                args += f"--optim {optim}".split()

        if extra_args_str is not None:
            args += extra_args_str.split()

        if distributed:
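            # spawn a fresh subprocess via torch.distributed.launch, one process per gpu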
            if n_gpus_to_use is None:
                n_gpus_to_use = get_gpu_count()
            master_port = get_torch_dist_unique_port()
            distributed_args = f"""
                -m torch.distributed.launch
                --nproc_per_node={n_gpus_to_use}
                --master_port={master_port}
                {self.examples_dir_str}/pytorch/translation/run_translation.py
            """.split()
            cmd = [sys.executable] + distributed_args + args
            # keep for quick debug
            # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
            execute_subprocess_async(cmd, env=self.get_env())
        else:
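            # run in-process, pretending the assembled args came from the command line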
            testargs = ["run_translation.py"] + args
            with patch.object(sys, "argv", testargs):
                main()

        return output_dir