# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
import os
import re
import sys
import unittest
from pathlib import Path
from typing import Tuple
from unittest.mock import patch

from parameterized import parameterized
from transformers.testing_utils import (
    CaptureStderr,
    ExtendSysPath,
    TestCasePlus,
    execute_subprocess_async,
    get_gpu_count,
    get_torch_dist_unique_port,
    require_apex,
    require_bitsandbytes,
    require_fairscale,
    require_torch,
    require_torch_gpu,
    require_torch_multi_gpu,
    require_torch_non_multi_gpu,
    slow,
)
from transformers.trainer_callback import TrainerState
from transformers.trainer_utils import set_seed


bindir = os.path.abspath(os.path.dirname(__file__))
with ExtendSysPath(f"{bindir}/../../examples/pytorch/translation"):
    from run_translation import main  # noqa


set_seed(42)
MARIAN_MODEL = "sshleifer/student_marian_en_ro_6_1"
MBART_TINY = "sshleifer/tiny-mbart"


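# These tests exercise examples/pytorch/translation/run_translation.py end to end through the Trainer.
# The quick tests fine-tune the tiny MBART_TINY checkpoint, while the slow ones use the MARIAN_MODEL
# en-ro checkpoint so that eval loss can actually improve over training.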
@require_torch
class TestTrainerExt(TestCasePlus):
    def run_seq2seq_quick(
        self,
        distributed=False,
        extra_args_str=None,
        predict_with_generate=True,
        do_train=True,
        do_eval=True,
        do_predict=True,
    ):
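        # do a quick 1-epoch fine-tune of the tiny mbart model (optionally in a distributed
        # subprocess) and sanity-check the eval metrics recorded in trainer_state.json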
        output_dir = self.run_trainer(
            eval_steps=1,
            max_len=12,
            model_name=MBART_TINY,
            num_train_epochs=1,
            distributed=distributed,
            extra_args_str=extra_args_str,
            predict_with_generate=predict_with_generate,
            do_train=do_train,
            do_eval=do_eval,
            do_predict=do_predict,
        )
        logs = TrainerState.load_from_json(os.path.join(output_dir, "trainer_state.json")).log_history

        if not do_eval:
            return

        eval_metrics = [log for log in logs if "eval_loss" in log.keys()]

        first_step_stats = eval_metrics[0]
        if predict_with_generate:
            assert "eval_bleu" in first_step_stats

            last_step_stats = eval_metrics[-1]
            assert isinstance(last_step_stats["eval_bleu"], float)
            assert not math.isnan(float(last_step_stats["eval_loss"])), "eval_loss must not be `nan`"

    @require_torch_non_multi_gpu
    def test_run_seq2seq_no_dist(self):
        self.run_seq2seq_quick()

    # verify that the trainer can handle non-distributed with n_gpu > 1
    @require_torch_multi_gpu
    def test_run_seq2seq_dp(self):
        self.run_seq2seq_quick(distributed=False)

    # verify that the trainer can handle distributed with n_gpu > 1
    @require_torch_multi_gpu
    def test_run_seq2seq_ddp(self):
        self.run_seq2seq_quick(distributed=True)

    # test --sharded_ddp w/o --fp16
    @unittest.skip("Requires an update of the env running those tests")
    @require_torch_multi_gpu
    @require_fairscale
    def test_run_seq2seq_sharded_ddp(self):
        self.run_seq2seq_quick(distributed=True, extra_args_str="--sharded_ddp simple")

    # test --sharded_ddp w/ --fp16
    @unittest.skip("Requires an update of the env running those tests")
    @require_torch_multi_gpu
    @require_fairscale
    def test_run_seq2seq_sharded_ddp_fp16(self):
        self.run_seq2seq_quick(distributed=True, extra_args_str="--sharded_ddp simple --fp16")

    # test --sharded_ddp zero_dp_2 w/o --fp16
    @unittest.skip("Requires an update of the env running those tests")
    @require_torch_multi_gpu
    @require_fairscale
    def test_run_seq2seq_fully_sharded_ddp(self):
        self.run_seq2seq_quick(distributed=True, extra_args_str="--sharded_ddp zero_dp_2", predict_with_generate=False)

    # test --sharded_ddp zero_dp_2 w/ --fp16
    @unittest.skip("Requires an update of the env running those tests")
    @require_torch_multi_gpu
    @require_fairscale
    def test_run_seq2seq_fully_sharded_ddp_fp16(self):
        self.run_seq2seq_quick(
            distributed=True, extra_args_str="--sharded_ddp zero_dp_2 --fp16", predict_with_generate=False
        )

    @require_apex
    @require_torch_gpu
    def test_run_seq2seq_apex(self):
        # XXX: apex breaks the trainer if run_seq2seq.main() is run twice from the same program,
        # and it also breaks other tests that run from the same pytest worker. Until this is sorted
        # out it must be run only in an external program, i.e. with distributed=True in this test,
        # and only when one or more gpus are available - a cpu variant would need a special test.
        #
        # Specifically, the problem was traced to self.optimizer.step() - if it's run a 2nd time
        # via a 2nd main() call it botches the subsequent eval.
        #
        self.run_seq2seq_quick(distributed=True, extra_args_str="--fp16 --fp16_backend=apex")
        # run a 2nd time - this used to produce `eval_loss: nan`
        # to reproduce the problem set distributed=False
        self.run_seq2seq_quick(distributed=True, extra_args_str="--fp16 --fp16_backend=apex")

    @parameterized.expand(["base", "low", "high", "mixed"])
    @require_torch_multi_gpu
    def test_trainer_log_level_replica(self, experiment_id):
        # since each sub-test is slow-ish, split into multiple sub-tests to avoid CI timeout
        experiments = dict(
            # test with the default log_level - should be info and thus log info once
            base=dict(extra_args_str="", n_matches=1),
            # test with low log_level and log_level_replica - should be noisy on all processes
            # now the info string should appear twice on 2 processes
            low=dict(extra_args_str="--log_level debug --log_level_replica debug", n_matches=2),
            # test with high log_level and low log_level_replica
            # now the info string should appear once only on the replica
            high=dict(extra_args_str="--log_level error --log_level_replica debug", n_matches=1),
            # test with high log_level and log_level_replica - should be quiet on all processes
            mixed=dict(extra_args_str="--log_level error --log_level_replica error", n_matches=0),
        )
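        # n_matches counts how many times the "Running training" info line shows up in the stderr
        # captured from the multi-process run under the log levels configured above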

        data = experiments[experiment_id]
        kwargs = dict(distributed=True, predict_with_generate=False, do_eval=False, do_predict=False)
        log_info_string = "Running training"
        with CaptureStderr() as cl:
            self.run_seq2seq_quick(**kwargs, extra_args_str=data["extra_args_str"])
        n_matches = len(re.findall(log_info_string, cl.err))
        self.assertEqual(n_matches, data["n_matches"])

    @slow
    def test_run_seq2seq(self):
        output_dir = self.run_trainer(
            eval_steps=2,
            max_len=128,
            model_name=MARIAN_MODEL,
            learning_rate=3e-4,
            num_train_epochs=10,
            distributed=False,
        )

        # Check metrics
        logs = TrainerState.load_from_json(os.path.join(output_dir, "trainer_state.json")).log_history
        eval_metrics = [log for log in logs if "eval_loss" in log.keys()]
        first_step_stats = eval_metrics[0]
        last_step_stats = eval_metrics[-1]

        assert first_step_stats["eval_loss"] > last_step_stats["eval_loss"], "model learned nothing"
        assert isinstance(last_step_stats["eval_bleu"], float)

        # test if do_predict saves generations and metrics
        contents = os.listdir(output_dir)
        contents = {os.path.basename(p) for p in contents}
        assert "generated_predictions.txt" in contents
        assert "predict_results.json" in contents

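    # compare gpu memory usage (and final loss) of a regular torch AdamW run vs the 8-bit bitsandbytes AdamW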
    @slow
    @require_bitsandbytes
    def test_run_seq2seq_bnb(self):
        from transformers.training_args import OptimizerNames

        def train_and_return_metrics(optim: str) -> Tuple[int, int, float]:
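            # train once with the given optimizer and report the gpu memory deltas (in MB) plus the
            # train loss, so the caller can compare the two optimizers on identical runs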
            extra_args = "--skip_memory_metrics 0"

            output_dir = self.run_trainer(
                max_len=128,
                model_name=MARIAN_MODEL,
                learning_rate=3e-4,
                num_train_epochs=1,
                optim=optim,
                distributed=True,  # force run in a new process
                extra_args_str=extra_args,
                do_eval=False,
                do_predict=False,
                n_gpus_to_use=1,  # to allow deterministic fixed memory usage
            )

            # Check metrics
            logs = TrainerState.load_from_json(Path(output_dir, "trainer_state.json")).log_history
            gpu_peak_mem_mb = int(logs[0]["train_mem_gpu_peaked_delta"] / 2**20)
            gpu_alloc_mem_mb = int(logs[0]["train_mem_gpu_alloc_delta"] / 2**20)

            loss = logs[0]["train_loss"]
            return gpu_peak_mem_mb, gpu_alloc_mem_mb, loss

        gpu_peak_mem_orig, gpu_alloc_mem_orig, loss_orig = train_and_return_metrics(OptimizerNames.ADAMW_TORCH.value)
        gpu_peak_mem_bnb, gpu_alloc_mem_bnb, loss_bnb = train_and_return_metrics(OptimizerNames.ADAMW_BNB.value)
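        # the bnb run should allocate noticeably less optimizer memory while reaching the same loss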

        gpu_alloc_mem_diff = gpu_alloc_mem_orig - gpu_alloc_mem_bnb

        gpu_total_mem_orig = gpu_peak_mem_orig + gpu_alloc_mem_orig
        gpu_total_mem_bnb = gpu_peak_mem_bnb + gpu_alloc_mem_bnb
        gpu_total_mem_diff = gpu_total_mem_orig - gpu_total_mem_bnb

        # sshleifer/student_marian_en_ro_6_1 has 54M parameters, 29M of which are `nn.Embedding`
        # weights that don't get quantized and remain in fp32. That leaves 25M parameters whose
        # optimizer state is quantized to 2 bytes, so the diff in optimizer memory usage is roughly:
        #
        # - normal 25*8=~200MB (8 bytes per param)
        # - bnb    25*2= ~50MB (2 bytes per param)
        #
        # Thus we should expect ~150MB of total memory saved.
        #
        # Peak memory should be the same - the total should differ by about that same margin.
        #
        # After leaving a small margin to accommodate for differences between gpus, check
        # that we save at least 120MB.
        expected_savings = 120

        # uncomment the following if this test starts failing - requires py38 for a new print feature
        # gpu_peak_mem_diff = gpu_peak_mem_orig - gpu_peak_mem_bnb
        # print(f"{gpu_alloc_mem_orig=}MB {gpu_peak_mem_orig=}MB {gpu_alloc_mem_orig+gpu_peak_mem_orig=}MB")
        # print(f" {gpu_alloc_mem_bnb=}MB  {gpu_peak_mem_bnb=}MB  {gpu_alloc_mem_bnb+gpu_peak_mem_bnb=}MB")
        # print(f"{gpu_alloc_mem_diff=}MB")
        # print(f"{gpu_peak_mem_diff=}MB")
        # print(f"{gpu_total_mem_orig=}MB, {gpu_total_mem_bnb=}MB")
        # print(f"{gpu_total_mem_diff=}MB, {gpu_total_mem_diff=}MB")

        self.assertGreater(
            gpu_alloc_mem_diff,
            expected_savings,
            "should use ~150MB less alloc gpu memory with BNB, compared to without it for this model but got"
            f" a difference of {gpu_alloc_mem_diff}MB, with gpu_alloc_mem_orig={gpu_alloc_mem_orig}MB and"
            f" gpu_alloc_mem_bnb={gpu_alloc_mem_bnb}MB",
        )

        self.assertGreater(
            gpu_total_mem_diff,
            expected_savings,
            "should use ~150MB less total gpu memory with BNB, compared to without it for this model but got"
            f" a difference of {gpu_total_mem_diff}MB, with gpu_total_mem_orig={gpu_total_mem_orig}MB and"
            f" gpu_total_mem_bnb={gpu_total_mem_bnb}MB",
        )

        self.assertEqual(
            loss_orig, loss_bnb, f"loss should be the same, but got loss_orig={loss_orig}, loss_bnb={loss_bnb}"
        )

    def run_trainer(
        self,
        max_len: int,
        model_name: str,
        num_train_epochs: int,
        learning_rate: float = 3e-3,
        optim: str = "adafactor",
        distributed: bool = False,
        extra_args_str: str = None,
        eval_steps: int = 0,
        predict_with_generate: bool = True,
        do_train: bool = True,
        do_eval: bool = True,
        do_predict: bool = True,
        n_gpus_to_use: int = None,
    ):
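        # assemble the run_translation.py CLI arguments from the requested settings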
        data_dir = self.test_file_dir / "../fixtures/tests_samples/wmt_en_ro"
        output_dir = self.get_auto_remove_tmp_dir()
        args_train = f"""
            --model_name_or_path {model_name}
            --train_file {data_dir}/train.json
            --validation_file {data_dir}/val.json
            --test_file {data_dir}/test.json
            --output_dir {output_dir}
            --overwrite_output_dir
            --max_train_samples 8
            --max_source_length {max_len}
            --max_target_length {max_len}
            --do_train
            --num_train_epochs {str(num_train_epochs)}
            --per_device_train_batch_size 4
            --learning_rate {learning_rate}
            --warmup_steps 8
            --logging_steps 0
            --logging_strategy no
            --save_steps {str(eval_steps)}
            --group_by_length
            --label_smoothing_factor 0.1
            --target_lang ro_RO
            --source_lang en_XX
        """.split()

        args_eval = f"""
            --do_eval
            --per_device_eval_batch_size 4
            --max_eval_samples 8
            --val_max_target_length {max_len}
            --evaluation_strategy steps
            --eval_steps {str(eval_steps)}
        """.split()

        args_predict = """
            --do_predict
        """.split()

        args = []
        if do_train:
            args += args_train

        if do_eval:
            args += args_eval

        if do_predict:
            args += args_predict

        if predict_with_generate:
            args += "--predict_with_generate".split()

        if do_train:
            if optim == "adafactor":
                args += "--adafactor".split()
            else:
                args += f"--optim {optim}".split()

        if extra_args_str is not None:
            args += extra_args_str.split()

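        # distributed runs launch run_translation.py in a fresh subprocess via torch.distributed.launch;
        # non-distributed runs call run_translation.main() in-process with sys.argv patched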
        if distributed:
            if n_gpus_to_use is None:
                n_gpus_to_use = get_gpu_count()
            master_port = get_torch_dist_unique_port()
            distributed_args = f"""
                -m torch.distributed.launch
                --nproc_per_node={n_gpus_to_use}
                --master_port={master_port}
                {self.examples_dir_str}/pytorch/translation/run_translation.py
            """.split()
            cmd = [sys.executable] + distributed_args + args
            # keep for quick debug
            # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
            execute_subprocess_async(cmd, env=self.get_env())
        else:
            testargs = ["run_translation.py"] + args
            with patch.object(sys, "argv", testargs):
                main()

        return output_dir