test_trainer_ext.py 13.5 KB
Newer Older
Sylvain Gugger's avatar
Sylvain Gugger committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

15
import math
Suraj Patil's avatar
Suraj Patil committed
16
import os
17
import re
Suraj Patil's avatar
Suraj Patil committed
18
import sys
Stas Bekman's avatar
Stas Bekman committed
19
from pathlib import Path
20
from typing import Tuple
Suraj Patil's avatar
Suraj Patil committed
21
22
from unittest.mock import patch

23
from parameterized import parameterized
24

25
from transformers.testing_utils import (
26
    CaptureStderr,
27
    ExtendSysPath,
28
    TestCasePlus,
29
    backend_device_count,
30
    execute_subprocess_async,
31
    get_torch_dist_unique_port,
32
33
    require_apex,
    require_bitsandbytes,
34
    require_torch,
35
    require_torch_gpu,
36
37
    require_torch_multi_accelerator,
    require_torch_non_multi_accelerator,
38
    slow,
39
    torch_device,
40
)
Sylvain Gugger's avatar
Sylvain Gugger committed
41
42
from transformers.trainer_callback import TrainerState
from transformers.trainer_utils import set_seed
Suraj Patil's avatar
Suraj Patil committed
43

44
45

# Make the example script importable: the translation example lives outside the
# installed package, so its directory is temporarily added to sys.path.
bindir = os.path.abspath(os.path.dirname(__file__))
with ExtendSysPath(f"{bindir}/../../examples/pytorch/translation"):
    from run_translation import main  # noqa


# Fixed seed so the quick-training assertions below are reproducible.
set_seed(42)
MARIAN_MODEL = "sshleifer/student_marian_en_ro_6_1"  # small distilled Marian en->ro checkpoint (slow tests)
MBART_TINY = "sshleifer/tiny-mbart"  # tiny mBART checkpoint for quick smoke tests
55
@require_torch
56
class TestTrainerExt(TestCasePlus):
57
58
59
60
61
62
63
64
65
    def run_seq2seq_quick(
        self,
        distributed=False,
        extra_args_str=None,
        predict_with_generate=True,
        do_train=True,
        do_eval=True,
        do_predict=True,
    ):
        """Run a quick tiny-model training via `run_trainer` and sanity-check the logged eval metrics."""
        out_dir = self.run_trainer(
            eval_steps=1,
            max_len=12,
            model_name=MBART_TINY,
            num_train_epochs=1,
            distributed=distributed,
            extra_args_str=extra_args_str,
            predict_with_generate=predict_with_generate,
            do_train=do_train,
            do_eval=do_eval,
            do_predict=do_predict,
        )
        state = TrainerState.load_from_json(os.path.join(out_dir, "trainer_state.json"))

        if not do_eval:
            return

        # keep only the log entries produced by evaluation steps
        eval_logs = [entry for entry in state.log_history if "eval_loss" in entry]

        first_step_stats = eval_logs[0]
        if predict_with_generate:
            assert "eval_bleu" in first_step_stats

            final_stats = eval_logs[-1]
            assert isinstance(final_stats["eval_bleu"], float)
            assert not math.isnan(float(final_stats["eval_loss"])), "eval_loss must not be `nan`"
92

93
    @require_torch_non_multi_accelerator
    def test_run_seq2seq_no_dist(self):
        # plain single-device run, no distributed launcher
        self.run_seq2seq_quick()
96

97
    # verify that the trainer can handle non-distributed with n_gpu > 1
    @require_torch_multi_accelerator
    def test_run_seq2seq_dp(self):
        # multiple devices visible but distributed=False -> DataParallel-style path
        self.run_seq2seq_quick(distributed=False)
101

102
    # verify that the trainer can handle distributed with n_gpu > 1
    @require_torch_multi_accelerator
    def test_run_seq2seq_ddp(self):
        # launches via torch.distributed.run in a subprocess (see run_trainer)
        self.run_seq2seq_quick(distributed=True)
106

107
    @require_apex
    @require_torch_gpu
    def test_run_seq2seq_apex(self):
        """Run apex fp16 training twice (in subprocesses) to catch the 2nd-run eval corruption."""
        # XXX: apex breaks the trainer if it's run twice e.g. run_seq2seq.main() from the same
        # program and it breaks other tests that run from the same pytest worker, therefore until this is
        # sorted out it must be run only in an external program, that is distributed=True in this
        # test and only under one or more gpus - if we want cpu will need to make a special test
        #
        # specifically to the problem traced it to self.optimizer.step() - if it's run 2nd time via
        # 2nd main() call it botches the future eval.
        #
        self.run_seq2seq_quick(distributed=True, extra_args_str="--fp16 --fp16_backend=apex")
        # test 2nd time - was getting eval_loss': nan'
        # to reproduce the problem set distributed=False
        self.run_seq2seq_quick(distributed=True, extra_args_str="--fp16 --fp16_backend=apex")
122

123
    @parameterized.expand(["base", "low", "high", "mixed"])
    @require_torch_multi_accelerator
    def test_trainer_log_level_replica(self, experiment_id):
        """Count how many times the trainer's info string hits stderr for each log-level combination."""
        # each sub-test is slow-ish, so they are split via parameterization to avoid a CI timeout
        experiments = {
            # default log_level - should be info and thus log info once
            "base": {"extra_args_str": "", "n_matches": 1},
            # low log_level and log_level_replica - noisy on all processes:
            # the info string appears twice on 2 processes
            "low": {"extra_args_str": "--log_level debug --log_level_replica debug", "n_matches": 2},
            # high log_level and low log_level_replica -
            # the info string appears once, only on the replica
            "high": {"extra_args_str": "--log_level error --log_level_replica debug", "n_matches": 1},
            # high log_level and log_level_replica - quiet on all processes
            "mixed": {"extra_args_str": "--log_level error --log_level_replica error", "n_matches": 0},
        }

        config = experiments[experiment_id]
        log_info_string = "Running training"
        with CaptureStderr() as captured:
            self.run_seq2seq_quick(
                distributed=True,
                predict_with_generate=False,
                do_eval=False,
                do_predict=False,
                extra_args_str=config["extra_args_str"],
            )
        self.assertEqual(len(re.findall(log_info_string, captured.err)), config["n_matches"])
147

148
    @slow
    def test_run_seq2seq(self):
        """Full (slow) run: eval loss must improve and predict artifacts must be written."""
        out_dir = self.run_trainer(
            eval_steps=2,
            max_len=128,
            model_name=MARIAN_MODEL,
            learning_rate=3e-4,
            num_train_epochs=10,
            distributed=False,
        )

        # Check metrics
        history = TrainerState.load_from_json(os.path.join(out_dir, "trainer_state.json")).log_history
        eval_entries = [entry for entry in history if "eval_loss" in entry]
        first_step_stats, last_step_stats = eval_entries[0], eval_entries[-1]

        assert first_step_stats["eval_loss"] > last_step_stats["eval_loss"], "model learned nothing"
        assert isinstance(last_step_stats["eval_bleu"], float)

        # test if do_predict saves generations and metrics
        produced = {os.path.basename(p) for p in os.listdir(out_dir)}
        assert "generated_predictions.txt" in produced
        assert "predict_results.json" in produced
173

174
175
176
177
178
179
    @slow
    @require_bitsandbytes
    def test_run_seq2seq_bnb(self):
        """Compare GPU memory usage of adamw_torch vs bnb 8-bit adamw on the same training run.

        Trains once with each optimizer (in separate subprocesses on 1 GPU for
        deterministic memory numbers) and asserts bnb saves the expected optimizer
        memory while producing the same loss.
        """
        from transformers.training_args import OptimizerNames

        def train_and_return_metrics(optim: str) -> Tuple[int, int, float]:
            # One epoch of training with the given optimizer; returns
            # (gpu_peak_mem_mb, gpu_alloc_mem_mb, train_loss).
            extra_args = "--skip_memory_metrics 0"

            output_dir = self.run_trainer(
                max_len=128,
                model_name=MARIAN_MODEL,
                learning_rate=3e-4,
                num_train_epochs=1,
                optim=optim,
                distributed=True,  # force run in a new process
                extra_args_str=extra_args,
                do_eval=False,
                do_predict=False,
                n_gpus_to_use=1,  # to allow deterministic fixed memory usage
            )

            # Check metrics
            logs = TrainerState.load_from_json(Path(output_dir, "trainer_state.json")).log_history
            # convert bytes to MB
            gpu_peak_mem_mb = int(logs[0]["train_mem_gpu_peaked_delta"] / 2**20)
            gpu_alloc_mem_mb = int(logs[0]["train_mem_gpu_alloc_delta"] / 2**20)

            loss = logs[0]["train_loss"]
            return gpu_peak_mem_mb, gpu_alloc_mem_mb, loss

        gpu_peak_mem_orig, gpu_alloc_mem_orig, loss_orig = train_and_return_metrics(OptimizerNames.ADAMW_TORCH.value)
        gpu_peak_mem_bnb, gpu_alloc_mem_bnb, loss_bnb = train_and_return_metrics(OptimizerNames.ADAMW_BNB.value)

        gpu_alloc_mem_diff = gpu_alloc_mem_orig - gpu_alloc_mem_bnb

        gpu_total_mem_orig = gpu_peak_mem_orig + gpu_alloc_mem_orig
        gpu_total_mem_bnb = gpu_peak_mem_bnb + gpu_alloc_mem_bnb
        gpu_total_mem_diff = gpu_total_mem_orig - gpu_total_mem_bnb

        # sshleifer/student_marian_en_ro_6_1 has 54M parameter, 29M of which is `nn.Embedding` which
        # doesn't get quantized and remains in fp32. Therefore we only have 25M parameters quantized
        # in 2 bytes and the diff in optim memory usage is derived as so:
        #
        # - normal 25*8=~200MB (8 bytes per param)
        # - bnb    25*2= ~50MB (2 bytes per param)
        #
        # Thus we should expect ~150MB total memory saved.
        #
        # Peak memory should be the same - the total should be different by about that same margin
        #
        # After leaving a small margin to accommodate for differences between gpus let's check
        # that we have at least 120MB in savings
        expected_savings = 120

        # uncomment the following if this test starts failing - requires py38 for a new print feature
        # gpu_peak_mem_diff = gpu_peak_mem_orig - gpu_peak_mem_bnb
        # print(f"{gpu_alloc_mem_orig=}MB {gpu_peak_mem_orig=}MB {gpu_alloc_mem_orig+gpu_peak_mem_orig=}MB")
        # print(f" {gpu_alloc_mem_bnb=}MB  {gpu_peak_mem_bnb=}MB  {gpu_alloc_mem_bnb+gpu_peak_mem_bnb=}MB")
        # print(f"{gpu_alloc_mem_diff=}MB")
        # print(f"{gpu_peak_mem_diff=}MB")
        # print(f"{gpu_total_mem_orig=}MB, {gpu_total_mem_bnb=}MB")
        # print(f"{gpu_total_mem_diff=}MB, {gpu_total_mem_diff=}MB")

        self.assertGreater(
            gpu_alloc_mem_diff,
            expected_savings,
            "should use ~150MB less alloc gpu memory with BNB, compared to without it for this model but got"
            f" a difference of {gpu_alloc_mem_diff}MB, with gpu_alloc_mem_orig={gpu_alloc_mem_orig}MB and"
            f" gpu_alloc_mem_bnb={gpu_alloc_mem_bnb}MB",
        )

        self.assertGreater(
            gpu_total_mem_diff,
            expected_savings,
            "should use ~150MB less total gpu memory with BNB, compared to without it for this model but got"
            f" a difference of {gpu_total_mem_diff}MB, with gpu_total_mem_orig={gpu_total_mem_orig}MB and"
            f" gpu_total_mem_bnb={gpu_total_mem_bnb}MB",
        )

        self.assertEqual(
            loss_orig, loss_bnb, f"loss should be the same, but got loss_orig={loss_orig}, loss_bnb={loss_bnb}"
        )

256
    def run_trainer(
        self,
        max_len: int,
        model_name: str,
        num_train_epochs: int,
        learning_rate: float = 3e-3,
        optim: str = "adafactor",
        distributed: bool = False,
        extra_args_str: str = None,
        eval_steps: int = 0,
        predict_with_generate: bool = True,
        do_train: bool = True,
        do_eval: bool = True,
        do_predict: bool = True,
        n_gpus_to_use: int = None,
    ):
        """Assemble CLI args for the `run_translation.py` example and execute it.

        Runs in-process via `main()` with a patched `sys.argv`, or - when
        `distributed=True` - in a fresh subprocess under `torch.distributed.run`.

        Args:
            max_len: value for `--max_source_length`, `--max_target_length` and
                `--val_max_target_length`.
            model_name: checkpoint passed as `--model_name_or_path`.
            num_train_epochs: number of training epochs.
            learning_rate: optimizer learning rate.
            optim: "adafactor" selects the dedicated `--adafactor` flag; any other
                value is forwarded via `--optim`.
            distributed: launch via `torch.distributed.run` in a new process.
            extra_args_str: extra whitespace-separated CLI args appended verbatim.
            eval_steps: used for both `--save_steps` and `--eval_steps`.
            predict_with_generate, do_train, do_eval, do_predict: toggle the
                corresponding stages and their argument groups.
            n_gpus_to_use: device count for the distributed launcher; defaults to
                all available devices.

        Returns:
            The output directory (an auto-removed tmp dir) the run wrote to.
        """
        data_dir = self.test_file_dir / "../fixtures/tests_samples/wmt_en_ro"
        output_dir = self.get_auto_remove_tmp_dir()
        # fix: the three sequence-length args were hard-coded to the invalid
        # literal "33,728" and the `max_len` parameter was silently ignored
        args_train = f"""
            --model_name_or_path {model_name}
            --train_file {data_dir}/train.json
            --validation_file {data_dir}/val.json
            --test_file {data_dir}/test.json
            --output_dir {output_dir}
            --overwrite_output_dir
            --max_train_samples 8
            --max_source_length {max_len}
            --max_target_length {max_len}
            --do_train
            --num_train_epochs {str(num_train_epochs)}
            --per_device_train_batch_size 4
            --learning_rate {learning_rate}
            --warmup_steps 8
            --logging_steps 0
            --logging_strategy no
            --save_steps {str(eval_steps)}
            --group_by_length
            --label_smoothing_factor 0.1
            --target_lang ro_RO
            --source_lang en_XX
        """.split()

        args_eval = f"""
            --do_eval
            --per_device_eval_batch_size 4
            --max_eval_samples 8
            --val_max_target_length {max_len}
            --evaluation_strategy steps
            --eval_steps {str(eval_steps)}
        """.split()

        args_predict = """
            --do_predict
        """.split()

        args = []
        if do_train:
            args += args_train

        if do_eval:
            args += args_eval

        if do_predict:
            args += args_predict

        if predict_with_generate:
            args += "--predict_with_generate".split()

        if do_train:
            # adafactor has a dedicated flag; all other optimizers go through --optim
            if optim == "adafactor":
                args += "--adafactor".split()
            else:
                args += f"--optim {optim}".split()

        if extra_args_str is not None:
            args += extra_args_str.split()

        if distributed:
            if n_gpus_to_use is None:
                n_gpus_to_use = backend_device_count(torch_device)
            master_port = get_torch_dist_unique_port()
            distributed_args = f"""
                -m torch.distributed.run
                --nproc_per_node={n_gpus_to_use}
                --master_port={master_port}
                {self.examples_dir_str}/pytorch/translation/run_translation.py
            """.split()
            cmd = [sys.executable] + distributed_args + args
            # keep for quick debug
            # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
            execute_subprocess_async(cmd, env=self.get_env())
        else:
            # in-process run so debuggers/coverage see the example code
            testargs = ["run_translation.py"] + args
            with patch.object(sys, "argv", testargs):
                main()

        return output_dir