# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
import os
import re
import sys
import unittest
from typing import Tuple
from unittest.mock import patch

from parameterized import parameterized
from transformers import AutoModel
from transformers.testing_utils import (
    CaptureStderr,
    ExtendSysPath,
    TestCasePlus,
    execute_subprocess_async,
    get_gpu_count,
    get_torch_dist_unique_port,
    require_apex,
    require_bitsandbytes,
    require_fairscale,
    require_torch,
    require_torch_gpu,
    require_torch_multi_gpu,
    require_torch_non_multi_gpu,
    slow,
)
from transformers.trainer_callback import TrainerState
from transformers.trainer_utils import set_seed


bindir = os.path.abspath(os.path.dirname(__file__))
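# make run_translation.py from the examples importable by temporarily extending sys.path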
with ExtendSysPath(f"{bindir}/../../examples/pytorch/translation"):
    from run_translation import main  # noqa


set_seed(42)
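# two small checkpoints: a distilled marian en->ro student for the @slow end-to-end tests,
# and a tiny mbart for the quick smoke tests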
MARIAN_MODEL = "sshleifer/student_marian_en_ro_6_1"
MBART_TINY = "sshleifer/tiny-mbart"


@require_torch
class TestTrainerExt(TestCasePlus):
    def run_seq2seq_quick(
        self,
        distributed=False,
        extra_args_str=None,
        predict_with_generate=True,
        do_train=True,
        do_eval=True,
        do_predict=True,
    ):
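        # quick smoke test: train the tiny mbart checkpoint briefly, then sanity-check the
        # metrics logged to trainer_state.json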
        output_dir = self.run_trainer(
            eval_steps=1,
            max_len=12,
            model_name=MBART_TINY,
            num_train_epochs=1,
            distributed=distributed,
            extra_args_str=extra_args_str,
            predict_with_generate=predict_with_generate,
            do_train=do_train,
            do_eval=do_eval,
            do_predict=do_predict,
        )
        logs = TrainerState.load_from_json(os.path.join(output_dir, "trainer_state.json")).log_history

        if not do_eval:
            return

        eval_metrics = [log for log in logs if "eval_loss" in log.keys()]

        first_step_stats = eval_metrics[0]
        if predict_with_generate:
            assert "eval_bleu" in first_step_stats

            last_step_stats = eval_metrics[-1]
            assert isinstance(last_step_stats["eval_bleu"], float)
            assert not math.isnan(float(last_step_stats["eval_loss"])), "eval_loss must not be `nan`"

    @require_torch_non_multi_gpu
    def test_run_seq2seq_no_dist(self):
        self.run_seq2seq_quick()

    # verify that the trainer can handle non-distributed with n_gpu > 1
    @require_torch_multi_gpu
    def test_run_seq2seq_dp(self):
        self.run_seq2seq_quick(distributed=False)

    # verify that the trainer can handle distributed with n_gpu > 1
    @require_torch_multi_gpu
    def test_run_seq2seq_ddp(self):
        self.run_seq2seq_quick(distributed=True)

    # test --sharded_ddp w/o --fp16
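    # (in this setup, --sharded_ddp simple maps to fairscale's ShardedDDP, while the zero_dp_2
    # variants below map to fairscale's FullyShardedDDP in a ZeRO-2 style configuration)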
    @require_torch_multi_gpu
    @require_fairscale
    def test_run_seq2seq_sharded_ddp(self):
        self.run_seq2seq_quick(distributed=True, extra_args_str="--sharded_ddp simple")

    # test --sharded_ddp w/ --fp16
    @unittest.skip("Requires an update of the env running those tests")
    @require_torch_multi_gpu
    @require_fairscale
    def test_run_seq2seq_sharded_ddp_fp16(self):
        self.run_seq2seq_quick(distributed=True, extra_args_str="--sharded_ddp simple --fp16")

    # test --sharded_ddp zero_dp_2 w/o --fp16
    @require_torch_multi_gpu
    @require_fairscale
    def test_run_seq2seq_fully_sharded_ddp(self):
        self.run_seq2seq_quick(distributed=True, extra_args_str="--sharded_ddp zero_dp_2", predict_with_generate=False)

    # test --sharded_ddp zero_dp_2 w/ --fp16
    @unittest.skip("Requires an update of the env running those tests")
    @require_torch_multi_gpu
    @require_fairscale
    def test_run_seq2seq_fully_sharded_ddp_fp16(self):
        self.run_seq2seq_quick(
            distributed=True, extra_args_str="--sharded_ddp zero_dp_2 --fp16", predict_with_generate=False
        )

    @require_apex
    @require_torch_gpu
    def test_run_seq2seq_apex(self):
        # XXX: apex breaks the trainer if main() is run twice from the same program, e.g. via a
        # 2nd run_seq2seq.main() call, and it also breaks other tests that run in the same pytest
        # worker. Therefore, until this is sorted out, it must be run only in an external program,
        # that is distributed=True in this test, and only under one or more gpus - if we want a
        # cpu variant, it will need a special test
        #
        # as for the specifics, the problem was traced to self.optimizer.step() - when main() is
        # run a 2nd time, it botches the subsequent eval.
        #
        self.run_seq2seq_quick(distributed=True, extra_args_str="--fp16 --fp16_backend=apex")
        # test a 2nd time - this used to produce eval_loss: nan
        # to reproduce the problem, set distributed=False
        self.run_seq2seq_quick(distributed=True, extra_args_str="--fp16 --fp16_backend=apex")

    @parameterized.expand(["base", "low", "high", "mixed"])
    @require_torch_multi_gpu
    def test_trainer_log_level_replica(self, experiment_id):
        # since each sub-test is slow-ish, this is split into multiple sub-tests to avoid CI timeout
        experiments = dict(
            # test with the default log_level - should be info and thus log info once
            base=dict(extra_args_str="", n_matches=1),
            # test with low log_level and log_level_replica - should be noisy on all processes
            # now the info string should appear twice on 2 processes
            low=dict(extra_args_str="--log_level debug --log_level_replica debug", n_matches=2),
            # test with high log_level and low log_level_replica
            # now the info string should appear once only on the replica
            high=dict(extra_args_str="--log_level error --log_level_replica debug", n_matches=1),
            # test with high log_level and log_level_replica - should be quiet on all processes
            mixed=dict(extra_args_str="--log_level error --log_level_replica error", n_matches=0),
        )

        data = experiments[experiment_id]
        kwargs = dict(distributed=True, predict_with_generate=False, do_eval=False, do_predict=False)
        log_info_string = "Running training"
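        # run distributed on multiple gpus and count how many processes emit the info string to stderr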
        with CaptureStderr() as cl:
            self.run_seq2seq_quick(**kwargs, extra_args_str=data["extra_args_str"])
        n_matches = len(re.findall(log_info_string, cl.err))
        self.assertEqual(n_matches, data["n_matches"])

    @slow
    def test_run_seq2seq(self):
        output_dir = self.run_trainer(
            eval_steps=2,
            max_len=128,
            model_name=MARIAN_MODEL,
            learning_rate=3e-4,
            num_train_epochs=10,
            distributed=False,
        )

        # Check metrics
        logs = TrainerState.load_from_json(os.path.join(output_dir, "trainer_state.json")).log_history
        eval_metrics = [log for log in logs if "eval_loss" in log.keys()]
        first_step_stats = eval_metrics[0]
        last_step_stats = eval_metrics[-1]

        assert first_step_stats["eval_loss"] > last_step_stats["eval_loss"], "model learned nothing"
        assert isinstance(last_step_stats["eval_bleu"], float)

        # test if do_predict saves generations and metrics
        contents = os.listdir(output_dir)
        contents = {os.path.basename(p) for p in contents}
        assert "generated_predictions.txt" in contents
        assert "predict_results.json" in contents

    @slow
    @require_bitsandbytes
    def test_run_seq2seq_bnb(self):
        from transformers.training_args import OptimizerNames

        def train_and_return_metrics(optim: str) -> Tuple[int, int, float]:
            from pathlib import Path

            extra_args = (
                f"--skip_memory_metrics 0 --optim {optim} --do_eval False --do_predict "
                "False --adafactor False --log_level debug"
            )

            output_dir = self.run_trainer(
                eval_steps=2,
                max_len=128,
                model_name=MARIAN_MODEL,
                learning_rate=3e-4,
                num_train_epochs=1,
                distributed=True,  # force run in a new process
                extra_args_str=extra_args,
                do_eval=False,
                do_predict=False,
            )

            # Check metrics
            logs = TrainerState.load_from_json(Path(output_dir, "trainer_state.json")).log_history
            gpu_peak_mem = logs[0]["train_mem_gpu_peaked_delta"]
            gpu_alloc_mem = logs[0]["train_mem_gpu_alloc_delta"]

            loss = logs[0]["train_loss"]
            return gpu_peak_mem, gpu_alloc_mem, loss

        gpu_peak_mem_orig, gpu_alloc_mem_orig, loss_orig = train_and_return_metrics(OptimizerNames.ADAMW_TORCH.value)
        gpu_peak_mem_bnb, gpu_alloc_mem_bnb, loss_bnb = train_and_return_metrics(OptimizerNames.ADAMW_BNB.value)

        gpu_peak_mem_diff_bytes = gpu_peak_mem_orig - gpu_peak_mem_bnb
        gpu_peak_mem_diff_percent = gpu_peak_mem_diff_bytes / gpu_peak_mem_bnb

        gpu_total_mem_orig = gpu_peak_mem_orig + gpu_alloc_mem_orig
        gpu_total_mem_bnb = gpu_peak_mem_bnb + gpu_alloc_mem_bnb

        gpu_total_mem_diff_bytes = gpu_total_mem_orig - gpu_total_mem_bnb
        gpu_total_mem_diff_percent = gpu_total_mem_diff_bytes / gpu_total_mem_bnb

        # leave this for now if CI gets very different results
        # print(f"{gpu_alloc_mem_orig=:010d} {gpu_peak_mem_orig=:010d} {gpu_alloc_mem_orig+gpu_peak_mem_orig=:010d}" )
        # print(f" {gpu_alloc_mem_bnb=:010d}  {gpu_peak_mem_bnb=:010d}   {gpu_alloc_mem_bnb+gpu_peak_mem_bnb=:010d}")
        # print(f"{gpu_peak_mem_diff_bytes=}, {gpu_peak_mem_diff_percent=}")
        # print(f"{gpu_total_mem_orig=}, {gpu_total_mem_bnb=}")
        # print(f"{gpu_total_mem_diff_bytes=}, {gpu_total_mem_diff_percent=}")

        self.assertGreater(
            gpu_peak_mem_diff_percent,
            10,  # basically a huge difference - got ~30x on my desktop
            "should use very little peak gpu memory with BNB, compared to without it,"
            f" but got gpu_peak_mem_orig={gpu_peak_mem_orig} and gpu_peak_mem_bnb={gpu_peak_mem_bnb}",
        )

        self.assertGreater(
            gpu_total_mem_diff_percent,
            0.20,  # could easily be 0.50, but let's stay on the safe side
            "Using BNB should use less total GPU memory than without it,"
            f" but got gpu_total_mem_orig={gpu_total_mem_orig} and gpu_total_mem_bnb={gpu_total_mem_bnb}",
        )

        self.assertEqual(
            loss_orig, loss_bnb, f"loss should be the same, but got loss_orig={loss_orig}, loss_bnb={loss_bnb}"
        )

        # Additionally let's test that the absolute gpu memory difference is larger or about the
        # same as the expected saving coming from BNB (6 bytes per param)
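        # (regular Adam keeps two fp32 states per parameter = 8 bytes/param, while 8-bit Adam
        # keeps two int8 states = 2 bytes/param, hence the expected saving of ~6 bytes/param)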
        model = AutoModel.from_pretrained(MARIAN_MODEL)
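        # key by data_ptr() so that parameters sharing storage (e.g. tied embeddings) are counted once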
        total_numel = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values())
        bnb_saved_bytes = total_numel * 6  # 324MB

        self.assertGreater(
            gpu_total_mem_diff_bytes,
            bnb_saved_bytes * 0.8,  # add a safety margin, if it saved slightly less
            f"BNB should have saved about {bnb_saved_bytes} bytes, but the saved bytes were"
            f" {gpu_total_mem_diff_bytes}",
        )

    def run_trainer(
        self,
        eval_steps: int,
        max_len: int,
        model_name: str,
        num_train_epochs: int,
        learning_rate: float = 3e-3,
        distributed: bool = False,
        extra_args_str: str = None,
        predict_with_generate: bool = True,
        do_train: bool = True,
        do_eval: bool = True,
        do_predict: bool = True,
    ):
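        # build a run_translation.py command line, then either run it in-process with sys.argv
        # patched, or as a torch.distributed.launch subprocess when distributed=True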
        data_dir = self.test_file_dir / "../fixtures/tests_samples/wmt_en_ro"
        output_dir = self.get_auto_remove_tmp_dir()
        args_train = f"""
            --model_name_or_path {model_name}
            --train_file {data_dir}/train.json
            --validation_file {data_dir}/val.json
            --test_file {data_dir}/test.json
            --output_dir {output_dir}
            --overwrite_output_dir
            --max_train_samples 8
            --max_source_length {max_len}
            --max_target_length {max_len}
            --do_train
            --num_train_epochs {str(num_train_epochs)}
            --per_device_train_batch_size 4
            --learning_rate {learning_rate}
            --warmup_steps 8
            --logging_steps 0
            --logging_strategy no
            --save_steps {str(eval_steps)}
            --group_by_length
            --label_smoothing_factor 0.1
            --adafactor
            --target_lang ro_RO
            --source_lang en_XX
        """

        args_eval = f"""
            --do_eval
            --per_device_eval_batch_size 4
            --max_eval_samples 8
            --val_max_target_length {max_len}
            --evaluation_strategy steps
            --eval_steps {str(eval_steps)}
        """

        args_predict = """
            --do_predict
        """

        args = ""
        if do_train:
            args += args_train

        if do_eval:
            args += args_eval

        if do_predict:
            args += args_predict

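        # each args_* block ends in whitespace, so plain string concatenation is safe to .split() below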
        if predict_with_generate:
            args += "--predict_with_generate"

        args = args.split()

        if extra_args_str is not None:
            args.extend(extra_args_str.split())

        if distributed:
            n_gpu = get_gpu_count()
            master_port = get_torch_dist_unique_port()
            distributed_args = f"""
                -m torch.distributed.launch
                --nproc_per_node={n_gpu}
                --master_port={master_port}
                {self.examples_dir_str}/pytorch/translation/run_translation.py
            """.split()
            cmd = [sys.executable] + distributed_args + args
            # keep for quick debug
            # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
            execute_subprocess_async(cmd, env=self.get_env())
        else:
            testargs = ["run_translation.py"] + args
            with patch.object(sys, "argv", testargs):
                main()

        return output_dir
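

# these tests are normally collected by pytest (the @slow ones additionally require RUN_SLOW=1,
# e.g. `RUN_SLOW=1 pytest tests/extended/test_trainer_ext.py`); the guard below merely allows
# running this file directly as well
if __name__ == "__main__":
    unittest.main()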