# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
import os
import re
import sys
import unittest
from typing import Tuple
from unittest.mock import patch

from parameterized import parameterized

from transformers import AutoModel
from transformers.testing_utils import (
    CaptureStderr,
    ExtendSysPath,
    TestCasePlus,
    execute_subprocess_async,
    get_gpu_count,
    get_torch_dist_unique_port,
    require_apex,
    require_bitsandbytes,
    require_fairscale,
    require_torch,
    require_torch_gpu,
    require_torch_multi_gpu,
    require_torch_non_multi_gpu,
    slow,
)
from transformers.trainer_callback import TrainerState
from transformers.trainer_utils import set_seed


bindir = os.path.abspath(os.path.dirname(__file__))
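# the translation example lives outside the installed package, so temporarily extend sys.path to import it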
with ExtendSysPath(f"{bindir}/../../examples/pytorch/translation"):
    from run_translation import main  # noqa


set_seed(42)
MARIAN_MODEL = "sshleifer/student_marian_en_ro_6_1"
MBART_TINY = "sshleifer/tiny-mbart"


@require_torch
class TestTrainerExt(TestCasePlus):
    def run_seq2seq_quick(
        self,
        distributed=False,
        extra_args_str=None,
        predict_with_generate=True,
        do_train=True,
        do_eval=True,
        do_predict=True,
    ):
        output_dir = self.run_trainer(
            eval_steps=1,
            max_len=12,
            model_name=MBART_TINY,
            num_train_epochs=1,
            distributed=distributed,
            extra_args_str=extra_args_str,
            predict_with_generate=predict_with_generate,
            do_train=do_train,
            do_eval=do_eval,
            do_predict=do_predict,
        )
        logs = TrainerState.load_from_json(os.path.join(output_dir, "trainer_state.json")).log_history

        if not do_eval:
            return

        eval_metrics = [log for log in logs if "eval_loss" in log.keys()]

        first_step_stats = eval_metrics[0]
        if predict_with_generate:
            assert "eval_bleu" in first_step_stats

            last_step_stats = eval_metrics[-1]
            assert isinstance(last_step_stats["eval_bleu"], float)
            assert not math.isnan(float(last_step_stats["eval_loss"])), "eval_loss must not be `nan`"

    @require_torch_non_multi_gpu
    def test_run_seq2seq_no_dist(self):
        self.run_seq2seq_quick()

    # verify that the trainer can handle non-distributed with n_gpu > 1
    @require_torch_multi_gpu
    def test_run_seq2seq_dp(self):
        self.run_seq2seq_quick(distributed=False)

    # verify that the trainer can handle distributed with n_gpu > 1
    @require_torch_multi_gpu
    def test_run_seq2seq_ddp(self):
        self.run_seq2seq_quick(distributed=True)

    # test --sharded_ddp w/o --fp16
    @unittest.skip("Requires an update of the env running those tests")
    @require_torch_multi_gpu
    @require_fairscale
    def test_run_seq2seq_sharded_ddp(self):
        self.run_seq2seq_quick(distributed=True, extra_args_str="--sharded_ddp simple")

    # test --sharded_ddp w/ --fp16
    @unittest.skip("Requires an update of the env running those tests")
    @require_torch_multi_gpu
    @require_fairscale
    def test_run_seq2seq_sharded_ddp_fp16(self):
        self.run_seq2seq_quick(distributed=True, extra_args_str="--sharded_ddp simple --fp16")

    # test --sharded_ddp zero_dp_2 w/o --fp16
    @unittest.skip("Requires an update of the env running those tests")
    @require_torch_multi_gpu
    @require_fairscale
    def test_run_seq2seq_fully_sharded_ddp(self):
        self.run_seq2seq_quick(distributed=True, extra_args_str="--sharded_ddp zero_dp_2", predict_with_generate=False)

    # test --sharded_ddp zero_dp_2 w/ --fp16
    @unittest.skip("Requires an update of the env running those tests")
    @require_torch_multi_gpu
    @require_fairscale
    def test_run_seq2seq_fully_sharded_ddp_fp16(self):
        self.run_seq2seq_quick(
            distributed=True, extra_args_str="--sharded_ddp zero_dp_2 --fp16", predict_with_generate=False
        )

    @require_apex
    @require_torch_gpu
    def test_run_seq2seq_apex(self):
        # XXX: apex breaks the trainer if it's run twice, e.g. run_seq2seq.main() called twice from the
        # same program, and it breaks other tests that run from the same pytest worker. Therefore, until
        # this is sorted out it must be run only in an external program, i.e. distributed=True in this
        # test and only under one or more gpus - if we want cpu we will need to make a special test.
        #
        # Specifically, the problem was traced to self.optimizer.step() - if it's run a 2nd time via a
        # 2nd main() call it botches the future eval.
        #
        self.run_seq2seq_quick(distributed=True, extra_args_str="--fp16 --fp16_backend=apex")
        # test 2nd time - was getting 'eval_loss': nan
        # to reproduce the problem set distributed=False
        self.run_seq2seq_quick(distributed=True, extra_args_str="--fp16 --fp16_backend=apex")

    @parameterized.expand(["base", "low", "high", "mixed"])
    @require_torch_multi_gpu
    def test_trainer_log_level_replica(self, experiment_id):
        # as each sub-test is slow-ish, split into multiple sub-tests to avoid CI timeout
        experiments = dict(
            # test with the default log_level - should be info and thus log info once
            base=dict(extra_args_str="", n_matches=1),
            # test with low log_level and log_level_replica - should be noisy on all processes
            # now the info string should appear twice on 2 processes
            low=dict(extra_args_str="--log_level debug --log_level_replica debug", n_matches=2),
            # test with high log_level and low log_level_replica
            # now the info string should appear once only on the replica
            high=dict(extra_args_str="--log_level error --log_level_replica debug", n_matches=1),
            # test with high log_level and high log_level_replica - should be quiet on all processes
            mixed=dict(extra_args_str="--log_level error --log_level_replica error", n_matches=0),
        )

        data = experiments[experiment_id]
        kwargs = dict(distributed=True, predict_with_generate=False, do_eval=False, do_predict=False)
        log_info_string = "Running training"
        with CaptureStderr() as cl:
            self.run_seq2seq_quick(**kwargs, extra_args_str=data["extra_args_str"])
        n_matches = len(re.findall(log_info_string, cl.err))
        self.assertEqual(n_matches, data["n_matches"])

    @slow
    def test_run_seq2seq(self):
        output_dir = self.run_trainer(
            eval_steps=2,
            max_len=128,
            model_name=MARIAN_MODEL,
            learning_rate=3e-4,
            num_train_epochs=10,
            distributed=False,
        )

        # Check metrics
        logs = TrainerState.load_from_json(os.path.join(output_dir, "trainer_state.json")).log_history
        eval_metrics = [log for log in logs if "eval_loss" in log.keys()]
        first_step_stats = eval_metrics[0]
        last_step_stats = eval_metrics[-1]

        assert first_step_stats["eval_loss"] > last_step_stats["eval_loss"], "model learned nothing"
        assert isinstance(last_step_stats["eval_bleu"], float)

        # test if do_predict saves generations and metrics
        contents = os.listdir(output_dir)
        contents = {os.path.basename(p) for p in contents}
        assert "generated_predictions.txt" in contents
        assert "predict_results.json" in contents

    @slow
    @require_bitsandbytes
    def test_run_seq2seq_bnb(self):
        from transformers.training_args import OptimizerNames

        def train_and_return_metrics(optim: str) -> Tuple[int, int, float]:
            from pathlib import Path

            # --skip_memory_metrics 0 re-enables the trainer's memory metrics so that the
            # train_mem_gpu_* deltas appear in the logs
            extra_args = (
                f"--skip_memory_metrics 0 --optim {optim} --do_eval False"
                " --do_predict False --adafactor False --log_level debug"
            )

            output_dir = self.run_trainer(
                eval_steps=2,
                max_len=128,
                model_name=MARIAN_MODEL,
                learning_rate=3e-4,
                num_train_epochs=1,
                distributed=True,  # force run in a new process
                extra_args_str=extra_args,
                do_eval=False,
                do_predict=False,
            )

            # Check metrics
            logs = TrainerState.load_from_json(Path(output_dir, "trainer_state.json")).log_history
            gpu_peak_mem = logs[0]["train_mem_gpu_peaked_delta"]
            gpu_alloc_mem = logs[0]["train_mem_gpu_alloc_delta"]

            loss = logs[0]["train_loss"]
            return gpu_peak_mem, gpu_alloc_mem, loss

        gpu_peak_mem_orig, gpu_alloc_mem_orig, loss_orig = train_and_return_metrics(OptimizerNames.ADAMW_TORCH.value)
        gpu_peak_mem_bnb, gpu_alloc_mem_bnb, loss_bnb = train_and_return_metrics(OptimizerNames.ADAMW_BNB.value)

        gpu_peak_mem_diff_bytes = gpu_peak_mem_orig - gpu_peak_mem_bnb
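        # note: despite the "percent" names, these values are ratios of the saving relative to the bnb run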
        gpu_peak_mem_diff_percent = gpu_peak_mem_diff_bytes / gpu_peak_mem_bnb

        gpu_total_mem_orig = gpu_peak_mem_orig + gpu_alloc_mem_orig
        gpu_total_mem_bnb = gpu_peak_mem_bnb + gpu_alloc_mem_bnb

        gpu_total_mem_diff_bytes = gpu_total_mem_orig - gpu_total_mem_bnb
        gpu_total_mem_diff_percent = gpu_total_mem_diff_bytes / gpu_total_mem_bnb

        # leave this for now if CI gets very different results
        # print(f"{gpu_alloc_mem_orig=:010d} {gpu_peak_mem_orig=:010d} {gpu_alloc_mem_orig+gpu_peak_mem_orig=:010d}" )
        # print(f" {gpu_alloc_mem_bnb=:010d}  {gpu_peak_mem_bnb=:010d}   {gpu_alloc_mem_bnb+gpu_peak_mem_bnb=:010d}")
        # print(f"{gpu_peak_mem_diff_bytes=}, {gpu_peak_mem_diff_percent=}")
        # print(f"{gpu_total_mem_orig=}, {gpu_total_mem_bnb=}")
        # print(f"{gpu_total_mem_diff_bytes=}, {gpu_total_mem_diff_percent=}")

        self.assertGreater(
            gpu_peak_mem_diff_percent,
            10,  # basically a huge difference - got ~30x on my desktop
            "should use very little peak gpu memory with BNB, compared to without it"
            f"but got gpu_peak_mem_orig={gpu_peak_mem_orig} and gpu_peak_mem_bnb={gpu_peak_mem_bnb}",
        )

        self.assertGreater(
            gpu_total_mem_diff_percent,
            0.20,  # could easily be 0.50, but let's stay on the safe side
            "Using BNB should use less total GPU memory than without it"
            f"but got gpu_total_mem_orig={gpu_total_mem_orig} and gpu_total_mem_bnb={gpu_total_mem_bnb}",
        )

        self.assertEqual(
            loss_orig, loss_bnb, f"loss should be the same, but got loss_orig={loss_orig}, loss_bnb={loss_bnb}"
        )

        # Additionally let's test that the absolute gpu memory difference is larger or about the
        # same as the expected saving coming from BNB (6 bytes per param)
        model = AutoModel.from_pretrained(MARIAN_MODEL)
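        # key on data_ptr() so parameters sharing the same storage (tied weights) are counted only once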
        total_numel = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values())
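        # AdamW keeps two fp32 optimizer states per param (8 bytes); bnb's 8-bit Adam stores them
        # in int8 (2 bytes), hence the ~6 bytes saved per param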
        bnb_saved_bytes = total_numel * 6  # 324MB

        self.assertGreater(
            gpu_total_mem_diff_bytes,
            bnb_saved_bytes * 0.8,  # add a safety margin, if it saved slightly less
            f"BNB should have saved about {bnb_saved_bytes} bytes, but the saved bytes were"
            f" {gpu_total_mem_diff_bytes}",
        )

    def run_trainer(
        self,
        eval_steps: int,
        max_len: int,
        model_name: str,
        num_train_epochs: int,
        learning_rate: float = 3e-3,
        distributed: bool = False,
        extra_args_str: str = None,
        predict_with_generate: bool = True,
        do_train: bool = True,
        do_eval: bool = True,
        do_predict: bool = True,
    ):
        data_dir = self.test_file_dir / "../fixtures/tests_samples/wmt_en_ro"
        output_dir = self.get_auto_remove_tmp_dir()
        args_train = f"""
            --model_name_or_path {model_name}
            --train_file {data_dir}/train.json
            --validation_file {data_dir}/val.json
            --test_file {data_dir}/test.json
            --output_dir {output_dir}
            --overwrite_output_dir
            --max_train_samples 8
            --max_source_length {max_len}
            --max_target_length {max_len}
            --do_train
            --num_train_epochs {str(num_train_epochs)}
            --per_device_train_batch_size 4
            --learning_rate {learning_rate}
            --warmup_steps 8
            --logging_steps 0
            --logging_strategy no
            --save_steps {str(eval_steps)}
            --group_by_length
            --label_smoothing_factor 0.1
            --adafactor
            --target_lang ro_RO
            --source_lang en_XX
        """

        args_eval = f"""
            --do_eval
            --per_device_eval_batch_size 4
            --max_eval_samples 8
            --val_max_target_length {max_len}
            --evaluation_strategy steps
            --eval_steps {str(eval_steps)}
        """

        args_predict = """
            --do_predict
        """

        args = ""
        if do_train:
            args += args_train

        if do_eval:
            args += args_eval

        if do_predict:
            args += args_predict

        if predict_with_generate:
            args += "--predict_with_generate"

        args = args.split()

        if extra_args_str is not None:
            args.extend(extra_args_str.split())

        if distributed:
            n_gpu = get_gpu_count()
            master_port = get_torch_dist_unique_port()
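            # spawn one process per GPU via torch.distributed.launch, running the example script externally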
            distributed_args = f"""
                -m torch.distributed.launch
                --nproc_per_node={n_gpu}
                --master_port={master_port}
                {self.examples_dir_str}/pytorch/translation/run_translation.py
            """.split()
            cmd = [sys.executable] + distributed_args + args
            # keep for quick debug
            # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
            execute_subprocess_async(cmd, env=self.get_env())
        else:
            testargs = ["run_translation.py"] + args
            with patch.object(sys, "argv", testargs):
                main()

        return output_dir