# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
import os
import re
import sys
import unittest
from typing import Tuple
from unittest.mock import patch

from parameterized import parameterized

from transformers import AutoModel
from transformers.testing_utils import (
    CaptureStderr,
    ExtendSysPath,
    TestCasePlus,
    execute_subprocess_async,
    get_gpu_count,
    get_torch_dist_unique_port,
    require_apex,
    require_bitsandbytes,
    require_fairscale,
    require_torch,
    require_torch_gpu,
    require_torch_multi_gpu,
    require_torch_non_multi_gpu,
    slow,
)
from transformers.trainer_callback import TrainerState
from transformers.trainer_utils import set_seed


bindir = os.path.abspath(os.path.dirname(__file__))
with ExtendSysPath(f"{bindir}/../../examples/pytorch/translation"):
    from run_translation import main  # noqa


set_seed(42)
MARIAN_MODEL = "sshleifer/student_marian_en_ro_6_1"
MBART_TINY = "sshleifer/tiny-mbart"
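
# These tests exercise examples/pytorch/translation/run_translation.py end to end through the
# Trainer. A typical way to run them, assuming the standard transformers dev setup (the exact
# test-file path may differ in your checkout):
#
#   RUN_SLOW=1 pytest tests/extended/test_trainer_ext.py -v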


@require_torch
class TestTrainerExt(TestCasePlus):
    def run_seq2seq_quick(
        self,
        distributed=False,
        extra_args_str=None,
        predict_with_generate=True,
        do_train=True,
        do_eval=True,
        do_predict=True,
    ):
        output_dir = self.run_trainer(
            eval_steps=1,
            max_len=12,
            model_name=MBART_TINY,
            num_train_epochs=1,
            distributed=distributed,
            extra_args_str=extra_args_str,
            predict_with_generate=predict_with_generate,
            do_train=do_train,
            do_eval=do_eval,
            do_predict=do_predict,
        )
        logs = TrainerState.load_from_json(os.path.join(output_dir, "trainer_state.json")).log_history
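        # log_history is a list of dicts, one per logged step; an eval entry looks roughly like
        # this (illustrative values only): {"epoch": 1.0, "step": 1, "eval_loss": 4.2, "eval_bleu": 0.0}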

        if not do_eval:
            return

        eval_metrics = [log for log in logs if "eval_loss" in log.keys()]

        first_step_stats = eval_metrics[0]
        if predict_with_generate:
            assert "eval_bleu" in first_step_stats

            last_step_stats = eval_metrics[-1]
            assert isinstance(last_step_stats["eval_bleu"], float)
            assert not math.isnan(float(last_step_stats["eval_loss"])), "eval_loss must not be `nan`"

    @require_torch_non_multi_gpu
    def test_run_seq2seq_no_dist(self):
        self.run_seq2seq_quick()

    # verify that the trainer can handle non-distributed with n_gpu > 1
    @require_torch_multi_gpu
    def test_run_seq2seq_dp(self):
        self.run_seq2seq_quick(distributed=False)

    # verify that the trainer can handle distributed with n_gpu > 1
    @require_torch_multi_gpu
    def test_run_seq2seq_ddp(self):
        self.run_seq2seq_quick(distributed=True)

    # test --sharded_ddp w/o --fp16
    @require_torch_multi_gpu
    @require_fairscale
    def test_run_seq2seq_sharded_ddp(self):
        self.run_seq2seq_quick(distributed=True, extra_args_str="--sharded_ddp simple")

    # test --sharded_ddp w/ --fp16
    @unittest.skip("Requires an update of the env running those tests")
    @require_torch_multi_gpu
    @require_fairscale
    def test_run_seq2seq_sharded_ddp_fp16(self):
        self.run_seq2seq_quick(distributed=True, extra_args_str="--sharded_ddp simple --fp16")

    # test --sharded_ddp zero_dp_2 w/o --fp16
    @require_torch_multi_gpu
    @require_fairscale
    def test_run_seq2seq_fully_sharded_ddp(self):
        self.run_seq2seq_quick(distributed=True, extra_args_str="--sharded_ddp zero_dp_2", predict_with_generate=False)

    # test --sharded_ddp zero_dp_2 w/ --fp16
    @unittest.skip("Requires an update of the env running those tests")
    @require_torch_multi_gpu
    @require_fairscale
    def test_run_seq2seq_fully_sharded_ddp_fp16(self):
        self.run_seq2seq_quick(
            distributed=True, extra_args_str="--sharded_ddp zero_dp_2 --fp16", predict_with_generate=False
        )

    @require_apex
    @require_torch_gpu
    def test_run_seq2seq_apex(self):
        # XXX: apex breaks the trainer if it's run twice, e.g. calling run_seq2seq.main() a 2nd
        # time from the same program, and it breaks other tests that run in the same pytest worker.
        # Therefore, until this is sorted out, it must run only in an external program, i.e. with
        # distributed=True in this test, and only under one or more gpus - if we want cpu we will
        # need to make a special test.
        #
        # Specifically, the problem was traced to self.optimizer.step() - if it's run a 2nd time
        # via a 2nd main() call, it botches the future eval.
        #
        self.run_seq2seq_quick(distributed=True, extra_args_str="--fp16 --fp16_backend=apex")
        # test a 2nd time - was getting eval_loss: nan
        # to reproduce the problem, set distributed=False
        self.run_seq2seq_quick(distributed=True, extra_args_str="--fp16 --fp16_backend=apex")

    @parameterized.expand(["base", "low", "high", "mixed"])
    @require_torch_multi_gpu
    def test_trainer_log_level_replica(self, experiment_id):
        # as each sub-test is slow-ish, this test is split into multiple sub-tests to avoid CI timeout
        experiments = dict(
            # test with the default log_level - should be info and thus log info once
            base=dict(extra_args_str="", n_matches=1),
            # test with low log_level and log_level_replica - should be noisy on all processes
            # now the info string should appear twice on 2 processes
            low=dict(extra_args_str="--log_level debug --log_level_replica debug", n_matches=2),
            # test with high log_level and low log_level_replica
            # now the info string should appear once only on the replica
            high=dict(extra_args_str="--log_level error --log_level_replica debug", n_matches=1),
            # test with high log_level and log_level_replica - should be quiet on all processes
            mixed=dict(extra_args_str="--log_level error --log_level_replica error", n_matches=0),
        )
        data = experiments[experiment_id]
        kwargs = dict(distributed=True, predict_with_generate=False, do_eval=False, do_predict=False)
        log_info_string = "Running training"
        with CaptureStderr() as cl:
            self.run_seq2seq_quick(**kwargs, extra_args_str=data["extra_args_str"])
        n_matches = len(re.findall(log_info_string, cl.err))
        self.assertEqual(n_matches, data["n_matches"])

    @slow
    def test_run_seq2seq(self):
        output_dir = self.run_trainer(
            eval_steps=2,
            max_len=128,
            model_name=MARIAN_MODEL,
            learning_rate=3e-4,
            num_train_epochs=10,
            distributed=False,
        )
        # Check metrics
        logs = TrainerState.load_from_json(os.path.join(output_dir, "trainer_state.json")).log_history
        eval_metrics = [log for log in logs if "eval_loss" in log.keys()]
        first_step_stats = eval_metrics[0]
        last_step_stats = eval_metrics[-1]

        assert first_step_stats["eval_loss"] > last_step_stats["eval_loss"], "model learned nothing"
        assert isinstance(last_step_stats["eval_bleu"], float)

        # test if do_predict saves generations and metrics
        contents = os.listdir(output_dir)
        contents = {os.path.basename(p) for p in contents}
        assert "generated_predictions.txt" in contents
        assert "predict_results.json" in contents
    @slow
    @require_bitsandbytes
    def test_run_seq2seq_bnb(self):
        from transformers.training_args import OptimizerNames

        def train_and_return_metrics(optim: str) -> Tuple[int, int, float]:
            from pathlib import Path

            extra_args = (
                f"--skip_memory_metrics 0 --optim {optim} --do_eval False --do_predict False "
                "--adafactor False --log_level debug"
            )

            output_dir = self.run_trainer(
                eval_steps=2,
                max_len=128,
                model_name=MARIAN_MODEL,
                learning_rate=3e-4,
                num_train_epochs=1,
                distributed=True,  # force run in a new process
                extra_args_str=extra_args,
                do_eval=False,
                do_predict=False,
            )

            # Check metrics
            logs = TrainerState.load_from_json(Path(output_dir, "trainer_state.json")).log_history
            gpu_peak_mem = logs[0]["train_mem_gpu_peaked_delta"]
            gpu_alloc_mem = logs[0]["train_mem_gpu_alloc_delta"]

            loss = logs[0]["train_loss"]
            return gpu_peak_mem, gpu_alloc_mem, loss

        gpu_peak_mem_orig, gpu_alloc_mem_orig, loss_orig = train_and_return_metrics(OptimizerNames.ADAMW_TORCH.value)
        gpu_peak_mem_bnb, gpu_alloc_mem_bnb, loss_bnb = train_and_return_metrics(OptimizerNames.ADAMW_BNB.value)

        gpu_peak_mem_diff_bytes = gpu_peak_mem_orig - gpu_peak_mem_bnb
        gpu_peak_mem_diff_percent = gpu_peak_mem_diff_bytes / gpu_peak_mem_bnb

        gpu_total_mem_orig = gpu_peak_mem_orig + gpu_alloc_mem_orig
        gpu_total_mem_bnb = gpu_peak_mem_bnb + gpu_alloc_mem_bnb

        gpu_total_mem_diff_bytes = gpu_total_mem_orig - gpu_total_mem_bnb
        gpu_total_mem_diff_percent = gpu_total_mem_diff_bytes / gpu_total_mem_bnb
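
        # note: despite their names, the *_percent variables above hold fractional ratios
        # (diff / bnb), not percentages - the first check below expects the orig peak to be
        # more than 11x the bnb peak, the second expects the orig total to be >= 1.2x the bnb total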

        # leave this for now if CI gets very different results
        # print(f"{gpu_alloc_mem_orig=:010d} {gpu_peak_mem_orig=:010d} {gpu_alloc_mem_orig+gpu_peak_mem_orig=:010d}" )
        # print(f" {gpu_alloc_mem_bnb=:010d}  {gpu_peak_mem_bnb=:010d}   {gpu_alloc_mem_bnb+gpu_peak_mem_bnb=:010d}")
        # print(f"{gpu_peak_mem_diff_bytes=}, {gpu_peak_mem_diff_percent=}")
        # print(f"{gpu_total_mem_orig=}, {gpu_total_mem_bnb=}")
        # print(f"{gpu_total_mem_diff_bytes=}, {gpu_total_mem_diff_percent=}")

        self.assertGreater(
            gpu_peak_mem_diff_percent,
            10,  # basically a huge difference - got ~30x on my desktop
            "should use very little peak gpu memory with BNB, compared to without it, "
            f"but got gpu_peak_mem_orig={gpu_peak_mem_orig} and gpu_peak_mem_bnb={gpu_peak_mem_bnb}",
        )

        self.assertGreater(
            gpu_total_mem_diff_percent,
            0.20,  # could easily be 0.50, but let's stay on the safe side
            "Using BNB should use less total GPU memory than without it, "
            f"but got gpu_total_mem_orig={gpu_total_mem_orig} and gpu_total_mem_bnb={gpu_total_mem_bnb}",
        )

        self.assertEqual(
            loss_orig, loss_bnb, f"loss should be the same, but got loss_orig={loss_orig}, loss_bnb={loss_bnb}"
        )

        # Additionally let's test that the absolute gpu memory difference is larger or about the
        # same as the expected saving coming from BNB (6 bytes per param)
        model = AutoModel.from_pretrained(MARIAN_MODEL)
        total_numel = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values())
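        # why ~6 bytes/param: Adam keeps two fp32 moment tensors per parameter (8 bytes/param),
        # while the 8-bit optimizer stores the same two moments quantized to int8 (~2 bytes/param)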
        bnb_saved_bytes = total_numel * 6  # 324MB

        self.assertGreater(
            gpu_total_mem_diff_bytes,
            bnb_saved_bytes * 0.8,  # add a safety margin, if it saved slightly less
            f"BNB should have saved about {bnb_saved_bytes} bytes, but the saved bytes were {gpu_total_mem_diff_bytes}",
        )

    def run_trainer(
        self,
        eval_steps: int,
        max_len: int,
        model_name: str,
        num_train_epochs: int,
        learning_rate: float = 3e-3,
        distributed: bool = False,
        extra_args_str: str = None,
        predict_with_generate: bool = True,
        do_train: bool = True,
        do_eval: bool = True,
        do_predict: bool = True,
    ):
        data_dir = self.test_file_dir / "../fixtures/tests_samples/wmt_en_ro"
        output_dir = self.get_auto_remove_tmp_dir()
        args_train = f"""
            --model_name_or_path {model_name}
            --train_file {data_dir}/train.json
            --validation_file {data_dir}/val.json
            --test_file {data_dir}/test.json
            --output_dir {output_dir}
            --overwrite_output_dir
            --max_train_samples 8
            --max_source_length {max_len}
            --max_target_length {max_len}
            --do_train
            --num_train_epochs {str(num_train_epochs)}
            --per_device_train_batch_size 4
            --learning_rate {learning_rate}
            --warmup_steps 8
            --logging_steps 0
            --logging_strategy no
            --save_steps {str(eval_steps)}
            --group_by_length
            --label_smoothing_factor 0.1
            --adafactor
            --target_lang ro_RO
            --source_lang en_XX
        """

        args_eval = f"""
            --do_eval
            --per_device_eval_batch_size 4
            --max_eval_samples 8
            --val_max_target_length {max_len}
            --evaluation_strategy steps
            --eval_steps {str(eval_steps)}
        """

        args_predict = """
            --do_predict
        """

        args = ""
        if do_train:
            args += args_train

        if do_eval:
            args += args_eval

        if do_predict:
            args += args_predict

        if predict_with_generate:
            args += "--predict_with_generate"

        args = args.split()
        if extra_args_str is not None:
            args.extend(extra_args_str.split())

        if distributed:
            n_gpu = get_gpu_count()
            master_port = get_torch_dist_unique_port()
            distributed_args = f"""
                -m torch.distributed.launch
                --nproc_per_node={n_gpu}
                --master_port={master_port}
                {self.examples_dir_str}/pytorch/translation/run_translation.py
            """.split()
            cmd = [sys.executable] + distributed_args + args
            # keep for quick debug
            # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] + cmd)); die
            execute_subprocess_async(cmd, env=self.get_env())
        else:
            testargs = ["run_translation.py"] + args
            with patch.object(sys, "argv", testargs):
                main()

        return output_dir