"api/examples/generate-streaming/main.go" did not exist on "4c7db6b7e917ef475a9d5dccd180cefd298175e4"
test_trainer_ext.py 13.4 KB
Newer Older
Sylvain Gugger's avatar
Sylvain Gugger committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
import os
import re
import sys
from pathlib import Path
from typing import Tuple
from unittest.mock import patch

from parameterized import parameterized

from transformers.testing_utils import (
    CaptureStderr,
    ExtendSysPath,
    TestCasePlus,
    execute_subprocess_async,
    get_gpu_count,
    get_torch_dist_unique_port,
    require_apex,
    require_bitsandbytes,
    require_torch,
    require_torch_gpu,
    require_torch_multi_gpu,
    require_torch_non_multi_gpu,
    slow,
)
from transformers.trainer_callback import TrainerState
from transformers.trainer_utils import set_seed


bindir = os.path.abspath(os.path.dirname(__file__))
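# the translation example is a standalone script, so extend sys.path for the
# duration of the import to make its main() callable in-process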
with ExtendSysPath(f"{bindir}/../../examples/pytorch/translation"):
    from run_translation import main  # noqa


set_seed(42)
MARIAN_MODEL = "sshleifer/student_marian_en_ro_6_1"
MBART_TINY = "sshleifer/tiny-mbart"


@require_torch
class TestTrainerExt(TestCasePlus):
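    """End-to-end tests that drive examples/pytorch/translation/run_translation.py
    through the Trainer, both in-process and as a subprocess via torch.distributed.run."""
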
    def run_seq2seq_quick(
        self,
        distributed=False,
        extra_args_str=None,
        predict_with_generate=True,
        do_train=True,
        do_eval=True,
        do_predict=True,
    ):
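        # quick end-to-end run with tiny mbart (1 epoch, tiny sequences), then
        # sanity-check the eval metrics recorded in trainer_state.json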
        output_dir = self.run_trainer(
            eval_steps=1,
            max_len=12,
            model_name=MBART_TINY,
            num_train_epochs=1,
            distributed=distributed,
            extra_args_str=extra_args_str,
            predict_with_generate=predict_with_generate,
            do_train=do_train,
            do_eval=do_eval,
            do_predict=do_predict,
        )
        logs = TrainerState.load_from_json(os.path.join(output_dir, "trainer_state.json")).log_history

        if not do_eval:
            return

        eval_metrics = [log for log in logs if "eval_loss" in log.keys()]

        first_step_stats = eval_metrics[0]
        if predict_with_generate:
            assert "eval_bleu" in first_step_stats

            last_step_stats = eval_metrics[-1]
            assert isinstance(last_step_stats["eval_bleu"], float)
            assert not math.isnan(float(last_step_stats["eval_loss"])), "eval_loss must not be `nan`"

    @require_torch_non_multi_gpu
    def test_run_seq2seq_no_dist(self):
        self.run_seq2seq_quick()

    # verify that the trainer can handle non-distributed with n_gpu > 1
    @require_torch_multi_gpu
    def test_run_seq2seq_dp(self):
        self.run_seq2seq_quick(distributed=False)

    # verify that the trainer can handle distributed with n_gpu > 1
    @require_torch_multi_gpu
    def test_run_seq2seq_ddp(self):
        self.run_seq2seq_quick(distributed=True)

    @require_apex
    @require_torch_gpu
    def test_run_seq2seq_apex(self):
        # XXX: apex breaks the trainer if it's run twice, e.g. run_seq2seq.main() called from the
        # same program, and it breaks other tests that run from the same pytest worker. Therefore
        # until this is sorted out it must be run only in an external program, that is
        # distributed=True in this test, and only under one or more gpus - if we want cpu we will
        # need to make a special test
        #
        # specifically, the problem was traced to self.optimizer.step() - if it's run a 2nd time
        # via a 2nd main() call it botches the future eval.
        #
        self.run_seq2seq_quick(distributed=True, extra_args_str="--fp16 --fp16_backend=apex")
        # test a 2nd time - this used to produce `eval_loss: nan`
        # to reproduce the problem set distributed=False
        self.run_seq2seq_quick(distributed=True, extra_args_str="--fp16 --fp16_backend=apex")

    @parameterized.expand(["base", "low", "high", "mixed"])
    @require_torch_multi_gpu
    def test_trainer_log_level_replica(self, experiment_id):
        # as each sub-test is slow-ish, this is split into multiple sub-tests to avoid CI timeout
        experiments = {
            # test with the default log_level - should be info and thus log info once
            "base": {"extra_args_str": "", "n_matches": 1},
            # test with low log_level and log_level_replica - should be noisy on all processes
            # now the info string should appear twice on 2 processes
            "low": {"extra_args_str": "--log_level debug --log_level_replica debug", "n_matches": 2},
            # test with high log_level and low log_level_replica
            # now the info string should appear once only on the replica
            "high": {"extra_args_str": "--log_level error --log_level_replica debug", "n_matches": 1},
            # test with high log_level and log_level_replica - should be quiet on all processes
            "mixed": {"extra_args_str": "--log_level error --log_level_replica error", "n_matches": 0},
        }

        data = experiments[experiment_id]
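        # eval/predict are disabled below: only the log output from training matters here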
        kwargs = {"distributed": True, "predict_with_generate": False, "do_eval": False, "do_predict": False}
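        # the Trainer logs this info-level string once per process that isn't silenced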
        log_info_string = "Running training"
        with CaptureStderr() as cl:
            self.run_seq2seq_quick(**kwargs, extra_args_str=data["extra_args_str"])
        n_matches = len(re.findall(log_info_string, cl.err))
        self.assertEqual(n_matches, data["n_matches"])

    @slow
    def test_run_seq2seq(self):
        output_dir = self.run_trainer(
            eval_steps=2,
            max_len=128,
            model_name=MARIAN_MODEL,
            learning_rate=3e-4,
            num_train_epochs=10,
            distributed=False,
        )

        # Check metrics
        logs = TrainerState.load_from_json(os.path.join(output_dir, "trainer_state.json")).log_history
        eval_metrics = [log for log in logs if "eval_loss" in log.keys()]
        first_step_stats = eval_metrics[0]
        last_step_stats = eval_metrics[-1]

        assert first_step_stats["eval_loss"] > last_step_stats["eval_loss"], "model learned nothing"
        assert isinstance(last_step_stats["eval_bleu"], float)

        # test if do_predict saves generations and metrics
        contents = os.listdir(output_dir)
        contents = {os.path.basename(p) for p in contents}
        assert "generated_predictions.txt" in contents
        assert "predict_results.json" in contents

    @slow
    @require_bitsandbytes
    def test_run_seq2seq_bnb(self):
        from transformers.training_args import OptimizerNames

        def train_and_return_metrics(optim: str) -> Tuple[int, int, float]:
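            # run one full training in a subprocess on a single gpu and return the gpu
            # memory stats (in MB) plus the final train loss from trainer_state.json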
            extra_args = "--skip_memory_metrics 0"

            output_dir = self.run_trainer(
                max_len=128,
                model_name=MARIAN_MODEL,
                learning_rate=3e-4,
                num_train_epochs=1,
                optim=optim,
                distributed=True,  # force run in a new process
                extra_args_str=extra_args,
                do_eval=False,
                do_predict=False,
                n_gpus_to_use=1,  # to allow deterministic fixed memory usage
            )

            # Check metrics
            logs = TrainerState.load_from_json(Path(output_dir, "trainer_state.json")).log_history
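            # memory deltas are logged in bytes - divide by 2**20 to convert to MB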
            gpu_peak_mem_mb = int(logs[0]["train_mem_gpu_peaked_delta"] / 2**20)
            gpu_alloc_mem_mb = int(logs[0]["train_mem_gpu_alloc_delta"] / 2**20)

            loss = logs[0]["train_loss"]
            return gpu_peak_mem_mb, gpu_alloc_mem_mb, loss

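        # train once with plain torch AdamW and once with bnb's 8-bit AdamW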
        gpu_peak_mem_orig, gpu_alloc_mem_orig, loss_orig = train_and_return_metrics(OptimizerNames.ADAMW_TORCH.value)
        gpu_peak_mem_bnb, gpu_alloc_mem_bnb, loss_bnb = train_and_return_metrics(OptimizerNames.ADAMW_BNB.value)

        gpu_alloc_mem_diff = gpu_alloc_mem_orig - gpu_alloc_mem_bnb

        gpu_total_mem_orig = gpu_peak_mem_orig + gpu_alloc_mem_orig
        gpu_total_mem_bnb = gpu_peak_mem_bnb + gpu_alloc_mem_bnb
        gpu_total_mem_diff = gpu_total_mem_orig - gpu_total_mem_bnb

        # sshleifer/student_marian_en_ro_6_1 has 54M parameters, 29M of which are in `nn.Embedding`,
        # which doesn't get quantized and remains in fp32. Therefore only 25M parameters have their
        # optim state quantized to 2 bytes, and the diff in optim memory usage is derived as follows:
        #
        # - normal 25*8=~200MB (8 bytes per param)
        # - bnb    25*2= ~50MB (2 bytes per param)
        #
        # Thus we should expect ~150MB total memory saved.
        #
        # Peak memory should be the same - the total should be different by about that same margin
        #
        # After leaving a small margin to accommodate for differences between gpus, let's check
        # that we have at least 120MB in savings
        expected_savings = 120

        # uncomment the following if this test starts failing - requires py38 for a new print feature
        # gpu_peak_mem_diff = gpu_peak_mem_orig - gpu_peak_mem_bnb
        # print(f"{gpu_alloc_mem_orig=}MB {gpu_peak_mem_orig=}MB {gpu_alloc_mem_orig+gpu_peak_mem_orig=}MB")
        # print(f" {gpu_alloc_mem_bnb=}MB  {gpu_peak_mem_bnb=}MB  {gpu_alloc_mem_bnb+gpu_peak_mem_bnb=}MB")
        # print(f"{gpu_alloc_mem_diff=}MB")
        # print(f"{gpu_peak_mem_diff=}MB")
        # print(f"{gpu_total_mem_orig=}MB, {gpu_total_mem_bnb=}MB")
        # print(f"{gpu_total_mem_diff=}MB")

        self.assertGreater(
            gpu_alloc_mem_diff,
            expected_savings,
            "should use ~150MB less alloc gpu memory with BNB, compared to without it for this model but got"
            f" a difference of {gpu_alloc_mem_diff}MB, with gpu_alloc_mem_orig={gpu_alloc_mem_orig}MB and"
            f" gpu_alloc_mem_bnb={gpu_alloc_mem_bnb}MB",
        )

        self.assertGreater(
            gpu_total_mem_diff,
            expected_savings,
            "should use ~150MB less total gpu memory with BNB, compared to without it for this model but got"
            f" a difference of {gpu_total_mem_diff}MB, with gpu_total_mem_orig={gpu_total_mem_orig}MB and"
            f" gpu_total_mem_bnb={gpu_total_mem_bnb}MB",
        )

        self.assertEqual(
            loss_orig, loss_bnb, f"loss should be the same, but got loss_orig={loss_orig}, loss_bnb={loss_bnb}"
        )

    def run_trainer(
        self,
        max_len: int,
        model_name: str,
        num_train_epochs: int,
        learning_rate: float = 3e-3,
        optim: str = "adafactor",
        distributed: bool = False,
        extra_args_str: str = None,
        eval_steps: int = 0,
        predict_with_generate: bool = True,
        do_train: bool = True,
        do_eval: bool = True,
        do_predict: bool = True,
        n_gpus_to_use: int = None,
    ):
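        # assemble the run_translation.py CLI from the requested stages, then either launch
        # it via torch.distributed.run or call main() in-process with a patched sys.argv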
        data_dir = self.test_file_dir / "../fixtures/tests_samples/wmt_en_ro"
        output_dir = self.get_auto_remove_tmp_dir()
        args_train = f"""
            --model_name_or_path {model_name}
            --train_file {data_dir}/train.json
            --validation_file {data_dir}/val.json
            --test_file {data_dir}/test.json
            --output_dir {output_dir}
            --overwrite_output_dir
            --max_train_samples 8
            --max_source_length {max_len}
            --max_target_length {max_len}
            --do_train
            --num_train_epochs {str(num_train_epochs)}
            --per_device_train_batch_size 4
            --learning_rate {learning_rate}
            --warmup_steps 8
            --logging_steps 0
            --logging_strategy no
            --save_steps {str(eval_steps)}
            --group_by_length
            --label_smoothing_factor 0.1
            --target_lang ro_RO
            --source_lang en_XX
        """.split()

        args_eval = f"""
            --do_eval
            --per_device_eval_batch_size 4
            --max_eval_samples 8
            --val_max_target_length {max_len}
            --evaluation_strategy steps
            --eval_steps {str(eval_steps)}
        """.split()

        args_predict = """
            --do_predict
        """.split()

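        # compose the final CLI from whichever stages are enabled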
        args = []
        if do_train:
            args += args_train

        if do_eval:
            args += args_eval

        if do_predict:
            args += args_predict

        if predict_with_generate:
            args += "--predict_with_generate".split()

        if do_train:
            if optim == "adafactor":
                args += "--adafactor".split()
            else:
                args += f"--optim {optim}".split()

        if extra_args_str is not None:
            args += extra_args_str.split()

        if distributed:
            if n_gpus_to_use is None:
                n_gpus_to_use = get_gpu_count()
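            # a unique port lets concurrent test workers run distributed training without clashing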
            master_port = get_torch_dist_unique_port()
            distributed_args = f"""
                -m torch.distributed.run
                --nproc_per_node={n_gpus_to_use}
                --master_port={master_port}
                {self.examples_dir_str}/pytorch/translation/run_translation.py
            """.split()
            cmd = [sys.executable] + distributed_args + args
            # keep for quick debug
            # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
            execute_subprocess_async(cmd, env=self.get_env())
        else:
            testargs = ["run_translation.py"] + args
            with patch.object(sys, "argv", testargs):
                main()

        return output_dir