# coding=utf-8
# Copyright 2022 The HuggingFace Team Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import tempfile
import unittest

from packaging import version

from transformers import (
    AutoModel,
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)
from transformers.testing_utils import (
    is_torch_available,
    require_accelerate,
    require_bitsandbytes,
    require_torch,
    require_torch_gpu,
    require_torch_multi_gpu,
    slow,
)
from transformers.utils.versions import importlib_metadata


if is_torch_available():
    import torch
    import torch.nn as nn

    class LoRALayer(nn.Module):
        """Wraps a linear layer with LoRA-like adapter - Used for testing purposes only"""

        def __init__(self, module: nn.Module, rank: int):
            super().__init__()
            self.module = module
            self.adapter = nn.Sequential(
                nn.Linear(module.in_features, rank, bias=False),
                nn.Linear(rank, module.out_features, bias=False),
            )
            small_std = (2.0 / (5 * min(module.in_features, module.out_features))) ** 0.5
            nn.init.normal_(self.adapter[0].weight, std=small_std)
            nn.init.zeros_(self.adapter[1].weight)
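            # The second adapter matrix is zero-initialized, so the adapter is a no-op at the start of
            # training and the wrapped module's output is initially unchanged (LoRA-style initialization).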
            self.adapter.to(module.weight.device)

        def forward(self, input, *args, **kwargs):
            return self.module(input, *args, **kwargs) + self.adapter(input)


@require_bitsandbytes
@require_accelerate
@require_torch
@require_torch_gpu
@slow
class Base4bitTest(unittest.TestCase):
    # We keep the constants as class attributes and do the model loading inside the setUp function

    # We need to test on relatively large models (aka >1b parameters, otherwise the quantization may not work as expected)
    # Therefore here we use only bloom-1b7 to test our module
    model_name = "bigscience/bloom-1b7"

    # Constant values
    EXPECTED_RELATIVE_DIFFERENCE = (
        2.109659552692574  # This was obtained on an RTX Titan, so the number might change slightly
    )
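    # Note: the ratio is ~2x rather than ~4x most likely because only the nn.Linear weights are quantized to
    # 4-bit, while the (large) embedding and lm_head weights stay in half precision, which caps the savings.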

    input_text = "Hello my name is"
    EXPECTED_OUTPUTS = set()
    EXPECTED_OUTPUTS.add("Hello my name is John and I am a professional photographer. I")
    EXPECTED_OUTPUTS.add("Hello my name is John.\nI am a friend of your father.\n")
    MAX_NEW_TOKENS = 10

    def setUp(self):
        # Models and tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)


class Bnb4BitTest(Base4bitTest):
    def setUp(self):
        super().setUp()

        # Models and tokenizer
        self.model_fp16 = AutoModelForCausalLM.from_pretrained(
            self.model_name, torch_dtype=torch.float16, device_map="auto"
        )
        self.model_4bit = AutoModelForCausalLM.from_pretrained(self.model_name, load_in_4bit=True, device_map="auto")

    def tearDown(self):
        r"""
        TearDown function needs to be called at the end of each test to free the GPU memory and cache, also to
        avoid unexpected behaviors. Please see: https://discuss.pytorch.org/t/how-can-we-release-gpu-memory-cache/14530/27
        """
        del self.model_fp16
        del self.model_4bit

        gc.collect()
        torch.cuda.empty_cache()

    def test_quantization_config_json_serialization(self):
        r"""
        A simple test to check if the quantization config is correctly serialized and deserialized
        """
        config = self.model_4bit.config

        self.assertTrue(hasattr(config, "quantization_config"))

        _ = config.to_dict()
        _ = config.to_diff_dict()

        _ = config.to_json_string()
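        # Note (assumption): the serialized dict is also what gets written under `quantization_config`
        # when the model config is saved to `config.json`.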

    def test_memory_footprint(self):
        r"""
        A simple test to check if the model conversion has been done correctly by checking on the
        memory footprint of the converted model and the class type of the linear layers of the converted models
        """
        from bitsandbytes.nn import Params4bit

        mem_fp16 = self.model_fp16.get_memory_footprint()
        mem_4bit = self.model_4bit.get_memory_footprint()

        self.assertAlmostEqual(mem_fp16 / mem_4bit, self.EXPECTED_RELATIVE_DIFFERENCE)
        self.assertTrue(self.model_4bit.transformer.h[0].mlp.dense_4h_to_h.weight.__class__ == Params4bit)

    def test_linear_are_4bit(self):
        r"""
        A simple test to check if the model conversion has been done correctly by checking that the linear
        layers of the converted model (except `lm_head` and the modules kept in fp32) store packed uint8 4-bit weights
        """
        from transformers import T5PreTrainedModel

        self.model_fp16.get_memory_footprint()
        self.model_4bit.get_memory_footprint()

        for name, module in self.model_4bit.named_modules():
            if isinstance(module, torch.nn.Linear):
                if name not in ["lm_head"] + T5PreTrainedModel._keep_in_fp32_modules:
                    # 4-bit parameters are packed in uint8 variables
                    self.assertTrue(module.weight.dtype == torch.uint8)
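                    # Note (assumption): two 4-bit values are packed per uint8 element, so the stored
                    # tensor holds roughly half as many elements as the original fp16 weight.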

    def test_generate_quality(self):
        r"""
        Test the generation quality of the quantized model and check that we match the expected outputs.
        Given that we are operating on small numbers + the testing model is relatively small, we might not get
        the same output across GPUs. So we'll generate a few tokens (5-10) and check their output.
        """
        encoded_input = self.tokenizer(self.input_text, return_tensors="pt")
        output_sequences = self.model_4bit.generate(input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10)

        self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)

    def test_generate_quality_config(self):
        r"""
        Test that loading the model with a `BitsAndBytesConfig` is equivalent to passing `load_in_4bit=True` directly
        """
        bnb_config = BitsAndBytesConfig()
        bnb_config.load_in_4bit = True

        model_4bit_from_config = AutoModelForCausalLM.from_pretrained(
            self.model_name, quantization_config=bnb_config, device_map="auto"
        )

        encoded_input = self.tokenizer(self.input_text, return_tensors="pt")
        output_sequences = model_4bit_from_config.generate(
            input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10
        )

        self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)

    def test_raise_on_save_pretrained(self):
        r"""
        Test that trying to save a model after converting it to 4-bit raises a `NotImplementedError`.
        """
        with self.assertRaises(NotImplementedError), tempfile.TemporaryDirectory() as tmpdirname:
            self.model_4bit.save_pretrained(tmpdirname)

    def test_raise_if_config_and_load_in_4bit(self):
        r"""
        Test that loading the model with the config and `load_in_4bit` raises an error
        """
        bnb_config = BitsAndBytesConfig()

        with self.assertRaises(ValueError):
            _ = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                quantization_config=bnb_config,
                load_in_4bit=True,
                device_map="auto",
                bnb_4bit_quant_type="nf4",
            )
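        # For reference, a sketch of the supported pattern (all 4-bit options go through the config object only):
        #   bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4")
        #   model = AutoModelForCausalLM.from_pretrained(self.model_name, quantization_config=bnb_config, device_map="auto")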

    def test_device_and_dtype_assignment(self):
        r"""
        Test that trying to cast (or assign a device to) a model after converting it to 4-bit raises an error.
        Also checks that non-quantized models can still be cast correctly.
        """
        with self.assertRaises(ValueError):
            # Tries with `str`
            self.model_4bit.to("cpu")

        with self.assertRaises(ValueError):
            # Tries with a `dtype`
            self.model_4bit.to(torch.float16)

        with self.assertRaises(ValueError):
            # Tries with a `device`
            self.model_4bit.to(torch.device("cuda:0"))

        with self.assertRaises(ValueError):
            # Tries to cast to fp32 with `float()`
            self.model_4bit.float()

        with self.assertRaises(ValueError):
            # Tries to cast to fp16 with `half()`
            self.model_4bit.half()

        # Test if we did not break anything
        encoded_input = self.tokenizer(self.input_text, return_tensors="pt")

        self.model_fp16 = self.model_fp16.to(torch.float32)
        _ = self.model_fp16.generate(input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10)

        # Check this does not throw an error
        _ = self.model_fp16.to("cpu")

        # Check this does not throw an error
        _ = self.model_fp16.half()

        # Check this does not throw an error
        _ = self.model_fp16.float()

    def test_fp32_4bit_conversion(self):
        r"""
        Test whether it is possible to mix both `4bit` and `fp32` weights when using `keep_in_fp32_modules` correctly.
        """
        model = AutoModelForSeq2SeqLM.from_pretrained("t5-small", load_in_4bit=True, device_map="auto")
        self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wo.weight.dtype == torch.float32)
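        # `wo` is listed in T5's `_keep_in_fp32_modules`, so it is expected to stay in fp32 even though
        # the rest of the model is loaded in 4-bit.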


@require_bitsandbytes
@require_accelerate
@require_torch
@require_torch_gpu
@slow
class Bnb4BitT5Test(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.model_name = "t5-small"
        cls.dense_act_model_name = "google/flan-t5-small"  # flan-t5 uses dense-act instead of dense-relu-dense
        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
        cls.input_text = "Translate in German: Hello, my dog is cute"

    def tearDown(self):
        r"""
        TearDown function needs to be called at the end of each test to free the GPU memory and cache, also to
        avoid unexpected behaviors. Please see: https://discuss.pytorch.org/t/how-can-we-release-gpu-memory-cache/14530/27
        """
        gc.collect()
        torch.cuda.empty_cache()

    def test_inference_without_keep_in_fp32(self):
        r"""
        Test that inference runs correctly when `_keep_in_fp32_modules` is disabled, i.e. all linear layers are quantized in 4bit.
        `flan-t5-small` uses `T5DenseGatedActDense` whereas `t5-small` uses `T5DenseReluDense`. We need to test
        both cases.
        """
        from transformers import T5ForConditionalGeneration

        modules = T5ForConditionalGeneration._keep_in_fp32_modules
        T5ForConditionalGeneration._keep_in_fp32_modules = None

        # test with `t5-small`
        model = T5ForConditionalGeneration.from_pretrained(self.model_name, load_in_4bit=True, device_map="auto")
        encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(0)
        _ = model.generate(**encoded_input)

        # test with `flan-t5-small`
        model = T5ForConditionalGeneration.from_pretrained(
            self.dense_act_model_name, load_in_4bit=True, device_map="auto"
        )
        encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(0)
        _ = model.generate(**encoded_input)
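        # Restore the class attribute so the monkey-patching above does not leak into other tests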
        T5ForConditionalGeneration._keep_in_fp32_modules = modules

    def test_inference_with_keep_in_fp32(self):
        r"""
        Test whether it is possible to mix both `4bit` and `fp32` weights when using `keep_in_fp32_modules` correctly.
        `flan-t5-small` uses `T5DenseGatedActDense` whereas `t5-small` uses `T5DenseReluDense`. We need to test
        both cases.
        """
        import bitsandbytes as bnb

        from transformers import T5ForConditionalGeneration

        # test with `t5-small`
        model = T5ForConditionalGeneration.from_pretrained(self.model_name, load_in_4bit=True, device_map="auto")

        # there was a bug with decoders - this test checks that it is fixed
        self.assertTrue(isinstance(model.decoder.block[0].layer[0].SelfAttention.q, bnb.nn.Linear4bit))

        encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(0)
        _ = model.generate(**encoded_input)

        # test with `flan-t5-small`
        model = T5ForConditionalGeneration.from_pretrained(
            self.dense_act_model_name, load_in_4bit=True, device_map="auto"
        )
        encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(0)
        _ = model.generate(**encoded_input)


class Classes4BitModelTest(Base4bitTest):
    def setUp(self):
        super().setUp()
        # model_name
        self.model_name = "bigscience/bloom-560m"
        self.seq_to_seq_name = "t5-small"

        # Different types of model

        self.base_model = AutoModel.from_pretrained(self.model_name, load_in_4bit=True, device_map="auto")
        # Sequence classification model
        self.sequence_model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name, load_in_4bit=True, device_map="auto"
        )
        # CausalLM model
        self.model_4bit = AutoModelForCausalLM.from_pretrained(self.model_name, load_in_4bit=True, device_map="auto")
        # Seq2seq model
        self.seq_to_seq_model = AutoModelForSeq2SeqLM.from_pretrained(
            self.seq_to_seq_name, load_in_4bit=True, device_map="auto"
        )

    def tearDown(self):
        r"""
        TearDown function needs to be called at the end of each test to free the GPU memory and cache, also to
        avoid unexpected behaviors. Please see: https://discuss.pytorch.org/t/how-can-we-release-gpu-memory-cache/14530/27
        """
        del self.base_model
        del self.sequence_model
        del self.model_4bit
        del self.seq_to_seq_model

        gc.collect()
        torch.cuda.empty_cache()

    def test_correct_head_class(self):
        r"""
        A simple test to check if the last modules for some classes (AutoModelForCausalLM or SequenceClassification)
        are kept in their native class.
        """
        from bitsandbytes.nn import Params4bit

        self.assertTrue(self.base_model.h[-1].mlp.dense_4h_to_h.weight.__class__ == Params4bit)

        # Other heads should be nn.Parameter
        self.assertTrue(self.model_4bit.lm_head.weight.__class__ == torch.nn.Parameter)
        self.assertTrue(self.sequence_model.score.weight.__class__ == torch.nn.Parameter)
        self.assertTrue(self.seq_to_seq_model.lm_head.weight.__class__ == torch.nn.Parameter)


class Pipeline4BitTest(Base4bitTest):
    def setUp(self):
        super().setUp()

    def tearDown(self):
        r"""
        TearDown function needs to be called at the end of each test to free the GPU memory and cache, also to
        avoid unexpected behaviors. Please see: https://discuss.pytorch.org/t/how-can-we-release-gpu-memory-cache/14530/27
        """
        del self.pipe

        gc.collect()
        torch.cuda.empty_cache()

    def test_pipeline(self):
        r"""
        The aim of this test is to verify that 4-bit models are compatible with `pipeline` from transformers. Since
        we use pipelines for inference speed benchmarking, we want to make sure that this feature does not break
        anything in `pipeline`.
        """
        # self._clear_cuda_cache()
        self.pipe = pipeline(
            "text-generation",
            model=self.model_name,
            model_kwargs={"device_map": "auto", "load_in_4bit": True, "torch_dtype": torch.float16},
            max_new_tokens=self.MAX_NEW_TOKENS,
        )
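        # The quantization arguments are forwarded to `from_pretrained` through `model_kwargs`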

        # Run a forward pass through the pipeline and check the generated text
        pipeline_output = self.pipe(self.input_text)
        self.assertIn(pipeline_output[0]["generated_text"], self.EXPECTED_OUTPUTS)


@require_torch_multi_gpu
class Bnb4bitTestMultiGpu(Base4bitTest):
    def setUp(self):
        super().setUp()

    def test_multi_gpu_loading(self):
        r"""
        This tests that the model has been loaded and can be used correctly on a multi-GPU setup.
        Let's just try to load a model on 2 GPUs and see if it works. The model we test has a total size of ~2GB, so 3GB of GPU memory should suffice
        """

        model_parallel = AutoModelForCausalLM.from_pretrained(
            self.model_name, load_in_4bit=True, device_map="balanced"
        )

        # Check correct device map
        self.assertEqual(set(model_parallel.hf_device_map.values()), {0, 1})
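        # With `device_map="balanced"`, accelerate should split the model across both GPUs, hence device ids {0, 1}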

        # Check that inference pass works on the model
        encoded_input = self.tokenizer(self.input_text, return_tensors="pt")

        # Generate from the sharded model and check the output
        output_parallel = model_parallel.generate(input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10)
        self.assertIn(self.tokenizer.decode(output_parallel[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)


class Bnb4BitTestTraining(Base4bitTest):
    def setUp(self):
        self.model_name = "facebook/opt-350m"
        super().setUp()

    def test_training(self):
        if version.parse(importlib_metadata.version("bitsandbytes")) < version.parse("0.37.0"):
            return

        # Step 1: freeze all parameters
        model = AutoModelForCausalLM.from_pretrained(self.model_name, load_in_4bit=True)

        self.assertEqual(set(model.hf_device_map.values()), {torch.cuda.current_device()})
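        # When no `device_map` is passed, the quantized model is expected to end up on the current CUDA device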

        for param in model.parameters():
            param.requires_grad = False  # freeze the model - train adapters later
            if param.ndim == 1:
                # cast the small parameters (e.g. layernorm) to fp32 for stability
                param.data = param.data.to(torch.float32)

        # Step 2: add adapters
        for _, module in model.named_modules():
            if "OPTAttention" in repr(type(module)):
                module.q_proj = LoRALayer(module.q_proj, rank=16)
                module.k_proj = LoRALayer(module.k_proj, rank=16)
                module.v_proj = LoRALayer(module.v_proj, rank=16)

        # Step 3: dummy batch
        batch = self.tokenizer("Test batch ", return_tensors="pt").to(0)

        # Step 4: Check if the gradient is not None
        with torch.cuda.amp.autocast():
            out = model.forward(**batch)
            out.logits.norm().backward()
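            # The norm of the logits serves as a dummy scalar loss so that backward() can populate gradients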

        for module in model.modules():
            if isinstance(module, LoRALayer):
                self.assertTrue(module.adapter[1].weight.grad is not None)
                self.assertTrue(module.adapter[1].weight.grad.norm().item() > 0)
            elif isinstance(module, nn.Embedding):
                self.assertTrue(module.weight.grad is None)