from itertools import product
import math

import pytest
import torch

from tests.helpers import TRUE_FALSE, describe_dtype, id_formatter

transformers = pytest.importorskip("transformers")


def get_4bit_config():
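    """Default 4-bit quantization config: NF4 with double quantization and fp16 compute."""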
    return transformers.BitsAndBytesConfig(
        load_in_4bit=True,
        load_in_8bit=False,
        llm_int8_threshold=6.0,
        llm_int8_has_fp16_weight=False,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
    )


def get_model_and_tokenizer(config):
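    """Load a causal LM and tokenizer for a (model_name_or_path, quant_type) pair.

    quant_type "16bit" disables 4-bit loading; "nf4"/"fp4" pick the 4-bit data type.
    """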
    model_name_or_path, quant_type = config
    bnb_config = get_4bit_config()
    if quant_type == "16bit":
        bnb_config.load_in_4bit = False
    else:
        bnb_config.bnb_4bit_quant_type = quant_type
    model = transformers.AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        quantization_config=bnb_config,
        max_memory={0: "48GB"},
        device_map="auto",
        torch_dtype=torch.bfloat16,
    ).eval()

    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path)

    return model, tokenizer


def get_prompt_for_generation_eval(text, add_roles=True):
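    """Wrap `text` in a simple Human/Assistant chat template (roles optional)."""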
    description = (
        "A chat between a curious human and an artificial intelligence assistant. "
        "The assistant gives helpful, detailed, and polite answers to the user's questions."
    )
    if add_roles:
        prompt = f"{description} ### Human: {text} ### Assistant:"
    else:
        prompt = f"{description} {text}"
    return prompt


def generate(model, tokenizer, text, generation_config, prompt_func=get_prompt_for_generation_eval):
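    """Generate one sampled completion for `text` on cuda:0 and decode it to a string."""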
    text = prompt_func(text)
    inputs = tokenizer(text, return_tensors="pt").to("cuda:0")
    outputs = model.generate(inputs=inputs["input_ids"], generation_config=generation_config)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
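
# Usage sketch for `generate` (not exercised by the tests below; assumes a CUDA
# device and a model/tokenizer pair, e.g. from get_model_and_tokenizer):
#
#   config = transformers.GenerationConfig(max_new_tokens=20, do_sample=True)
#   print(generate(model, tokenizer, "What is pi?", config))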


models = ["bigscience/bloom-1b7"]
dtypes = ["nf4", "fp4"]


@pytest.fixture(scope="session", params=product(models, dtypes))
def model_and_tokenizer(request):
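    """Yield (params, model, tokenizer) once per (model, quant_type) combination."""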
    model, tokenizer = get_model_and_tokenizer(request.param)
    yield request.param, model, tokenizer
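    # Teardown: drop the model reference so its GPU memory can be reclaimed.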
    del model


@pytest.mark.parametrize("DQ", TRUE_FALSE, ids=id_formatter("dq"))
@pytest.mark.parametrize("inference_kernel", TRUE_FALSE, ids=id_formatter("inference_kernel"))
@pytest.mark.parametrize("dtype", [torch.float16], ids=describe_dtype)
@pytest.mark.slow
def test_pi(requires_cuda, model_and_tokenizer, inference_kernel, DQ, dtype):
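    """Prompt with "3.14159" and require that most samples continue with pi's digits."""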
    fixture_config, model, tokenizer = model_and_tokenizer

    generation_config = transformers.GenerationConfig(
        max_new_tokens=20,
        do_sample=True,
        top_p=0.9,
        temperature=0.7,
    )

    # text = 'Please write down the first 50 digits of pi.'
    # text = get_prompt_for_generation_eval(text)
    # text += ' Sure, here are the first 50 digits of pi: 3.14159'
    n_cases = 6
    text = "3.14159"
    if hasattr(model.config, "quantization_config"):
        model.config.quantization_config.bnb_4bit_compute_dtype = dtype
        model.config.quantization_config.bnb_4bit_use_double_quant = DQ

    if not inference_kernel:
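        # Without the inference kernel, batch the prompt n_cases times for one call.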
        text = [text] * n_cases
    inputs = tokenizer(text, return_tensors="pt").to("cuda:0")
    x = inputs["input_ids"]
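    # With the inference kernel, sample one continuation at a time from a batch of 1;
    # otherwise decode all n_cases sequences from the single batched generate() call.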
    outputs = []
    if inference_kernel:
        for i in range(n_cases):
            output = model.generate(x, generation_config=generation_config)
            textout = tokenizer.decode(output[0], skip_special_tokens=True)
            outputs.append(textout)
    else:
        outputs = model.generate(x, generation_config=generation_config)
        outputs = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

    assert len(outputs) == n_cases
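    # A sample counts as a failure unless it starts with the digits of str(math.pi).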
    failure_count = 0
    for i in range(n_cases):
        if not outputs[i].startswith(str(math.pi)):
            failure_count += 1
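    # Sampling is stochastic: tolerate a few bad continuations (tighter bound for llama-7b).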
    failure_max = 2 if fixture_config[0] == "huggyllama/llama-7b" else 4
    if failure_count > failure_max:
        print(math.pi)
        for out in outputs:
            print(out)
        raise ValueError(f"Failure count: {failure_count}/{n_cases}")