# test_generation.py
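"""Smoke tests for 4-bit (NF4) generation through Hugging Face Transformers
with a bitsandbytes BitsAndBytesConfig.

The test prompts a quantized causal LM with "3.14159" and checks that the
continuation reproduces the leading digits of pi under fp16, bf16, and fp32
compute dtypes.
"""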
import math

import pytest
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    GenerationConfig,
)


def get_4bit_config():
    # Standard NF4 setup: 4-bit weights with double quantization and fp16 compute.
    return BitsAndBytesConfig(
        load_in_4bit=True,
        load_in_8bit=False,
        llm_int8_threshold=6.0,
        llm_int8_has_fp16_weight=False,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type='nf4',
    )


def get_model(model_name_or_path='huggyllama/llama-7b', bnb_config=None):
    # Build the default config lazily instead of evaluating it in the signature at import time.
    if bnb_config is None:
        bnb_config = get_4bit_config()
    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        quantization_config=bnb_config,
        max_memory={0: '48GB'},
        device_map='auto',
    ).eval()
    return model

def get_prompt_for_generation_eval(text, add_roles=True):
    description = (
        "A chat between a curious human and an artificial intelligence assistant. "
        "The assistant gives helpful, detailed, and polite answers to the user's questions."
    )
    if add_roles:
        prompt = f'{description} ### Human: {text} ### Assistant:'
    else:
        prompt = f'{description} {text}'
    return prompt

def generate(model, tokenizer, text, generation_config, prompt_func=get_prompt_for_generation_eval):
    text = prompt_func(text)
    inputs = tokenizer(text, return_tensors="pt").to('cuda:0')
    outputs = model.generate(inputs=inputs['input_ids'], generation_config=generation_config)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
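
# Minimal usage sketch (not executed by pytest), assuming a CUDA device with enough
# memory for the chosen checkpoint:
#
#   model = get_model('huggyllama/llama-7b')
#   tokenizer = AutoTokenizer.from_pretrained('huggyllama/llama-7b')
#   config = GenerationConfig(max_new_tokens=64, do_sample=True, top_p=0.9)
#   print(generate(model, tokenizer, 'What is the capital of France?', config))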

name_or_path = 'huggyllama/llama-7b'
#name_or_path = 'AI-Sweden/gpt-sw3-126m'

@pytest.fixture(scope='session')
def model():
    bnb_config = get_4bit_config()
    bnb_config.bnb_4bit_compute_dtype = torch.float32
    bnb_config.load_in_4bit = True
    # Pass the modified config explicitly; otherwise get_model() falls back to its default.
    model = get_model(name_or_path, bnb_config)
    print('')
    return model

@pytest.fixture(scope='session')
def tokenizer():
    tokenizer = AutoTokenizer.from_pretrained(name_or_path)
    return tokenizer

@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=['fp16', 'bf16', 'fp32'])
def test_pi(model, tokenizer, dtype):
    generation_config = GenerationConfig(
        max_new_tokens=50,
        do_sample=True,
        top_p=0.9,
        temperature=0.7,
    )

    #text = 'Please write down the first 50 digits of pi.'
    #text = get_prompt_for_generation_eval(text)
    #text += ' Sure, here the first 50 digits of pi: 3.14159'
    text = '3.14159'
    model.config.quantization_config.bnb_4bit_compute_dtype = dtype

    inputs = tokenizer(text, return_tensors="pt").to('cuda:0')
    outputs = model.generate(inputs=inputs['input_ids'], generation_config=generation_config)
    textout = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print('')
    print(textout)
    print(math.pi)

    assert textout[:len(str(math.pi))] == str(math.pi)