lora_hf_play.py
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL = "mistralai/Mistral-7B-Instruct-v0.3"
# ADAPTER = "winddude/wizardLM-LlaMA-LoRA-7B"
ADAPTER = "/home/ying/test_lora"
HF_TOKEN = "..."


prompt = """
### Instruction:
Write a poem about the transformers Python library.
Mention the word "large language models" in that poem.
### Response:
The Transformers are large language models,
They're used to make predictions on text.
"""


# The Auto classes resolve the correct architecture and tokenizer for the
# Mistral checkpoint (the Llama-specific classes only happen to mostly match).
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# device_map="auto" already places the weights on the available GPU(s); an
# extra .cuda() call is redundant and can break a sharded placement.
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    device_map="auto",
    # load_in_8bit=True,  # see the quantization sketch below
    torch_dtype=torch.float16,
    # token=HF_TOKEN,  # use_auth_token is deprecated in recent transformers
)
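
# Optional: to fit the model in less VRAM, recent transformers versions expect
# a quantization config rather than the bare load_in_8bit flag. A hedged
# sketch, assuming the bitsandbytes package is installed:
#
# from transformers import BitsAndBytesConfig
# base_model = AutoModelForCausalLM.from_pretrained(
#     MODEL,
#     device_map="auto",
#     quantization_config=BitsAndBytesConfig(load_in_8bit=True),
# )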


# base model generate
inputs = tokenizer(prompt, return_tensors="pt").to(base_model.device)
with torch.no_grad():
    output_tensors = base_model.generate(
        **inputs,  # includes the attention_mask, not just input_ids
        max_new_tokens=32,
        do_sample=False,
    )[0]

output = tokenizer.decode(output_tensors, skip_special_tokens=True)
print("======= base output ========")
print(output)


# peft model generate: wrap the same base model with the LoRA adapter
model = PeftModel.from_pretrained(
    base_model,
    ADAPTER,
    torch_dtype=torch.float16,
    is_trainable=False,  # inference only; adapter weights stay frozen
)
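
# Optional: for deployment, peft can fold the LoRA deltas into the base weights
# so inference runs without the adapter indirection. A minimal sketch (the
# output path is hypothetical):
#
# merged = model.merge_and_unload()
# merged.save_pretrained("/path/to/merged_model")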

with torch.no_grad():
    output_tensors = model.generate(
        **inputs,  # same prompt; the wrapped model sits on the same device
        max_new_tokens=32,
        do_sample=False,
    )[0]

output = tokenizer.decode(output_tensors, skip_special_tokens=True)
print("======= peft output ========")
print(output)
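
# Sanity check: PeftModel.disable_adapter() is a context manager that bypasses
# the LoRA weights, so generating inside it should reproduce the base output.
# A small sketch reusing the inputs above:
#
# with model.disable_adapter(), torch.no_grad():
#     check = model.generate(**inputs, max_new_tokens=32, do_sample=False)[0]
# print(tokenizer.decode(check, skip_special_tokens=True))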