"""Stream a chat completion from an AWQ-quantized Llama-3-8B-Instruct model.

Loads the quantized weights, builds a Llama-3 chat prompt, and streams the
model's answer token-by-token to stdout via TextStreamer.
"""
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer, TextStreamer

quant_path = "casperhansen/llama-3-8b-instruct-awq"

# Load model
# fuse_layers=True enables AutoAWQ's fused attention/MLP kernels for
# faster inference on the quantized weights.
model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=True)
tokenizer = AutoTokenizer.from_pretrained(quant_path, trust_remote_code=True)
# skip_prompt/skip_special_tokens: print only the newly generated text,
# not the echoed prompt or control tokens.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

prompt = "You're standing on the surface of the Earth. "\
        "You walk one mile south, one mile west and one mile north. "\
        "You end up exactly where you started. Where are you?"

chat = [
    {"role": "system", "content": "You are a concise assistant that helps answer questions."},
    {"role": "user", "content": prompt},
]

# Llama-3 ends an assistant turn with <|eot_id|> rather than the generic
# EOS token, so stop on either to keep the model from starting a new turn.
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

tokens = tokenizer.apply_chat_template(
    chat,
    # FIX: without add_generation_prompt=True the Llama-3 template omits
    # the trailing assistant header, so the model continues the user turn
    # instead of answering as the assistant.
    add_generation_prompt=True,
    return_tensors="pt"
).cuda()

# Generate output (streamed to stdout by `streamer` as tokens arrive).
generation_output = model.generate(
    tokens,
    streamer=streamer,
    max_new_tokens=64,
    eos_token_id=terminators
)