generate.py 874 Bytes
Newer Older
Casper Hansen's avatar
Casper Hansen committed
1
2
3
"""Minimal AWQ generation example.

Loads a pre-quantized Mistral-7B-Instruct model from the Hugging Face Hub,
wraps a single riddle prompt in the Mistral instruction template, and streams
the generated completion to stdout.
"""

from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer, TextStreamer

quant_path = "TheBloke/Mistral-7B-Instruct-v0.2-AWQ"

# Load the quantized model with its fused-layer kernels, plus the matching
# tokenizer from the same Hub repository.
model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=True)
tokenizer = AutoTokenizer.from_pretrained(quant_path, trust_remote_code=True)

# Prints decoded tokens as they are produced, hiding the echoed prompt and
# any special tokens.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# Mistral-Instruct chat format: the user turn goes between [INST] ... [/INST].
prompt_template = "[INST] {prompt} [/INST]"

prompt = (
    "You're standing on the surface of the Earth. "
    "You walk one mile south, one mile west and one mile north. "
    "You end up exactly where you started. Where are you?"
)

# Tokenize the templated prompt and move the input ids onto the GPU
# (this example assumes a CUDA device is available).
tokens = tokenizer(
    prompt_template.format(prompt=prompt),
    return_tensors='pt'
).input_ids.cuda()

# Generate up to 512 new tokens, streaming them to stdout as they arrive.
generation_output = model.generate(
    tokens,
    streamer=streamer,
    max_new_tokens=512
)