# test.py -- v1.0 (committed by chenzk)
from transformers import pipeline, AutoTokenizer
from kvpress import KnormPress

# Smoke-test script: load Qwen3-8B, build the "kv-press-text-generation"
# pipeline, and answer a single question about a short context while a
# KnormPress is applied.

# Model id shared by the tokenizer and the pipeline.
model = "Qwen/Qwen3-8B"
tokenizer = AutoTokenizer.from_pretrained(model)

# Eager attention is used here; "flash_attention_2" is the other value this
# script has been run with for "attn_implementation".
model_kwargs = {"attn_implementation": "eager"}

# Pass the tokenizer loaded above to the pipeline so it is actually used,
# rather than being loaded once here and resolved a second time internally.
# NOTE(review): the "kv-press-text-generation" task is presumably registered
# as a side effect of importing kvpress -- confirm against the kvpress docs.
pipe = pipeline(
    "kv-press-text-generation",
    model=model,
    tokenizer=tokenizer,
    model_kwargs=model_kwargs,
)

context = "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."
# The question is in Chinese: "How large is the area of the United States?"
question = "美国面积多大?"

# NOTE(review): compression_ratio=0.5 presumably prunes about half of the
# context's KV cache -- confirm the exact semantics in the kvpress docs.
press = KnormPress(compression_ratio=0.5)

# The pipeline returns a mapping; only its "answer" entry is printed.
answer = pipe(context, question=question, press=press, max_new_tokens=64)["answer"]
print("answer: ", answer)