"""Demo: KV-cache compression with kvpress's TOVAPress on a Qwen3 model.

Loads Qwen/Qwen3-8B through the kvpress ``kv-press-text-generation``
pipeline, compresses the context KV cache by 50% with TOVA, and answers
one question about the (tiny) system-prompt context.
"""

from transformers import AutoTokenizer, pipeline

from kvpress import TOVAPress


def main() -> None:
    """Run a single compressed-cache generation and print the answer."""
    model = "Qwen/Qwen3-8B"

    # Tokenizer is loaded explicitly; the pipeline loads its own copy of
    # the model weights via `model_kwargs`.
    tokenizer = AutoTokenizer.from_pretrained(model)

    # NOTE(review): requires flash-attn to be installed; switch to
    # {"attn_implementation": "eager"} on hardware without FA2 support.
    model_kwargs = {"attn_implementation": "flash_attention_2"}

    pipe = pipeline(
        "kv-press-text-generation",
        model=model,
        model_kwargs=model_kwargs,
    )

    context = "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."
    question = "美国面积多大?"

    # TOVA evicts the least-attended KV entries; 0.5 keeps half the cache.
    press = TOVAPress(compression_ratio=0.5)

    answer = pipe(context, question=question, press=press, max_new_tokens=512)["answer"]
    print("answer: ", answer)


if __name__ == "__main__":
    main()