# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

"""
This script demonstrates how to extend the context length
of a Qwen model using the YaRN method (rope_parameters)
and run a simple chat example.

Usage:
    python examples/offline_inference/context_extension.py
"""
11

12
from vllm import LLM, RequestOutput, SamplingParams
13

14
15
16
17
18
19
20
21

def create_llm() -> LLM:
    """Build a Qwen3-0.6B ``LLM`` with YaRN rope overrides for a longer context.

    The RoPE settings are injected through ``hf_overrides``: the rope type is
    set to ``"yarn"`` with a scaling factor of 4.0 over the original 32768
    positions, and ``max_model_len`` is set to their product (131072).

    Returns:
        The ``LLM`` instance constructed with those overrides.
    """
    rope_theta = 1000000
    original_max_position_embeddings = 32768
    factor = 4.0

    # Use yarn to extend context
    hf_overrides = {
        "rope_parameters": {
            "rope_theta": rope_theta,
            "rope_type": "yarn",
            "factor": factor,
            "original_max_position_embeddings": original_max_position_embeddings,
        },
        # Target length = original window scaled by the yarn factor.
        "max_model_len": int(original_max_position_embeddings * factor),
    }

    return LLM(model="Qwen/Qwen3-0.6B", hf_overrides=hf_overrides)


def run_llm_chat(llm: LLM) -> tuple[list[RequestOutput], list]:
    """Run one fixed chat conversation through ``llm`` and return the results.

    Args:
        llm: The model to query; its ``chat`` method is called once with the
            progress bar disabled.

    Returns:
        A ``(outputs, conversations)`` pair, where ``outputs`` is whatever
        ``llm.chat`` returned and ``conversations`` is a one-element list
        holding the conversation that was sent.
    """
    sampling_params = SamplingParams(
        temperature=0.8,
        top_p=0.95,
        max_tokens=128,
    )

    conversation = [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "Hello"},
        {"role": "assistant", "content": "Hello! How can I assist you today?"},
    ]
    outputs = llm.chat(conversation, sampling_params, use_tqdm=False)

    # Wrap the single conversation in a list so it can be indexed in step
    # with ``outputs`` by the caller.
    return outputs, [conversation]
51
52


53
def print_outputs(outputs: list[RequestOutput], conversations: list) -> None:
    """Print each conversation next to the text generated for it.

    Args:
        outputs: Generation results; the text of the first completion of each
            entry (``output.outputs[0].text``) is printed.
        conversations: Prompts indexed in step with ``outputs``; entry ``i``
            is printed as the prompt for ``outputs[i]``.

    Raises:
        IndexError: If ``conversations`` has fewer entries than ``outputs``.
    """
    separator = "-" * 80
    print("\nGenerated Outputs:\n" + separator)
    for i, output in enumerate(outputs):
        prompt = conversations[i]
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}\n")
        print(f"Generated text: {generated_text!r}")
        print(separator)


63
64
def main() -> None:
    """Create the model, run the sample chat, and print the results."""
    llm = create_llm()
    outputs, conversations = run_llm_chat(llm)
    print_outputs(outputs, conversations)
67
68
69
70


# Run the example only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()