# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to run offline inference with a speculative
decoding model on neuron.
"""

import os

from vllm import LLM, SamplingParams

# Input prompts fed to the model; process_requests() reads this module-level
# list when it calls llm.generate().
prompts: list[str] = [
    "Hello, I am a language model and I can help",
    "The president of the United States is",
    "The capital of France is",
]


def config_buckets() -> None:
    """Configure context length and token gen buckets.

    Sets two process environment variables, overwriting any existing values:

    * ``NEURON_CONTEXT_LENGTH_BUCKETS``
    * ``NEURON_TOKEN_GEN_BUCKETS``

    Both are set to the comma-separated bucket list ``128,512,1024,2048``.
    ``main()`` calls this before the LLM is constructed.
    NOTE(review): presumably the variables must be set before engine
    creation to take effect; confirm against the Neuron backend docs.
    """
    # Creates XLA HLO graphs for all the context length buckets.
    os.environ["NEURON_CONTEXT_LENGTH_BUCKETS"] = "128,512,1024,2048"
    # Creates XLA HLO graphs for all the token gen buckets.
    os.environ["NEURON_TOKEN_GEN_BUCKETS"] = "128,512,1024,2048"


def initialize_llm():
    """Create an LLM with speculative decoding.

    Returns:
        An ``LLM`` built for ``openlm-research/open_llama_7b`` with a
        ``speculative_config`` that names ``openlm-research/open_llama_3b``
        as the speculative model and requests 4 speculative tokens.
    """
    return LLM(
        model="openlm-research/open_llama_7b",
        speculative_config={
            "model": "openlm-research/open_llama_3b",
            "num_speculative_tokens": 4,
            # Same value as the max_model_len passed to LLM below.
            "max_model_len": 2048,
        },
        max_num_seqs=4,
        max_model_len=2048,
        # block_size is set equal to max_model_len here.
        # NOTE(review): presumably so one block holds a whole sequence
        # on Neuron; confirm against the backend requirements.
        block_size=2048,
        use_v2_block_manager=True,
        device="neuron",
        tensor_parallel_size=32,
    )


def process_requests(llm: LLM, sampling_params: SamplingParams) -> None:
    """Generate texts from prompts and print them.

    Args:
        llm: Engine used to run generation.
        sampling_params: Sampling settings passed to ``llm.generate``.

    Calls ``llm.generate`` once with the module-level ``prompts`` list, then
    prints one line per returned output with the prompt and the text of its
    first completion.
    """
    outputs = llm.generate(prompts, sampling_params)
    for output in outputs:
        prompt = output.prompt
        # Only the first completion of each request is printed.
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")


def main() -> None:
    """Main function that sets up the llm and processes prompts.

    Steps, in order: configure the Neuron bucket environment variables,
    build the LLM, create the sampling parameters, then generate and print
    completions for the sample prompts.
    """
    # Bucket environment variables are set before the LLM is constructed.
    config_buckets()
    llm = initialize_llm()

    # Create a sampling params object: at most 100 generated tokens per
    # prompt, with top_k=1.
    sampling_params = SamplingParams(max_tokens=100, top_k=1)
    process_requests(llm, sampling_params)


# Run the example only when executed as a script, not when imported.
if __name__ == "__main__":
    main()