# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
4
This example shows how to run offline inference with a speculative
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
decoding model on neuron.
"""

import os

from vllm import LLM, SamplingParams

# Example prompts; process_requests() passes this list to llm.generate().
prompts: list[str] = [
    "Hello, I am a language model and I can help",
    "The president of the United States is",
    "The capital of France is",
]


def config_buckets() -> None:
    """Configure context length and token gen buckets.

    Sets the ``NEURON_CONTEXT_LENGTH_BUCKETS`` and
    ``NEURON_TOKEN_GEN_BUCKETS`` environment variables for the current
    process, overwriting any values that are already present.
    """
    # creates XLA hlo graphs for all the context length buckets.
    os.environ["NEURON_CONTEXT_LENGTH_BUCKETS"] = "128,512,1024,2048"
    # creates XLA hlo graphs for all the token gen buckets.
    os.environ["NEURON_TOKEN_GEN_BUCKETS"] = "128,512,1024,2048"


def initialize_llm() -> LLM:
    """Create an LLM with speculative decoding.

    Returns:
        An ``LLM`` for the ``neuron`` device that loads
        ``openlm-research/open_llama_7b`` and passes
        ``openlm-research/open_llama_3b`` as the model in
        ``speculative_config``.
    """
    return LLM(
        model="openlm-research/open_llama_7b",
        speculative_config={
            "model": "openlm-research/open_llama_3b",
            "num_speculative_tokens": 4,
            "max_model_len": 2048,
        },
        max_num_seqs=4,
        # max_model_len and block_size are set to the same value (2048),
        # which is also the largest bucket configured in config_buckets().
        max_model_len=2048,
        block_size=2048,
        device="neuron",
        tensor_parallel_size=32,
    )


def process_requests(llm: LLM, sampling_params: SamplingParams) -> None:
    """Generate texts from prompts and print them.

    Args:
        llm: Engine whose ``generate`` method is called with the
            module-level ``prompts`` list.
        sampling_params: Sampling settings forwarded to ``llm.generate``.
    """
    outputs = llm.generate(prompts, sampling_params)
    for output in outputs:
        prompt = output.prompt
        # Only the first entry of each request's outputs is printed.
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")


def main() -> None:
    """Main function that sets up the llm and processes prompts."""
    # The bucket environment variables are set before the LLM is created.
    config_buckets()
    llm = initialize_llm()
    # Create a sampling params object.
    sampling_params = SamplingParams(max_tokens=100, top_k=1)
    process_requests(llm, sampling_params)


# Run the example only when executed as a script, not when imported.
if __name__ == "__main__":
    main()