# This example shows how to use vLLM to run the Ministral 8B model.
import argparse

from vllm import LLM
from vllm.sampling_params import SamplingParams


parser = argparse.ArgumentParser()
parser.add_argument("--model_name", type=str, default="mistralai/Ministral-8B-Instruct-2410")
parser.add_argument("--prompt", type=str, default="Do we need to think for 10 seconds to find the answer of 1 + 1?")
args = parser.parse_args()

sampling_params = SamplingParams(max_tokens=8192)
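# SamplingParams also exposes common decoding controls such as temperature and
# top_p; a hedged sketch with illustrative values (not part of the original script):
# sampling_params = SamplingParams(max_tokens=8192, temperature=0.7, top_p=0.95)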

# Note that running Ministral 8B on a single GPU requires 24 GB of GPU RAM.
# To split the memory requirement across multiple devices, pass e.g. `tensor_parallel_size=2`
# (see the commented-out sketch below the `LLM(...)` call).
llm = LLM(model=args.model_name, tokenizer_mode="mistral", config_format="mistral", load_format="mistral")
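# A minimal sketch of splitting the model across two GPUs (assumes two visible CUDA devices):
# llm = LLM(model=args.model_name, tokenizer_mode="mistral", config_format="mistral",
#           load_format="mistral", tensor_parallel_size=2)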

messages = [
    {
        "role": "user",
        "content": args.prompt
    },
]
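# The chat format also accepts a leading system message; an illustrative variant
# (the system prompt text below is an assumption, not from the original script):
# messages = [
#     {"role": "system", "content": "You are a concise assistant."},
#     {"role": "user", "content": args.prompt},
# ]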

outputs = llm.chat(messages, sampling_params=sampling_params)

print("output:", outputs[0].outputs[0].text)