# This example shows how to use vLLM to run the Ministral 8B model.
import argparse

from vllm import LLM
from vllm.sampling_params import SamplingParams

parser = argparse.ArgumentParser()
parser.add_argument(
    "--model_name", type=str, default="mistralai/Ministral-8B-Instruct-2410"
)
parser.add_argument(
    "--prompt",
    type=str,
    default="Do we need to think for 10 seconds to find the answer to 1 + 1?",
)
args = parser.parse_args()

sampling_params = SamplingParams(max_tokens=8192)

# Note that running Ministral 8B on a single GPU requires 24 GB of GPU RAM.
# If you want to divide the GPU requirement over multiple devices, add
# e.g. `tensor_parallel_size=2` to the `LLM(...)` call.
llm = LLM(
    model=args.model_name,
    tokenizer_mode="mistral",
    config_format="mistral",
    load_format="mistral",
)

messages = [
    {
        "role": "user",
        "content": args.prompt,
    },
]

outputs = llm.chat(messages, sampling_params=sampling_params)

print("output:", outputs[0].outputs[0].text)
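
# A minimal sketch of the multi-GPU variant mentioned in the comment above,
# assuming two GPUs are available (`tensor_parallel_size` is vLLM's argument
# for sharding the model across devices):
#
#   llm = LLM(
#       model=args.model_name,
#       tokenizer_mode="mistral",
#       config_format="mistral",
#       load_format="mistral",
#       tensor_parallel_size=2,
#   )
#
# Example invocation (the script filename here is only an assumption):
#   python ministral_8b_vllm.py --prompt "Summarize the plot of Hamlet in one sentence."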