import argparse

from vllm import LLM
from vllm.sampling_params import SamplingParams

# This script demonstrates how to use vLLM to run offline chat inference with a Mistral model.

parser = argparse.ArgumentParser()
parser.add_argument(
    "--user_prompt",
    type=str,
    default="Give me 5 non-formal ways to say 'See you later' in French.",
)
parser.add_argument(
    "--model_name_or_path",
    type=str,
    default="mistralai/Mistral-Small-3.1-24B-Instruct-2503",
)
args = parser.parse_args()

# Define the system prompt for the conversational agent.
SYSTEM_PROMPT = (
    "You are a conversational agent that always answers straight to the point, "
    "always end your accurate response with an ASCII drawing of a cat."
)

# Build the conversation in the chat message format expected by llm.chat().
messages = [
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": args.user_prompt},
]

# Note that running this model on GPU requires over 60 GB of GPU RAM.
llm = LLM(model=args.model_name_or_path, tokenizer_mode="mistral")

# Keep the temperature low for concise, mostly deterministic answers.
sampling_params = SamplingParams(max_tokens=512, temperature=0.15)

outputs = llm.chat(messages, sampling_params=sampling_params)

# Print the generated text of the first (and only) completion.
print(outputs[0].outputs[0].text)
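
# Example invocation, as a rough sketch. The filename and the custom prompt below are
# illustrative assumptions, not part of the original script; any string passed via
# --user_prompt will be sent as the user message:
#
#   python mistral_chat.py \
#       --model_name_or_path mistralai/Mistral-Small-3.1-24B-Instruct-2503 \
#       --user_prompt "Give me 5 non-formal ways to say 'Good morning' in French."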