# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import argparse

from vllm import LLM, SamplingParams

# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
    "Hello, my name is",
]

# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=16)


def main(model_path, tensor_parallel_size, gpu_memory_utilization, dtype):
    # Create an LLM.
    llm = LLM(
        model=model_path,
        tensor_parallel_size=tensor_parallel_size,
        dtype=dtype,
        trust_remote_code=True,
        enforce_eager=True,
        block_size=64,
        enable_prefix_caching=False,
        gpu_memory_utilization=gpu_memory_utilization,
    )

    # Generate texts from the prompts.
    # The output is a list of RequestOutput objects
    # that contain the prompt, generated text, and other information.
    outputs = llm.generate(prompts, sampling_params)

    # Print the outputs.
    print("\nGenerated Outputs:\n" + "-" * 60)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}")
        print(f"Output: {generated_text!r}")
        print("-" * 60)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="vLLM Offline Inference Example")
    parser.add_argument("--model_path", type=str,
                        default="/mnt/data/llm-models/qwen3/Qwen3-8B",
                        help="Path to the model")
    # "--tensor_parallel_size" is listed first so argparse derives the
    # attribute name (dest) from it; "--tp" is a short alias. With "--tp"
    # first, args.tensor_parallel_size would not exist.
    parser.add_argument("--tensor_parallel_size", "--tp", type=int, default=1,
                        help="Tensor parallel size")
    parser.add_argument("--gpu_memory_utilization", type=float, default=0.98,
                        help="GPU memory utilization (0.0-1.0)")
    # "int8" is not a valid vLLM dtype (weight quantization is configured
    # separately), so the choices are restricted to supported dtypes.
    parser.add_argument("--dtype", type=str, default="float16",
                        choices=["float16", "bfloat16", "float32", "auto"],
                        help="Data type for model weights")
    args = parser.parse_args()
    main(args.model_path, args.tensor_parallel_size,
         args.gpu_memory_utilization, args.dtype)
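
# Example invocation (a sketch, not from the source; the script filename
# "offline_inference.py" and the TP/dtype values are assumed for illustration):
#
#   python offline_inference.py \
#       --model_path /mnt/data/llm-models/qwen3/Qwen3-8B \
#       --tensor_parallel_size 2 \
#       --gpu_memory_utilization 0.9 \
#       --dtype bfloat16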