Merge similar examples in `offline_inference` into single `basic` example (#12737)

992e5c3d · Harry Mellor · GitHub · b69692a2
Unverified commit 992e5c3d, authored Feb 20, 2025 by Harry Mellor; committed by GitHub on Feb 20, 2025.
9 changed files
--- a/examples/offline_inference/basic_with_model_default_sampling.py
+++ b/examples/offline_inference/basic_with_model_default_sampling.py
-# SPDX-License-Identifier: Apache-2.0
-
-from vllm import LLM
-
-# Sample prompts.
-prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-]
-
-# Create an LLM with built-in default generation config.
-# The generation config is set to None by default to keep
-# the behavior consistent with the previous version.
-# If you want to use the default generation config from the model,
-# you should set the generation_config to "auto".
-llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct", generation_config="auto")
-
-# Load the default sampling parameters from the model.
-sampling_params = llm.get_default_sampling_params()
-# Modify the sampling parameters if needed.
-sampling_params.temperature = 0.5
-
-# Generate texts from the prompts. The output is a list of RequestOutput objects
-# that contain the prompt, generated text, and other information.
-outputs = llm.generate(prompts, sampling_params)
-# Print the outputs.
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
--- a/examples/offline_inference/chat.py
+++ b/examples/offline_inference/chat.py
-# SPDX-License-Identifier: Apache-2.0
-
-from vllm import LLM, SamplingParams
-
-llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
-sampling_params = SamplingParams(temperature=0.5)
-
-
-def print_outputs(outputs):
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-    print("-" * 80)
-
-
-print("=" * 80)
-
-# In this script, we demonstrate how to pass input to the chat method:
-
-conversation = [
-    {
-        "role": "system",
-        "content": "You are a helpful assistant"
-    },
-    {
-        "role": "user",
-        "content": "Hello"
-    },
-    {
-        "role": "assistant",
-        "content": "Hello! How can I assist you today?"
-    },
-    {
-        "role": "user",
-        "content": "Write an essay about the importance of higher education.",
-    },
-]
-outputs = llm.chat(conversation,
-                   sampling_params=sampling_params,
-                   use_tqdm=False)
-print_outputs(outputs)
-
-# You can run batch inference with llm.chat API
-conversation = [
-    {
-        "role": "system",
-        "content": "You are a helpful assistant"
-    },
-    {
-        "role": "user",
-        "content": "Hello"
-    },
-    {
-        "role": "assistant",
-        "content": "Hello! How can I assist you today?"
-    },
-    {
-        "role": "user",
-        "content": "Write an essay about the importance of higher education.",
-    },
-]
-conversations = [conversation for _ in range(10)]
-
-# We turn on tqdm progress bar to verify it's indeed running batch inference
-outputs = llm.chat(messages=conversations,
-                   sampling_params=sampling_params,
-                   use_tqdm=True)
-print_outputs(outputs)
-
-# A chat template can be optionally supplied.
-# If not, the model will use its default chat template.
-
-# with open('template_falcon_180b.jinja', "r") as f:
-#     chat_template = f.read()
-
-# outputs = llm.chat(
-#     conversations,
-#     sampling_params=sampling_params,
-#     use_tqdm=False,
-#     chat_template=chat_template,
-# )
--- a/examples/offline_inference/classification.py
+++ b/examples/offline_inference/classification.py
-# SPDX-License-Identifier: Apache-2.0
-
-from vllm import LLM
-
-# Sample prompts.
-prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-]
-
-# Create an LLM.
-# You should pass task="classify" for classification models
-model = LLM(
-    model="jason9693/Qwen2.5-1.5B-apeach",
-    task="classify",
-    enforce_eager=True,
-)
-
-# Generate logits. The output is a list of ClassificationRequestOutputs.
-outputs = model.classify(prompts)
-
-# Print the outputs.
-for prompt, output in zip(prompts, outputs):
-    probs = output.outputs.probs
-    probs_trimmed = ((str(probs[:16])[:-1] +
-                      ", ...]") if len(probs) > 16 else probs)
-    print(f"Prompt: {prompt!r} | "
-          f"Class Probabilities: {probs_trimmed} (size={len(probs)})")
--- a/examples/offline_inference/cli.py
+++ b/examples/offline_inference/cli.py
-# SPDX-License-Identifier: Apache-2.0
-
-from dataclasses import asdict
-
-from vllm import LLM, SamplingParams
-from vllm.engine.arg_utils import EngineArgs
-from vllm.utils import FlexibleArgumentParser
-
-
-def get_prompts(num_prompts: int):
-    # The default sample prompts.
-    prompts = [
-        "Hello, my name is",
-        "The president of the United States is",
-        "The capital of France is",
-        "The future of AI is",
-    ]
-
-    if num_prompts != len(prompts):
-        prompts = (prompts * ((num_prompts // len(prompts)) + 1))[:num_prompts]
-
-    return prompts
-
-
-def main(args):
-    # Create prompts
-    prompts = get_prompts(args.num_prompts)
-
-    # Create a sampling params object.
-    sampling_params = SamplingParams(n=args.n,
-                                     temperature=args.temperature,
-                                     top_p=args.top_p,
-                                     top_k=args.top_k,
-                                     max_tokens=args.max_tokens)
-
-    # Create an LLM.
-    # The default model is 'facebook/opt-125m'
-    engine_args = EngineArgs.from_cli_args(args)
-    llm = LLM(**asdict(engine_args))
-
-    # Generate texts from the prompts.
-    # The output is a list of RequestOutput objects
-    # that contain the prompt, generated text, and other information.
-    outputs = llm.generate(prompts, sampling_params)
-    # Print the outputs.
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-
-if __name__ == '__main__':
-    parser = FlexibleArgumentParser()
-    parser = EngineArgs.add_cli_args(parser)
-    group = parser.add_argument_group("SamplingParams options")
-    group.add_argument("--num-prompts",
-                       type=int,
-                       default=4,
-                       help="Number of prompts used for inference")
-    group.add_argument("--max-tokens",
-                       type=int,
-                       default=16,
-                       help="Generated output length for sampling")
-    group.add_argument('--n',
-                       type=int,
-                       default=1,
-                       help='Number of generated sequences per prompt')
-    group.add_argument('--temperature',
-                       type=float,
-                       default=0.8,
-                       help='Temperature for text generation')
-    group.add_argument('--top-p',
-                       type=float,
-                       default=0.95,
-                       help='top_p for text generation')
-    group.add_argument('--top-k',
-                       type=int,
-                       default=-1,
-                       help='top_k for text generation')
-
-    args = parser.parse_args()
-    main(args)
--- a/examples/offline_inference/cpu_offload.py
+++ b/examples/offline_inference/cpu_offload.py
-# SPDX-License-Identifier: Apache-2.0
-
-from vllm import LLM, SamplingParams
-
-# Sample prompts.
-prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-]
-# Create a sampling params object.
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-
-# Create an LLM.
-llm = LLM(model="meta-llama/Llama-2-13b-chat-hf", cpu_offload_gb=10)
-# Generate texts from the prompts. The output is a list of RequestOutput objects
-# that contain the prompt, generated text, and other information.
-outputs = llm.generate(prompts, sampling_params)
-# Print the outputs.
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
--- a/examples/offline_inference/embedding.py
+++ b/examples/offline_inference/embedding.py
-# SPDX-License-Identifier: Apache-2.0
-
-from vllm import LLM
-
-# Sample prompts.
-prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-]
-
-# Create an LLM.
-# You should pass task="embed" for embedding models
-model = LLM(
-    model="intfloat/e5-mistral-7b-instruct",
-    task="embed",
-    enforce_eager=True,
-)
-
-# Generate embedding. The output is a list of EmbeddingRequestOutputs.
-outputs = model.embed(prompts)
-
-# Print the outputs.
-for prompt, output in zip(prompts, outputs):
-    embeds = output.outputs.embedding
-    embeds_trimmed = ((str(embeds[:16])[:-1] +
-                       ", ...]") if len(embeds) > 16 else embeds)
-    print(f"Prompt: {prompt!r} | "
-          f"Embeddings: {embeds_trimmed} (size={len(embeds)})")
--- a/examples/offline_inference/gguf_inference.py
+++ b/examples/offline_inference/gguf_inference.py
-# SPDX-License-Identifier: Apache-2.0
-
-from huggingface_hub import hf_hub_download
-
-from vllm import LLM, SamplingParams
-
-
-def run_gguf_inference(model_path, tokenizer):
-    # Sample prompts.
-    prompts = [
-        "How many helicopters can a human eat in one sitting?",
-        "What's the future of AI?",
-    ]
-    prompts = [[{"role": "user", "content": prompt}] for prompt in prompts]
-    # Create a sampling params object.
-    sampling_params = SamplingParams(temperature=0, max_tokens=128)
-
-    # Create an LLM.
-    llm = LLM(model=model_path, tokenizer=tokenizer)
-
-    outputs = llm.chat(prompts, sampling_params)
-    # Print the outputs.
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-
-if __name__ == "__main__":
-    repo_id = "bartowski/Phi-3-medium-4k-instruct-GGUF"
-    filename = "Phi-3-medium-4k-instruct-IQ2_M.gguf"
-    tokenizer = "microsoft/Phi-3-medium-4k-instruct"
-    model = hf_hub_download(repo_id, filename=filename)
-    run_gguf_inference(model, tokenizer)
--- a/examples/offline_inference/scoring.py
+++ b/examples/offline_inference/scoring.py
-# SPDX-License-Identifier: Apache-2.0
-
-from vllm import LLM
-
-# Sample prompts.
-text_1 = "What is the capital of France?"
-texts_2 = [
-    "The capital of Brazil is Brasilia.", "The capital of France is Paris."
-]
-
-# Create an LLM.
-# You should pass task="score" for cross-encoder models
-model = LLM(
-    model="BAAI/bge-reranker-v2-m3",
-    task="score",
-    enforce_eager=True,
-)
-
-# Generate scores. The output is a list of ScoringRequestOutputs.
-outputs = model.score(text_1, texts_2)
-
-# Print the outputs.
-for text_2, output in zip(texts_2, outputs):
-    score = output.outputs.score
-    print(f"Pair: {[text_1, text_2]!r} | Score: {score}")
--- a/tests/plugins_tests/test_platform_plugins.py
+++ b/tests/plugins_tests/test_platform_plugins.py
@@ -14,7 +14,7 @@ def test_platform_plugins():
    import os
    example_file = os.path.join(
        os.path.dirname(os.path.dirname(os.path.dirname(current_file))),
-        "examples", "offline_inference/basic.py")
+        "examples", "offline_inference/basic/basic.py")
    runpy.run_path(example_file)

    # check if the plugin is loaded correctly