Unverified Commit dcf8862f authored by wang.yuqi's avatar wang.yuqi Committed by GitHub
Browse files

[Examples][1/n] Resettle basic examples. (#35579)


Signed-off-by: default avatarwang.yuqi <yuqi.wang@daocloud.io>
Signed-off-by: default avatarwang.yuqi <noooop@126.com>
Co-authored-by: default avatargemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: default avatarHarry Mellor <19981378+hmellor@users.noreply.github.com>
parent 43aa3892
...@@ -34,7 +34,7 @@ function cpu_tests() { ...@@ -34,7 +34,7 @@ function cpu_tests() {
# offline inference # offline inference
docker exec cpu-test bash -c " docker exec cpu-test bash -c "
set -e set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m"
# Run model tests # Run model tests
docker exec cpu-test bash -c " docker exec cpu-test bash -c "
......
...@@ -27,7 +27,7 @@ function cpu_tests() { ...@@ -27,7 +27,7 @@ function cpu_tests() {
podman exec -it "$container_id" bash -c " podman exec -it "$container_id" bash -c "
export TORCH_COMPILE_DISABLE=1 export TORCH_COMPILE_DISABLE=1
set -xve set -xve
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log
# Run basic model test # Run basic model test
podman exec -it "$container_id" bash -c " podman exec -it "$container_id" bash -c "
......
...@@ -25,5 +25,5 @@ remove_docker_container ...@@ -25,5 +25,5 @@ remove_docker_container
# Run the image and test offline inference # Run the image and test offline inference
docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c ' docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B python3 examples/basic/offline_inference/generate.py --model meta-llama/Llama-3.2-1B
' '
...@@ -76,7 +76,7 @@ docker run --rm --runtime=habana --name="${container_name}" --network=host \ ...@@ -76,7 +76,7 @@ docker run --rm --runtime=habana --name="${container_name}" --network=host \
-e PT_HPU_LAZY_MODE=1 \ -e PT_HPU_LAZY_MODE=1 \
"${image_name}" \ "${image_name}" \
/bin/bash -c ' /bin/bash -c '
cd vllm; timeout 120s python -u examples/offline_inference/basic/generate.py --model facebook/opt-125m cd vllm; timeout 120s python -u examples/basic/offline_inference/generate.py --model facebook/opt-125m
' '
EXITCODE=$? EXITCODE=$?
......
...@@ -34,15 +34,15 @@ docker run \ ...@@ -34,15 +34,15 @@ docker run \
set -e set -e
echo $ZE_AFFINITY_MASK echo $ZE_AFFINITY_MASK
pip install tblib==3.1.0 pip install tblib==3.1.0
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8 python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
python3 examples/offline_inference/basic/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
cd tests cd tests
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py
pytest -v -s v1/engine pytest -v -s v1/engine
......
...@@ -529,12 +529,12 @@ steps: ...@@ -529,12 +529,12 @@ steps:
commands: commands:
- pip install tensorizer # for tensorizer test - pip install tensorizer # for tensorizer test
# for basic # for basic
- python3 offline_inference/basic/chat.py - python3 basic/offline_inference/chat.py
- python3 offline_inference/basic/generate.py --model facebook/opt-125m - python3 basic/offline_inference/generate.py --model facebook/opt-125m
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 - python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- python3 offline_inference/basic/classify.py - python3 basic/offline_inference/classify.py
- python3 offline_inference/basic/embed.py - python3 basic/offline_inference/embed.py
- python3 offline_inference/basic/score.py - python3 basic/offline_inference/score.py
# for multi-modal models # for multi-modal models
- python3 offline_inference/audio_language.py --seed 0 - python3 offline_inference/audio_language.py --seed 0
- python3 offline_inference/vision_language.py --seed 0 - python3 offline_inference/vision_language.py --seed 0
...@@ -1169,7 +1169,7 @@ steps: ...@@ -1169,7 +1169,7 @@ steps:
- pytest -v -s tests/models/test_transformers.py - pytest -v -s tests/models/test_transformers.py
# - pytest -v -s tests/models/multimodal/processing/ # - pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)' - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
- python3 examples/offline_inference/basic/chat.py - python3 examples/basic/offline_inference/chat.py
# - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
# Whisper needs spawn method to avoid deadlock # Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
...@@ -2208,12 +2208,12 @@ steps: ...@@ -2208,12 +2208,12 @@ steps:
commands: commands:
- pip install tensorizer # for tensorizer test - pip install tensorizer # for tensorizer test
# for basic # for basic
- python3 offline_inference/basic/chat.py - python3 basic/offline_inference/chat.py
- python3 offline_inference/basic/generate.py --model facebook/opt-125m - python3 basic/offline_inference/generate.py --model facebook/opt-125m
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 - python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- python3 offline_inference/basic/classify.py - python3 basic/offline_inference/classify.py
- python3 offline_inference/basic/embed.py - python3 basic/offline_inference/embed.py
- python3 offline_inference/basic/score.py - python3 basic/offline_inference/score.py
# for multi-modal models # for multi-modal models
- python3 offline_inference/audio_language.py --seed 0 - python3 offline_inference/audio_language.py --seed 0
- python3 offline_inference/vision_language.py --seed 0 - python3 offline_inference/vision_language.py --seed 0
...@@ -2789,7 +2789,7 @@ steps: ...@@ -2789,7 +2789,7 @@ steps:
- pytest -v -s tests/models/test_transformers.py - pytest -v -s tests/models/test_transformers.py
# - pytest -v -s tests/models/multimodal/processing/ # - pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)' - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
- python3 examples/offline_inference/basic/chat.py - python3 examples/basic/offline_inference/chat.py
# - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
# Whisper needs spawn method to avoid deadlock # Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
...@@ -2816,7 +2816,7 @@ steps: ...@@ -2816,7 +2816,7 @@ steps:
- vllm/platforms/cuda.py - vllm/platforms/cuda.py
commands: commands:
- rocm-smi - rocm-smi
- python3 examples/offline_inference/basic/chat.py - python3 examples/basic/offline_inference/chat.py
# Attention # Attention
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353 # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
- pytest -v -s tests/kernels/attention/test_attention_selector.py - pytest -v -s tests/kernels/attention/test_attention_selector.py
......
...@@ -96,7 +96,7 @@ steps: ...@@ -96,7 +96,7 @@ steps:
- vllm/platforms/cuda.py - vllm/platforms/cuda.py
commands: commands:
- nvidia-smi - nvidia-smi
- python3 examples/offline_inference/basic/chat.py - python3 examples/basic/offline_inference/chat.py
# Attention # Attention
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353 # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
- pytest -v -s tests/kernels/attention/test_attention_selector.py - pytest -v -s tests/kernels/attention/test_attention_selector.py
......
...@@ -67,12 +67,13 @@ steps: ...@@ -67,12 +67,13 @@ steps:
- examples/ - examples/
commands: commands:
- pip install tensorizer # for tensorizer test - pip install tensorizer # for tensorizer test
- python3 offline_inference/basic/chat.py # for basic # for basic
- python3 offline_inference/basic/generate.py --model facebook/opt-125m - python3 basic/offline_inference/chat.py
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 - python3 basic/offline_inference/generate.py --model facebook/opt-125m
- python3 offline_inference/basic/classify.py - python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- python3 offline_inference/basic/embed.py - python3 basic/offline_inference/classify.py
- python3 offline_inference/basic/score.py - python3 basic/offline_inference/embed.py
- python3 basic/offline_inference/score.py
# for multi-modal models # for multi-modal models
- python3 offline_inference/audio_language.py --seed 0 - python3 offline_inference/audio_language.py --seed 0
- python3 offline_inference/vision_language.py --seed 0 - python3 offline_inference/vision_language.py --seed 0
......
...@@ -65,7 +65,7 @@ steps: ...@@ -65,7 +65,7 @@ steps:
- pytest -v -s tests/models/test_transformers.py - pytest -v -s tests/models/test_transformers.py
- pytest -v -s tests/models/multimodal/processing/ - pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/test_mapping.py - pytest -v -s tests/models/multimodal/test_mapping.py
- python3 examples/offline_inference/basic/chat.py - python3 examples/basic/offline_inference/chat.py
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
# Whisper needs spawn method to avoid deadlock # Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
...@@ -259,7 +259,7 @@ ON_CPU=1 SERVING_JSON=serving-tests-cpu-text.json DRY_RUN=1 MODEL_FILTER=meta-ll ...@@ -259,7 +259,7 @@ ON_CPU=1 SERVING_JSON=serving-tests-cpu-text.json DRY_RUN=1 MODEL_FILTER=meta-ll
# On this platform, it is recommended to only bind openMP threads on logical CPU cores 0-7 or 8-15 # On this platform, it is recommended to only bind openMP threads on logical CPU cores 0-7 or 8-15
$ export VLLM_CPU_OMP_THREADS_BIND=0-7 $ export VLLM_CPU_OMP_THREADS_BIND=0-7
$ python examples/offline_inference/basic/basic.py $ python examples/basic/offline_inference/basic.py
``` ```
- When deploying vLLM CPU backend on a multi-socket machine with NUMA and enable tensor parallel or pipeline parallel, each NUMA node is treated as a TP/PP rank. So be aware to set CPU cores of a single rank on the same NUMA node to avoid cross NUMA node memory access. - When deploying vLLM CPU backend on a multi-socket machine with NUMA and enable tensor parallel or pipeline parallel, each NUMA node is treated as a TP/PP rank. So be aware to set CPU cores of a single rank on the same NUMA node to avoid cross NUMA node memory access.
......
...@@ -75,7 +75,7 @@ This guide will help you quickly get started with vLLM to perform: ...@@ -75,7 +75,7 @@ This guide will help you quickly get started with vLLM to perform:
## Offline Batched Inference ## Offline Batched Inference
With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: [examples/offline_inference/basic/basic.py](../../examples/offline_inference/basic/basic.py) With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: [examples/basic/offline_inference/basic.py](../../examples/basic/offline_inference/basic.py)
The first line of this example imports the classes [LLM][vllm.LLM] and [SamplingParams][vllm.SamplingParams]: The first line of this example imports the classes [LLM][vllm.LLM] and [SamplingParams][vllm.SamplingParams]:
...@@ -228,7 +228,7 @@ Since this server is compatible with OpenAI API, you can use it as a drop-in rep ...@@ -228,7 +228,7 @@ Since this server is compatible with OpenAI API, you can use it as a drop-in rep
print("Completion result:", completion) print("Completion result:", completion)
``` ```
A more detailed client example can be found here: [examples/offline_inference/basic/basic.py](../../examples/offline_inference/basic/basic.py) A more detailed client example can be found here: [examples/basic/offline_inference/basic.py](../../examples/basic/offline_inference/basic.py)
### OpenAI Chat Completions API with vLLM ### OpenAI Chat Completions API with vLLM
......
...@@ -59,7 +59,7 @@ for output in outputs: ...@@ -59,7 +59,7 @@ for output in outputs:
By default, vLLM will use sampling parameters recommended by model creator by applying the `generation_config.json` from the huggingface model repository if it exists. In most cases, this will provide you with the best results by default if [SamplingParams][vllm.SamplingParams] is not specified. By default, vLLM will use sampling parameters recommended by model creator by applying the `generation_config.json` from the huggingface model repository if it exists. In most cases, this will provide you with the best results by default if [SamplingParams][vllm.SamplingParams] is not specified.
However, if vLLM's default sampling parameters are preferred, please pass `generation_config="vllm"` when creating the [LLM][vllm.LLM] instance. However, if vLLM's default sampling parameters are preferred, please pass `generation_config="vllm"` when creating the [LLM][vllm.LLM] instance.
A code example can be found here: [examples/offline_inference/basic/basic.py](../../examples/offline_inference/basic/basic.py) A code example can be found here: [examples/basic/offline_inference/basic.py](../../examples/basic/offline_inference/basic.py)
### `LLM.beam_search` ### `LLM.beam_search`
...@@ -121,7 +121,7 @@ and automatically applies the model's [chat template](https://huggingface.co/doc ...@@ -121,7 +121,7 @@ and automatically applies the model's [chat template](https://huggingface.co/doc
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
``` ```
A code example can be found here: [examples/offline_inference/basic/chat.py](../../examples/offline_inference/basic/chat.py) A code example can be found here: [examples/basic/offline_inference/chat.py](../../examples/basic/offline_inference/chat.py)
If the model doesn't have a chat template or you want to specify another one, If the model doesn't have a chat template or you want to specify another one,
you can explicitly pass a chat template: you can explicitly pass a chat template:
......
...@@ -99,7 +99,7 @@ embeds = output.outputs.embedding ...@@ -99,7 +99,7 @@ embeds = output.outputs.embedding
print(f"Embeddings: {embeds!r} (size={len(embeds)})") print(f"Embeddings: {embeds!r} (size={len(embeds)})")
``` ```
A code example can be found here: [examples/offline_inference/basic/embed.py](../../examples/offline_inference/basic/embed.py) A code example can be found here: [examples/basic/offline_inference/embed.py](../../examples/basic/offline_inference/embed.py)
### `LLM.classify` ### `LLM.classify`
...@@ -116,7 +116,7 @@ probs = output.outputs.probs ...@@ -116,7 +116,7 @@ probs = output.outputs.probs
print(f"Class Probabilities: {probs!r} (size={len(probs)})") print(f"Class Probabilities: {probs!r} (size={len(probs)})")
``` ```
A code example can be found here: [examples/offline_inference/basic/classify.py](../../examples/offline_inference/basic/classify.py) A code example can be found here: [examples/basic/offline_inference/classify.py](../../examples/basic/offline_inference/classify.py)
### `LLM.score` ### `LLM.score`
...@@ -140,7 +140,7 @@ score = output.outputs.score ...@@ -140,7 +140,7 @@ score = output.outputs.score
print(f"Score: {score}") print(f"Score: {score}")
``` ```
A code example can be found here: [examples/offline_inference/basic/score.py](../../examples/offline_inference/basic/score.py) A code example can be found here: [examples/basic/offline_inference/score.py](../../examples/basic/offline_inference/score.py)
### `LLM.reward` ### `LLM.reward`
...@@ -156,7 +156,7 @@ data = output.outputs.data ...@@ -156,7 +156,7 @@ data = output.outputs.data
print(f"Data: {data!r}") print(f"Data: {data!r}")
``` ```
A code example can be found here: [examples/offline_inference/basic/reward.py](../../examples/offline_inference/basic/reward.py) A code example can be found here: [examples/basic/offline_inference/reward.py](../../examples/basic/offline_inference/reward.py)
### `LLM.encode` ### `LLM.encode`
......
...@@ -190,7 +190,7 @@ vllm serve NousResearch/Meta-Llama-3-8B-Instruct --enable-offline-docs ...@@ -190,7 +190,7 @@ vllm serve NousResearch/Meta-Llama-3-8B-Instruct --enable-offline-docs
Our Completions API is compatible with [OpenAI's Completions API](https://platform.openai.com/docs/api-reference/completions); Our Completions API is compatible with [OpenAI's Completions API](https://platform.openai.com/docs/api-reference/completions);
you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it. you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it.
Code example: [examples/online_serving/openai_completion_client.py](../../examples/online_serving/openai_completion_client.py) Code example: [examples/basic/online_serving/openai_completion_client.py](../../examples/basic/online_serving/openai_completion_client.py)
#### Extra parameters #### Extra parameters
...@@ -221,7 +221,7 @@ see our [Multimodal Inputs](../features/multimodal_inputs.md) guide for more inf ...@@ -221,7 +221,7 @@ see our [Multimodal Inputs](../features/multimodal_inputs.md) guide for more inf
- *Note: `image_url.detail` parameter is not supported.* - *Note: `image_url.detail` parameter is not supported.*
Code example: [examples/online_serving/openai_chat_completion_client.py](../../examples/online_serving/openai_chat_completion_client.py) Code example: [examples/basic/online_serving/openai_chat_completion_client.py](../../examples/basic/online_serving/openai_chat_completion_client.py)
#### Extra parameters #### Extra parameters
......
# Basic # Offline Inference
The `LLM` class provides the primary Python interface for doing offline inference, which is interacting with a model without using a separate model inference server. The `LLM` class provides the primary Python interface for doing offline inference, which is interacting with a model without using a separate model inference server.
...@@ -7,31 +7,31 @@ The `LLM` class provides the primary Python interface for doing offline inferenc ...@@ -7,31 +7,31 @@ The `LLM` class provides the primary Python interface for doing offline inferenc
The first script in this example shows the most basic usage of vLLM. If you are new to Python and vLLM, you should start here. The first script in this example shows the most basic usage of vLLM. If you are new to Python and vLLM, you should start here.
```bash ```bash
python examples/offline_inference/basic/basic.py python examples/basic/offline_inference/basic.py
``` ```
The rest of the scripts include an [argument parser](https://docs.python.org/3/library/argparse.html), which you can use to pass any arguments that are compatible with [`LLM`](https://docs.vllm.ai/en/latest/api/offline_inference/llm.html). Try running the script with `--help` for a list of all available arguments. The rest of the scripts include an [argument parser](https://docs.python.org/3/library/argparse.html), which you can use to pass any arguments that are compatible with [`LLM`](https://docs.vllm.ai/en/latest/api/offline_inference/llm.html). Try running the script with `--help` for a list of all available arguments.
```bash ```bash
python examples/offline_inference/basic/classify.py python examples/basic/offline_inference/classify.py
``` ```
```bash ```bash
python examples/offline_inference/basic/embed.py python examples/basic/offline_inference/embed.py
``` ```
```bash ```bash
python examples/offline_inference/basic/score.py python examples/basic/offline_inference/score.py
``` ```
The chat and generate scripts also accept the [sampling parameters](https://docs.vllm.ai/en/latest/api/inference_params.html#sampling-parameters): `max_tokens`, `temperature`, `top_p` and `top_k`. The chat and generate scripts also accept the [sampling parameters](https://docs.vllm.ai/en/latest/api/inference_params.html#sampling-parameters): `max_tokens`, `temperature`, `top_p` and `top_k`.
```bash ```bash
python examples/offline_inference/basic/chat.py python examples/basic/offline_inference/chat.py
``` ```
```bash ```bash
python examples/offline_inference/basic/generate.py python examples/basic/offline_inference/generate.py
``` ```
## Features ## Features
......
...@@ -5,6 +5,7 @@ from argparse import Namespace ...@@ -5,6 +5,7 @@ from argparse import Namespace
from vllm import LLM, EngineArgs from vllm import LLM, EngineArgs
from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.print_utils import print_embeddings
def parse_args(): def parse_args():
...@@ -39,10 +40,8 @@ def main(args: Namespace): ...@@ -39,10 +40,8 @@ def main(args: Namespace):
print("\nGenerated Outputs:\n" + "-" * 60) print("\nGenerated Outputs:\n" + "-" * 60)
for prompt, output in zip(prompts, outputs): for prompt, output in zip(prompts, outputs):
embeds = output.outputs.embedding embeds = output.outputs.embedding
embeds_trimmed = ( print(f"Prompt: {prompt!r}")
(str(embeds[:16])[:-1] + ", ...]") if len(embeds) > 16 else embeds print_embeddings(embeds)
)
print(f"Prompt: {prompt!r} \nEmbeddings: {embeds_trimmed} (size={len(embeds)})")
print("-" * 60) print("-" * 60)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment