Unverified commit d00d6529, authored by Cyrus Leung and committed by GitHub
Browse files

[CI/Build] Replace `vllm.entrypoints.openai.api_server` entrypoint with `vllm...


[CI/Build] Replace `vllm.entrypoints.openai.api_server` entrypoint with `vllm serve` command (#25967)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent 3b279a84
...@@ -181,18 +181,14 @@ launch_vllm_server() { ...@@ -181,18 +181,14 @@ launch_vllm_server() {
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience." echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model') model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
server_command="python3 \ server_command="vllm serve $model \
-m vllm.entrypoints.openai.api_server \
-tp $tp \ -tp $tp \
--model $model \
--port $port \ --port $port \
$server_args" $server_args"
else else
echo "Key 'fp8' does not exist in common params." echo "Key 'fp8' does not exist in common params."
server_command="python3 \ server_command="vllm serve $model \
-m vllm.entrypoints.openai.api_server \
-tp $tp \ -tp $tp \
--model $model \
--port $port \ --port $port \
$server_args" $server_args"
fi fi
......
...@@ -365,8 +365,7 @@ run_serving_tests() { ...@@ -365,8 +365,7 @@ run_serving_tests() {
continue continue
fi fi
server_command="$server_envs python3 \ server_command="$server_envs vllm serve \
-m vllm.entrypoints.openai.api_server \
$server_args" $server_args"
# run the server # run the server
......
...@@ -18,7 +18,7 @@ vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_ ...@@ -18,7 +18,7 @@ vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_
bench_throughput_exit_code=$? bench_throughput_exit_code=$?
# run server-based benchmarks and upload the result to buildkite # run server-based benchmarks and upload the result to buildkite
python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf & vllm serve meta-llama/Llama-2-7b-chat-hf &
server_pid=$! server_pid=$!
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
......
...@@ -55,9 +55,7 @@ benchmark() { ...@@ -55,9 +55,7 @@ benchmark() {
output_len=$2 output_len=$2
CUDA_VISIBLE_DEVICES=0 python3 \ CUDA_VISIBLE_DEVICES=0 vllm serve $model \
-m vllm.entrypoints.openai.api_server \
--model $model \
--port 8100 \ --port 8100 \
--max-model-len 10000 \ --max-model-len 10000 \
--gpu-memory-utilization 0.6 \ --gpu-memory-utilization 0.6 \
...@@ -65,9 +63,7 @@ benchmark() { ...@@ -65,9 +63,7 @@ benchmark() {
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' & '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
CUDA_VISIBLE_DEVICES=1 python3 \ CUDA_VISIBLE_DEVICES=1 vllm serve $model \
-m vllm.entrypoints.openai.api_server \
--model $model \
--port 8200 \ --port 8200 \
--max-model-len 10000 \ --max-model-len 10000 \
--gpu-memory-utilization 0.6 \ --gpu-memory-utilization 0.6 \
......
...@@ -38,16 +38,12 @@ wait_for_server() { ...@@ -38,16 +38,12 @@ wait_for_server() {
launch_chunked_prefill() { launch_chunked_prefill() {
model="meta-llama/Meta-Llama-3.1-8B-Instruct" model="meta-llama/Meta-Llama-3.1-8B-Instruct"
# disagg prefill # disagg prefill
CUDA_VISIBLE_DEVICES=0 python3 \ CUDA_VISIBLE_DEVICES=0 vllm serve $model \
-m vllm.entrypoints.openai.api_server \
--model $model \
--port 8100 \ --port 8100 \
--max-model-len 10000 \ --max-model-len 10000 \
--enable-chunked-prefill \ --enable-chunked-prefill \
--gpu-memory-utilization 0.6 & --gpu-memory-utilization 0.6 &
CUDA_VISIBLE_DEVICES=1 python3 \ CUDA_VISIBLE_DEVICES=1 vllm serve $model \
-m vllm.entrypoints.openai.api_server \
--model $model \
--port 8200 \ --port 8200 \
--max-model-len 10000 \ --max-model-len 10000 \
--enable-chunked-prefill \ --enable-chunked-prefill \
...@@ -62,18 +58,14 @@ launch_chunked_prefill() { ...@@ -62,18 +58,14 @@ launch_chunked_prefill() {
launch_disagg_prefill() { launch_disagg_prefill() {
model="meta-llama/Meta-Llama-3.1-8B-Instruct" model="meta-llama/Meta-Llama-3.1-8B-Instruct"
# disagg prefill # disagg prefill
CUDA_VISIBLE_DEVICES=0 python3 \ CUDA_VISIBLE_DEVICES=0 vllm serve $model \
-m vllm.entrypoints.openai.api_server \
--model $model \
--port 8100 \ --port 8100 \
--max-model-len 10000 \ --max-model-len 10000 \
--gpu-memory-utilization 0.6 \ --gpu-memory-utilization 0.6 \
--kv-transfer-config \ --kv-transfer-config \
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' & '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
CUDA_VISIBLE_DEVICES=1 python3 \ CUDA_VISIBLE_DEVICES=1 vllm serve $model \
-m vllm.entrypoints.openai.api_server \
--model $model \
--port 8200 \ --port 8200 \
--max-model-len 10000 \ --max-model-len 10000 \
--gpu-memory-utilization 0.6 \ --gpu-memory-utilization 0.6 \
......
...@@ -565,5 +565,5 @@ ENTRYPOINT ["./sagemaker-entrypoint.sh"] ...@@ -565,5 +565,5 @@ ENTRYPOINT ["./sagemaker-entrypoint.sh"]
FROM vllm-openai-base AS vllm-openai FROM vllm-openai-base AS vllm-openai
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] ENTRYPOINT ["vllm", "serve"]
#################### OPENAI API SERVER #################### #################### OPENAI API SERVER ####################
...@@ -177,4 +177,4 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -177,4 +177,4 @@ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \ --mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
uv pip install dist/*.whl uv pip install dist/*.whl
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] ENTRYPOINT ["vllm", "serve"]
...@@ -314,4 +314,4 @@ WORKDIR /workspace/ ...@@ -314,4 +314,4 @@ WORKDIR /workspace/
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"] ENTRYPOINT ["vllm", "serve"]
...@@ -309,4 +309,4 @@ USER 2000 ...@@ -309,4 +309,4 @@ USER 2000
WORKDIR /home/vllm WORKDIR /home/vllm
# Set the default entrypoint # Set the default entrypoint
ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"] ENTRYPOINT ["vllm", "serve"]
...@@ -69,4 +69,4 @@ RUN --mount=type=cache,target=/root/.cache/pip \ ...@@ -69,4 +69,4 @@ RUN --mount=type=cache,target=/root/.cache/pip \
# install development dependencies (for testing) # install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils RUN python3 -m pip install -e tests/vllm_test_utils
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] ENTRYPOINT ["vllm", "serve"]
...@@ -661,8 +661,7 @@ Benchmark the performance of multi-modal requests in vLLM. ...@@ -661,8 +661,7 @@ Benchmark the performance of multi-modal requests in vLLM.
Start vLLM: Start vLLM:
```bash ```bash
python -m vllm.entrypoints.openai.api_server \ vllm serve Qwen/Qwen2.5-VL-7B-Instruct \
--model Qwen/Qwen2.5-VL-7B-Instruct \
--dtype bfloat16 \ --dtype bfloat16 \
--limit-mm-per-prompt '{"image": 1}' \ --limit-mm-per-prompt '{"image": 1}' \
--allowed-local-media-path /path/to/sharegpt4v/images --allowed-local-media-path /path/to/sharegpt4v/images
...@@ -688,8 +687,7 @@ vllm bench serve \ ...@@ -688,8 +687,7 @@ vllm bench serve \
Start vLLM: Start vLLM:
```bash ```bash
python -m vllm.entrypoints.openai.api_server \ vllm serve Qwen/Qwen2.5-VL-7B-Instruct \
--model Qwen/Qwen2.5-VL-7B-Instruct \
--dtype bfloat16 \ --dtype bfloat16 \
--limit-mm-per-prompt '{"video": 1}' \ --limit-mm-per-prompt '{"video": 1}' \
--allowed-local-media-path /path/to/sharegpt4video/videos --allowed-local-media-path /path/to/sharegpt4video/videos
......
...@@ -39,8 +39,7 @@ Refer to <gh-file:examples/offline_inference/simple_profiling.py> for an example ...@@ -39,8 +39,7 @@ Refer to <gh-file:examples/offline_inference/simple_profiling.py> for an example
```bash ```bash
VLLM_TORCH_PROFILER_DIR=./vllm_profile \ VLLM_TORCH_PROFILER_DIR=./vllm_profile \
python -m vllm.entrypoints.openai.api_server \ vllm serve meta-llama/Meta-Llama-3-70B
--model meta-llama/Meta-Llama-3-70B
``` ```
vllm bench command: vllm bench command:
......
...@@ -19,8 +19,7 @@ pip install -U "autogen-agentchat" "autogen-ext[openai]" ...@@ -19,8 +19,7 @@ pip install -U "autogen-agentchat" "autogen-ext[openai]"
1. Start the vLLM server with the supported chat completion model, e.g. 1. Start the vLLM server with the supported chat completion model, e.g.
```bash ```bash
python -m vllm.entrypoints.openai.api_server \ vllm serve mistralai/Mistral-7B-Instruct-v0.2
--model mistralai/Mistral-7B-Instruct-v0.2
``` ```
1. Call it with AutoGen: 1. Call it with AutoGen:
......
...@@ -20,7 +20,7 @@ To get started with Open WebUI using vLLM, follow these steps: ...@@ -20,7 +20,7 @@ To get started with Open WebUI using vLLM, follow these steps:
For example: For example:
```console ```console
python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 vllm serve <model> --host 0.0.0.0 --port 8000
``` ```
3. Start the Open WebUI Docker container: 3. Start the Open WebUI Docker container:
......
...@@ -32,6 +32,7 @@ See the vLLM SkyPilot YAML for serving, [serving.yaml](https://github.com/skypil ...@@ -32,6 +32,7 @@ See the vLLM SkyPilot YAML for serving, [serving.yaml](https://github.com/skypil
ports: 8081 # Expose to internet traffic. ports: 8081 # Expose to internet traffic.
envs: envs:
PYTHONUNBUFFERED: 1
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass. HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
...@@ -47,9 +48,8 @@ See the vLLM SkyPilot YAML for serving, [serving.yaml](https://github.com/skypil ...@@ -47,9 +48,8 @@ See the vLLM SkyPilot YAML for serving, [serving.yaml](https://github.com/skypil
run: | run: |
conda activate vllm conda activate vllm
echo 'Starting vllm api server...' echo 'Starting vllm api server...'
python -u -m vllm.entrypoints.openai.api_server \ vllm serve $MODEL_NAME \
--port 8081 \ --port 8081 \
--model $MODEL_NAME \
--trust-remote-code \ --trust-remote-code \
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
2>&1 | tee api_server.log & 2>&1 | tee api_server.log &
...@@ -131,6 +131,7 @@ SkyPilot can scale up the service to multiple service replicas with built-in aut ...@@ -131,6 +131,7 @@ SkyPilot can scale up the service to multiple service replicas with built-in aut
ports: 8081 # Expose to internet traffic. ports: 8081 # Expose to internet traffic.
envs: envs:
PYTHONUNBUFFERED: 1
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass. HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
...@@ -146,9 +147,8 @@ SkyPilot can scale up the service to multiple service replicas with built-in aut ...@@ -146,9 +147,8 @@ SkyPilot can scale up the service to multiple service replicas with built-in aut
run: | run: |
conda activate vllm conda activate vllm
echo 'Starting vllm api server...' echo 'Starting vllm api server...'
python -u -m vllm.entrypoints.openai.api_server \ vllm serve $MODEL_NAME \
--port 8081 \ --port 8081 \
--model $MODEL_NAME \
--trust-remote-code \ --trust-remote-code \
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
2>&1 | tee api_server.log 2>&1 | tee api_server.log
...@@ -243,6 +243,7 @@ This will scale the service up to when the QPS exceeds 2 for each replica. ...@@ -243,6 +243,7 @@ This will scale the service up to when the QPS exceeds 2 for each replica.
ports: 8081 # Expose to internet traffic. ports: 8081 # Expose to internet traffic.
envs: envs:
PYTHONUNBUFFERED: 1
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass. HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
...@@ -258,9 +259,8 @@ This will scale the service up to when the QPS exceeds 2 for each replica. ...@@ -258,9 +259,8 @@ This will scale the service up to when the QPS exceeds 2 for each replica.
run: | run: |
conda activate vllm conda activate vllm
echo 'Starting vllm api server...' echo 'Starting vllm api server...'
python -u -m vllm.entrypoints.openai.api_server \ vllm serve $MODEL_NAME \
--port 8081 \ --port 8081 \
--model $MODEL_NAME \
--trust-remote-code \ --trust-remote-code \
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
2>&1 | tee api_server.log 2>&1 | tee api_server.log
......
...@@ -69,6 +69,11 @@ Sometimes you may see the API server entrypoint used directly instead of via the ...@@ -69,6 +69,11 @@ Sometimes you may see the API server entrypoint used directly instead of via the
python -m vllm.entrypoints.openai.api_server --model <model> python -m vllm.entrypoints.openai.api_server --model <model>
``` ```
!!! warning
`python -m vllm.entrypoints.openai.api_server` is deprecated
and may become unsupported in a future release.
That code can be found in <gh-file:vllm/entrypoints/openai/api_server.py>. That code can be found in <gh-file:vllm/entrypoints/openai/api_server.py>.
More details on the API server can be found in the [OpenAI-Compatible Server](../serving/openai_compatible_server.md) document. More details on the API server can be found in the [OpenAI-Compatible Server](../serving/openai_compatible_server.md) document.
......
...@@ -64,8 +64,7 @@ To enable sleep mode in a vLLM server you need to initialize it with the flag `V ...@@ -64,8 +64,7 @@ To enable sleep mode in a vLLM server you need to initialize it with the flag `V
When using the flag `VLLM_SERVER_DEV_MODE=1` you enable development endpoints, and these endpoints should not be exposed to users. When using the flag `VLLM_SERVER_DEV_MODE=1` you enable development endpoints, and these endpoints should not be exposed to users.
```bash ```bash
VLLM_SERVER_DEV_MODE=1 python -m vllm.entrypoints.openai.api_server \ VLLM_SERVER_DEV_MODE=1 vllm serve Qwen/Qwen3-0.6B \
--model Qwen/Qwen3-0.6B \
--enable-sleep-mode \ --enable-sleep-mode \
--port 8000 --port 8000
``` ```
......
...@@ -48,10 +48,9 @@ The following code configures vLLM in an offline mode to use speculative decodin ...@@ -48,10 +48,9 @@ The following code configures vLLM in an offline mode to use speculative decodin
To perform the same with an online mode launch the server: To perform the same with an online mode launch the server:
```bash ```bash
python -m vllm.entrypoints.openai.api_server \ vllm serve facebook/opt-6.7b \
--host 0.0.0.0 \ --host 0.0.0.0 \
--port 8000 \ --port 8000 \
--model facebook/opt-6.7b \
--seed 42 \ --seed 42 \
-tp 1 \ -tp 1 \
--gpu_memory_utilization 0.8 \ --gpu_memory_utilization 0.8 \
......
...@@ -67,8 +67,7 @@ docker run -it \ ...@@ -67,8 +67,7 @@ docker run -it \
XPU platform supports **tensor parallel** inference/serving and also supports **pipeline parallel** as a beta feature for online serving. For **pipeline parallel**, we support it on single node with mp as the backend. For example, a reference execution like following: XPU platform supports **tensor parallel** inference/serving and also supports **pipeline parallel** as a beta feature for online serving. For **pipeline parallel**, we support it on single node with mp as the backend. For example, a reference execution like following:
```bash ```bash
python -m vllm.entrypoints.openai.api_server \ vllm serve facebook/opt-13b \
--model=facebook/opt-13b \
--dtype=bfloat16 \ --dtype=bfloat16 \
--max_model_len=1024 \ --max_model_len=1024 \
--distributed-executor-backend=mp \ --distributed-executor-backend=mp \
......
...@@ -21,4 +21,4 @@ while IFS='=' read -r key value; do ...@@ -21,4 +21,4 @@ while IFS='=' read -r key value; do
done < <(env | grep "^${PREFIX}") done < <(env | grep "^${PREFIX}")
# Pass the collected arguments to the main entrypoint # Pass the collected arguments to the main entrypoint
exec python3 -m vllm.entrypoints.openai.api_server "${ARGS[@]}" exec vllm serve "${ARGS[@]}"
\ No newline at end of file \ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment