Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
006693ed
Commit
006693ed
authored
Dec 01, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.11.2' into v0.11.2-ori
parents
4b51e6f1
275de341
Changes
544
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
612 additions
and
254 deletions
+612
-254
examples/online_serving/pooling/embedding_requests_base64_client.py
...nline_serving/pooling/embedding_requests_base64_client.py
+64
-0
examples/online_serving/pooling/embedding_requests_bytes_client.py
...online_serving/pooling/embedding_requests_bytes_client.py
+66
-0
examples/online_serving/pooling/multi_vector_retrieval_client.py
...s/online_serving/pooling/multi_vector_retrieval_client.py
+54
-0
examples/online_serving/pooling/ner_client.py
examples/online_serving/pooling/ner_client.py
+0
-0
examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py
...ng/pooling/openai_chat_embedding_client_for_multimodal.py
+229
-70
examples/online_serving/pooling/openai_cross_encoder_score.py
...ples/online_serving/pooling/openai_cross_encoder_score.py
+0
-0
examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py
...ving/pooling/openai_cross_encoder_score_for_multimodal.py
+0
-0
examples/online_serving/pooling/prithvi_geospatial_mae.py
examples/online_serving/pooling/prithvi_geospatial_mae.py
+4
-4
examples/online_serving/prometheus_grafana/grafana.json
examples/online_serving/prometheus_grafana/grafana.json
+1
-1
examples/online_serving/ray_serve_deepseek.py
examples/online_serving/ray_serve_deepseek.py
+0
-1
examples/online_serving/run_cluster.sh
examples/online_serving/run_cluster.sh
+24
-0
examples/online_serving/sagemaker-entrypoint.sh
examples/online_serving/sagemaker-entrypoint.sh
+1
-1
examples/online_serving/streamlit_openai_chatbot_webserver.py
...ples/online_serving/streamlit_openai_chatbot_webserver.py
+4
-4
examples/online_serving/structured_outputs/README.md
examples/online_serving/structured_outputs/README.md
+1
-1
examples/online_serving/structured_outputs/pyproject.toml
examples/online_serving/structured_outputs/pyproject.toml
+1
-1
examples/online_serving/structured_outputs/structured_outputs.py
...s/online_serving/structured_outputs/structured_outputs.py
+5
-11
examples/online_serving/token_generation_client.py
examples/online_serving/token_generation_client.py
+49
-0
examples/others/lmcache/cpu_offload_lmcache.py
examples/others/lmcache/cpu_offload_lmcache.py
+12
-31
examples/others/tensorize_vllm_model.py
examples/others/tensorize_vllm_model.py
+97
-75
examples/pyproject.toml
examples/pyproject.toml
+0
-54
No files found.
Too many changes to show.
To preserve performance only
544 of 544+
files are displayed.
Plain diff
Email patch
examples/online_serving/pooling/embedding_requests_base64_client.py
0 → 100644
View file @
006693ed
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Example Python client for embedding API using vLLM API server
NOTE:
start a supported embeddings model server with `vllm serve`, e.g.
vllm serve intfloat/e5-small
"""
import
argparse
import
base64
import
requests
import
torch
from
vllm.utils.serial_utils
import
(
EMBED_DTYPE_TO_TORCH_DTYPE
,
ENDIANNESS
,
binary2tensor
,
)
def post_http_request(prompt: dict, api_url: str) -> requests.Response:
    """POST ``prompt`` as a JSON body to ``api_url`` and return the response.

    Args:
        prompt: Request payload; sent as the JSON body.
        api_url: Full URL of the endpoint to call.

    Returns:
        The ``requests.Response`` as received. The HTTP status is not
        inspected here; that is left to the caller.
    """
    return requests.post(
        api_url,
        headers={"User-Agent": "Test Client"},
        json=prompt,
    )
def parse_args():
    """Parse the command-line options selecting the server and the model.

    Returns:
        An ``argparse.Namespace`` with ``host``, ``port`` and ``model``.
    """
    cli = argparse.ArgumentParser()
    # (flag, type, default) for every supported option, in declaration order.
    options = (
        ("--host", str, "localhost"),
        ("--port", int, 8000),
        ("--model", str, "intfloat/e5-small"),
    )
    for flag, value_type, default in options:
        cli.add_argument(flag, type=value_type, default=default)
    return cli.parse_args()
def main(args):
    """Request a base64-encoded embedding for every dtype/endianness pair.

    For each ``embed_dtype`` in ``EMBED_DTYPE_TO_TORCH_DTYPE`` and each
    ``endianness`` in ``ENDIANNESS``, posts a fixed input to the
    ``/v1/embeddings`` endpoint with ``encoding_format="base64"``,
    base64-decodes every returned embedding, turns it into a tensor with
    ``binary2tensor``, casts it to ``float32`` and prints the shape of the
    concatenated result.

    Args:
        args: Parsed command-line options; ``host``, ``port`` and ``model``
            are read.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    api_url = f"http://{args.host}:{args.port}/v1/embeddings"
    model_name = args.model

    # The OpenAI client does not support the embed_dtype and endianness parameters.
    for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
        for endianness in ENDIANNESS:
            prompt = {
                "model": model_name,
                "input": "vLLM is great!",
                "encoding_format": "base64",
                "embed_dtype": embed_dtype,
                "endianness": endianness,
            }
            response = post_http_request(prompt=prompt, api_url=api_url)
            # Surface server-side failures as an HTTP error instead of a
            # confusing KeyError on "data" further down.
            response.raise_for_status()

            # (-1,) asks binary2tensor for a flat tensor of whatever length
            # the decoded payload has.
            tensors = [
                binary2tensor(
                    base64.b64decode(data["embedding"]),
                    (-1,),
                    embed_dtype,
                    endianness,
                ).to(torch.float32)
                for data in response.json()["data"]
            ]
            embedding = torch.cat(tensors)
            print(embed_dtype, endianness, embedding.shape)


if __name__ == "__main__":
    args = parse_args()
    main(args)
examples/online_serving/pooling/embedding_requests_bytes_client.py
0 → 100644
View file @
006693ed
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Example Python client for embedding API using vLLM API server
NOTE:
start a supported embeddings model server with `vllm serve`, e.g.
vllm serve intfloat/e5-small
"""
import
argparse
import
json
import
requests
import
torch
from
vllm.utils.serial_utils
import
(
EMBED_DTYPE_TO_TORCH_DTYPE
,
ENDIANNESS
,
MetadataItem
,
decode_pooling_output
,
)
def post_http_request(prompt: dict, api_url: str) -> requests.Response:
    """Send ``prompt`` as JSON to ``api_url`` with a fixed User-Agent header.

    Args:
        prompt: Request payload; sent as the JSON body.
        api_url: Full URL of the endpoint to call.

    Returns:
        The unmodified ``requests.Response``; no status check is done here.
    """
    request_headers = {"User-Agent": "Test Client"}
    return requests.post(api_url, headers=request_headers, json=prompt)
def parse_args():
    """Parse the command-line options selecting the server and the model.

    Returns:
        An ``argparse.Namespace`` with ``host``, ``port`` and ``model``.
    """
    cli = argparse.ArgumentParser()
    # (flag, type, default) for every supported option, in declaration order.
    for flag, value_type, default in (
        ("--host", str, "localhost"),
        ("--port", int, 8000),
        ("--model", str, "intfloat/e5-small"),
    ):
        cli.add_argument(flag, type=value_type, default=default)
    return cli.parse_args()
def main(args):
    """Request a bytes-encoded embedding for every dtype/endianness pair.

    For each ``embed_dtype`` in ``EMBED_DTYPE_TO_TORCH_DTYPE`` and each
    ``endianness`` in ``ENDIANNESS``, posts a fixed input to the
    ``/v1/embeddings`` endpoint with ``encoding_format="bytes"``. The
    response carries a JSON ``metadata`` header and a raw body; both are
    handed to ``decode_pooling_output``, the resulting tensors are cast to
    ``float32``, concatenated, and the resulting shape is printed.

    Args:
        args: Parsed command-line options; ``host``, ``port`` and ``model``
            are read.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    api_url = f"http://{args.host}:{args.port}/v1/embeddings"
    model_name = args.model

    # The OpenAI client does not support the bytes encoding_format.
    # The OpenAI client does not support the embed_dtype and endianness parameters.
    for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
        for endianness in ENDIANNESS:
            prompt = {
                "model": model_name,
                "input": "vLLM is great!",
                "encoding_format": "bytes",
                "embed_dtype": embed_dtype,
                "endianness": endianness,
            }
            response = post_http_request(prompt=prompt, api_url=api_url)
            # Surface server-side failures as an HTTP error instead of a
            # KeyError on the missing "metadata" header.
            response.raise_for_status()

            # The per-item description travels in a response header; the
            # body holds the raw payload those items refer to.
            metadata = json.loads(response.headers["metadata"])
            items = [MetadataItem(**entry) for entry in metadata["data"]]
            decoded = decode_pooling_output(items=items, body=response.content)

            embedding = torch.cat([t.to(torch.float32) for t in decoded])
            print(embed_dtype, endianness, embedding.shape)


if __name__ == "__main__":
    args = parse_args()
    main(args)
examples/online_serving/pooling/multi_vector_retrieval_client.py
0 → 100644
View file @
006693ed
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Example online usage of Pooling API for multi vector retrieval.
Run `vllm serve <model> --runner pooling`
to start up the server in vLLM. e.g.
vllm serve BAAI/bge-m3
"""
import
argparse
import
requests
import
torch
def post_http_request(prompt: dict, api_url: str) -> requests.Response:
    """Issue a JSON POST of ``prompt`` to ``api_url``.

    Args:
        prompt: Request payload; sent as the JSON body.
        api_url: Full URL of the endpoint to call.

    Returns:
        The ``requests.Response`` returned by the server, unchecked.
    """
    return requests.post(
        api_url,
        json=prompt,
        headers={"User-Agent": "Test Client"},
    )
def parse_args():
    """Parse the command-line options selecting the server and the model.

    Returns:
        An ``argparse.Namespace`` with ``host``, ``port`` and ``model``.
    """
    cli = argparse.ArgumentParser()
    # (flag, type, default) for every supported option, in declaration order.
    options = [
        ("--host", str, "localhost"),
        ("--port", int, 8000),
        ("--model", str, "BAAI/bge-m3"),
    ]
    for flag, value_type, default in options:
        cli.add_argument(flag, type=value_type, default=default)
    return cli.parse_args()
def main(args):
    """Send four prompts to the ``/pooling`` endpoint and print each output shape.

    The prompts are posted in a single request. Each entry of the response's
    ``data`` list is expected to carry its own ``data`` field, which is
    converted to a ``torch`` tensor whose shape is printed.

    Args:
        args: Parsed command-line options; ``host``, ``port`` and ``model``
            are read.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    api_url = f"http://{args.host}:{args.port}/pooling"
    model_name = args.model

    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    prompt = {"model": model_name, "input": prompts}

    pooling_response = post_http_request(prompt=prompt, api_url=api_url)
    # Surface server-side failures as an HTTP error instead of a confusing
    # KeyError on "data" below.
    pooling_response.raise_for_status()

    for output in pooling_response.json()["data"]:
        multi_vector = torch.tensor(output["data"])
        print(multi_vector.shape)


if __name__ == "__main__":
    args = parse_args()
    main(args)
examples/online_serving/pooling/ner.py
→
examples/online_serving/pooling/ner
_client
.py
View file @
006693ed
File moved
examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py
View file @
006693ed
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
# ruff: noqa: E501
"""Example Python client for multimodal embedding API using vLLM API server
"""Example Python client for multimodal embedding API using vLLM API server.
NOTE:
start a supported multimodal embeddings model server with `vllm serve`, e.g.
Refer to each `run_*` function for the command to run the server for that model.
vllm serve TIGER-Lab/VLM2Vec-Full --runner pooling --trust_remote_code --max_model_len=1024
"""
"""
import
argparse
import
argparse
import
base64
import
base64
import
io
import
io
from
typing
import
Literal
import
requests
from
openai
import
OpenAI
from
openai._types
import
NOT_GIVEN
,
NotGiven
from
openai.types.chat
import
ChatCompletionMessageParam
from
openai.types.create_embedding_response
import
CreateEmbeddingResponse
from
PIL
import
Image
from
PIL
import
Image
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key
=
"EMPTY"
openai_api_base
=
"http://localhost:8000/v1"
image_url
=
"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
image_url
=
"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
def
vlm2vec
():
def
create_chat_embeddings
(
response
=
requests
.
post
(
client
:
OpenAI
,
"http://localhost:8000/v1/embeddings"
,
*
,
json
=
{
messages
:
list
[
ChatCompletionMessageParam
],
"model"
:
"TIGER-Lab/VLM2Vec-Full"
,
model
:
str
,
"messages"
:
[
encoding_format
:
Literal
[
"base64"
,
"float"
]
|
NotGiven
=
NOT_GIVEN
,
{
)
->
CreateEmbeddingResponse
:
"role"
:
"user"
,
"""
"content"
:
[
Convenience function for accessing vLLM's Chat Embeddings API,
{
"type"
:
"image_url"
,
"image_url"
:
{
"url"
:
image_url
}},
which is an extension of OpenAI's existing Embeddings API.
{
"type"
:
"text"
,
"text"
:
"Represent the given image."
},
"""
],
return
client
.
post
(
}
"/embeddings"
,
],
cast_to
=
CreateEmbeddingResponse
,
"encoding_format"
:
"float"
,
body
=
{
"messages"
:
messages
,
"model"
:
model
,
"encoding_format"
:
encoding_format
},
},
)
)
response
.
raise_for_status
()
response_json
=
response
.
json
()
print
(
"Embedding output:"
,
response_json
[
"data"
][
0
][
"embedding"
])
def run_clip(client: OpenAI, model: str):
    """Embed one image and one text prompt and print both embeddings.

    Start the server using:

        vllm serve openai/clip-vit-base-patch32 --runner pooling

    Args:
        client: OpenAI client pointed at the vLLM server.
        model: Model identifier to send with each request.
    """
    # Image-only request.
    image_messages = [
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": image_url}},
            ],
        }
    ]
    image_response = create_chat_embeddings(
        client,
        messages=image_messages,
        model=model,
        encoding_format="float",
    )
    print("Image embedding output:", image_response.data[0].embedding)

    # Text-only request.
    text_messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "a photo of a cat"},
            ],
        }
    ]
    text_response = create_chat_embeddings(
        client,
        messages=text_messages,
        model=model,
        encoding_format="float",
    )
    print("Text embedding output:", text_response.data[0].embedding)
def
run_dse_qwen2_vl
(
client
:
OpenAI
,
model
:
str
):
"""
Start the server using:
def
dse_qwen2_vl
(
inp
:
dict
):
vllm serve MrLight/dse-qwen2-2b-mrl-v1
\
# Embedding an Image
--runner pooling
\
if
inp
[
"type"
]
==
"image"
:
--trust-remote-code
\
messages
=
[
--max-model-len 8192
\
--chat-template examples/template_dse_qwen2_vl.jinja
"""
response
=
create_chat_embeddings
(
client
,
messages
=
[
{
{
"role"
:
"user"
,
"role"
:
"user"
,
"content"
:
[
"content"
:
[
{
{
"type"
:
"image_url"
,
"type"
:
"image_url"
,
"image_url"
:
{
"image_url"
:
{
"url"
:
inp
[
"
image_url
"
]
,
"url"
:
image_url
,
},
},
},
},
{
"type"
:
"text"
,
"text"
:
"What is shown in this image?"
},
{
"type"
:
"text"
,
"text"
:
"What is shown in this image?"
},
],
],
}
}
]
],
# Embedding a Text Query
model
=
model
,
else
:
encoding_format
=
"float"
,
# MrLight/dse-qwen2-2b-mrl-v1 requires a placeholder image
)
# of the minimum input size
buffer
=
io
.
BytesIO
()
print
(
"Image embedding output:"
,
response
.
data
[
0
].
embedding
)
image_placeholder
=
Image
.
new
(
"RGB"
,
(
56
,
56
))
image_placeholder
.
save
(
buffer
,
"png"
)
# MrLight/dse-qwen2-2b-mrl-v1 requires a placeholder image
buffer
.
seek
(
0
)
# of the minimum input size
image_placeholder
=
base64
.
b64encode
(
buffer
.
read
()).
decode
(
"utf-8"
)
buffer
=
io
.
BytesIO
()
messages
=
[
image_placeholder
=
Image
.
new
(
"RGB"
,
(
56
,
56
))
image_placeholder
.
save
(
buffer
,
"png"
)
buffer
.
seek
(
0
)
image_placeholder
=
base64
.
b64encode
(
buffer
.
read
()).
decode
(
"utf-8"
)
response
=
create_chat_embeddings
(
client
,
messages
=
[
{
{
"role"
:
"user"
,
"role"
:
"user"
,
"content"
:
[
"content"
:
[
...
@@ -76,23 +134,129 @@ def dse_qwen2_vl(inp: dict):
...
@@ -76,23 +134,129 @@ def dse_qwen2_vl(inp: dict):
"url"
:
f
"data:image/jpeg;base64,
{
image_placeholder
}
"
,
"url"
:
f
"data:image/jpeg;base64,
{
image_placeholder
}
"
,
},
},
},
},
{
"type"
:
"text"
,
"text"
:
f
"Query:
{
inp
[
'content'
]
}
"
},
{
"type"
:
"text"
,
"text"
:
"Query: What is the weather like today?"
},
],
}
],
model
=
model
,
encoding_format
=
"float"
,
)
print
(
"Text embedding output:"
,
response
.
data
[
0
].
embedding
)
def
run_siglip
(
client
:
OpenAI
,
model
:
str
):
"""
Start the server using:
vllm serve google/siglip-base-patch16-224
\
--runner pooling
"""
response
=
create_chat_embeddings
(
client
,
messages
=
[
{
"role"
:
"user"
,
"content"
:
[
{
"type"
:
"image_url"
,
"image_url"
:
{
"url"
:
image_url
}},
],
],
}
}
]
],
model
=
model
,
response
=
requests
.
post
(
encoding_format
=
"float"
,
"http://localhost:8000/v1/embeddings"
,
json
=
{
"model"
:
"MrLight/dse-qwen2-2b-mrl-v1"
,
"messages"
:
messages
,
"encoding_format"
:
"float"
,
},
)
)
response
.
raise_for_status
()
response_json
=
response
.
json
()
print
(
"Embedding output:"
,
response_json
[
"data"
][
0
][
"embedding"
])
print
(
"Image embedding output:"
,
response
.
data
[
0
].
embedding
)
response
=
create_chat_embeddings
(
client
,
messages
=
[
{
"role"
:
"user"
,
"content"
:
[
{
"type"
:
"text"
,
"text"
:
"a photo of a cat"
},
],
}
],
model
=
model
,
encoding_format
=
"float"
,
)
print
(
"Text embedding output:"
,
response
.
data
[
0
].
embedding
)
def run_vlm2vec(client: OpenAI, model: str):
    r"""Embed an image, an image with a question, and plain text; print each.

    Start the server using:

        vllm serve TIGER-Lab/VLM2Vec-Full \
            --runner pooling \
            --trust-remote-code \
            --max-model-len 4096 \
            --chat-template examples/template_vlm2vec_phi3v.jinja

    Args:
        client: OpenAI client pointed at the vLLM server.
        model: Model identifier to send with each request.
    """
    # (label printed before the embedding, content parts of the user message),
    # issued strictly in this order.
    requests_to_send = [
        (
            "Image embedding output:",
            [
                {"type": "image_url", "image_url": {"url": image_url}},
                {"type": "text", "text": "Represent the given image."},
            ],
        ),
        (
            "Image+Text embedding output:",
            [
                {"type": "image_url", "image_url": {"url": image_url}},
                {
                    "type": "text",
                    "text": "Represent the given image with the following question: What is in the image.",
                },
            ],
        ),
        (
            "Text embedding output:",
            [
                {"type": "text", "text": "A cat and a dog"},
            ],
        ),
    ]

    for label, content in requests_to_send:
        response = create_chat_embeddings(
            client,
            messages=[{"role": "user", "content": content}],
            model=model,
            encoding_format="float",
        )
        print(label, response.data[0].embedding)
# Maps each accepted --model choice to the function that runs its example.
model_example_map = dict(
    clip=run_clip,
    dse_qwen2_vl=run_dse_qwen2_vl,
    siglip=run_siglip,
    vlm2vec=run_vlm2vec,
)
def
parse_args
():
def
parse_args
():
...
@@ -103,29 +267,24 @@ def parse_args():
...
@@ -103,29 +267,24 @@ def parse_args():
parser
.
add_argument
(
parser
.
add_argument
(
"--model"
,
"--model"
,
type
=
str
,
type
=
str
,
choices
=
[
"vlm2vec"
,
"dse_qwen2_vl"
]
,
choices
=
model_example_map
.
keys
()
,
required
=
True
,
required
=
True
,
help
=
"
Which model to cal
l."
,
help
=
"
The name of the embedding mode
l."
,
)
)
return
parser
.
parse_args
()
return
parser
.
parse_args
()
def
main
(
args
):
def
main
(
args
):
if
args
.
model
==
"vlm2vec"
:
client
=
OpenAI
(
vlm2vec
()
# defaults to os.environ.get("OPENAI_API_KEY")
elif
args
.
model
==
"dse_qwen2_vl"
:
api_key
=
openai_api_key
,
dse_qwen2_vl
(
base_url
=
openai_api_base
,
{
)
"type"
:
"image"
,
"image_url"
:
image_url
,
models
=
client
.
models
.
list
()
}
model_id
=
models
.
data
[
0
].
id
)
dse_qwen2_vl
(
model_example_map
[
args
.
model
](
client
,
model_id
)
{
"type"
:
"text"
,
"content"
:
"What is the weather like today?"
,
}
)
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
...
...
examples/online_serving/openai_cross_encoder_score.py
→
examples/online_serving/
pooling/
openai_cross_encoder_score.py
View file @
006693ed
File moved
examples/online_serving/openai_cross_encoder_score_for_multimodal.py
→
examples/online_serving/
pooling/
openai_cross_encoder_score_for_multimodal.py
View file @
006693ed
File moved
examples/online_serving/prithvi_geospatial_mae.py
→
examples/online_serving/
pooling/
prithvi_geospatial_mae.py
View file @
006693ed
...
@@ -11,14 +11,15 @@ import requests
...
@@ -11,14 +11,15 @@ import requests
# image as input, process it using the multimodal data processor, and
# image as input, process it using the multimodal data processor, and
# perform inference.
# perform inference.
# Requirements :
# Requirements :
# - install
plugin at
:
# - install
TerraTorch v1.1 (or later)
:
#
https://github.com/christian-pinto/prithvi_io_processor_plugin
#
pip install terratorch>=v1.1
# - start vllm in serving mode with the below args
# - start vllm in serving mode with the below args
# --model='christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM'
# --model='christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM'
# --model-impl terratorch
# --model-impl terratorch
# --task embed --trust-remote-code
# --task embed --trust-remote-code
# --skip-tokenizer-init --enforce-eager
# --skip-tokenizer-init --enforce-eager
# --io-processor-plugin prithvi_to_tiff
# --io-processor-plugin terratorch_segmentation
# --enable-mm-embeds
def
main
():
def
main
():
...
@@ -34,7 +35,6 @@ def main():
...
@@ -34,7 +35,6 @@ def main():
},
},
"priority"
:
0
,
"priority"
:
0
,
"model"
:
"christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM"
,
"model"
:
"christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM"
,
"softmax"
:
False
,
}
}
ret
=
requests
.
post
(
server_endpoint
,
json
=
request_payload_url
)
ret
=
requests
.
post
(
server_endpoint
,
json
=
request_payload_url
)
...
...
examples/online_serving/prometheus_grafana/grafana.json
View file @
006693ed
...
@@ -852,7 +852,7 @@
...
@@ -852,7 +852,7 @@
"uid"
:
"${DS_PROMETHEUS}"
"uid"
:
"${DS_PROMETHEUS}"
},
},
"editorMode"
:
"code"
,
"editorMode"
:
"code"
,
"expr"
:
"vllm:
gpu
_cache_usage_perc{model_name=
\"
$model_name
\"
}"
,
"expr"
:
"vllm:
kv
_cache_usage_perc{model_name=
\"
$model_name
\"
}"
,
"instant"
:
false
,
"instant"
:
false
,
"legendFormat"
:
"GPU Cache Usage"
,
"legendFormat"
:
"GPU Cache Usage"
,
"range"
:
true
,
"range"
:
true
,
...
...
examples/online_serving/ray_serve_deepseek.py
View file @
006693ed
...
@@ -36,7 +36,6 @@ llm_config = LLMConfig(
...
@@ -36,7 +36,6 @@ llm_config = LLMConfig(
},
},
# Set to the node's accelerator type.
# Set to the node's accelerator type.
accelerator_type
=
"H100"
,
accelerator_type
=
"H100"
,
runtime_env
=
{
"env_vars"
:
{
"VLLM_USE_V1"
:
"1"
}},
# Customize engine arguments as required (for example, vLLM engine kwargs).
# Customize engine arguments as required (for example, vLLM engine kwargs).
engine_kwargs
=
{
engine_kwargs
=
{
"tensor_parallel_size"
:
8
,
"tensor_parallel_size"
:
8
,
...
...
examples/online_serving/run_cluster.sh
View file @
006693ed
...
@@ -83,6 +83,29 @@ else
...
@@ -83,6 +83,29 @@ else
RAY_START_CMD+
=
" --address=
${
HEAD_NODE_ADDRESS
}
:6379"
RAY_START_CMD+
=
" --address=
${
HEAD_NODE_ADDRESS
}
:6379"
fi
fi
# Parse VLLM_HOST_IP from additional args if present.
# This is needed for multi-NIC configurations where Ray needs explicit IP bindings.
VLLM_HOST_IP
=
""
for
arg
in
"
${
ADDITIONAL_ARGS
[@]
}
"
;
do
if
[[
$arg
==
"-e"
]]
;
then
continue
fi
if
[[
$arg
==
VLLM_HOST_IP
=
*
]]
;
then
VLLM_HOST_IP
=
"
${
arg
#VLLM_HOST_IP=
}
"
break
fi
done
# Build Ray IP environment variables if VLLM_HOST_IP is set.
# These variables ensure Ray binds to the correct network interface on multi-NIC systems.
RAY_IP_VARS
=()
if
[
-n
"
${
VLLM_HOST_IP
}
"
]
;
then
RAY_IP_VARS
=(
-e
"RAY_NODE_IP_ADDRESS=
${
VLLM_HOST_IP
}
"
-e
"RAY_OVERRIDE_NODE_IP_ADDRESS=
${
VLLM_HOST_IP
}
"
)
fi
# Launch the container with the assembled parameters.
# Launch the container with the assembled parameters.
# --network host: Allows Ray nodes to communicate directly via host networking
# --network host: Allows Ray nodes to communicate directly via host networking
# --shm-size 10.24g: Increases shared memory
# --shm-size 10.24g: Increases shared memory
...
@@ -95,5 +118,6 @@ docker run \
...
@@ -95,5 +118,6 @@ docker run \
--shm-size
10.24g
\
--shm-size
10.24g
\
--gpus
all
\
--gpus
all
\
-v
"
${
PATH_TO_HF_HOME
}
:/root/.cache/huggingface"
\
-v
"
${
PATH_TO_HF_HOME
}
:/root/.cache/huggingface"
\
"
${
RAY_IP_VARS
[@]
}
"
\
"
${
ADDITIONAL_ARGS
[@]
}
"
\
"
${
ADDITIONAL_ARGS
[@]
}
"
\
"
${
DOCKER_IMAGE
}
"
-c
"
${
RAY_START_CMD
}
"
"
${
DOCKER_IMAGE
}
"
-c
"
${
RAY_START_CMD
}
"
examples/online_serving/sagemaker-entrypoint.sh
View file @
006693ed
...
@@ -21,4 +21,4 @@ while IFS='=' read -r key value; do
...
@@ -21,4 +21,4 @@ while IFS='=' read -r key value; do
done
< <
(
env
|
grep
"^
${
PREFIX
}
"
)
done
< <
(
env
|
grep
"^
${
PREFIX
}
"
)
# Pass the collected arguments to the main entrypoint
# Pass the collected arguments to the main entrypoint
exec
python3
-m
vllm.entrypoints.openai.api_server
"
${
ARGS
[@]
}
"
exec
vllm serve
"
${
ARGS
[@]
}
"
\ No newline at end of file
\ No newline at end of file
examples/online_serving/streamlit_openai_chatbot_webserver.py
View file @
006693ed
...
@@ -159,8 +159,8 @@ def get_llm_response(messages, model, reason, content_ph=None, reasoning_ph=None
...
@@ -159,8 +159,8 @@ def get_llm_response(messages, model, reason, content_ph=None, reasoning_ph=None
for
chunk
in
response
:
for
chunk
in
response
:
delta
=
chunk
.
choices
[
0
].
delta
delta
=
chunk
.
choices
[
0
].
delta
# Stream reasoning first
# Stream reasoning first
if
reason
and
hasattr
(
delta
,
"reasoning
_content
"
)
and
live_think
:
if
reason
and
hasattr
(
delta
,
"reasoning"
)
and
live_think
:
rc
=
delta
.
reasoning
_content
rc
=
delta
.
reasoning
if
rc
:
if
rc
:
think_text
+=
rc
think_text
+=
rc
live_think
.
markdown
(
think_text
+
"▌"
)
live_think
.
markdown
(
think_text
+
"▌"
)
...
@@ -262,8 +262,8 @@ def server_supports_reasoning():
...
@@ -262,8 +262,8 @@ def server_supports_reasoning():
messages
=
[{
"role"
:
"user"
,
"content"
:
"Hi"
}],
messages
=
[{
"role"
:
"user"
,
"content"
:
"Hi"
}],
stream
=
False
,
stream
=
False
,
)
)
return
hasattr
(
resp
.
choices
[
0
].
message
,
"reasoning
_content
"
)
and
bool
(
return
hasattr
(
resp
.
choices
[
0
].
message
,
"reasoning"
)
and
bool
(
resp
.
choices
[
0
].
message
.
reasoning
_content
resp
.
choices
[
0
].
message
.
reasoning
)
)
...
...
examples/online_serving/structured_outputs/README.md
View file @
006693ed
...
@@ -21,7 +21,7 @@ If you want to run this script standalone with `uv`, you can use the following:
...
@@ -21,7 +21,7 @@ If you want to run this script standalone with `uv`, you can use the following:
```
bash
```
bash
uvx
--from
git+https://github.com/vllm-project/vllm#subdirectory
=
examples/online_serving/structured_outputs
\
uvx
--from
git+https://github.com/vllm-project/vllm#subdirectory
=
examples/online_serving/structured_outputs
\
structured-output
structured-output
s
```
```
See
[
feature docs
](
https://docs.vllm.ai/en/latest/features/structured_outputs.html
)
for more information.
See
[
feature docs
](
https://docs.vllm.ai/en/latest/features/structured_outputs.html
)
for more information.
...
...
examples/online_serving/structured_outputs/pyproject.toml
View file @
006693ed
[project]
[project]
name
=
"examples-online-structured-outputs"
name
=
"examples-online-structured-outputs"
requires-python
=
">=3.
9
, <3.1
3
"
requires-python
=
">=3.
10
, <3.1
4
"
dependencies
=
[
"openai==1.78.1"
,
"pydantic==2.11.4"
]
dependencies
=
[
"openai==1.78.1"
,
"pydantic==2.11.4"
]
version
=
"0.0.0"
version
=
"0.0.0"
...
...
examples/online_serving/structured_outputs/structured_outputs.py
View file @
006693ed
# ruff: noqa: E501
# ruff: noqa: E501
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
__future__
import
annotations
import
argparse
import
argparse
import
asyncio
import
asyncio
import
enum
import
enum
import
os
import
os
from
typing
import
TYPE_CHECKING
,
Any
,
Literal
from
typing
import
Any
,
Literal
import
openai
import
openai
import
pydantic
import
pydantic
from
openai.types.chat
import
ChatCompletionChunk
if
TYPE_CHECKING
:
from
openai.types.chat
import
ChatCompletionChunk
ConstraintsFormat
=
Literal
[
ConstraintsFormat
=
Literal
[
"choice"
,
"choice"
,
...
@@ -39,7 +33,7 @@ async def print_stream_response(
...
@@ -39,7 +33,7 @@ async def print_stream_response(
async
for
chunk
in
stream_response
:
async
for
chunk
in
stream_response
:
delta
=
chunk
.
choices
[
0
].
delta
delta
=
chunk
.
choices
[
0
].
delta
reasoning_chunk_text
:
str
|
None
=
getattr
(
delta
,
"reasoning
_content
"
,
None
)
reasoning_chunk_text
:
str
|
None
=
getattr
(
delta
,
"reasoning"
,
None
)
content_chunk_text
=
delta
.
content
content_chunk_text
=
delta
.
content
if
args
.
reasoning
:
if
args
.
reasoning
:
...
@@ -261,8 +255,8 @@ async def cli():
...
@@ -261,8 +255,8 @@ async def cli():
for
constraint
,
response
in
zip
(
constraints
,
results
):
for
constraint
,
response
in
zip
(
constraints
,
results
):
print
(
f
"
\n\n
{
constraint
}
:"
)
print
(
f
"
\n\n
{
constraint
}
:"
)
message
=
response
.
choices
[
0
].
message
message
=
response
.
choices
[
0
].
message
if
args
.
reasoning
and
hasattr
(
message
,
"reasoning
_content
"
):
if
args
.
reasoning
and
hasattr
(
message
,
"reasoning"
):
print
(
f
" Reasoning:
{
message
.
reasoning
_content
or
''
}
"
)
print
(
f
" Reasoning:
{
message
.
reasoning
or
''
}
"
)
print
(
f
" Content:
{
message
.
content
!
r
}
"
)
print
(
f
" Content:
{
message
.
content
!
r
}
"
)
...
...
examples/online_serving/token_generation_client.py
0 → 100644
View file @
006693ed
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
httpx
from
transformers
import
AutoTokenizer
# URL that main() posts the generation request to; also used as the
# client's base_url below.
GEN_ENDPOINT = "http://localhost:8000/inference/v1/generate"
# Placeholder bearer token sent in the Authorization header.
DUMMY_API_KEY = "empty"
# Used both to load the tokenizer and as the "model" field of the request.
MODEL_NAME = "Qwen/Qwen3-0.6B"

# Module-level HTTP client shared by main(); created at import time.
transport = httpx.HTTPTransport()
headers = {"Authorization": f"Bearer {DUMMY_API_KEY}"}
client = httpx.Client(
    transport=transport,
    base_url=GEN_ENDPOINT,
    timeout=600,
    headers=headers,
)

# Chat conversation that main() renders to token ids with the chat template.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "How many countries are in the EU?"},
]
def main(client):
    """Tokenize the chat locally, request generated token ids, and decode them.

    The module-level ``messages`` are rendered to token ids with the model's
    chat template, posted to ``GEN_ENDPOINT`` with detokenization disabled,
    and the token ids of the first returned choice are decoded locally.

    Args:
        client: HTTP client used to post the request.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    prompt_token_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        enable_thinking=False,
    )

    sampling_params = {"max_tokens": 24, "temperature": 0.2, "detokenize": False}
    payload = {
        "model": MODEL_NAME,
        "token_ids": prompt_token_ids,
        "sampling_params": sampling_params,
        "stream": False,
    }

    resp = client.post(GEN_ENDPOINT, json=payload)
    resp.raise_for_status()
    data = resp.json()

    separator = "-" * 50
    print(data)
    print(separator)
    print("Token generation results:")
    # The server was asked not to detokenize, so decode the ids here.
    print(tokenizer.decode(data["choices"][0]["token_ids"]))
    print(separator)


if __name__ == "__main__":
    main(client)
examples/others/lmcache/cpu_offload_lmcache.py
View file @
006693ed
...
@@ -37,7 +37,7 @@ from vllm.config import KVTransferConfig
...
@@ -37,7 +37,7 @@ from vllm.config import KVTransferConfig
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.engine.arg_utils
import
EngineArgs
def
setup_environment_variables
(
vllm_version
:
str
):
def
setup_environment_variables
():
# LMCache-related environment variables
# LMCache-related environment variables
# Use experimental features in LMCache
# Use experimental features in LMCache
os
.
environ
[
"LMCACHE_USE_EXPERIMENTAL"
]
=
"True"
os
.
environ
[
"LMCACHE_USE_EXPERIMENTAL"
]
=
"True"
...
@@ -47,12 +47,10 @@ def setup_environment_variables(vllm_version: str):
...
@@ -47,12 +47,10 @@ def setup_environment_variables(vllm_version: str):
os
.
environ
[
"LMCACHE_LOCAL_CPU"
]
=
"True"
os
.
environ
[
"LMCACHE_LOCAL_CPU"
]
=
"True"
# Set local CPU memory limit to 5.0 GB
# Set local CPU memory limit to 5.0 GB
os
.
environ
[
"LMCACHE_MAX_LOCAL_CPU_SIZE"
]
=
"5.0"
os
.
environ
[
"LMCACHE_MAX_LOCAL_CPU_SIZE"
]
=
"5.0"
if
vllm_version
==
"v0"
:
os
.
environ
[
"VLLM_USE_V1"
]
=
"0"
@
contextlib
.
contextmanager
@
contextlib
.
contextmanager
def
build_llm_with_lmcache
(
lmcache_connector
:
str
,
model
:
str
,
vllm_version
:
str
):
def
build_llm_with_lmcache
(
lmcache_connector
:
str
,
model
:
str
):
ktc
=
KVTransferConfig
(
ktc
=
KVTransferConfig
(
kv_connector
=
lmcache_connector
,
kv_connector
=
lmcache_connector
,
kv_role
=
"kv_both"
,
kv_role
=
"kv_both"
,
...
@@ -60,21 +58,12 @@ def build_llm_with_lmcache(lmcache_connector: str, model: str, vllm_version: str
...
@@ -60,21 +58,12 @@ def build_llm_with_lmcache(lmcache_connector: str, model: str, vllm_version: str
# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
# memory. Reduce the value if your GPU has less memory.
# memory. Reduce the value if your GPU has less memory.
# Note: LMCache supports chunked prefill (see vLLM#14505, LMCache#392).
# Note: LMCache supports chunked prefill (see vLLM#14505, LMCache#392).
if
vllm_version
==
"v0"
:
llm_args
=
EngineArgs
(
llm_args
=
EngineArgs
(
model
=
model
,
model
=
model
,
kv_transfer_config
=
ktc
,
kv_transfer_config
=
ktc
,
max_model_len
=
8000
,
max_model_len
=
8000
,
gpu_memory_utilization
=
0.8
,
gpu_memory_utilization
=
0.8
,
)
enable_chunked_prefill
=
True
,
# Only in v0
)
else
:
llm_args
=
EngineArgs
(
model
=
model
,
kv_transfer_config
=
ktc
,
max_model_len
=
8000
,
gpu_memory_utilization
=
0.8
,
)
llm
=
LLM
(
**
asdict
(
llm_args
))
llm
=
LLM
(
**
asdict
(
llm_args
))
try
:
try
:
...
@@ -116,18 +105,10 @@ def parse_args():
...
@@ -116,18 +105,10 @@ def parse_args():
def
main
():
def
main
():
args
=
parse_args
()
lmcache_connector
=
"LMCacheConnectorV1"
model
=
"meta-llama/Meta-Llama-3.1-8B-Instruct"
if
args
.
version
==
"v0"
:
setup_environment_variables
()
lmcache_connector
=
"LMCacheConnector"
with
build_llm_with_lmcache
(
lmcache_connector
,
model
)
as
llm
:
model
=
"mistralai/Mistral-7B-Instruct-v0.2"
else
:
lmcache_connector
=
"LMCacheConnectorV1"
model
=
"meta-llama/Meta-Llama-3.1-8B-Instruct"
setup_environment_variables
(
args
.
version
)
with
build_llm_with_lmcache
(
lmcache_connector
,
model
,
args
.
version
)
as
llm
:
# This example script runs two requests with a shared prefix.
# This example script runs two requests with a shared prefix.
# Define the shared prompt and specific prompts
# Define the shared prompt and specific prompts
shared_prompt
=
"Hello, how are you?"
*
1000
shared_prompt
=
"Hello, how are you?"
*
1000
...
...
examples/others/tensorize_vllm_model.py
View file @
006693ed
...
@@ -16,13 +16,11 @@ from vllm.model_executor.model_loader.tensorizer import (
...
@@ -16,13 +16,11 @@ from vllm.model_executor.model_loader.tensorizer import (
tensorize_vllm_model
,
tensorize_vllm_model
,
tensorizer_kwargs_arg
,
tensorizer_kwargs_arg
,
)
)
from
vllm.utils
import
FlexibleArgumentParser
from
vllm.utils
.argparse_utils
import
FlexibleArgumentParser
logger
=
logging
.
getLogger
()
logger
=
logging
.
getLogger
()
# yapf conflicts with isort for this docstring
# yapf: disable
"""
"""
tensorize_vllm_model.py is a script that can be used to serialize and
tensorize_vllm_model.py is a script that can be used to serialize and
deserialize vLLM models. These models can be loaded using tensorizer
deserialize vLLM models. These models can be loaded using tensorizer
...
@@ -86,7 +84,7 @@ directly to load models:
...
@@ -86,7 +84,7 @@ directly to load models:
from vllm import LLM
from vllm import LLM
llm = LLM(
llm = LLM(
"s3://my-bucket/vllm/facebook/opt-125m/v1",
"s3://my-bucket/vllm/facebook/opt-125m/v1",
load_format="tensorizer"
load_format="tensorizer"
,
)
)
```
```
...
@@ -132,7 +130,8 @@ def get_parser():
...
@@ -132,7 +130,8 @@ def get_parser():
"can be loaded using tensorizer directly to the GPU "
"can be loaded using tensorizer directly to the GPU "
"extremely quickly. Tensor encryption and decryption is "
"extremely quickly. Tensor encryption and decryption is "
"also supported, although libsodium must be installed to "
"also supported, although libsodium must be installed to "
"use it."
)
"use it."
)
parser
=
EngineArgs
.
add_cli_args
(
parser
)
parser
=
EngineArgs
.
add_cli_args
(
parser
)
parser
.
add_argument
(
parser
.
add_argument
(
...
@@ -144,13 +143,14 @@ def get_parser():
...
@@ -144,13 +143,14 @@ def get_parser():
"along with the model by instantiating a TensorizerConfig object, "
"along with the model by instantiating a TensorizerConfig object, "
"creating a dict from it with TensorizerConfig.to_serializable(), "
"creating a dict from it with TensorizerConfig.to_serializable(), "
"and passing it to LoRARequest's initializer with the kwarg "
"and passing it to LoRARequest's initializer with the kwarg "
"tensorizer_config_dict."
"tensorizer_config_dict."
,
)
)
subparsers
=
parser
.
add_subparsers
(
dest
=
'
command
'
,
required
=
True
)
subparsers
=
parser
.
add_subparsers
(
dest
=
"
command
"
,
required
=
True
)
serialize_parser
=
subparsers
.
add_parser
(
serialize_parser
=
subparsers
.
add_parser
(
'serialize'
,
help
=
"Serialize a model to `--serialized-directory`"
)
"serialize"
,
help
=
"Serialize a model to `--serialized-directory`"
)
serialize_parser
.
add_argument
(
serialize_parser
.
add_argument
(
"--suffix"
,
"--suffix"
,
...
@@ -163,7 +163,9 @@ def get_parser():
...
@@ -163,7 +163,9 @@ def get_parser():
"`--suffix` is `v1`, the serialized model tensors will be "
"`--suffix` is `v1`, the serialized model tensors will be "
"saved to "
"saved to "
"`s3://my-bucket/vllm/EleutherAI/gpt-j-6B/v1/model.tensors`. "
"`s3://my-bucket/vllm/EleutherAI/gpt-j-6B/v1/model.tensors`. "
"If none is provided, a random UUID will be used."
))
"If none is provided, a random UUID will be used."
),
)
serialize_parser
.
add_argument
(
serialize_parser
.
add_argument
(
"--serialized-directory"
,
"--serialized-directory"
,
type
=
str
,
type
=
str
,
...
@@ -175,108 +177,127 @@ def get_parser():
...
@@ -175,108 +177,127 @@ def get_parser():
"and the model HuggingFace ID is `EleutherAI/gpt-j-6B`, tensors will "
"and the model HuggingFace ID is `EleutherAI/gpt-j-6B`, tensors will "
"be saved to `dir/vllm/EleutherAI/gpt-j-6B/suffix/model.tensors`, "
"be saved to `dir/vllm/EleutherAI/gpt-j-6B/suffix/model.tensors`, "
"where `suffix` is given by `--suffix` or a random UUID if not "
"where `suffix` is given by `--suffix` or a random UUID if not "
"provided."
)
"provided."
,
)
serialize_parser
.
add_argument
(
serialize_parser
.
add_argument
(
"--serialization-kwargs"
,
"--serialization-kwargs"
,
type
=
tensorizer_kwargs_arg
,
type
=
tensorizer_kwargs_arg
,
required
=
False
,
required
=
False
,
help
=
(
"A JSON string containing additional keyword arguments to "
help
=
(
"pass to Tensorizer's TensorSerializer during "
"A JSON string containing additional keyword arguments to "
"serialization."
))
"pass to Tensorizer's TensorSerializer during "
"serialization."
),
)
serialize_parser
.
add_argument
(
serialize_parser
.
add_argument
(
"--keyfile"
,
"--keyfile"
,
type
=
str
,
type
=
str
,
required
=
False
,
required
=
False
,
help
=
(
"Encrypt the model weights with a randomly-generated binary key,"
help
=
(
" and save the key at this path"
))
"Encrypt the model weights with a randomly-generated binary key,"
" and save the key at this path"
),
)
deserialize_parser
=
subparsers
.
add_parser
(
deserialize_parser
=
subparsers
.
add_parser
(
'deserialize'
,
"deserialize"
,
help
=
(
"Deserialize a model from `--path-to-tensors`"
help
=
(
" to verify it can be loaded and used."
))
"Deserialize a model from `--path-to-tensors`"
" to verify it can be loaded and used."
),
)
deserialize_parser
.
add_argument
(
deserialize_parser
.
add_argument
(
"--path-to-tensors"
,
"--path-to-tensors"
,
type
=
str
,
type
=
str
,
required
=
False
,
required
=
False
,
help
=
"The local path or S3 URI to the model tensors to deserialize. "
)
help
=
"The local path or S3 URI to the model tensors to deserialize. "
,
)
deserialize_parser
.
add_argument
(
deserialize_parser
.
add_argument
(
"--serialized-directory"
,
"--serialized-directory"
,
type
=
str
,
type
=
str
,
required
=
False
,
required
=
False
,
help
=
"Directory with model artifacts for loading. Assumes a "
help
=
"Directory with model artifacts for loading. Assumes a "
"model.tensors file exists therein. Can supersede "
"model.tensors file exists therein. Can supersede "
"--path-to-tensors."
)
"--path-to-tensors."
,
)
deserialize_parser
.
add_argument
(
deserialize_parser
.
add_argument
(
"--keyfile"
,
"--keyfile"
,
type
=
str
,
type
=
str
,
required
=
False
,
required
=
False
,
help
=
(
"Path to a binary key to use to decrypt the model weights,"
help
=
(
" if the model was serialized with encryption"
))
"Path to a binary key to use to decrypt the model weights,"
" if the model was serialized with encryption"
),
)
deserialize_parser
.
add_argument
(
deserialize_parser
.
add_argument
(
"--deserialization-kwargs"
,
"--deserialization-kwargs"
,
type
=
tensorizer_kwargs_arg
,
type
=
tensorizer_kwargs_arg
,
required
=
False
,
required
=
False
,
help
=
(
"A JSON string containing additional keyword arguments to "
help
=
(
"pass to Tensorizer's `TensorDeserializer` during "
"A JSON string containing additional keyword arguments to "
"deserialization."
))
"pass to Tensorizer's `TensorDeserializer` during "
"deserialization."
),
)
TensorizerArgs
.
add_cli_args
(
deserialize_parser
)
TensorizerArgs
.
add_cli_args
(
deserialize_parser
)
return
parser
return
parser
def
merge_extra_config_with_tensorizer_config
(
extra_cfg
:
dict
,
cfg
:
TensorizerConfig
):
def
merge_extra_config_with_tensorizer_config
(
extra_cfg
:
dict
,
cfg
:
TensorizerConfig
):
for
k
,
v
in
extra_cfg
.
items
():
for
k
,
v
in
extra_cfg
.
items
():
if
hasattr
(
cfg
,
k
):
if
hasattr
(
cfg
,
k
):
setattr
(
cfg
,
k
,
v
)
setattr
(
cfg
,
k
,
v
)
logger
.
info
(
logger
.
info
(
"Updating TensorizerConfig with %s from "
"Updating TensorizerConfig with %s from "
"--model-loader-extra-config provided"
,
k
"--model-loader-extra-config provided"
,
k
,
)
)
def
deserialize
(
args
,
tensorizer_config
):
def
deserialize
(
args
,
tensorizer_config
):
if
args
.
lora_path
:
if
args
.
lora_path
:
tensorizer_config
.
lora_dir
=
tensorizer_config
.
tensorizer_dir
tensorizer_config
.
lora_dir
=
tensorizer_config
.
tensorizer_dir
llm
=
LLM
(
model
=
args
.
model
,
llm
=
LLM
(
load_format
=
"tensorizer"
,
model
=
args
.
model
,
tensor_parallel_size
=
args
.
tensor_parallel_size
,
load_format
=
"tensorizer"
,
model_loader_extra_config
=
tensorizer_config
,
tensor_parallel_size
=
args
.
tensor_parallel_size
,
enable_lora
=
True
,
model_loader_extra_config
=
tensorizer_config
,
enable_lora
=
True
,
)
)
sampling_params
=
SamplingParams
(
sampling_params
=
SamplingParams
(
temperature
=
0
,
temperature
=
0
,
max_tokens
=
256
,
stop
=
[
"[/assistant]"
]
max_tokens
=
256
,
stop
=
[
"[/assistant]"
]
)
)
# Truncating this as the extra text isn't necessary
# Truncating this as the extra text isn't necessary
prompts
=
[
prompts
=
[
"[user] Write a SQL query to answer the question based on ..."
]
"[user] Write a SQL query to answer the question based on ..."
]
# Test LoRA load
# Test LoRA load
print
(
print
(
llm
.
generate
(
llm
.
generate
(
prompts
,
prompts
,
sampling_params
,
sampling_params
,
lora_request
=
LoRARequest
(
"sql-lora"
,
lora_request
=
LoRARequest
(
1
,
"sql-lora"
,
args
.
lora_path
,
1
,
tensorizer_config_dict
=
tensorizer_config
args
.
lora_path
,
.
to_serializable
())
tensorizer_config_dict
=
tensorizer_config
.
to_serializable
(),
),
)
)
)
)
else
:
else
:
llm
=
LLM
(
model
=
args
.
model
,
llm
=
LLM
(
load_format
=
"tensorizer"
,
model
=
args
.
model
,
tensor_parallel_size
=
args
.
tensor_parallel_size
,
load_format
=
"tensorizer"
,
model_loader_extra_config
=
tensorizer_config
tensor_parallel_size
=
args
.
tensor_parallel_size
,
model_loader_extra_config
=
tensorizer_config
,
)
)
return
llm
return
llm
...
@@ -285,17 +306,20 @@ def main():
...
@@ -285,17 +306,20 @@ def main():
parser
=
get_parser
()
parser
=
get_parser
()
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
s3_access_key_id
=
(
getattr
(
args
,
's3_access_key_id'
,
None
)
s3_access_key_id
=
getattr
(
args
,
"s3_access_key_id"
,
None
)
or
os
.
environ
.
get
(
or
os
.
environ
.
get
(
"S3_ACCESS_KEY_ID"
,
None
))
"S3_ACCESS_KEY_ID"
,
None
s3_secret_access_key
=
(
getattr
(
args
,
's3_secret_access_key'
,
None
)
)
or
os
.
environ
.
get
(
"S3_SECRET_ACCESS_KEY"
,
None
))
s3_secret_access_key
=
getattr
(
s3_endpoint
=
(
getattr
(
args
,
's3_endpoint'
,
None
)
args
,
"s3_secret_access_key"
,
None
or
os
.
environ
.
get
(
"S3_ENDPOINT_URL"
,
None
))
)
or
os
.
environ
.
get
(
"S3_SECRET_ACCESS_KEY"
,
None
)
s3_endpoint
=
getattr
(
args
,
"s3_endpoint"
,
None
)
or
os
.
environ
.
get
(
"S3_ENDPOINT_URL"
,
None
)
credentials
=
{
credentials
=
{
"s3_access_key_id"
:
s3_access_key_id
,
"s3_access_key_id"
:
s3_access_key_id
,
"s3_secret_access_key"
:
s3_secret_access_key
,
"s3_secret_access_key"
:
s3_secret_access_key
,
"s3_endpoint"
:
s3_endpoint
"s3_endpoint"
:
s3_endpoint
,
}
}
model_ref
=
args
.
model
model_ref
=
args
.
model
...
@@ -309,25 +333,25 @@ def main():
...
@@ -309,25 +333,25 @@ def main():
if
args
.
model_loader_extra_config
:
if
args
.
model_loader_extra_config
:
extra_config
=
json
.
loads
(
args
.
model_loader_extra_config
)
extra_config
=
json
.
loads
(
args
.
model_loader_extra_config
)
tensorizer_dir
=
args
.
serialized_directory
or
extra_config
.
get
(
"tensorizer_dir"
)
tensorizer_dir
=
(
args
.
serialized_directory
or
tensorizer_uri
=
getattr
(
args
,
"path_to_tensors"
,
None
)
or
extra_config
.
get
(
extra_config
.
get
(
"tensorizer_dir"
))
"tensorizer_uri"
tensorizer_uri
=
(
getattr
(
args
,
"path_to_tensors"
,
None
)
)
or
extra_config
.
get
(
"tensorizer_uri"
))
if
tensorizer_dir
and
tensorizer_uri
:
if
tensorizer_dir
and
tensorizer_uri
:
parser
.
error
(
"--serialized-directory and --path-to-tensors "
parser
.
error
(
"cannot both be provided"
)
"--serialized-directory and --path-to-tensors cannot both be provided"
)
if
not
tensorizer_dir
and
not
tensorizer_uri
:
if
not
tensorizer_dir
and
not
tensorizer_uri
:
parser
.
error
(
"Either --serialized-directory or --path-to-tensors "
parser
.
error
(
"
must be provided"
)
"Either --serialized-directory or --path-to-tensors
must be provided"
)
if
args
.
command
==
"serialize"
:
if
args
.
command
==
"serialize"
:
engine_args
=
EngineArgs
.
from_cli_args
(
args
)
engine_args
=
EngineArgs
.
from_cli_args
(
args
)
input_dir
=
tensorizer_dir
.
rstrip
(
'/'
)
input_dir
=
tensorizer_dir
.
rstrip
(
"/"
)
suffix
=
args
.
suffix
if
args
.
suffix
else
uuid
.
uuid4
().
hex
suffix
=
args
.
suffix
if
args
.
suffix
else
uuid
.
uuid4
().
hex
base_path
=
f
"
{
input_dir
}
/vllm/
{
model_ref
}
/
{
suffix
}
"
base_path
=
f
"
{
input_dir
}
/vllm/
{
model_ref
}
/
{
suffix
}
"
if
engine_args
.
tensor_parallel_size
>
1
:
if
engine_args
.
tensor_parallel_size
>
1
:
...
@@ -339,15 +363,14 @@ def main():
...
@@ -339,15 +363,14 @@ def main():
tensorizer_uri
=
model_path
,
tensorizer_uri
=
model_path
,
encryption_keyfile
=
keyfile
,
encryption_keyfile
=
keyfile
,
serialization_kwargs
=
args
.
serialization_kwargs
or
{},
serialization_kwargs
=
args
.
serialization_kwargs
or
{},
**
credentials
**
credentials
,
)
)
if
args
.
lora_path
:
if
args
.
lora_path
:
tensorizer_config
.
lora_dir
=
tensorizer_config
.
tensorizer_dir
tensorizer_config
.
lora_dir
=
tensorizer_config
.
tensorizer_dir
tensorize_lora_adapter
(
args
.
lora_path
,
tensorizer_config
)
tensorize_lora_adapter
(
args
.
lora_path
,
tensorizer_config
)
merge_extra_config_with_tensorizer_config
(
extra_config
,
merge_extra_config_with_tensorizer_config
(
extra_config
,
tensorizer_config
)
tensorizer_config
)
tensorize_vllm_model
(
engine_args
,
tensorizer_config
)
tensorize_vllm_model
(
engine_args
,
tensorizer_config
)
elif
args
.
command
==
"deserialize"
:
elif
args
.
command
==
"deserialize"
:
...
@@ -356,11 +379,10 @@ def main():
...
@@ -356,11 +379,10 @@ def main():
tensorizer_dir
=
args
.
serialized_directory
,
tensorizer_dir
=
args
.
serialized_directory
,
encryption_keyfile
=
keyfile
,
encryption_keyfile
=
keyfile
,
deserialization_kwargs
=
args
.
deserialization_kwargs
or
{},
deserialization_kwargs
=
args
.
deserialization_kwargs
or
{},
**
credentials
**
credentials
,
)
)
merge_extra_config_with_tensorizer_config
(
extra_config
,
merge_extra_config_with_tensorizer_config
(
extra_config
,
tensorizer_config
)
tensorizer_config
)
deserialize
(
args
,
tensorizer_config
)
deserialize
(
args
,
tensorizer_config
)
else
:
else
:
raise
ValueError
(
"Either serialize or deserialize must be specified."
)
raise
ValueError
(
"Either serialize or deserialize must be specified."
)
...
...
examples/pyproject.toml
deleted
100644 → 0
View file @
4b51e6f1
# This local pyproject file is part of the migration from yapf to ruff format.
# It uses the same core rules as the main pyproject.toml file, but with the
# following differences:
# - ruff line length is overridden to 88
# - deprecated typing ignores (UP006, UP035) have been removed
[tool.ruff]
line-length
=
88
exclude
=
[
# External file, leaving license intact
"examples/other/fp8/quantizer/quantize.py"
,
"vllm/vllm_flash_attn/flash_attn_interface.pyi"
]
[tool.ruff.lint.per-file-ignores]
"vllm/third_party/**"
=
["ALL"]
"vllm/version.py"
=
["F401"]
"vllm/_version.py"
=
["ALL"]
[tool.ruff.lint]
select
=
[
# pycodestyle
"E"
,
# Pyflakes
"F"
,
# pyupgrade
"UP"
,
# flake8-bugbear
"B"
,
# flake8-simplify
"SIM"
,
# isort
"I"
,
# flake8-logging-format
"G"
,
]
ignore
=
[
# star imports
"F405"
,
"F403"
,
# lambda expression assignment
"E731"
,
# Loop control variable not used within loop body
"B007"
,
# f-string format
"UP032"
,
# Can remove once 3.10+ is the minimum Python version
"UP007"
,
]
[tool.ruff.lint.isort]
known-first-party
=
["vllm"]
[tool.ruff.format]
docstring-code-format
=
true
\ No newline at end of file
Prev
1
…
21
22
23
24
25
26
27
28
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment