Commit 25f39502
authored Aug 16, 2025 by laibao

Update README.md: change the Docker image version and deep-learning library dependencies, and remove several example files to simplify the codebase.
parent 951558c2
Showing 20 changed files with 1805 additions and 378 deletions (+1805, -378)
examples/online_serving/structured_outputs/structured_outputs.py  (+272, -0)
examples/online_serving/utils.py  (+26, -0)
examples/openai_audio_api_client.py  (+0, -90)
examples/openai_chat_completion_client_with_tools.py  (+0, -162)
examples/openai_vision_api_client.py  (+0, -126)
examples/others/lmcache/README.md  (+56, -0)
examples/others/lmcache/cpu_offload_lmcache.py  (+153, -0)
examples/others/lmcache/disagg_prefill_lmcache_v0.py  (+144, -0)
examples/others/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-decoder-config.yaml  (+13, -0)
examples/others/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-prefiller-config.yaml  (+13, -0)
examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh  (+136, -0)
examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py  (+208, -0)
examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh  (+59, -0)
examples/others/lmcache/kv_cache_sharing_lmcache_v1.py  (+133, -0)
examples/others/logging_configuration.md  (+162, -0)
examples/others/tensorize_vllm_model.py  (+316, -0)
examples/pyproject.toml  (+54, -0)
examples/template_alpaca.jinja  (+29, -0)
examples/template_baichuan.jinja  (+13, -0)
examples/template_chatglm.jinja  (+18, -0)
examples/online_serving/structured_outputs/structured_outputs.py (new file, 0 → 100644)

# ruff: noqa: E501
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations

import argparse
import asyncio
import enum
import os
from typing import TYPE_CHECKING, Any, Literal

import openai
import pydantic

if TYPE_CHECKING:
    from openai.types.chat import ChatCompletionChunk

ConstraintsFormat = Literal[
    "choice",
    "regex",
    "json",
    "grammar",
    "structural_tag",
]


async def print_stream_response(
    stream_response: openai.AsyncStream[ChatCompletionChunk],
    title: str,
    args: argparse.Namespace,
):
    print(f"\n\n{title} (Streaming):")
    local_reasoning_header_printed = False
    local_content_header_printed = False
    async for chunk in stream_response:
        delta = chunk.choices[0].delta
        reasoning_chunk_text: str | None = getattr(delta, "reasoning_content", None)
        content_chunk_text = delta.content
        if args.reasoning:
            if reasoning_chunk_text:
                if not local_reasoning_header_printed:
                    print(" Reasoning: ", end="")
                    local_reasoning_header_printed = True
                print(reasoning_chunk_text, end="", flush=True)
            if content_chunk_text:
                if not local_content_header_printed:
                    if local_reasoning_header_printed:
                        print()
                    print(" Content: ", end="")
                    local_content_header_printed = True
                print(content_chunk_text, end="", flush=True)
        else:
            if content_chunk_text:
                if not local_content_header_printed:
                    print(" Content: ", end="")
                    local_content_header_printed = True
                print(content_chunk_text, end="", flush=True)
    print()


class CarType(str, enum.Enum):
    SEDAN = "SEDAN"
    SUV = "SUV"
    TRUCK = "TRUCK"
    COUPE = "COUPE"


class CarDescription(pydantic.BaseModel):
    brand: str
    model: str
    car_type: CarType


PARAMS: dict[ConstraintsFormat, dict[str, Any]] = {
    "choice": {
        "messages": [
            {
                "role": "user",
                "content": "Classify this sentiment: vLLM is wonderful!",
            }
        ],
        "extra_body": {"guided_choice": ["positive", "negative"]},
    },
    "regex": {
        "messages": [
            {
                "role": "user",
                "content": "Generate an email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: 'alan.turing@enigma.com\n'",
            }
        ],
        "extra_body": {
            "guided_regex": r"[a-z0-9.]{1,20}@\w{6,10}\.com\n",
        },
    },
    "json": {
        "messages": [
            {
                "role": "user",
                "content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's",
            }
        ],
        "response_format": {
            "type": "json_schema",
            "json_schema": {
                "name": "car-description",
                "schema": CarDescription.model_json_schema(),
            },
        },
    },
    "grammar": {
        "messages": [
            {
                "role": "user",
                "content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.",
            }
        ],
        "extra_body": {
            "guided_grammar": """
            root ::= select_statement
            select_statement ::= "SELECT " column " from " table " where " condition
            column ::= "col_1 " | "col_2 "
            table ::= "table_1 " | "table_2 "
            condition ::= column "= " number
            number ::= "1 " | "2 "
            """,
        },
    },
    "structural_tag": {
        "messages": [
            {
                "role": "user",
                "content": """
You have access to the following function to retrieve the weather in a city:

{
    "name": "get_weather",
    "parameters": {
        "city": {
            "param_type": "string",
            "description": "The city to get the weather for",
            "required": True
        }
    }
}

If a you choose to call a function ONLY reply in the following format:
<{start_tag}={function_name}>{parameters}{end_tag}
where

start_tag => `<function`
parameters => a JSON dict with the function argument name as key and function
argument value as value.
end_tag => `</function>`

Here is an example,
<function=example_function_name>{"example_name": "example_value"}</function>

Reminder:
- Function calls MUST follow the specified format
- Required parameters MUST be specified
- Only call one function at a time
- Put the entire function call reply on one line
- Always add your sources when using search results to answer the user query

You are a helpful assistant.

Given the previous instructions, what is the weather in New York City, Boston,
and San Francisco?""",
            },
        ],
        "response_format": {
            "type": "structural_tag",
            "structures": [
                {
                    "begin": "<function=get_weather>",
                    "schema": {
                        "type": "object",
                        "properties": {"city": {"type": "string"}},
                        "required": ["city"],
                    },
                    "end": "</function>",
                }
            ],
            "triggers": ["<function="],
        },
    },
}


async def cli():
    parser = argparse.ArgumentParser(
        description="Run OpenAI Chat Completion with various structured outputs capabilities",
    )
    _ = parser.add_argument(
        "--constraint",
        type=str,
        nargs="+",
        choices=[*list(PARAMS), "*"],
        default=["*"],
        help="Specify which constraint(s) to run.",
    )
    _ = parser.add_argument(
        "--stream",
        action=argparse.BooleanOptionalAction,
        default=False,
        help="Enable streaming output",
    )
    _ = parser.add_argument(
        "--reasoning",
        action=argparse.BooleanOptionalAction,
        default=False,
        help="Enable printing of reasoning traces if available.",
    )
    args = parser.parse_args()

    base_url = os.getenv("OPENAI_BASE_URL", "http://localhost:8000/v1")
    client = openai.AsyncOpenAI(base_url=base_url, api_key="EMPTY")

    constraints = list(PARAMS) if "*" in args.constraint else list(set(args.constraint))

    model = (await client.models.list()).data[0].id

    if args.stream:
        results = await asyncio.gather(
            *[
                client.chat.completions.create(
                    model=model,
                    max_tokens=1024,
                    stream=True,
                    **PARAMS[name],
                )
                for name in constraints
            ]
        )
        for constraint, stream in zip(constraints, results):
            await print_stream_response(stream, constraint, args)
    else:
        results = await asyncio.gather(
            *[
                client.chat.completions.create(
                    model=model,
                    max_tokens=1024,
                    stream=False,
                    **PARAMS[name],
                )
                for name in constraints
            ]
        )
        for constraint, response in zip(constraints, results):
            print(f"\n\n{constraint}:")
            message = response.choices[0].message
            if args.reasoning and hasattr(message, "reasoning_content"):
                print(f" Reasoning: {message.reasoning_content or ''}")
            print(f" Content: {message.content!r}")


def main():
    asyncio.run(cli())


if __name__ == "__main__":
    main()
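The script above fans every selected constraint out through one `asyncio.gather` call, which can obscure the mapping from a `PARAMS` entry to an actual API request. The following is an illustrative sketch, not part of this commit: it expands the `"choice"` entry into a single synchronous request, assuming a vLLM OpenAI-compatible server at the script's default base URL (`http://localhost:8000/v1`) and using the blocking `openai.OpenAI` client instead of `openai.AsyncOpenAI`.

```python
# Minimal sketch: one non-streaming guided-choice request, equivalent to PARAMS["choice"].
import openai

client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
model = client.models.list().data[0].id  # first served model, as in the script

completion = client.chat.completions.create(
    model=model,
    max_tokens=1024,
    messages=[{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}],
    # vLLM-specific constraint, carried through the OpenAI client's extra_body passthrough
    extra_body={"guided_choice": ["positive", "negative"]},
)
print(completion.choices[0].message.content)  # constrained to "positive" or "negative"
```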
examples/online_serving/utils.py (new file, 0 → 100644)

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from openai import APIConnectionError, OpenAI
from openai.pagination import SyncPage
from openai.types.model import Model


def get_first_model(client: OpenAI) -> str:
    """
    Get the first model from the vLLM server.
    """
    try:
        models: SyncPage[Model] = client.models.list()
    except APIConnectionError as e:
        raise RuntimeError(
            "Failed to get the list of models from the vLLM server at "
            f"{client.base_url} with API key {client.api_key}. Check\n"
            "1. the server is running\n"
            "2. the server URL is correct\n"
            "3. the API key is correct"
        ) from e

    if len(models.data) == 0:
        raise RuntimeError(f"No models found on the vLLM server at {client.base_url}")

    return models.data[0].id
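For reference, a short usage sketch (not part of the diff) of `get_first_model()`: it assumes a vLLM OpenAI-compatible server at `http://localhost:8000/v1` and that the helper is importable as `utils` next to the calling example, which is how the online-serving examples are laid out.

```python
# Illustrative usage of examples/online_serving/utils.get_first_model
from openai import OpenAI

from utils import get_first_model  # assumed import path within examples/online_serving

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
model = get_first_model(client)  # raises RuntimeError with a checklist if the server is unreachable
print(f"Using model: {model}")
```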
examples/openai_audio_api_client.py (deleted, 100644 → 0)

"""An example showing how to use vLLM to serve VLMs.

Launch the vLLM server with the following command:
vllm serve fixie-ai/ultravox-v0_3
"""
import base64

import requests
from openai import OpenAI

from vllm.assets.audio import AudioAsset

# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
    # defaults to os.environ.get("OPENAI_API_KEY")
    api_key=openai_api_key,
    base_url=openai_api_base,
)

models = client.models.list()
model = models.data[0].id

# Any format supported by librosa is supported
audio_url = AudioAsset("winning_call").url

# Use audio url in the payload
chat_completion_from_url = client.chat.completions.create(
    messages=[{
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "What's in this audio?"
            },
            {
                "type": "audio_url",
                "audio_url": {
                    "url": audio_url
                },
            },
        ],
    }],
    model=model,
    max_tokens=64,
)

result = chat_completion_from_url.choices[0].message.content
print(f"Chat completion output: {result}")


# Use base64 encoded audio in the payload
def encode_audio_base64_from_url(audio_url: str) -> str:
    """Encode an audio retrieved from a remote url to base64 format."""

    with requests.get(audio_url) as response:
        response.raise_for_status()
        result = base64.b64encode(response.content).decode('utf-8')

    return result


audio_base64 = encode_audio_base64_from_url(audio_url=audio_url)
chat_completion_from_base64 = client.chat.completions.create(
    messages=[{
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "What's in this audio?"
            },
            {
                "type": "audio_url",
                "audio_url": {
                    # Any format supported by librosa is supported
                    "url": f"data:audio/ogg;base64,{audio_base64}"
                },
            },
        ],
    }],
    model=model,
    max_tokens=64,
)

result = chat_completion_from_base64.choices[0].message.content
print(f"Chat completion output: {result}")
examples/openai_chat_completion_client_with_tools.py (deleted, 100644 → 0)

"""
Set up this example by starting a vLLM OpenAI-compatible server with tool call
options enabled. For example:

IMPORTANT: for mistral, you must use one of the provided mistral tool call
templates, or your own - the model default doesn't work for tool calls with vLLM
See the vLLM docs on OpenAI server & tool calling for more details.

vllm serve --model mistralai/Mistral-7B-Instruct-v0.3 \
    --chat-template examples/tool_chat_template_mistral.jinja \
    --enable-auto-tool-choice --tool-call-parser mistral

OR
vllm serve --model NousResearch/Hermes-2-Pro-Llama-3-8B \
    --chat-template examples/tool_chat_template_hermes.jinja \
    --enable-auto-tool-choice --tool-call-parser hermes
"""
import json

from openai import OpenAI

# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
    # defaults to os.environ.get("OPENAI_API_KEY")
    api_key=openai_api_key,
    base_url=openai_api_base,
)

models = client.models.list()
model = models.data[0].id

tools = [{
    "type": "function",
    "function": {
        "name": "get_current_weather",
        "description": "Get the current weather in a given location",
        "parameters": {
            "type": "object",
            "properties": {
                "city": {
                    "type": "string",
                    "description": "The city to find the weather for, e.g. 'San Francisco'"
                },
                "state": {
                    "type": "string",
                    "description": "the two-letter abbreviation for the state that the city is"
                    " in, e.g. 'CA' which would mean 'California'"
                },
                "unit": {
                    "type": "string",
                    "description": "The unit to fetch the temperature in",
                    "enum": ["celsius", "fahrenheit"]
                }
            },
            "required": ["city", "state", "unit"]
        }
    }
}]

messages = [{
    "role": "user",
    "content": "Hi! How are you doing today?"
}, {
    "role": "assistant",
    "content": "I'm doing well! How can I help you?"
}, {
    "role": "user",
    "content": "Can you tell me what the temperate will be in Dallas, in fahrenheit?"
}]

chat_completion = client.chat.completions.create(messages=messages,
                                                 model=model,
                                                 tools=tools)

print("Chat completion results:")
print(chat_completion)
print("\n\n")

tool_calls_stream = client.chat.completions.create(messages=messages,
                                                   model=model,
                                                   tools=tools,
                                                   stream=True)

chunks = []
for chunk in tool_calls_stream:
    chunks.append(chunk)
    if chunk.choices[0].delta.tool_calls:
        print(chunk.choices[0].delta.tool_calls[0])
    else:
        print(chunk.choices[0].delta)

arguments = []
tool_call_idx = -1
for chunk in chunks:

    if chunk.choices[0].delta.tool_calls:
        tool_call = chunk.choices[0].delta.tool_calls[0]

        if tool_call.index != tool_call_idx:
            if tool_call_idx >= 0:
                print(f"streamed tool call arguments: {arguments[tool_call_idx]}")
            tool_call_idx = chunk.choices[0].delta.tool_calls[0].index
            arguments.append("")
        if tool_call.id:
            print(f"streamed tool call id: {tool_call.id}")

        if tool_call.function:
            if tool_call.function.name:
                print(f"streamed tool call name: {tool_call.function.name}")

            if tool_call.function.arguments:
                arguments[tool_call_idx] += tool_call.function.arguments

if len(arguments):
    print(f"streamed tool call arguments: {arguments[-1]}")

print("\n\n")

messages.append({
    "role": "assistant",
    "tool_calls": chat_completion.choices[0].message.tool_calls
})


# Now, simulate a tool call
def get_current_weather(city: str, state: str, unit: 'str'):
    return ("The weather in Dallas, Texas is 85 degrees fahrenheit. It is "
            "partly cloudly, with highs in the 90's.")


available_tools = {"get_current_weather": get_current_weather}

completion_tool_calls = chat_completion.choices[0].message.tool_calls
for call in completion_tool_calls:
    tool_to_call = available_tools[call.function.name]
    args = json.loads(call.function.arguments)
    result = tool_to_call(**args)
    print(result)
    messages.append({
        "role": "tool",
        "content": result,
        "tool_call_id": call.id,
        "name": call.function.name
    })

chat_completion_2 = client.chat.completions.create(messages=messages,
                                                   model=model,
                                                   tools=tools,
                                                   stream=False)
print("\n\n")
print(chat_completion_2)
examples/openai_vision_api_client.py (deleted, 100644 → 0)

"""An example showing how to use vLLM to serve VLMs.

Launch the vLLM server with the following command:

(single image inference with Llava)
vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja

(multi-image inference with Phi-3.5-vision-instruct)
vllm serve microsoft/Phi-3.5-vision-instruct --max-model-len 4096 \
    --trust-remote-code --limit-mm-per-prompt image=2
"""
import base64

import requests
from openai import OpenAI

# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
    # defaults to os.environ.get("OPENAI_API_KEY")
    api_key=openai_api_key,
    base_url=openai_api_base,
)

models = client.models.list()
model = models.data[0].id

# Single-image input inference
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"

## Use image url in the payload
chat_completion_from_url = client.chat.completions.create(
    messages=[{
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "What's in this image?"
            },
            {
                "type": "image_url",
                "image_url": {
                    "url": image_url
                },
            },
        ],
    }],
    model=model,
    max_tokens=64,
)

result = chat_completion_from_url.choices[0].message.content
print("Chat completion output:", result)


## Use base64 encoded image in the payload
def encode_image_base64_from_url(image_url: str) -> str:
    """Encode an image retrieved from a remote url to base64 format."""

    with requests.get(image_url) as response:
        response.raise_for_status()
        result = base64.b64encode(response.content).decode('utf-8')

    return result


image_base64 = encode_image_base64_from_url(image_url=image_url)
chat_completion_from_base64 = client.chat.completions.create(
    messages=[{
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "What's in this image?"
            },
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{image_base64}"
                },
            },
        ],
    }],
    model=model,
    max_tokens=64,
)

result = chat_completion_from_base64.choices[0].message.content
print(f"Chat completion output: {result}")

# Multi-image input inference
image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
chat_completion_from_url = client.chat.completions.create(
    messages=[{
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "What are the animals in these images?"
            },
            {
                "type": "image_url",
                "image_url": {
                    "url": image_url_duck
                },
            },
            {
                "type": "image_url",
                "image_url": {
                    "url": image_url_lion
                },
            },
        ],
    }],
    model=model,
    max_tokens=64,
)

result = chat_completion_from_url.choices[0].message.content
print("Chat completion output:", result)
examples/others/lmcache/README.md (new file, 0 → 100644)

# LMCache Examples

This folder demonstrates how to use LMCache for disaggregated prefilling, CPU offloading and KV cache sharing.

## 1. Disaggregated Prefill in vLLM v1

This example demonstrates how to run LMCache with disaggregated prefill using NIXL on a single node.

### Prerequisites

- Install [LMCache](https://github.com/LMCache/LMCache). You can simply run `pip install lmcache`.
- Install [NIXL](https://github.com/ai-dynamo/nixl).
- At least 2 GPUs
- Valid Hugging Face token (HF_TOKEN) for Llama 3.1 8B Instruct.

### Usage

Run `cd disagg_prefill_lmcache_v1` to get into the `disagg_prefill_lmcache_v1` folder, and then run

```bash
bash disagg_example_nixl.sh
```

to run disaggregated prefill and benchmark the performance.

### Components

#### Server Scripts

- `disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh` - Launches individual vLLM servers for prefill/decode, and also launches the proxy server.
- `disagg_prefill_lmcache_v1/disagg_proxy_server.py` - FastAPI proxy server that coordinates between prefiller and decoder
- `disagg_prefill_lmcache_v1/disagg_example_nixl.sh` - Main script to run the example

#### Configuration

- `disagg_prefill_lmcache_v1/configs/lmcache-prefiller-config.yaml` - Configuration for prefiller server
- `disagg_prefill_lmcache_v1/configs/lmcache-decoder-config.yaml` - Configuration for decoder server

#### Log Files

The main script generates several log files:

- `prefiller.log` - Logs from the prefill server
- `decoder.log` - Logs from the decode server
- `proxy.log` - Logs from the proxy server

## 2. CPU Offload Examples

- `python cpu_offload_lmcache.py -v v0` - CPU offloading implementation for vLLM v0
- `python cpu_offload_lmcache.py -v v1` - CPU offloading implementation for vLLM v1

## 3. KV Cache Sharing

The `kv_cache_sharing_lmcache_v1.py` example demonstrates how to share KV caches between vLLM v1 instances.

## 4. Disaggregated Prefill in vLLM v0

The `disaggregated_prefill_lmcache_v0.py` example shows how to run disaggregated prefill in vLLM v0.
examples/others/lmcache/cpu_offload_lmcache.py (new file, 0 → 100644)

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This file demonstrates the example usage of cpu offloading
with LMCache in vLLM v1 or v0.

Usage:

    Specify vLLM version

    -v v0 : Use LMCacheConnector
            model = mistralai/Mistral-7B-Instruct-v0.2
            (Includes enable_chunked_prefill = True)

    -v v1 : Use LMCacheConnectorV1 (default)
            model = meta-llama/Meta-Llama-3.1-8B-Instruct
            (Without enable_chunked_prefill)

Note that `lmcache` is needed to run this example.
Requirements:
https://docs.lmcache.ai/getting_started/installation.html#prerequisites
To learn more about LMCache environment setup, please refer to:
https://docs.lmcache.ai/getting_started/installation.html
"""
import argparse
import contextlib
import os
import time
from dataclasses import asdict

from lmcache.integration.vllm.utils import ENGINE_NAME
from lmcache.v1.cache_engine import LMCacheEngineBuilder

from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig
from vllm.engine.arg_utils import EngineArgs


def setup_environment_variables(vllm_version: str):
    # LMCache-related environment variables
    # Use experimental features in LMCache
    os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
    # LMCache is set to use 256 tokens per chunk
    os.environ["LMCACHE_CHUNK_SIZE"] = "256"
    # Enable local CPU backend in LMCache
    os.environ["LMCACHE_LOCAL_CPU"] = "True"
    # Set local CPU memory limit to 5.0 GB
    os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0"

    if vllm_version == "v0":
        os.environ["VLLM_USE_V1"] = "0"


@contextlib.contextmanager
def build_llm_with_lmcache(lmcache_connector: str, model: str, vllm_version: str):
    ktc = KVTransferConfig(
        kv_connector=lmcache_connector,
        kv_role="kv_both",
    )
    # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
    # memory. Reduce the value if your GPU has less memory.
    # Note: LMCache supports chunked prefill (see vLLM#14505, LMCache#392).
    if vllm_version == "v0":
        llm_args = EngineArgs(
            model=model,
            kv_transfer_config=ktc,
            max_model_len=8000,
            gpu_memory_utilization=0.8,
            enable_chunked_prefill=True,  # Only in v0
        )
    else:
        llm_args = EngineArgs(
            model=model,
            kv_transfer_config=ktc,
            max_model_len=8000,
            gpu_memory_utilization=0.8,
        )

    llm = LLM(**asdict(llm_args))
    try:
        yield llm
    finally:
        # Clean up lmcache backend
        LMCacheEngineBuilder.destroy(ENGINE_NAME)


def print_output(
    llm: LLM,
    prompt: list[str],
    sampling_params: SamplingParams,
    req_str: str,
):
    # Should be able to see logs like the following:
    # `LMCache INFO: Storing KV cache for 6006 out of 6006 tokens for request 0`
    # This indicates that the KV cache has been stored in LMCache.
    start = time.time()
    outputs = llm.generate(prompt, sampling_params)
    print("-" * 50)
    for output in outputs:
        generated_text = output.outputs[0].text
        print(f"Generated text: {generated_text!r}")
    print(f"Generation took {time.time() - start:.2f} seconds, {req_str} request done.")
    print("-" * 50)


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-v",
        "--version",
        choices=["v0", "v1"],
        default="v1",
        help="Specify vLLM version (default: v1)",
    )
    return parser.parse_args()


def main():
    args = parse_args()

    if args.version == "v0":
        lmcache_connector = "LMCacheConnector"
        model = "mistralai/Mistral-7B-Instruct-v0.2"
    else:
        lmcache_connector = "LMCacheConnectorV1"
        model = "meta-llama/Meta-Llama-3.1-8B-Instruct"

    setup_environment_variables(args.version)

    with build_llm_with_lmcache(lmcache_connector, model, args.version) as llm:
        # This example script runs two requests with a shared prefix.
        # Define the shared prompt and specific prompts
        shared_prompt = "Hello, how are you?" * 1000
        first_prompt = [
            shared_prompt + "Hello, my name is",
        ]
        second_prompt = [
            shared_prompt + "Tell me a very long story",
        ]

        sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)

        # Print the first output
        print_output(llm, first_prompt, sampling_params, "first")

        time.sleep(1)

        # print the second output
        print_output(llm, second_prompt, sampling_params, "second")


if __name__ == "__main__":
    main()
examples/others/lmcache/disagg_prefill_lmcache_v0.py (new file, 0 → 100644)

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This file demonstrates the example usage of disaggregated prefilling
with LMCache.
We will launch 2 vllm instances (GPU 0 for prefill and GPU 1 for decode),
and launch an additional LMCache server.
KV cache is transferred in the following manner:
vLLM prefill node -> LMCache server -> vLLM decode node.

Note that `pip install lmcache` is needed to run this example.
Learn more about LMCache in https://github.com/LMCache/LMCache.
"""
import os
import subprocess
import time
from multiprocessing import Event, Process

from lmcache.experimental.cache_engine import LMCacheEngineBuilder
from lmcache.integration.vllm.utils import ENGINE_NAME

from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig

# LMCache-related environment variables
# The port to start LMCache server
port = 8100
# Use experimental features in LMCache
os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
# LMCache is set to use 256 tokens per chunk
os.environ["LMCACHE_CHUNK_SIZE"] = "256"
# Disable local CPU backend in LMCache
os.environ["LMCACHE_LOCAL_CPU"] = "False"
# Set local CPU memory buffer limit to 5.0 GB
os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0"
# Set the remote URL for LMCache server
os.environ["LMCACHE_REMOTE_URL"] = f"lm://localhost:{port}"
# Set the serializer/deserializer between vllm and LMCache server
# `naive` indicates using raw bytes of the tensor without any compression
os.environ["LMCACHE_REMOTE_SERDE"] = "naive"

prompts = [
    "Hello, how are you?" * 1000,
]


def run_prefill(prefill_done, prompts):
    # We use GPU 0 for prefill node.
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"

    sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)

    ktc = KVTransferConfig(
        kv_connector="LMCacheConnector",
        kv_role="kv_producer",
        kv_rank=0,
        kv_parallel_size=2,
    )
    # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
    # memory. Reduce the value if your GPU has less memory.
    llm = LLM(
        model="mistralai/Mistral-7B-Instruct-v0.2",
        kv_transfer_config=ktc,
        max_model_len=8000,
        gpu_memory_utilization=0.8,
        enforce_eager=True,
    )

    # llm.generate(prompts, sampling_params)
    outputs = llm.generate(prompts, sampling_params)
    for output in outputs:
        generated_text = output.outputs[0].text
        print(f"Generated text: {generated_text!r}")
    print("Prefill node is finished.")
    prefill_done.set()

    # Clean up lmcache backend
    LMCacheEngineBuilder.destroy(ENGINE_NAME)


def run_decode(prefill_done, prompts, timeout=1):
    # We use GPU 1 for decode node.
    os.environ["CUDA_VISIBLE_DEVICES"] = "1"

    sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)

    ktc = KVTransferConfig(
        kv_connector="LMCacheConnector",
        kv_role="kv_consumer",
        kv_rank=1,
        kv_parallel_size=2,
    )
    # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
    # of memory. Reduce the value if your GPU has less memory.
    llm = LLM(
        model="mistralai/Mistral-7B-Instruct-v0.2",
        kv_transfer_config=ktc,
        max_model_len=8000,
        gpu_memory_utilization=0.8,
        enforce_eager=True,
    )

    print("Waiting for prefill node to finish...")
    prefill_done.wait()
    time.sleep(timeout)

    outputs = llm.generate(prompts, sampling_params)
    for output in outputs:
        generated_text = output.outputs[0].text
        print(f"Generated text: {generated_text!r}")

    # Clean up lmcache backend
    LMCacheEngineBuilder.destroy(ENGINE_NAME)


def run_lmcache_server(port):
    server_proc = subprocess.Popen(
        ["python", "-m", "lmcache.experimental.server", "localhost", str(port)]
    )
    return server_proc


def main():
    prefill_done = Event()
    prefill_process = Process(target=run_prefill, args=(prefill_done, prompts))
    decode_process = Process(target=run_decode, args=(prefill_done, prompts))
    lmcache_server_process = run_lmcache_server(port)

    # Start prefill node
    prefill_process.start()
    # Start decode node
    decode_process.start()

    # Clean up the processes
    decode_process.join()
    prefill_process.terminate()
    lmcache_server_process.terminate()
    lmcache_server_process.wait()


if __name__ == "__main__":
    main()
examples/others/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-decoder-config.yaml (new file, 0 → 100644)

local_cpu: False
max_local_cpu_size: 0
#local_disk:
max_local_disk_size: 0
remote_serde: NULL

enable_nixl: True
nixl_role: "receiver"
nixl_peer_host: "localhost"
nixl_peer_port: 55555
nixl_buffer_size: 1073741824 # 1GB
nixl_buffer_device: "cuda"
nixl_enable_gc: True
examples/others/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-prefiller-config.yaml (new file, 0 → 100644)

local_cpu: False
max_local_cpu_size: 0
#local_disk:
max_local_disk_size: 0
remote_serde: NULL

enable_nixl: True
nixl_role: "sender"
nixl_peer_host: "localhost"
nixl_peer_port: 55555
nixl_buffer_size: 1073741824 # 1GB
nixl_buffer_device: "cuda"
nixl_enable_gc: True
examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh (new file, 0 → 100644)

#!/bin/bash

echo "Warning: LMCache disaggregated prefill support for vLLM v1 is experimental and subject to change."

PIDS=()

# Switch to the directory of the current script
cd "$(dirname "${BASH_SOURCE[0]}")"

check_hf_token() {
    if [ -z "$HF_TOKEN" ]; then
        echo "HF_TOKEN is not set. Please set it to your Hugging Face token."
        exit 1
    fi
    if [[ "$HF_TOKEN" != hf_* ]]; then
        echo "HF_TOKEN is not a valid Hugging Face token. Please set it to your Hugging Face token."
        exit 1
    fi
    echo "HF_TOKEN is set and valid."
}

check_num_gpus() {
    # can you check if the number of GPUs are >=2 via nvidia-smi?
    num_gpus=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
    if [ "$num_gpus" -lt 2 ]; then
        echo "You need at least 2 GPUs to run disaggregated prefill."
        exit 1
    else
        echo "Found $num_gpus GPUs."
    fi
}

ensure_python_library_installed() {
    echo "Checking if $1 is installed..."
    python3 -c "import $1" > /dev/null 2>&1
    if [ $? -ne 0 ]; then
        if [ "$1" == "nixl" ]; then
            echo "$1 is not installed. Please refer to https://github.com/ai-dynamo/nixl for installation."
        else
            echo "$1 is not installed. Please install it via pip install $1."
        fi
        exit 1
    else
        echo "$1 is installed."
    fi
}

cleanup() {
    echo "Stopping everything…"
    trap - INT TERM       # prevent re-entrancy
    kill -- -$$           # negative PID == “this whole process-group”
    wait                  # reap children so we don't leave zombies
    exit 0
}

wait_for_server() {
    local port=$1
    local timeout_seconds=1200
    local start_time=$(date +%s)

    echo "Waiting for server on port $port..."

    while true; do
        if curl -s "localhost:${port}/v1/completions" > /dev/null; then
            return 0
        fi

        local now=$(date +%s)
        if (( now - start_time >= timeout_seconds )); then
            echo "Timeout waiting for server"
            return 1
        fi

        sleep 1
    done
}

main() {
    check_hf_token
    check_num_gpus
    ensure_python_library_installed lmcache
    ensure_python_library_installed nixl
    ensure_python_library_installed pandas
    ensure_python_library_installed datasets
    ensure_python_library_installed vllm

    trap cleanup INT
    trap cleanup USR1
    trap cleanup TERM

    echo "Launching prefiller, decoder and proxy..."
    echo "Please check prefiller.log, decoder.log and proxy.log for logs."

    bash disagg_vllm_launcher.sh prefiller \
        > >(tee prefiller.log) 2>&1 &
    prefiller_pid=$!
    PIDS+=($prefiller_pid)

    bash disagg_vllm_launcher.sh decoder \
        > >(tee decoder.log) 2>&1 &
    decoder_pid=$!
    PIDS+=($decoder_pid)

    python3 disagg_proxy_server.py \
        --host localhost \
        --port 9000 \
        --prefiller-host localhost \
        --prefiller-port 8100 \
        --decoder-host localhost \
        --decoder-port 8200 \
        > >(tee proxy.log) 2>&1 &
    proxy_pid=$!
    PIDS+=($proxy_pid)

    wait_for_server 8100
    wait_for_server 8200
    wait_for_server 9000

    echo "All servers are up. Starting benchmark..."

    # begin benchmark
    cd ../../../../benchmarks/
    python3 benchmark_serving.py --port 9000 --seed $(date +%s) \
        --model meta-llama/Llama-3.1-8B-Instruct \
        --dataset-name random --random-input-len 7500 --random-output-len 200 \
        --num-prompts 200 --burstiness 100 --request-rate 3.6 | tee benchmark.log

    echo "Benchmarking done. Cleaning up..."

    cleanup
}

main
\ No newline at end of file
examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py (new file, 0 → 100644)

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import os
import time
from contextlib import asynccontextmanager

import httpx
import numpy as np
from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse


@asynccontextmanager
async def lifespan(app: FastAPI):
    """
    Lifespan context manager to handle startup and shutdown events.
    """
    # Startup: Initialize clients
    prefiller_base_url = (
        f"http://{global_args.prefiller_host}:{global_args.prefiller_port}/v1"
    )
    decoder_base_url = (
        f"http://{global_args.decoder_host}:{global_args.decoder_port}/v1"
    )

    app.state.prefill_client = httpx.AsyncClient(
        timeout=None, base_url=prefiller_base_url
    )
    app.state.decode_client = httpx.AsyncClient(
        timeout=None, base_url=decoder_base_url
    )

    yield

    # Shutdown: Close clients
    await app.state.prefill_client.aclose()
    await app.state.decode_client.aclose()


# Update FastAPI app initialization to use lifespan
app = FastAPI(lifespan=lifespan)


class StatsCalculator:
    def __init__(self):
        self._stats = []
        self._last_log_time = time.time()

    def add(self, value):
        self._stats.append(value)
        if time.time() - self._last_log_time > 5:
            self._log_stats()
            self._last_log_time = time.time()

    def _log_stats(self):
        # Print average, median, and 99th percentile
        np_arr = np.array(self._stats)
        output_str = (
            f"\nNum requests: {len(self._stats)}"
            + "\nPrefill node TTFT stats:"
            + f"\n - Average (ms): {np.mean(np_arr)}"
            + f"\n - Median (ms): {np.median(np_arr)}"
            + f"\n - 99th Percentile (ms): {np.percentile(np_arr, 99)}\n"
        )
        print(
            "===============================",
            output_str,
            "===============================",
        )


stats_calculator = StatsCalculator()
counter = 0


def parse_args():
    parser = argparse.ArgumentParser()

    parser.add_argument("--port", type=int, default=8000)
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--prefiller-host", type=str, default="localhost")
    parser.add_argument("--prefiller-port", type=int, default=8100)
    parser.add_argument("--decoder-host", type=str, default="localhost")
    parser.add_argument("--decoder-port", type=int, default=8200)
    args = parser.parse_args()
    return args


# Initialize variables to hold the persistent clients
app.state.prefill_client = None
app.state.decode_client = None


async def send_request_to_service(
    client: httpx.AsyncClient, endpoint: str, req_data: dict
):
    """
    Send a request to a service using a persistent client.
    """
    req_data = req_data.copy()
    req_data["max_tokens"] = 1
    if "max_completion_tokens" in req_data:
        req_data["max_completion_tokens"] = 1

    headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
    response = await client.post(endpoint, json=req_data, headers=headers)
    response.raise_for_status()
    return response


async def stream_service_response(
    client: httpx.AsyncClient, endpoint: str, req_data: dict
):
    """
    Asynchronously stream the response from a service using a persistent client.
    """
    headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
    async with client.stream(
        "POST", endpoint, json=req_data, headers=headers
    ) as response:
        response.raise_for_status()
        async for chunk in response.aiter_bytes():
            yield chunk


@app.post("/v1/completions")
async def handle_completions(request: Request):
    global counter, stats_calculator
    counter += 1

    st = time.time()
    try:
        req_data = await request.json()

        # Send request to prefill service, ignore the response
        await send_request_to_service(
            app.state.prefill_client, "/completions", req_data
        )

        et = time.time()
        stats_calculator.add(et - st)

        # Stream response from decode service
        async def generate_stream():
            async for chunk in stream_service_response(
                app.state.decode_client, "/completions", req_data
            ):
                yield chunk

        return StreamingResponse(generate_stream(), media_type="text/event-stream")

    except Exception as e:
        import sys
        import traceback

        exc_info = sys.exc_info()
        print("Error occurred in disagg prefill proxy server - completions endpoint")
        print(e)
        print("".join(traceback.format_exception(*exc_info)))
        raise


@app.post("/v1/chat/completions")
async def handle_chat_completions(request: Request):
    global counter, stats_calculator
    counter += 1

    st = time.time()
    try:
        req_data = await request.json()

        # Send request to prefill service, ignore the response
        await send_request_to_service(
            app.state.prefill_client, "/chat/completions", req_data
        )

        et = time.time()
        stats_calculator.add(et - st)

        # Stream response from decode service
        async def generate_stream():
            async for chunk in stream_service_response(
                app.state.decode_client, "/chat/completions", req_data
            ):
                yield chunk

        return StreamingResponse(generate_stream(), media_type="text/event-stream")

    except Exception as e:
        import sys
        import traceback

        exc_info = sys.exc_info()
        print("Error occurred in disagg prefill proxy server - chat completions endpoint")
        print(e)
        print("".join(traceback.format_exception(*exc_info)))
        raise


if __name__ == "__main__":
    global global_args
    global_args = parse_args()

    import uvicorn

    uvicorn.run(app, host=global_args.host, port=global_args.port)
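The proxy first forwards each request to the prefill server with `max_tokens` forced to 1 (so only the KV cache is produced), then streams the real completion from the decode server. A client talks to it exactly like a normal OpenAI-compatible endpoint. The sketch below is illustrative and not part of the diff; it assumes the proxy was started by `disagg_example_nixl.sh` on `localhost:9000` with the launcher's default model.

```python
# Illustrative client sketch against the disaggregated-prefill proxy (localhost:9000).
from openai import OpenAI

client = OpenAI(base_url="http://localhost:9000/v1", api_key="EMPTY")

stream = client.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",  # launcher default; must match what was served
    prompt="Explain disaggregated prefill in one sentence.",
    max_tokens=64,
    stream=True,  # tokens are streamed back from the decode server through the proxy
)
for chunk in stream:
    print(chunk.choices[0].text, end="", flush=True)
print()
```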
examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh (new file, 0 → 100644)

#!/bin/bash

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

if [[ $# -lt 1 ]]; then
    echo "Usage: $0 <prefiller | decoder> [model]"
    exit 1
fi

if [[ $# -eq 1 ]]; then
    echo "Using default model: meta-llama/Llama-3.1-8B-Instruct"
    MODEL="meta-llama/Llama-3.1-8B-Instruct"
else
    echo "Using model: $2"
    MODEL=$2
fi

if [[ $1 == "prefiller" ]]; then
    # Prefiller listens on port 8100
    prefill_config_file=$SCRIPT_DIR/configs/lmcache-prefiller-config.yaml

    UCX_TLS=cuda_ipc,cuda_copy,tcp \
        LMCACHE_CONFIG_FILE=$prefill_config_file \
        LMCACHE_USE_EXPERIMENTAL=True \
        VLLM_ENABLE_V1_MULTIPROCESSING=1 \
        VLLM_WORKER_MULTIPROC_METHOD=spawn \
        CUDA_VISIBLE_DEVICES=0 \
        vllm serve $MODEL \
        --port 8100 \
        --disable-log-requests \
        --enforce-eager \
        --kv-transfer-config \
        '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_producer","kv_connector_extra_config": {"discard_partial_chunks": false, "lmcache_rpc_port": "producer1"}}'

elif [[ $1 == "decoder" ]]; then
    # Decoder listens on port 8200
    decode_config_file=$SCRIPT_DIR/configs/lmcache-decoder-config.yaml

    UCX_TLS=cuda_ipc,cuda_copy,tcp \
        LMCACHE_CONFIG_FILE=$decode_config_file \
        LMCACHE_USE_EXPERIMENTAL=True \
        VLLM_ENABLE_V1_MULTIPROCESSING=1 \
        VLLM_WORKER_MULTIPROC_METHOD=spawn \
        CUDA_VISIBLE_DEVICES=1 \
        vllm serve $MODEL \
        --port 8200 \
        --disable-log-requests \
        --enforce-eager \
        --kv-transfer-config \
        '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_consumer","kv_connector_extra_config": {"discard_partial_chunks": false, "lmcache_rpc_port": "consumer1"}}'

else
    echo "Invalid role: $1"
    echo "Should be either prefiller, decoder"
    exit 1
fi
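The launcher passes the connector settings to `vllm serve` as a JSON string via `--kv-transfer-config`. As an illustrative sketch (not part of the diff), the same producer-side settings could be expressed in Python with `vllm.config.KVTransferConfig` for an in-process `LLM`, mirroring how the other LMCache examples in this commit build their configs; the field names below are taken from the launcher's JSON and are assumed to map one-to-one onto the config object.

```python
# Sketch: the prefiller's --kv-transfer-config JSON expressed as a KVTransferConfig object.
from vllm import LLM
from vllm.config import KVTransferConfig

ktc = KVTransferConfig(
    kv_connector="LMCacheConnectorV1",
    kv_role="kv_producer",
    kv_connector_extra_config={
        "discard_partial_chunks": False,
        "lmcache_rpc_port": "producer1",
    },
)
llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",  # launcher default model
    kv_transfer_config=ktc,
    enforce_eager=True,  # matches the --enforce-eager flag used by the launcher
)
```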
examples/others/lmcache/kv_cache_sharing_lmcache_v1.py (new file, 0 → 100644)

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This file demonstrates the example usage of remote KV cache sharing
with LMCache.
We will launch 2 vllm instances, and launch an additional LMCache server.
KV cache is transferred in the following manner:
(1) vLLM instance 1 -> LMCache server (KV cache store).
(2) LMCache server -> vLLM instance 2 (KV cache reuse/retrieve).

Note that lmcache needs to be installed to run this example.
Learn more about LMCache in https://github.com/LMCache/LMCache.
"""
import os
import subprocess
import time
from multiprocessing import Event, Process

from lmcache.integration.vllm.utils import ENGINE_NAME
from lmcache.v1.cache_engine import LMCacheEngineBuilder

from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig

# LMCache-related environment variables
# The port to start LMCache server
port = 8100
# Use experimental features in LMCache
os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
# LMCache is set to use 256 tokens per chunk
os.environ["LMCACHE_CHUNK_SIZE"] = "256"
# Disable local CPU backend in LMCache
os.environ["LMCACHE_LOCAL_CPU"] = "False"
# Set local CPU memory buffer limit to 5.0 GB
os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0"
# Set the remote URL for LMCache server
os.environ["LMCACHE_REMOTE_URL"] = f"lm://localhost:{port}"
# Set the serializer/deserializer between vllm and LMCache server
# `naive` indicates using raw bytes of the tensor without any compression
os.environ["LMCACHE_REMOTE_SERDE"] = "naive"

prompts = [
    "Hello, how are you?" * 1000,
]


def run_store(store_done, prompts):
    # We use GPU 0 for KV cache store process.
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"

    sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)

    ktc = KVTransferConfig(kv_connector="LMCacheConnectorV1", kv_role="kv_both")
    # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
    # memory. Reduce the value if your GPU has less memory.
    llm = LLM(
        model="mistralai/Mistral-7B-Instruct-v0.2",
        kv_transfer_config=ktc,
        max_model_len=8000,
        gpu_memory_utilization=0.8,
        enforce_eager=True,
    )

    outputs = llm.generate(prompts, sampling_params)
    for output in outputs:
        generated_text = output.outputs[0].text
        print(f"Generated text: {generated_text!r}")
    print("KV cache store is finished.")
    store_done.set()

    # Clean up lmcache backend
    LMCacheEngineBuilder.destroy(ENGINE_NAME)


def run_retrieve(store_done, prompts, timeout=1):
    # We use GPU 1 for KV cache retrieve process.
    os.environ["CUDA_VISIBLE_DEVICES"] = "1"

    sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)

    ktc = KVTransferConfig(kv_connector="LMCacheConnectorV1", kv_role="kv_both")
    # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
    # of memory. Reduce the value if your GPU has less memory.
    llm = LLM(
        model="mistralai/Mistral-7B-Instruct-v0.2",
        kv_transfer_config=ktc,
        max_model_len=8000,
        gpu_memory_utilization=0.8,
        enforce_eager=True,
    )

    print("Waiting for KV cache store to finish...")
    store_done.wait()
    time.sleep(timeout)

    outputs = llm.generate(prompts, sampling_params)
    for output in outputs:
        generated_text = output.outputs[0].text
        print(f"Generated text: {generated_text!r}")

    # Clean up lmcache backend
    LMCacheEngineBuilder.destroy(ENGINE_NAME)


def run_lmcache_server(port):
    server_proc = subprocess.Popen(
        ["python", "-m", "lmcache.v1.server", "localhost", str(port)]
    )
    return server_proc


def main():
    store_done = Event()
    store_process = Process(target=run_store, args=(store_done, prompts))
    retrieve_process = Process(target=run_retrieve, args=(store_done, prompts))
    lmcache_server_process = run_lmcache_server(port)

    # Start KV cache store process
    store_process.start()

    # Start KV cache retrieve process
    retrieve_process.start()

    # Clean up the processes
    store_process.join()
    retrieve_process.terminate()
    lmcache_server_process.terminate()
    lmcache_server_process.wait()


if __name__ == "__main__":
    main()
examples/others/logging_configuration.md (new file, 0 → 100644)

# Logging Configuration

vLLM leverages Python's `logging.config.dictConfig` functionality to enable
robust and flexible configuration of the various loggers used by vLLM.

vLLM offers two environment variables that can be used to accommodate a range
of logging configurations that range from simple-and-inflexible to
more-complex-and-more-flexible.

- No vLLM logging (simple and inflexible)
    - Set `VLLM_CONFIGURE_LOGGING=0` (leaving `VLLM_LOGGING_CONFIG_PATH` unset)
- vLLM's default logging configuration (simple and inflexible)
    - Leave `VLLM_CONFIGURE_LOGGING` unset or set `VLLM_CONFIGURE_LOGGING=1`
- Fine-grained custom logging configuration (more complex, more flexible)
    - Leave `VLLM_CONFIGURE_LOGGING` unset or set `VLLM_CONFIGURE_LOGGING=1` and
      set `VLLM_LOGGING_CONFIG_PATH=<path-to-logging-config.json>`

## Logging Configuration Environment Variables

### `VLLM_CONFIGURE_LOGGING`

`VLLM_CONFIGURE_LOGGING` controls whether or not vLLM takes any action to
configure the loggers used by vLLM. This functionality is enabled by default,
but can be disabled by setting `VLLM_CONFIGURE_LOGGING=0` when running vLLM.

If `VLLM_CONFIGURE_LOGGING` is enabled and no value is given for
`VLLM_LOGGING_CONFIG_PATH`, vLLM will use built-in default configuration to
configure the root vLLM logger. By default, no other vLLM loggers are
configured and, as such, all vLLM loggers defer to the root vLLM logger to make
all logging decisions.

If `VLLM_CONFIGURE_LOGGING` is disabled and a value is given for
`VLLM_LOGGING_CONFIG_PATH`, an error will occur while starting vLLM.

### `VLLM_LOGGING_CONFIG_PATH`

`VLLM_LOGGING_CONFIG_PATH` allows users to specify a path to a JSON file of
alternative, custom logging configuration that will be used instead of vLLM's
built-in default logging configuration. The logging configuration should be
provided in JSON format following the schema specified by Python's
[logging configuration dictionary schema](https://docs.python.org/3/library/logging.config.html#dictionary-schema-details).

If `VLLM_LOGGING_CONFIG_PATH` is specified, but `VLLM_CONFIGURE_LOGGING` is
disabled, an error will occur while starting vLLM.

## Examples

### Example 1: Customize vLLM root logger

For this example, we will customize the vLLM root logger to use
[`python-json-logger`](https://github.com/nhairs/python-json-logger)
(which is part of the container image) to log to STDOUT of the console in JSON
format with a log level of `INFO`.

To begin, first, create an appropriate JSON logging configuration file:

??? note "/path/to/logging_config.json"

    ```json
    {
      "formatters": {
        "json": {
          "class": "pythonjsonlogger.jsonlogger.JsonFormatter"
        }
      },
      "handlers": {
        "console": {
          "class" : "logging.StreamHandler",
          "formatter": "json",
          "level": "INFO",
          "stream": "ext://sys.stdout"
        }
      },
      "loggers": {
        "vllm": {
          "handlers": ["console"],
          "level": "INFO",
          "propagate": false
        }
      },
      "version": 1
    }
    ```

Finally, run vLLM with the `VLLM_LOGGING_CONFIG_PATH` environment variable set
to the path of the custom logging configuration JSON file:

```bash
VLLM_LOGGING_CONFIG_PATH=/path/to/logging_config.json \
    vllm serve mistralai/Mistral-7B-v0.1 --max-model-len 2048
```

### Example 2: Silence a particular vLLM logger

To silence a particular vLLM logger, it is necessary to provide custom logging
configuration for the target logger that configures the logger so that it won't
propagate its log messages to the root vLLM logger.

When custom configuration is provided for any logger, it is also necessary to
provide configuration for the root vLLM logger since any custom logger
configuration overrides the built-in default logging configuration used by vLLM.

First, create an appropriate JSON logging configuration file that includes
configuration for the root vLLM logger and for the logger you wish to silence:

??? note "/path/to/logging_config.json"

    ```json
    {
      "formatters": {
        "vllm": {
          "class": "vllm.logging_utils.NewLineFormatter",
          "datefmt": "%m-%d %H:%M:%S",
          "format": "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s"
        }
      },
      "handlers": {
        "vllm": {
          "class" : "logging.StreamHandler",
          "formatter": "vllm",
          "level": "INFO",
          "stream": "ext://sys.stdout"
        }
      },
      "loggers": {
        "vllm": {
          "handlers": ["vllm"],
          "level": "DEBUG",
          "propagate": false
        },
        "vllm.example_noisy_logger": {
          "propagate": false
        }
      },
      "version": 1
    }
    ```

Finally, run vLLM with the `VLLM_LOGGING_CONFIG_PATH` environment variable set
to the path of the custom logging configuration JSON file:

```bash
VLLM_LOGGING_CONFIG_PATH=/path/to/logging_config.json \
    vllm serve mistralai/Mistral-7B-v0.1 --max-model-len 2048
```

### Example 3: Disable vLLM default logging configuration

To disable vLLM's default logging configuration and silence all vLLM loggers,
simply set `VLLM_CONFIGURE_LOGGING=0` when running vLLM. This prevents vLLM
from configuring the root vLLM logger, which in turn silences all other vLLM
loggers.

```bash
VLLM_CONFIGURE_LOGGING=0 \
    vllm serve mistralai/Mistral-7B-v0.1 --max-model-len 2048
```

## Additional resources

- [`logging.config` Dictionary Schema Details](https://docs.python.org/3/library/logging.config.html#dictionary-schema-details)
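Since the document states that vLLM builds its logging on `logging.config.dictConfig`, the following illustrative sketch (not part of the diff) shows what effectively happens to the Example 1 file once `VLLM_LOGGING_CONFIG_PATH` points at it: the JSON is loaded and handed to the standard library. It assumes `python-json-logger` is installed and that `/path/to/logging_config.json` is the file from Example 1.

```python
# Sketch: applying the Example 1 configuration directly with logging.config.dictConfig.
import json
import logging
import logging.config

with open("/path/to/logging_config.json") as f:
    logging.config.dictConfig(json.load(f))

# With Example 1's config, this is emitted as a JSON line on stdout at INFO level.
logging.getLogger("vllm").info("logger configured")
```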
examples/others/tensorize_vllm_model.py
0 → 100644
View file @
25f39502
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
argparse
import
dataclasses
import
json
import
os
import
uuid
from
vllm
import
LLM
,
SamplingParams
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.lora.request
import
LoRARequest
from
vllm.model_executor.model_loader.tensorizer
import
(
TensorizerArgs
,
TensorizerConfig
,
tensorize_lora_adapter
,
tensorize_vllm_model
,
)
from
vllm.utils
import
FlexibleArgumentParser
# yapf conflicts with isort for this docstring
# yapf: disable
"""
tensorize_vllm_model.py is a script that can be used to serialize and
deserialize vLLM models. These models can be loaded using tensorizer
to the GPU extremely quickly over an HTTP/HTTPS endpoint, an S3 endpoint,
or locally. Tensor encryption and decryption is also supported, although
libsodium must be installed to use it. Install vllm with tensorizer support
using `pip install vllm[tensorizer]`. To learn more about tensorizer, visit
https://github.com/coreweave/tensorizer
To serialize a model, install vLLM from source, then run something
like this from the root level of this repository:
python examples/others/tensorize_vllm_model.py
\
--model facebook/opt-125m
\
serialize
\
--serialized-directory s3://my-bucket
\
--suffix v1
Which downloads the model from HuggingFace, loads it into vLLM, serializes it,
and saves it to your S3 bucket. A local directory can also be used. This
assumes your S3 credentials are specified as environment variables
in the form of `S3_ACCESS_KEY_ID`, `S3_SECRET_ACCESS_KEY`, and
`S3_ENDPOINT_URL`. To provide S3 credentials directly, you can provide
`--s3-access-key-id` and `--s3-secret-access-key`, as well as `--s3-endpoint`
as CLI args to this script.
You can also encrypt the model weights with a randomly-generated key by
providing a `--keyfile` argument.
To deserialize a model, you can run something like this from the root
level of this repository:
python examples/others/tensorize_vllm_model.py
\
--model EleutherAI/gpt-j-6B
\
--dtype float16
\
deserialize
\
--path-to-tensors s3://my-bucket/vllm/EleutherAI/gpt-j-6B/v1/model.tensors
Which downloads the model tensors from your S3 bucket and deserializes them.
You can also provide a `--keyfile` argument to decrypt the model weights if
they were serialized with encryption.
To support distributed tensor-parallel models, each model shard will be
serialized to a separate file. The tensorizer_uri is then specified as a string
template with a format specifier such as '%03d' that will be rendered with the
shard's rank. Sharded models serialized with this script will be named as
model-rank-%03d.tensors
For more information on the available arguments for serializing, run
`python -m examples.others.tensorize_vllm_model serialize --help`.
Or for deserializing:
`python examples/others/tensorize_vllm_model.py deserialize --help`.
Once a model is serialized, tensorizer can be invoked with the `LLM` class
directly to load models:
llm = LLM(model="facebook/opt-125m",
load_format="tensorizer",
model_loader_extra_config=TensorizerConfig(
tensorizer_uri = path_to_tensors,
num_readers=3,
)
)
A serialized model can be used during model loading for the vLLM OpenAI
inference server. `model_loader_extra_config` is exposed as the CLI arg
`--model-loader-extra-config`, and accepts a JSON string literal of the
TensorizerConfig arguments desired.
In order to see all of the available arguments usable to configure
loading with tensorizer that are given to `TensorizerConfig`, run:
`python examples/others/tensorize_vllm_model.py deserialize --help`
under the `tensorizer options` section. These can also be used for
deserialization in this example script, although `--tensorizer-uri` and
`--path-to-tensors` are functionally the same in this case.
Tensorizer can also be used to save and load LoRA adapters. A LoRA adapter
can be serialized directly with the path to the LoRA adapter on HF Hub and
a TensorizerConfig object. In this script, passing a HF id to a LoRA adapter
will serialize the LoRA adapter artifacts to `--serialized-directory`.
You can then use the LoRA adapter with `vllm serve`, for instance, by ensuring
the LoRA artifacts are in your model artifacts directory and specifying
`--enable-lora`. For instance:
```
vllm serve <model_path>
\
--load-format tensorizer
\
--model-loader-extra-config '{"tensorizer_uri": "<model_path>.tensors"}'
\
--enable-lora
```
"""
def parse_args():
    parser = FlexibleArgumentParser(
        description="An example script that can be used to serialize and "
        "deserialize vLLM models. These models "
        "can be loaded using tensorizer directly to the GPU "
        "extremely quickly. Tensor encryption and decryption is "
        "also supported, although libsodium must be installed to "
        "use it.")
    parser = EngineArgs.add_cli_args(parser)

    parser.add_argument(
        "--lora-path",
        type=str,
        required=False,
        help="Path to a LoRA adapter to "
        "serialize along with model tensors. This can then be deserialized "
        "along with the model by passing a tensorizer_config kwarg to "
        "LoRARequest with type TensorizerConfig. See the docstring for this "
        "for a usage example.")

    subparsers = parser.add_subparsers(dest='command')

    serialize_parser = subparsers.add_parser(
        'serialize', help="Serialize a model to `--serialized-directory`")

    serialize_parser.add_argument(
        "--suffix",
        type=str,
        required=False,
        help=(
            "The suffix to append to the serialized model directory, which is "
            "used to construct the location of the serialized model tensors, "
            "e.g. if `--serialized-directory` is `s3://my-bucket/` and "
            "`--suffix` is `v1`, the serialized model tensors will be "
            "saved to "
            "`s3://my-bucket/vllm/EleutherAI/gpt-j-6B/v1/model.tensors`. "
            "If none is provided, a random UUID will be used."))

    serialize_parser.add_argument(
        "--serialized-directory",
        type=str,
        required=True,
        help="The directory to serialize the model to. "
        "This can be a local directory or S3 URI. The path to where the "
        "tensors are saved is a combination of the supplied `dir` and model "
        "reference ID. For instance, if `dir` is the serialized directory, "
        "and the model HuggingFace ID is `EleutherAI/gpt-j-6B`, tensors will "
        "be saved to `dir/vllm/EleutherAI/gpt-j-6B/suffix/model.tensors`, "
        "where `suffix` is given by `--suffix` or a random UUID if not "
        "provided.")

    serialize_parser.add_argument(
        "--keyfile",
        type=str,
        required=False,
        help=("Encrypt the model weights with a randomly-generated binary key,"
              " and save the key at this path"))

    deserialize_parser = subparsers.add_parser(
        'deserialize',
        help=("Deserialize a model from `--path-to-tensors`"
              " to verify it can be loaded and used."))

    deserialize_parser.add_argument(
        "--path-to-tensors",
        type=str,
        required=True,
        help="The local path or S3 URI to the model tensors to deserialize.")

    deserialize_parser.add_argument(
        "--keyfile",
        type=str,
        required=False,
        help=("Path to a binary key to use to decrypt the model weights,"
              " if the model was serialized with encryption"))

    TensorizerArgs.add_cli_args(deserialize_parser)

    return parser.parse_args()


def deserialize(args, tensorizer_config):
    if args.lora_path:
        tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
        llm = LLM(model=args.model,
                  load_format="tensorizer",
                  tensor_parallel_size=args.tensor_parallel_size,
                  model_loader_extra_config=tensorizer_config,
                  enable_lora=True)
        sampling_params = SamplingParams(temperature=0,
                                         max_tokens=256,
                                         stop=["[/assistant]"])

        # Truncating this as the extra text isn't necessary
        prompts = [
            "[user] Write a SQL query to answer the question based on ..."
        ]

        # Test LoRA load
        print(
            llm.generate(prompts,
                         sampling_params,
                         lora_request=LoRARequest(
                             "sql-lora",
                             1,
                             args.lora_path,
                             tensorizer_config=tensorizer_config)))
    else:
        llm = LLM(model=args.model,
                  load_format="tensorizer",
                  tensor_parallel_size=args.tensor_parallel_size,
                  model_loader_extra_config=tensorizer_config)
    return llm


def main():
    args = parse_args()

    s3_access_key_id = (getattr(args, 's3_access_key_id', None)
                        or os.environ.get("S3_ACCESS_KEY_ID", None))
    s3_secret_access_key = (getattr(args, 's3_secret_access_key', None)
                            or os.environ.get("S3_SECRET_ACCESS_KEY", None))
    s3_endpoint = (getattr(args, 's3_endpoint', None)
                   or os.environ.get("S3_ENDPOINT_URL", None))

    credentials = {
        "s3_access_key_id": s3_access_key_id,
        "s3_secret_access_key": s3_secret_access_key,
        "s3_endpoint": s3_endpoint
    }

    model_ref = args.model

    if args.command == "serialize" or args.command == "deserialize":
        keyfile = args.keyfile
    else:
        keyfile = None

    if args.model_loader_extra_config:
        config = json.loads(args.model_loader_extra_config)
        tensorizer_args = \
            TensorizerConfig(**config)._construct_tensorizer_args()
        tensorizer_args.tensorizer_uri = args.path_to_tensors
    else:
        tensorizer_args = None

    if args.command == "serialize":
        eng_args_dict = {f.name: getattr(args, f.name)
                         for f in dataclasses.fields(EngineArgs)}

        engine_args = EngineArgs.from_cli_args(
            argparse.Namespace(**eng_args_dict))

        input_dir = args.serialized_directory.rstrip('/')
        suffix = args.suffix if args.suffix else uuid.uuid4().hex
        base_path = f"{input_dir}/vllm/{model_ref}/{suffix}"
        if engine_args.tensor_parallel_size > 1:
            model_path = f"{base_path}/model-rank-%03d.tensors"
        else:
            model_path = f"{base_path}/model.tensors"

        tensorizer_config = TensorizerConfig(tensorizer_uri=model_path,
                                             encryption_keyfile=keyfile,
                                             **credentials)

        if args.lora_path:
            tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
            tensorize_lora_adapter(args.lora_path, tensorizer_config)

        tensorize_vllm_model(engine_args, tensorizer_config)

    elif args.command == "deserialize":
        if not tensorizer_args:
            tensorizer_config = TensorizerConfig(
                tensorizer_uri=args.path_to_tensors,
                encryption_keyfile=keyfile,
                **credentials)
        deserialize(args, tensorizer_config)
    else:
        raise ValueError("Either serialize or deserialize must be specified.")


if __name__ == "__main__":
    main()
examples/pyproject.toml
0 → 100644
View file @
25f39502
# This local pyproject file is part of the migration from yapf to ruff format.
# It uses the same core rules as the main pyproject.toml file, but with the
# following differences:
# - ruff line length is overridden to 88
# - deprecated typing ignores (UP006, UP035) have been removed
[tool.ruff]
line-length = 88
exclude = [
    # External file, leaving license intact
    "examples/other/fp8/quantizer/quantize.py",
    "vllm/vllm_flash_attn/flash_attn_interface.pyi"
]

[tool.ruff.lint.per-file-ignores]
"vllm/third_party/**" = ["ALL"]
"vllm/version.py" = ["F401"]
"vllm/_version.py" = ["ALL"]

[tool.ruff.lint]
select = [
    # pycodestyle
    "E",
    # Pyflakes
    "F",
    # pyupgrade
    "UP",
    # flake8-bugbear
    "B",
    # flake8-simplify
    "SIM",
    # isort
    "I",
    # flake8-logging-format
    "G",
]
ignore = [
    # star imports
    "F405", "F403",
    # lambda expression assignment
    "E731",
    # Loop control variable not used within loop body
    "B007",
    # f-string format
    "UP032",
    # Can remove once 3.10+ is the minimum Python version
    "UP007",
]

[tool.ruff.lint.isort]
known-first-party = ["vllm"]

[tool.ruff.format]
docstring-code-format = true
\ No newline at end of file
examples/template_alpaca.jinja
0 → 100644
View file @
25f39502
{{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}
{% for message in messages %}
{% if message['role'] == 'user' %}
### Instruction:
{{ message['content']|trim -}}
{% if not loop.last %}
{% endif %}
{% elif message['role'] == 'assistant' %}
### Response:
{{ message['content']|trim -}}
{% if not loop.last %}
{% endif %}
{% elif message['role'] == 'user_context' %}
### Input:
{{ message['content']|trim -}}
{% if not loop.last %}
{% endif %}
{% endif %}
{% endfor %}
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}
### Response:
{% endif %}
\ No newline at end of file
examples/template_baichuan.jinja
0 → 100644
View file @
25f39502
{{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}
{%- for message in messages -%}
{%- if message['role'] == 'user' -%}
{{- '<reserved_106>' + message['content'] -}}
{%- elif message['role'] == 'assistant' -%}
{{- '<reserved_107>' + message['content'] -}}
{%- endif -%}
{%- endfor -%}
{%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%}
{{- '<reserved_107>' -}}
{% endif %}
\ No newline at end of file
examples/template_chatglm.jinja
0 → 100644
View file @
25f39502
{%- set counter = namespace(index=0) -%}
{%- for message in messages -%}
{%- if message['role'] == 'user' -%}
{{- '[Round ' + counter.index|string + ']\n问:' + message['content'] -}}
{%- set counter.index = counter.index + 1 -%}
{%- endif -%}
{%- if message['role'] == 'assistant' -%}
{{- '\n答:' + message['content'] -}}
{%- if (loop.last and add_generation_prompt) or not loop.last -%}
{{- '\n' -}}
{%- endif -%}
{%- endif -%}
{%- endfor -%}
{%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%}
{{- '\n答:' -}}
{%- endif -%}
\ No newline at end of file