Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
705f6a35
Commit
705f6a35
authored
Jul 16, 2024
by
zhuwenwen
Browse files
Merge tag 'v0.5.2' into v0.5.2-dtk24.04.1
parents
af837396
4cf256ae
Changes
439
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1552 additions
and
234 deletions
+1552
-234
vllm/entrypoints/openai/cli_args.py
vllm/entrypoints/openai/cli_args.py
+28
-4
vllm/entrypoints/openai/protocol.py
vllm/entrypoints/openai/protocol.py
+81
-13
vllm/entrypoints/openai/run_batch.py
vllm/entrypoints/openai/run_batch.py
+21
-16
vllm/entrypoints/openai/serving_chat.py
vllm/entrypoints/openai/serving_chat.py
+95
-72
vllm/entrypoints/openai/serving_completion.py
vllm/entrypoints/openai/serving_completion.py
+72
-20
vllm/entrypoints/openai/serving_embedding.py
vllm/entrypoints/openai/serving_embedding.py
+14
-14
vllm/entrypoints/openai/serving_engine.py
vllm/entrypoints/openai/serving_engine.py
+64
-14
vllm/envs.py
vllm/envs.py
+43
-3
vllm/executor/cpu_executor.py
vllm/executor/cpu_executor.py
+19
-1
vllm/executor/distributed_gpu_executor.py
vllm/executor/distributed_gpu_executor.py
+21
-14
vllm/executor/executor_base.py
vllm/executor/executor_base.py
+52
-6
vllm/executor/gpu_executor.py
vllm/executor/gpu_executor.py
+29
-3
vllm/executor/multiproc_gpu_executor.py
vllm/executor/multiproc_gpu_executor.py
+34
-18
vllm/executor/neuron_executor.py
vllm/executor/neuron_executor.py
+23
-5
vllm/executor/openvino_executor.py
vllm/executor/openvino_executor.py
+179
-0
vllm/executor/ray_gpu_executor.py
vllm/executor/ray_gpu_executor.py
+115
-22
vllm/executor/ray_utils.py
vllm/executor/ray_utils.py
+18
-9
vllm/executor/ray_xpu_executor.py
vllm/executor/ray_xpu_executor.py
+404
-0
vllm/executor/tpu_executor.py
vllm/executor/tpu_executor.py
+139
-0
vllm/executor/xpu_executor.py
vllm/executor/xpu_executor.py
+101
-0
No files found.
Too many changes to show.
To preserve performance only
439 of 439+
files are displayed.
Plain diff
Email patch
vllm/entrypoints/openai/cli_args.py
View file @
705f6a35
...
@@ -9,7 +9,9 @@ import json
...
@@ -9,7 +9,9 @@ import json
import
ssl
import
ssl
from
vllm.engine.arg_utils
import
AsyncEngineArgs
,
nullable_str
from
vllm.engine.arg_utils
import
AsyncEngineArgs
,
nullable_str
from
vllm.entrypoints.openai.serving_engine
import
LoRAModulePath
from
vllm.entrypoints.openai.serving_engine
import
(
LoRAModulePath
,
PromptAdapterPath
)
from
vllm.utils
import
FlexibleArgumentParser
class
LoRAParserAction
(
argparse
.
Action
):
class
LoRAParserAction
(
argparse
.
Action
):
...
@@ -22,9 +24,17 @@ class LoRAParserAction(argparse.Action):
...
@@ -22,9 +24,17 @@ class LoRAParserAction(argparse.Action):
setattr
(
namespace
,
self
.
dest
,
lora_list
)
setattr
(
namespace
,
self
.
dest
,
lora_list
)
def
make_arg_parser
():
class
PromptAdapterParserAction
(
argparse
.
Action
):
parser
=
argparse
.
ArgumentParser
(
description
=
"vLLM OpenAI-Compatible RESTful API server."
)
def
__call__
(
self
,
parser
,
namespace
,
values
,
option_string
=
None
):
adapter_list
=
[]
for
item
in
values
:
name
,
path
=
item
.
split
(
'='
)
adapter_list
.
append
(
PromptAdapterPath
(
name
,
path
))
setattr
(
namespace
,
self
.
dest
,
adapter_list
)
def
make_arg_parser
(
parser
:
FlexibleArgumentParser
)
->
FlexibleArgumentParser
:
parser
.
add_argument
(
"--host"
,
parser
.
add_argument
(
"--host"
,
type
=
nullable_str
,
type
=
nullable_str
,
default
=
None
,
default
=
None
,
...
@@ -64,6 +74,14 @@ def make_arg_parser():
...
@@ -64,6 +74,14 @@ def make_arg_parser():
action
=
LoRAParserAction
,
action
=
LoRAParserAction
,
help
=
"LoRA module configurations in the format name=path. "
help
=
"LoRA module configurations in the format name=path. "
"Multiple modules can be specified."
)
"Multiple modules can be specified."
)
parser
.
add_argument
(
"--prompt-adapters"
,
type
=
nullable_str
,
default
=
None
,
nargs
=
'+'
,
action
=
PromptAdapterParserAction
,
help
=
"Prompt adapter configurations in the format name=path. "
"Multiple adapters can be specified."
)
parser
.
add_argument
(
"--chat-template"
,
parser
.
add_argument
(
"--chat-template"
,
type
=
nullable_str
,
type
=
nullable_str
,
default
=
None
,
default
=
None
,
...
@@ -113,3 +131,9 @@ def make_arg_parser():
...
@@ -113,3 +131,9 @@ def make_arg_parser():
parser
=
AsyncEngineArgs
.
add_cli_args
(
parser
)
parser
=
AsyncEngineArgs
.
add_cli_args
(
parser
)
return
parser
return
parser
def
create_parser_for_docs
()
->
FlexibleArgumentParser
:
parser_for_docs
=
FlexibleArgumentParser
(
prog
=
"-m vllm.entrypoints.openai.api_server"
)
return
make_arg_parser
(
parser_for_docs
)
vllm/entrypoints/openai/protocol.py
View file @
705f6a35
...
@@ -103,7 +103,8 @@ class ResponseFormat(OpenAIBaseModel):
...
@@ -103,7 +103,8 @@ class ResponseFormat(OpenAIBaseModel):
class
StreamOptions
(
OpenAIBaseModel
):
class
StreamOptions
(
OpenAIBaseModel
):
include_usage
:
Optional
[
bool
]
include_usage
:
Optional
[
bool
]
=
True
continuous_usage_stats
:
Optional
[
bool
]
=
True
class
FunctionDefinition
(
OpenAIBaseModel
):
class
FunctionDefinition
(
OpenAIBaseModel
):
...
@@ -190,6 +191,27 @@ class ChatCompletionRequest(OpenAIBaseModel):
...
@@ -190,6 +191,27 @@ class ChatCompletionRequest(OpenAIBaseModel):
"special tokens so this should be set to False (as is the "
"special tokens so this should be set to False (as is the "
"default)."
),
"default)."
),
)
)
documents
:
Optional
[
List
[
Dict
[
str
,
str
]]]
=
Field
(
default
=
None
,
description
=
(
"A list of dicts representing documents that will be accessible to "
"the model if it is performing RAG (retrieval-augmented generation)."
" If the template does not support RAG, this argument will have no "
"effect. We recommend that each document should be a dict containing "
"
\"
title
\"
and
\"
text
\"
keys."
),
)
chat_template
:
Optional
[
str
]
=
Field
(
default
=
None
,
description
=
(
"A Jinja template to use for this conversion. "
"If this is not passed, the model's default chat template will be "
"used instead."
),
)
chat_template_kwargs
:
Optional
[
Dict
[
str
,
Any
]]
=
Field
(
default
=
None
,
description
=
(
"Additional kwargs to pass to the template renderer. "
"Will be accessible by the chat template."
),
)
include_stop_str_in_output
:
Optional
[
bool
]
=
Field
(
include_stop_str_in_output
:
Optional
[
bool
]
=
Field
(
default
=
False
,
default
=
False
,
description
=
(
description
=
(
...
@@ -234,15 +256,22 @@ class ChatCompletionRequest(OpenAIBaseModel):
...
@@ -234,15 +256,22 @@ class ChatCompletionRequest(OpenAIBaseModel):
logits_processors
=
None
logits_processors
=
None
if
self
.
logit_bias
:
if
self
.
logit_bias
:
logit_bias
:
Dict
[
int
,
float
]
=
{}
try
:
for
token_id
,
bias
in
self
.
logit_bias
.
items
():
# Convert token_id to integer before we add to LLMEngine
# Clamp the bias between -100 and 100 per OpenAI API spec
logit_bias
[
int
(
token_id
)]
=
min
(
100
,
max
(
-
100
,
bias
))
except
ValueError
as
exc
:
raise
ValueError
(
f
"Found token_id `
{
token_id
}
` in logit_bias "
f
"but token_id must be an integer or string "
f
"representing an integer"
)
from
exc
def
logit_bias_logits_processor
(
def
logit_bias_logits_processor
(
token_ids
:
List
[
int
],
token_ids
:
List
[
int
],
logits
:
torch
.
Tensor
)
->
torch
.
Tensor
:
logits
:
torch
.
Tensor
)
->
torch
.
Tensor
:
assert
self
.
logit_bias
is
not
None
for
token_id
,
bias
in
logit_bias
.
items
():
for
token_id
,
bias
in
self
.
logit_bias
.
items
():
logits
[
token_id
]
+=
bias
# Clamp the bias between -100 and 100 per OpenAI API spec
bias
=
min
(
100
,
max
(
-
100
,
bias
))
logits
[
int
(
token_id
)]
+=
bias
return
logits
return
logits
logits_processors
=
[
logit_bias_logits_processor
]
logits_processors
=
[
logit_bias_logits_processor
]
...
@@ -419,15 +448,22 @@ class CompletionRequest(OpenAIBaseModel):
...
@@ -419,15 +448,22 @@ class CompletionRequest(OpenAIBaseModel):
logits_processors
=
None
logits_processors
=
None
if
self
.
logit_bias
:
if
self
.
logit_bias
:
logit_bias
:
Dict
[
int
,
float
]
=
{}
try
:
for
token_id
,
bias
in
self
.
logit_bias
.
items
():
# Convert token_id to integer
# Clamp the bias between -100 and 100 per OpenAI API spec
logit_bias
[
int
(
token_id
)]
=
min
(
100
,
max
(
-
100
,
bias
))
except
ValueError
as
exc
:
raise
ValueError
(
f
"Found token_id `
{
token_id
}
` in logit_bias "
f
"but token_id must be an integer or string "
f
"representing an integer"
)
from
exc
def
logit_bias_logits_processor
(
def
logit_bias_logits_processor
(
token_ids
:
List
[
int
],
token_ids
:
List
[
int
],
logits
:
torch
.
Tensor
)
->
torch
.
Tensor
:
logits
:
torch
.
Tensor
)
->
torch
.
Tensor
:
assert
self
.
logit_bias
is
not
None
for
token_id
,
bias
in
logit_bias
.
items
():
for
token_id
,
bias
in
self
.
logit_bias
.
items
():
logits
[
token_id
]
+=
bias
# Clamp the bias between -100 and 100 per OpenAI API spec
bias
=
min
(
100
,
max
(
-
100
,
bias
))
logits
[
int
(
token_id
)]
+=
bias
return
logits
return
logits
logits_processors
=
[
logit_bias_logits_processor
]
logits_processors
=
[
logit_bias_logits_processor
]
...
@@ -566,7 +602,7 @@ class CompletionStreamResponse(OpenAIBaseModel):
...
@@ -566,7 +602,7 @@ class CompletionStreamResponse(OpenAIBaseModel):
class
EmbeddingResponseData
(
BaseModel
):
class
EmbeddingResponseData
(
BaseModel
):
index
:
int
index
:
int
object
:
str
=
"embedding"
object
:
str
=
"embedding"
embedding
:
List
[
float
]
embedding
:
Union
[
List
[
float
]
,
str
]
class
EmbeddingResponse
(
BaseModel
):
class
EmbeddingResponse
(
BaseModel
):
...
@@ -672,6 +708,17 @@ class BatchRequestInput(OpenAIBaseModel):
...
@@ -672,6 +708,17 @@ class BatchRequestInput(OpenAIBaseModel):
body
:
Union
[
ChatCompletionRequest
,
]
body
:
Union
[
ChatCompletionRequest
,
]
class
BatchResponseData
(
OpenAIBaseModel
):
# HTTP status code of the response.
status_code
:
int
=
200
# An unique identifier for the API request.
request_id
:
str
# The body of the response.
body
:
Optional
[
ChatCompletionResponse
]
=
None
class
BatchRequestOutput
(
OpenAIBaseModel
):
class
BatchRequestOutput
(
OpenAIBaseModel
):
"""
"""
The per-line object of the batch output and error files
The per-line object of the batch output and error files
...
@@ -683,8 +730,29 @@ class BatchRequestOutput(OpenAIBaseModel):
...
@@ -683,8 +730,29 @@ class BatchRequestOutput(OpenAIBaseModel):
# inputs.
# inputs.
custom_id
:
str
custom_id
:
str
response
:
Optional
[
ChatCompletion
Response
]
response
:
Optional
[
Batch
Response
Data
]
# For requests that failed with a non-HTTP error, this will contain more
# For requests that failed with a non-HTTP error, this will contain more
# information on the cause of the failure.
# information on the cause of the failure.
error
:
Optional
[
Any
]
error
:
Optional
[
Any
]
class
TokenizeRequest
(
OpenAIBaseModel
):
model
:
str
prompt
:
str
add_special_tokens
:
bool
=
Field
(
default
=
True
)
class
TokenizeResponse
(
OpenAIBaseModel
):
tokens
:
List
[
int
]
count
:
int
max_model_len
:
int
class
DetokenizeRequest
(
OpenAIBaseModel
):
model
:
str
tokens
:
List
[
int
]
class
DetokenizeResponse
(
OpenAIBaseModel
):
prompt
:
str
vllm/entrypoints/openai/run_batch.py
View file @
705f6a35
import
argparse
import
asyncio
import
asyncio
import
sys
from
io
import
StringIO
from
io
import
StringIO
from
typing
import
Awaitable
,
List
import
aiohttp
import
aiohttp
import
vllm
from
vllm.engine.arg_utils
import
AsyncEngineArgs
,
nullable_str
from
vllm.engine.arg_utils
import
AsyncEngineArgs
,
nullable_str
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.entrypoints.openai.protocol
import
(
BatchRequestInput
,
from
vllm.entrypoints.openai.protocol
import
(
BatchRequestInput
,
BatchRequestOutput
,
BatchRequestOutput
,
ChatCompletionResponse
)
BatchResponseData
,
ChatCompletionResponse
,
ErrorResponse
)
from
vllm.entrypoints.openai.serving_chat
import
OpenAIServingChat
from
vllm.entrypoints.openai.serving_chat
import
OpenAIServingChat
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.usage.usage_lib
import
UsageContext
from
vllm.usage.usage_lib
import
UsageContext
from
vllm.utils
import
random_uuid
from
vllm.utils
import
FlexibleArgumentParser
,
random_uuid
from
vllm.version
import
__version__
as
VLLM_VERSION
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
def
parse_args
():
def
parse_args
():
parser
=
argparse
.
ArgumentParser
(
parser
=
Flexible
ArgumentParser
(
description
=
"vLLM OpenAI-Compatible batch runner."
)
description
=
"vLLM OpenAI-Compatible batch runner."
)
parser
.
add_argument
(
parser
.
add_argument
(
"-i"
,
"-i"
,
...
@@ -55,7 +56,7 @@ async def read_file(path_or_url: str) -> str:
...
@@ -55,7 +56,7 @@ async def read_file(path_or_url: str) -> str:
session
.
get
(
path_or_url
)
as
resp
:
session
.
get
(
path_or_url
)
as
resp
:
return
await
resp
.
text
()
return
await
resp
.
text
()
else
:
else
:
with
open
(
path_or_url
,
"r"
)
as
f
:
with
open
(
path_or_url
,
"r"
,
encoding
=
"utf-8"
)
as
f
:
return
f
.
read
()
return
f
.
read
()
...
@@ -68,7 +69,7 @@ async def write_file(path_or_url: str, data: str) -> None:
...
@@ -68,7 +69,7 @@ async def write_file(path_or_url: str, data: str) -> None:
# We should make this async, but as long as this is always run as a
# We should make this async, but as long as this is always run as a
# standalone program, blocking the event loop won't effect performance
# standalone program, blocking the event loop won't effect performance
# in this particular case.
# in this particular case.
with
open
(
path_or_url
,
"w"
)
as
f
:
with
open
(
path_or_url
,
"w"
,
encoding
=
"utf-8"
)
as
f
:
f
.
write
(
data
)
f
.
write
(
data
)
...
@@ -76,20 +77,27 @@ async def run_request(chat_serving: OpenAIServingChat,
...
@@ -76,20 +77,27 @@ async def run_request(chat_serving: OpenAIServingChat,
request
:
BatchRequestInput
)
->
BatchRequestOutput
:
request
:
BatchRequestInput
)
->
BatchRequestOutput
:
chat_request
=
request
.
body
chat_request
=
request
.
body
chat_response
=
await
chat_serving
.
create_chat_completion
(
chat_request
)
chat_response
=
await
chat_serving
.
create_chat_completion
(
chat_request
)
if
isinstance
(
chat_response
,
ChatCompletionResponse
):
if
isinstance
(
chat_response
,
ChatCompletionResponse
):
batch_output
=
BatchRequestOutput
(
batch_output
=
BatchRequestOutput
(
id
=
f
"vllm-
{
random_uuid
()
}
"
,
id
=
f
"vllm-
{
random_uuid
()
}
"
,
custom_id
=
request
.
custom_id
,
custom_id
=
request
.
custom_id
,
response
=
chat_response
,
response
=
BatchResponseData
(
body
=
chat_response
,
request_id
=
f
"vllm-batch-
{
random_uuid
()
}
"
),
error
=
None
,
error
=
None
,
)
)
el
se
:
el
if
isinstance
(
chat_response
,
ErrorResponse
)
:
batch_output
=
BatchRequestOutput
(
batch_output
=
BatchRequestOutput
(
id
=
f
"vllm-
{
random_uuid
()
}
"
,
id
=
f
"vllm-
{
random_uuid
()
}
"
,
custom_id
=
request
.
custom_id
,
custom_id
=
request
.
custom_id
,
response
=
None
,
response
=
BatchResponseData
(
status_code
=
chat_response
.
code
,
request_id
=
f
"vllm-batch-
{
random_uuid
()
}
"
),
error
=
chat_response
,
error
=
chat_response
,
)
)
else
:
raise
ValueError
(
"Request must not be sent in stream mode"
)
return
batch_output
return
batch_output
...
@@ -114,7 +122,7 @@ async def main(args):
...
@@ -114,7 +122,7 @@ async def main(args):
)
)
# Submit all requests in the file to the engine "concurrently".
# Submit all requests in the file to the engine "concurrently".
response_futures
=
[]
response_futures
:
List
[
Awaitable
[
BatchRequestOutput
]]
=
[]
for
request_json
in
(
await
read_file
(
args
.
input_file
)).
strip
().
split
(
"
\n
"
):
for
request_json
in
(
await
read_file
(
args
.
input_file
)).
strip
().
split
(
"
\n
"
):
request
=
BatchRequestInput
.
model_validate_json
(
request_json
)
request
=
BatchRequestInput
.
model_validate_json
(
request_json
)
response_futures
.
append
(
run_request
(
openai_serving_chat
,
request
))
response_futures
.
append
(
run_request
(
openai_serving_chat
,
request
))
...
@@ -128,14 +136,11 @@ async def main(args):
...
@@ -128,14 +136,11 @@ async def main(args):
output_buffer
.
seek
(
0
)
output_buffer
.
seek
(
0
)
await
write_file
(
args
.
output_file
,
output_buffer
.
read
().
strip
())
await
write_file
(
args
.
output_file
,
output_buffer
.
read
().
strip
())
# Temporary workaround for https://github.com/vllm-project/vllm/issues/4789
sys
.
exit
(
0
)
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
args
=
parse_args
()
args
=
parse_args
()
logger
.
info
(
"vLLM API server version %s"
,
vllm
.
__version__
)
logger
.
info
(
"vLLM API server version %s"
,
VLLM_VERSION
)
logger
.
info
(
"args: %s"
,
args
)
logger
.
info
(
"args: %s"
,
args
)
asyncio
.
run
(
main
(
args
))
asyncio
.
run
(
main
(
args
))
vllm/entrypoints/openai/serving_chat.py
View file @
705f6a35
import
codecs
import
codecs
import
time
import
time
from
dataclasses
import
dataclass
,
field
from
dataclasses
import
dataclass
,
field
from
functools
import
cached_property
from
typing
import
(
AsyncGenerator
,
AsyncIterator
,
Awaitable
,
Dict
,
Iterable
,
from
typing
import
(
AsyncGenerator
,
AsyncIterator
,
Awaitable
,
Dict
,
Iterable
,
List
,
Optional
)
List
,
Optional
)
from
typing
import
Sequence
as
GenericSequence
from
typing
import
Sequence
as
GenericSequence
...
@@ -10,7 +11,7 @@ from fastapi import Request
...
@@ -10,7 +11,7 @@ from fastapi import Request
from
openai.types.chat
import
(
ChatCompletionContentPartImageParam
,
from
openai.types.chat
import
(
ChatCompletionContentPartImageParam
,
ChatCompletionContentPartTextParam
)
ChatCompletionContentPartTextParam
)
from
vllm.config
import
ModelConfig
,
VisionLanguageConfig
from
vllm.config
import
ModelConfig
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.entrypoints.openai.protocol
import
(
from
vllm.entrypoints.openai.protocol
import
(
ChatCompletionContentPartParam
,
ChatCompletionLogProb
,
ChatCompletionContentPartParam
,
ChatCompletionLogProb
,
...
@@ -26,11 +27,12 @@ from vllm.inputs import PromptInputs
...
@@ -26,11 +27,12 @@ from vllm.inputs import PromptInputs
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.model_executor.guided_decoding
import
(
from
vllm.model_executor.guided_decoding
import
(
get_guided_decoding_logits_processor
)
get_guided_decoding_logits_processor
)
from
vllm.multimodal.image
import
ImagePixelData
from
vllm.multimodal
import
MultiModalDataDict
from
vllm.multimodal.utils
import
(
async_get_and_parse_image
,
from
vllm.multimodal.utils
import
async_get_and_parse_image
get_full_image_text_prompt
)
from
vllm.outputs
import
RequestOutput
from
vllm.outputs
import
RequestOutput
from
vllm.sequence
import
Logprob
from
vllm.sequence
import
Logprob
from
vllm.tracing
import
(
contains_trace_headers
,
extract_trace_headers
,
log_tracing_disabled_warning
)
from
vllm.utils
import
random_uuid
from
vllm.utils
import
random_uuid
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
...
@@ -45,7 +47,7 @@ class ConversationMessage(TypedDict):
...
@@ -45,7 +47,7 @@ class ConversationMessage(TypedDict):
@
dataclass
(
frozen
=
True
)
@
dataclass
(
frozen
=
True
)
class
ChatMessageParseResult
:
class
ChatMessageParseResult
:
messages
:
List
[
ConversationMessage
]
messages
:
List
[
ConversationMessage
]
image
_futures
:
List
[
Awaitable
[
ImagePixe
lData
]]
=
field
(
mm
_futures
:
List
[
Awaitable
[
MultiModa
lData
Dict
]]
=
field
(
default_factory
=
list
)
default_factory
=
list
)
...
@@ -95,82 +97,85 @@ class OpenAIServingChat(OpenAIServing):
...
@@ -95,82 +97,85 @@ class OpenAIServingChat(OpenAIServing):
logger
.
warning
(
logger
.
warning
(
"No chat template provided. Chat API will not work."
)
"No chat template provided. Chat API will not work."
)
@
cached_property
def
image_token_str
(
self
)
->
Optional
[
str
]:
# TODO: Let user specify how to insert image tokens into prompt
# (similar to chat template)
model_type
=
self
.
model_config
.
hf_config
.
model_type
if
model_type
==
"phi3_v"
:
# Workaround since this token is not defined in the tokenizer
return
"<|image_1|>"
if
model_type
in
(
"blip-2"
,
"chatglm"
,
"fuyu"
,
"minicpmv"
,
"paligemma"
):
# These models do not use image tokens in the prompt
return
None
if
model_type
.
startswith
(
"llava"
):
return
self
.
tokenizer
.
decode
(
self
.
model_config
.
hf_config
.
image_token_index
)
else
:
raise
TypeError
(
"Unknown model type: {model_type}"
)
# TODO: Let user specify how to insert image tokens into prompt
# (similar to chat template)
def
_get_full_image_text_prompt
(
self
,
image_token_str
:
str
,
text_prompt
:
str
)
->
str
:
"""Combine image and text prompts for vision language model"""
# NOTE: For now we assume all model architectures use the same
# image + text prompt format. This may change in the future.
return
f
"
{
image_token_str
}
\n
{
text_prompt
}
"
def
_parse_chat_message_content_parts
(
def
_parse_chat_message_content_parts
(
self
,
self
,
role
:
str
,
role
:
str
,
parts
:
Iterable
[
ChatCompletionContentPartParam
],
parts
:
Iterable
[
ChatCompletionContentPartParam
],
)
->
ChatMessageParseResult
:
)
->
ChatMessageParseResult
:
texts
:
List
[
str
]
=
[]
texts
:
List
[
str
]
=
[]
image_futures
:
List
[
Awaitable
[
ImagePixelData
]]
=
[]
mm_futures
:
List
[
Awaitable
[
MultiModalDataDict
]]
=
[]
vlm_config
:
Optional
[
VisionLanguageConfig
]
=
getattr
(
self
.
engine
.
engine
,
"vision_language_config"
,
None
)
model_config
=
getattr
(
self
.
engine
.
engine
,
"model_config"
,
None
)
for
part
in
parts
:
for
part
in
parts
:
part_type
=
part
[
"type"
]
part_type
=
part
[
"type"
]
if
part_type
==
"text"
:
if
part_type
==
"text"
:
text
=
cast
(
ChatCompletionContentPartTextParam
,
part
)[
"text"
]
text
=
cast
(
ChatCompletionContentPartTextParam
,
part
)[
"text"
]
texts
.
append
(
text
)
texts
.
append
(
text
)
elif
part_type
==
"image_url"
:
elif
part_type
==
"image_url"
:
if
vlm_config
is
None
:
if
len
(
mm_futures
)
>
0
:
raise
ValueError
(
"'image_url' input is not supported as the loaded "
"model is not multimodal."
)
elif
len
(
image_futures
)
==
0
:
assert
self
.
tokenizer
is
not
None
image_url
=
cast
(
ChatCompletionContentPartImageParam
,
part
)[
"image_url"
]
if
image_url
.
get
(
"detail"
,
"auto"
)
!=
"auto"
:
logger
.
warning
(
"'image_url.detail' is currently not supported and "
"will be ignored."
)
image_future
=
async_get_and_parse_image
(
image_url
[
"url"
])
image_futures
.
append
(
image_future
)
else
:
raise
NotImplementedError
(
raise
NotImplementedError
(
"Multiple 'image_url' input is currently not supported."
"Multiple 'image_url' input is currently not supported."
)
)
image_url
=
cast
(
ChatCompletionContentPartImageParam
,
part
)[
"image_url"
]
if
image_url
.
get
(
"detail"
,
"auto"
)
!=
"auto"
:
logger
.
warning
(
"'image_url.detail' is currently not supported and "
"will be ignored."
)
image_future
=
async_get_and_parse_image
(
image_url
[
"url"
])
mm_futures
.
append
(
image_future
)
else
:
else
:
raise
NotImplementedError
(
f
"Unknown part type:
{
part_type
}
"
)
raise
NotImplementedError
(
f
"Unknown part type:
{
part_type
}
"
)
text_prompt
=
"
\n
"
.
join
(
texts
)
text_prompt
=
"
\n
"
.
join
(
texts
)
if
vlm_config
is
not
None
and
len
(
image_futures
):
if
mm_futures
:
image_token_str
=
self
.
image_token_str
(
image_token_prompt
,
if
image_token_str
is
not
None
:
image_token_str
)
=
vlm_config
.
get_image_token_text
(
self
.
tokenizer
)
if
image_token_str
in
text_prompt
:
logger
.
warning
(
# NOTE: If image token string (e.g, <image>) is already present
"Detected image token string in the text prompt. "
# in the text prompt, we assume it follows the same format required
"Skipping prompt formatting."
)
# by the engine.
else
:
if
image_token_str
in
text_prompt
:
text_prompt
=
self
.
_get_full_image_text_prompt
(
logger
.
warning
(
image_token_str
=
image_token_str
,
"Detected image token string in the text prompt. "
text_prompt
=
text_prompt
,
"Skipping prompt formatting."
)
)
messages
=
[
ConversationMessage
(
role
=
role
,
content
=
text_prompt
)
]
else
:
messages
=
[
ConversationMessage
(
role
=
role
,
content
=
text_prompt
)]
full_prompt
=
get_full_image_text_prompt
(
image_prompt
=
image_token_prompt
,
text_prompt
=
text_prompt
,
config
=
model_config
)
messages
=
[
ConversationMessage
(
role
=
role
,
content
=
full_prompt
)
]
else
:
messages
=
[
ConversationMessage
(
role
=
role
,
content
=
text_prompt
)]
return
ChatMessageParseResult
(
messages
=
messages
,
return
ChatMessageParseResult
(
messages
=
messages
,
mm_futures
=
mm_futures
)
image_futures
=
image_futures
)
def
_parse_chat_message_content
(
def
_parse_chat_message_content
(
self
,
self
,
...
@@ -180,10 +185,10 @@ class OpenAIServingChat(OpenAIServing):
...
@@ -180,10 +185,10 @@ class OpenAIServingChat(OpenAIServing):
content
=
message
.
get
(
"content"
)
content
=
message
.
get
(
"content"
)
if
content
is
None
:
if
content
is
None
:
return
ChatMessageParseResult
(
messages
=
[],
image
_futures
=
[])
return
ChatMessageParseResult
(
messages
=
[],
mm
_futures
=
[])
if
isinstance
(
content
,
str
):
if
isinstance
(
content
,
str
):
messages
=
[
ConversationMessage
(
role
=
role
,
content
=
content
)]
messages
=
[
ConversationMessage
(
role
=
role
,
content
=
content
)]
return
ChatMessageParseResult
(
messages
=
messages
,
image
_futures
=
[])
return
ChatMessageParseResult
(
messages
=
messages
,
mm
_futures
=
[])
return
self
.
_parse_chat_message_content_parts
(
role
,
content
)
return
self
.
_parse_chat_message_content_parts
(
role
,
content
)
...
@@ -208,32 +213,41 @@ class OpenAIServingChat(OpenAIServing):
...
@@ -208,32 +213,41 @@ class OpenAIServingChat(OpenAIServing):
try
:
try
:
conversation
:
List
[
ConversationMessage
]
=
[]
conversation
:
List
[
ConversationMessage
]
=
[]
image
_futures
:
List
[
Awaitable
[
ImagePixe
lData
]]
=
[]
mm
_futures
:
List
[
Awaitable
[
MultiModa
lData
Dict
]]
=
[]
for
msg
in
request
.
messages
:
for
msg
in
request
.
messages
:
chat_parsed_result
=
self
.
_parse_chat_message_content
(
msg
)
chat_parsed_result
=
self
.
_parse_chat_message_content
(
msg
)
conversation
.
extend
(
chat_parsed_result
.
messages
)
conversation
.
extend
(
chat_parsed_result
.
messages
)
image_futures
.
extend
(
chat_parsed_result
.
image_futures
)
mm_futures
.
extend
(
chat_parsed_result
.
mm_futures
)
tool_dicts
=
None
if
request
.
tools
is
None
else
[
tool
.
model_dump
()
for
tool
in
request
.
tools
]
prompt
=
self
.
tokenizer
.
apply_chat_template
(
prompt
=
self
.
tokenizer
.
apply_chat_template
(
conversation
=
conversation
,
conversation
=
conversation
,
tokenize
=
False
,
tokenize
=
False
,
add_generation_prompt
=
request
.
add_generation_prompt
,
add_generation_prompt
=
request
.
add_generation_prompt
,
tools
=
tool_dicts
,
documents
=
request
.
documents
,
chat_template
=
request
.
chat_template
,
**
(
request
.
chat_template_kwargs
or
{}),
)
)
except
Exception
as
e
:
except
Exception
as
e
:
logger
.
error
(
"Error in applying chat template from request: %s"
,
e
)
logger
.
error
(
"Error in applying chat template from request: %s"
,
e
)
return
self
.
create_error_response
(
str
(
e
))
return
self
.
create_error_response
(
str
(
e
))
# Fetch image data
mm_data
:
Optional
[
MultiModalDataDict
]
=
None
image_data
:
Optional
[
ImagePixelData
]
=
None
try
:
try
:
if
len
(
image_futures
):
if
len
(
mm_futures
):
# since we support only single image currently
# since we support only single mm data currently
assert
len
(
image_futures
)
==
1
assert
len
(
image_data
=
await
image_futures
[
0
]
mm_futures
)
==
1
,
"Multiple 'image_url' input is currently not supported."
mm_data
=
await
mm_futures
[
0
]
except
Exception
as
e
:
except
Exception
as
e
:
logger
.
error
(
"Error in loading
image
data: %s"
,
e
)
logger
.
error
(
"Error in loading
multi-modal
data: %s"
,
e
)
return
self
.
create_error_response
(
str
(
e
))
return
self
.
create_error_response
(
str
(
e
))
request_id
=
f
"cmpl-
{
random_uuid
()
}
"
request_id
=
f
"cmpl-
{
random_uuid
()
}
"
...
@@ -244,7 +258,7 @@ class OpenAIServingChat(OpenAIServing):
...
@@ -244,7 +258,7 @@ class OpenAIServingChat(OpenAIServing):
prompt
=
prompt
,
prompt
=
prompt
,
add_special_tokens
=
request
.
add_special_tokens
)
add_special_tokens
=
request
.
add_special_tokens
)
sampling_params
=
request
.
to_sampling_params
()
sampling_params
=
request
.
to_sampling_params
()
lora_request
=
self
.
_maybe_get_
lora
(
request
)
_
,
lora_request
=
self
.
_maybe_get_
adapter
(
request
)
decoding_config
=
await
self
.
engine
.
get_decoding_config
()
decoding_config
=
await
self
.
engine
.
get_decoding_config
()
guided_decoding_backend
=
request
.
guided_decoding_backend
\
guided_decoding_backend
=
request
.
guided_decoding_backend
\
or
decoding_config
.
guided_decoding_backend
or
decoding_config
.
guided_decoding_backend
...
@@ -264,14 +278,23 @@ class OpenAIServingChat(OpenAIServing):
...
@@ -264,14 +278,23 @@ class OpenAIServingChat(OpenAIServing):
"prompt"
:
prompt_text
,
"prompt"
:
prompt_text
,
"prompt_token_ids"
:
prompt_ids
,
"prompt_token_ids"
:
prompt_ids
,
}
}
if
image_data
is
not
None
:
if
mm_data
:
inputs
[
"multi_modal_data"
]
=
image_data
inputs
[
"multi_modal_data"
]
=
mm_data
is_tracing_enabled
=
await
self
.
engine
.
is_tracing_enabled
()
trace_headers
=
None
if
is_tracing_enabled
and
raw_request
:
trace_headers
=
extract_trace_headers
(
raw_request
.
headers
)
if
not
is_tracing_enabled
and
raw_request
and
contains_trace_headers
(
raw_request
.
headers
):
log_tracing_disabled_warning
()
result_generator
=
self
.
engine
.
generate
(
result_generator
=
self
.
engine
.
generate
(
inputs
,
inputs
,
sampling_params
,
sampling_params
,
request_id
,
request_id
,
lora_request
,
lora_request
,
trace_headers
=
trace_headers
,
)
)
# Streaming response
# Streaming response
if
request
.
stream
:
if
request
.
stream
:
...
@@ -487,7 +510,7 @@ class OpenAIServingChat(OpenAIServing):
...
@@ -487,7 +510,7 @@ class OpenAIServingChat(OpenAIServing):
final_res
=
res
final_res
=
res
assert
final_res
is
not
None
assert
final_res
is
not
None
choices
=
[]
choices
:
List
[
ChatCompletionResponseChoice
]
=
[]
role
=
self
.
get_chat_request_role
(
request
)
role
=
self
.
get_chat_request_role
(
request
)
for
output
in
final_res
.
outputs
:
for
output
in
final_res
.
outputs
:
...
...
vllm/entrypoints/openai/serving_completion.py
View file @
705f6a35
...
@@ -16,14 +16,21 @@ from vllm.entrypoints.openai.protocol import (CompletionLogProbs,
...
@@ -16,14 +16,21 @@ from vllm.entrypoints.openai.protocol import (CompletionLogProbs,
CompletionResponseChoice
,
CompletionResponseChoice
,
CompletionResponseStreamChoice
,
CompletionResponseStreamChoice
,
CompletionStreamResponse
,
CompletionStreamResponse
,
UsageInfo
)
DetokenizeRequest
,
DetokenizeResponse
,
TokenizeRequest
,
TokenizeResponse
,
UsageInfo
)
# yapf: enable
from
vllm.entrypoints.openai.serving_engine
import
(
LoRAModulePath
,
from
vllm.entrypoints.openai.serving_engine
import
(
LoRAModulePath
,
OpenAIServing
)
OpenAIServing
,
PromptAdapterPath
)
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.model_executor.guided_decoding
import
(
from
vllm.model_executor.guided_decoding
import
(
get_guided_decoding_logits_processor
)
get_guided_decoding_logits_processor
)
from
vllm.outputs
import
RequestOutput
from
vllm.outputs
import
RequestOutput
from
vllm.sequence
import
Logprob
from
vllm.sequence
import
Logprob
from
vllm.tracing
import
(
contains_trace_headers
,
extract_trace_headers
,
log_tracing_disabled_warning
)
from
vllm.utils
import
merge_async_iterators
,
random_uuid
from
vllm.utils
import
merge_async_iterators
,
random_uuid
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
...
@@ -61,11 +68,13 @@ class OpenAIServingCompletion(OpenAIServing):
...
@@ -61,11 +68,13 @@ class OpenAIServingCompletion(OpenAIServing):
def
__init__
(
self
,
engine
:
AsyncLLMEngine
,
model_config
:
ModelConfig
,
def
__init__
(
self
,
engine
:
AsyncLLMEngine
,
model_config
:
ModelConfig
,
served_model_names
:
List
[
str
],
served_model_names
:
List
[
str
],
lora_modules
:
Optional
[
List
[
LoRAModulePath
]]):
lora_modules
:
Optional
[
List
[
LoRAModulePath
]],
prompt_adapters
:
Optional
[
List
[
PromptAdapterPath
]]):
super
().
__init__
(
engine
=
engine
,
super
().
__init__
(
engine
=
engine
,
model_config
=
model_config
,
model_config
=
model_config
,
served_model_names
=
served_model_names
,
served_model_names
=
served_model_names
,
lora_modules
=
lora_modules
)
lora_modules
=
lora_modules
,
prompt_adapters
=
prompt_adapters
)
async
def
create_completion
(
self
,
request
:
CompletionRequest
,
async
def
create_completion
(
self
,
request
:
CompletionRequest
,
raw_request
:
Request
):
raw_request
:
Request
):
...
@@ -95,7 +104,12 @@ class OpenAIServingCompletion(OpenAIServing):
...
@@ -95,7 +104,12 @@ class OpenAIServingCompletion(OpenAIServing):
generators
:
List
[
AsyncIterator
[
RequestOutput
]]
=
[]
generators
:
List
[
AsyncIterator
[
RequestOutput
]]
=
[]
try
:
try
:
sampling_params
=
request
.
to_sampling_params
()
sampling_params
=
request
.
to_sampling_params
()
lora_request
=
self
.
_maybe_get_lora
(
request
)
adapter_type
,
adapter_request
=
self
.
_maybe_get_adapter
(
request
)
lora_request
,
prompt_adapter_request
=
None
,
None
if
adapter_type
==
'LoRA'
:
lora_request
,
prompt_adapter_request
=
adapter_request
,
None
elif
adapter_type
==
'PromptAdapter'
:
lora_request
,
prompt_adapter_request
=
None
,
adapter_request
decoding_config
=
await
self
.
engine
.
get_decoding_config
()
decoding_config
=
await
self
.
engine
.
get_decoding_config
()
guided_decoding_backend
=
request
.
guided_decoding_backend
\
guided_decoding_backend
=
request
.
guided_decoding_backend
\
or
decoding_config
.
guided_decoding_backend
or
decoding_config
.
guided_decoding_backend
...
@@ -125,6 +139,14 @@ class OpenAIServingCompletion(OpenAIServing):
...
@@ -125,6 +139,14 @@ class OpenAIServingCompletion(OpenAIServing):
truncate_prompt_tokens
)
truncate_prompt_tokens
)
prompt_ids
,
prompt_text
=
prompt_formats
prompt_ids
,
prompt_text
=
prompt_formats
is_tracing_enabled
=
await
self
.
engine
.
is_tracing_enabled
()
trace_headers
=
None
if
is_tracing_enabled
:
trace_headers
=
extract_trace_headers
(
raw_request
.
headers
)
if
not
is_tracing_enabled
and
contains_trace_headers
(
raw_request
.
headers
):
log_tracing_disabled_warning
()
generator
=
self
.
engine
.
generate
(
generator
=
self
.
engine
.
generate
(
{
{
"prompt"
:
prompt_text
,
"prompt"
:
prompt_text
,
...
@@ -133,6 +155,8 @@ class OpenAIServingCompletion(OpenAIServing):
...
@@ -133,6 +155,8 @@ class OpenAIServingCompletion(OpenAIServing):
sampling_params
,
sampling_params
,
f
"
{
request_id
}
-
{
i
}
"
,
f
"
{
request_id
}
-
{
i
}
"
,
lora_request
=
lora_request
,
lora_request
=
lora_request
,
prompt_adapter_request
=
prompt_adapter_request
,
trace_headers
=
trace_headers
,
)
)
generators
.
append
(
generator
)
generators
.
append
(
generator
)
...
@@ -256,16 +280,6 @@ class OpenAIServingCompletion(OpenAIServing):
...
@@ -256,16 +280,6 @@ class OpenAIServingCompletion(OpenAIServing):
previous_num_tokens
[
i
]
=
len
(
output
.
token_ids
)
previous_num_tokens
[
i
]
=
len
(
output
.
token_ids
)
finish_reason
=
output
.
finish_reason
finish_reason
=
output
.
finish_reason
stop_reason
=
output
.
stop_reason
stop_reason
=
output
.
stop_reason
if
output
.
finish_reason
is
not
None
:
# return final usage
prompt_tokens
=
len
(
res
.
prompt_token_ids
)
completion_tokens
=
len
(
output
.
token_ids
)
final_usage
=
UsageInfo
(
prompt_tokens
=
prompt_tokens
,
completion_tokens
=
completion_tokens
,
total_tokens
=
prompt_tokens
+
completion_tokens
,
)
else
:
final_usage
=
None
chunk
=
CompletionStreamResponse
(
chunk
=
CompletionStreamResponse
(
id
=
request_id
,
id
=
request_id
,
...
@@ -282,9 +296,21 @@ class OpenAIServingCompletion(OpenAIServing):
...
@@ -282,9 +296,21 @@ class OpenAIServingCompletion(OpenAIServing):
])
])
if
(
request
.
stream_options
if
(
request
.
stream_options
and
request
.
stream_options
.
include_usage
):
and
request
.
stream_options
.
include_usage
):
chunk
.
usage
=
None
if
(
request
.
stream_options
.
continuous_usage_stats
or
output
.
finish_reason
is
not
None
):
prompt_tokens
=
len
(
res
.
prompt_token_ids
)
completion_tokens
=
len
(
output
.
token_ids
)
usage
=
UsageInfo
(
prompt_tokens
=
prompt_tokens
,
completion_tokens
=
completion_tokens
,
total_tokens
=
prompt_tokens
+
completion_tokens
,
)
if
request
.
stream_options
.
continuous_usage_stats
:
chunk
.
usage
=
usage
else
:
chunk
.
usage
=
None
response_json
=
chunk
.
model_dump_json
(
exclude_unset
=
Tru
e
)
response_json
=
chunk
.
model_dump_json
(
exclude_unset
=
Fals
e
)
yield
f
"data:
{
response_json
}
\n\n
"
yield
f
"data:
{
response_json
}
\n\n
"
if
(
request
.
stream_options
if
(
request
.
stream_options
...
@@ -294,10 +320,10 @@ class OpenAIServingCompletion(OpenAIServing):
...
@@ -294,10 +320,10 @@ class OpenAIServingCompletion(OpenAIServing):
created
=
created_time
,
created
=
created_time
,
model
=
model_name
,
model
=
model_name
,
choices
=
[],
choices
=
[],
usage
=
final_
usage
,
usage
=
usage
,
)
)
final_usage_data
=
(
final_usage_chunk
.
model_dump_json
(
final_usage_data
=
(
final_usage_chunk
.
model_dump_json
(
exclude_unset
=
Tru
e
,
exclude_none
=
True
))
exclude_unset
=
Fals
e
,
exclude_none
=
True
))
yield
f
"data:
{
final_usage_data
}
\n\n
"
yield
f
"data:
{
final_usage_data
}
\n\n
"
except
ValueError
as
e
:
except
ValueError
as
e
:
...
@@ -330,7 +356,7 @@ class OpenAIServingCompletion(OpenAIServing):
...
@@ -330,7 +356,7 @@ class OpenAIServingCompletion(OpenAIServing):
out_logprobs
=
prompt_logprobs
out_logprobs
=
prompt_logprobs
output_text
=
prompt_text
output_text
=
prompt_text
elif
request
.
echo
and
request
.
max_tokens
>
0
:
elif
request
.
echo
and
request
.
max_tokens
>
0
:
token_ids
=
prompt_token_ids
+
output
.
token_ids
token_ids
=
prompt_token_ids
+
list
(
output
.
token_ids
)
out_logprobs
=
(
prompt_logprobs
+
output
.
logprobs
out_logprobs
=
(
prompt_logprobs
+
output
.
logprobs
if
request
.
logprobs
is
not
None
else
None
)
if
request
.
logprobs
is
not
None
else
None
)
output_text
=
prompt_text
+
output
.
text
output_text
=
prompt_text
+
output
.
text
...
@@ -431,3 +457,29 @@ class OpenAIServingCompletion(OpenAIServing):
...
@@ -431,3 +457,29 @@ class OpenAIServingCompletion(OpenAIServing):
tokens
=
out_tokens
,
tokens
=
out_tokens
,
top_logprobs
=
out_top_logprobs
,
top_logprobs
=
out_top_logprobs
,
)
)
async
def
create_tokenize
(
self
,
request
:
TokenizeRequest
)
->
TokenizeResponse
:
error_check_ret
=
await
self
.
_check_model
(
request
)
if
error_check_ret
is
not
None
:
return
error_check_ret
(
input_ids
,
input_text
)
=
self
.
_validate_prompt_and_tokenize
(
request
,
prompt
=
request
.
prompt
,
add_special_tokens
=
request
.
add_special_tokens
)
return
TokenizeResponse
(
tokens
=
input_ids
,
count
=
len
(
input_ids
),
max_model_len
=
self
.
max_model_len
)
async
def
create_detokenize
(
self
,
request
:
DetokenizeRequest
)
->
DetokenizeResponse
:
error_check_ret
=
await
self
.
_check_model
(
request
)
if
error_check_ret
is
not
None
:
return
error_check_ret
(
input_ids
,
input_text
)
=
self
.
_validate_prompt_and_tokenize
(
request
,
prompt_ids
=
request
.
tokens
)
return
DetokenizeResponse
(
prompt
=
input_text
)
vllm/entrypoints/openai/serving_embedding.py
View file @
705f6a35
import
base64
import
time
import
time
from
typing
import
AsyncIterator
,
List
,
Optional
,
Tuple
from
typing
import
AsyncIterator
,
List
,
Optional
,
Tuple
import
numpy
as
np
from
fastapi
import
Request
from
fastapi
import
Request
from
vllm.config
import
ModelConfig
from
vllm.config
import
ModelConfig
...
@@ -20,19 +22,18 @@ TypeTokenIDs = List[int]
...
@@ -20,19 +22,18 @@ TypeTokenIDs = List[int]
def
request_output_to_embedding_response
(
def
request_output_to_embedding_response
(
final_res_batch
:
List
[
EmbeddingRequestOutput
],
final_res_batch
:
List
[
EmbeddingRequestOutput
],
request_id
:
str
,
request_id
:
str
,
created_time
:
int
,
model_name
:
str
,
created_time
:
int
,
encoding_format
:
str
)
->
EmbeddingResponse
:
model_name
:
str
,
data
:
List
[
EmbeddingResponseData
]
=
[]
)
->
EmbeddingResponse
:
data
=
[]
num_prompt_tokens
=
0
num_prompt_tokens
=
0
for
idx
,
final_res
in
enumerate
(
final_res_batch
):
for
idx
,
final_res
in
enumerate
(
final_res_batch
):
assert
final_res
is
not
None
assert
final_res
is
not
None
prompt_token_ids
=
final_res
.
prompt_token_ids
prompt_token_ids
=
final_res
.
prompt_token_ids
embedding
=
final_res
.
outputs
.
embedding
embedding_data
=
EmbeddingResponseData
(
if
encoding_format
==
"base64"
:
index
=
idx
,
embedding
=
final_res
.
outputs
.
embedding
)
embedding
=
base64
.
b64encode
(
np
.
array
(
embedding
))
embedding_data
=
EmbeddingResponseData
(
index
=
idx
,
embedding
=
embedding
)
data
.
append
(
embedding_data
)
data
.
append
(
embedding_data
)
num_prompt_tokens
+=
len
(
prompt_token_ids
)
num_prompt_tokens
+=
len
(
prompt_token_ids
)
...
@@ -72,10 +73,8 @@ class OpenAIServingEmbedding(OpenAIServing):
...
@@ -72,10 +73,8 @@ class OpenAIServingEmbedding(OpenAIServing):
if
error_check_ret
is
not
None
:
if
error_check_ret
is
not
None
:
return
error_check_ret
return
error_check_ret
# Return error for unsupported features.
encoding_format
=
(
request
.
encoding_format
if
request
.
encoding_format
==
"base64"
:
if
request
.
encoding_format
else
"float"
)
return
self
.
create_error_response
(
"base64 encoding is not currently supported"
)
if
request
.
dimensions
is
not
None
:
if
request
.
dimensions
is
not
None
:
return
self
.
create_error_response
(
return
self
.
create_error_response
(
"dimensions is currently not supported"
)
"dimensions is currently not supported"
)
...
@@ -129,7 +128,8 @@ class OpenAIServingEmbedding(OpenAIServing):
...
@@ -129,7 +128,8 @@ class OpenAIServingEmbedding(OpenAIServing):
return
self
.
create_error_response
(
"Client disconnected"
)
return
self
.
create_error_response
(
"Client disconnected"
)
final_res_batch
[
i
]
=
res
final_res_batch
[
i
]
=
res
response
=
request_output_to_embedding_response
(
response
=
request_output_to_embedding_response
(
final_res_batch
,
request_id
,
created_time
,
model_name
)
final_res_batch
,
request_id
,
created_time
,
model_name
,
encoding_format
)
except
ValueError
as
e
:
except
ValueError
as
e
:
# TODO: Use a vllm-specific Validation Error
# TODO: Use a vllm-specific Validation Error
return
self
.
create_error_response
(
str
(
e
))
return
self
.
create_error_response
(
str
(
e
))
...
...
vllm/entrypoints/openai/serving_engine.py
View file @
705f6a35
...
@@ -10,17 +10,25 @@ from vllm.config import ModelConfig
...
@@ -10,17 +10,25 @@ from vllm.config import ModelConfig
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.entrypoints.openai.protocol
import
(
ChatCompletionRequest
,
from
vllm.entrypoints.openai.protocol
import
(
ChatCompletionRequest
,
CompletionRequest
,
CompletionRequest
,
DetokenizeRequest
,
EmbeddingRequest
,
ErrorResponse
,
EmbeddingRequest
,
ErrorResponse
,
ModelCard
,
ModelList
,
ModelCard
,
ModelList
,
ModelPermission
)
ModelPermission
,
TokenizeRequest
)
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.request
import
LoRARequest
from
vllm.prompt_adapter.request
import
PromptAdapterRequest
from
vllm.sequence
import
Logprob
from
vllm.sequence
import
Logprob
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
@
dataclass
class
PromptAdapterPath
:
name
:
str
local_path
:
str
@
dataclass
@
dataclass
class
LoRAModulePath
:
class
LoRAModulePath
:
name
:
str
name
:
str
...
@@ -29,12 +37,18 @@ class LoRAModulePath:
...
@@ -29,12 +37,18 @@ class LoRAModulePath:
class
OpenAIServing
:
class
OpenAIServing
:
def
__init__
(
self
,
engine
:
AsyncLLMEngine
,
model_config
:
ModelConfig
,
def
__init__
(
served_model_names
:
List
[
str
],
self
,
lora_modules
:
Optional
[
List
[
LoRAModulePath
]]):
engine
:
AsyncLLMEngine
,
model_config
:
ModelConfig
,
served_model_names
:
List
[
str
],
lora_modules
:
Optional
[
List
[
LoRAModulePath
]],
prompt_adapters
:
Optional
[
List
[
PromptAdapterPath
]]
=
None
,
):
super
().
__init__
()
super
().
__init__
()
self
.
engine
=
engine
self
.
engine
=
engine
self
.
model_config
=
model_config
self
.
max_model_len
=
model_config
.
max_model_len
self
.
max_model_len
=
model_config
.
max_model_len
# A separate tokenizer to map token IDs to strings.
# A separate tokenizer to map token IDs to strings.
...
@@ -47,9 +61,8 @@ class OpenAIServing:
...
@@ -47,9 +61,8 @@ class OpenAIServing:
self
.
served_model_names
=
served_model_names
self
.
served_model_names
=
served_model_names
if
lora_modules
is
None
:
self
.
lora_requests
=
[]
self
.
lora_requests
=
[]
if
lora_modules
is
not
None
:
else
:
self
.
lora_requests
=
[
self
.
lora_requests
=
[
LoRARequest
(
LoRARequest
(
lora_name
=
lora
.
name
,
lora_name
=
lora
.
name
,
...
@@ -58,6 +71,20 @@ class OpenAIServing:
...
@@ -58,6 +71,20 @@ class OpenAIServing:
)
for
i
,
lora
in
enumerate
(
lora_modules
,
start
=
1
)
)
for
i
,
lora
in
enumerate
(
lora_modules
,
start
=
1
)
]
]
self
.
prompt_adapter_requests
=
[]
if
prompt_adapters
is
not
None
:
for
i
,
prompt_adapter
in
enumerate
(
prompt_adapters
,
start
=
1
):
with
open
(
f
"./
{
prompt_adapter
.
local_path
}
"
f
"/adapter_config.json"
)
as
f
:
adapter_config
=
json
.
load
(
f
)
num_virtual_tokens
=
adapter_config
[
"num_virtual_tokens"
]
self
.
prompt_adapter_requests
.
append
(
PromptAdapterRequest
(
prompt_adapter_name
=
prompt_adapter
.
name
,
prompt_adapter_id
=
i
,
prompt_adapter_local_path
=
prompt_adapter
.
local_path
,
prompt_adapter_num_virtual_tokens
=
num_virtual_tokens
))
async
def
show_available_models
(
self
)
->
ModelList
:
async
def
show_available_models
(
self
)
->
ModelList
:
"""Show available models. Right now we only have one model."""
"""Show available models. Right now we only have one model."""
model_cards
=
[
model_cards
=
[
...
@@ -73,7 +100,14 @@ class OpenAIServing:
...
@@ -73,7 +100,14 @@ class OpenAIServing:
permission
=
[
ModelPermission
()])
permission
=
[
ModelPermission
()])
for
lora
in
self
.
lora_requests
for
lora
in
self
.
lora_requests
]
]
prompt_adapter_cards
=
[
ModelCard
(
id
=
prompt_adapter
.
prompt_adapter_name
,
root
=
self
.
served_model_names
[
0
],
permission
=
[
ModelPermission
()])
for
prompt_adapter
in
self
.
prompt_adapter_requests
]
model_cards
.
extend
(
lora_cards
)
model_cards
.
extend
(
lora_cards
)
model_cards
.
extend
(
prompt_adapter_cards
)
return
ModelList
(
data
=
model_cards
)
return
ModelList
(
data
=
model_cards
)
def
create_error_response
(
def
create_error_response
(
...
@@ -99,34 +133,45 @@ class OpenAIServing:
...
@@ -99,34 +133,45 @@ class OpenAIServing:
return
json_str
return
json_str
async
def
_check_model
(
async
def
_check_model
(
self
,
request
:
Union
[
CompletionRequest
,
ChatCompletionRequest
,
self
,
request
:
Union
[
ChatCompletionRequest
,
CompletionRequest
,
EmbeddingRequest
]
DetokenizeRequest
,
EmbeddingRequest
,
TokenizeRequest
]
)
->
Optional
[
ErrorResponse
]:
)
->
Optional
[
ErrorResponse
]:
if
request
.
model
in
self
.
served_model_names
:
if
request
.
model
in
self
.
served_model_names
:
return
None
return
None
if
request
.
model
in
[
lora
.
lora_name
for
lora
in
self
.
lora_requests
]:
if
request
.
model
in
[
lora
.
lora_name
for
lora
in
self
.
lora_requests
]:
return
None
return
None
if
request
.
model
in
[
prompt_adapter
.
prompt_adapter_name
for
prompt_adapter
in
self
.
prompt_adapter_requests
]:
return
None
return
self
.
create_error_response
(
return
self
.
create_error_response
(
message
=
f
"The model `
{
request
.
model
}
` does not exist."
,
message
=
f
"The model `
{
request
.
model
}
` does not exist."
,
err_type
=
"NotFoundError"
,
err_type
=
"NotFoundError"
,
status_code
=
HTTPStatus
.
NOT_FOUND
)
status_code
=
HTTPStatus
.
NOT_FOUND
)
def
_maybe_get_
lora
(
def
_maybe_get_
adapter
(
self
,
request
:
Union
[
CompletionRequest
,
ChatCompletionRequest
,
self
,
request
:
Union
[
CompletionRequest
,
ChatCompletionRequest
,
EmbeddingRequest
]
EmbeddingRequest
]
)
->
Optional
[
LoRARequest
]:
)
->
Tuple
[
Optional
[
str
],
Optional
[
Union
[
LoRARequest
,
PromptAdapterRequest
]]]:
if
request
.
model
in
self
.
served_model_names
:
if
request
.
model
in
self
.
served_model_names
:
return
None
return
None
,
None
for
lora
in
self
.
lora_requests
:
for
lora
in
self
.
lora_requests
:
if
request
.
model
==
lora
.
lora_name
:
if
request
.
model
==
lora
.
lora_name
:
return
lora
return
'LoRA'
,
lora
for
prompt_adapter
in
self
.
prompt_adapter_requests
:
if
request
.
model
==
prompt_adapter
.
prompt_adapter_name
:
return
'PromptAdapter'
,
prompt_adapter
# if _check_model has been called earlier, this will be unreachable
# if _check_model has been called earlier, this will be unreachable
raise
ValueError
(
f
"The model `
{
request
.
model
}
` does not exist."
)
raise
ValueError
(
f
"The model `
{
request
.
model
}
` does not exist."
)
def
_validate_prompt_and_tokenize
(
def
_validate_prompt_and_tokenize
(
self
,
self
,
request
:
Union
[
ChatCompletionRequest
,
CompletionRequest
,
request
:
Union
[
ChatCompletionRequest
,
CompletionRequest
,
EmbeddingRequest
],
DetokenizeRequest
,
EmbeddingRequest
,
TokenizeRequest
],
prompt
:
Optional
[
str
]
=
None
,
prompt
:
Optional
[
str
]
=
None
,
prompt_ids
:
Optional
[
List
[
int
]]
=
None
,
prompt_ids
:
Optional
[
List
[
int
]]
=
None
,
truncate_prompt_tokens
:
Optional
[
Annotated
[
int
,
truncate_prompt_tokens
:
Optional
[
Annotated
[
int
,
...
@@ -174,6 +219,11 @@ class OpenAIServing:
...
@@ -174,6 +219,11 @@ class OpenAIServing:
f
"generation. Please reduce the length of the input."
,
)
f
"generation. Please reduce the length of the input."
,
)
return
input_ids
,
input_text
return
input_ids
,
input_text
# Note: TokenizeRequest and DetokenizeRequest doesn't have max_tokens
# and does not require model context length validation
if
isinstance
(
request
,
(
TokenizeRequest
,
DetokenizeRequest
)):
return
input_ids
,
input_text
if
request
.
max_tokens
is
None
:
if
request
.
max_tokens
is
None
:
if
token_num
>=
self
.
max_model_len
:
if
token_num
>=
self
.
max_model_len
:
raise
ValueError
(
raise
ValueError
(
...
...
vllm/envs.py
View file @
705f6a35
...
@@ -5,6 +5,7 @@ if TYPE_CHECKING:
...
@@ -5,6 +5,7 @@ if TYPE_CHECKING:
VLLM_HOST_IP
:
str
=
""
VLLM_HOST_IP
:
str
=
""
VLLM_PORT
:
Optional
[
int
]
=
None
VLLM_PORT
:
Optional
[
int
]
=
None
VLLM_USE_MODELSCOPE
:
bool
=
False
VLLM_USE_MODELSCOPE
:
bool
=
False
VLLM_RINGBUFFER_WARNING_INTERVAL
:
int
=
60
VLLM_INSTANCE_ID
:
Optional
[
str
]
=
None
VLLM_INSTANCE_ID
:
Optional
[
str
]
=
None
VLLM_NCCL_SO_PATH
:
Optional
[
str
]
=
None
VLLM_NCCL_SO_PATH
:
Optional
[
str
]
=
None
LD_LIBRARY_PATH
:
Optional
[
str
]
=
None
LD_LIBRARY_PATH
:
Optional
[
str
]
=
None
...
@@ -27,14 +28,20 @@ if TYPE_CHECKING:
...
@@ -27,14 +28,20 @@ if TYPE_CHECKING:
VLLM_TRACE_FUNCTION
:
int
=
0
VLLM_TRACE_FUNCTION
:
int
=
0
VLLM_ATTENTION_BACKEND
:
Optional
[
str
]
=
None
VLLM_ATTENTION_BACKEND
:
Optional
[
str
]
=
None
VLLM_CPU_KVCACHE_SPACE
:
int
=
0
VLLM_CPU_KVCACHE_SPACE
:
int
=
0
VLLM_OPENVINO_KVCACHE_SPACE
:
int
=
0
VLLM_OPENVINO_CPU_KV_CACHE_PRECISION
:
Optional
[
str
]
=
None
VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS
:
bool
=
False
VLLM_XLA_CACHE_PATH
:
str
=
"~/.vllm/xla_cache/"
VLLM_FUSED_MOE_CHUNK_SIZE
:
int
=
64
*
1024
VLLM_USE_RAY_COMPILED_DAG
:
bool
=
False
VLLM_USE_RAY_COMPILED_DAG
:
bool
=
False
VLLM_WORKER_MULTIPROC_METHOD
:
str
=
"
spawn
"
VLLM_WORKER_MULTIPROC_METHOD
:
str
=
"
fork
"
VLLM_IMAGE_FETCH_TIMEOUT
:
int
=
5
VLLM_IMAGE_FETCH_TIMEOUT
:
int
=
5
VLLM_TARGET_DEVICE
:
str
=
"cuda"
VLLM_TARGET_DEVICE
:
str
=
"cuda"
MAX_JOBS
:
Optional
[
str
]
=
None
MAX_JOBS
:
Optional
[
str
]
=
None
NVCC_THREADS
:
Optional
[
str
]
=
None
NVCC_THREADS
:
Optional
[
str
]
=
None
VLLM_USE_PRECOMPILED
:
bool
=
False
VLLM_USE_PRECOMPILED
:
bool
=
False
VLLM_INSTALL_PUNICA_KERNELS
:
bool
=
False
VLLM_INSTALL_PUNICA_KERNELS
:
bool
=
False
VLLM_NO_DEPRECATION_WARNING
:
bool
=
False
CMAKE_BUILD_TYPE
:
Optional
[
str
]
=
None
CMAKE_BUILD_TYPE
:
Optional
[
str
]
=
None
VERBOSE
:
bool
=
False
VERBOSE
:
bool
=
False
...
@@ -47,7 +54,8 @@ environment_variables: Dict[str, Callable[[], Any]] = {
...
@@ -47,7 +54,8 @@ environment_variables: Dict[str, Callable[[], Any]] = {
# ================== Installation Time Env Vars ==================
# ================== Installation Time Env Vars ==================
# Target device of vLLM, supporting [cuda (by default), rocm, neuron, cpu]
# Target device of vLLM, supporting [cuda (by default),
# rocm, neuron, cpu, openvino]
"VLLM_TARGET_DEVICE"
:
"VLLM_TARGET_DEVICE"
:
lambda
:
os
.
getenv
(
"VLLM_TARGET_DEVICE"
,
"cuda"
),
lambda
:
os
.
getenv
(
"VLLM_TARGET_DEVICE"
,
"cuda"
),
...
@@ -113,6 +121,10 @@ environment_variables: Dict[str, Callable[[], Any]] = {
...
@@ -113,6 +121,10 @@ environment_variables: Dict[str, Callable[[], Any]] = {
"VLLM_INSTANCE_ID"
:
"VLLM_INSTANCE_ID"
:
lambda
:
os
.
environ
.
get
(
"VLLM_INSTANCE_ID"
,
None
),
lambda
:
os
.
environ
.
get
(
"VLLM_INSTANCE_ID"
,
None
),
# Interval in seconds to log a warning message when the ring buffer is full
"VLLM_RINGBUFFER_WARNING_INTERVAL"
:
lambda
:
int
(
os
.
environ
.
get
(
"VLLM_RINGBUFFER_WARNING_INTERVAL"
,
"60"
)),
# path to cudatoolkit home directory, under which should be bin, include,
# path to cudatoolkit home directory, under which should be bin, include,
# and lib directories.
# and lib directories.
"CUDA_HOME"
:
"CUDA_HOME"
:
...
@@ -194,6 +206,7 @@ environment_variables: Dict[str, Callable[[], Any]] = {
...
@@ -194,6 +206,7 @@ environment_variables: Dict[str, Callable[[], Any]] = {
# - "FLASH_ATTN": use FlashAttention
# - "FLASH_ATTN": use FlashAttention
# - "XFORMERS": use XFormers
# - "XFORMERS": use XFormers
# - "ROCM_FLASH": use ROCmFlashAttention
# - "ROCM_FLASH": use ROCmFlashAttention
# - "FLASHINFER": use flashinfer
"VLLM_ATTENTION_BACKEND"
:
"VLLM_ATTENTION_BACKEND"
:
lambda
:
os
.
getenv
(
"VLLM_ATTENTION_BACKEND"
,
None
),
lambda
:
os
.
getenv
(
"VLLM_ATTENTION_BACKEND"
,
None
),
...
@@ -202,6 +215,22 @@ environment_variables: Dict[str, Callable[[], Any]] = {
...
@@ -202,6 +215,22 @@ environment_variables: Dict[str, Callable[[], Any]] = {
"VLLM_CPU_KVCACHE_SPACE"
:
"VLLM_CPU_KVCACHE_SPACE"
:
lambda
:
int
(
os
.
getenv
(
"VLLM_CPU_KVCACHE_SPACE"
,
"0"
)),
lambda
:
int
(
os
.
getenv
(
"VLLM_CPU_KVCACHE_SPACE"
,
"0"
)),
# OpenVINO key-value cache space
# default is 4GB
"VLLM_OPENVINO_KVCACHE_SPACE"
:
lambda
:
int
(
os
.
getenv
(
"VLLM_OPENVINO_KVCACHE_SPACE"
,
"0"
)),
# OpenVINO KV cache precision
# default is bf16 if natively supported by platform, otherwise f16
# To enable KV cache compression, please, explicitly specify u8
"VLLM_OPENVINO_CPU_KV_CACHE_PRECISION"
:
lambda
:
os
.
getenv
(
"VLLM_OPENVINO_CPU_KV_CACHE_PRECISION"
,
None
),
# Enables weights compression during model export via HF Optimum
# default is False
"VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS"
:
lambda
:
bool
(
os
.
getenv
(
"VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS"
,
False
)),
# If the env var is set, it uses the Ray's compiled DAG API
# If the env var is set, it uses the Ray's compiled DAG API
# which optimizes the control plane overhead.
# which optimizes the control plane overhead.
# Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
# Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
...
@@ -211,12 +240,23 @@ environment_variables: Dict[str, Callable[[], Any]] = {
...
@@ -211,12 +240,23 @@ environment_variables: Dict[str, Callable[[], Any]] = {
# Use dedicated multiprocess context for workers.
# Use dedicated multiprocess context for workers.
# Both spawn and fork work
# Both spawn and fork work
"VLLM_WORKER_MULTIPROC_METHOD"
:
"VLLM_WORKER_MULTIPROC_METHOD"
:
lambda
:
os
.
getenv
(
"VLLM_WORKER_MULTIPROC_METHOD"
,
"
spawn
"
),
lambda
:
os
.
getenv
(
"VLLM_WORKER_MULTIPROC_METHOD"
,
"
fork
"
),
# Timeout for fetching images when serving multimodal models
# Timeout for fetching images when serving multimodal models
# Default is 5 seconds
# Default is 5 seconds
"VLLM_IMAGE_FETCH_TIMEOUT"
:
"VLLM_IMAGE_FETCH_TIMEOUT"
:
lambda
:
int
(
os
.
getenv
(
"VLLM_IMAGE_FETCH_TIMEOUT"
,
"5"
)),
lambda
:
int
(
os
.
getenv
(
"VLLM_IMAGE_FETCH_TIMEOUT"
,
"5"
)),
# Path to the XLA persistent cache directory.
# Only used for XLA devices such as TPUs.
"VLLM_XLA_CACHE_PATH"
:
lambda
:
os
.
getenv
(
"VLLM_XLA_CACHE_PATH"
,
"~/.vllm/xla_cache/"
),
"VLLM_FUSED_MOE_CHUNK_SIZE"
:
lambda
:
int
(
os
.
getenv
(
"VLLM_FUSED_MOE_CHUNK_SIZE"
,
"65536"
)),
# If set, vllm will skip the deprecation warnings.
"VLLM_NO_DEPRECATION_WARNING"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_NO_DEPRECATION_WARNING"
,
"0"
))),
}
}
# end-env-vars-definition
# end-env-vars-definition
...
...
vllm/executor/cpu_executor.py
View file @
705f6a35
...
@@ -7,6 +7,7 @@ from vllm.config import CacheConfig, ModelConfig, SchedulerConfig
...
@@ -7,6 +7,7 @@ from vllm.config import CacheConfig, ModelConfig, SchedulerConfig
from
vllm.executor.executor_base
import
ExecutorAsyncBase
,
ExecutorBase
from
vllm.executor.executor_base
import
ExecutorAsyncBase
,
ExecutorBase
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.request
import
LoRARequest
from
vllm.prompt_adapter.request
import
PromptAdapterRequest
from
vllm.sequence
import
ExecuteModelRequest
,
SamplerOutput
from
vllm.sequence
import
ExecuteModelRequest
,
SamplerOutput
from
vllm.utils
import
(
get_distributed_init_method
,
get_ip
,
get_open_port
,
from
vllm.utils
import
(
get_distributed_init_method
,
get_ip
,
get_open_port
,
make_async
)
make_async
)
...
@@ -46,8 +47,9 @@ class CPUExecutor(ExecutorBase):
...
@@ -46,8 +47,9 @@ class CPUExecutor(ExecutorBase):
rank
=
0
,
rank
=
0
,
distributed_init_method
=
distributed_init_method
,
distributed_init_method
=
distributed_init_method
,
lora_config
=
self
.
lora_config
,
lora_config
=
self
.
lora_config
,
vision_language_config
=
self
.
vision_language
_config
,
multimodal_config
=
self
.
multimodal
_config
,
kv_cache_dtype
=
self
.
cache_config
.
cache_dtype
,
kv_cache_dtype
=
self
.
cache_config
.
cache_dtype
,
prompt_adapter_config
=
self
.
prompt_adapter_config
,
is_driver_worker
=
True
,
is_driver_worker
=
True
,
)
)
self
.
driver_worker
.
init_device
()
self
.
driver_worker
.
init_device
()
...
@@ -84,9 +86,25 @@ class CPUExecutor(ExecutorBase):
...
@@ -84,9 +86,25 @@ class CPUExecutor(ExecutorBase):
def
remove_lora
(
self
,
lora_id
:
int
)
->
bool
:
def
remove_lora
(
self
,
lora_id
:
int
)
->
bool
:
return
self
.
driver_worker
.
remove_lora
(
lora_id
)
return
self
.
driver_worker
.
remove_lora
(
lora_id
)
def
pin_lora
(
self
,
lora_id
:
int
)
->
bool
:
return
self
.
driver_worker
.
pin_lora
(
lora_id
)
def
list_loras
(
self
)
->
Set
[
int
]:
def
list_loras
(
self
)
->
Set
[
int
]:
return
self
.
driver_worker
.
list_loras
()
return
self
.
driver_worker
.
list_loras
()
def
add_prompt_adapter
(
self
,
prompt_adapter_request
:
PromptAdapterRequest
)
->
bool
:
return
self
.
driver_worker
.
add_prompt_adapter
(
prompt_adapter_request
)
def
remove_prompt_adapter
(
self
,
prompt_adapter_id
:
int
)
->
bool
:
return
self
.
driver_worker
.
remove_prompt_adapter
(
prompt_adapter_id
)
def
list_prompt_adapters
(
self
)
->
Set
[
int
]:
return
self
.
driver_worker
.
list_prompt_adapters
()
def
pin_prompt_adapter
(
self
,
prompt_adapter_id
:
int
)
->
bool
:
return
self
.
driver_worker
.
pin_prompt_adapter
(
prompt_adapter_id
)
def
check_health
(
self
)
->
None
:
def
check_health
(
self
)
->
None
:
# CPUExecutor will always be healthy as long as
# CPUExecutor will always be healthy as long as
# it's running.
# it's running.
...
...
vllm/executor/distributed_gpu_executor.py
View file @
705f6a35
...
@@ -64,12 +64,12 @@ class DistributedGPUExecutor(GPUExecutor):
...
@@ -64,12 +64,12 @@ class DistributedGPUExecutor(GPUExecutor):
num_cpu_blocks
=
num_cpu_blocks
)
num_cpu_blocks
=
num_cpu_blocks
)
def
execute_model
(
def
execute_model
(
self
,
self
,
execute_model_req
:
ExecuteModelRequest
execute_model_req
:
ExecuteModelRequest
)
->
List
[
SamplerOutput
]:
)
->
Optional
[
List
[
SamplerOutput
]
]
:
if
self
.
parallel_worker_tasks
is
None
:
if
self
.
parallel_worker_tasks
is
None
:
self
.
parallel_worker_tasks
=
self
.
_run_workers
(
self
.
parallel_worker_tasks
=
self
.
_run_workers
(
"start_worker_execution_loop"
,
"start_worker_execution_loop"
,
async_run_
remote
_workers_only
=
True
,
async_run_
tensor_parallel
_workers_only
=
True
,
**
self
.
extra_execute_model_run_workers_kwargs
)
**
self
.
extra_execute_model_run_workers_kwargs
)
# Only the driver worker returns the sampling results.
# Only the driver worker returns the sampling results.
...
@@ -79,7 +79,7 @@ class DistributedGPUExecutor(GPUExecutor):
...
@@ -79,7 +79,7 @@ class DistributedGPUExecutor(GPUExecutor):
if
self
.
parallel_worker_tasks
is
None
:
if
self
.
parallel_worker_tasks
is
None
:
return
return
self
.
_driver_execute_model
()
self
.
_driver_execute_model
(
execute_model_req
=
None
)
parallel_worker_tasks
=
self
.
parallel_worker_tasks
parallel_worker_tasks
=
self
.
parallel_worker_tasks
self
.
parallel_worker_tasks
=
None
self
.
parallel_worker_tasks
=
None
# Ensure that workers exit model loop cleanly
# Ensure that workers exit model loop cleanly
...
@@ -100,6 +100,13 @@ class DistributedGPUExecutor(GPUExecutor):
...
@@ -100,6 +100,13 @@ class DistributedGPUExecutor(GPUExecutor):
lora_id
=
lora_id
,
lora_id
=
lora_id
,
)
)
def
pin_lora
(
self
,
lora_id
:
int
)
->
bool
:
assert
lora_id
>
0
,
"lora_id must be greater than 0."
return
self
.
_run_workers
(
"pin_lora"
,
lora_id
=
lora_id
,
)
def
list_loras
(
self
)
->
Set
[
int
]:
def
list_loras
(
self
)
->
Set
[
int
]:
return
self
.
_run_workers
(
"list_loras"
)
return
self
.
_run_workers
(
"list_loras"
)
...
@@ -116,13 +123,13 @@ class DistributedGPUExecutor(GPUExecutor):
...
@@ -116,13 +123,13 @@ class DistributedGPUExecutor(GPUExecutor):
@
abstractmethod
@
abstractmethod
def
_driver_execute_model
(
def
_driver_execute_model
(
self
,
self
,
execute_model_req
:
Optional
[
ExecuteModelRequest
]
execute_model_req
:
Optional
[
ExecuteModelRequest
]
=
None
)
->
Optional
[
List
[
SamplerOutput
]]:
)
->
List
[
SamplerOutput
]:
"""Run execute_model in the driver worker.
"""Run execute_model in the driver worker.
Passing None will cause the driver to stop the model execution
Passing None will cause the driver to stop the model execution loop
loop running in each of the remote workers.
running in each of the remote workers. In this case, this method
returns None. Otherwise, this method returns the model output.
"""
"""
raise
NotImplementedError
raise
NotImplementedError
...
@@ -131,17 +138,17 @@ class DistributedGPUExecutor(GPUExecutor):
...
@@ -131,17 +138,17 @@ class DistributedGPUExecutor(GPUExecutor):
self
,
self
,
method
:
str
,
method
:
str
,
*
args
,
*
args
,
async_run_
remote
_workers_only
:
bool
=
False
,
async_run_
tensor_parallel
_workers_only
:
bool
=
False
,
max_concurrent_workers
:
Optional
[
int
]
=
None
,
max_concurrent_workers
:
Optional
[
int
]
=
None
,
**
kwargs
,
**
kwargs
,
)
->
Any
:
)
->
Any
:
"""Runs the given method on all workers.
"""Runs the given method on all workers.
Args:
Args:
async_run_
remote
_workers_only: If True the method will be
run only
async_run_
tensor_parallel
_workers_only: If True the method will be
in the remote workers, not the driver worker.
It will also be
run only
in the remote
TP
workers, not the driver worker.
run asynchronously and return a list of futures
rather than
It will also be
run asynchronously and return a list of futures
blocking on the results.
rather than
blocking on the results.
"""
"""
raise
NotImplementedError
raise
NotImplementedError
...
...
vllm/executor/executor_base.py
View file @
705f6a35
import
asyncio
from
abc
import
ABC
,
abstractmethod
from
abc
import
ABC
,
abstractmethod
from
typing
import
List
,
Optional
,
Set
,
Tuple
from
typing
import
List
,
Optional
,
Set
,
Tuple
from
vllm.config
import
(
CacheConfig
,
DeviceConfig
,
LoadConfig
,
LoRAConfig
,
from
vllm.config
import
(
CacheConfig
,
DeviceConfig
,
LoadConfig
,
LoRAConfig
,
ModelConfig
,
ParallelConfig
,
SchedulerConfig
,
ModelConfig
,
MultiModalConfig
,
ParallelConfig
,
SpeculativeConfig
,
VisionLanguageConfig
)
PromptAdapterConfig
,
SchedulerConfig
,
SpeculativeConfig
)
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.request
import
LoRARequest
from
vllm.prompt_adapter.request
import
PromptAdapterRequest
from
vllm.sequence
import
ExecuteModelRequest
,
SamplerOutput
from
vllm.sequence
import
ExecuteModelRequest
,
SamplerOutput
...
@@ -25,8 +28,9 @@ class ExecutorBase(ABC):
...
@@ -25,8 +28,9 @@ class ExecutorBase(ABC):
device_config
:
DeviceConfig
,
device_config
:
DeviceConfig
,
load_config
:
LoadConfig
,
load_config
:
LoadConfig
,
lora_config
:
Optional
[
LoRAConfig
],
lora_config
:
Optional
[
LoRAConfig
],
vision_language
_config
:
Optional
[
VisionLanguage
Config
],
multimodal
_config
:
Optional
[
MultiModal
Config
],
speculative_config
:
Optional
[
SpeculativeConfig
],
speculative_config
:
Optional
[
SpeculativeConfig
],
prompt_adapter_config
:
Optional
[
PromptAdapterConfig
],
)
->
None
:
)
->
None
:
self
.
model_config
=
model_config
self
.
model_config
=
model_config
self
.
cache_config
=
cache_config
self
.
cache_config
=
cache_config
...
@@ -35,8 +39,9 @@ class ExecutorBase(ABC):
...
@@ -35,8 +39,9 @@ class ExecutorBase(ABC):
self
.
parallel_config
=
parallel_config
self
.
parallel_config
=
parallel_config
self
.
scheduler_config
=
scheduler_config
self
.
scheduler_config
=
scheduler_config
self
.
device_config
=
device_config
self
.
device_config
=
device_config
self
.
vision_language_config
=
vision_language
_config
self
.
multimodal_config
=
multimodal
_config
self
.
speculative_config
=
speculative_config
self
.
speculative_config
=
speculative_config
self
.
prompt_adapter_config
=
prompt_adapter_config
self
.
_init_executor
()
self
.
_init_executor
()
...
@@ -69,8 +74,8 @@ class ExecutorBase(ABC):
...
@@ -69,8 +74,8 @@ class ExecutorBase(ABC):
@
abstractmethod
@
abstractmethod
def
execute_model
(
def
execute_model
(
self
,
self
,
execute_model_req
:
ExecuteModelRequest
execute_model_req
:
ExecuteModelRequest
)
->
List
[
SamplerOutput
]:
)
->
Optional
[
List
[
SamplerOutput
]
]
:
"""Executes at least one model step on the given sequences."""
"""Executes at least one model step on the given sequences."""
raise
NotImplementedError
raise
NotImplementedError
...
@@ -86,10 +91,31 @@ class ExecutorBase(ABC):
...
@@ -86,10 +91,31 @@ class ExecutorBase(ABC):
def
remove_lora
(
self
,
lora_id
:
int
)
->
bool
:
def
remove_lora
(
self
,
lora_id
:
int
)
->
bool
:
raise
NotImplementedError
raise
NotImplementedError
@
abstractmethod
def
pin_lora
(
self
,
lora_id
:
int
)
->
bool
:
raise
NotImplementedError
# type: ignore
@
abstractmethod
@
abstractmethod
def
list_loras
(
self
)
->
Set
[
int
]:
def
list_loras
(
self
)
->
Set
[
int
]:
raise
NotImplementedError
raise
NotImplementedError
@
abstractmethod
def
add_prompt_adapter
(
self
,
prompt_adapter_request
:
PromptAdapterRequest
)
->
bool
:
raise
NotImplementedError
@
abstractmethod
def
remove_prompt_adapter
(
self
,
prompt_adapter_id
:
int
)
->
bool
:
raise
NotImplementedError
@
abstractmethod
def
pin_prompt_adapter
(
self
,
prompt_adapter_id
:
int
)
->
bool
:
raise
NotImplementedError
# type: ignore
@
abstractmethod
def
list_prompt_adapters
(
self
)
->
Set
[
int
]:
raise
NotImplementedError
@
abstractmethod
@
abstractmethod
def
check_health
(
self
)
->
None
:
def
check_health
(
self
)
->
None
:
"""Checks if the executor is healthy. If not, it should raise an
"""Checks if the executor is healthy. If not, it should raise an
...
@@ -106,6 +132,26 @@ class ExecutorBase(ABC):
...
@@ -106,6 +132,26 @@ class ExecutorBase(ABC):
class
ExecutorAsyncBase
(
ExecutorBase
):
class
ExecutorAsyncBase
(
ExecutorBase
):
def
__init__
(
self
,
model_config
:
ModelConfig
,
cache_config
:
CacheConfig
,
parallel_config
:
ParallelConfig
,
scheduler_config
:
SchedulerConfig
,
device_config
:
DeviceConfig
,
load_config
:
LoadConfig
,
lora_config
:
Optional
[
LoRAConfig
],
multimodal_config
:
Optional
[
MultiModalConfig
],
speculative_config
:
Optional
[
SpeculativeConfig
],
prompt_adapter_config
:
Optional
[
PromptAdapterConfig
],
)
->
None
:
self
.
pp_locks
:
Optional
[
List
[
asyncio
.
Lock
]]
=
None
super
().
__init__
(
model_config
,
cache_config
,
parallel_config
,
scheduler_config
,
device_config
,
load_config
,
lora_config
,
multimodal_config
,
speculative_config
,
prompt_adapter_config
)
@
abstractmethod
@
abstractmethod
async
def
execute_model_async
(
async
def
execute_model_async
(
self
,
self
,
...
...
vllm/executor/gpu_executor.py
View file @
705f6a35
...
@@ -3,6 +3,7 @@ from typing import Any, Dict, List, Optional, Set, Tuple, Union
...
@@ -3,6 +3,7 @@ from typing import Any, Dict, List, Optional, Set, Tuple, Union
from
vllm.executor.executor_base
import
ExecutorAsyncBase
,
ExecutorBase
from
vllm.executor.executor_base
import
ExecutorAsyncBase
,
ExecutorBase
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.request
import
LoRARequest
from
vllm.prompt_adapter.request
import
PromptAdapterRequest
from
vllm.sequence
import
ExecuteModelRequest
,
PoolerOutput
,
SamplerOutput
from
vllm.sequence
import
ExecuteModelRequest
,
PoolerOutput
,
SamplerOutput
from
vllm.utils
import
(
get_distributed_init_method
,
get_ip
,
get_open_port
,
from
vllm.utils
import
(
get_distributed_init_method
,
get_ip
,
get_open_port
,
make_async
)
make_async
)
...
@@ -43,9 +44,11 @@ class GPUExecutor(ExecutorBase):
...
@@ -43,9 +44,11 @@ class GPUExecutor(ExecutorBase):
rank
=
rank
,
rank
=
rank
,
distributed_init_method
=
distributed_init_method
,
distributed_init_method
=
distributed_init_method
,
lora_config
=
self
.
lora_config
,
lora_config
=
self
.
lora_config
,
vision_language_config
=
self
.
vision_language
_config
,
multimodal_config
=
self
.
multimodal
_config
,
speculative_config
=
self
.
speculative_config
,
speculative_config
=
self
.
speculative_config
,
is_driver_worker
=
rank
==
0
,
prompt_adapter_config
=
self
.
prompt_adapter_config
,
is_driver_worker
=
(
not
self
.
parallel_config
)
or
(
rank
%
self
.
parallel_config
.
tensor_parallel_size
==
0
),
)
)
def
_create_worker
(
self
,
def
_create_worker
(
self
,
...
@@ -87,7 +90,7 @@ class GPUExecutor(ExecutorBase):
...
@@ -87,7 +90,7 @@ class GPUExecutor(ExecutorBase):
def
execute_model
(
def
execute_model
(
self
,
execute_model_req
:
ExecuteModelRequest
self
,
execute_model_req
:
ExecuteModelRequest
)
->
List
[
Union
[
SamplerOutput
,
PoolerOutput
]]:
)
->
Optional
[
List
[
Union
[
SamplerOutput
,
PoolerOutput
]]
]
:
output
=
self
.
driver_worker
.
execute_model
(
execute_model_req
)
output
=
self
.
driver_worker
.
execute_model
(
execute_model_req
)
return
output
return
output
...
@@ -99,9 +102,32 @@ class GPUExecutor(ExecutorBase):
...
@@ -99,9 +102,32 @@ class GPUExecutor(ExecutorBase):
assert
lora_id
>
0
,
"lora_id must be greater than 0."
assert
lora_id
>
0
,
"lora_id must be greater than 0."
return
self
.
driver_worker
.
remove_lora
(
lora_id
)
return
self
.
driver_worker
.
remove_lora
(
lora_id
)
def
pin_lora
(
self
,
lora_id
:
int
)
->
bool
:
assert
lora_id
>
0
,
"lora_id must be greater than 0."
return
self
.
driver_worker
.
pin_lora
(
lora_id
)
def
list_loras
(
self
)
->
Set
[
int
]:
def
list_loras
(
self
)
->
Set
[
int
]:
return
self
.
driver_worker
.
list_loras
()
return
self
.
driver_worker
.
list_loras
()
def
add_prompt_adapter
(
self
,
prompt_adapter_request
:
PromptAdapterRequest
)
->
bool
:
assert
prompt_adapter_request
.
prompt_adapter_id
>
0
,
\
"prompt_adapter_id must be greater than 0."
return
self
.
driver_worker
.
add_prompt_adapter
(
prompt_adapter_request
)
def
remove_prompt_adapter
(
self
,
prompt_adapter_id
:
int
)
->
bool
:
assert
prompt_adapter_id
>
0
,
\
"prompt_adapter_id must be greater than 0."
return
self
.
driver_worker
.
remove_prompt_adapter
(
prompt_adapter_id
)
def
pin_prompt_adapter
(
self
,
prompt_adapter_id
:
int
)
->
bool
:
assert
prompt_adapter_id
>
0
,
\
"prompt_adapter_id must be greater than 0."
return
self
.
driver_worker
.
pin_prompt_adapter
(
prompt_adapter_id
)
def
list_prompt_adapters
(
self
)
->
Set
[
int
]:
return
self
.
driver_worker
.
list_prompt_adapters
()
def
check_health
(
self
)
->
None
:
def
check_health
(
self
)
->
None
:
# GPUExecutor will always be healthy as long as
# GPUExecutor will always be healthy as long as
# it's running.
# it's running.
...
...
vllm/executor/multiproc_gpu_executor.py
View file @
705f6a35
...
@@ -9,8 +9,12 @@ from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper,
...
@@ -9,8 +9,12 @@ from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper,
ResultHandler
,
WorkerMonitor
)
ResultHandler
,
WorkerMonitor
)
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.sequence
import
ExecuteModelRequest
,
SamplerOutput
from
vllm.sequence
import
ExecuteModelRequest
,
SamplerOutput
from
vllm.utils
import
(
get_distributed_init_method
,
get_ip
,
get_open_port
,
from
vllm.triton_utils
import
maybe_set_triton_cache_manager
get_vllm_instance_id
,
make_async
)
from
vllm.utils
import
(
cuda_device_count_stateless
,
error_on_invalid_device_count_status
,
get_distributed_init_method
,
get_open_port
,
get_vllm_instance_id
,
make_async
,
update_environment_variables
)
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
...
@@ -24,8 +28,9 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
...
@@ -24,8 +28,9 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
# Set CUDA_VISIBLE_DEVICES for the driver, inherited by workers
# Set CUDA_VISIBLE_DEVICES for the driver, inherited by workers
if
"CUDA_VISIBLE_DEVICES"
not
in
os
.
environ
:
if
"CUDA_VISIBLE_DEVICES"
not
in
os
.
environ
:
os
.
environ
[
"CUDA_VISIBLE_DEVICES"
]
=
(
","
.
join
(
update_environment_variables
({
map
(
str
,
range
(
world_size
))))
"CUDA_VISIBLE_DEVICES"
:
(
","
.
join
(
map
(
str
,
range
(
world_size
))))
})
# Ensure that VLLM_INSTANCE_ID is set, to be inherited by workers
# Ensure that VLLM_INSTANCE_ID is set, to be inherited by workers
os
.
environ
[
"VLLM_INSTANCE_ID"
]
=
get_vllm_instance_id
()
os
.
environ
[
"VLLM_INSTANCE_ID"
]
=
get_vllm_instance_id
()
...
@@ -33,12 +38,25 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
...
@@ -33,12 +38,25 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
# Disable torch async compiling which won't work with daemonic processes
# Disable torch async compiling which won't work with daemonic processes
os
.
environ
[
"TORCHINDUCTOR_COMPILE_THREADS"
]
=
"1"
os
.
environ
[
"TORCHINDUCTOR_COMPILE_THREADS"
]
=
"1"
from
torch.cuda
import
device_count
# Set OMP_NUM_THREADS to 1 if it is not set explicitly, avoids CPU
assert
world_size
<=
device_count
(),
(
# contention amongst the shards
if
"OMP_NUM_THREADS"
not
in
os
.
environ
:
os
.
environ
[
"OMP_NUM_THREADS"
]
=
"1"
# workaround for https://github.com/vllm-project/vllm/issues/6103
if
world_size
>
1
:
maybe_set_triton_cache_manager
()
assert
world_size
<=
cuda_device_count_stateless
(),
(
"please set tensor_parallel_size to less than max local gpu count"
)
"please set tensor_parallel_size to less than max local gpu count"
)
error_on_invalid_device_count_status
()
# Multiprocessing-based executor does not support multi-node setting.
# Since it only works for single node, we can use the loopback address
# 127.0.0.1 for communication.
distributed_init_method
=
get_distributed_init_method
(
distributed_init_method
=
get_distributed_init_method
(
get_ip
()
,
get_open_port
())
"127.0.0.1"
,
get_open_port
())
if
world_size
==
1
:
if
world_size
==
1
:
self
.
workers
=
[]
self
.
workers
=
[]
...
@@ -73,32 +91,30 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
...
@@ -73,32 +91,30 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
worker_monitor
.
close
()
worker_monitor
.
close
()
def
_driver_execute_model
(
def
_driver_execute_model
(
self
,
self
,
execute_model_req
:
Optional
[
ExecuteModelRequest
]
execute_model_req
:
Optional
[
ExecuteModelRequest
]
=
None
)
->
Optional
[
List
[
SamplerOutput
]]:
)
->
List
[
SamplerOutput
]:
"""Run execute_model in the driver worker.
"""Run execute_model in the driver worker.
Passing None will cause the driver to stop the model execution
Passing None will cause the driver to stop the model execution
loop running in each of the remote workers.
loop running in each of the remote workers.
"""
"""
return
self
.
driver_worker
.
execute_model
(
return
self
.
driver_worker
.
execute_model
(
execute_model_req
)
execute_model_req
=
execute_model_req
)
def
_run_workers
(
def
_run_workers
(
self
,
self
,
method
:
str
,
method
:
str
,
*
args
,
*
args
,
async_run_
remote
_workers_only
:
bool
=
False
,
async_run_
tensor_parallel
_workers_only
:
bool
=
False
,
max_concurrent_workers
:
Optional
[
int
]
=
None
,
max_concurrent_workers
:
Optional
[
int
]
=
None
,
**
kwargs
,
**
kwargs
,
)
->
Any
:
)
->
Any
:
"""Runs the given method on all workers.
"""Runs the given method on all workers.
Args:
Args:
async_run_
remote
_workers_only: If True the method will be
run only
async_run_
tensor_parallel
_workers_only: If True the method will be
in the remote workers, not the driver worker.
It will also be
run only
in the remote
TP
workers, not the driver worker.
run asynchronously and return a list of futures
rather than
It will also be
run asynchronously and return a list of futures
blocking on the results.
rather than
blocking on the results.
"""
"""
if
max_concurrent_workers
:
if
max_concurrent_workers
:
...
@@ -111,7 +127,7 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
...
@@ -111,7 +127,7 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
for
worker
in
self
.
workers
for
worker
in
self
.
workers
]
]
if
async_run_
remote
_workers_only
:
if
async_run_
tensor_parallel
_workers_only
:
# Just return futures
# Just return futures
return
worker_outputs
return
worker_outputs
...
...
vllm/executor/neuron_executor.py
View file @
705f6a35
...
@@ -48,15 +48,14 @@ class NeuronExecutor(ExecutorBase):
...
@@ -48,15 +48,14 @@ class NeuronExecutor(ExecutorBase):
def
execute_model
(
def
execute_model
(
self
,
self
,
execute_model_req
:
ExecuteModelRequest
)
->
List
[
SamplerOutput
]:
execute_model_req
:
ExecuteModelRequest
)
->
List
[
SamplerOutput
]:
assert
(
execute_model_req
.
blocks_to_swap_in
==
{}
assert
(
not
execute_model_req
.
blocks_to_swap_in
and
execute_model_req
.
blocks_to_swap_out
==
{}
and
not
execute_model_req
.
blocks_to_swap_out
and
execute_model_req
.
blocks_to_copy
==
{}
),
(
and
not
execute_model_req
.
blocks_to_copy
),
(
"Cache operations are not supported for Neuron backend."
)
"Cache operations are not supported for Neuron backend."
)
assert
execute_model_req
.
num_lookahead_slots
==
0
,
(
assert
execute_model_req
.
num_lookahead_slots
==
0
,
(
"lookahead not supported for Neuron backend."
)
"lookahead not supported for Neuron backend."
)
output
=
self
.
driver_worker
.
execute_model
(
output
=
self
.
driver_worker
.
execute_model
(
execute_model_req
)
execute_model_req
.
seq_group_metadata_list
)
return
output
return
output
def
add_lora
(
self
,
lora_request
:
LoRARequest
)
->
bool
:
def
add_lora
(
self
,
lora_request
:
LoRARequest
)
->
bool
:
...
@@ -65,9 +64,28 @@ class NeuronExecutor(ExecutorBase):
...
@@ -65,9 +64,28 @@ class NeuronExecutor(ExecutorBase):
def
remove_lora
(
self
,
lora_id
:
int
)
->
bool
:
def
remove_lora
(
self
,
lora_id
:
int
)
->
bool
:
return
self
.
driver_worker
.
remove_lora
(
lora_id
)
return
self
.
driver_worker
.
remove_lora
(
lora_id
)
def
pin_lora
(
self
,
lora_id
:
int
)
->
bool
:
return
self
.
driver_worker
.
pin_lora
(
lora_id
)
def
list_loras
(
self
)
->
Set
[
int
]:
def
list_loras
(
self
)
->
Set
[
int
]:
return
self
.
driver_worker
.
list_loras
()
return
self
.
driver_worker
.
list_loras
()
def
add_prompt_adapter
(
self
,
prompt_adapter_request
)
->
bool
:
raise
NotImplementedError
(
"Soft prompt is currently not supported by the Neuron backend."
)
def
remove_prompt_adapter
(
self
,
prompt_adapter_id
:
int
)
->
bool
:
raise
NotImplementedError
(
"Soft prompt is currently not supported by the Neuron backend."
)
def
pin_prompt_adapter
(
self
,
prompt_adapter_id
:
int
)
->
bool
:
raise
NotImplementedError
(
"Soft prompt is currently not supported by the Neuron backend."
)
def
list_prompt_adapters
(
self
)
->
Set
[
int
]:
raise
NotImplementedError
(
"Soft prompt is currently not supported by the Neuron backend."
)
def
check_health
(
self
)
->
None
:
def
check_health
(
self
)
->
None
:
# NeuronExecutor will always be healthy as long as
# NeuronExecutor will always be healthy as long as
# it's running.
# it's running.
...
...
vllm/executor/openvino_executor.py
0 → 100644
View file @
705f6a35
from
typing
import
List
,
Set
,
Tuple
import
openvino
as
ov
import
openvino.properties.hint
as
hints
import
torch
import
vllm.envs
as
envs
from
vllm.config
import
CacheConfig
,
ModelConfig
from
vllm.executor.executor_base
import
ExecutorAsyncBase
,
ExecutorBase
from
vllm.logger
import
init_logger
from
vllm.lora.request
import
LoRARequest
from
vllm.sequence
import
ExecuteModelRequest
,
SamplerOutput
from
vllm.utils
import
(
get_distributed_init_method
,
get_ip
,
get_open_port
,
make_async
)
logger
=
init_logger
(
__name__
)
class
OpenVINOExecutor
(
ExecutorBase
):
def
_init_executor
(
self
)
->
None
:
assert
self
.
device_config
.
device_type
==
"openvino"
assert
self
.
lora_config
is
None
,
"OpenVINO backend doesn't support LoRA"
self
.
model_config
=
_verify_and_get_model_config
(
self
.
model_config
)
self
.
cache_config
=
_verify_and_get_cache_config
(
self
.
cache_config
)
# Instantiate the worker and load the model to CPU.
self
.
_init_worker
()
def
_init_worker
(
self
):
from
vllm.worker.openvino_worker
import
OpenVINOWorker
assert
(
self
.
parallel_config
.
world_size
==
1
),
"OpenVINOExecutor only supports single CPU socket currently."
distributed_init_method
=
get_distributed_init_method
(
get_ip
(),
get_open_port
())
self
.
driver_worker
=
OpenVINOWorker
(
model_config
=
self
.
model_config
,
parallel_config
=
self
.
parallel_config
,
scheduler_config
=
self
.
scheduler_config
,
device_config
=
self
.
device_config
,
cache_config
=
self
.
cache_config
,
load_config
=
self
.
load_config
,
local_rank
=
0
,
rank
=
0
,
distributed_init_method
=
distributed_init_method
,
lora_config
=
self
.
lora_config
,
multimodal_config
=
self
.
multimodal_config
,
kv_cache_dtype
=
self
.
cache_config
.
cache_dtype
,
is_driver_worker
=
True
,
)
self
.
driver_worker
.
init_device
()
self
.
driver_worker
.
load_model
()
def
determine_num_available_blocks
(
self
)
->
Tuple
[
int
,
int
]:
"""Determine the number of available KV blocks by invoking the
underlying worker.
"""
return
self
.
driver_worker
.
determine_num_available_blocks
()
def
initialize_cache
(
self
,
num_gpu_blocks
:
int
,
num_cpu_blocks
:
int
)
->
None
:
"""Initialize the KV cache by invoking the underlying worker."""
# NOTE: We log here to avoid multiple logs when number of workers is
# greater than one. We could log in the engine, but not all executors
# have GPUs.
# NOTE: `cpu block` for OpenVINO backend is located on CPU memory but is
# referred as `gpu block`. Because we want to reuse the existing block
# management procedure.
logger
.
info
(
"# CPU blocks: %d"
,
num_gpu_blocks
)
self
.
driver_worker
.
initialize_cache
(
num_gpu_blocks
,
num_cpu_blocks
)
def
execute_model
(
self
,
execute_model_req
:
ExecuteModelRequest
)
->
List
[
SamplerOutput
]:
output
=
self
.
driver_worker
.
execute_model
(
execute_model_req
)
return
output
def
add_lora
(
self
,
lora_request
:
LoRARequest
)
->
bool
:
return
self
.
driver_worker
.
add_lora
(
lora_request
)
def
remove_lora
(
self
,
lora_id
:
int
)
->
bool
:
return
self
.
driver_worker
.
remove_lora
(
lora_id
)
def
pin_lora
(
self
,
lora_id
:
int
)
->
bool
:
return
self
.
driver_worker
.
pin_lora
(
lora_id
)
def
list_loras
(
self
)
->
Set
[
int
]:
return
self
.
driver_worker
.
list_loras
()
def
add_prompt_adapter
(
self
,
prompt_adapter_request
)
->
bool
:
raise
NotImplementedError
(
"Soft prompt is currently not supported by the OPENVINO backend."
)
def
remove_prompt_adapter
(
self
,
prompt_adapter_id
:
int
)
->
bool
:
raise
NotImplementedError
(
"Soft prompt is currently not supported by the OPENVINO backend."
)
def
pin_prompt_adapter
(
self
,
prompt_adapter_id
:
int
)
->
bool
:
raise
NotImplementedError
(
"Soft prompt is currently not supported by the OPENVINO backend."
)
def
list_prompt_adapters
(
self
)
->
Set
[
int
]:
raise
NotImplementedError
(
"Soft prompt is currently not supported by the OPENVINO backend."
)
def
check_health
(
self
)
->
None
:
# OpenVINOExecutor will always be healthy as long as
# it's running.
return
class
OpenVINOExecutorAsync
(
OpenVINOExecutor
,
ExecutorAsyncBase
):
async
def
execute_model_async
(
self
,
execute_model_req
:
ExecuteModelRequest
)
->
List
[
SamplerOutput
]:
output
=
await
make_async
(
self
.
driver_worker
.
execute_model
)(
execute_model_req
=
execute_model_req
,
)
return
output
async
def
check_health_async
(
self
)
->
None
:
# OpenVINOExecutor will always be healthy as long as
# it's running.
return
def
_verify_and_get_model_config
(
config
:
ModelConfig
)
->
ModelConfig
:
if
config
.
dtype
!=
torch
.
float32
:
logger
.
warning
(
f
"Only float32 dtype is supported on OpenVINO, casting from
{
config
.
dtype
}
."
# noqa: G004, E501
)
config
.
dtype
=
torch
.
float32
if
not
config
.
enforce_eager
:
logger
.
warning
(
"CUDA graph is not supported on OpenVINO backend, fallback to the "
"eager mode."
)
config
.
enforce_eager
=
True
return
config
def
_verify_and_get_cache_config
(
config
:
CacheConfig
)
->
CacheConfig
:
if
envs
.
VLLM_OPENVINO_CPU_KV_CACHE_PRECISION
==
"u8"
:
logger
.
info
(
"KV cache type is overried to u8 via "
"VLLM_OPENVINO_CPU_KV_CACHE_PRECISION env var."
)
config
.
cache_dtype
=
ov
.
Type
.
u8
else
:
core
=
ov
.
Core
()
inference_precision
=
core
.
get_property
(
"CPU"
,
hints
.
inference_precision
)
if
inference_precision
==
ov
.
Type
.
bf16
:
config
.
cache_dtype
=
ov
.
Type
.
bf16
else
:
config
.
cache_dtype
=
ov
.
Type
.
f16
if
config
.
block_size
!=
32
:
logger
.
info
(
f
"OpenVINO optimal block size is 32, overriding currently set
{
config
.
block_size
}
"
# noqa: G004, E501
)
config
.
block_size
=
32
kv_cache_space
=
envs
.
VLLM_OPENVINO_KVCACHE_SPACE
if
kv_cache_space
>=
0
:
_GB
=
1
<<
30
if
kv_cache_space
==
0
:
config
.
openvino_kvcache_space_bytes
=
4
*
_GB
# type: ignore
logger
.
warning
(
"Environment variable VLLM_OPENVINO_KVCACHE_SPACE (GB) "
"for OpenVINO backend is not set, using 4 by default."
)
else
:
config
.
openvino_kvcache_space_bytes
=
kv_cache_space
*
_GB
# type: ignore
else
:
raise
RuntimeError
(
"Invalid environment variable VLLM_OPENVINO_KVCACHE_SPACE"
f
"
{
kv_cache_space
}
, expect a positive integer value."
)
return
config
vllm/executor/ray_gpu_executor.py
View file @
705f6a35
...
@@ -11,7 +11,8 @@ from vllm.executor.distributed_gpu_executor import ( # yapf: disable
...
@@ -11,7 +11,8 @@ from vllm.executor.distributed_gpu_executor import ( # yapf: disable
from
vllm.executor.ray_utils
import
RayWorkerWrapper
,
ray
from
vllm.executor.ray_utils
import
RayWorkerWrapper
,
ray
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.sequence
import
ExecuteModelRequest
,
SamplerOutput
from
vllm.sequence
import
ExecuteModelRequest
,
SamplerOutput
from
vllm.utils
import
(
get_distributed_init_method
,
get_ip
,
get_open_port
,
from
vllm.utils
import
(
error_on_invalid_device_count_status
,
get_distributed_init_method
,
get_ip
,
get_open_port
,
get_vllm_instance_id
,
make_async
)
get_vllm_instance_id
,
make_async
)
if
ray
is
not
None
:
if
ray
is
not
None
:
...
@@ -62,7 +63,8 @@ class RayGPUExecutor(DistributedGPUExecutor):
...
@@ -62,7 +63,8 @@ class RayGPUExecutor(DistributedGPUExecutor):
def
_init_workers_ray
(
self
,
placement_group
:
"PlacementGroup"
,
def
_init_workers_ray
(
self
,
placement_group
:
"PlacementGroup"
,
**
ray_remote_kwargs
):
**
ray_remote_kwargs
):
if
self
.
parallel_config
.
tensor_parallel_size
==
1
:
if
(
self
.
parallel_config
.
tensor_parallel_size
==
1
and
self
.
parallel_config
.
pipeline_parallel_size
==
1
):
# For single GPU case, we use a ray worker with constrained memory.
# For single GPU case, we use a ray worker with constrained memory.
num_gpus
=
self
.
cache_config
.
gpu_memory_utilization
num_gpus
=
self
.
cache_config
.
gpu_memory_utilization
else
:
else
:
...
@@ -132,11 +134,38 @@ class RayGPUExecutor(DistributedGPUExecutor):
...
@@ -132,11 +134,38 @@ class RayGPUExecutor(DistributedGPUExecutor):
worker_node_and_gpu_ids
=
self
.
_run_workers
(
"get_node_and_gpu_ids"
,
worker_node_and_gpu_ids
=
self
.
_run_workers
(
"get_node_and_gpu_ids"
,
use_dummy_driver
=
True
)
use_dummy_driver
=
True
)
node_workers
=
defaultdict
(
list
)
# the order in `worker_node_and_gpu_ids` does not necessarily match
node_gpus
=
defaultdict
(
list
)
# the machine boundaries. We need to make sure that workers in the
# same node are assigned consecutive ranks.
for
i
,
(
node_id
,
gpu_ids
)
in
enumerate
(
worker_node_and_gpu_ids
):
# examples:
node_workers
[
node_id
].
append
(
i
)
# [('852a09a13c7503ef126d7c828454c741494b1be33a8627a5206604d9', [0]), ('dfaad7adfdae57a694cc74490db45bd112c9f31243523e43ddc2e7f0', [0]), ('dfaad7adfdae57a694cc74490db45bd112c9f31243523e43ddc2e7f0', [1]), ('dfaad7adfdae57a694cc74490db45bd112c9f31243523e43ddc2e7f0', [2]), ('dfaad7adfdae57a694cc74490db45bd112c9f31243523e43ddc2e7f0', [3]), ('852a09a13c7503ef126d7c828454c741494b1be33a8627a5206604d9', [1]), ('852a09a13c7503ef126d7c828454c741494b1be33a8627a5206604d9', [2]), ('852a09a13c7503ef126d7c828454c741494b1be33a8627a5206604d9', [3])] # noqa
# initialize worker ranks with -1 (unassigned)
worker_ranks
=
[
-
1
for
x
in
worker_node_and_gpu_ids
]
current_rank
=
0
while
-
1
in
worker_ranks
:
# whenever we find an unassigned worker, find the node
index
=
worker_ranks
.
index
(
-
1
)
current_node_id
=
worker_node_and_gpu_ids
[
index
][
0
]
# assign ranks to all workers in the same node
for
i
,
(
node_id
,
_
)
in
enumerate
(
worker_node_and_gpu_ids
):
if
node_id
==
current_node_id
:
worker_ranks
[
i
]
=
current_rank
current_rank
+=
1
# with the above example, worker_ranks will be [0, 4, 5, 6, 7, 1, 2, 3]
node_workers
=
defaultdict
(
list
)
# node id -> list of worker ranks
node_gpus
=
defaultdict
(
list
)
# node id -> list of gpu ids
for
worker_rank
,
(
node_id
,
gpu_ids
)
in
zip
(
worker_ranks
,
worker_node_and_gpu_ids
):
node_workers
[
node_id
].
append
(
worker_rank
)
# `gpu_ids` can be a list of strings or integers.
# convert them to integers for consistency.
# NOTE: gpu_ids can be larger than 9 (e.g. 16 GPUs),
# string sorting is not sufficient.
# see https://github.com/vllm-project/vllm/issues/5590
gpu_ids
=
[
int
(
x
)
for
x
in
gpu_ids
]
node_gpus
[
node_id
].
extend
(
gpu_ids
)
node_gpus
[
node_id
].
extend
(
gpu_ids
)
for
node_id
,
gpu_ids
in
node_gpus
.
items
():
for
node_id
,
gpu_ids
in
node_gpus
.
items
():
node_gpus
[
node_id
]
=
sorted
(
gpu_ids
)
node_gpus
[
node_id
]
=
sorted
(
gpu_ids
)
...
@@ -155,16 +184,29 @@ class RayGPUExecutor(DistributedGPUExecutor):
...
@@ -155,16 +184,29 @@ class RayGPUExecutor(DistributedGPUExecutor):
self
.
_run_workers
(
"update_environment_variables"
,
self
.
_run_workers
(
"update_environment_variables"
,
all_args
=
all_args_to_update_environment_variables
)
all_args
=
all_args_to_update_environment_variables
)
if
len
(
node_gpus
)
==
1
:
# in single node case, we don't need to get the IP address.
# the loopback address is sufficient
# NOTE: a node may have several IP addresses, one for each
# network interface. `get_ip()` might return any of them,
# while they might not work for communication inside the node
# if the network setup is complicated. Using the loopback address
# solves this issue, as it always works for communication inside
# the node.
driver_ip
=
"127.0.0.1"
distributed_init_method
=
get_distributed_init_method
(
distributed_init_method
=
get_distributed_init_method
(
driver_ip
,
get_open_port
())
driver_ip
,
get_open_port
())
error_on_invalid_device_count_status
()
# Initialize the actual workers inside worker wrapper.
# Initialize the actual workers inside worker wrapper.
init_worker_all_kwargs
=
[
init_worker_all_kwargs
=
[
self
.
_get_worker_kwargs
(
self
.
_get_worker_kwargs
(
local_rank
=
node_workers
[
node_id
].
index
(
rank
),
local_rank
=
node_workers
[
node_id
].
index
(
rank
),
rank
=
rank
,
rank
=
rank
,
distributed_init_method
=
distributed_init_method
,
distributed_init_method
=
distributed_init_method
,
)
for
rank
,
(
node_id
,
_
)
in
enumerate
(
worker_node_and_gpu_ids
)
)
for
rank
,
(
node_id
,
_
)
in
zip
(
worker_ranks
,
worker_node_and_gpu_ids
)
]
]
self
.
_run_workers
(
"init_worker"
,
all_kwargs
=
init_worker_all_kwargs
)
self
.
_run_workers
(
"init_worker"
,
all_kwargs
=
init_worker_all_kwargs
)
...
@@ -173,10 +215,26 @@ class RayGPUExecutor(DistributedGPUExecutor):
...
@@ -173,10 +215,26 @@ class RayGPUExecutor(DistributedGPUExecutor):
max_concurrent_workers
=
self
.
parallel_config
.
max_concurrent_workers
=
self
.
parallel_config
.
max_parallel_loading_workers
)
max_parallel_loading_workers
)
# This is the list of workers that are rank 0 of each TP group EXCEPT
# global rank 0. These are the workers that will broadcast to the
# rest of the workers.
self
.
tp_driver_workers
:
List
[
RayWorkerWrapper
]
=
[]
# This is the list of workers that are not drivers and not the first
# worker in a TP group. These are the workers that will be
# broadcasted to.
self
.
non_driver_workers
:
List
[
RayWorkerWrapper
]
=
[]
for
idx
,
rank
in
enumerate
(
worker_ranks
[
1
:]):
# We need to skip the driver worker, which we
# do by skipping worker_ranks[0] which is always 0.
if
rank
%
self
.
parallel_config
.
tensor_parallel_size
==
0
:
self
.
tp_driver_workers
.
append
(
self
.
workers
[
idx
])
else
:
self
.
non_driver_workers
.
append
(
self
.
workers
[
idx
])
def
_driver_execute_model
(
def
_driver_execute_model
(
self
,
self
,
execute_model_req
:
Optional
[
ExecuteModelRequest
]
execute_model_req
:
Optional
[
ExecuteModelRequest
]
=
None
)
->
Optional
[
List
[
SamplerOutput
]]:
)
->
List
[
SamplerOutput
]:
"""Run execute_model in the driver worker.
"""Run execute_model in the driver worker.
Passing None will cause the driver to stop the model execution
Passing None will cause the driver to stop the model execution
...
@@ -189,7 +247,7 @@ class RayGPUExecutor(DistributedGPUExecutor):
...
@@ -189,7 +247,7 @@ class RayGPUExecutor(DistributedGPUExecutor):
self
,
self
,
method
:
str
,
method
:
str
,
*
args
,
*
args
,
async_run_
remote
_workers_only
:
bool
=
False
,
async_run_
tensor_parallel
_workers_only
:
bool
=
False
,
all_args
:
Optional
[
List
[
Tuple
[
Any
,
...]]]
=
None
,
all_args
:
Optional
[
List
[
Tuple
[
Any
,
...]]]
=
None
,
all_kwargs
:
Optional
[
List
[
Dict
[
str
,
Any
]]]
=
None
,
all_kwargs
:
Optional
[
List
[
Dict
[
str
,
Any
]]]
=
None
,
use_dummy_driver
:
bool
=
False
,
use_dummy_driver
:
bool
=
False
,
...
@@ -200,10 +258,11 @@ class RayGPUExecutor(DistributedGPUExecutor):
...
@@ -200,10 +258,11 @@ class RayGPUExecutor(DistributedGPUExecutor):
"""Runs the given method on all workers. Can be used in the following
"""Runs the given method on all workers. Can be used in the following
ways:
ways:
- async_run_remote_workers_only: If True the method will be run only
Args:
in the remote workers, not the driver worker. It will also be
- async_run_tensor_parallel_workers_only: If True the method will be
run asynchronously and return a list of futures rather than blocking
run only in the remote TP workers, not the driver worker.
on the results.
It will also be run asynchronously and return a list of futures
rather than blocking on the results.
- args/kwargs: All workers share the same args/kwargs
- args/kwargs: All workers share the same args/kwargs
- all_args/all_kwargs: args/kwargs for each worker are specified
- all_args/all_kwargs: args/kwargs for each worker are specified
individually
individually
...
@@ -213,7 +272,9 @@ class RayGPUExecutor(DistributedGPUExecutor):
...
@@ -213,7 +272,9 @@ class RayGPUExecutor(DistributedGPUExecutor):
raise
NotImplementedError
(
raise
NotImplementedError
(
"max_concurrent_workers is not supported yet."
)
"max_concurrent_workers is not supported yet."
)
count
=
len
(
self
.
workers
)
count
=
len
(
self
.
workers
)
if
not
\
async_run_tensor_parallel_workers_only
\
else
len
(
self
.
non_driver_workers
)
all_worker_args
=
repeat
(
args
,
count
)
if
all_args
is
None
\
all_worker_args
=
repeat
(
args
,
count
)
if
all_args
is
None
\
else
islice
(
all_args
,
1
,
None
)
else
islice
(
all_args
,
1
,
None
)
all_worker_kwargs
=
repeat
(
kwargs
,
count
)
if
all_kwargs
is
None
\
all_worker_kwargs
=
repeat
(
kwargs
,
count
)
if
all_kwargs
is
None
\
...
@@ -227,14 +288,17 @@ class RayGPUExecutor(DistributedGPUExecutor):
...
@@ -227,14 +288,17 @@ class RayGPUExecutor(DistributedGPUExecutor):
ray_worker_outputs
=
[]
ray_worker_outputs
=
[]
else
:
else
:
# Start the ray workers first.
# Start the ray workers first.
ray_workers
=
self
.
workers
if
async_run_tensor_parallel_workers_only
:
ray_workers
=
self
.
non_driver_workers
ray_worker_outputs
=
[
ray_worker_outputs
=
[
worker
.
execute_method
.
remote
(
method
,
*
worker_args
,
worker
.
execute_method
.
remote
(
method
,
*
worker_args
,
**
worker_kwargs
)
**
worker_kwargs
)
for
(
worker
,
worker_args
,
worker_kwargs
for
(
worker
,
worker_args
,
worker_kwargs
)
in
zip
(
self
.
workers
,
all_worker_args
,
all_worker_kwargs
)
)
in
zip
(
ray_
workers
,
all_worker_args
,
all_worker_kwargs
)
]
]
if
async_run_
remote
_workers_only
:
if
async_run_
tensor_parallel
_workers_only
:
# Just return futures
# Just return futures
return
ray_worker_outputs
return
ray_worker_outputs
...
@@ -304,12 +368,41 @@ class RayGPUExecutorAsync(RayGPUExecutor, DistributedGPUExecutorAsync):
...
@@ -304,12 +368,41 @@ class RayGPUExecutorAsync(RayGPUExecutor, DistributedGPUExecutorAsync):
self
,
self
,
execute_model_req
:
Optional
[
ExecuteModelRequest
]
=
None
execute_model_req
:
Optional
[
ExecuteModelRequest
]
=
None
)
->
List
[
SamplerOutput
]:
)
->
List
[
SamplerOutput
]:
return
await
self
.
driver_exec_method
(
"execute_model"
,
if
self
.
pp_locks
is
None
:
execute_model_req
)
# This locks each pipeline parallel stage so multiple virtual
# engines can't execute on the same stage at the same time
# We create the locks here to avoid creating them in the constructor
# which uses a different asyncio loop.
self
.
pp_locks
=
[
asyncio
.
Lock
()
for
_
in
range
(
self
.
parallel_config
.
pipeline_parallel_size
)
]
async
def
_run_task_with_lock
(
task
,
lock
,
*
args
,
**
kwargs
):
async
with
lock
:
return
await
task
(
*
args
,
**
kwargs
)
tasks
=
[]
tasks
.
append
(
asyncio
.
create_task
(
_run_task_with_lock
(
self
.
driver_exec_method
,
self
.
pp_locks
[
0
],
"execute_model"
,
execute_model_req
)))
for
pp_rank
,
driver_worker
in
enumerate
(
self
.
tp_driver_workers
,
start
=
1
):
tasks
.
append
(
asyncio
.
create_task
(
_run_task_with_lock
(
driver_worker
.
execute_method
.
remote
,
self
.
pp_locks
[
pp_rank
],
"execute_model"
,
execute_model_req
)))
results
=
await
asyncio
.
gather
(
*
tasks
)
# Only the last PP stage has the final results.
return
results
[
-
1
]
async
def
_start_worker_execution_loop
(
self
):
async
def
_start_worker_execution_loop
(
self
):
coros
=
[
coros
=
[
worker
.
execute_method
.
remote
(
"start_worker_execution_loop"
)
worker
.
execute_method
.
remote
(
"start_worker_execution_loop"
)
for
worker
in
self
.
workers
for
worker
in
self
.
non_driver_
workers
]
]
return
await
asyncio
.
gather
(
*
coros
)
return
await
asyncio
.
gather
(
*
coros
)
vllm/executor/ray_utils.py
View file @
705f6a35
...
@@ -3,7 +3,7 @@ from typing import List, Optional, Tuple
...
@@ -3,7 +3,7 @@ from typing import List, Optional, Tuple
from
vllm.config
import
ParallelConfig
from
vllm.config
import
ParallelConfig
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.utils
import
get_ip
,
is_hip
from
vllm.utils
import
get_ip
,
is_hip
,
is_xpu
from
vllm.worker.worker_base
import
WorkerWrapperBase
from
vllm.worker.worker_base
import
WorkerWrapperBase
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
...
@@ -42,14 +42,26 @@ try:
...
@@ -42,14 +42,26 @@ try:
output
=
pickle
.
dumps
(
output
)
output
=
pickle
.
dumps
(
output
)
return
output
return
output
ray_import_err
=
None
except
ImportError
as
e
:
except
ImportError
as
e
:
logger
.
warning
(
"Failed to import Ray with %r. For multi-node inference, "
"please install Ray with `pip install ray`."
,
e
)
ray
=
None
# type: ignore
ray
=
None
# type: ignore
ray_import_err
=
e
RayWorkerWrapper
=
None
# type: ignore
RayWorkerWrapper
=
None
# type: ignore
def
ray_is_available
()
->
bool
:
"""Returns True if Ray is available."""
return
ray
is
not
None
def
assert_ray_available
():
"""Raise an exception if Ray is not available."""
if
ray
is
None
:
raise
ValueError
(
"Failed to import Ray, please install Ray with "
"`pip install ray`."
)
from
ray_import_err
def
initialize_ray_cluster
(
def
initialize_ray_cluster
(
parallel_config
:
ParallelConfig
,
parallel_config
:
ParallelConfig
,
ray_address
:
Optional
[
str
]
=
None
,
ray_address
:
Optional
[
str
]
=
None
,
...
@@ -65,13 +77,10 @@ def initialize_ray_cluster(
...
@@ -65,13 +77,10 @@ def initialize_ray_cluster(
ray_address: The address of the Ray cluster. If None, uses
ray_address: The address of the Ray cluster. If None, uses
the default Ray cluster address.
the default Ray cluster address.
"""
"""
if
ray
is
None
:
assert_ray_available
()
raise
ImportError
(
"Ray is not installed. Please install Ray to use multi-node "
"serving."
)
# Connect to a ray cluster.
# Connect to a ray cluster.
if
is_hip
():
if
is_hip
()
or
is_xpu
()
:
ray
.
init
(
address
=
ray_address
,
ray
.
init
(
address
=
ray_address
,
ignore_reinit_error
=
True
,
ignore_reinit_error
=
True
,
num_gpus
=
parallel_config
.
world_size
)
num_gpus
=
parallel_config
.
world_size
)
...
...
vllm/executor/ray_xpu_executor.py
0 → 100644
View file @
705f6a35
import
asyncio
import
os
import
pickle
from
collections
import
defaultdict
from
itertools
import
islice
,
repeat
from
typing
import
(
TYPE_CHECKING
,
Any
,
Awaitable
,
Dict
,
List
,
Optional
,
Set
,
Tuple
,
Union
)
from
vllm.config
import
(
CacheConfig
,
DeviceConfig
,
LoadConfig
,
LoRAConfig
,
ModelConfig
,
MultiModalConfig
,
ParallelConfig
,
PromptAdapterConfig
,
SchedulerConfig
,
SpeculativeConfig
)
from
vllm.executor.distributed_gpu_executor
import
(
# yapf: disable
DistributedGPUExecutor
,
DistributedGPUExecutorAsync
)
from
vllm.executor.ray_utils
import
RayWorkerWrapper
,
ray
from
vllm.logger
import
init_logger
from
vllm.lora.request
import
LoRARequest
from
vllm.sequence
import
ExecuteModelRequest
,
SamplerOutput
from
vllm.utils
import
(
get_distributed_init_method
,
get_ip
,
get_open_port
,
make_async
)
if
ray
is
not
None
:
from
ray.util.scheduling_strategies
import
PlacementGroupSchedulingStrategy
if
TYPE_CHECKING
:
from
ray.util.placement_group
import
PlacementGroup
logger
=
init_logger
(
__name__
)
# If the env var is set, it uses the Ray's compiled DAG API
# which optimizes the control plane overhead.
# Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
USE_RAY_COMPILED_DAG
=
bool
(
os
.
getenv
(
"VLLM_USE_RAY_COMPILED_DAG"
,
0
))
class
RayXPUExecutor
(
DistributedGPUExecutor
):
def
__init__
(
self
,
model_config
:
ModelConfig
,
cache_config
:
CacheConfig
,
parallel_config
:
ParallelConfig
,
scheduler_config
:
SchedulerConfig
,
device_config
:
DeviceConfig
,
load_config
:
LoadConfig
,
lora_config
:
Optional
[
LoRAConfig
],
multimodal_config
:
Optional
[
MultiModalConfig
],
prompt_adapter_config
:
Optional
[
PromptAdapterConfig
],
speculative_config
:
Optional
[
SpeculativeConfig
],
)
->
None
:
assert
device_config
.
device_type
==
"xpu"
assert
(
not
speculative_config
),
"Speculative decoding not yet supported for XPU backend"
self
.
model_config
=
model_config
self
.
cache_config
=
cache_config
self
.
load_config
=
load_config
self
.
lora_config
=
lora_config
self
.
parallel_config
=
parallel_config
self
.
scheduler_config
=
scheduler_config
self
.
device_config
=
device_config
self
.
multimodal_config
=
multimodal_config
self
.
prompt_adapter_config
=
prompt_adapter_config
placement_group
=
self
.
parallel_config
.
placement_group
# Disable Ray usage stats collection.
ray_usage
=
os
.
environ
.
get
(
"RAY_USAGE_STATS_ENABLED"
,
"0"
)
if
ray_usage
!=
"1"
:
os
.
environ
[
"RAY_USAGE_STATS_ENABLED"
]
=
"0"
# Create the parallel GPU workers.
self
.
_init_workers_ray
(
placement_group
)
# Profile the memory usage and initialize the cache.
self
.
forward_dag
=
None
if
USE_RAY_COMPILED_DAG
:
self
.
forward_dag
=
self
.
_compiled_ray_dag
()
# This is non-None when the execute model loop is running
# in the parallel workers. It's a coroutine in the AsyncLLMEngine case.
self
.
parallel_worker_tasks
:
Optional
[
Union
[
Any
,
Awaitable
[
Any
]]]
=
None
# Updated by implementations that require additional args to be passed
# to the _run_workers execute_model call
self
.
extra_execute_model_run_workers_kwargs
:
Dict
[
str
,
Any
]
=
{}
def
_init_executor
(
self
)
->
None
:
pass
def
determine_num_available_blocks
(
self
)
->
Tuple
[
int
,
int
]:
"""Determine the number of available KV blocks.
This invokes `determine_num_available_blocks` on each worker and takes
the min of the results, guaranteeing that the selected cache sizes are
compatible with all workers.
Returns:
- Tuple[num_gpu_blocks, num_cpu_blocks]
"""
# Get the maximum number of blocks that can be allocated on GPU and CPU.
num_blocks
=
self
.
_run_workers
(
"determine_num_available_blocks"
,
)
# Since we use a shared centralized controller, we take the minimum
# number of blocks across all workers to make sure all the memory
# operators can be applied to all workers.
num_gpu_blocks
=
min
(
b
[
0
]
for
b
in
num_blocks
)
num_cpu_blocks
=
min
(
b
[
1
]
for
b
in
num_blocks
)
return
num_gpu_blocks
,
num_cpu_blocks
def
_init_workers_ray
(
self
,
placement_group
:
"PlacementGroup"
,
**
ray_remote_kwargs
):
if
self
.
parallel_config
.
tensor_parallel_size
==
1
:
# For single GPU case, we use a ray worker with constrained memory.
num_gpus
=
self
.
cache_config
.
gpu_memory_utilization
else
:
# Otherwise, the ray workers are allocated with a full GPU.
num_gpus
=
1
# The driver dummy worker does not actually use any resources.
# It holds the resource for the driver worker.
self
.
driver_dummy_worker
:
Optional
[
RayWorkerWrapper
]
=
None
# The remaining workers are the actual ray actors.
self
.
workers
:
List
[
RayWorkerWrapper
]
=
[]
# Create the workers.
driver_ip
=
get_ip
()
for
bundle_id
,
bundle
in
enumerate
(
placement_group
.
bundle_specs
):
if
not
bundle
.
get
(
"GPU"
,
0
):
continue
scheduling_strategy
=
PlacementGroupSchedulingStrategy
(
placement_group
=
placement_group
,
placement_group_capture_child_tasks
=
True
,
placement_group_bundle_index
=
bundle_id
,
)
worker
=
ray
.
remote
(
num_cpus
=
0
,
num_gpus
=
num_gpus
,
scheduling_strategy
=
scheduling_strategy
,
**
ray_remote_kwargs
,
)(
RayWorkerWrapper
).
remote
(
worker_module_name
=
"vllm.worker.xpu_worker"
,
worker_class_name
=
"XPUWorker"
,
trust_remote_code
=
self
.
model_config
.
trust_remote_code
,
)
worker_ip
=
ray
.
get
(
worker
.
get_node_ip
.
remote
())
if
worker_ip
==
driver_ip
and
self
.
driver_dummy_worker
is
None
:
# If the worker is on the same node as the driver, we use it
# as the resource holder for the driver process.
self
.
driver_dummy_worker
=
worker
self
.
driver_worker
=
RayWorkerWrapper
(
worker_module_name
=
"vllm.worker.xpu_worker"
,
worker_class_name
=
"XPUWorker"
,
trust_remote_code
=
self
.
model_config
.
trust_remote_code
,
)
else
:
# Else, added to the list of workers.
self
.
workers
.
append
(
worker
)
if
self
.
driver_dummy_worker
is
None
:
raise
ValueError
(
"Ray does not allocate any GPUs on the driver node. Consider "
"adjusting the Ray placement group or running the driver on a "
"GPU node."
)
# Get the set of GPU IDs used on each node.
worker_node_and_gpu_ids
=
self
.
_run_workers
(
"get_node_and_gpu_ids"
,
use_dummy_driver
=
True
)
node_workers
=
defaultdict
(
list
)
node_gpus
=
defaultdict
(
list
)
for
i
,
(
node_id
,
gpu_ids
)
in
enumerate
(
worker_node_and_gpu_ids
):
node_workers
[
node_id
].
append
(
i
)
node_gpus
[
node_id
].
extend
(
gpu_ids
)
for
node_id
,
gpu_ids
in
node_gpus
.
items
():
node_gpus
[
node_id
]
=
sorted
(
gpu_ids
)
# TODO: add env var for xpu
distributed_init_method
=
get_distributed_init_method
(
driver_ip
,
get_open_port
())
def
collect_arg_helper_func
(
**
kwargs
):
# avoid writing `{"name": value}` manually
return
kwargs
init_worker_all_kwargs
=
[]
# Initialize the actual workers inside worker wrapper.
for
rank
,
(
node_id
,
_
)
in
enumerate
(
worker_node_and_gpu_ids
,
):
local_rank
=
node_workers
[
node_id
].
index
(
rank
)
init_worker_all_kwargs
.
append
(
collect_arg_helper_func
(
model_config
=
self
.
model_config
,
parallel_config
=
self
.
parallel_config
,
scheduler_config
=
self
.
scheduler_config
,
device_config
=
self
.
device_config
,
cache_config
=
self
.
cache_config
,
load_config
=
self
.
load_config
,
local_rank
=
local_rank
,
rank
=
rank
,
distributed_init_method
=
distributed_init_method
,
lora_config
=
self
.
lora_config
,
multimodal_config
=
self
.
multimodal_config
,
is_driver_worker
=
rank
==
0
,
))
self
.
_run_workers
(
"init_worker"
,
all_kwargs
=
init_worker_all_kwargs
)
self
.
_run_workers
(
"init_device"
)
self
.
_run_workers
(
"load_model"
,
max_concurrent_workers
=
self
.
parallel_config
.
max_parallel_loading_workers
,
)
def
initialize_cache
(
self
,
num_gpu_blocks
:
int
,
num_cpu_blocks
:
int
)
->
None
:
"""Initialize the KV cache in all workers.
"""
# NOTE: We log here to avoid multiple logs when number of workers is
# greater than one. We could log in the engine, but not all executors
# have GPUs.
logger
.
info
(
"# GPU blocks: %d, "
"# CPU blocks: %d"
,
num_gpu_blocks
,
num_cpu_blocks
)
self
.
cache_config
.
num_gpu_blocks
=
num_gpu_blocks
self
.
cache_config
.
num_cpu_blocks
=
num_cpu_blocks
self
.
_run_workers
(
"initialize_cache"
,
num_gpu_blocks
=
num_gpu_blocks
,
num_cpu_blocks
=
num_cpu_blocks
)
def
_driver_execute_model
(
self
,
execute_model_req
:
Optional
[
ExecuteModelRequest
]
=
None
)
->
List
[
SamplerOutput
]:
"""Run execute_model in the driver worker.
Passing None will cause the driver to stop the model execution
loop running in each of the remote workers.
"""
return
self
.
driver_worker
.
execute_method
(
"execute_model"
,
execute_model_req
)
def
add_lora
(
self
,
lora_request
:
LoRARequest
)
->
bool
:
assert
lora_request
.
lora_int_id
>
0
,
"lora_id must be greater than 0."
return
self
.
_run_workers
(
"add_lora"
,
lora_request
=
lora_request
,
)
def
remove_lora
(
self
,
lora_id
:
int
)
->
bool
:
assert
lora_id
>
0
,
"lora_id must be greater than 0."
return
self
.
_run_workers
(
"remove_lora"
,
lora_id
=
lora_id
,
)
def
list_loras
(
self
)
->
Set
[
int
]:
return
self
.
_run_workers
(
"list_loras"
)
def
_run_workers
(
self
,
method
:
str
,
*
args
,
async_run_remote_workers_only
:
bool
=
False
,
all_args
:
Optional
[
List
[
Tuple
[
Any
,
...]]]
=
None
,
all_kwargs
:
Optional
[
List
[
Dict
[
str
,
Any
]]]
=
None
,
use_dummy_driver
:
bool
=
False
,
max_concurrent_workers
:
Optional
[
int
]
=
None
,
use_ray_compiled_dag
:
bool
=
False
,
**
kwargs
,
)
->
Any
:
"""Runs the given method on all workers. Can be used in the following
ways:
- args/kwargs: All workers share the same args/kwargs
- args/kwargs and driver_args/driver_kwargs: Driver worker has
different args
- all_args/all_kwargs: args/kwargs for each worker are specified
individually
"""
if
max_concurrent_workers
:
raise
NotImplementedError
(
"max_concurrent_workers is not supported yet."
)
count
=
len
(
self
.
workers
)
all_worker_args
=
repeat
(
args
,
count
)
if
all_args
is
None
\
else
islice
(
all_args
,
1
,
None
)
all_worker_kwargs
=
repeat
(
kwargs
,
count
)
if
all_kwargs
is
None
\
else
islice
(
all_kwargs
,
1
,
None
)
if
use_ray_compiled_dag
:
# Right now, compiled DAG can only accept a single
# input. TODO(sang): Fix it.
assert
self
.
forward_dag
is
not
None
output_channels
=
self
.
forward_dag
.
execute
(
1
)
else
:
# Start the ray workers first.
ray_worker_outputs
=
[
worker
.
execute_method
.
remote
(
method
,
*
worker_args
,
**
worker_kwargs
)
for
(
worker
,
worker_args
,
worker_kwargs
)
in
zip
(
self
.
workers
,
all_worker_args
,
all_worker_kwargs
)
]
if
async_run_remote_workers_only
:
# Just return futures
return
ray_worker_outputs
driver_args
=
args
if
all_args
is
None
else
all_args
[
0
]
driver_kwargs
=
kwargs
if
all_kwargs
is
None
else
all_kwargs
[
0
]
# Start the driver worker after all the ray workers.
if
not
use_dummy_driver
:
driver_worker_output
=
self
.
driver_worker
.
execute_method
(
method
,
*
driver_args
,
**
driver_kwargs
)
else
:
assert
self
.
driver_dummy_worker
is
not
None
driver_worker_output
=
ray
.
get
(
self
.
driver_dummy_worker
.
execute_method
.
remote
(
method
,
*
driver_args
,
**
driver_kwargs
))
# Get the results of the ray workers.
if
self
.
workers
:
if
use_ray_compiled_dag
:
try
:
ray_worker_outputs
=
[
pickle
.
loads
(
chan
.
begin_read
())
for
chan
in
output_channels
]
finally
:
# Has to call end_read in order to reuse the DAG.
for
chan
in
output_channels
:
chan
.
end_read
()
else
:
ray_worker_outputs
=
ray
.
get
(
ray_worker_outputs
)
return
[
driver_worker_output
]
+
ray_worker_outputs
def
_wait_for_tasks_completion
(
self
,
parallel_worker_tasks
:
Any
)
->
None
:
"""Wait for futures returned from _run_workers() with
async_run_remote_workers_only to complete."""
ray
.
get
(
parallel_worker_tasks
)
def
_compiled_ray_dag
(
self
):
import
pkg_resources
required_version
=
"2.9"
current_version
=
pkg_resources
.
get_distribution
(
"ray"
).
version
if
current_version
<
required_version
:
raise
ValueError
(
f
"Ray version
{
required_version
}
or greater is "
f
"required, but found
{
current_version
}
"
)
from
ray.dag
import
InputNode
,
MultiOutputNode
assert
self
.
parallel_config
.
worker_use_ray
# Right now, compiled DAG requires at least 1 arg. We send
# a dummy value for now. It will be fixed soon.
with
InputNode
()
as
input_data
:
forward_dag
=
MultiOutputNode
([
worker
.
execute_model_compiled_dag_remote
.
bind
(
# type: ignore[attr-defined]
input_data
)
for
worker
in
self
.
workers
])
return
forward_dag
.
experimental_compile
()
def
check_health
(
self
)
->
None
:
"""Raises an error if engine is unhealthy."""
self
.
_check_if_any_actor_is_dead
()
def
_check_if_any_actor_is_dead
(
self
):
if
not
self
.
workers
:
return
dead_actors
=
[]
for
actor
in
self
.
workers
:
actor_state
=
ray
.
state
.
actors
(
actor
.
_ray_actor_id
.
hex
())
# pylint: disable=protected-access
if
actor_state
[
"State"
]
==
"DEAD"
:
dead_actors
.
append
(
actor
)
if
dead_actors
:
raise
RuntimeError
(
"At least one Worker is dead. "
f
"Dead Workers:
{
dead_actors
}
. "
)
class
RayXPUExecutorAsync
(
RayXPUExecutor
,
DistributedGPUExecutorAsync
):
def
__init__
(
self
,
*
args
,
**
kwargs
):
super
().
__init__
(
*
args
,
**
kwargs
)
self
.
driver_exec_method
=
make_async
(
self
.
driver_worker
.
execute_method
)
async
def
_driver_execute_model_async
(
self
,
execute_model_req
:
Optional
[
ExecuteModelRequest
]
=
None
)
->
List
[
SamplerOutput
]:
return
await
self
.
driver_exec_method
(
"execute_model"
,
execute_model_req
)
async
def
_start_worker_execution_loop
(
self
):
coros
=
[
worker
.
execute_method
.
remote
(
"start_worker_execution_loop"
)
for
worker
in
self
.
workers
]
return
await
asyncio
.
gather
(
*
coros
)
vllm/executor/tpu_executor.py
0 → 100644
View file @
705f6a35
from
typing
import
Any
,
Dict
,
List
,
Optional
,
Set
,
Tuple
import
torch
from
vllm.executor.executor_base
import
ExecutorAsyncBase
,
ExecutorBase
from
vllm.logger
import
init_logger
from
vllm.lora.request
import
LoRARequest
from
vllm.sequence
import
ExecuteModelRequest
,
SamplerOutput
from
vllm.utils
import
(
get_distributed_init_method
,
get_ip
,
get_open_port
,
make_async
)
logger
=
init_logger
(
__name__
)
class
TPUExecutor
(
ExecutorBase
):
def
_init_executor
(
self
)
->
None
:
assert
not
self
.
scheduler_config
.
chunked_prefill_enabled
,
(
"Chunked prefill is not yet supported for TPU backend"
)
assert
not
self
.
speculative_config
,
(
"Speculative decoding is not yet supported for TPU backend"
)
if
self
.
model_config
.
dtype
in
(
torch
.
float16
,
torch
.
float32
):
logger
.
warning
(
"The TPU backend currently does not support %s. "
"Using bfloat16 instead."
,
self
.
model_config
.
dtype
)
self
.
model_config
.
dtype
=
torch
.
bfloat16
# Instantiate the worker and load the model to the device.
self
.
driver_worker
=
self
.
_create_worker
()
self
.
driver_worker
.
init_device
()
self
.
driver_worker
.
load_model
()
def
_get_worker_kwargs
(
self
,
local_rank
:
int
=
0
,
rank
:
int
=
0
,
distributed_init_method
:
Optional
[
str
]
=
None
,
)
->
Dict
[
str
,
Any
]:
"""Return worker init args for a given rank."""
if
distributed_init_method
is
None
:
distributed_init_method
=
get_distributed_init_method
(
get_ip
(),
get_open_port
())
return
dict
(
model_config
=
self
.
model_config
,
parallel_config
=
self
.
parallel_config
,
scheduler_config
=
self
.
scheduler_config
,
device_config
=
self
.
device_config
,
cache_config
=
self
.
cache_config
,
load_config
=
self
.
load_config
,
local_rank
=
local_rank
,
rank
=
rank
,
distributed_init_method
=
distributed_init_method
,
multimodal_config
=
self
.
multimodal_config
,
is_driver_worker
=
rank
==
0
,
)
def
_create_worker
(
self
,
local_rank
:
int
=
0
,
rank
:
int
=
0
,
distributed_init_method
:
Optional
[
str
]
=
None
,
):
from
vllm.worker.tpu_worker
import
TPUWorker
worker
=
TPUWorker
(
**
self
.
_get_worker_kwargs
(
local_rank
,
rank
,
distributed_init_method
))
return
worker
def
initialize_cache
(
self
,
num_gpu_blocks
:
int
,
num_cpu_blocks
:
int
,
)
->
None
:
"""Initialize the KV cache by invoking the underlying worker."""
# NOTE: This is logged in the executor because there can be >1 worker
# with other executors. We could log in the engine level, but work
# remains to abstract away the device for non-GPU configurations.
logger
.
info
(
"# TPU blocks: %d, # CPU blocks: %d"
,
num_gpu_blocks
,
num_cpu_blocks
)
self
.
driver_worker
.
initialize_cache
(
num_gpu_blocks
,
num_cpu_blocks
)
def
determine_num_available_blocks
(
self
)
->
Tuple
[
int
,
int
]:
"""Determine the number of available KV blocks by invoking the
underlying worker."""
return
self
.
driver_worker
.
determine_num_available_blocks
()
def
execute_model
(
self
,
execute_model_req
:
ExecuteModelRequest
,
)
->
List
[
SamplerOutput
]:
output
=
self
.
driver_worker
.
execute_model
(
execute_model_req
)
return
output
def
add_lora
(
self
,
lora_request
:
LoRARequest
)
->
bool
:
raise
NotImplementedError
(
"LoRA is currently not supported by the TPU backend."
)
def
remove_lora
(
self
,
lora_id
:
int
)
->
bool
:
raise
NotImplementedError
(
"LoRA is currently not supported by the TPU backend."
)
def
pin_lora
(
self
,
lora_id
:
int
)
->
bool
:
raise
NotImplementedError
(
"LoRA is currently not supported by the TPU backend."
)
def
list_loras
(
self
)
->
Set
[
int
]:
raise
NotImplementedError
(
"LoRA is currently not supported by the TPU backend."
)
def
add_prompt_adapter
(
self
,
prompt_adapter_request
)
->
bool
:
raise
NotImplementedError
(
"Soft prompt is currently not supported by the TPU backend."
)
def
remove_prompt_adapter
(
self
,
prompt_adapter_id
:
int
)
->
bool
:
raise
NotImplementedError
(
"Soft prompt is currently not supported by the TPU backend."
)
def
pin_prompt_adapter
(
self
,
prompt_adapter_id
:
int
)
->
bool
:
raise
NotImplementedError
(
"Soft prompt is currently not supported by the TPU backend."
)
def
list_prompt_adapters
(
self
)
->
Set
[
int
]:
raise
NotImplementedError
(
"Soft prompt is currently not supported by the TPU backend."
)
def
check_health
(
self
)
->
None
:
# TPUExecutor will always be healthy as long as it's running.
return
class
TPUExecutorAsync
(
TPUExecutor
,
ExecutorAsyncBase
):
async
def
execute_model_async
(
self
,
sexecute_model_req
:
ExecuteModelRequest
,
)
->
SamplerOutput
:
output
=
await
make_async
(
self
.
driver_worker
.
execute_model
)(
sexecute_model_req
)
return
output
vllm/executor/xpu_executor.py
0 → 100644
View file @
705f6a35
from
typing
import
List
,
Optional
import
torch
from
vllm.config
import
(
CacheConfig
,
DeviceConfig
,
LoadConfig
,
LoRAConfig
,
ModelConfig
,
MultiModalConfig
,
ParallelConfig
,
PromptAdapterConfig
,
SchedulerConfig
,
SpeculativeConfig
)
from
vllm.executor.executor_base
import
ExecutorAsyncBase
from
vllm.executor.gpu_executor
import
GPUExecutor
from
vllm.logger
import
init_logger
from
vllm.sequence
import
ExecuteModelRequest
,
SamplerOutput
from
vllm.utils
import
make_async
from
vllm.worker.worker_base
import
WorkerWrapperBase
logger
=
init_logger
(
__name__
)
class
XPUExecutor
(
GPUExecutor
):
def
__init__
(
self
,
model_config
:
ModelConfig
,
cache_config
:
CacheConfig
,
parallel_config
:
ParallelConfig
,
scheduler_config
:
SchedulerConfig
,
device_config
:
DeviceConfig
,
load_config
:
LoadConfig
,
lora_config
:
Optional
[
LoRAConfig
],
multimodal_config
:
Optional
[
MultiModalConfig
],
prompt_adapter_config
:
Optional
[
PromptAdapterConfig
],
speculative_config
:
Optional
[
SpeculativeConfig
],
)
->
None
:
assert
device_config
.
device_type
==
"xpu"
assert
(
not
speculative_config
),
"Speculative decoding not yet supported for XPU backend"
model_config
=
_verify_and_get_model_config
(
model_config
)
self
.
model_config
=
model_config
self
.
cache_config
=
cache_config
self
.
load_config
=
load_config
self
.
lora_config
=
lora_config
self
.
parallel_config
=
parallel_config
self
.
scheduler_config
=
scheduler_config
self
.
device_config
=
device_config
self
.
multimodal_config
=
multimodal_config
self
.
prompt_adapter_config
=
prompt_adapter_config
self
.
speculative_config
=
None
# Instantiate the worker and load the model to GPU.
self
.
_init_executor
()
def
_create_worker
(
self
,
local_rank
:
int
=
0
,
rank
:
int
=
0
,
distributed_init_method
:
Optional
[
str
]
=
None
):
if
self
.
speculative_config
is
None
:
worker_module_name
=
"vllm.worker.xpu_worker"
worker_class_name
=
"XPUWorker"
else
:
raise
NotImplementedError
(
"XPU does not support speculative decoding"
)
wrapper
=
WorkerWrapperBase
(
worker_module_name
=
worker_module_name
,
worker_class_name
=
worker_class_name
,
)
wrapper
.
init_worker
(
**
self
.
_get_worker_kwargs
(
local_rank
,
rank
,
distributed_init_method
))
return
wrapper
.
worker
def
execute_model
(
self
,
execute_model_req
:
ExecuteModelRequest
)
->
List
[
SamplerOutput
]:
output
=
self
.
driver_worker
.
execute_model
(
execute_model_req
)
return
output
class
XPUExecutorAsync
(
XPUExecutor
,
ExecutorAsyncBase
):
async
def
execute_model_async
(
self
,
execute_model_req
:
ExecuteModelRequest
,
)
->
List
[
SamplerOutput
]:
output
=
await
make_async
(
self
.
driver_worker
.
execute_model
)(
execute_model_req
=
execute_model_req
)
return
output
def
_verify_and_get_model_config
(
config
:
ModelConfig
)
->
ModelConfig
:
if
config
.
dtype
==
torch
.
bfloat16
:
logger
.
warning
(
"bfloat16 is not fully supported on XPU, casting to float16."
)
config
.
dtype
=
torch
.
float16
if
not
config
.
enforce_eager
:
logger
.
warning
(
"CUDA graph is not supported on XPU, fallback to the eager "
"mode."
)
config
.
enforce_eager
=
True
return
config
Prev
1
…
15
16
17
18
19
20
21
22
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment