Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
f48954a4
Commit
f48954a4
authored
Jun 12, 2024
by
zhuwenwen
Browse files
merge v0.5.0
parents
1dba29d3
8f89d720
Changes
253
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
590 additions
and
219 deletions
+590
-219
vllm/engine/async_llm_engine.py
vllm/engine/async_llm_engine.py
+19
-11
vllm/engine/llm_engine.py
vllm/engine/llm_engine.py
+2
-1
vllm/engine/output_processor/multi_step.py
vllm/engine/output_processor/multi_step.py
+1
-1
vllm/engine/output_processor/single_step.py
vllm/engine/output_processor/single_step.py
+4
-4
vllm/entrypoints/llm.py
vllm/entrypoints/llm.py
+43
-41
vllm/entrypoints/openai/api_server.py
vllm/entrypoints/openai/api_server.py
+11
-1
vllm/entrypoints/openai/protocol.py
vllm/entrypoints/openai/protocol.py
+96
-10
vllm/entrypoints/openai/serving_chat.py
vllm/entrypoints/openai/serving_chat.py
+174
-38
vllm/entrypoints/openai/serving_completion.py
vllm/entrypoints/openai/serving_completion.py
+35
-17
vllm/entrypoints/openai/serving_engine.py
vllm/entrypoints/openai/serving_engine.py
+9
-6
vllm/envs.py
vllm/envs.py
+9
-5
vllm/executor/multiproc_gpu_executor.py
vllm/executor/multiproc_gpu_executor.py
+6
-5
vllm/executor/multiproc_worker_utils.py
vllm/executor/multiproc_worker_utils.py
+5
-4
vllm/executor/ray_gpu_executor.py
vllm/executor/ray_gpu_executor.py
+0
-17
vllm/lora/layers.py
vllm/lora/layers.py
+57
-19
vllm/lora/punica.py
vllm/lora/punica.py
+19
-26
vllm/lora/utils.py
vllm/lora/utils.py
+9
-9
vllm/lora/worker_manager.py
vllm/lora/worker_manager.py
+3
-2
vllm/model_executor/custom_op.py
vllm/model_executor/custom_op.py
+60
-0
vllm/model_executor/guided_decoding/__init__.py
vllm/model_executor/guided_decoding/__init__.py
+28
-2
No files found.
vllm/engine/async_llm_engine.py
View file @
f48954a4
...
...
@@ -29,23 +29,32 @@ class AsyncEngineDeadError(RuntimeError):
pass
def
_raise_exception_on_finish
(
task
:
asyncio
.
Task
,
error_callback
:
Callable
[[
Exception
],
None
])
->
None
:
msg
=
(
"Task finished unexpectedly. This should never happen! "
"Please open an issue on Github."
)
def
_log_task_completion
(
task
:
asyncio
.
Task
,
error_callback
:
Callable
[[
Exception
],
None
])
->
None
:
"""This function is only intended for the `engine.run_engine_loop()` task.
In particular, that task runs a `while True` loop that can only exit if
there is an exception.
"""
exception
=
None
try
:
task
.
result
()
# NOTE: This will be thrown if task exits normally (which it should not)
raise
AsyncEngineDeadError
(
msg
)
return_value
=
task
.
result
()
raise
AssertionError
(
f
"The engine background task should never finish without an "
f
"exception.
{
return_value
}
"
)
except
asyncio
.
exceptions
.
CancelledError
:
# We assume that if the task is cancelled, we are gracefully shutting
# down. This should only happen on program exit.
logger
.
info
(
"Engine is gracefully shutting down."
)
except
Exception
as
e
:
exception
=
e
logger
.
error
(
"Engine background task failed"
,
exc_info
=
e
)
error_callback
(
exception
)
raise
AsyncEngineDeadError
(
msg
+
" See stack trace above for the actual cause."
)
from
e
"Task finished unexpectedly. This should never happen! "
"Please open an issue on Github. See stack trace above for the"
"actual cause."
)
from
e
class
AsyncStream
:
...
...
@@ -438,8 +447,7 @@ class AsyncLLMEngine:
self
.
_background_loop_unshielded
=
asyncio
.
get_event_loop
(
).
create_task
(
self
.
run_engine_loop
())
self
.
_background_loop_unshielded
.
add_done_callback
(
partial
(
_raise_exception_on_finish
,
error_callback
=
self
.
_error_callback
))
partial
(
_log_task_completion
,
error_callback
=
self
.
_error_callback
))
self
.
background_loop
=
asyncio
.
shield
(
self
.
_background_loop_unshielded
)
def
_init_engine
(
self
,
*
args
,
...
...
vllm/engine/llm_engine.py
View file @
f48954a4
...
...
@@ -162,7 +162,7 @@ class LLMEngine:
"Initializing an LLM engine (v%s) with config: "
"model=%r, speculative_config=%r, tokenizer=%r, "
"skip_tokenizer_init=%s, tokenizer_mode=%s, revision=%s, "
"rope_scaling=%r, tokenizer_revision=%s, "
"rope_scaling=%r,
rope_theta=%r,
tokenizer_revision=%s, "
"trust_remote_code=%s, dtype=%s, max_seq_len=%d, "
"download_dir=%r, load_format=%s, tensor_parallel_size=%d, "
"disable_custom_all_reduce=%s, quantization=%s, "
...
...
@@ -177,6 +177,7 @@ class LLMEngine:
model_config
.
tokenizer_mode
,
model_config
.
revision
,
model_config
.
rope_scaling
,
model_config
.
rope_theta
,
model_config
.
tokenizer_revision
,
model_config
.
trust_remote_code
,
model_config
.
dtype
,
...
...
vllm/engine/output_processor/multi_step.py
View file @
f48954a4
...
...
@@ -78,7 +78,7 @@ class MultiStepOutputProcessor(SequenceGroupOutputProcessor):
# Since there's only one sequence per sequence group, we can take the
# first sample.
samples
=
[
output
s
[
step
]
.
samples
[
0
]
for
step
in
range
(
len
(
outputs
))
]
samples
=
[
output
.
samples
[
0
]
for
output
in
outputs
]
# -1 means the output token is not valid (eg. due to spec decode
# rejecting tokens).
...
...
vllm/engine/output_processor/single_step.py
View file @
f48954a4
...
...
@@ -60,10 +60,10 @@ class SingleStepOutputProcessor(SequenceGroupOutputProcessor):
assert
len
(
outputs
)
==
1
,
(
"Single step should only has 1 output."
)
output
=
outputs
[
0
]
prompt_logprobs
=
output
.
prompt_logprobs
if
(
prompt_logprobs
is
not
None
and
seq_group
.
sampling_params
.
detokenize
and
self
.
detokenizer
)
:
self
.
detokenizer
.
decode_prompt_logprobs_inplace
(
seq_group
,
prompt_logprobs
)
if
prompt_logprobs
is
not
None
:
if
seq_group
.
sampling_params
.
detokenize
and
self
.
detokenizer
:
self
.
detokenizer
.
decode_prompt_logprobs_inplace
(
seq_group
,
prompt_logprobs
)
if
not
seq_group
.
prompt_logprobs
:
# The first prompt token's logprob is None because it doesn't
# have tokens that are precedent.
...
...
vllm/entrypoints/llm.py
View file @
f48954a4
...
...
@@ -14,7 +14,7 @@ from vllm.lora.request import LoRARequest
from
vllm.outputs
import
EmbeddingRequestOutput
,
RequestOutput
from
vllm.pooling_params
import
PoolingParams
from
vllm.sampling_params
import
SamplingParams
from
vllm.
sequence
import
MultiModalData
from
vllm.
transformers_utils.tokenizer
import
get_cached_tokenizer
from
vllm.usage.usage_lib
import
UsageContext
from
vllm.utils
import
Counter
,
deprecate_kwargs
...
...
@@ -153,7 +153,14 @@ class LLM:
self
,
tokenizer
:
Union
[
PreTrainedTokenizer
,
PreTrainedTokenizerFast
],
)
->
None
:
self
.
llm_engine
.
tokenizer
.
tokenizer
=
tokenizer
# While CachedTokenizer is dynamic, have no choice but
# compare class name. Misjudgment will arise from
# user-defined tokenizer started with 'Cached'
if
tokenizer
.
__class__
.
__name__
.
startswith
(
"Cached"
):
self
.
llm_engine
.
tokenizer
.
tokenizer
=
tokenizer
else
:
self
.
llm_engine
.
tokenizer
.
tokenizer
=
get_cached_tokenizer
(
tokenizer
)
@
overload
# LEGACY: single (prompt + optional token ids)
def
generate
(
...
...
@@ -163,8 +170,7 @@ class LLM:
List
[
SamplingParams
]]]
=
None
,
prompt_token_ids
:
Optional
[
List
[
int
]]
=
None
,
use_tqdm
:
bool
=
True
,
lora_request
:
Optional
[
LoRARequest
]
=
None
,
multi_modal_data
:
Optional
[
MultiModalData
]
=
None
,
lora_request
:
Optional
[
Union
[
List
[
LoRARequest
],
LoRARequest
]]
=
None
,
)
->
List
[
RequestOutput
]:
...
...
...
@@ -176,8 +182,7 @@ class LLM:
List
[
SamplingParams
]]]
=
None
,
prompt_token_ids
:
Optional
[
List
[
List
[
int
]]]
=
None
,
use_tqdm
:
bool
=
True
,
lora_request
:
Optional
[
LoRARequest
]
=
None
,
multi_modal_data
:
Optional
[
MultiModalData
]
=
None
,
lora_request
:
Optional
[
Union
[
List
[
LoRARequest
],
LoRARequest
]]
=
None
,
)
->
List
[
RequestOutput
]:
...
...
...
@@ -190,8 +195,7 @@ class LLM:
*
,
prompt_token_ids
:
List
[
int
],
use_tqdm
:
bool
=
True
,
lora_request
:
Optional
[
LoRARequest
]
=
None
,
multi_modal_data
:
Optional
[
MultiModalData
]
=
None
,
lora_request
:
Optional
[
Union
[
List
[
LoRARequest
],
LoRARequest
]]
=
None
,
)
->
List
[
RequestOutput
]:
...
...
...
@@ -204,8 +208,7 @@ class LLM:
*
,
prompt_token_ids
:
List
[
List
[
int
]],
use_tqdm
:
bool
=
True
,
lora_request
:
Optional
[
LoRARequest
]
=
None
,
multi_modal_data
:
Optional
[
MultiModalData
]
=
None
,
lora_request
:
Optional
[
Union
[
List
[
LoRARequest
],
LoRARequest
]]
=
None
,
)
->
List
[
RequestOutput
]:
...
...
...
@@ -216,8 +219,7 @@ class LLM:
sampling_params
:
None
,
prompt_token_ids
:
Union
[
List
[
int
],
List
[
List
[
int
]]],
use_tqdm
:
bool
=
True
,
lora_request
:
Optional
[
LoRARequest
]
=
None
,
multi_modal_data
:
Optional
[
MultiModalData
]
=
None
,
lora_request
:
Optional
[
Union
[
List
[
LoRARequest
],
LoRARequest
]]
=
None
,
)
->
List
[
RequestOutput
]:
...
...
...
@@ -230,13 +232,12 @@ class LLM:
sampling_params
:
Optional
[
Union
[
SamplingParams
,
Sequence
[
SamplingParams
]]]
=
None
,
use_tqdm
:
bool
=
True
,
lora_request
:
Optional
[
LoRARequest
]
=
None
,
lora_request
:
Optional
[
Union
[
List
[
LoRARequest
],
LoRARequest
]
]
=
None
,
)
->
List
[
RequestOutput
]:
...
@
deprecate_kwargs
(
"prompts"
,
"prompt_token_ids"
,
"multi_modal_data"
,
is_deprecated
=
lambda
:
LLM
.
DEPRECATE_LEGACY
,
additional_message
=
"Please use the 'inputs' parameter "
"instead."
)
...
...
@@ -248,8 +249,7 @@ class LLM:
Sequence
[
SamplingParams
]]]
=
None
,
prompt_token_ids
:
Optional
[
Union
[
List
[
int
],
List
[
List
[
int
]]]]
=
None
,
use_tqdm
:
bool
=
True
,
lora_request
:
Optional
[
LoRARequest
]
=
None
,
multi_modal_data
:
Optional
[
MultiModalData
]
=
None
,
lora_request
:
Optional
[
Union
[
List
[
LoRARequest
],
LoRARequest
]]
=
None
,
)
->
List
[
RequestOutput
]:
"""Generates the completions for the input prompts.
...
...
@@ -276,11 +276,15 @@ class LLM:
considered legacy and may be deprecated in the future. You should
instead pass them via the ``inputs`` parameter.
"""
if
prompt_token_ids
is
not
None
or
multi_modal_data
is
not
None
:
if
self
.
llm_engine
.
model_config
.
embedding_mode
:
raise
ValueError
(
"LLM.generate() is only supported for generation models "
"(XForCausalLM)."
)
if
prompt_token_ids
is
not
None
:
inputs
=
self
.
_convert_v1_inputs
(
prompts
=
cast
(
Optional
[
Union
[
str
,
List
[
str
]]],
prompts
),
prompt_token_ids
=
prompt_token_ids
,
multi_modal_data
=
multi_modal_data
,
)
else
:
inputs
=
cast
(
...
...
@@ -308,8 +312,7 @@ class LLM:
Sequence
[
PoolingParams
]]]
=
None
,
prompt_token_ids
:
Optional
[
List
[
int
]]
=
None
,
use_tqdm
:
bool
=
True
,
lora_request
:
Optional
[
LoRARequest
]
=
None
,
multi_modal_data
:
Optional
[
MultiModalData
]
=
None
,
lora_request
:
Optional
[
Union
[
List
[
LoRARequest
],
LoRARequest
]]
=
None
,
)
->
List
[
EmbeddingRequestOutput
]:
...
...
...
@@ -321,8 +324,7 @@ class LLM:
Sequence
[
PoolingParams
]]]
=
None
,
prompt_token_ids
:
Optional
[
List
[
List
[
int
]]]
=
None
,
use_tqdm
:
bool
=
True
,
lora_request
:
Optional
[
LoRARequest
]
=
None
,
multi_modal_data
:
Optional
[
MultiModalData
]
=
None
,
lora_request
:
Optional
[
Union
[
List
[
LoRARequest
],
LoRARequest
]]
=
None
,
)
->
List
[
EmbeddingRequestOutput
]:
...
...
...
@@ -335,8 +337,7 @@ class LLM:
*
,
prompt_token_ids
:
List
[
int
],
use_tqdm
:
bool
=
True
,
lora_request
:
Optional
[
LoRARequest
]
=
None
,
multi_modal_data
:
Optional
[
MultiModalData
]
=
None
,
lora_request
:
Optional
[
Union
[
List
[
LoRARequest
],
LoRARequest
]]
=
None
,
)
->
List
[
EmbeddingRequestOutput
]:
...
...
...
@@ -349,8 +350,7 @@ class LLM:
*
,
prompt_token_ids
:
List
[
List
[
int
]],
use_tqdm
:
bool
=
True
,
lora_request
:
Optional
[
LoRARequest
]
=
None
,
multi_modal_data
:
Optional
[
MultiModalData
]
=
None
,
lora_request
:
Optional
[
Union
[
List
[
LoRARequest
],
LoRARequest
]]
=
None
,
)
->
List
[
EmbeddingRequestOutput
]:
...
...
...
@@ -361,8 +361,7 @@ class LLM:
pooling_params
:
None
,
prompt_token_ids
:
Union
[
List
[
int
],
List
[
List
[
int
]]],
use_tqdm
:
bool
=
True
,
lora_request
:
Optional
[
LoRARequest
]
=
None
,
multi_modal_data
:
Optional
[
MultiModalData
]
=
None
,
lora_request
:
Optional
[
Union
[
List
[
LoRARequest
],
LoRARequest
]]
=
None
,
)
->
List
[
EmbeddingRequestOutput
]:
...
...
...
@@ -375,13 +374,12 @@ class LLM:
pooling_params
:
Optional
[
Union
[
PoolingParams
,
Sequence
[
PoolingParams
]]]
=
None
,
use_tqdm
:
bool
=
True
,
lora_request
:
Optional
[
LoRARequest
]
=
None
,
lora_request
:
Optional
[
Union
[
List
[
LoRARequest
],
LoRARequest
]
]
=
None
,
)
->
List
[
EmbeddingRequestOutput
]:
...
@
deprecate_kwargs
(
"prompts"
,
"prompt_token_ids"
,
"multi_modal_data"
,
is_deprecated
=
lambda
:
LLM
.
DEPRECATE_LEGACY
,
additional_message
=
"Please use the 'inputs' parameter "
"instead."
)
...
...
@@ -393,8 +391,7 @@ class LLM:
Sequence
[
PoolingParams
]]]
=
None
,
prompt_token_ids
:
Optional
[
Union
[
List
[
int
],
List
[
List
[
int
]]]]
=
None
,
use_tqdm
:
bool
=
True
,
lora_request
:
Optional
[
LoRARequest
]
=
None
,
multi_modal_data
:
Optional
[
MultiModalData
]
=
None
,
lora_request
:
Optional
[
Union
[
List
[
LoRARequest
],
LoRARequest
]]
=
None
,
)
->
List
[
EmbeddingRequestOutput
]:
"""Generates the completions for the input prompts.
...
...
@@ -420,11 +417,15 @@ class LLM:
considered legacy and may be deprecated in the future. You should
instead pass them via the ``inputs`` parameter.
"""
if
prompt_token_ids
is
not
None
or
multi_modal_data
is
not
None
:
if
not
self
.
llm_engine
.
model_config
.
embedding_mode
:
raise
ValueError
(
"LLM.encode() is only supported for embedding models (XModel)."
)
if
prompt_token_ids
is
not
None
:
inputs
=
self
.
_convert_v1_inputs
(
prompts
=
cast
(
Optional
[
Union
[
str
,
List
[
str
]]],
prompts
),
prompt_token_ids
=
prompt_token_ids
,
multi_modal_data
=
multi_modal_data
,
)
else
:
inputs
=
cast
(
...
...
@@ -449,7 +450,6 @@ class LLM:
self
,
prompts
:
Optional
[
Union
[
str
,
List
[
str
]]],
prompt_token_ids
:
Optional
[
Union
[
List
[
int
],
List
[
List
[
int
]]]],
multi_modal_data
:
Optional
[
MultiModalData
],
):
# skip_tokenizer_init is now checked in engine
...
...
@@ -489,9 +489,6 @@ class LLM:
else
:
raise
AssertionError
if
multi_modal_data
is
not
None
:
item
[
"multi_modal_data"
]
=
multi_modal_data
inputs
.
append
(
item
)
return
inputs
...
...
@@ -501,7 +498,7 @@ class LLM:
inputs
:
Union
[
PromptStrictInputs
,
Sequence
[
PromptStrictInputs
]],
params
:
Union
[
SamplingParams
,
Sequence
[
SamplingParams
],
PoolingParams
,
Sequence
[
PoolingParams
]],
lora_request
:
Optional
[
LoRARequest
],
lora_request
:
Optional
[
Union
[
Sequence
[
LoRARequest
],
LoRARequest
]
],
)
->
None
:
if
isinstance
(
inputs
,
(
str
,
dict
)):
# Convert a single prompt to a list.
...
...
@@ -512,20 +509,25 @@ class LLM:
if
isinstance
(
params
,
list
)
and
len
(
params
)
!=
num_requests
:
raise
ValueError
(
"The lengths of prompts and params "
"must be the same."
)
if
isinstance
(
lora_request
,
list
)
and
len
(
lora_request
)
!=
num_requests
:
raise
ValueError
(
"The lengths of prompts and lora_request "
"must be the same."
)
# Add requests to the engine.
for
i
,
request_inputs
in
enumerate
(
inputs
):
self
.
_add_request
(
request_inputs
,
params
[
i
]
if
isinstance
(
params
,
Sequence
)
else
params
,
lora_request
=
lora_request
,
lora_request
=
lora_request
[
i
]
if
isinstance
(
lora_request
,
Sequence
)
else
lora_request
,
)
def
_add_request
(
self
,
inputs
:
PromptInputs
,
params
:
Union
[
SamplingParams
,
PoolingParams
],
lora_request
:
Optional
[
LoRARequest
]
=
None
,
lora_request
:
Optional
[
Union
[
List
[
LoRARequest
],
LoRARequest
]
]
=
None
,
)
->
None
:
request_id
=
str
(
next
(
self
.
request_counter
))
self
.
llm_engine
.
add_request
(
request_id
,
...
...
vllm/entrypoints/openai/api_server.py
View file @
f48954a4
...
...
@@ -36,7 +36,7 @@ openai_serving_chat: OpenAIServingChat
openai_serving_completion
:
OpenAIServingCompletion
openai_serving_embedding
:
OpenAIServingEmbedding
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
'vllm.entrypoints.openai.api_server'
)
_running_tasks
:
Set
[
asyncio
.
Task
]
=
set
()
...
...
@@ -183,6 +183,16 @@ if __name__ == "__main__":
served_model_names
=
[
args
.
model
]
engine_args
=
AsyncEngineArgs
.
from_cli_args
(
args
)
# Enforce pixel values as image input type for vision language models
# when serving with API server
if
engine_args
.
image_input_type
is
not
None
and
\
engine_args
.
image_input_type
.
upper
()
!=
"PIXEL_VALUES"
:
raise
ValueError
(
f
"Invalid image_input_type:
{
engine_args
.
image_input_type
}
. "
"Only --image-input-type 'pixel_values' is supported for serving "
"vision language models with the vLLM API server."
)
engine
=
AsyncLLMEngine
.
from_engine_args
(
engine_args
,
usage_context
=
UsageContext
.
OPENAI_API_SERVER
)
...
...
vllm/entrypoints/openai/protocol.py
View file @
f48954a4
...
...
@@ -82,6 +82,7 @@ class ModelCard(OpenAIBaseModel):
owned_by
:
str
=
"vllm"
root
:
Optional
[
str
]
=
None
parent
:
Optional
[
str
]
=
None
max_model_len
:
Optional
[
int
]
=
None
permission
:
List
[
ModelPermission
]
=
Field
(
default_factory
=
list
)
...
...
@@ -101,6 +102,30 @@ class ResponseFormat(OpenAIBaseModel):
type
:
Literal
[
"text"
,
"json_object"
]
class
StreamOptions
(
OpenAIBaseModel
):
include_usage
:
Optional
[
bool
]
class
FunctionDefinition
(
OpenAIBaseModel
):
name
:
str
description
:
Optional
[
str
]
=
None
parameters
:
Optional
[
Dict
[
str
,
Any
]]
=
None
class
ChatCompletionToolsParam
(
OpenAIBaseModel
):
type
:
Literal
[
"function"
]
=
"function"
function
:
FunctionDefinition
class
ChatCompletionNamedFunction
(
OpenAIBaseModel
):
name
:
str
class
ChatCompletionNamedToolChoiceParam
(
OpenAIBaseModel
):
function
:
ChatCompletionNamedFunction
type
:
Literal
[
"function"
]
=
"function"
class
ChatCompletionRequest
(
OpenAIBaseModel
):
# Ordered by official OpenAI API documentation
# https://platform.openai.com/docs/api-reference/chat/create
...
...
@@ -119,8 +144,12 @@ class ChatCompletionRequest(OpenAIBaseModel):
le
=
torch
.
iinfo
(
torch
.
long
).
max
)
stop
:
Optional
[
Union
[
str
,
List
[
str
]]]
=
Field
(
default_factory
=
list
)
stream
:
Optional
[
bool
]
=
False
stream_options
:
Optional
[
StreamOptions
]
=
None
temperature
:
Optional
[
float
]
=
0.7
top_p
:
Optional
[
float
]
=
1.0
tools
:
Optional
[
List
[
ChatCompletionToolsParam
]]
=
None
tool_choice
:
Optional
[
Union
[
Literal
[
"none"
],
ChatCompletionNamedToolChoiceParam
]]
=
"none"
user
:
Optional
[
str
]
=
None
# doc: begin-chat-completion-sampling-params
...
...
@@ -152,6 +181,15 @@ class ChatCompletionRequest(OpenAIBaseModel):
"This is a parameter used by chat template in tokenizer config of the "
"model."
),
)
add_special_tokens
:
Optional
[
bool
]
=
Field
(
default
=
False
,
description
=
(
"If true, special tokens (e.g. BOS) will be added to the prompt "
"on top of what is added by the chat template. "
"For most models, the chat template takes care of adding the "
"special tokens so this should be set to False (as is the "
"default)."
),
)
include_stop_str_in_output
:
Optional
[
bool
]
=
Field
(
default
=
False
,
description
=
(
...
...
@@ -236,6 +274,15 @@ class ChatCompletionRequest(OpenAIBaseModel):
logits_processors
=
logits_processors
,
)
@
model_validator
(
mode
=
'before'
)
@
classmethod
def
validate_stream_options
(
cls
,
values
):
if
(
values
.
get
(
'stream_options'
)
is
not
None
and
not
values
.
get
(
'stream'
)):
raise
ValueError
(
"stream_options can only be set if stream is true"
)
return
values
@
model_validator
(
mode
=
"before"
)
@
classmethod
def
check_guided_decoding_count
(
cls
,
data
):
...
...
@@ -244,10 +291,27 @@ class ChatCompletionRequest(OpenAIBaseModel):
"guided_regex"
in
data
and
data
[
"guided_regex"
]
is
not
None
,
"guided_choice"
in
data
and
data
[
"guided_choice"
]
is
not
None
])
# you can only use one kind of guided decoding
if
guide_count
>
1
:
raise
ValueError
(
"You can only use one kind of guided decoding "
"('guided_json', 'guided_regex' or 'guided_choice')."
)
# you can only either use guided decoding or tools, not both
if
guide_count
>
1
and
"tool_choice"
in
data
and
data
[
"tool_choice"
]
!=
"none"
:
raise
ValueError
(
"You can only either use guided decoding or tools, not both."
)
return
data
@
model_validator
(
mode
=
"before"
)
@
classmethod
def
check_tool_choice
(
cls
,
data
):
if
"tool_choice"
in
data
and
data
[
"tool_choice"
]
!=
"none"
:
if
not
isinstance
(
data
[
"tool_choice"
],
dict
):
raise
ValueError
(
"Currently only named tools are supported."
)
if
"tools"
not
in
data
or
data
[
"tools"
]
is
None
:
raise
ValueError
(
"When using `tool_choice`, `tools` must be set."
)
return
data
@
model_validator
(
mode
=
"before"
)
...
...
@@ -258,9 +322,9 @@ class ChatCompletionRequest(OpenAIBaseModel):
raise
ValueError
(
"when using `top_logprobs`, `logprobs` must be set to true."
)
elif
not
0
<=
data
[
"top_logprobs"
]
<
=
2
0
:
elif
data
[
"top_logprobs"
]
<
0
:
raise
ValueError
(
"`top_logprobs` must be a value
in the interval [0, 20]
."
)
"`top_logprobs` must be a value
a positive value
."
)
return
data
...
...
@@ -282,6 +346,7 @@ class CompletionRequest(OpenAIBaseModel):
le
=
torch
.
iinfo
(
torch
.
long
).
max
)
stop
:
Optional
[
Union
[
str
,
List
[
str
]]]
=
Field
(
default_factory
=
list
)
stream
:
Optional
[
bool
]
=
False
stream_options
:
Optional
[
StreamOptions
]
=
None
suffix
:
Optional
[
str
]
=
None
temperature
:
Optional
[
float
]
=
1.0
top_p
:
Optional
[
float
]
=
1.0
...
...
@@ -413,9 +478,16 @@ class CompletionRequest(OpenAIBaseModel):
@
classmethod
def
check_logprobs
(
cls
,
data
):
if
"logprobs"
in
data
and
data
[
"logprobs"
]
is
not
None
and
not
0
<=
data
[
"logprobs"
]
<=
5
:
raise
ValueError
((
"if passed, `logprobs` must be a value"
,
" in the interval [0, 5]."
))
"logprobs"
]
is
not
None
and
not
data
[
"logprobs"
]
>=
0
:
raise
ValueError
(
"if passed, `logprobs` must be a positive value."
)
return
data
@
model_validator
(
mode
=
"before"
)
@
classmethod
def
validate_stream_options
(
cls
,
data
):
if
data
.
get
(
"stream_options"
)
and
not
data
.
get
(
"stream"
):
raise
ValueError
(
"Stream options can only be defined when stream is True."
)
return
data
...
...
@@ -441,7 +513,8 @@ class CompletionLogProbs(OpenAIBaseModel):
text_offset
:
List
[
int
]
=
Field
(
default_factory
=
list
)
token_logprobs
:
List
[
Optional
[
float
]]
=
Field
(
default_factory
=
list
)
tokens
:
List
[
str
]
=
Field
(
default_factory
=
list
)
top_logprobs
:
Optional
[
List
[
Optional
[
Dict
[
str
,
float
]]]]
=
None
top_logprobs
:
List
[
Optional
[
Dict
[
str
,
float
]]]
=
Field
(
default_factory
=
list
)
class
CompletionResponseChoice
(
OpenAIBaseModel
):
...
...
@@ -505,9 +578,21 @@ class EmbeddingResponse(BaseModel):
usage
:
UsageInfo
class
FunctionCall
(
OpenAIBaseModel
):
name
:
str
arguments
:
str
class
ToolCall
(
OpenAIBaseModel
):
id
:
str
=
Field
(
default_factory
=
lambda
:
f
"chatcmpl-tool-
{
random_uuid
()
}
"
)
type
:
Literal
[
"function"
]
=
"function"
function
:
FunctionCall
class
ChatMessage
(
OpenAIBaseModel
):
role
:
str
content
:
str
tool_calls
:
List
[
ToolCall
]
=
Field
(
default_factory
=
list
)
class
ChatCompletionLogProb
(
OpenAIBaseModel
):
...
...
@@ -528,13 +613,13 @@ class ChatCompletionResponseChoice(OpenAIBaseModel):
index
:
int
message
:
ChatMessage
logprobs
:
Optional
[
ChatCompletionLogProbs
]
=
None
finish_reason
:
Optional
[
Literal
[
"stop"
,
"length"
,
"tool_calls"
]
]
=
None
finish_reason
:
Optional
[
str
]
=
None
stop_reason
:
Optional
[
Union
[
int
,
str
]]
=
None
class
ChatCompletionResponse
(
OpenAIBaseModel
):
id
:
str
=
Field
(
default_factory
=
lambda
:
f
"chatcmpl-
{
random_uuid
()
}
"
)
object
:
str
=
"chat.completion"
object
:
Literal
[
"chat.completion"
]
=
"chat.completion"
created
:
int
=
Field
(
default_factory
=
lambda
:
int
(
time
.
time
()))
model
:
str
choices
:
List
[
ChatCompletionResponseChoice
]
...
...
@@ -544,19 +629,20 @@ class ChatCompletionResponse(OpenAIBaseModel):
class
DeltaMessage
(
OpenAIBaseModel
):
role
:
Optional
[
str
]
=
None
content
:
Optional
[
str
]
=
None
tool_calls
:
List
[
ToolCall
]
=
Field
(
default_factory
=
list
)
class
ChatCompletionResponseStreamChoice
(
OpenAIBaseModel
):
index
:
int
delta
:
DeltaMessage
logprobs
:
Optional
[
ChatCompletionLogProbs
]
=
None
finish_reason
:
Optional
[
Literal
[
"stop"
,
"length"
,
"tool_calls"
]
]
=
None
finish_reason
:
Optional
[
str
]
=
None
stop_reason
:
Optional
[
Union
[
int
,
str
]]
=
None
class
ChatCompletionStreamResponse
(
OpenAIBaseModel
):
id
:
str
=
Field
(
default_factory
=
lambda
:
f
"chatcmpl-
{
random_uuid
()
}
"
)
object
:
str
=
"chat.completion.chunk"
object
:
Literal
[
"chat.completion.chunk"
]
=
"chat.completion.chunk"
created
:
int
=
Field
(
default_factory
=
lambda
:
int
(
time
.
time
()))
model
:
str
choices
:
List
[
ChatCompletionResponseStreamChoice
]
...
...
vllm/entrypoints/openai/serving_chat.py
View file @
f48954a4
import
codecs
import
time
from
dataclasses
import
dataclass
from
typing
import
(
AsyncGenerator
,
AsyncIterator
,
Dict
,
Iterable
,
List
,
Optional
)
from
dataclasses
import
dataclass
,
field
from
typing
import
(
AsyncGenerator
,
AsyncIterator
,
Awaitable
,
Dict
,
Iterable
,
List
,
Optional
)
from
typing
import
Sequence
as
GenericSequence
from
typing
import
TypedDict
,
Union
,
cast
,
final
from
fastapi
import
Request
from
openai.types.chat
import
ChatCompletionContentPartTextParam
from
openai.types.chat
import
(
ChatCompletionContentPartImageParam
,
ChatCompletionContentPartTextParam
)
from
vllm.config
import
ModelConfig
from
vllm.config
import
ModelConfig
,
VisionLanguageConfig
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.entrypoints.openai.protocol
import
(
ChatCompletionContentPartParam
,
ChatCompletionLogProb
,
ChatCompletionLogProbs
,
ChatCompletionLogProbsContent
,
ChatCompletionMessageParam
,
ChatCompletionRequest
,
ChatCompletionResponse
,
ChatCompletionMessageParam
,
ChatCompletionNamedToolChoiceParam
,
ChatCompletionRequest
,
ChatCompletionResponse
,
ChatCompletionResponseChoice
,
ChatCompletionResponseStreamChoice
,
ChatCompletionStreamResponse
,
ChatMessage
,
DeltaMessage
,
ErrorResponse
,
UsageInfo
)
FunctionCall
,
ToolCall
,
UsageInfo
)
from
vllm.entrypoints.openai.serving_engine
import
(
LoRAModulePath
,
OpenAIServing
)
from
vllm.inputs
import
PromptInputs
from
vllm.logger
import
init_logger
from
vllm.model_executor.guided_decoding
import
(
get_guided_decoding_logits_processor
)
from
vllm.multimodal.image
import
ImagePixelData
from
vllm.multimodal.utils
import
(
async_get_and_parse_image
,
get_full_image_text_prompt
)
from
vllm.outputs
import
RequestOutput
from
vllm.sequence
import
Logprob
from
vllm.utils
import
random_uuid
...
...
@@ -39,6 +45,8 @@ class ConversationMessage(TypedDict):
@
dataclass
(
frozen
=
True
)
class
ChatMessageParseResult
:
messages
:
List
[
ConversationMessage
]
image_futures
:
List
[
Awaitable
[
ImagePixelData
]]
=
field
(
default_factory
=
list
)
class
OpenAIServingChat
(
OpenAIServing
):
...
...
@@ -93,19 +101,76 @@ class OpenAIServingChat(OpenAIServing):
parts
:
Iterable
[
ChatCompletionContentPartParam
],
)
->
ChatMessageParseResult
:
texts
:
List
[
str
]
=
[]
image_futures
:
List
[
Awaitable
[
ImagePixelData
]]
=
[]
for
_
,
part
in
enumerate
(
parts
):
vlm_config
:
Optional
[
VisionLanguageConfig
]
=
getattr
(
self
.
engine
.
engine
,
"vision_language_config"
,
None
)
model_config
=
getattr
(
self
.
engine
.
engine
,
"model_config"
,
None
)
for
part
in
parts
:
part_type
=
part
[
"type"
]
if
part_type
==
"text"
:
text
=
cast
(
ChatCompletionContentPartTextParam
,
part
)[
"text"
]
texts
.
append
(
text
)
elif
part_type
==
"image_url"
:
if
vlm_config
is
None
:
raise
ValueError
(
"'image_url' input is not supported as the loaded "
"model is not multimodal."
)
elif
len
(
image_futures
)
==
0
:
assert
self
.
tokenizer
is
not
None
image_url
=
cast
(
ChatCompletionContentPartImageParam
,
part
)[
"image_url"
]
if
image_url
.
get
(
"detail"
,
"auto"
)
!=
"auto"
:
logger
.
warning
(
"'image_url.detail' is currently not supported and "
"will be ignored."
)
image_future
=
async_get_and_parse_image
(
image_url
[
"url"
])
image_futures
.
append
(
image_future
)
else
:
raise
NotImplementedError
(
"Multiple 'image_url' input is currently not supported."
)
else
:
raise
NotImplementedError
(
f
"Unknown part type:
{
part_type
}
"
)
messages
=
[
ConversationMessage
(
role
=
role
,
content
=
"
\n
"
.
join
(
texts
))]
text_prompt
=
"
\n
"
.
join
(
texts
)
if
vlm_config
is
not
None
and
len
(
image_futures
):
(
image_token_prompt
,
image_token_str
)
=
vlm_config
.
get_image_token_text
(
self
.
tokenizer
)
# NOTE: If image token string (e.g, <image>) is already present
# in the text prompt, we assume it follows the same format required
# by the engine.
if
image_token_str
in
text_prompt
:
logger
.
warning
(
"Detected image token string in the text prompt. "
"Skipping prompt formatting."
)
messages
=
[
ConversationMessage
(
role
=
role
,
content
=
text_prompt
)
]
else
:
full_prompt
=
get_full_image_text_prompt
(
image_prompt
=
image_token_prompt
,
text_prompt
=
text_prompt
,
config
=
model_config
)
messages
=
[
ConversationMessage
(
role
=
role
,
content
=
full_prompt
)
]
else
:
messages
=
[
ConversationMessage
(
role
=
role
,
content
=
text_prompt
)]
return
ChatMessageParseResult
(
messages
=
messages
)
return
ChatMessageParseResult
(
messages
=
messages
,
image_futures
=
image_futures
)
def
_parse_chat_message_content
(
self
,
...
...
@@ -115,10 +180,10 @@ class OpenAIServingChat(OpenAIServing):
content
=
message
.
get
(
"content"
)
if
content
is
None
:
return
ChatMessageParseResult
(
messages
=
[])
return
ChatMessageParseResult
(
messages
=
[]
,
image_futures
=
[]
)
if
isinstance
(
content
,
str
):
messages
=
[
ConversationMessage
(
role
=
role
,
content
=
content
)]
return
ChatMessageParseResult
(
messages
=
messages
)
return
ChatMessageParseResult
(
messages
=
messages
,
image_futures
=
[]
)
return
self
.
_parse_chat_message_content_parts
(
role
,
content
)
...
...
@@ -143,11 +208,13 @@ class OpenAIServingChat(OpenAIServing):
try
:
conversation
:
List
[
ConversationMessage
]
=
[]
image_futures
:
List
[
Awaitable
[
ImagePixelData
]]
=
[]
for
msg
in
request
.
messages
:
parsed_
msg
=
self
.
_parse_chat_message_content
(
msg
)
chat_
parsed_
result
=
self
.
_parse_chat_message_content
(
msg
)
conversation
.
extend
(
parsed_msg
.
messages
)
conversation
.
extend
(
chat_parsed_result
.
messages
)
image_futures
.
extend
(
chat_parsed_result
.
image_futures
)
prompt
=
self
.
tokenizer
.
apply_chat_template
(
conversation
=
conversation
,
...
...
@@ -158,11 +225,24 @@ class OpenAIServingChat(OpenAIServing):
logger
.
error
(
"Error in applying chat template from request: %s"
,
e
)
return
self
.
create_error_response
(
str
(
e
))
# Fetch image data
image_data
:
Optional
[
ImagePixelData
]
=
None
try
:
if
len
(
image_futures
):
# since we support only single image currently
assert
len
(
image_futures
)
==
1
image_data
=
await
image_futures
[
0
]
except
Exception
as
e
:
logger
.
error
(
"Error in loading image data: %s"
,
e
)
return
self
.
create_error_response
(
str
(
e
))
request_id
=
f
"cmpl-
{
random_uuid
()
}
"
try
:
# Tokenize/detokenize depending on prompt format (string/token list)
prompt_ids
,
prompt_text
=
self
.
_validate_prompt_and_tokenize
(
request
,
prompt
=
prompt
,
add_special_tokens
=
False
)
request
,
prompt
=
prompt
,
add_special_tokens
=
request
.
add_special_tokens
)
sampling_params
=
request
.
to_sampling_params
()
lora_request
=
self
.
_maybe_get_lora
(
request
)
decoding_config
=
await
self
.
engine
.
get_decoding_config
()
...
...
@@ -180,11 +260,15 @@ class OpenAIServingChat(OpenAIServing):
except
ValueError
as
e
:
return
self
.
create_error_response
(
str
(
e
))
inputs
:
PromptInputs
=
{
"prompt"
:
prompt_text
,
"prompt_token_ids"
:
prompt_ids
,
}
if
image_data
is
not
None
:
inputs
[
"multi_modal_data"
]
=
image_data
result_generator
=
self
.
engine
.
generate
(
{
"prompt"
:
prompt_text
,
"prompt_token_ids"
:
prompt_ids
},
inputs
,
sampling_params
,
request_id
,
lora_request
,
...
...
@@ -244,6 +328,9 @@ class OpenAIServingChat(OpenAIServing):
created
=
created_time
,
choices
=
[
choice_data
],
model
=
model_name
)
if
(
request
.
stream_options
and
request
.
stream_options
.
include_usage
):
chunk
.
usage
=
None
data
=
chunk
.
model_dump_json
(
exclude_unset
=
True
)
yield
f
"data:
{
data
}
\n\n
"
...
...
@@ -271,6 +358,9 @@ class OpenAIServingChat(OpenAIServing):
choices
=
[
choice_data
],
logprobs
=
None
,
model
=
model_name
)
if
(
request
.
stream_options
and
request
.
stream_options
.
include_usage
):
chunk
.
usage
=
None
data
=
chunk
.
model_dump_json
(
exclude_unset
=
True
)
yield
f
"data:
{
data
}
\n\n
"
...
...
@@ -283,13 +373,15 @@ class OpenAIServingChat(OpenAIServing):
continue
delta_token_ids
=
output
.
token_ids
[
previous_num_tokens
[
i
]:]
top
_logprobs
=
output
.
logprobs
[
out
_logprobs
=
output
.
logprobs
[
previous_num_tokens
[
i
]:]
if
output
.
logprobs
else
None
if
request
.
logprobs
:
if
request
.
logprobs
and
request
.
top_logprobs
is
not
None
:
assert
out_logprobs
is
not
None
,
(
"Did not output logprobs"
)
logprobs
=
self
.
_create_chat_logprobs
(
token_ids
=
delta_token_ids
,
top_logprobs
=
top
_logprobs
,
top_logprobs
=
out
_logprobs
,
num_output_top_logprobs
=
request
.
top_logprobs
,
)
else
:
...
...
@@ -298,11 +390,24 @@ class OpenAIServingChat(OpenAIServing):
delta_text
=
output
.
text
[
len
(
previous_texts
[
i
]):]
previous_texts
[
i
]
=
output
.
text
previous_num_tokens
[
i
]
=
len
(
output
.
token_ids
)
if
request
.
tool_choice
and
type
(
request
.
tool_choice
)
is
ChatCompletionNamedToolChoiceParam
:
delta_message
=
DeltaMessage
(
tool_calls
=
[
ToolCall
(
function
=
FunctionCall
(
name
=
request
.
tool_choice
.
function
.
name
,
arguments
=
delta_text
))
])
else
:
delta_message
=
DeltaMessage
(
content
=
delta_text
)
if
output
.
finish_reason
is
None
:
# Send token-by-token response for each request.n
choice_data
=
ChatCompletionResponseStreamChoice
(
index
=
i
,
delta
=
D
elta
M
essage
(
content
=
delta_text
)
,
delta
=
d
elta
_m
essage
,
logprobs
=
logprobs
,
finish_reason
=
None
)
chunk
=
ChatCompletionStreamResponse
(
...
...
@@ -311,20 +416,17 @@ class OpenAIServingChat(OpenAIServing):
created
=
created_time
,
choices
=
[
choice_data
],
model
=
model_name
)
if
(
request
.
stream_options
and
request
.
stream_options
.
include_usage
):
chunk
.
usage
=
None
data
=
chunk
.
model_dump_json
(
exclude_unset
=
True
)
yield
f
"data:
{
data
}
\n\n
"
else
:
# Send the finish response for each request.n only once
prompt_tokens
=
len
(
res
.
prompt_token_ids
)
final_usage
=
UsageInfo
(
prompt_tokens
=
prompt_tokens
,
completion_tokens
=
previous_num_tokens
[
i
],
total_tokens
=
prompt_tokens
+
previous_num_tokens
[
i
],
)
choice_data
=
ChatCompletionResponseStreamChoice
(
index
=
i
,
delta
=
D
elta
M
essage
(
content
=
delta_text
)
,
delta
=
d
elta
_m
essage
,
logprobs
=
logprobs
,
finish_reason
=
output
.
finish_reason
,
stop_reason
=
output
.
stop_reason
)
...
...
@@ -334,12 +436,32 @@ class OpenAIServingChat(OpenAIServing):
created
=
created_time
,
choices
=
[
choice_data
],
model
=
model_name
)
if
final_usage
is
not
None
:
chunk
.
usage
=
final
_usage
data
=
chunk
.
model_dump_json
(
exclude_unset
=
True
,
exclude_
none
=
True
)
if
(
request
.
stream_options
and
request
.
stream_options
.
include
_usage
):
chunk
.
usage
=
None
data
=
chunk
.
model_dump_json
(
exclude_
unset
=
True
)
yield
f
"data:
{
data
}
\n\n
"
finish_reason_sent
[
i
]
=
True
if
(
request
.
stream_options
and
request
.
stream_options
.
include_usage
):
final_usage
=
UsageInfo
(
prompt_tokens
=
prompt_tokens
,
completion_tokens
=
previous_num_tokens
[
i
],
total_tokens
=
prompt_tokens
+
previous_num_tokens
[
i
],
)
final_usage_chunk
=
ChatCompletionStreamResponse
(
id
=
request_id
,
object
=
chunk_object_type
,
created
=
created_time
,
choices
=
[],
model
=
model_name
,
usage
=
final_usage
)
final_usage_data
=
(
final_usage_chunk
.
model_dump_json
(
exclude_unset
=
True
,
exclude_none
=
True
))
yield
f
"data:
{
final_usage_data
}
\n\n
"
except
ValueError
as
e
:
# TODO: Use a vllm-specific Validation Error
data
=
self
.
create_streaming_error_response
(
str
(
e
))
...
...
@@ -370,20 +492,34 @@ class OpenAIServingChat(OpenAIServing):
role
=
self
.
get_chat_request_role
(
request
)
for
output
in
final_res
.
outputs
:
token_ids
=
output
.
token_ids
top
_logprobs
=
output
.
logprobs
out
_logprobs
=
output
.
logprobs
if
request
.
logprobs
:
if
request
.
logprobs
and
request
.
top_logprobs
is
not
None
:
assert
out_logprobs
is
not
None
,
"Did not output logprobs"
logprobs
=
self
.
_create_chat_logprobs
(
token_ids
=
token_ids
,
top_logprobs
=
top
_logprobs
,
top_logprobs
=
out
_logprobs
,
num_output_top_logprobs
=
request
.
top_logprobs
,
)
else
:
logprobs
=
None
if
request
.
tool_choice
and
type
(
request
.
tool_choice
)
is
ChatCompletionNamedToolChoiceParam
:
message
=
ChatMessage
(
role
=
role
,
content
=
""
,
tool_calls
=
[
ToolCall
(
function
=
FunctionCall
(
name
=
request
.
tool_choice
.
function
.
name
,
arguments
=
output
.
text
))
])
elif
not
request
.
tool_choice
or
request
.
tool_choice
==
"none"
:
message
=
ChatMessage
(
role
=
role
,
content
=
output
.
text
)
choice_data
=
ChatCompletionResponseChoice
(
index
=
output
.
index
,
message
=
ChatM
essage
(
role
=
role
,
content
=
output
.
text
)
,
message
=
m
essage
,
logprobs
=
logprobs
,
finish_reason
=
output
.
finish_reason
,
stop_reason
=
output
.
stop_reason
)
...
...
vllm/entrypoints/openai/serving_completion.py
View file @
f48954a4
...
...
@@ -8,6 +8,7 @@ from fastapi import Request
from
vllm.config
import
ModelConfig
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
# yapf conflicts with isort for this block
# yapf: disable
from
vllm.entrypoints.openai.protocol
import
(
CompletionLogProbs
,
CompletionRequest
,
...
...
@@ -16,7 +17,6 @@ from vllm.entrypoints.openai.protocol import (CompletionLogProbs,
CompletionResponseStreamChoice
,
CompletionStreamResponse
,
UsageInfo
)
# yapf: enable
from
vllm.entrypoints.openai.serving_engine
import
(
LoRAModulePath
,
OpenAIServing
)
from
vllm.logger
import
init_logger
...
...
@@ -221,7 +221,7 @@ class OpenAIServingCompletion(OpenAIServing):
# only return the prompt
delta_text
=
res
.
prompt
delta_token_ids
=
res
.
prompt_token_ids
top
_logprobs
=
res
.
prompt_logprobs
out
_logprobs
=
res
.
prompt_logprobs
has_echoed
[
i
]
=
True
elif
(
request
.
echo
and
request
.
max_tokens
>
0
and
not
has_echoed
[
i
]):
...
...
@@ -229,7 +229,7 @@ class OpenAIServingCompletion(OpenAIServing):
delta_text
=
res
.
prompt
+
output
.
text
delta_token_ids
=
(
res
.
prompt_token_ids
+
output
.
token_ids
)
top
_logprobs
=
res
.
prompt_logprobs
+
(
output
.
logprobs
out
_logprobs
=
res
.
prompt_logprobs
+
(
output
.
logprobs
or
[])
has_echoed
[
i
]
=
True
else
:
...
...
@@ -237,13 +237,15 @@ class OpenAIServingCompletion(OpenAIServing):
delta_text
=
output
.
text
[
len
(
previous_texts
[
i
]):]
delta_token_ids
=
output
.
token_ids
[
previous_num_tokens
[
i
]:]
top
_logprobs
=
output
.
logprobs
[
previous_num_tokens
[
out
_logprobs
=
output
.
logprobs
[
previous_num_tokens
[
i
]:]
if
output
.
logprobs
else
None
if
request
.
logprobs
is
not
None
:
assert
out_logprobs
is
not
None
,
(
"Did not output logprobs"
)
logprobs
=
self
.
_create_completion_logprobs
(
token_ids
=
delta_token_ids
,
top_logprobs
=
top
_logprobs
,
top_logprobs
=
out
_logprobs
,
num_output_top_logprobs
=
request
.
logprobs
,
initial_text_offset
=
len
(
previous_texts
[
i
]),
)
...
...
@@ -264,7 +266,8 @@ class OpenAIServingCompletion(OpenAIServing):
)
else
:
final_usage
=
None
response_json
=
CompletionStreamResponse
(
chunk
=
CompletionStreamResponse
(
id
=
request_id
,
created
=
created_time
,
model
=
model_name
,
...
...
@@ -276,10 +279,27 @@ class OpenAIServingCompletion(OpenAIServing):
finish_reason
=
finish_reason
,
stop_reason
=
stop_reason
,
)
],
usage
=
final_usage
,
).
model_dump_json
(
exclude_unset
=
True
)
])
if
(
request
.
stream_options
and
request
.
stream_options
.
include_usage
):
chunk
.
usage
=
None
response_json
=
chunk
.
model_dump_json
(
exclude_unset
=
True
)
yield
f
"data:
{
response_json
}
\n\n
"
if
(
request
.
stream_options
and
request
.
stream_options
.
include_usage
):
final_usage_chunk
=
CompletionStreamResponse
(
id
=
request_id
,
created
=
created_time
,
model
=
model_name
,
choices
=
[],
usage
=
final_usage
,
)
final_usage_data
=
(
final_usage_chunk
.
model_dump_json
(
exclude_unset
=
True
,
exclude_none
=
True
))
yield
f
"data:
{
final_usage_data
}
\n\n
"
except
ValueError
as
e
:
# TODO: Use a vllm-specific Validation Error
data
=
self
.
create_streaming_error_response
(
str
(
e
))
...
...
@@ -307,25 +327,23 @@ class OpenAIServingCompletion(OpenAIServing):
assert
request
.
max_tokens
is
not
None
if
request
.
echo
and
request
.
max_tokens
==
0
:
token_ids
=
prompt_token_ids
top
_logprobs
=
prompt_logprobs
out
_logprobs
=
prompt_logprobs
output_text
=
prompt_text
elif
request
.
echo
and
request
.
max_tokens
>
0
:
token_ids
=
prompt_token_ids
+
output
.
token_ids
top
_logprobs
=
(
prompt_logprobs
+
output
.
logprobs
if
request
.
logprobs
else
None
)
out
_logprobs
=
(
prompt_logprobs
+
output
.
logprobs
if
request
.
logprobs
is
not
None
else
None
)
output_text
=
prompt_text
+
output
.
text
else
:
token_ids
=
output
.
token_ids
top
_logprobs
=
output
.
logprobs
out
_logprobs
=
output
.
logprobs
output_text
=
output
.
text
if
request
.
logprobs
is
not
None
:
assert
top_logprobs
is
not
None
,
(
"top_logprobs must be provided when logprobs "
"is requested"
)
assert
out_logprobs
is
not
None
,
"Did not output logprobs"
logprobs
=
self
.
_create_completion_logprobs
(
token_ids
=
token_ids
,
top_logprobs
=
top
_logprobs
,
top_logprobs
=
out
_logprobs
,
num_output_top_logprobs
=
request
.
logprobs
,
)
else
:
...
...
vllm/entrypoints/openai/serving_engine.py
View file @
f48954a4
...
...
@@ -62,6 +62,7 @@ class OpenAIServing:
"""Show available models. Right now we only have one model."""
model_cards
=
[
ModelCard
(
id
=
served_model_name
,
max_model_len
=
self
.
max_model_len
,
root
=
self
.
served_model_names
[
0
],
permission
=
[
ModelPermission
()])
for
served_model_name
in
self
.
served_model_names
...
...
@@ -130,7 +131,8 @@ class OpenAIServing:
prompt_ids
:
Optional
[
List
[
int
]]
=
None
,
truncate_prompt_tokens
:
Optional
[
Annotated
[
int
,
Field
(
ge
=
1
)]]
=
None
,
add_special_tokens
:
bool
=
True
)
->
Tuple
[
List
[
int
],
str
]:
add_special_tokens
:
Optional
[
bool
]
=
True
)
->
Tuple
[
List
[
int
],
str
]:
if
not
(
prompt
or
prompt_ids
):
raise
ValueError
(
"Either prompt or prompt_ids should be provided."
)
if
(
prompt
and
prompt_ids
):
...
...
@@ -138,11 +140,12 @@ class OpenAIServing:
"Only one of prompt or prompt_ids should be provided."
)
if
prompt_ids
is
None
:
# When using OpenAIServingChat for chat completions, the
# special tokens (e.g., BOS) have already been added by the
# chat template. Therefore, we do not need to add them again.
# Set add_special_tokens to False to avoid adding the BOS tokens
# again.
# When using OpenAIServingChat for chat completions, for
# most models the special tokens (e.g., BOS) have already
# been added by the chat template. Therefore, we do not
# need to add them again.
# Set add_special_tokens to False (by default) to avoid
# adding the BOS tokens again.
tokenizer_kwargs
:
Dict
[
str
,
Any
]
=
{
"add_special_tokens"
:
add_special_tokens
}
...
...
vllm/envs.py
View file @
f48954a4
...
...
@@ -29,10 +29,10 @@ if TYPE_CHECKING:
VLLM_CPU_KVCACHE_SPACE
:
int
=
0
VLLM_USE_RAY_COMPILED_DAG
:
bool
=
False
VLLM_WORKER_MULTIPROC_METHOD
:
str
=
"spawn"
VLLM_IMAGE_FETCH_TIMEOUT
:
int
=
5
VLLM_TARGET_DEVICE
:
str
=
"cuda"
MAX_JOBS
:
Optional
[
str
]
=
None
NVCC_THREADS
:
Optional
[
str
]
=
None
VLLM_BUILD_WITH_NEURON
:
bool
=
False
VLLM_USE_PRECOMPILED
:
bool
=
False
VLLM_INSTALL_PUNICA_KERNELS
:
bool
=
False
CMAKE_BUILD_TYPE
:
Optional
[
str
]
=
None
...
...
@@ -62,10 +62,6 @@ environment_variables: Dict[str, Callable[[], Any]] = {
"NVCC_THREADS"
:
lambda
:
os
.
getenv
(
"NVCC_THREADS"
,
None
),
# If set, vllm will build with Neuron support
"VLLM_BUILD_WITH_NEURON"
:
lambda
:
bool
(
os
.
environ
.
get
(
"VLLM_BUILD_WITH_NEURON"
,
False
)),
# If set, vllm will use precompiled binaries (*.so)
"VLLM_USE_PRECOMPILED"
:
lambda
:
bool
(
os
.
environ
.
get
(
"VLLM_USE_PRECOMPILED"
)),
...
...
@@ -99,6 +95,9 @@ environment_variables: Dict[str, Callable[[], Any]] = {
lambda
:
os
.
getenv
(
'VLLM_HOST_IP'
,
""
)
or
os
.
getenv
(
"HOST_IP"
,
""
),
# used in distributed environment to manually set the communication port
# Note: if VLLM_PORT is set, and some code asks for multiple ports, the
# VLLM_PORT will be used as the first port, and the rest will be generated
# by incrementing the VLLM_PORT value.
# '0' is used to make mypy happy
'VLLM_PORT'
:
lambda
:
int
(
os
.
getenv
(
'VLLM_PORT'
,
'0'
))
...
...
@@ -213,6 +212,11 @@ environment_variables: Dict[str, Callable[[], Any]] = {
# Both spawn and fork work
"VLLM_WORKER_MULTIPROC_METHOD"
:
lambda
:
os
.
getenv
(
"VLLM_WORKER_MULTIPROC_METHOD"
,
"spawn"
),
# Timeout for fetching images when serving multimodal models
# Default is 5 seconds
"VLLM_IMAGE_FETCH_TIMEOUT"
:
lambda
:
int
(
os
.
getenv
(
"VLLM_IMAGE_FETCH_TIMEOUT"
,
"5"
)),
}
# end-env-vars-definition
...
...
vllm/executor/multiproc_gpu_executor.py
View file @
f48954a4
...
...
@@ -19,10 +19,6 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
"""Python multiprocessing-based multi-GPU executor"""
def
_init_executor
(
self
)
->
None
:
assert
(
not
self
.
speculative_config
),
"Speculative decoding not yet supported for MultiProcGPU backend."
# Create the parallel GPU workers.
world_size
=
self
.
parallel_config
.
tensor_parallel_size
...
...
@@ -34,6 +30,9 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
# Ensure that VLLM_INSTANCE_ID is set, to be inherited by workers
os
.
environ
[
"VLLM_INSTANCE_ID"
]
=
get_vllm_instance_id
()
# Disable torch async compiling which won't work with daemonic processes
os
.
environ
[
"TORCHINDUCTOR_COMPILE_THREADS"
]
=
"1"
from
torch.cuda
import
device_count
assert
world_size
<=
device_count
(),
(
"please set tensor_parallel_size to less than max local gpu count"
)
...
...
@@ -43,6 +42,7 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
if
world_size
==
1
:
self
.
workers
=
[]
self
.
worker_monitor
=
None
else
:
result_handler
=
ResultHandler
()
self
.
workers
=
[
...
...
@@ -124,7 +124,8 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
def
check_health
(
self
)
->
None
:
"""Raises an error if engine is unhealthy."""
if
not
self
.
worker_monitor
.
is_alive
():
if
self
.
worker_monitor
is
not
None
and
not
self
.
worker_monitor
.
is_alive
(
):
raise
RuntimeError
(
"Worker processes are not running"
)
def
_wait_for_tasks_completion
(
self
,
parallel_worker_tasks
:
Any
)
->
None
:
...
...
vllm/executor/multiproc_worker_utils.py
View file @
f48954a4
...
...
@@ -65,10 +65,11 @@ def _set_future_result(future: Union[ResultFuture, asyncio.Future],
future
.
set_result
(
result
)
return
loop
=
future
.
get_loop
()
if
result
.
exception
is
not
None
:
loop
.
call_soon_threadsafe
(
future
.
set_exception
,
result
.
exception
)
else
:
loop
.
call_soon_threadsafe
(
future
.
set_result
,
result
.
value
)
if
not
loop
.
is_closed
():
if
result
.
exception
is
not
None
:
loop
.
call_soon_threadsafe
(
future
.
set_exception
,
result
.
exception
)
else
:
loop
.
call_soon_threadsafe
(
future
.
set_result
,
result
.
value
)
class
ResultHandler
(
threading
.
Thread
):
...
...
vllm/executor/ray_gpu_executor.py
View file @
f48954a4
...
...
@@ -293,23 +293,6 @@ class RayGPUExecutor(DistributedGPUExecutor):
])
return
forward_dag
.
experimental_compile
()
def
check_health
(
self
)
->
None
:
"""Raises an error if engine is unhealthy."""
self
.
_check_if_any_actor_is_dead
()
def
_check_if_any_actor_is_dead
(
self
):
if
not
self
.
workers
:
return
dead_actors
=
[]
for
actor
in
self
.
workers
:
actor_state
=
ray
.
state
.
actors
(
actor
.
_ray_actor_id
.
hex
())
# pylint: disable=protected-access
if
actor_state
[
"State"
]
==
"DEAD"
:
dead_actors
.
append
(
actor
)
if
dead_actors
:
raise
RuntimeError
(
"At least one Worker is dead. "
f
"Dead Workers:
{
dead_actors
}
. "
)
class
RayGPUExecutorAsync
(
RayGPUExecutor
,
DistributedGPUExecutorAsync
):
...
...
vllm/lora/layers.py
View file @
f48954a4
...
...
@@ -215,19 +215,19 @@ class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA):
lora_config
:
LoRAConfig
,
model_config
:
Optional
[
PretrainedConfig
]
=
None
)
->
None
:
lora_vocab_start_idx
=
self
.
base_layer
.
org_vocab_size
weights_idx
=
None
if
self
.
base_layer
.
vocab_end_index
>
lora_vocab_start_idx
:
if
self
.
base_layer
.
num_added_embeddings_per_partition
>
0
:
# We can start adding lora weights
weights_idx
=
max
(
lora_vocab_start_idx
-
self
.
base_layer
.
vocab_start_index
,
0
)
self
.
embeddings_slice
=
(
self
.
base_layer
.
vocab_start_index
-
self
.
base_layer
.
org_vocab_size
+
weights_idx
,
self
.
base_layer
.
vocab_end_index
-
self
.
base_layer
.
org_vocab_size
)
self
.
embeddings_weights
=
self
.
base_layer
.
weight
.
data
[
weights_idx
:]
self
.
embeddings_weights
.
fill_
(
0
)
self
.
embeddings_weights
=
self
.
base_layer
.
weight
.
data
[
self
.
base_layer
.
num_org_embeddings_per_partition
:
self
.
base_layer
.
num_org_embeddings_per_partition
+
self
.
base_layer
.
num_added_embeddings_per_partition
]
self
.
embeddings_slice
=
(
self
.
base_layer
.
shard_indices
.
added_vocab_start_index
-
self
.
base_layer
.
org_vocab_size
,
self
.
base_layer
.
shard_indices
.
added_vocab_end_index
-
self
.
base_layer
.
org_vocab_size
)
self
.
base_layer
.
weight
.
data
[
self
.
base_layer
.
num_org_embeddings_per_partition
:].
fill_
(
0
)
else
:
self
.
embeddings_slice
=
None
self
.
embeddings_weights
=
None
...
...
@@ -1025,19 +1025,31 @@ class RowParallelLinearWithLoRA(BaseLayerWithLoRA):
class
LogitsProcessorWithLoRA
(
BaseLayerWithLoRA
):
"""
LoRA wrapper for LogitsProcessor, with extra logic to handle the
application of the LoRA adapter and added LoRA vocabulary.
Args:
base_layer: LogitsProcessor layer
hidden_size: hidden size of the model
dtype: data type of the model
device: device of the model
sharded_to_full_mapping: index mapping from sharded vocab to full vocab
received from base_layer.get_sharded_to_full_mapping(). If None,
no reindexing will be done.
"""
def
__init__
(
self
,
base_layer
:
LogitsProcessor
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
device
:
torch
.
device
,
)
->
None
:
def
__init__
(
self
,
base_layer
:
LogitsProcessor
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
device
:
torch
.
device
,
sharded_to_full_mapping
:
Optional
[
List
[
int
]])
->
None
:
super
().
__init__
()
self
.
base_layer
=
base_layer
self
.
hidden_size
=
hidden_size
self
.
dtype
=
dtype
self
.
device
=
device
self
.
tp_size
=
get_tensor_model_parallel_world_size
()
self
.
tp_rank
=
get_tensor_model_parallel_rank
()
self
.
sharded_to_full_mapping
=
sharded_to_full_mapping
@
property
def
logits_as_input
(
self
):
...
...
@@ -1098,6 +1110,13 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA):
dtype
=
self
.
dtype
,
device
=
self
.
device
,
)
if
self
.
sharded_to_full_mapping
is
not
None
:
self
.
sharded_to_full_mapping_gpu
=
torch
.
tensor
(
self
.
sharded_to_full_mapping
,
device
=
self
.
device
,
dtype
=
torch
.
long
)
else
:
self
.
sharded_to_full_mapping_gpu
=
None
# Lazily initialized.
self
.
indices
:
torch
.
Tensor
self
.
indices_len
:
List
[
int
]
...
...
@@ -1154,6 +1173,25 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA):
if
logits
is
None
:
return
None
if
self
.
sharded_to_full_mapping_gpu
is
not
None
:
# Reindex full logits tensor to ensure 1:1 mapping between
# index and token_id
# Example for:
# org_vocab_size = 4
# added_vocab_size = 2
# pad_to_size = 8
# tp_size = 2
# indices: [0, 1, 2, 3, 4, 5, 6, 7]
# token_id: [0, 1, 4, -1, 2, 3, 5, -1]
# Therefore, the mapping is expected to be:
# [0, 1, 4, 6, 2, 3, 5, 7] so that when we reindex,
# we get:
# indices: [0, 1, 2, 3, 4, 5, 6, 7]
# token_id: [0, 1, 2, 3, 4, 5, -1, -1]
logits
=
logits
[:,
self
.
sharded_to_full_mapping_gpu
]
lora_logits
=
torch
.
empty
(
self
.
embeddings_tensors
.
shape
[
0
]
+
1
,
self
.
embeddings_tensors
.
shape
[
1
],
...
...
vllm/lora/punica.py
View file @
f48954a4
...
...
@@ -4,16 +4,21 @@ from typing import Optional
import
torch
from
vllm
import
_custom_ops
as
ops
def
_check_punica_support
():
if
ops
.
is_custom_op_supported
(
"_punica_C::dispatch_bgmv"
):
return
def
_raise_import_error
(
e
):
if
torch
.
cuda
.
get_device_capability
()
<
(
8
,
0
):
raise
ImportError
(
"punica LoRA kernels require compute capability >= 8.0"
)
from
e
"punica LoRA kernels require compute capability >= 8.0"
)
else
:
raise
ImportError
(
"punica LoRA kernels could not be imported. If you built vLLM "
"from source, make sure VLLM_INSTALL_PUNICA_KERNELS=1 env var "
"was set."
)
from
e
"was set."
)
def
bgmv
(
...
...
@@ -41,12 +46,9 @@ def bgmv(
layer_idx: Layer index of the weight matrices.
scale: Scaling factor.
"""
try
:
import
vllm._punica_C
as
punica_kernels
except
ImportError
as
e
:
_raise_import_error
(
e
)
_check_punica_support
()
p
unica_kernel
s
.
dispatch_bgmv
(
y
,
x
,
w_t_all
,
indicies
,
layer_idx
,
scale
)
o
ps
.
dispatch_bgmv
(
y
,
x
,
w_t_all
,
indicies
,
layer_idx
,
scale
)
def
dispatch_bgmv_low_level
(
y
:
torch
.
Tensor
,
x
:
torch
.
Tensor
,
...
...
@@ -75,11 +77,9 @@ def dispatch_bgmv_low_level(y: torch.Tensor, x: torch.Tensor,
y_offset: Offset to apply to the starting column of y.
y_slice_size: Size of the y column slice.
"""
try
:
import
vllm._punica_C
as
punica_kernels
except
ImportError
as
e
:
_raise_import_error
(
e
)
punica_kernels
.
dispatch_bgmv_low_level
(
_check_punica_support
()
ops
.
dispatch_bgmv_low_level
(
y
,
x
,
w_t_all
,
...
...
@@ -122,10 +122,7 @@ def add_lora(y: torch.Tensor,
scale: Scaling factor.
buffer: Optional. Shape: `[B, R]`. Temporary buffer.
"""
try
:
import
vllm._punica_C
as
punica_kernels
except
ImportError
as
e
:
_raise_import_error
(
e
)
_check_punica_support
()
r
=
wb_t_all
.
size
(
-
1
)
if
buffer
is
None
:
...
...
@@ -135,9 +132,8 @@ def add_lora(y: torch.Tensor,
buffer
=
torch
.
zeros
((
x
.
size
(
0
),
r
),
dtype
=
torch
.
float32
,
device
=
x
.
device
)
punica_kernels
.
dispatch_bgmv
(
buffer
,
x
,
wa_t_all
,
indicies
,
layer_idx
,
1.0
)
punica_kernels
.
dispatch_bgmv
(
y
,
buffer
,
wb_t_all
,
indicies
,
layer_idx
,
scale
)
ops
.
dispatch_bgmv
(
buffer
,
x
,
wa_t_all
,
indicies
,
layer_idx
,
1.0
)
ops
.
dispatch_bgmv
(
y
,
buffer
,
wb_t_all
,
indicies
,
layer_idx
,
scale
)
def
add_lora_slice
(
y
:
torch
.
Tensor
,
...
...
@@ -176,10 +172,7 @@ def add_lora_slice(y: torch.Tensor,
y_offset: Offset to apply to the starting column of y.
y_slice_size: Size of the y column slice.
"""
try
:
import
vllm._punica_C
as
punica_kernels
except
ImportError
as
e
:
_raise_import_error
(
e
)
_check_punica_support
()
r
=
wb_t_all
.
size
(
-
1
)
if
buffer
is
None
:
...
...
@@ -189,7 +182,7 @@ def add_lora_slice(y: torch.Tensor,
buffer
=
torch
.
zeros
((
x
.
size
(
0
),
r
),
dtype
=
torch
.
float32
,
device
=
x
.
device
)
p
unica_kernel
s
.
dispatch_bgmv_low_level
(
o
ps
.
dispatch_bgmv_low_level
(
buffer
,
x
,
wa_t_all
,
...
...
@@ -200,7 +193,7 @@ def add_lora_slice(y: torch.Tensor,
buffer
.
size
(
1
),
0
,
)
p
unica_kernel
s
.
dispatch_bgmv_low_level
(
o
ps
.
dispatch_bgmv_low_level
(
y
,
buffer
,
wb_t_all
,
...
...
vllm/lora/utils.py
View file @
f48954a4
...
...
@@ -67,7 +67,8 @@ def from_layer_logits_processor(
model_config
:
Optional
[
PretrainedConfig
]
=
None
,
)
->
LogitsProcessorWithLoRA
:
ret
=
LogitsProcessorWithLoRA
(
layer
,
lm_head
.
embedding_dim
,
lm_head
.
weight
.
dtype
,
lm_head
.
weight
.
device
)
lm_head
.
weight
.
dtype
,
lm_head
.
weight
.
device
,
lm_head
.
get_sharded_to_full_mapping
())
ret
.
create_lora_weights
(
max_loras
,
lora_config
,
model_config
)
return
ret
...
...
@@ -93,13 +94,12 @@ def parse_fine_tuned_lora_name(name: str) -> Tuple[str, bool]:
is_lora_a whether the tensor is lora_a or lora_b.
"""
parts
=
name
.
split
(
"."
)
assert
parts
[
0
]
==
"base_model"
assert
parts
[
1
]
==
"model"
if
parts
[
-
1
]
==
"weight"
:
assert
parts
[
-
2
]
==
"lora_A"
or
parts
[
-
2
]
==
"lora_B"
return
"."
.
join
(
parts
[
2
:
-
2
]),
parts
[
-
2
]
==
"lora_A"
if
parts
[
-
1
]
==
"lora_embedding_A"
or
parts
[
-
1
]
==
"lora_embedding_B"
:
return
"."
.
join
(
parts
[
2
:
-
1
]),
parts
[
-
1
]
==
"lora_embedding_A"
if
len
(
parts
)
>=
2
and
parts
[
0
]
==
"base_model"
and
parts
[
1
]
==
"model"
:
if
parts
[
-
1
]
==
"weight"
:
if
parts
[
-
2
]
==
"lora_A"
or
parts
[
-
2
]
==
"lora_B"
:
return
"."
.
join
(
parts
[
2
:
-
2
]),
parts
[
-
2
]
==
"lora_A"
elif
parts
[
-
1
]
==
"lora_embedding_A"
or
parts
[
-
1
]
==
"lora_embedding_B"
:
return
"."
.
join
(
parts
[
2
:
-
1
]),
parts
[
-
1
]
==
"lora_embedding_A"
raise
ValueError
(
f
"
{
name
}
is unsupported
forma
t"
)
raise
ValueError
(
f
"
{
name
}
is unsupported
LoRA weigh
t"
)
vllm/lora/worker_manager.py
View file @
f48954a4
from
abc
import
ABC
,
abstractmethod
,
abstractproperty
from
abc
import
ABC
,
abstractmethod
from
contextlib
import
contextmanager
from
typing
import
Any
,
Dict
,
List
,
Literal
,
Optional
,
Set
,
Type
,
Union
...
...
@@ -42,7 +42,8 @@ class AbstractWorkerLoRAManager(ABC):
yield
self
.
_cached_dummy_lora
=
False
@
abstractproperty
@
property
@
abstractmethod
def
is_enabled
(
self
)
->
bool
:
...
...
...
vllm/model_executor/custom_op.py
0 → 100644
View file @
f48954a4
import
torch.nn
as
nn
from
vllm.utils
import
is_cpu
,
is_hip
class CustomOp(nn.Module):
    """Module base class whose ``forward`` is routed to a per-backend method.

    The backend-specific method is selected once, at construction time, by
    :meth:`dispatch_forward` and cached in ``self._forward_method``.
    Subclasses override ``forward_cuda`` and/or ``forward_native`` (and any
    other ``forward_*`` variant that needs a dedicated implementation).
    """

    def __init__(self, *args, **kwargs):
        # Positional/keyword arguments are accepted but not forwarded to
        # nn.Module.__init__.
        super().__init__()
        self._forward_method = self.dispatch_forward()

    def forward(self, *args, **kwargs):
        """Invoke the implementation chosen by ``dispatch_forward``."""
        selected_impl = self._forward_method
        return selected_impl(*args, **kwargs)

    def forward_native(self, *args, **kwargs):
        """PyTorch-native implementation of the forward method.

        This method is optional. If implemented, it can be used with compilers
        such as torch.compile or PyTorch XLA. Also, it can be used for testing
        purposes.
        """
        raise NotImplementedError

    def forward_cuda(self, *args, **kwargs):
        """CUDA implementation; must be provided by subclasses."""
        raise NotImplementedError

    def forward_hip(self, *args, **kwargs):
        """HIP implementation.

        By default, we assume that HIP ops are compatible with CUDA ops.
        """
        return self.forward_cuda(*args, **kwargs)

    def forward_xpu(self, *args, **kwargs):
        """XPU implementation.

        By default, we assume that XPU ops are compatible with CUDA ops.
        NOTE(woosuk): This is a placeholder for future extensions.
        """
        return self.forward_cuda(*args, **kwargs)

    def forward_cpu(self, *args, **kwargs):
        """CPU implementation.

        By default, we assume that CPU ops are compatible with CUDA ops.
        """
        return self.forward_cuda(*args, **kwargs)

    def forward_tpu(self, *args, **kwargs):
        """TPU implementation.

        By default, we assume that TPU ops are compatible with the
        PyTorch-native implementation.
        NOTE(woosuk): This is a placeholder for future extensions.
        """
        return self.forward_native(*args, **kwargs)

    def forward_gaudi(self, *args, **kwargs):
        """Gaudi implementation.

        By default, we assume that Gaudi ops are compatible with the
        PyTorch-native implementation.
        NOTE(woosuk): This is a placeholder for future extensions.
        """
        return self.forward_native(*args, **kwargs)

    def dispatch_forward(self):
        """Return the bound ``forward_*`` method for the current backend.

        NOTE(woosuk): Here we assume that vLLM was built for only one
        specific backend. Currently, we do not support dynamic dispatching.
        """
        # HIP is checked before CPU; anything else falls through to CUDA.
        if is_hip():
            return self.forward_hip
        if is_cpu():
            return self.forward_cpu
        return self.forward_cuda
vllm/model_executor/guided_decoding/__init__.py
View file @
f48954a4
from
typing
import
Optional
,
Union
from
vllm.entrypoints.openai.protocol
import
(
ChatCompletionRequest
,
CompletionRequest
)
from
vllm.entrypoints.openai.protocol
import
(
ChatCompletionNamedToolChoiceParam
,
ChatCompletionRequest
,
CompletionRequest
)
from
vllm.model_executor.guided_decoding.lm_format_enforcer_decoding
import
(
get_lm_format_enforcer_guided_decoding_logits_processor
)
from
vllm.model_executor.guided_decoding.outlines_decoding
import
(
...
...
@@ -13,6 +14,8 @@ async def get_guided_decoding_logits_processor(
guided_decoding_backend
:
str
,
request
:
Union
[
CompletionRequest
,
ChatCompletionRequest
],
tokenizer
)
->
Optional
[
LogitsProcessor
]:
request
=
_adapt_request_for_tool_use
(
request
)
if
guided_decoding_backend
==
'outlines'
:
return
await
get_outlines_guided_decoding_logits_processor
(
request
,
tokenizer
)
...
...
@@ -23,3 +26,26 @@ async def get_guided_decoding_logits_processor(
raise
ValueError
(
f
"Unknown guided decoding backend '
{
guided_decoding_backend
}
'. "
"Must be one of 'outlines, 'lm-format-enforcer'"
)
def _adapt_request_for_tool_use(request: Union[CompletionRequest,
                                               ChatCompletionRequest]):
    """Translate a named tool choice into a guided-JSON constraint.

    The request is returned unchanged when it is a ``CompletionRequest``
    (the legacy completion API does not support tool use), when
    ``tool_choice`` equals ``"none"``, or when ``tool_choice`` is not a
    ``ChatCompletionNamedToolChoiceParam``. Otherwise the function named by
    ``tool_choice`` is looked up in ``request.tools`` and its ``parameters``
    are stored in ``request.guided_json``.

    Args:
        request: The incoming completion or chat-completion request. It is
            modified in place when a named tool is selected.

    Returns:
        The same ``request`` object that was passed in.

    Raises:
        ValueError: If the named tool is not present in ``request.tools``.
    """
    # Exact type checks (not isinstance) are kept on purpose to match the
    # established behavior.
    if type(request) is CompletionRequest:
        return request

    # user has chosen to not use any tool
    if request.tool_choice == "none":
        return request

    # Anything other than an explicitly named tool needs no adaptation.
    if type(request.tool_choice) is not ChatCompletionNamedToolChoiceParam:
        return request

    # user has chosen to use a named tool
    wanted_name = request.tool_choice.function.name

    # Index the offered tool functions by name; a later duplicate replaces
    # an earlier one.
    functions_by_name = {}
    for offered in request.tools:
        functions_by_name[offered.function.name] = offered.function

    if wanted_name not in functions_by_name:
        raise ValueError(
            f"Tool '{wanted_name}' has not been passed in `tools`.")

    # Constrain decoding to the selected tool's parameters.
    request.guided_json = functions_by_name[wanted_name].parameters
    return request
Prev
1
…
6
7
8
9
10
11
12
13
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment