Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
8f10d5e3
Unverified
Commit
8f10d5e3
authored
Dec 11, 2024
by
Cyrus Leung
Committed by
GitHub
Dec 11, 2024
Browse files
[Misc] Split up pooling tasks (#10820)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
40766ca1
Changes
27
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
39 additions
and
34 deletions
+39
-34
vllm/entrypoints/llm.py
vllm/entrypoints/llm.py
+29
-24
vllm/entrypoints/openai/api_server.py
vllm/entrypoints/openai/api_server.py
+4
-4
vllm/entrypoints/openai/run_batch.py
vllm/entrypoints/openai/run_batch.py
+2
-2
vllm/model_executor/model_loader/utils.py
vllm/model_executor/model_loader/utils.py
+1
-1
vllm/v1/engine/core.py
vllm/v1/engine/core.py
+1
-1
vllm/worker/cpu_worker.py
vllm/worker/cpu_worker.py
+1
-1
vllm/worker/worker.py
vllm/worker/worker.py
+1
-1
No files found.
vllm/entrypoints/llm.py
View file @
8f10d5e3
...
...
@@ -381,19 +381,20 @@ class LLM:
considered legacy and may be deprecated in the future. You should
instead pass them via the ``inputs`` parameter.
"""
task
=
self
.
llm_engine
.
model_config
.
task
if
task
!=
"generate"
:
runner_type
=
self
.
llm_engine
.
model_config
.
runner_type
if
runner_type
!=
"generate"
:
messages
=
[
"LLM.generate() is only supported for (conditional) generation "
"models (XForCausalLM, XForConditionalGeneration)."
,
]
supported_tasks
=
self
.
llm_engine
.
model_config
.
supported_tasks
if
"generate"
in
supported_tasks
:
supported_runner_types
=
self
.
llm_engine
.
model_config
\
.
supported_runner_types
if
"generate"
in
supported_runner_types
:
messages
.
append
(
"Your model supports the 'generate'
task
, but is "
f
"currently initialized for the '
{
task
}
' task. Please
"
"initialize
the model
using `--task generate`."
)
"Your model supports the 'generate'
runner
, but is "
f
"currently initialized for the '
{
runner_type
}
' runner.
"
"
Please
initialize
vLLM
using `--task generate`."
)
raise
ValueError
(
" "
.
join
(
messages
))
...
...
@@ -793,16 +794,18 @@ class LLM:
considered legacy and may be deprecated in the future. You should
instead pass them via the ``inputs`` parameter.
"""
task
=
self
.
llm_engine
.
model_config
.
task
if
task
!=
"embedd
ing"
:
messages
=
[
"LLM.encode() is only supported for
embedd
ing models."
]
runner_type
=
self
.
llm_engine
.
model_config
.
runner_type
if
runner_type
!=
"pool
ing"
:
messages
=
[
"LLM.encode() is only supported for
pool
ing models."
]
supported_tasks
=
self
.
llm_engine
.
model_config
.
supported_tasks
if
"embedding"
in
supported_tasks
:
supported_runner_types
=
self
.
llm_engine
.
model_config
\
.
supported_runner_types
if
"pooling"
in
supported_runner_types
:
messages
.
append
(
"Your model supports the 'embedding' task, but is "
f
"currently initialized for the '
{
task
}
' task. Please "
"initialize the model using `--task embedding`."
)
"Your model supports the 'pooling' runner, but is "
f
"currently initialized for the '
{
runner_type
}
' runner. "
"Please initialize vLLM using `--task embed`, "
"`--task classify`, `--task score` etc."
)
raise
ValueError
(
" "
.
join
(
messages
))
...
...
@@ -864,21 +867,23 @@ class LLM:
A list of ``PoolingRequestOutput`` objects containing the
generated scores in the same order as the input prompts.
"""
task
=
self
.
llm_engine
.
model_config
.
task
if
task
!=
"embedd
ing"
:
messages
=
[
"LLM.score() is only supported for
embedd
ing models."
]
runner_type
=
self
.
llm_engine
.
model_config
.
runner_type
if
runner_type
!=
"pool
ing"
:
messages
=
[
"LLM.score() is only supported for
pool
ing models."
]
supported_tasks
=
self
.
llm_engine
.
model_config
.
supported_tasks
if
"embedding"
in
supported_tasks
:
supported_runner_types
=
self
.
llm_engine
.
model_config
\
.
supported_runner_types
if
"pooling"
in
supported_runner_types
:
messages
.
append
(
"Your model supports the 'embedding' task, but is "
f
"currently initialized for the '
{
task
}
' task. Please "
"initialize the model using `--task embedding`."
)
"Your model supports the 'pooling' runner, but is "
f
"currently initialized for the '
{
runner_type
}
' runner. "
"Please initialize vLLM using `--task embed`, "
"`--task classify`, `--task score` etc."
)
raise
ValueError
(
" "
.
join
(
messages
))
if
not
self
.
llm_engine
.
model_config
.
is_cross_encoder
:
raise
ValueError
(
"Your model does not support
the
cross encoding"
)
raise
ValueError
(
"Your model does not support cross encoding"
)
tokenizer
=
self
.
llm_engine
.
get_tokenizer
()
...
...
vllm/entrypoints/openai/api_server.py
View file @
8f10d5e3
...
...
@@ -573,7 +573,7 @@ def init_app_state(
enable_auto_tools
=
args
.
enable_auto_tool_choice
,
tool_parser
=
args
.
tool_call_parser
,
enable_prompt_tokens_details
=
args
.
enable_prompt_tokens_details
,
)
if
model_config
.
task
==
"generate"
else
None
)
if
model_config
.
runner_type
==
"generate"
else
None
state
.
openai_serving_completion
=
OpenAIServingCompletion
(
engine_client
,
model_config
,
...
...
@@ -582,7 +582,7 @@ def init_app_state(
prompt_adapters
=
args
.
prompt_adapters
,
request_logger
=
request_logger
,
return_tokens_as_token_ids
=
args
.
return_tokens_as_token_ids
,
)
if
model_config
.
task
==
"generate"
else
None
)
if
model_config
.
runner_type
==
"generate"
else
None
state
.
openai_serving_embedding
=
OpenAIServingEmbedding
(
engine_client
,
model_config
,
...
...
@@ -590,13 +590,13 @@ def init_app_state(
request_logger
=
request_logger
,
chat_template
=
resolved_chat_template
,
chat_template_content_format
=
args
.
chat_template_content_format
,
)
if
model_config
.
task
==
"embedd
ing"
else
None
)
if
model_config
.
runner_type
==
"pool
ing"
else
None
state
.
openai_serving_scores
=
OpenAIServingScores
(
engine_client
,
model_config
,
base_model_paths
,
request_logger
=
request_logger
)
if
(
model_config
.
task
==
"embedd
ing"
\
)
if
(
model_config
.
runner_type
==
"pool
ing"
\
and
model_config
.
is_cross_encoder
)
else
None
state
.
openai_serving_tokenization
=
OpenAIServingTokenization
(
engine_client
,
...
...
vllm/entrypoints/openai/run_batch.py
View file @
8f10d5e3
...
...
@@ -224,7 +224,7 @@ async def main(args):
chat_template
=
None
,
chat_template_content_format
=
"auto"
,
enable_prompt_tokens_details
=
args
.
enable_prompt_tokens_details
,
)
if
model_config
.
task
==
"generate"
else
None
)
if
model_config
.
runner_type
==
"generate"
else
None
openai_serving_embedding
=
OpenAIServingEmbedding
(
engine
,
model_config
,
...
...
@@ -232,7 +232,7 @@ async def main(args):
request_logger
=
request_logger
,
chat_template
=
None
,
chat_template_content_format
=
"auto"
,
)
if
model_config
.
task
==
"embedd
ing"
else
None
)
if
model_config
.
runner_type
==
"pool
ing"
else
None
tracker
=
BatchProgressTracker
()
logger
.
info
(
"Reading batch from %s..."
,
args
.
input_file
)
...
...
vllm/model_executor/model_loader/utils.py
View file @
8f10d5e3
...
...
@@ -35,7 +35,7 @@ def get_model_architecture(
architectures
=
[
"QuantMixtralForCausalLM"
]
model_cls
,
arch
=
ModelRegistry
.
resolve_model_cls
(
architectures
)
if
model_config
.
task
==
"embedd
ing"
:
if
model_config
.
runner_type
==
"pool
ing"
:
model_cls
=
as_embedding_model
(
model_cls
)
return
model_cls
,
arch
...
...
vllm/v1/engine/core.py
View file @
8f10d5e3
...
...
@@ -42,7 +42,7 @@ class EngineCore:
executor_class
:
Type
[
Executor
],
usage_context
:
UsageContext
,
):
assert
vllm_config
.
model_config
.
task
!=
"embedd
ing"
assert
vllm_config
.
model_config
.
runner_type
!=
"pool
ing"
logger
.
info
(
"Initializing an LLM engine (v%s) with config: %s"
,
VLLM_VERSION
,
vllm_config
)
...
...
vllm/worker/cpu_worker.py
View file @
8f10d5e3
...
...
@@ -163,7 +163,7 @@ class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
not
in
[
"medusa"
,
"mlp_speculator"
,
"eagle"
])
\
else
{
"return_hidden_states"
:
True
}
ModelRunnerClass
:
Type
[
CPUModelRunnerBase
]
=
CPUModelRunner
if
self
.
model_config
.
task
==
"embedd
ing"
:
if
self
.
model_config
.
runner_type
==
"pool
ing"
:
ModelRunnerClass
=
CPUPoolingModelRunner
elif
self
.
model_config
.
is_encoder_decoder
:
ModelRunnerClass
=
CPUEncoderDecoderModelRunner
...
...
vllm/worker/worker.py
View file @
8f10d5e3
...
...
@@ -75,7 +75,7 @@ class Worker(LocalOrDistributedWorkerBase):
else
{
"return_hidden_states"
:
True
}
ModelRunnerClass
:
Type
[
GPUModelRunnerBase
]
=
ModelRunner
if
model_config
.
task
==
"embedd
ing"
:
if
model_config
.
runner_type
==
"pool
ing"
:
ModelRunnerClass
=
PoolingModelRunner
elif
self
.
model_config
.
is_encoder_decoder
:
ModelRunnerClass
=
EncoderDecoderModelRunner
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment