Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
d2f058e7
Unverified
Commit
d2f058e7
authored
Dec 01, 2024
by
Cyrus Leung
Committed by
GitHub
Dec 01, 2024
Browse files
[Misc] Rename embedding classes to pooling (#10801)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
f877a7d1
Changes
25
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
13 additions
and
13 deletions
+13
-13
vllm/v1/engine/async_stream.py
vllm/v1/engine/async_stream.py
+4
-4
vllm/worker/cpu_pooling_model_runner.py
vllm/worker/cpu_pooling_model_runner.py
+2
-2
vllm/worker/cpu_worker.py
vllm/worker/cpu_worker.py
+2
-2
vllm/worker/pooling_model_runner.py
vllm/worker/pooling_model_runner.py
+3
-3
vllm/worker/worker.py
vllm/worker/worker.py
+2
-2
No files found.
vllm/v1/engine/async_stream.py
View file @
d2f058e7
import
asyncio
import
asyncio
from
typing
import
Any
,
AsyncGenerator
,
Callable
,
Optional
,
Type
,
Union
from
typing
import
Any
,
AsyncGenerator
,
Callable
,
Optional
,
Type
,
Union
from
vllm.outputs
import
Embedd
ingRequestOutput
,
RequestOutput
from
vllm.outputs
import
Pool
ingRequestOutput
,
RequestOutput
class
AsyncStream
:
class
AsyncStream
:
"""A stream of RequestOutputs or
Embedd
ingRequestOutputs for a request
"""A stream of RequestOutputs or
Pool
ingRequestOutputs for a request
that can be iterated over asynchronously via an async generator."""
that can be iterated over asynchronously via an async generator."""
STOP_ITERATION
=
Exception
()
# Sentinel
STOP_ITERATION
=
Exception
()
# Sentinel
...
@@ -16,7 +16,7 @@ class AsyncStream:
...
@@ -16,7 +16,7 @@ class AsyncStream:
self
.
_queue
:
asyncio
.
Queue
=
asyncio
.
Queue
()
self
.
_queue
:
asyncio
.
Queue
=
asyncio
.
Queue
()
self
.
_finished
=
False
self
.
_finished
=
False
def
put
(
self
,
item
:
Union
[
RequestOutput
,
Embedd
ingRequestOutput
,
def
put
(
self
,
item
:
Union
[
RequestOutput
,
Pool
ingRequestOutput
,
Exception
])
->
None
:
Exception
])
->
None
:
if
not
self
.
_finished
:
if
not
self
.
_finished
:
self
.
_queue
.
put_nowait
(
item
)
self
.
_queue
.
put_nowait
(
item
)
...
@@ -32,7 +32,7 @@ class AsyncStream:
...
@@ -32,7 +32,7 @@ class AsyncStream:
async
def
generator
(
async
def
generator
(
self
self
)
->
AsyncGenerator
[
Union
[
RequestOutput
,
Embedd
ingRequestOutput
],
None
]:
)
->
AsyncGenerator
[
Union
[
RequestOutput
,
Pool
ingRequestOutput
],
None
]:
finished
=
False
finished
=
False
try
:
try
:
while
True
:
while
True
:
...
...
vllm/worker/cpu_
embedd
ing_model_runner.py
→
vllm/worker/cpu_
pool
ing_model_runner.py
View file @
d2f058e7
...
@@ -16,12 +16,12 @@ from vllm.worker.cpu_model_runner import (CPUModelRunnerBase, ModelInputForCPU,
...
@@ -16,12 +16,12 @@ from vllm.worker.cpu_model_runner import (CPUModelRunnerBase, ModelInputForCPU,
@
dataclasses
.
dataclass
(
frozen
=
True
)
@
dataclasses
.
dataclass
(
frozen
=
True
)
class
ModelInputForCPUWithPoolingMetadata
(
ModelInputForCPU
):
class
ModelInputForCPUWithPoolingMetadata
(
ModelInputForCPU
):
"""
"""
Used by the CPU
Embedd
ingModelRunner.
Used by the CPU
Pool
ingModelRunner.
"""
"""
pooling_metadata
:
Optional
[
"PoolingMetadata"
]
=
None
pooling_metadata
:
Optional
[
"PoolingMetadata"
]
=
None
class
CPU
Embedd
ingModelRunner
(
class
CPU
Pool
ingModelRunner
(
CPUModelRunnerBase
[
ModelInputForCPUWithPoolingMetadata
]):
CPUModelRunnerBase
[
ModelInputForCPUWithPoolingMetadata
]):
_model_input_cls
:
Type
[
ModelInputForCPUWithPoolingMetadata
]
=
(
_model_input_cls
:
Type
[
ModelInputForCPUWithPoolingMetadata
]
=
(
ModelInputForCPUWithPoolingMetadata
)
ModelInputForCPUWithPoolingMetadata
)
...
...
vllm/worker/cpu_worker.py
View file @
d2f058e7
...
@@ -14,9 +14,9 @@ from vllm.logger import init_logger
...
@@ -14,9 +14,9 @@ from vllm.logger import init_logger
from
vllm.model_executor
import
set_random_seed
from
vllm.model_executor
import
set_random_seed
from
vllm.sequence
import
ExecuteModelRequest
from
vllm.sequence
import
ExecuteModelRequest
from
vllm.utils
import
STR_DTYPE_TO_TORCH_DTYPE
from
vllm.utils
import
STR_DTYPE_TO_TORCH_DTYPE
from
vllm.worker.cpu_embedding_model_runner
import
CPUEmbeddingModelRunner
from
vllm.worker.cpu_enc_dec_model_runner
import
CPUEncoderDecoderModelRunner
from
vllm.worker.cpu_enc_dec_model_runner
import
CPUEncoderDecoderModelRunner
from
vllm.worker.cpu_model_runner
import
CPUModelRunner
,
CPUModelRunnerBase
from
vllm.worker.cpu_model_runner
import
CPUModelRunner
,
CPUModelRunnerBase
from
vllm.worker.cpu_pooling_model_runner
import
CPUPoolingModelRunner
from
vllm.worker.worker_base
import
(
LocalOrDistributedWorkerBase
,
from
vllm.worker.worker_base
import
(
LocalOrDistributedWorkerBase
,
LoraNotSupportedWorkerBase
,
WorkerBase
,
LoraNotSupportedWorkerBase
,
WorkerBase
,
WorkerInput
)
WorkerInput
)
...
@@ -164,7 +164,7 @@ class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
...
@@ -164,7 +164,7 @@ class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
else
{
"return_hidden_states"
:
True
}
else
{
"return_hidden_states"
:
True
}
ModelRunnerClass
:
Type
[
CPUModelRunnerBase
]
=
CPUModelRunner
ModelRunnerClass
:
Type
[
CPUModelRunnerBase
]
=
CPUModelRunner
if
self
.
model_config
.
task
==
"embedding"
:
if
self
.
model_config
.
task
==
"embedding"
:
ModelRunnerClass
=
CPU
Embedd
ingModelRunner
ModelRunnerClass
=
CPU
Pool
ingModelRunner
elif
self
.
model_config
.
is_encoder_decoder
:
elif
self
.
model_config
.
is_encoder_decoder
:
ModelRunnerClass
=
CPUEncoderDecoderModelRunner
ModelRunnerClass
=
CPUEncoderDecoderModelRunner
self
.
model_runner
:
CPUModelRunnerBase
=
ModelRunnerClass
(
self
.
model_runner
:
CPUModelRunnerBase
=
ModelRunnerClass
(
...
...
vllm/worker/
embedd
ing_model_runner.py
→
vllm/worker/
pool
ing_model_runner.py
View file @
d2f058e7
...
@@ -21,12 +21,12 @@ logger = init_logger(__name__)
...
@@ -21,12 +21,12 @@ logger = init_logger(__name__)
@
dataclasses
.
dataclass
(
frozen
=
True
)
@
dataclasses
.
dataclass
(
frozen
=
True
)
class
ModelInputForGPUWithPoolingMetadata
(
ModelInputForGPU
):
class
ModelInputForGPUWithPoolingMetadata
(
ModelInputForGPU
):
"""
"""
Used by the
Embedd
ingModelRunner.
Used by the
Pool
ingModelRunner.
"""
"""
pooling_metadata
:
Optional
[
"PoolingMetadata"
]
=
None
pooling_metadata
:
Optional
[
"PoolingMetadata"
]
=
None
class
Embedd
ingModelRunner
(
class
Pool
ingModelRunner
(
GPUModelRunnerBase
[
ModelInputForGPUWithPoolingMetadata
]):
GPUModelRunnerBase
[
ModelInputForGPUWithPoolingMetadata
]):
_model_input_cls
:
Type
[
ModelInputForGPUWithPoolingMetadata
]
=
(
_model_input_cls
:
Type
[
ModelInputForGPUWithPoolingMetadata
]
=
(
ModelInputForGPUWithPoolingMetadata
)
ModelInputForGPUWithPoolingMetadata
)
...
@@ -52,7 +52,7 @@ class EmbeddingModelRunner(
...
@@ -52,7 +52,7 @@ class EmbeddingModelRunner(
)
->
Optional
[
Union
[
List
[
PoolerOutput
],
IntermediateTensors
]]:
)
->
Optional
[
Union
[
List
[
PoolerOutput
],
IntermediateTensors
]]:
if
num_steps
>
1
:
if
num_steps
>
1
:
raise
ValueError
(
raise
ValueError
(
"
Embedd
ingModelRunner does not support multi-step execution."
)
"
Pool
ingModelRunner does not support multi-step execution."
)
if
self
.
lora_config
:
if
self
.
lora_config
:
assert
model_input
.
lora_requests
is
not
None
assert
model_input
.
lora_requests
is
not
None
...
...
vllm/worker/worker.py
View file @
d2f058e7
...
@@ -22,9 +22,9 @@ from vllm.prompt_adapter.request import PromptAdapterRequest
...
@@ -22,9 +22,9 @@ from vllm.prompt_adapter.request import PromptAdapterRequest
from
vllm.sequence
import
(
ExecuteModelRequest
,
IntermediateTensors
,
from
vllm.sequence
import
(
ExecuteModelRequest
,
IntermediateTensors
,
SequenceGroupMetadata
,
SequenceGroupMetadataDelta
)
SequenceGroupMetadata
,
SequenceGroupMetadataDelta
)
from
vllm.worker.cache_engine
import
CacheEngine
from
vllm.worker.cache_engine
import
CacheEngine
from
vllm.worker.embedding_model_runner
import
EmbeddingModelRunner
from
vllm.worker.enc_dec_model_runner
import
EncoderDecoderModelRunner
from
vllm.worker.enc_dec_model_runner
import
EncoderDecoderModelRunner
from
vllm.worker.model_runner
import
GPUModelRunnerBase
,
ModelRunner
from
vllm.worker.model_runner
import
GPUModelRunnerBase
,
ModelRunner
from
vllm.worker.pooling_model_runner
import
PoolingModelRunner
from
vllm.worker.worker_base
import
(
LocalOrDistributedWorkerBase
,
WorkerBase
,
from
vllm.worker.worker_base
import
(
LocalOrDistributedWorkerBase
,
WorkerBase
,
WorkerInput
)
WorkerInput
)
...
@@ -75,7 +75,7 @@ class Worker(LocalOrDistributedWorkerBase):
...
@@ -75,7 +75,7 @@ class Worker(LocalOrDistributedWorkerBase):
ModelRunnerClass
:
Type
[
GPUModelRunnerBase
]
=
ModelRunner
ModelRunnerClass
:
Type
[
GPUModelRunnerBase
]
=
ModelRunner
if
model_config
.
task
==
"embedding"
:
if
model_config
.
task
==
"embedding"
:
ModelRunnerClass
=
Embedd
ingModelRunner
ModelRunnerClass
=
Pool
ingModelRunner
elif
self
.
model_config
.
is_encoder_decoder
:
elif
self
.
model_config
.
is_encoder_decoder
:
ModelRunnerClass
=
EncoderDecoderModelRunner
ModelRunnerClass
=
EncoderDecoderModelRunner
self
.
model_runner
:
GPUModelRunnerBase
=
ModelRunnerClass
(
self
.
model_runner
:
GPUModelRunnerBase
=
ModelRunnerClass
(
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment