Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
d58268c5
Unverified
Commit
d58268c5
authored
Nov 06, 2024
by
Joe Runde
Committed by
GitHub
Nov 06, 2024
Browse files
[V1] Make v1 more testable (#9888)
Signed-off-by:
Joe Runde
<
Joseph.Runde@ibm.com
>
parent
87bd7e05
Changes
75
Hide whitespace changes
Inline
Side-by-side
Showing
15 changed files
with
39 additions
and
36 deletions
+39
-36
vllm/model_executor/models/pixtral.py
vllm/model_executor/models/pixtral.py
+2
-2
vllm/model_executor/models/qwen.py
vllm/model_executor/models/qwen.py
+2
-2
vllm/model_executor/models/qwen2.py
vllm/model_executor/models/qwen2.py
+2
-2
vllm/model_executor/models/qwen2_audio.py
vllm/model_executor/models/qwen2_audio.py
+2
-2
vllm/model_executor/models/qwen2_moe.py
vllm/model_executor/models/qwen2_moe.py
+2
-2
vllm/model_executor/models/qwen2_vl.py
vllm/model_executor/models/qwen2_vl.py
+2
-2
vllm/model_executor/models/solar.py
vllm/model_executor/models/solar.py
+2
-2
vllm/model_executor/models/stablelm.py
vllm/model_executor/models/stablelm.py
+2
-2
vllm/model_executor/models/starcoder2.py
vllm/model_executor/models/starcoder2.py
+2
-2
vllm/model_executor/models/ultravox.py
vllm/model_executor/models/ultravox.py
+2
-2
vllm/model_executor/models/xverse.py
vllm/model_executor/models/xverse.py
+2
-2
vllm/v1/attention/backends/flash_attn.py
vllm/v1/attention/backends/flash_attn.py
+6
-6
vllm/v1/engine/llm_engine.py
vllm/v1/engine/llm_engine.py
+6
-0
vllm/v1/tokenizer/detokenizer.py
vllm/v1/tokenizer/detokenizer.py
+4
-4
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+1
-4
No files found.
vllm/model_executor/models/pixtral.py
View file @
d58268c5
...
...
@@ -25,7 +25,7 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
QKVParallelLinear
,
RowParallelLinear
)
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.sampler
import
Sampler
,
SamplerOutput
from
vllm.model_executor.layers.sampler
import
Sampler
Output
,
get_sampler
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.models.utils
import
merge_multimodal_embeddings
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
...
...
@@ -190,7 +190,7 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal,
if
hasattr
(
self
.
language_model
,
"sampler"
):
return
self
.
language_model
.
sampler
return
S
ampler
()
return
get_s
ampler
()
def
forward
(
self
,
...
...
vllm/model_executor/models/qwen.py
View file @
d58268c5
...
...
@@ -36,7 +36,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.resampler
import
Resampler2
,
get_abs_pos
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.sampler
import
Sampler
,
SamplerOutput
from
vllm.model_executor.layers.sampler
import
Sampler
Output
,
get_sampler
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
...
...
@@ -884,7 +884,7 @@ class QWenBaseModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
if
self
.
config
.
tie_word_embeddings
:
self
.
lm_head
.
weight
=
self
.
transformer
.
wte
.
weight
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
self
.
sampler
=
S
ampler
()
self
.
sampler
=
get_s
ampler
()
self
.
make_empty_intermediate_tensors
=
(
self
.
transformer
.
make_empty_intermediate_tensors
)
...
...
vllm/model_executor/models/qwen2.py
View file @
d58268c5
...
...
@@ -39,7 +39,7 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.sampler
import
Sampler
,
SamplerOutput
from
vllm.model_executor.layers.sampler
import
Sampler
Output
,
get_sampler
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
(
...
...
@@ -444,7 +444,7 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
prefix
,
"lm_head"
))
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
self
.
sampler
=
S
ampler
()
self
.
sampler
=
get_s
ampler
()
self
.
make_empty_intermediate_tensors
=
(
self
.
model
.
make_empty_intermediate_tensors
)
...
...
vllm/model_executor/models/qwen2_audio.py
View file @
d58268c5
...
...
@@ -36,7 +36,7 @@ from vllm.logger import init_logger
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
)
from
vllm.model_executor.layers.sampler
import
Sampler
,
SamplerOutput
from
vllm.model_executor.layers.sampler
import
Sampler
Output
,
get_sampler
from
vllm.model_executor.layers.vocab_parallel_embedding
import
ParallelLMHead
from
vllm.model_executor.model_loader.weight_utils
import
(
default_weight_loader
,
maybe_remap_kv_scale_name
)
...
...
@@ -295,7 +295,7 @@ class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal,
self
.
logits_processor
=
LogitsProcessor
(
self
.
unpadded_vocab_size
,
config
.
text_config
.
vocab_size
,
logit_scale
)
self
.
sampler
=
S
ampler
()
self
.
sampler
=
get_s
ampler
()
self
.
make_empty_intermediate_tensors
=
(
self
.
language_model
.
make_empty_intermediate_tensors
)
...
...
vllm/model_executor/models/qwen2_moe.py
View file @
d58268c5
...
...
@@ -44,7 +44,7 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.sampler
import
Sampler
,
SamplerOutput
from
vllm.model_executor.layers.sampler
import
Sampler
Output
,
get_sampler
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
...
...
@@ -393,7 +393,7 @@ class Qwen2MoeForCausalLM(nn.Module, SupportsPP):
if
self
.
config
.
tie_word_embeddings
:
self
.
lm_head
.
weight
=
self
.
model
.
embed_tokens
.
weight
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
self
.
sampler
=
S
ampler
()
self
.
sampler
=
get_s
ampler
()
self
.
make_empty_intermediate_tensors
=
(
self
.
model
.
make_empty_intermediate_tensors
)
...
...
vllm/model_executor/models/qwen2_vl.py
View file @
d58268c5
...
...
@@ -52,7 +52,7 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
RowParallelLinear
)
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.sampler
import
Sampler
,
SamplerOutput
from
vllm.model_executor.layers.sampler
import
Sampler
Output
,
get_sampler
from
vllm.model_executor.layers.vocab_parallel_embedding
import
ParallelLMHead
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.models.qwen2
import
Qwen2Model
...
...
@@ -990,7 +990,7 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
self
.
lm_head
=
PPMissingLayer
()
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
self
.
sampler
=
S
ampler
()
self
.
sampler
=
get_s
ampler
()
self
.
make_empty_intermediate_tensors
=
(
make_empty_intermediate_tensors_factory
(
[
"hidden_states"
,
"residual"
],
config
.
hidden_size
))
...
...
vllm/model_executor/models/solar.py
View file @
d58268c5
...
...
@@ -42,7 +42,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
from
vllm.model_executor.layers.quantization.compressed_tensors.utils
import
(
get_compressed_tensors_cache_scale
)
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.sampler
import
Sampler
,
SamplerOutput
from
vllm.model_executor.layers.sampler
import
Sampler
Output
,
get_sampler
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
(
...
...
@@ -449,7 +449,7 @@ class SolarForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
self
.
logits_processor
=
LogitsProcessor
(
self
.
unpadded_vocab_size
,
config
.
vocab_size
,
logit_scale
)
self
.
sampler
=
S
ampler
()
self
.
sampler
=
get_s
ampler
()
else
:
self
.
lm_head
=
PPMissingLayer
()
...
...
vllm/model_executor/models/stablelm.py
View file @
d58268c5
...
...
@@ -34,7 +34,7 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.sampler
import
Sampler
,
SamplerOutput
from
vllm.model_executor.layers.sampler
import
Sampler
Output
,
get_sampler
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
...
...
@@ -261,7 +261,7 @@ class StablelmForCausalLM(nn.Module, SupportsPP):
if
self
.
config
.
tie_word_embeddings
:
self
.
lm_head
.
weight
=
self
.
model
.
embed_tokens
.
weight
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
self
.
sampler
=
S
ampler
()
self
.
sampler
=
get_s
ampler
()
self
.
make_empty_intermediate_tensors
=
(
self
.
model
.
make_empty_intermediate_tensors
)
...
...
vllm/model_executor/models/starcoder2.py
View file @
d58268c5
...
...
@@ -34,7 +34,7 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.sampler
import
Sampler
,
SamplerOutput
from
vllm.model_executor.layers.sampler
import
Sampler
Output
,
get_sampler
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
...
...
@@ -269,7 +269,7 @@ class Starcoder2ForCausalLM(nn.Module, SupportsPP):
)
self
.
logits_processor
=
LogitsProcessor
(
self
.
unpadded_vocab_size
,
config
.
vocab_size
)
self
.
sampler
=
S
ampler
()
self
.
sampler
=
get_s
ampler
()
self
.
make_empty_intermediate_tensors
=
(
self
.
model
.
make_empty_intermediate_tensors
)
...
...
vllm/model_executor/models/ultravox.py
View file @
d58268c5
...
...
@@ -21,7 +21,7 @@ from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
from
vllm.model_executor.layers.activation
import
SiluAndMul
,
get_act_fn
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.sampler
import
Sampler
,
SamplerOutput
from
vllm.model_executor.layers.sampler
import
Sampler
Output
,
get_sampler
from
vllm.model_executor.model_loader.loader
import
DefaultModelLoader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
(
MULTIMODAL_REGISTRY
,
MultiModalInputs
,
...
...
@@ -379,7 +379,7 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP):
if
hasattr
(
self
.
language_model
,
"sampler"
):
return
self
.
language_model
.
sampler
return
S
ampler
()
return
get_s
ampler
()
def
_audio_features_to_embeddings
(
self
,
input_features
:
torch
.
Tensor
)
->
torch
.
Tensor
:
...
...
vllm/model_executor/models/xverse.py
View file @
d58268c5
...
...
@@ -37,7 +37,7 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.sampler
import
Sampler
,
SamplerOutput
from
vllm.model_executor.layers.sampler
import
Sampler
Output
,
get_sampler
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
...
...
@@ -334,7 +334,7 @@ class XverseForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
if
self
.
config
.
tie_word_embeddings
:
self
.
lm_head
.
weight
=
self
.
model
.
embed_tokens
.
weight
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
self
.
sampler
=
S
ampler
()
self
.
sampler
=
get_s
ampler
()
self
.
make_empty_intermediate_tensors
=
(
self
.
model
.
make_empty_intermediate_tensors
)
...
...
vllm/v1/attention/backends/flash_attn.py
View file @
d58268c5
...
...
@@ -136,7 +136,7 @@ class FlashAttentionImpl(AttentionImpl):
"key/v_scale is not supported in FlashAttention."
)
output
=
torch
.
empty_like
(
query
)
torch
.
ops
.
vllm
.
unified_flash_attention
(
torch
.
ops
.
vllm
.
unified_
v1_
flash_attention
(
output
,
query
,
key
,
...
...
@@ -156,7 +156,7 @@ class FlashAttentionImpl(AttentionImpl):
return
output
def
unified_flash_attention
(
def
unified_
v1_
flash_attention
(
output
:
torch
.
Tensor
,
query
:
torch
.
Tensor
,
key
:
torch
.
Tensor
,
...
...
@@ -222,7 +222,7 @@ def unified_flash_attention(
output
[:
num_actual_tokens
].
copy_
(
attn_output
)
def
unified_flash_attention_fake
(
def
unified_
v1_
flash_attention_fake
(
output
:
torch
.
Tensor
,
query
:
torch
.
Tensor
,
key
:
torch
.
Tensor
,
...
...
@@ -243,8 +243,8 @@ def unified_flash_attention_fake(
direct_register_custom_op
(
op_name
=
"unified_flash_attention"
,
op_func
=
unified_flash_attention
,
op_name
=
"unified_
v1_
flash_attention"
,
op_func
=
unified_
v1_
flash_attention
,
mutates_args
=
[
"kv_cache"
,
"output"
],
fake_impl
=
unified_flash_attention_fake
,
fake_impl
=
unified_
v1_
flash_attention_fake
,
)
vllm/v1/engine/llm_engine.py
View file @
d58268c5
...
...
@@ -155,6 +155,12 @@ class LLMEngine:
# GPU and CPU blocks, which are profiled in the distributed executor.
self
.
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
lora_config
)
def
__del__
(
self
):
# Small hack: implicitly clean up resources on garbage collection
# TODO: this should probably be explicitly invoked when we're done with
# the engine
self
.
terminate_detokenizer
()
def
_initialize_kv_caches
(
self
)
->
None
:
num_gpu_blocks
,
_
=
self
.
model_executor
.
determine_num_available_blocks
(
)
...
...
vllm/v1/tokenizer/detokenizer.py
View file @
d58268c5
...
...
@@ -73,7 +73,7 @@ class Detokenizer:
return
None
def
terminate
(
self
)
->
None
:
self
.
push_socket
.
send
(
b
""
,
flags
=
zmq
.
NOBLOCK
)
self
.
detokenizer
.
kill
(
)
self
.
detokenizer
.
join
()
...
...
@@ -108,10 +108,10 @@ class DetokenizerProc(multiprocessing.Process):
self
.
push_socket
.
bind
(
f
"tcp://*:
{
self
.
push_port
}
"
)
while
True
:
if
self
.
pull_socket
.
poll
(
timeout
=
1000
)
==
0
:
# Nothing to read
continue
message
=
self
.
pull_socket
.
recv
()
if
message
==
b
""
:
# Terminate signal.
break
inputs
=
self
.
msgpack_decoder
.
decode
(
message
)
for
req_id
in
inputs
.
free_req_ids
:
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
d58268c5
...
...
@@ -2,7 +2,6 @@ import os
import
time
from
dataclasses
import
dataclass
from
typing
import
TYPE_CHECKING
,
Dict
,
List
,
Optional
,
Set
from
unittest.mock
import
patch
import
numpy
as
np
import
torch
...
...
@@ -26,7 +25,6 @@ from vllm.v1.attention.backends.flash_attn import (FlashAttentionBackend,
FlashAttentionMetadata
)
from
vllm.v1.outputs
import
ModelRunnerOutput
from
vllm.v1.sample.metadata
import
SamplingMetadata
from
vllm.v1.sample.sampler
import
Sampler
if
TYPE_CHECKING
:
from
vllm.v1.core.scheduler
import
SchedulerOutput
...
...
@@ -418,8 +416,7 @@ class GPUModelRunner:
logger
.
info
(
"Starting to load model %s..."
,
self
.
model_config
.
model
)
with
DeviceMemoryProfiler
()
as
m
:
# noqa: SIM117
with
patch
(
"vllm.model_executor.layers.sampler.Sampler"
,
Sampler
):
self
.
model
=
get_model
(
vllm_config
=
self
.
vllm_config
)
self
.
model
=
get_model
(
vllm_config
=
self
.
vllm_config
)
self
.
model_memory_usage
=
m
.
consumed_memory
logger
.
info
(
"Loading model weights took %.4f GB"
,
...
...
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment