Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
1c3ffdbe
Unverified
Commit
1c3ffdbe
authored
Sep 21, 2025
by
Woosuk Kwon
Committed by
GitHub
Sep 21, 2025
Browse files
[V0 Deprecation] Remove V0 sampling metadata (#25345)
Signed-off-by:
Woosuk Kwon
<
woosuk@thinkingmachines.ai
>
parent
c438b295
Changes
141
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
20 additions
and
78 deletions
+20
-78
vllm/model_executor/models/ovis2_5.py
vllm/model_executor/models/ovis2_5.py
+1
-3
vllm/model_executor/models/paligemma.py
vllm/model_executor/models/paligemma.py
+1
-4
vllm/model_executor/models/persimmon.py
vllm/model_executor/models/persimmon.py
+1
-4
vllm/model_executor/models/phi.py
vllm/model_executor/models/phi.py
+1
-3
vllm/model_executor/models/phi3v.py
vllm/model_executor/models/phi3v.py
+1
-4
vllm/model_executor/models/phi4_multimodal.py
vllm/model_executor/models/phi4_multimodal.py
+1
-4
vllm/model_executor/models/phi4flash.py
vllm/model_executor/models/phi4flash.py
+0
-3
vllm/model_executor/models/phi4mm.py
vllm/model_executor/models/phi4mm.py
+1
-4
vllm/model_executor/models/phimoe.py
vllm/model_executor/models/phimoe.py
+2
-5
vllm/model_executor/models/pixtral.py
vllm/model_executor/models/pixtral.py
+1
-4
vllm/model_executor/models/plamo2.py
vllm/model_executor/models/plamo2.py
+1
-4
vllm/model_executor/models/qwen.py
vllm/model_executor/models/qwen.py
+1
-4
vllm/model_executor/models/qwen2.py
vllm/model_executor/models/qwen2.py
+1
-4
vllm/model_executor/models/qwen2_5_omni_thinker.py
vllm/model_executor/models/qwen2_5_omni_thinker.py
+1
-4
vllm/model_executor/models/qwen2_5_vl.py
vllm/model_executor/models/qwen2_5_vl.py
+1
-4
vllm/model_executor/models/qwen2_audio.py
vllm/model_executor/models/qwen2_audio.py
+1
-4
vllm/model_executor/models/qwen2_moe.py
vllm/model_executor/models/qwen2_moe.py
+1
-4
vllm/model_executor/models/qwen2_vl.py
vllm/model_executor/models/qwen2_vl.py
+1
-4
vllm/model_executor/models/qwen3.py
vllm/model_executor/models/qwen3.py
+1
-4
vllm/model_executor/models/qwen3_moe.py
vllm/model_executor/models/qwen3_moe.py
+1
-4
No files found.
vllm/model_executor/models/ovis2_5.py
View file @
1c3ffdbe
...
@@ -19,7 +19,6 @@ from vllm.model_executor.models.siglip2navit import Siglip2NavitModel
...
@@ -19,7 +19,6 @@ from vllm.model_executor.models.siglip2navit import Siglip2NavitModel
from
vllm.model_executor.models.utils
import
(
AutoWeightsLoader
,
flatten_bn
,
from
vllm.model_executor.models.utils
import
(
AutoWeightsLoader
,
flatten_bn
,
init_vllm_registered_model
,
init_vllm_registered_model
,
maybe_prefix
)
maybe_prefix
)
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
MultiModalKwargsItems
)
MultiModalKwargsItems
)
...
@@ -630,9 +629,8 @@ class Ovis2_5(nn.Module, SupportsMultiModal, SupportsPP):
...
@@ -630,9 +629,8 @@ class Ovis2_5(nn.Module, SupportsMultiModal, SupportsPP):
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
logits
=
self
.
llm
.
compute_logits
(
hidden_states
,
sampling_metadata
)
logits
=
self
.
llm
.
compute_logits
(
hidden_states
)
return
logits
return
logits
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
...
...
vllm/model_executor/models/paligemma.py
View file @
1c3ffdbe
...
@@ -9,7 +9,6 @@ from transformers import BatchFeature, PaliGemmaConfig
...
@@ -9,7 +9,6 @@ from transformers import BatchFeature, PaliGemmaConfig
from
vllm.config
import
VllmConfig
from
vllm.config
import
VllmConfig
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
MultiModalInputs
,
MultiModalKwargsItems
,
MultiModalInputs
,
MultiModalKwargsItems
,
...
@@ -403,10 +402,8 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -403,10 +402,8 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal,
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
return
self
.
language_model
.
compute_logits
(
hidden_states
,
return
self
.
language_model
.
compute_logits
(
hidden_states
)
sampling_metadata
)
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]])
->
set
[
str
]:
torch
.
Tensor
]])
->
set
[
str
]:
...
...
vllm/model_executor/models/persimmon.py
View file @
1c3ffdbe
...
@@ -44,7 +44,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
...
@@ -44,7 +44,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
.interfaces
import
SupportsPP
from
.interfaces
import
SupportsPP
...
@@ -334,10 +333,8 @@ class PersimmonForCausalLM(nn.Module, SupportsPP):
...
@@ -334,10 +333,8 @@ class PersimmonForCausalLM(nn.Module, SupportsPP):
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
,
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
)
sampling_metadata
)
return
logits
return
logits
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
...
...
vllm/model_executor/models/phi.py
View file @
1c3ffdbe
...
@@ -59,7 +59,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
...
@@ -59,7 +59,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
.interfaces
import
SupportsLoRA
,
SupportsPP
from
.interfaces
import
SupportsLoRA
,
SupportsPP
...
@@ -346,10 +345,9 @@ class PhiForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
...
@@ -346,10 +345,9 @@ class PhiForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
,
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
,
sampling_metadata
,
self
.
lm_head
.
bias
)
self
.
lm_head
.
bias
)
return
logits
return
logits
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
...
...
vllm/model_executor/models/phi3v.py
View file @
1c3ffdbe
...
@@ -29,7 +29,6 @@ from vllm.logger import init_logger
...
@@ -29,7 +29,6 @@ from vllm.logger import init_logger
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
VocabParallelEmbedding
)
VocabParallelEmbedding
)
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
MultiModalKwargsItems
)
MultiModalKwargsItems
)
...
@@ -681,10 +680,8 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP,
...
@@ -681,10 +680,8 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP,
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
return
self
.
language_model
.
compute_logits
(
hidden_states
,
return
self
.
language_model
.
compute_logits
(
hidden_states
)
sampling_metadata
)
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]])
->
set
[
str
]:
torch
.
Tensor
]])
->
set
[
str
]:
...
...
vllm/model_executor/models/phi4_multimodal.py
View file @
1c3ffdbe
...
@@ -27,7 +27,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
...
@@ -27,7 +27,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.models.module_mapping
import
MultiModelKeys
from
vllm.model_executor.models.module_mapping
import
MultiModelKeys
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
MultiModalKwargsItems
,
NestedTensors
)
MultiModalKwargsItems
,
NestedTensors
)
...
@@ -1451,10 +1450,8 @@ class Phi4MultimodalForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
...
@@ -1451,10 +1450,8 @@ class Phi4MultimodalForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
return
self
.
language_model
.
compute_logits
(
hidden_states
,
return
self
.
language_model
.
compute_logits
(
hidden_states
)
sampling_metadata
)
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]])
->
set
[
str
]:
torch
.
Tensor
]])
->
set
[
str
]:
...
...
vllm/model_executor/models/phi4flash.py
View file @
1c3ffdbe
...
@@ -29,7 +29,6 @@ from vllm.model_executor.models.interfaces import (HasInnerState, IsHybrid,
...
@@ -29,7 +29,6 @@ from vllm.model_executor.models.interfaces import (HasInnerState, IsHybrid,
SupportsV0Only
)
SupportsV0Only
)
from
vllm.model_executor.models.mamba_cache
import
(
MambaCacheManager
,
from
vllm.model_executor.models.mamba_cache
import
(
MambaCacheManager
,
MambaCacheParams
)
MambaCacheParams
)
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
.utils
import
make_layers
,
maybe_prefix
from
.utils
import
make_layers
,
maybe_prefix
...
@@ -695,12 +694,10 @@ class Phi4FlashForCausalLM(nn.Module, HasInnerState, IsHybrid, SupportsV0Only):
...
@@ -695,12 +694,10 @@ class Phi4FlashForCausalLM(nn.Module, HasInnerState, IsHybrid, SupportsV0Only):
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
processed_logits
=
self
.
logits_processor
(
processed_logits
=
self
.
logits_processor
(
self
.
lm_head
,
self
.
lm_head
,
hidden_states
,
hidden_states
,
sampling_metadata
,
self
.
embedding_bias
,
self
.
embedding_bias
,
)
)
return
processed_logits
return
processed_logits
...
...
vllm/model_executor/models/phi4mm.py
View file @
1c3ffdbe
...
@@ -18,7 +18,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
...
@@ -18,7 +18,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
)
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
)
from
vllm.model_executor.models.llama
import
LlamaModel
from
vllm.model_executor.models.llama
import
LlamaModel
from
vllm.model_executor.models.module_mapping
import
MultiModelKeys
from
vllm.model_executor.models.module_mapping
import
MultiModelKeys
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
MultiModalKwargsItems
,
NestedTensors
)
MultiModalKwargsItems
,
NestedTensors
)
...
@@ -1257,10 +1256,8 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
...
@@ -1257,10 +1256,8 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
,
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
)
sampling_metadata
)
return
logits
return
logits
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
...
...
vllm/model_executor/models/phimoe.py
View file @
1c3ffdbe
...
@@ -47,7 +47,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
...
@@ -47,7 +47,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
)
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
(
from
vllm.model_executor.model_loader.weight_utils
import
(
default_weight_loader
,
maybe_remap_kv_scale_name
)
default_weight_loader
,
maybe_remap_kv_scale_name
)
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
.interfaces
import
SupportsLoRA
,
SupportsPP
from
.interfaces
import
SupportsLoRA
,
SupportsPP
...
@@ -667,10 +666,8 @@ class PhiMoEForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
...
@@ -667,10 +666,8 @@ class PhiMoEForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
inputs_embeds
)
inputs_embeds
)
return
hidden_states
return
hidden_states
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
,
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
)
->
torch
.
Tensor
:
sampling_metadata
:
SamplingMetadata
)
->
torch
.
Tensor
:
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
)
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
,
sampling_metadata
)
return
logits
return
logits
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
...
...
vllm/model_executor/models/pixtral.py
View file @
1c3ffdbe
...
@@ -32,7 +32,6 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
...
@@ -32,7 +32,6 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
RowParallelLinear
)
RowParallelLinear
)
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModalKwargsItems
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModalKwargsItems
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
MultiModalUUIDDict
,
NestedTensors
)
MultiModalUUIDDict
,
NestedTensors
)
...
@@ -480,10 +479,8 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -480,10 +479,8 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal,
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
return
self
.
language_model
.
compute_logits
(
hidden_states
,
return
self
.
language_model
.
compute_logits
(
hidden_states
)
sampling_metadata
)
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]]):
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]]):
...
...
vllm/model_executor/models/plamo2.py
View file @
1c3ffdbe
...
@@ -52,7 +52,6 @@ from vllm.model_executor.models.mamba_cache import (MambaCacheManager,
...
@@ -52,7 +52,6 @@ from vllm.model_executor.models.mamba_cache import (MambaCacheManager,
from
vllm.model_executor.models.utils
import
(
from
vllm.model_executor.models.utils
import
(
is_pp_missing_parameter
,
make_empty_intermediate_tensors_factory
,
is_pp_missing_parameter
,
make_empty_intermediate_tensors_factory
,
make_layers
,
maybe_prefix
)
make_layers
,
maybe_prefix
)
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.model_executor.utils
import
set_weight_attrs
from
vllm.model_executor.utils
import
set_weight_attrs
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
...
@@ -1022,10 +1021,8 @@ class Plamo2ForCausalLM(torch.nn.Module, HasInnerState, SupportsPP, IsHybrid):
...
@@ -1022,10 +1021,8 @@ class Plamo2ForCausalLM(torch.nn.Module, HasInnerState, SupportsPP, IsHybrid):
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
,
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
)
sampling_metadata
)
return
logits
return
logits
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]]):
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]]):
...
...
vllm/model_executor/models/qwen.py
View file @
1c3ffdbe
...
@@ -30,7 +30,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
...
@@ -30,7 +30,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
.interfaces
import
SupportsLoRA
,
SupportsPP
from
.interfaces
import
SupportsLoRA
,
SupportsPP
...
@@ -282,10 +281,8 @@ class QWenBaseModel(nn.Module):
...
@@ -282,10 +281,8 @@ class QWenBaseModel(nn.Module):
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
,
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
)
sampling_metadata
)
return
logits
return
logits
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
...
...
vllm/model_executor/models/qwen2.py
View file @
1c3ffdbe
...
@@ -49,7 +49,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
...
@@ -49,7 +49,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead
,
VocabParallelEmbedding
)
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
(
from
vllm.model_executor.model_loader.weight_utils
import
(
default_weight_loader
,
maybe_remap_kv_scale_name
)
default_weight_loader
,
maybe_remap_kv_scale_name
)
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.config
import
is_interleaved
from
vllm.transformers_utils.config
import
is_interleaved
...
@@ -510,10 +509,8 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3):
...
@@ -510,10 +509,8 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3):
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
,
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
)
sampling_metadata
)
return
logits
return
logits
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
...
...
vllm/model_executor/models/qwen2_5_omni_thinker.py
View file @
1c3ffdbe
...
@@ -50,7 +50,6 @@ from vllm.model_executor.models.qwen2_5_vl import (
...
@@ -50,7 +50,6 @@ from vllm.model_executor.models.qwen2_5_vl import (
from
vllm.model_executor.models.qwen2_audio
import
(
from
vllm.model_executor.models.qwen2_audio
import
(
Qwen2AudioProcessingInfo
,
_get_feat_extract_output_lengths
)
Qwen2AudioProcessingInfo
,
_get_feat_extract_output_lengths
)
from
vllm.model_executor.models.qwen2_vl
import
Qwen2VLMultiModalDataParser
from
vllm.model_executor.models.qwen2_vl
import
Qwen2VLMultiModalDataParser
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
(
ImageItem
,
ModalityData
,
from
vllm.multimodal.inputs
import
(
ImageItem
,
ModalityData
,
MultiModalDataDict
,
MultiModalFieldConfig
,
MultiModalDataDict
,
MultiModalFieldConfig
,
...
@@ -955,10 +954,8 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
...
@@ -955,10 +954,8 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
return
self
.
language_model
.
compute_logits
(
hidden_states
,
return
self
.
language_model
.
compute_logits
(
hidden_states
)
sampling_metadata
)
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]])
->
set
[
str
]:
torch
.
Tensor
]])
->
set
[
str
]:
...
...
vllm/model_executor/models/qwen2_5_vl.py
View file @
1c3ffdbe
...
@@ -43,7 +43,6 @@ from vllm.config import VllmConfig
...
@@ -43,7 +43,6 @@ from vllm.config import VllmConfig
from
vllm.distributed
import
parallel_state
from
vllm.distributed
import
parallel_state
from
vllm.distributed
import
utils
as
dist_utils
from
vllm.distributed
import
utils
as
dist_utils
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.model_executor
import
SamplingMetadata
from
vllm.model_executor.layers.activation
import
get_act_and_mul_fn
from
vllm.model_executor.layers.activation
import
get_act_and_mul_fn
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.layernorm
import
RMSNorm
# yapf: disable
# yapf: disable
...
@@ -1256,10 +1255,8 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -1256,10 +1255,8 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
return
self
.
language_model
.
compute_logits
(
hidden_states
,
return
self
.
language_model
.
compute_logits
(
hidden_states
)
sampling_metadata
)
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]])
->
set
[
str
]:
torch
.
Tensor
]])
->
set
[
str
]:
...
...
vllm/model_executor/models/qwen2_audio.py
View file @
1c3ffdbe
...
@@ -34,7 +34,6 @@ from transformers.models.qwen2_audio import (Qwen2AudioConfig,
...
@@ -34,7 +34,6 @@ from transformers.models.qwen2_audio import (Qwen2AudioConfig,
from
transformers.models.whisper
import
WhisperFeatureExtractor
from
transformers.models.whisper
import
WhisperFeatureExtractor
from
vllm.config
import
VllmConfig
from
vllm.config
import
VllmConfig
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
(
AudioItem
,
ModalityData
,
from
vllm.multimodal.inputs
import
(
AudioItem
,
ModalityData
,
MultiModalDataDict
,
MultiModalFieldConfig
,
MultiModalDataDict
,
MultiModalFieldConfig
,
...
@@ -481,10 +480,8 @@ class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -481,10 +480,8 @@ class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal,
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
return
self
.
language_model
.
compute_logits
(
hidden_states
,
return
self
.
language_model
.
compute_logits
(
hidden_states
)
sampling_metadata
)
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]])
->
set
[
str
]:
torch
.
Tensor
]])
->
set
[
str
]:
...
...
vllm/model_executor/models/qwen2_moe.py
View file @
1c3ffdbe
...
@@ -51,7 +51,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
...
@@ -51,7 +51,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
.interfaces
import
SupportsLoRA
,
SupportsPP
from
.interfaces
import
SupportsLoRA
,
SupportsPP
...
@@ -546,10 +545,8 @@ class Qwen2MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
...
@@ -546,10 +545,8 @@ class Qwen2MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
,
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
)
sampling_metadata
)
return
logits
return
logits
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
...
...
vllm/model_executor/models/qwen2_vl.py
View file @
1c3ffdbe
...
@@ -46,7 +46,6 @@ from vllm.config import VllmConfig
...
@@ -46,7 +46,6 @@ from vllm.config import VllmConfig
from
vllm.distributed
import
parallel_state
,
tensor_model_parallel_all_gather
from
vllm.distributed
import
parallel_state
,
tensor_model_parallel_all_gather
from
vllm.distributed
import
utils
as
dist_utils
from
vllm.distributed
import
utils
as
dist_utils
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.model_executor
import
SamplingMetadata
from
vllm.model_executor.layers.activation
import
QuickGELU
from
vllm.model_executor.layers.activation
import
QuickGELU
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
RowParallelLinear
)
RowParallelLinear
)
...
@@ -1527,10 +1526,8 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -1527,10 +1526,8 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
return
self
.
language_model
.
compute_logits
(
hidden_states
,
return
self
.
language_model
.
compute_logits
(
hidden_states
)
sampling_metadata
)
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]])
->
set
[
str
]:
torch
.
Tensor
]])
->
set
[
str
]:
...
...
vllm/model_executor/models/qwen3.py
View file @
1c3ffdbe
...
@@ -41,7 +41,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
...
@@ -41,7 +41,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.vocab_parallel_embedding
import
ParallelLMHead
from
vllm.model_executor.layers.vocab_parallel_embedding
import
ParallelLMHead
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
.interfaces
import
SupportsEagle3
,
SupportsLoRA
,
SupportsPP
from
.interfaces
import
SupportsEagle3
,
SupportsLoRA
,
SupportsPP
...
@@ -328,10 +327,8 @@ class Qwen3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3):
...
@@ -328,10 +327,8 @@ class Qwen3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3):
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
,
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
)
sampling_metadata
)
return
logits
return
logits
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
...
...
vllm/model_executor/models/qwen3_moe.py
View file @
1c3ffdbe
...
@@ -54,7 +54,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
...
@@ -54,7 +54,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead
,
VocabParallelEmbedding
)
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
(
from
vllm.model_executor.model_loader.weight_utils
import
(
default_weight_loader
,
maybe_remap_kv_scale_name
)
default_weight_loader
,
maybe_remap_kv_scale_name
)
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
.interfaces
import
MixtureOfExperts
,
SupportsLoRA
,
SupportsPP
from
.interfaces
import
MixtureOfExperts
,
SupportsLoRA
,
SupportsPP
...
@@ -690,10 +689,8 @@ class Qwen3MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA,
...
@@ -690,10 +689,8 @@ class Qwen3MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA,
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
,
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
)
sampling_metadata
)
return
logits
return
logits
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
...
...
Prev
1
2
3
4
5
6
7
8
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment