Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
1c3ffdbe
Unverified
Commit
1c3ffdbe
authored
Sep 21, 2025
by
Woosuk Kwon
Committed by
GitHub
Sep 21, 2025
Browse files
[V0 Deprecation] Remove V0 sampling metadata (#25345)
Signed-off-by:
Woosuk Kwon
<
woosuk@thinkingmachines.ai
>
parent
c438b295
Changes
141
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
24 additions
and
82 deletions
+24
-82
vllm/model_executor/models/interns1.py
vllm/model_executor/models/interns1.py
+1
-4
vllm/model_executor/models/internvl.py
vllm/model_executor/models/internvl.py
+1
-4
vllm/model_executor/models/jais.py
vllm/model_executor/models/jais.py
+1
-4
vllm/model_executor/models/jamba.py
vllm/model_executor/models/jamba.py
+1
-4
vllm/model_executor/models/keye.py
vllm/model_executor/models/keye.py
+1
-4
vllm/model_executor/models/kimi_vl.py
vllm/model_executor/models/kimi_vl.py
+1
-4
vllm/model_executor/models/lfm2.py
vllm/model_executor/models/lfm2.py
+2
-5
vllm/model_executor/models/llama.py
vllm/model_executor/models/llama.py
+1
-4
vllm/model_executor/models/llama_eagle3.py
vllm/model_executor/models/llama_eagle3.py
+1
-4
vllm/model_executor/models/llava.py
vllm/model_executor/models/llava.py
+1
-4
vllm/model_executor/models/llava_next.py
vllm/model_executor/models/llava_next.py
+1
-4
vllm/model_executor/models/llava_next_video.py
vllm/model_executor/models/llava_next_video.py
+1
-4
vllm/model_executor/models/llava_onevision.py
vllm/model_executor/models/llava_onevision.py
+1
-4
vllm/model_executor/models/mamba.py
vllm/model_executor/models/mamba.py
+2
-5
vllm/model_executor/models/mamba2.py
vllm/model_executor/models/mamba2.py
+2
-5
vllm/model_executor/models/medusa.py
vllm/model_executor/models/medusa.py
+1
-2
vllm/model_executor/models/midashenglm.py
vllm/model_executor/models/midashenglm.py
+1
-3
vllm/model_executor/models/mimo.py
vllm/model_executor/models/mimo.py
+1
-4
vllm/model_executor/models/mimo_mtp.py
vllm/model_executor/models/mimo_mtp.py
+2
-6
vllm/model_executor/models/minicpm.py
vllm/model_executor/models/minicpm.py
+1
-4
No files found.
vllm/model_executor/models/interns1.py
View file @
1c3ffdbe
...
@@ -21,7 +21,6 @@ from vllm.config import VllmConfig
...
@@ -21,7 +21,6 @@ from vllm.config import VllmConfig
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.models.interns1_vit
import
InternS1VisionModel
from
vllm.model_executor.models.interns1_vit
import
InternS1VisionModel
from
vllm.model_executor.models.module_mapping
import
MultiModelKeys
from
vllm.model_executor.models.module_mapping
import
MultiModelKeys
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
MultiModalKwargsItems
,
NestedTensors
)
MultiModalKwargsItems
,
NestedTensors
)
...
@@ -812,10 +811,8 @@ class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -812,10 +811,8 @@ class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal,
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
return
self
.
language_model
.
compute_logits
(
hidden_states
,
return
self
.
language_model
.
compute_logits
(
hidden_states
)
sampling_metadata
)
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]])
->
set
[
str
]:
torch
.
Tensor
]])
->
set
[
str
]:
...
...
vllm/model_executor/models/internvl.py
View file @
1c3ffdbe
...
@@ -25,7 +25,6 @@ from vllm.model_executor.layers.quantization.awq import AWQConfig
...
@@ -25,7 +25,6 @@ from vllm.model_executor.layers.quantization.awq import AWQConfig
from
vllm.model_executor.models.intern_vit
import
(
InternVisionModel
,
from
vllm.model_executor.models.intern_vit
import
(
InternVisionModel
,
InternVisionPatchModel
)
InternVisionPatchModel
)
from
vllm.model_executor.models.module_mapping
import
MultiModelKeys
from
vllm.model_executor.models.module_mapping
import
MultiModelKeys
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.image
import
convert_image_mode
from
vllm.multimodal.image
import
convert_image_mode
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
...
@@ -1399,10 +1398,8 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
...
@@ -1399,10 +1398,8 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
return
self
.
language_model
.
compute_logits
(
hidden_states
,
return
self
.
language_model
.
compute_logits
(
hidden_states
)
sampling_metadata
)
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]])
->
set
[
str
]:
torch
.
Tensor
]])
->
set
[
str
]:
...
...
vllm/model_executor/models/jais.py
View file @
1c3ffdbe
...
@@ -42,7 +42,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
...
@@ -42,7 +42,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.configs
import
JAISConfig
from
vllm.transformers_utils.configs
import
JAISConfig
...
@@ -332,10 +331,8 @@ class JAISLMHeadModel(nn.Module, SupportsPP):
...
@@ -332,10 +331,8 @@ class JAISLMHeadModel(nn.Module, SupportsPP):
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
,
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
)
sampling_metadata
)
return
logits
return
logits
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
...
...
vllm/model_executor/models/jamba.py
View file @
1c3ffdbe
...
@@ -32,7 +32,6 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
...
@@ -32,7 +32,6 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from
vllm.model_executor.models.llama
import
LlamaMLP
as
JambaMLP
from
vllm.model_executor.models.llama
import
LlamaMLP
as
JambaMLP
from
vllm.model_executor.models.mamba_cache
import
(
MambaCacheManager
,
from
vllm.model_executor.models.mamba_cache
import
(
MambaCacheManager
,
MambaCacheParams
)
MambaCacheParams
)
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
vllm.utils
import
LayerBlockType
from
vllm.utils
import
LayerBlockType
...
@@ -581,10 +580,8 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
...
@@ -581,10 +580,8 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
,
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
)
sampling_metadata
)
return
logits
return
logits
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
...
...
vllm/model_executor/models/keye.py
View file @
1c3ffdbe
...
@@ -21,7 +21,6 @@ from vllm.attention.layer import check_upstream_fa_availability
...
@@ -21,7 +21,6 @@ from vllm.attention.layer import check_upstream_fa_availability
from
vllm.config
import
VllmConfig
from
vllm.config
import
VllmConfig
from
vllm.distributed
import
get_tensor_model_parallel_world_size
from
vllm.distributed
import
get_tensor_model_parallel_world_size
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.model_executor
import
SamplingMetadata
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
QKVParallelLinear
,
QKVParallelLinear
,
RowParallelLinear
)
RowParallelLinear
)
...
@@ -1556,10 +1555,8 @@ class BaseKeyeModule(nn.Module):
...
@@ -1556,10 +1555,8 @@ class BaseKeyeModule(nn.Module):
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
return
self
.
language_model
.
compute_logits
(
hidden_states
,
return
self
.
language_model
.
compute_logits
(
hidden_states
)
sampling_metadata
)
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]])
->
set
[
str
]:
torch
.
Tensor
]])
->
set
[
str
]:
...
...
vllm/model_executor/models/kimi_vl.py
View file @
1c3ffdbe
...
@@ -67,7 +67,6 @@ from vllm.model_executor.models.interfaces import (SupportsMultiModal,
...
@@ -67,7 +67,6 @@ from vllm.model_executor.models.interfaces import (SupportsMultiModal,
SupportsPP
)
SupportsPP
)
from
vllm.model_executor.models.moonvit
import
MoonVitPretrainedModel
from
vllm.model_executor.models.moonvit
import
MoonVitPretrainedModel
from
vllm.model_executor.models.utils
import
merge_multimodal_embeddings
from
vllm.model_executor.models.utils
import
merge_multimodal_embeddings
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
MultiModalKwargsItems
,
NestedTensors
)
MultiModalKwargsItems
,
NestedTensors
)
...
@@ -484,10 +483,8 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -484,10 +483,8 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal,
return
hidden_states
return
hidden_states
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
,
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
**
kwargs
)
->
torch
.
Tensor
:
**
kwargs
)
->
torch
.
Tensor
:
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
,
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
,
**
kwargs
)
sampling_metadata
,
**
kwargs
)
return
logits
return
logits
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]]):
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]]):
...
...
vllm/model_executor/models/lfm2.py
View file @
1c3ffdbe
...
@@ -27,7 +27,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
...
@@ -27,7 +27,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
)
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
.interfaces
import
(
HasInnerState
,
IsHybrid
,
SupportsLoRA
,
SupportsPP
,
from
.interfaces
import
(
HasInnerState
,
IsHybrid
,
SupportsLoRA
,
SupportsPP
,
...
@@ -542,10 +541,8 @@ class Lfm2ForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
...
@@ -542,10 +541,8 @@ class Lfm2ForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
inputs_embeds
)
inputs_embeds
)
return
hidden_states
return
hidden_states
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
,
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
)
->
torch
.
Tensor
:
sampling_metadata
:
SamplingMetadata
)
->
torch
.
Tensor
:
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
)
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
,
sampling_metadata
)
return
logits
return
logits
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
...
...
vllm/model_executor/models/llama.py
View file @
1c3ffdbe
...
@@ -48,7 +48,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
...
@@ -48,7 +48,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
)
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
(
from
vllm.model_executor.model_loader.weight_utils
import
(
default_weight_loader
,
maybe_remap_kv_scale_name
)
default_weight_loader
,
maybe_remap_kv_scale_name
)
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
.interfaces
import
SupportsEagle3
,
SupportsLoRA
,
SupportsPP
from
.interfaces
import
SupportsEagle3
,
SupportsLoRA
,
SupportsPP
...
@@ -601,10 +600,8 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3):
...
@@ -601,10 +600,8 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3):
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
,
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
)
sampling_metadata
)
return
logits
return
logits
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
...
...
vllm/model_executor/models/llama_eagle3.py
View file @
1c3ffdbe
...
@@ -21,7 +21,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
...
@@ -21,7 +21,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.models.llama
import
(
LlamaDecoderLayer
,
from
vllm.model_executor.models.llama
import
(
LlamaDecoderLayer
,
LlamaForCausalLM
)
LlamaForCausalLM
)
from
vllm.v1.sample.metadata
import
SamplingMetadata
from
.utils
import
AutoWeightsLoader
,
maybe_prefix
from
.utils
import
AutoWeightsLoader
,
maybe_prefix
...
@@ -244,10 +243,8 @@ class Eagle3LlamaForCausalLM(LlamaForCausalLM):
...
@@ -244,10 +243,8 @@ class Eagle3LlamaForCausalLM(LlamaForCausalLM):
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
,
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
)
sampling_metadata
)
if
self
.
draft_id_to_target_id
is
None
:
if
self
.
draft_id_to_target_id
is
None
:
assert
logits
.
shape
[
1
]
==
self
.
config
.
vocab_size
,
\
assert
logits
.
shape
[
1
]
==
self
.
config
.
vocab_size
,
\
"Expected logits to have shape "
\
"Expected logits to have shape "
\
...
...
vllm/model_executor/models/llava.py
View file @
1c3ffdbe
...
@@ -20,7 +20,6 @@ from vllm.model_executor.layers.activation import get_act_fn
...
@@ -20,7 +20,6 @@ from vllm.model_executor.layers.activation import get_act_fn
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
RowParallelLinear
)
RowParallelLinear
)
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.cache
import
BaseMultiModalProcessorCache
from
vllm.multimodal.cache
import
BaseMultiModalProcessorCache
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
...
@@ -760,10 +759,8 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
...
@@ -760,10 +759,8 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
return
self
.
language_model
.
compute_logits
(
hidden_states
,
return
self
.
language_model
.
compute_logits
(
hidden_states
)
sampling_metadata
)
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]])
->
set
[
str
]:
torch
.
Tensor
]])
->
set
[
str
]:
...
...
vllm/model_executor/models/llava_next.py
View file @
1c3ffdbe
...
@@ -13,7 +13,6 @@ from transformers.models.llava_next.modeling_llava_next import (
...
@@ -13,7 +13,6 @@ from transformers.models.llava_next.modeling_llava_next import (
get_anyres_image_grid_shape
,
unpad_image
)
get_anyres_image_grid_shape
,
unpad_image
)
from
vllm.config
import
VllmConfig
from
vllm.config
import
VllmConfig
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
MultiModalFieldConfig
from
vllm.multimodal.inputs
import
MultiModalFieldConfig
from
vllm.multimodal.parse
import
ImageSize
from
vllm.multimodal.parse
import
ImageSize
...
@@ -563,10 +562,8 @@ model_executor.models.llava_next.LlavaNextProcessingInfo.get_num_image_tokens].
...
@@ -563,10 +562,8 @@ model_executor.models.llava_next.LlavaNextProcessingInfo.get_num_image_tokens].
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
return
self
.
language_model
.
compute_logits
(
hidden_states
,
return
self
.
language_model
.
compute_logits
(
hidden_states
)
sampling_metadata
)
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]])
->
set
[
str
]:
torch
.
Tensor
]])
->
set
[
str
]:
...
...
vllm/model_executor/models/llava_next_video.py
View file @
1c3ffdbe
...
@@ -13,7 +13,6 @@ from transformers import (BatchFeature, LlavaNextVideoConfig,
...
@@ -13,7 +13,6 @@ from transformers import (BatchFeature, LlavaNextVideoConfig,
from
vllm.config
import
VllmConfig
from
vllm.config
import
VllmConfig
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.models.clip
import
CLIPVisionModel
from
vllm.model_executor.models.clip
import
CLIPVisionModel
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
MultiModalKwargsItems
)
MultiModalKwargsItems
)
...
@@ -464,10 +463,8 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -464,10 +463,8 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal,
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
return
self
.
language_model
.
compute_logits
(
hidden_states
,
return
self
.
language_model
.
compute_logits
(
hidden_states
)
sampling_metadata
)
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]])
->
set
[
str
]:
torch
.
Tensor
]])
->
set
[
str
]:
...
...
vllm/model_executor/models/llava_onevision.py
View file @
1c3ffdbe
...
@@ -14,7 +14,6 @@ from transformers.models.llava_onevision.modeling_llava_onevision import (
...
@@ -14,7 +14,6 @@ from transformers.models.llava_onevision.modeling_llava_onevision import (
from
vllm.config
import
VllmConfig
from
vllm.config
import
VllmConfig
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
MultiModalKwargsItems
)
MultiModalKwargsItems
)
...
@@ -934,10 +933,8 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -934,10 +933,8 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
return
self
.
language_model
.
compute_logits
(
hidden_states
,
return
self
.
language_model
.
compute_logits
(
hidden_states
)
sampling_metadata
)
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]])
->
set
[
str
]:
torch
.
Tensor
]])
->
set
[
str
]:
...
...
vllm/model_executor/models/mamba.py
View file @
1c3ffdbe
...
@@ -26,7 +26,6 @@ from vllm.model_executor.models.interfaces import (HasInnerState,
...
@@ -26,7 +26,6 @@ from vllm.model_executor.models.interfaces import (HasInnerState,
IsAttentionFree
,
SupportsPP
)
IsAttentionFree
,
SupportsPP
)
from
vllm.model_executor.models.mamba_cache
import
(
MambaCacheManager
,
from
vllm.model_executor.models.mamba_cache
import
(
MambaCacheManager
,
MambaCacheParams
)
MambaCacheParams
)
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
vllm.utils
import
LayerBlockType
from
vllm.utils
import
LayerBlockType
...
@@ -299,10 +298,8 @@ class MambaForCausalLM(nn.Module, HasInnerState, IsAttentionFree, SupportsPP):
...
@@ -299,10 +298,8 @@ class MambaForCausalLM(nn.Module, HasInnerState, IsAttentionFree, SupportsPP):
def
get_seqlen_agnostic_capture_inputs
(
self
,
batch_size
:
int
):
def
get_seqlen_agnostic_capture_inputs
(
self
,
batch_size
:
int
):
return
self
.
mamba_cache
.
get_seqlen_agnostic_capture_inputs
(
batch_size
)
return
self
.
mamba_cache
.
get_seqlen_agnostic_capture_inputs
(
batch_size
)
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
,
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
)
->
torch
.
Tensor
:
sampling_metadata
:
SamplingMetadata
)
->
torch
.
Tensor
:
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
)
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
,
sampling_metadata
)
return
logits
return
logits
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
...
...
vllm/model_executor/models/mamba2.py
View file @
1c3ffdbe
...
@@ -30,7 +30,6 @@ from vllm.model_executor.models.interfaces import (HasInnerState,
...
@@ -30,7 +30,6 @@ from vllm.model_executor.models.interfaces import (HasInnerState,
IsAttentionFree
)
IsAttentionFree
)
from
vllm.model_executor.models.mamba_cache
import
(
MambaCacheManager
,
from
vllm.model_executor.models.mamba_cache
import
(
MambaCacheManager
,
MambaCacheParams
)
MambaCacheParams
)
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
vllm.utils
import
LayerBlockType
from
vllm.utils
import
LayerBlockType
...
@@ -335,10 +334,8 @@ class Mamba2ForCausalLM(nn.Module, HasInnerState, IsAttentionFree):
...
@@ -335,10 +334,8 @@ class Mamba2ForCausalLM(nn.Module, HasInnerState, IsAttentionFree):
def
get_seqlen_agnostic_capture_inputs
(
self
,
batch_size
:
int
):
def
get_seqlen_agnostic_capture_inputs
(
self
,
batch_size
:
int
):
return
self
.
mamba_cache
.
get_seqlen_agnostic_capture_inputs
(
batch_size
)
return
self
.
mamba_cache
.
get_seqlen_agnostic_capture_inputs
(
batch_size
)
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
,
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
)
->
torch
.
Tensor
:
sampling_metadata
:
SamplingMetadata
)
->
torch
.
Tensor
:
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
)
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
,
sampling_metadata
)
return
logits
return
logits
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
...
...
vllm/model_executor/models/medusa.py
View file @
1c3ffdbe
...
@@ -104,12 +104,11 @@ class Medusa(nn.Module):
...
@@ -104,12 +104,11 @@ class Medusa(nn.Module):
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
list
[
torch
.
Tensor
],
hidden_states
:
list
[
torch
.
Tensor
],
sampling_metadata
,
)
->
list
[
torch
.
Tensor
]:
)
->
list
[
torch
.
Tensor
]:
logits_lst
:
list
[
torch
.
Tensor
]
=
[]
logits_lst
:
list
[
torch
.
Tensor
]
=
[]
for
hs
,
lm_head
in
zip
(
hidden_states
,
self
.
lm_heads
):
for
hs
,
lm_head
in
zip
(
hidden_states
,
self
.
lm_heads
):
_logits
=
self
.
logits_processor
(
lm_head
,
hs
,
sampling_metadata
)
_logits
=
self
.
logits_processor
(
lm_head
,
hs
)
if
_logits
is
None
:
if
_logits
is
None
:
# _logits should only be None on rank > 0, in which case
# _logits should only be None on rank > 0, in which case
...
...
vllm/model_executor/models/midashenglm.py
View file @
1c3ffdbe
...
@@ -42,7 +42,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
...
@@ -42,7 +42,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
RowParallelLinear
)
RowParallelLinear
)
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.model_loader.utils
import
set_default_torch_dtype
from
vllm.model_executor.model_loader.utils
import
set_default_torch_dtype
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
MultiModalKwargsItems
)
MultiModalKwargsItems
)
...
@@ -784,9 +783,8 @@ class MiDashengLMModel(nn.Module, SupportsMultiModal, SupportsPP):
...
@@ -784,9 +783,8 @@ class MiDashengLMModel(nn.Module, SupportsMultiModal, SupportsPP):
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
return
self
.
decoder
.
compute_logits
(
hidden_states
,
sampling_metadata
)
return
self
.
decoder
.
compute_logits
(
hidden_states
)
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]])
->
set
[
str
]:
torch
.
Tensor
]])
->
set
[
str
]:
...
...
vllm/model_executor/models/mimo.py
View file @
1c3ffdbe
...
@@ -41,7 +41,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
...
@@ -41,7 +41,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from
vllm.model_executor.model_loader.weight_utils
import
(
from
vllm.model_executor.model_loader.weight_utils
import
(
default_weight_loader
,
maybe_remap_kv_scale_name
)
default_weight_loader
,
maybe_remap_kv_scale_name
)
from
vllm.model_executor.models.qwen2
import
Qwen2ForCausalLM
,
Qwen2Model
from
vllm.model_executor.models.qwen2
import
Qwen2ForCausalLM
,
Qwen2Model
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
.utils
import
PPMissingLayer
,
is_pp_missing_parameter
,
maybe_prefix
from
.utils
import
PPMissingLayer
,
is_pp_missing_parameter
,
maybe_prefix
...
@@ -183,9 +182,7 @@ class MiMoForCausalLM(Qwen2ForCausalLM, nn.Module):
...
@@ -183,9 +182,7 @@ class MiMoForCausalLM(Qwen2ForCausalLM, nn.Module):
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
hidden_states
=
self
.
model
.
norm
(
hidden_states
)
hidden_states
=
self
.
model
.
norm
(
hidden_states
)
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
,
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
)
sampling_metadata
)
return
logits
return
logits
vllm/model_executor/models/mimo_mtp.py
View file @
1c3ffdbe
...
@@ -34,7 +34,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
...
@@ -34,7 +34,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead
,
VocabParallelEmbedding
)
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.models.qwen2
import
Qwen2DecoderLayer
from
vllm.model_executor.models.qwen2
import
Qwen2DecoderLayer
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
.utils
import
maybe_prefix
from
.utils
import
maybe_prefix
...
@@ -140,12 +139,10 @@ class MiMoMultiTokenPredictor(nn.Module):
...
@@ -140,12 +139,10 @@ class MiMoMultiTokenPredictor(nn.Module):
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
lm_head
:
ParallelLMHead
,
lm_head
:
ParallelLMHead
,
sampling_metadata
:
SamplingMetadata
,
spec_step_idx
:
int
=
0
,
spec_step_idx
:
int
=
0
,
)
->
torch
.
Tensor
:
)
->
torch
.
Tensor
:
self
.
mtp_layers
[
str
(
self
.
mtp_start_layer_idx
+
spec_step_idx
)]
self
.
mtp_layers
[
str
(
self
.
mtp_start_layer_idx
+
spec_step_idx
)]
logits
=
self
.
logits_processor
(
lm_head
,
hidden_states
,
logits
=
self
.
logits_processor
(
lm_head
,
hidden_states
)
sampling_metadata
)
return
logits
return
logits
...
@@ -178,11 +175,10 @@ class MiMoMTP(nn.Module):
...
@@ -178,11 +175,10 @@ class MiMoMTP(nn.Module):
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
spec_step_idx
:
int
=
0
,
spec_step_idx
:
int
=
0
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
return
self
.
model
.
compute_logits
(
hidden_states
,
self
.
lm_head
,
return
self
.
model
.
compute_logits
(
hidden_states
,
self
.
lm_head
,
sampling_metadata
,
spec_step_idx
)
spec_step_idx
)
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]])
->
set
[
str
]:
torch
.
Tensor
]])
->
set
[
str
]:
...
...
vllm/model_executor/models/minicpm.py
View file @
1c3ffdbe
...
@@ -51,7 +51,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
...
@@ -51,7 +51,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
)
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.model_executor.utils
import
set_weight_attrs
from
vllm.model_executor.utils
import
set_weight_attrs
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
...
@@ -583,10 +582,8 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
...
@@ -583,10 +582,8 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
,
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
)
sampling_metadata
)
return
logits
return
logits
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
...
...
Prev
1
2
3
4
5
6
7
8
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment