Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
1c3ffdbe
Unverified
Commit
1c3ffdbe
authored
Sep 21, 2025
by
Woosuk Kwon
Committed by
GitHub
Sep 21, 2025
Browse files
[V0 Deprecation] Remove V0 sampling metadata (#25345)
Signed-off-by:
Woosuk Kwon
<
woosuk@thinkingmachines.ai
>
parent
c438b295
Changes
141
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
23 additions
and
81 deletions
+23
-81
vllm/model_executor/models/minicpm_eagle.py
vllm/model_executor/models/minicpm_eagle.py
+1
-4
vllm/model_executor/models/minicpmv.py
vllm/model_executor/models/minicpmv.py
+1
-3
vllm/model_executor/models/minimax_text_01.py
vllm/model_executor/models/minimax_text_01.py
+2
-5
vllm/model_executor/models/minimax_vl_01.py
vllm/model_executor/models/minimax_vl_01.py
+1
-4
vllm/model_executor/models/mistral3.py
vllm/model_executor/models/mistral3.py
+1
-4
vllm/model_executor/models/mixtral.py
vllm/model_executor/models/mixtral.py
+1
-4
vllm/model_executor/models/mllama4.py
vllm/model_executor/models/mllama4.py
+1
-4
vllm/model_executor/models/molmo.py
vllm/model_executor/models/molmo.py
+2
-5
vllm/model_executor/models/mpt.py
vllm/model_executor/models/mpt.py
+1
-4
vllm/model_executor/models/nano_nemotron_vl.py
vllm/model_executor/models/nano_nemotron_vl.py
+1
-4
vllm/model_executor/models/nemotron.py
vllm/model_executor/models/nemotron.py
+1
-4
vllm/model_executor/models/nemotron_h.py
vllm/model_executor/models/nemotron_h.py
+1
-4
vllm/model_executor/models/nemotron_nas.py
vllm/model_executor/models/nemotron_nas.py
+1
-4
vllm/model_executor/models/nemotron_vl.py
vllm/model_executor/models/nemotron_vl.py
+1
-4
vllm/model_executor/models/olmo.py
vllm/model_executor/models/olmo.py
+1
-4
vllm/model_executor/models/olmo2.py
vllm/model_executor/models/olmo2.py
+1
-4
vllm/model_executor/models/olmoe.py
vllm/model_executor/models/olmoe.py
+2
-5
vllm/model_executor/models/opt.py
vllm/model_executor/models/opt.py
+1
-4
vllm/model_executor/models/orion.py
vllm/model_executor/models/orion.py
+1
-4
vllm/model_executor/models/ovis.py
vllm/model_executor/models/ovis.py
+1
-3
No files found.
vllm/model_executor/models/minicpm_eagle.py
View file @
1c3ffdbe
...
@@ -39,7 +39,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
...
@@ -39,7 +39,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
)
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
.interfaces
import
SupportsLoRA
,
SupportsPP
from
.interfaces
import
SupportsLoRA
,
SupportsPP
...
@@ -376,10 +375,8 @@ class EagleMiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
...
@@ -376,10 +375,8 @@ class EagleMiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
,
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
)
sampling_metadata
)
return
logits
return
logits
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
...
...
vllm/model_executor/models/minicpmv.py
View file @
1c3ffdbe
...
@@ -50,7 +50,6 @@ from vllm.model_executor.models.minicpm import MiniCPMForCausalLM
...
@@ -50,7 +50,6 @@ from vllm.model_executor.models.minicpm import MiniCPMForCausalLM
from
vllm.model_executor.models.module_mapping
import
MultiModelKeys
from
vllm.model_executor.models.module_mapping
import
MultiModelKeys
from
vllm.model_executor.models.qwen2
import
Qwen2ForCausalLM
from
vllm.model_executor.models.qwen2
import
Qwen2ForCausalLM
from
vllm.model_executor.models.qwen3
import
Qwen3ForCausalLM
from
vllm.model_executor.models.qwen3
import
Qwen3ForCausalLM
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
MultiModalKwargsItems
,
NestedTensors
)
MultiModalKwargsItems
,
NestedTensors
)
...
@@ -1194,9 +1193,8 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP):
...
@@ -1194,9 +1193,8 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP):
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
return
self
.
llm
.
compute_logits
(
hidden_states
,
sampling_metadata
)
return
self
.
llm
.
compute_logits
(
hidden_states
)
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]])
->
set
[
str
]:
torch
.
Tensor
]])
->
set
[
str
]:
...
...
vllm/model_executor/models/minimax_text_01.py
View file @
1c3ffdbe
...
@@ -41,7 +41,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
...
@@ -41,7 +41,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
)
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.models.utils
import
maybe_prefix
from
vllm.model_executor.models.utils
import
maybe_prefix
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
.interfaces
import
HasInnerState
,
IsHybrid
from
.interfaces
import
HasInnerState
,
IsHybrid
...
@@ -742,10 +741,8 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid):
...
@@ -742,10 +741,8 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid):
return
hidden_states
return
hidden_states
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
,
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
)
->
torch
.
Tensor
:
sampling_metadata
:
SamplingMetadata
)
->
torch
.
Tensor
:
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
.
float
())
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
.
float
(),
sampling_metadata
)
return
logits
return
logits
...
...
vllm/model_executor/models/minimax_vl_01.py
View file @
1c3ffdbe
...
@@ -14,7 +14,6 @@ from vllm.model_executor.layers.activation import get_act_fn
...
@@ -14,7 +14,6 @@ from vllm.model_executor.layers.activation import get_act_fn
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
RowParallelLinear
)
RowParallelLinear
)
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
MultiModalFieldConfig
from
vllm.multimodal.inputs
import
MultiModalFieldConfig
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
...
@@ -420,10 +419,8 @@ class MiniMaxVL01ForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -420,10 +419,8 @@ class MiniMaxVL01ForConditionalGeneration(nn.Module, SupportsMultiModal,
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
return
self
.
language_model
.
compute_logits
(
hidden_states
,
return
self
.
language_model
.
compute_logits
(
hidden_states
)
sampling_metadata
)
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]])
->
set
[
str
]:
torch
.
Tensor
]])
->
set
[
str
]:
...
...
vllm/model_executor/models/mistral3.py
View file @
1c3ffdbe
...
@@ -20,7 +20,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
...
@@ -20,7 +20,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
RowParallelLinear
)
RowParallelLinear
)
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.models.module_mapping
import
MultiModelKeys
from
vllm.model_executor.models.module_mapping
import
MultiModelKeys
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.cache
import
BaseMultiModalProcessorCache
from
vllm.multimodal.cache
import
BaseMultiModalProcessorCache
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
...
@@ -606,10 +605,8 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA,
...
@@ -606,10 +605,8 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA,
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
return
self
.
language_model
.
compute_logits
(
hidden_states
,
return
self
.
language_model
.
compute_logits
(
hidden_states
)
sampling_metadata
)
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]])
->
set
[
str
]:
torch
.
Tensor
]])
->
set
[
str
]:
...
...
vllm/model_executor/models/mixtral.py
View file @
1c3ffdbe
...
@@ -49,7 +49,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
...
@@ -49,7 +49,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
)
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
(
from
vllm.model_executor.model_loader.weight_utils
import
(
default_weight_loader
,
maybe_remap_kv_scale_name
)
default_weight_loader
,
maybe_remap_kv_scale_name
)
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
.interfaces
import
MixtureOfExperts
,
SupportsLoRA
,
SupportsPP
from
.interfaces
import
MixtureOfExperts
,
SupportsLoRA
,
SupportsPP
...
@@ -594,10 +593,8 @@ class MixtralForCausalLM(nn.Module, SupportsLoRA, SupportsPP,
...
@@ -594,10 +593,8 @@ class MixtralForCausalLM(nn.Module, SupportsLoRA, SupportsPP,
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
,
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
)
sampling_metadata
)
return
logits
return
logits
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
...
...
vllm/model_executor/models/mllama4.py
View file @
1c3ffdbe
...
@@ -41,7 +41,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
...
@@ -41,7 +41,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.model_loader.utils
import
initialize_model
from
vllm.model_executor.model_loader.utils
import
initialize_model
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
MultiModalKwargsItems
,
NestedTensors
)
MultiModalKwargsItems
,
NestedTensors
)
...
@@ -856,10 +855,8 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -856,10 +855,8 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal,
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
return
self
.
language_model
.
compute_logits
(
hidden_states
,
return
self
.
language_model
.
compute_logits
(
hidden_states
)
sampling_metadata
)
def
separate_weights
(
def
separate_weights
(
self
,
self
,
...
...
vllm/model_executor/models/molmo.py
View file @
1c3ffdbe
...
@@ -26,7 +26,6 @@ from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
...
@@ -26,7 +26,6 @@ from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size
,
get_tensor_model_parallel_world_size
,
split_tensor_along_last_dim
,
split_tensor_along_last_dim
,
tensor_model_parallel_all_gather
)
tensor_model_parallel_all_gather
)
from
vllm.model_executor
import
SamplingMetadata
from
vllm.model_executor.layers.activation
import
(
MulAndSilu
,
QuickGELU
,
from
vllm.model_executor.layers.activation
import
(
MulAndSilu
,
QuickGELU
,
SiluAndMul
)
SiluAndMul
)
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.layernorm
import
RMSNorm
...
@@ -1527,10 +1526,8 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
...
@@ -1527,10 +1526,8 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
return
hidden_states
return
hidden_states
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
,
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
)
->
torch
.
Tensor
:
sampling_metadata
:
SamplingMetadata
)
->
torch
.
Tensor
:
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
)
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
,
sampling_metadata
)
return
logits
return
logits
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]]):
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]]):
...
...
vllm/model_executor/models/mpt.py
View file @
1c3ffdbe
...
@@ -25,7 +25,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
...
@@ -25,7 +25,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
VocabParallelEmbedding
)
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
.interfaces
import
SupportsPP
from
.interfaces
import
SupportsPP
...
@@ -320,10 +319,8 @@ class MPTForCausalLM(nn.Module, SupportsPP):
...
@@ -320,10 +319,8 @@ class MPTForCausalLM(nn.Module, SupportsPP):
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
,
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
)
sampling_metadata
)
return
logits
return
logits
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
...
...
vllm/model_executor/models/nano_nemotron_vl.py
View file @
1c3ffdbe
...
@@ -37,7 +37,6 @@ from vllm.model_executor.models.utils import (flatten_bn,
...
@@ -37,7 +37,6 @@ from vllm.model_executor.models.utils import (flatten_bn,
init_vllm_registered_model
,
init_vllm_registered_model
,
maybe_prefix
,
maybe_prefix
,
merge_multimodal_embeddings
)
merge_multimodal_embeddings
)
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
MultiModalKwargs
,
MultiModalKwargsItems
,
MultiModalKwargs
,
MultiModalKwargsItems
,
...
@@ -1192,10 +1191,8 @@ class NemotronH_Nano_VL(nn.Module, HasInnerState, IsHybrid,
...
@@ -1192,10 +1191,8 @@ class NemotronH_Nano_VL(nn.Module, HasInnerState, IsHybrid,
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
return
self
.
language_model
.
compute_logits
(
hidden_states
,
return
self
.
language_model
.
compute_logits
(
hidden_states
)
sampling_metadata
)
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]]):
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]]):
adapter_dict
=
dict
(
self
.
mlp1
.
named_parameters
())
adapter_dict
=
dict
(
self
.
mlp1
.
named_parameters
())
...
...
vllm/model_executor/models/nemotron.py
View file @
1c3ffdbe
...
@@ -45,7 +45,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
...
@@ -45,7 +45,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
)
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
(
from
vllm.model_executor.model_loader.weight_utils
import
(
default_weight_loader
,
maybe_remap_kv_scale_name
)
default_weight_loader
,
maybe_remap_kv_scale_name
)
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.configs
import
NemotronConfig
from
vllm.transformers_utils.configs
import
NemotronConfig
...
@@ -498,10 +497,8 @@ class NemotronForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
...
@@ -498,10 +497,8 @@ class NemotronForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
,
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
)
sampling_metadata
)
return
logits
return
logits
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
...
...
vllm/model_executor/models/nemotron_h.py
View file @
1c3ffdbe
...
@@ -54,7 +54,6 @@ from vllm.model_executor.models.mamba_cache import (MambaCacheManager,
...
@@ -54,7 +54,6 @@ from vllm.model_executor.models.mamba_cache import (MambaCacheManager,
from
vllm.model_executor.models.utils
import
(
from
vllm.model_executor.models.utils
import
(
AutoWeightsLoader
,
WeightsMapper
,
make_empty_intermediate_tensors_factory
,
AutoWeightsLoader
,
WeightsMapper
,
make_empty_intermediate_tensors_factory
,
make_layers
,
maybe_prefix
)
make_layers
,
maybe_prefix
)
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.configs
import
NemotronHConfig
from
vllm.transformers_utils.configs
import
NemotronHConfig
from
vllm.utils
import
LayerBlockType
from
vllm.utils
import
LayerBlockType
...
@@ -622,10 +621,8 @@ class NemotronHForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
...
@@ -622,10 +621,8 @@ class NemotronHForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
,
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
)
sampling_metadata
)
return
logits
return
logits
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
...
...
vllm/model_executor/models/nemotron_nas.py
View file @
1c3ffdbe
...
@@ -44,7 +44,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
...
@@ -44,7 +44,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
from
vllm.model_executor.model_loader.weight_utils
import
(
from
vllm.model_executor.model_loader.weight_utils
import
(
default_weight_loader
,
maybe_remap_kv_scale_name
)
default_weight_loader
,
maybe_remap_kv_scale_name
)
from
vllm.model_executor.models.llama
import
LlamaAttention
,
LlamaMLP
from
vllm.model_executor.models.llama
import
LlamaAttention
,
LlamaMLP
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
.interfaces
import
HasNoOps
,
SupportsLoRA
,
SupportsPP
from
.interfaces
import
HasNoOps
,
SupportsLoRA
,
SupportsPP
...
@@ -468,10 +467,8 @@ class DeciLMForCausalLM(nn.Module, SupportsLoRA, SupportsPP, HasNoOps):
...
@@ -468,10 +467,8 @@ class DeciLMForCausalLM(nn.Module, SupportsLoRA, SupportsPP, HasNoOps):
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
,
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
)
sampling_metadata
)
return
logits
return
logits
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
...
...
vllm/model_executor/models/nemotron_vl.py
View file @
1c3ffdbe
...
@@ -26,7 +26,6 @@ from vllm.model_executor.models.internvl import (
...
@@ -26,7 +26,6 @@ from vllm.model_executor.models.internvl import (
BaseInternVLProcessingInfo
,
InternVLImageEmbeddingInputs
,
BaseInternVLProcessingInfo
,
InternVLImageEmbeddingInputs
,
InternVLImageInputs
,
InternVLImagePixelInputs
,
InternVLProcessor
)
InternVLImageInputs
,
InternVLImagePixelInputs
,
InternVLProcessor
)
from
vllm.model_executor.models.module_mapping
import
MultiModelKeys
from
vllm.model_executor.models.module_mapping
import
MultiModelKeys
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.image
import
convert_image_mode
from
vllm.multimodal.image
import
convert_image_mode
from
vllm.multimodal.inputs
import
NestedTensors
from
vllm.multimodal.inputs
import
NestedTensors
...
@@ -632,10 +631,8 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
...
@@ -632,10 +631,8 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
return
self
.
language_model
.
compute_logits
(
hidden_states
,
return
self
.
language_model
.
compute_logits
(
hidden_states
)
sampling_metadata
)
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]])
->
set
[
str
]:
torch
.
Tensor
]])
->
set
[
str
]:
...
...
vllm/model_executor/models/olmo.py
View file @
1c3ffdbe
...
@@ -45,7 +45,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
...
@@ -45,7 +45,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
.interfaces
import
SupportsLoRA
,
SupportsPP
from
.interfaces
import
SupportsLoRA
,
SupportsPP
...
@@ -391,10 +390,8 @@ class OlmoForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
...
@@ -391,10 +390,8 @@ class OlmoForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
,
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
)
sampling_metadata
)
return
logits
return
logits
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
...
...
vllm/model_executor/models/olmo2.py
View file @
1c3ffdbe
...
@@ -54,7 +54,6 @@ from vllm.model_executor.models.interfaces import SupportsLoRA, SupportsPP
...
@@ -54,7 +54,6 @@ from vllm.model_executor.models.interfaces import SupportsLoRA, SupportsPP
from
vllm.model_executor.models.utils
import
(
from
vllm.model_executor.models.utils
import
(
AutoWeightsLoader
,
extract_layer_index
,
is_pp_missing_parameter
,
AutoWeightsLoader
,
extract_layer_index
,
is_pp_missing_parameter
,
make_empty_intermediate_tensors_factory
,
make_layers
,
maybe_prefix
)
make_empty_intermediate_tensors_factory
,
make_layers
,
maybe_prefix
)
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.configs
import
Olmo3Config
from
vllm.transformers_utils.configs
import
Olmo3Config
...
@@ -427,10 +426,8 @@ class Olmo2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
...
@@ -427,10 +426,8 @@ class Olmo2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
,
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
)
sampling_metadata
)
return
logits
return
logits
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]]):
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]]):
...
...
vllm/model_executor/models/olmoe.py
View file @
1c3ffdbe
...
@@ -41,7 +41,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
...
@@ -41,7 +41,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
.interfaces
import
SupportsPP
from
.interfaces
import
SupportsPP
...
@@ -471,10 +470,8 @@ class OlmoeForCausalLM(nn.Module, SupportsPP):
...
@@ -471,10 +470,8 @@ class OlmoeForCausalLM(nn.Module, SupportsPP):
inputs_embeds
)
inputs_embeds
)
return
hidden_states
return
hidden_states
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
,
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
)
->
torch
.
Tensor
:
sampling_metadata
:
SamplingMetadata
)
->
torch
.
Tensor
:
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
)
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
,
sampling_metadata
)
return
logits
return
logits
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
...
...
vllm/model_executor/models/opt.py
View file @
1c3ffdbe
...
@@ -41,7 +41,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
...
@@ -41,7 +41,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
.interfaces
import
SupportsPP
from
.interfaces
import
SupportsPP
...
@@ -399,10 +398,8 @@ class OPTForCausalLM(nn.Module, SupportsPP):
...
@@ -399,10 +398,8 @@ class OPTForCausalLM(nn.Module, SupportsPP):
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
,
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
)
sampling_metadata
)
return
logits
return
logits
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
...
...
vllm/model_executor/models/orion.py
View file @
1c3ffdbe
...
@@ -28,7 +28,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
...
@@ -28,7 +28,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
.interfaces
import
SupportsPP
from
.interfaces
import
SupportsPP
...
@@ -339,10 +338,8 @@ class OrionForCausalLM(nn.Module, SupportsPP):
...
@@ -339,10 +338,8 @@ class OrionForCausalLM(nn.Module, SupportsPP):
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
,
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
)
sampling_metadata
)
return
logits
return
logits
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
...
...
vllm/model_executor/models/ovis.py
View file @
1c3ffdbe
...
@@ -39,7 +39,6 @@ from vllm.model_executor.models.siglip import SiglipVisionModel
...
@@ -39,7 +39,6 @@ from vllm.model_executor.models.siglip import SiglipVisionModel
from
vllm.model_executor.models.utils
import
(
AutoWeightsLoader
,
flatten_bn
,
from
vllm.model_executor.models.utils
import
(
AutoWeightsLoader
,
flatten_bn
,
init_vllm_registered_model
,
init_vllm_registered_model
,
maybe_prefix
)
maybe_prefix
)
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
MultiModalKwargsItems
)
MultiModalKwargsItems
)
...
@@ -558,9 +557,8 @@ class Ovis(nn.Module, SupportsMultiModal, SupportsPP):
...
@@ -558,9 +557,8 @@ class Ovis(nn.Module, SupportsMultiModal, SupportsPP):
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
)
->
Optional
[
torch
.
Tensor
]:
logits
=
self
.
llm
.
compute_logits
(
hidden_states
,
sampling_metadata
)
logits
=
self
.
llm
.
compute_logits
(
hidden_states
)
return
logits
return
logits
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
...
...
Prev
1
2
3
4
5
6
7
8
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment