Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
ac2bf41e
Unverified
Commit
ac2bf41e
authored
Jul 17, 2025
by
Cyrus Leung
Committed by
GitHub
Jul 16, 2025
Browse files
[Model] Remove model sampler (#21059)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
a931b4cd
Changes
6
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
0 additions
and
45 deletions
+0
-45
vllm/model_executor/models/bailing_moe.py
vllm/model_executor/models/bailing_moe.py
+0
-10
vllm/model_executor/models/granite_speech.py
vllm/model_executor/models/granite_speech.py
+0
-2
vllm/model_executor/models/hunyuan_v1_moe.py
vllm/model_executor/models/hunyuan_v1_moe.py
+0
-10
vllm/model_executor/models/mimo.py
vllm/model_executor/models/mimo.py
+0
-2
vllm/model_executor/models/mimo_mtp.py
vllm/model_executor/models/mimo_mtp.py
+0
-11
vllm/model_executor/models/phi4flash.py
vllm/model_executor/models/phi4flash.py
+0
-10
No files found.
vllm/model_executor/models/bailing_moe.py
View file @
ac2bf41e
...
@@ -47,7 +47,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
...
@@ -47,7 +47,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
from
vllm.model_executor.layers.quantization.base_config
import
(
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
)
QuantizationConfig
)
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
...
@@ -485,7 +484,6 @@ class BailingMoeForCausalLM(nn.Module, SupportsPP):
...
@@ -485,7 +484,6 @@ class BailingMoeForCausalLM(nn.Module, SupportsPP):
else
:
else
:
self
.
lm_head
=
PPMissingLayer
()
self
.
lm_head
=
PPMissingLayer
()
self
.
sampler
=
get_sampler
()
self
.
make_empty_intermediate_tensors
=
(
self
.
make_empty_intermediate_tensors
=
(
self
.
model
.
make_empty_intermediate_tensors
)
self
.
model
.
make_empty_intermediate_tensors
)
...
@@ -512,14 +510,6 @@ class BailingMoeForCausalLM(nn.Module, SupportsPP):
...
@@ -512,14 +510,6 @@ class BailingMoeForCausalLM(nn.Module, SupportsPP):
sampling_metadata
)
sampling_metadata
)
return
logits
return
logits
def
sample
(
self
,
logits
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
SamplerOutput
]:
next_tokens
=
self
.
sampler
(
logits
,
sampling_metadata
)
return
next_tokens
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]])
->
set
[
str
]:
torch
.
Tensor
]])
->
set
[
str
]:
loader
=
AutoWeightsLoader
(
loader
=
AutoWeightsLoader
(
...
...
vllm/model_executor/models/granite_speech.py
View file @
ac2bf41e
...
@@ -36,7 +36,6 @@ from vllm.config import CacheConfig, VllmConfig
...
@@ -36,7 +36,6 @@ from vllm.config import CacheConfig, VllmConfig
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
RowParallelLinear
)
RowParallelLinear
)
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.sampler
import
get_sampler
from
vllm.model_executor.models.module_mapping
import
MultiModelKeys
from
vllm.model_executor.models.module_mapping
import
MultiModelKeys
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
...
@@ -549,7 +548,6 @@ class GraniteSpeechForConditionalGeneration(
...
@@ -549,7 +548,6 @@ class GraniteSpeechForConditionalGeneration(
self
.
config
=
config
self
.
config
=
config
self
.
quant_config
=
quant_config
self
.
quant_config
=
quant_config
self
.
cache_config
=
cache_config
self
.
cache_config
=
cache_config
self
.
sampler
=
get_sampler
()
# The language model is typically a Granite LLM
# The language model is typically a Granite LLM
self
.
language_model
=
init_vllm_registered_model
(
self
.
language_model
=
init_vllm_registered_model
(
...
...
vllm/model_executor/models/hunyuan_v1_moe.py
View file @
ac2bf41e
...
@@ -49,7 +49,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
...
@@ -49,7 +49,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
from
vllm.model_executor.layers.quantization.base_config
import
(
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
)
QuantizationConfig
)
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
)
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
(
from
vllm.model_executor.model_loader.weight_utils
import
(
...
@@ -661,7 +660,6 @@ class HunYuanMoEV1ForCausalLM(nn.Module):
...
@@ -661,7 +660,6 @@ class HunYuanMoEV1ForCausalLM(nn.Module):
self
.
logits_processor
=
LogitsProcessor
(
self
.
unpadded_vocab_size
,
self
.
logits_processor
=
LogitsProcessor
(
self
.
unpadded_vocab_size
,
config
.
vocab_size
,
config
.
vocab_size
,
logit_scale
)
logit_scale
)
self
.
sampler
=
get_sampler
()
else
:
else
:
self
.
lm_head
=
PPMissingLayer
()
self
.
lm_head
=
PPMissingLayer
()
...
@@ -685,14 +683,6 @@ class HunYuanMoEV1ForCausalLM(nn.Module):
...
@@ -685,14 +683,6 @@ class HunYuanMoEV1ForCausalLM(nn.Module):
sampling_metadata
)
sampling_metadata
)
return
logits
return
logits
def
sample
(
self
,
logits
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
SamplerOutput
]:
next_tokens
=
self
.
sampler
(
logits
,
sampling_metadata
)
return
next_tokens
def
make_empty_intermediate_tensors
(
def
make_empty_intermediate_tensors
(
self
,
batch_size
:
int
,
dtype
:
torch
.
dtype
,
self
,
batch_size
:
int
,
dtype
:
torch
.
dtype
,
device
:
torch
.
device
)
->
IntermediateTensors
:
device
:
torch
.
device
)
->
IntermediateTensors
:
...
...
vllm/model_executor/models/mimo.py
View file @
ac2bf41e
...
@@ -36,7 +36,6 @@ from vllm.config import VllmConfig
...
@@ -36,7 +36,6 @@ from vllm.config import VllmConfig
from
vllm.distributed
import
get_pp_group
from
vllm.distributed
import
get_pp_group
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.sampler
import
get_sampler
from
vllm.model_executor.layers.vocab_parallel_embedding
import
ParallelLMHead
from
vllm.model_executor.layers.vocab_parallel_embedding
import
ParallelLMHead
from
vllm.model_executor.model_loader.weight_utils
import
(
from
vllm.model_executor.model_loader.weight_utils
import
(
default_weight_loader
,
maybe_remap_kv_scale_name
)
default_weight_loader
,
maybe_remap_kv_scale_name
)
...
@@ -176,7 +175,6 @@ class MiMoForCausalLM(Qwen2ForCausalLM, nn.Module):
...
@@ -176,7 +175,6 @@ class MiMoForCausalLM(Qwen2ForCausalLM, nn.Module):
self
.
lm_head
=
PPMissingLayer
()
self
.
lm_head
=
PPMissingLayer
()
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
self
.
sampler
=
get_sampler
()
self
.
make_empty_intermediate_tensors
=
(
self
.
make_empty_intermediate_tensors
=
(
self
.
model
.
make_empty_intermediate_tensors
)
self
.
model
.
make_empty_intermediate_tensors
)
...
...
vllm/model_executor/models/mimo_mtp.py
View file @
ac2bf41e
...
@@ -30,7 +30,6 @@ from vllm.config import CacheConfig, ModelConfig, VllmConfig
...
@@ -30,7 +30,6 @@ from vllm.config import CacheConfig, ModelConfig, VllmConfig
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
...
@@ -161,8 +160,6 @@ class MiMoMTP(nn.Module):
...
@@ -161,8 +160,6 @@ class MiMoMTP(nn.Module):
self
.
lm_head
=
ParallelLMHead
(
self
.
config
.
vocab_size
,
self
.
lm_head
=
ParallelLMHead
(
self
.
config
.
vocab_size
,
self
.
config
.
hidden_size
)
self
.
config
.
hidden_size
)
self
.
sampler
=
get_sampler
()
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
,
...
@@ -187,14 +184,6 @@ class MiMoMTP(nn.Module):
...
@@ -187,14 +184,6 @@ class MiMoMTP(nn.Module):
return
self
.
model
.
compute_logits
(
hidden_states
,
self
.
lm_head
,
return
self
.
model
.
compute_logits
(
hidden_states
,
self
.
lm_head
,
sampling_metadata
,
spec_step_idx
)
sampling_metadata
,
spec_step_idx
)
def
sample
(
self
,
logits
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
SamplerOutput
]:
next_tokens
=
self
.
sampler
(
logits
,
sampling_metadata
)
return
next_tokens
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]])
->
set
[
str
]:
torch
.
Tensor
]])
->
set
[
str
]:
stacked_params_mapping
=
[
stacked_params_mapping
=
[
...
...
vllm/model_executor/models/phi4flash.py
View file @
ac2bf41e
...
@@ -23,7 +23,6 @@ from vllm.model_executor.layers.mamba.ops.causal_conv1d import (
...
@@ -23,7 +23,6 @@ from vllm.model_executor.layers.mamba.ops.causal_conv1d import (
causal_conv1d_fn
,
causal_conv1d_update
)
causal_conv1d_fn
,
causal_conv1d_update
)
from
vllm.model_executor.layers.mamba.ops.mamba_ssm
import
(
from
vllm.model_executor.layers.mamba.ops.mamba_ssm
import
(
selective_scan_fn
,
selective_state_update
)
selective_scan_fn
,
selective_state_update
)
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
)
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.models.interfaces
import
(
HasInnerState
,
IsHybrid
,
from
vllm.model_executor.models.interfaces
import
(
HasInnerState
,
IsHybrid
,
...
@@ -641,7 +640,6 @@ class Phi4FlashForCausalLM(nn.Module, HasInnerState, IsHybrid, SupportsV0Only):
...
@@ -641,7 +640,6 @@ class Phi4FlashForCausalLM(nn.Module, HasInnerState, IsHybrid, SupportsV0Only):
self
.
logits_processor
=
LogitsProcessor
(
self
.
unpadded_vocab_size
,
self
.
logits_processor
=
LogitsProcessor
(
self
.
unpadded_vocab_size
,
config
.
vocab_size
,
config
.
vocab_size
,
logits_as_input
=
False
)
logits_as_input
=
False
)
self
.
sampler
=
get_sampler
()
def
forward
(
def
forward
(
self
,
self
,
...
@@ -709,14 +707,6 @@ class Phi4FlashForCausalLM(nn.Module, HasInnerState, IsHybrid, SupportsV0Only):
...
@@ -709,14 +707,6 @@ class Phi4FlashForCausalLM(nn.Module, HasInnerState, IsHybrid, SupportsV0Only):
prune_hidden_states
=
prune_hidden_states
)
prune_hidden_states
=
prune_hidden_states
)
return
processed_logits
return
processed_logits
def
sample
(
self
,
logits
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
SamplerOutput
]:
next_tokens
=
self
.
sampler
(
logits
,
sampling_metadata
)
return
next_tokens
def
load_weights
(
def
load_weights
(
self
,
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]],
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]],
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment