Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
7a6ebcbf
Unverified
Commit
7a6ebcbf
authored
Mar 19, 2026
by
Cyrus Leung
Committed by
GitHub
Mar 19, 2026
Browse files
[Model] Remove unnecessary `get_language_model` (#37545)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
c7bc12c2
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
93 additions
and
95 deletions
+93
-95
vllm/model_executor/models/cohere_asr.py
vllm/model_executor/models/cohere_asr.py
+21
-8
vllm/model_executor/models/ernie45_vl.py
vllm/model_executor/models/ernie45_vl.py
+0
-1
vllm/model_executor/models/fireredasr2.py
vllm/model_executor/models/fireredasr2.py
+10
-5
vllm/model_executor/models/hyperclovax_vision_v2.py
vllm/model_executor/models/hyperclovax_vision_v2.py
+14
-23
vllm/model_executor/models/interns1_pro.py
vllm/model_executor/models/interns1_pro.py
+7
-8
vllm/model_executor/models/kimi_audio.py
vllm/model_executor/models/kimi_audio.py
+18
-28
vllm/model_executor/models/lightonocr.py
vllm/model_executor/models/lightonocr.py
+23
-22
No files found.
vllm/model_executor/models/cohere_asr.py
View file @
7a6ebcbf
...
@@ -1704,6 +1704,12 @@ class ConformerEncoder(nn.Module):
...
@@ -1704,6 +1704,12 @@ class ConformerEncoder(nn.Module):
# ----- Encoder END -----
# ----- Encoder END -----
# This subclass is specific to vLLM in order for
# `_mark_composite_model` to target this module
class
CohereASRProjector
(
nn
.
Linear
):
pass
class
CohereASRModel
(
nn
.
Module
):
class
CohereASRModel
(
nn
.
Module
):
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
super
().
__init__
()
super
().
__init__
()
...
@@ -1714,7 +1720,7 @@ class CohereASRModel(nn.Module):
...
@@ -1714,7 +1720,7 @@ class CohereASRModel(nn.Module):
)
)
if
self
.
encoder
.
d_model
!=
self
.
decoder
.
hidden_size
:
if
self
.
encoder
.
d_model
!=
self
.
decoder
.
hidden_size
:
self
.
encoder_decoder_proj
=
torch
.
nn
.
Linea
r
(
self
.
encoder_decoder_proj
=
CohereASRProjecto
r
(
self
.
encoder
.
d_model
,
self
.
decoder
.
hidden_size
self
.
encoder
.
d_model
,
self
.
decoder
.
hidden_size
)
)
...
@@ -2096,18 +2102,25 @@ class CohereASRForConditionalGeneration(
...
@@ -2096,18 +2102,25 @@ class CohereASRForConditionalGeneration(
self
.
config
=
config
self
.
config
=
config
self
.
dtype
=
vllm_config
.
model_config
.
dtype
self
.
dtype
=
vllm_config
.
model_config
.
dtype
with
self
.
_mark_composite_model
(
vllm_config
,
language_targets
=
CohereASRDecoder
,
tower_targets
=
{
"audio"
:
(
ConformerEncoder
,
CohereASRProjector
)},
):
self
.
model
=
CohereASRModel
(
vllm_config
=
vllm_config
,
prefix
=
prefix
)
self
.
model
=
CohereASRModel
(
vllm_config
=
vllm_config
,
prefix
=
prefix
)
lm_head_config
=
config
.
head
self
.
unpadded_vocab_size
=
lm_head_config
[
"num_classes"
]
head_config
=
config
.
head
self
.
proj_out
=
ParallelLMHead
(
self
.
proj_out
=
ParallelLMHead
(
lm_
head_config
[
"num_classes"
],
head_config
[
"num_classes"
],
lm_
head_config
[
"hidden_size"
],
head_config
[
"hidden_size"
],
quant_config
=
quant_config
,
quant_config
=
quant_config
,
bias
=
True
,
bias
=
True
,
)
# NOTE: bias is True
)
# NOTE: bias is True
logit_scale
=
getattr
(
lm_head_config
,
"logit_scale"
,
1.0
)
logit_scale
=
getattr
(
head_config
,
"logit_scale"
,
1.0
)
self
.
logits_processor
=
LogitsProcessor
(
self
.
logits_processor
=
LogitsProcessor
(
self
.
unpadded_vocab_size
,
lm_
head_config
[
"num_classes"
],
logit_scale
head_config
[
"num_classes"
],
scale
=
logit_scale
)
)
def
forward
(
def
forward
(
...
...
vllm/model_executor/models/ernie45_vl.py
View file @
7a6ebcbf
...
@@ -1373,7 +1373,6 @@ class Ernie4_5_VLMoeForConditionalGeneration(
...
@@ -1373,7 +1373,6 @@ class Ernie4_5_VLMoeForConditionalGeneration(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
)
->
torch
.
Tensor
|
None
:
)
->
torch
.
Tensor
|
None
:
"""compute logits"""
return
self
.
language_model
.
compute_logits
(
hidden_states
)
return
self
.
language_model
.
compute_logits
(
hidden_states
)
def
_vision_forward
(
def
_vision_forward
(
...
...
vllm/model_executor/models/fireredasr2.py
View file @
7a6ebcbf
...
@@ -754,12 +754,17 @@ class FireRedASR2ForConditionalGeneration(
...
@@ -754,12 +754,17 @@ class FireRedASR2ForConditionalGeneration(
self
.
config
=
config
self
.
config
=
config
self
.
dtype
=
vllm_config
.
model_config
.
dtype
self
.
dtype
=
vllm_config
.
model_config
.
dtype
with
self
.
_mark_composite_model
(
vllm_config
,
language_targets
=
Qwen2ForCausalLM
,
tower_targets
=
{
"audio"
:
(
FireRedASR2Encoder
,
FireRedASR2Adapter
)},
):
self
.
model
=
FireRedASR2Model
(
self
.
model
=
FireRedASR2Model
(
vllm_config
=
vllm_config
,
vllm_config
=
vllm_config
,
prefix
=
maybe_prefix
(
prefix
,
"model"
),
prefix
=
maybe_prefix
(
prefix
,
"model"
),
)
)
logit_scale
=
getattr
(
config
,
"logit_scale"
,
1.0
)
logit_scale
=
getattr
(
config
,
"logit_scale"
,
1.0
)
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
,
scale
=
logit_scale
)
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
,
scale
=
logit_scale
)
def
forward
(
def
forward
(
...
...
vllm/model_executor/models/hyperclovax_vision_v2.py
View file @
7a6ebcbf
...
@@ -470,15 +470,6 @@ class HCXVisionV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
...
@@ -470,15 +470,6 @@ class HCXVisionV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
self
.
vision_config
=
vision_config
self
.
vision_config
=
vision_config
self
.
text_config
=
text_config
self
.
text_config
=
text_config
self
.
vllm_config
=
vllm_config
self
.
vllm_config
=
vllm_config
self
.
dtype
=
vllm_config
.
model_config
.
dtype
# Initialize Qwen2.5 Vision Transformer
self
.
visual
=
Qwen2_5_VisionTransformer
(
vision_config
=
vision_config
,
norm_eps
=
getattr
(
config
,
"rms_norm_eps"
,
1e-6
),
quant_config
=
quant_config
,
prefix
=
maybe_prefix
(
prefix
,
"visual"
),
)
# Linear projector (vision_hidden_size -> text_hidden_size)
# Linear projector (vision_hidden_size -> text_hidden_size)
# For V2 model: mm_projector_type is "linear"
# For V2 model: mm_projector_type is "linear"
...
@@ -492,13 +483,16 @@ class HCXVisionV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
...
@@ -492,13 +483,16 @@ class HCXVisionV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
else
:
else
:
out_hidden
=
vision_hidden_size
out_hidden
=
vision_hidden_size
# Always create Linear projector since HF checkpoint has mm_projector weights
with
self
.
_mark_tower_model
(
vllm_config
,
{
"image"
,
"video"
}):
self
.
visual
=
Qwen2_5_VisionTransformer
(
vision_config
=
vision_config
,
norm_eps
=
getattr
(
config
,
"rms_norm_eps"
,
1e-6
),
quant_config
=
quant_config
,
prefix
=
maybe_prefix
(
prefix
,
"visual"
),
)
self
.
mm_projector
=
nn
.
Linear
(
out_hidden
,
text_hidden_size
)
self
.
mm_projector
=
nn
.
Linear
(
out_hidden
,
text_hidden_size
)
# Language model
with
self
.
_mark_language_model
(
vllm_config
):
self
.
lm_head_vocab_size
=
getattr
(
text_config
,
"padded_vocab_size"
,
text_config
.
vocab_size
)
self
.
language_model
=
init_vllm_registered_model
(
self
.
language_model
=
init_vllm_registered_model
(
vllm_config
=
vllm_config
,
vllm_config
=
vllm_config
,
hf_config
=
text_config
,
hf_config
=
text_config
,
...
@@ -633,9 +627,6 @@ class HCXVisionV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
...
@@ -633,9 +627,6 @@ class HCXVisionV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
return
modalities
return
modalities
def
get_language_model
(
self
)
->
torch
.
nn
.
Module
:
return
self
.
language_model
def
embed_multimodal
(
def
embed_multimodal
(
self
,
self
,
**
kwargs
:
object
,
**
kwargs
:
object
,
...
...
vllm/model_executor/models/interns1_pro.py
View file @
7a6ebcbf
...
@@ -576,20 +576,19 @@ class InternS1ProForConditionalGeneration(
...
@@ -576,20 +576,19 @@ class InternS1ProForConditionalGeneration(
multimodal_config
.
is_multimodal_pruning_enabled
()
multimodal_config
.
is_multimodal_pruning_enabled
()
)
)
if
not
multimodal_config
.
get_limit_per_prompt
(
with
self
.
_mark_tower_model
(
vllm_config
,
{
"image"
,
"video"
}):
"image"
)
and
not
multimodal_config
.
get_limit_per_prompt
(
"video"
):
self
.
visual
=
None
else
:
self
.
visual
=
Qwen3_VisionTransformer
(
self
.
visual
=
Qwen3_VisionTransformer
(
config
.
vision_config
,
config
.
vision_config
,
norm_eps
=
getattr
(
config
,
"rms_norm_eps"
,
1e-6
),
norm_eps
=
getattr
(
config
,
"rms_norm_eps"
,
1e-6
),
prefix
=
maybe_prefix
(
prefix
,
"visual"
),
prefix
=
maybe_prefix
(
prefix
,
"visual"
),
)
)
with
self
.
_mark_language_model
(
vllm_config
):
self
.
language_model
=
InternS1ProMoeLLMForCausalLM
(
self
.
language_model
=
InternS1ProMoeLLMForCausalLM
(
vllm_config
=
vllm_config
,
prefix
=
maybe_prefix
(
prefix
,
"language_model"
)
vllm_config
=
vllm_config
,
prefix
=
maybe_prefix
(
prefix
,
"language_model"
),
)
)
# Whether to include the gate_up_proj mapping is determined by
# Whether to include the gate_up_proj mapping is determined by
# the language model.
# the language model.
self
.
packed_modules_mapping
=
(
self
.
packed_modules_mapping
=
(
...
...
vllm/model_executor/models/kimi_audio.py
View file @
7a6ebcbf
...
@@ -15,7 +15,6 @@ from transformers import WhisperConfig as HFWhisperConfig
...
@@ -15,7 +15,6 @@ from transformers import WhisperConfig as HFWhisperConfig
from
vllm.config
import
ModelConfig
,
SpeechToTextConfig
,
VllmConfig
from
vllm.config
import
ModelConfig
,
SpeechToTextConfig
,
VllmConfig
from
vllm.config.multimodal
import
BaseDummyOptions
from
vllm.config.multimodal
import
BaseDummyOptions
from
vllm.inputs.data
import
PromptType
,
TokensPrompt
from
vllm.inputs.data
import
PromptType
,
TokensPrompt
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.model_loader
import
DefaultModelLoader
from
vllm.model_executor.model_loader
import
DefaultModelLoader
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.models.interfaces
import
(
from
vllm.model_executor.models.interfaces
import
(
...
@@ -54,7 +53,6 @@ from vllm.tokenizers import cached_get_tokenizer
...
@@ -54,7 +53,6 @@ from vllm.tokenizers import cached_get_tokenizer
from
vllm.tokenizers.kimi_audio
import
KimiAudioTokenizer
from
vllm.tokenizers.kimi_audio
import
KimiAudioTokenizer
from
vllm.transformers_utils.processor
import
cached_feature_extractor_from_config
from
vllm.transformers_utils.processor
import
cached_feature_extractor_from_config
from
vllm.transformers_utils.processors.kimi_audio
import
KimiAudioProcessor
from
vllm.transformers_utils.processors.kimi_audio
import
KimiAudioProcessor
from
vllm.v1.sample.metadata
import
SamplingMetadata
# Kimi-Audio constants
# Kimi-Audio constants
KIMIA_WHISPER_SUBFOLDER
=
"whisper-large-v3"
KIMIA_WHISPER_SUBFOLDER
=
"whisper-large-v3"
...
@@ -431,17 +429,18 @@ class KimiAudioForConditionalGeneration(
...
@@ -431,17 +429,18 @@ class KimiAudioForConditionalGeneration(
)
)
]
]
with
self
.
_mark_tower_model
(
vllm_config
,
"audio"
):
self
.
audio_tower
=
KimiAudioWhisperEncoder
(
self
.
audio_tower
=
KimiAudioWhisperEncoder
(
vllm_config
=
vllm_config
,
vllm_config
=
vllm_config
,
prefix
=
maybe_prefix
(
prefix
,
"audio_tower"
),
prefix
=
maybe_prefix
(
prefix
,
"audio_tower"
),
)
)
self
.
multi_modal_projector
=
KimiAudioMultiModalProjector
(
self
.
multi_modal_projector
=
KimiAudioMultiModalProjector
(
whisper_dim
=
getattr
(
self
.
config
,
"kimia_adaptor_input_dim"
,
5120
),
whisper_dim
=
getattr
(
self
.
config
,
"kimia_adaptor_input_dim"
,
5120
),
llm_dim
=
self
.
config
.
hidden_size
,
llm_dim
=
self
.
config
.
hidden_size
,
prefix
=
maybe_prefix
(
prefix
,
"multi_modal_projector"
),
prefix
=
maybe_prefix
(
prefix
,
"multi_modal_projector"
),
)
)
with
self
.
_mark_language_model
(
vllm_config
):
self
.
language_model
=
init_vllm_registered_model
(
self
.
language_model
=
init_vllm_registered_model
(
vllm_config
=
vllm_config
.
with_hf_config
(
vllm_config
=
vllm_config
.
with_hf_config
(
self
.
config
,
architectures
=
[
"Qwen2ForCausalLM"
]
self
.
config
,
architectures
=
[
"Qwen2ForCausalLM"
]
...
@@ -449,11 +448,6 @@ class KimiAudioForConditionalGeneration(
...
@@ -449,11 +448,6 @@ class KimiAudioForConditionalGeneration(
prefix
=
maybe_prefix
(
prefix
,
"language_model"
),
prefix
=
maybe_prefix
(
prefix
,
"language_model"
),
)
)
self
.
logits_processor
=
LogitsProcessor
(
self
.
config
.
vocab_size
,
self
.
config
.
vocab_size
,
)
self
.
make_empty_intermediate_tensors
=
(
self
.
make_empty_intermediate_tensors
=
(
self
.
language_model
.
make_empty_intermediate_tensors
self
.
language_model
.
make_empty_intermediate_tensors
)
)
...
@@ -595,12 +589,8 @@ class KimiAudioForConditionalGeneration(
...
@@ -595,12 +589,8 @@ class KimiAudioForConditionalGeneration(
def
compute_logits
(
def
compute_logits
(
self
,
self
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
|
None
=
None
,
)
->
torch
.
Tensor
|
None
:
)
->
torch
.
Tensor
|
None
:
logits
=
self
.
logits_processor
(
return
self
.
language_model
.
compute_logits
(
hidden_states
)
self
.
language_model
.
lm_head
,
hidden_states
,
sampling_metadata
)
return
logits
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]])
->
set
[
str
]:
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]])
->
set
[
str
]:
"""Load weights, skipping MIMO layers (TTS-only) for ASR."""
"""Load weights, skipping MIMO layers (TTS-only) for ASR."""
...
...
vllm/model_executor/models/lightonocr.py
View file @
7a6ebcbf
...
@@ -163,13 +163,13 @@ class LightOnOCRForConditionalGeneration(Mistral3ForConditionalGeneration):
...
@@ -163,13 +163,13 @@ class LightOnOCRForConditionalGeneration(Mistral3ForConditionalGeneration):
self
.
config
=
config
self
.
config
=
config
self
.
multimodal_config
=
multimodal_config
self
.
multimodal_config
=
multimodal_config
with
self
.
_mark_tower_model
(
vllm_config
,
"image"
):
self
.
vision_tower
=
init_vision_tower_for_llava
(
self
.
vision_tower
=
init_vision_tower_for_llava
(
config
,
config
,
quant_config
=
quant_config
,
quant_config
=
quant_config
,
require_post_norm
=
False
,
require_post_norm
=
False
,
prefix
=
maybe_prefix
(
prefix
,
"vision_tower"
),
prefix
=
maybe_prefix
(
prefix
,
"vision_tower"
),
)
)
self
.
multi_modal_projector
=
Mistral3MultiModalProjector
(
self
.
multi_modal_projector
=
Mistral3MultiModalProjector
(
vision_hidden_size
=
config
.
vision_config
.
hidden_size
,
vision_hidden_size
=
config
.
vision_config
.
hidden_size
,
text_hidden_size
=
config
.
text_config
.
hidden_size
,
text_hidden_size
=
config
.
text_config
.
hidden_size
,
...
@@ -181,6 +181,7 @@ class LightOnOCRForConditionalGeneration(Mistral3ForConditionalGeneration):
...
@@ -181,6 +181,7 @@ class LightOnOCRForConditionalGeneration(Mistral3ForConditionalGeneration):
prefix
=
maybe_prefix
(
prefix
,
"multi_modal_projector"
),
prefix
=
maybe_prefix
(
prefix
,
"multi_modal_projector"
),
)
)
with
self
.
_mark_language_model
(
vllm_config
):
self
.
language_model
=
init_vllm_registered_model
(
self
.
language_model
=
init_vllm_registered_model
(
vllm_config
=
vllm_config
,
vllm_config
=
vllm_config
,
hf_config
=
config
.
text_config
,
hf_config
=
config
.
text_config
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment