Commit a1825fe6 authored by Roger Wang, committed by simon-mo
Browse files

[MM] Add text-only mode for Qwen3-VL (#26000)


Signed-off-by: simon-mo <simon.mo@hey.com>
parent bab9231b
...@@ -1126,7 +1126,10 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal, ...@@ -1126,7 +1126,10 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal,
self.config = config self.config = config
self.multimodal_config = multimodal_config self.multimodal_config = multimodal_config
self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data" self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data"
if not multimodal_config.get_limit_per_prompt("image") and \
not multimodal_config.get_limit_per_prompt("video"):
self.visual = None
else:
self.visual = Qwen3_VisionTransformer( self.visual = Qwen3_VisionTransformer(
config.vision_config, config.vision_config,
norm_eps=getattr(config, "rms_norm_eps", 1e-6), norm_eps=getattr(config, "rms_norm_eps", 1e-6),
...@@ -1149,11 +1152,15 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal, ...@@ -1149,11 +1152,15 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal,
config.vision_config.deepstack_visual_indexes config.vision_config.deepstack_visual_indexes
) if self.use_deepstack else 0 ) if self.use_deepstack else 0
# register buffer for deepstack # register buffer for deepstack
if self.use_deepstack and self.visual is not None:
self.deepstack_input_embeds = [ self.deepstack_input_embeds = [
torch.zeros(vllm_config.scheduler_config.max_num_batched_tokens, torch.zeros(
vllm_config.scheduler_config.max_num_batched_tokens,
config.text_config.hidden_size) config.text_config.hidden_size)
for _ in range(self.deepstack_num_level) for _ in range(self.deepstack_num_level)
] if self.use_deepstack else None ]
else:
self.deepstack_input_embeds = None
self.visual_dim = config.vision_config.out_hidden_size self.visual_dim = config.vision_config.out_hidden_size
self.multiscale_dim = self.visual_dim * self.deepstack_num_level self.multiscale_dim = self.visual_dim * self.deepstack_num_level
...@@ -1588,7 +1595,11 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal, ...@@ -1588,7 +1595,11 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal,
def load_weights(self, weights: Iterable[tuple[str, def load_weights(self, weights: Iterable[tuple[str,
torch.Tensor]]) -> set[str]: torch.Tensor]]) -> set[str]:
loader = AutoWeightsLoader(self)
skip_prefixes = []
if self.visual is None:
skip_prefixes.extend(["visual."])
loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes)
return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
def get_mm_mapping(self) -> MultiModelKeys: def get_mm_mapping(self) -> MultiModelKeys:
......
...@@ -319,6 +319,10 @@ class Qwen3VLMoeForConditionalGeneration(Qwen3VLForConditionalGeneration): ...@@ -319,6 +319,10 @@ class Qwen3VLMoeForConditionalGeneration(Qwen3VLForConditionalGeneration):
self.multimodal_config = multimodal_config self.multimodal_config = multimodal_config
self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data" self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data"
if not multimodal_config.get_limit_per_prompt("image") and \
not multimodal_config.get_limit_per_prompt("video"):
self.visual = None
else:
self.visual = Qwen3_VisionTransformer( self.visual = Qwen3_VisionTransformer(
config.vision_config, config.vision_config,
norm_eps=getattr(config, "rms_norm_eps", 1e-6), norm_eps=getattr(config, "rms_norm_eps", 1e-6),
...@@ -341,10 +345,14 @@ class Qwen3VLMoeForConditionalGeneration(Qwen3VLForConditionalGeneration): ...@@ -341,10 +345,14 @@ class Qwen3VLMoeForConditionalGeneration(Qwen3VLForConditionalGeneration):
config.vision_config.deepstack_visual_indexes config.vision_config.deepstack_visual_indexes
) if self.use_deepstack else 0 ) if self.use_deepstack else 0
# register buffer for deepstack # register buffer for deepstack
if self.use_deepstack and self.visual is not None:
self.deepstack_input_embeds = [ self.deepstack_input_embeds = [
torch.zeros(vllm_config.scheduler_config.max_num_batched_tokens, torch.zeros(
vllm_config.scheduler_config.max_num_batched_tokens,
config.text_config.hidden_size) config.text_config.hidden_size)
for _ in range(self.deepstack_num_level) for _ in range(self.deepstack_num_level)
] if self.use_deepstack else None ]
else:
self.deepstack_input_embeds = None
self.visual_dim = config.vision_config.out_hidden_size self.visual_dim = config.vision_config.out_hidden_size
self.multiscale_dim = self.visual_dim * self.deepstack_num_level self.multiscale_dim = self.visual_dim * self.deepstack_num_level
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment