Unverified Commit 04f2abcb authored by Yineng Zhang's avatar Yineng Zhang Committed by GitHub
Browse files

fix: gemma 3 not use softcap (#5622)

parent 506be6b8
...@@ -78,6 +78,11 @@ class ModelConfig: ...@@ -78,6 +78,11 @@ class ModelConfig:
logger.info( logger.info(
"Multimodal is disabled for Llama4. To enable it, set --enable-llama4-multimodal." "Multimodal is disabled for Llama4. To enable it, set --enable-llama4-multimodal."
) )
elif self.hf_config.architectures[0] == "Gemma3ForConditionalGeneration":
enable_multimodal = False
logger.info(
"Multimodal is disabled for Gemma3. To enable it, set --enable-gemma3-multimodal."
)
else: else:
enable_multimodal = True enable_multimodal = True
......
...@@ -189,7 +189,7 @@ class Gemma3Attention(nn.Module): ...@@ -189,7 +189,7 @@ class Gemma3Attention(nn.Module):
self.scaling, self.scaling,
num_kv_heads=self.num_kv_heads, num_kv_heads=self.num_kv_heads,
layer_id=layer_id, layer_id=layer_id,
logit_cap=getattr(self.config, "attn_logit_softcapping", None), logit_cap=0.0,
# Module must also define `get_attention_sliding_window_size` to correctly initialize # Module must also define `get_attention_sliding_window_size` to correctly initialize
# attention backend in `ForwardBatch`. # attention backend in `ForwardBatch`.
sliding_window_size=self.sliding_window, sliding_window_size=self.sliding_window,
......
...@@ -154,6 +154,7 @@ class ServerArgs: ...@@ -154,6 +154,7 @@ class ServerArgs:
disable_outlines_disk_cache: bool = False disable_outlines_disk_cache: bool = False
disable_custom_all_reduce: bool = False disable_custom_all_reduce: bool = False
enable_llama4_multimodal: Optional[bool] = None enable_llama4_multimodal: Optional[bool] = None
enable_gemma3_multimodal: Optional[bool] = None
disable_overlap_schedule: bool = False disable_overlap_schedule: bool = False
enable_mixed_chunk: bool = False enable_mixed_chunk: bool = False
enable_dp_attention: bool = False enable_dp_attention: bool = False
...@@ -285,7 +286,9 @@ class ServerArgs: ...@@ -285,7 +286,9 @@ class ServerArgs:
if self.grammar_backend is None: if self.grammar_backend is None:
self.grammar_backend = "xgrammar" self.grammar_backend = "xgrammar"
self.enable_multimodal: Optional[bool] = self.enable_llama4_multimodal self.enable_multimodal: Optional[bool] = (
self.enable_llama4_multimodal or self.enable_gemma3_multimodal
)
# Data parallelism attention # Data parallelism attention
if self.enable_dp_attention: if self.enable_dp_attention:
...@@ -984,6 +987,12 @@ class ServerArgs: ...@@ -984,6 +987,12 @@ class ServerArgs:
action="store_true", action="store_true",
help="Enable the multimodal functionality for Llama-4.", help="Enable the multimodal functionality for Llama-4.",
) )
parser.add_argument(
"--enable-gemma3-multimodal",
default=ServerArgs.enable_gemma3_multimodal,
action="store_true",
help="Enable the multimodal functionality for Gemma-3.",
)
parser.add_argument( parser.add_argument(
"--disable-overlap-schedule", "--disable-overlap-schedule",
action="store_true", action="store_true",
......
...@@ -1971,6 +1971,7 @@ def is_fa3_default_architecture(hf_config): ...@@ -1971,6 +1971,7 @@ def is_fa3_default_architecture(hf_config):
"LlamaForCausalLM", "LlamaForCausalLM",
"MistralForCausalLM", "MistralForCausalLM",
"Gemma2ForCausalLM", "Gemma2ForCausalLM",
"Gemma3ForConditionalGeneration",
} }
return architectures[0] in default_archs return architectures[0] in default_archs
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment