sglang · Commit c10b8e6a (unverified)

Support DP attention with GPT-OSS (#9359)

Authored Aug 20, 2025 by Nicolas Castet; committed by GitHub on Aug 20, 2025.
Parent: d4bce297
Showing 2 changed files with 6 additions and 5 deletions (+6, -5):

    python/sglang/srt/models/gpt_oss.py   +1, -1
    python/sglang/srt/server_args.py      +5, -4
python/sglang/srt/models/gpt_oss.py

@@ -1091,7 +1091,7 @@ class GptOssForCausalLM(nn.Module):
             if name in params_dict.keys():
                 param = params_dict[name]
                 if "sinks" in name:
-                    start = tp_rank * param.numel()
+                    start = get_attention_tp_rank() * param.numel()
                     param.data.copy_(
                         loaded_weight[start : start + param.numel()]
                     )
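The gpt_oss.py change fixes loading of the per-head attention "sinks" parameter. When DP attention is enabled, sglang shards attention over a smaller attention tensor-parallel group rather than the full TP group, so the slice offset into the checkpoint tensor must come from get_attention_tp_rank() instead of the global tp_rank. A minimal sketch of the offset arithmetic, using a hypothetical shard_sinks helper (not sglang API); only the slice computation mirrors the patched line:

    # Sketch only: shard_sinks is a hypothetical helper; the slice
    # arithmetic mirrors the '+' line in load_weights above.
    import torch

    def shard_sinks(loaded_weight: torch.Tensor, attn_tp_rank: int, attn_tp_size: int) -> torch.Tensor:
        shard = loaded_weight.numel() // attn_tp_size  # heads owned per attention-TP rank
        start = attn_tp_rank * shard                   # offset, as in the patched line
        return loaded_weight[start : start + shard].clone()

    # Example: 8 heads, tp_size=4 arranged as dp=2 x attn_tp=2.
    # A GPU with global tp_rank=3 has attn_tp_rank=1 and must read [4:8];
    # indexing by the global rank (3 * 4 = 12) would run past the tensor.
    sinks = torch.arange(8.0)
    print(shard_sinks(sinks, attn_tp_rank=1, attn_tp_size=2))  # tensor([4., 5., 6., 7.])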
python/sglang/srt/server_args.py

@@ -2183,10 +2183,11 @@ class ServerArgs:
             ), f"GptOssForCausalLM requires one of {supported_backends} attention backend, but got '{self.attention_backend}'"

             if is_sm100_supported():
-                self.enable_flashinfer_allreduce_fusion = True
-                logger.info(
-                    "Enable FlashInfer AllReduce Fusion on sm100 for GptOssForCausalLM"
-                )
+                if not self.enable_dp_attention:
+                    self.enable_flashinfer_allreduce_fusion = True
+                    logger.info(
+                        "Enable FlashInfer AllReduce Fusion on sm100 for GptOssForCausalLM"
+                    )
             quantization_config = getattr(hf_config, "quantization_config", None)
             is_mxfp4_quant_format = (
                 quantization_config is not None
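The server_args.py change gates the SM100 (Blackwell) auto-enable of FlashInfer allreduce fusion on DP attention being off, presumably because the fused allreduce path targets the pure tensor-parallel communication pattern, which changes when attention runs data-parallel. A self-contained sketch of the gating, using stand-in names rather than the real ServerArgs class:

    # Sketch with stand-in names; mirrors the condition added by this commit.
    from dataclasses import dataclass

    @dataclass
    class Args:
        enable_dp_attention: bool = False
        enable_flashinfer_allreduce_fusion: bool = False

    def maybe_enable_fusion(args: Args, is_sm100: bool) -> None:
        # Fusion is auto-enabled only on SM100 and only without DP attention.
        if is_sm100 and not args.enable_dp_attention:
            args.enable_flashinfer_allreduce_fusion = True

    args = Args(enable_dp_attention=True)
    maybe_enable_fusion(args, is_sm100=True)
    assert args.enable_flashinfer_allreduce_fusion is False  # stays off under DP attention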