[NVIDIA] Fix wrong symmetric sizes for fp4 cases (#12640)

34f7564d · Kaixi Hou · GitHub · 1cfbbc42 · 34f7564d
Unverified Commit 34f7564d authored Nov 04, 2025 by Kaixi Hou Committed by GitHub Nov 04, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 6 additions and 1 deletion

python/sglang/srt/layers/quantization/modelopt_quant.py +6 -1

No files found.
--- a/python/sglang/srt/layers/quantization/modelopt_quant.py
+++ b/python/sglang/srt/layers/quantization/modelopt_quant.py
@@ -1602,8 +1602,13 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
                x_sf = nvfp4_block_scale_interleave(x_sf)

            with use_symmetric_memory(get_tp_group()) as sm:
+                # x may be packed in the fp4 case, so x.shape[1] may not be the output
+                # width; size the buffer from the second GEMM's weight (w2_weight).
                symm_output = torch.empty(
-                    x.shape[0], x.shape[1], dtype=output_dtype, device=x.device
+                    x.shape[0],
+                    layer.w2_weight.shape[1],
+                    dtype=output_dtype,
+                    device=x.device,
                )
                sm.tag(symm_output)