Commit 59b01a00 authored by linhai1's avatar linhai1
Browse files

Support fp8_e4m3 and fp8_e5m2 KV-cache dtypes.

parent 34f0ebb1
......@@ -394,9 +394,11 @@ class DCUMLABackend(AttentionBackend):
getattr(torch, "float8_e5m2", None),
getattr(torch, "float8_e5m2fnuz", None),
):
if k_cache_reshaped.dtype == torch.float8_e4m3fnuz:
if k_cache_reshaped.dtype == torch.float8_e4m3fnuz or \
k_cache_reshaped.dtype == torch.float8_e4m3fn:
kv_cache_dtype="fp8_e4m3"
elif k_cache_reshaped.dtype == torch.float8_e5m2fnuz:
elif k_cache_reshaped.dtype == torch.float8_e5m2fnuz or \
k_cache_reshaped.dtype == torch.float8_e5m2:
kv_cache_dtype="fp8_e5m2"
k_scale = layer.k_scale if layer.k_scale is not None else torch.tensor([1.0], dtype=torch.float32, device=reshape_q.device)
o = self._call_fp8_decode(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment