Commit 04ea3540 authored by linhai1

support fp8_e4m3.

parent 50f7ea0f
@@ -465,9 +465,9 @@ class DCUMLABackend(AttentionBackend):
            getattr(torch, "float8_e5m2fnuz", None),
        ):
            if k_cache_reshaped.dtype == torch.float8_e4m3fnuz:
                # Reinterpret the fnuz fp8 storage as the OCP dtype the
                # fp8 decode kernel accepts, and record the matching tag.
                k_cache_reshaped = k_cache_reshaped.view(torch.float8_e4m3fn)
                kv_cache_dtype = "fp8_e4m3"
            elif k_cache_reshaped.dtype == torch.float8_e5m2fnuz:
                k_cache_reshaped = k_cache_reshaped.view(torch.float8_e5m2)
                kv_cache_dtype = "fp8_e5m2"
            # Default to a unit scale when the layer provides no k_scale.
            k_scale = (
                layer.k_scale
                if layer.k_scale is not None
                else torch.tensor([1.0], dtype=torch.float32, device=reshape_q.device)
            )
            o = self._call_fp8_decode(
                reshape_q,
@@ -476,7 +476,7 @@ class DCUMLABackend(AttentionBackend):
                (forward_batch.seq_lens + self.num_draft_tokens).to(torch.int32),
                layer.scaling,
                k_scale.to(torch.float32),
-               kv_cache_dtype=self.data_type,
+               kv_cache_dtype=kv_cache_dtype,
            )
        else:
            o = self._call_decode(
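For readers outside the codebase: when the kv cache arrives in one of the fnuz fp8 formats, the commit bit-reinterprets it into the corresponding OCP fp8 dtype and threads the matching "fp8_e4m3" / "fp8_e5m2" tag through to the decode call instead of the backend-wide self.data_type. Below is a minimal standalone sketch of that mapping; the normalize_fp8_kv_cache helper and its pass-through fallback are illustrative, not part of the commit.

```python
# Sketch of the fnuz -> OCP fp8 reinterpretation performed in the diff.
# Assumes a PyTorch build that defines the fnuz dtypes (the actual code
# guards this with getattr(torch, "float8_e5m2fnuz", None)).
import torch


def normalize_fp8_kv_cache(k_cache: torch.Tensor):
    """Hypothetical helper: map fnuz fp8 storage to the OCP fp8 dtypes
    and return the dtype tag handed to the fp8 decode kernel."""
    if k_cache.dtype == torch.float8_e4m3fnuz:
        # view() reinterprets the existing 1-byte elements; no copy.
        return k_cache.view(torch.float8_e4m3fn), "fp8_e4m3"
    if k_cache.dtype == torch.float8_e5m2fnuz:
        return k_cache.view(torch.float8_e5m2), "fp8_e5m2"
    # Anything else passes through untouched with no fp8 tag.
    return k_cache, None


cache = torch.empty(8, dtype=torch.float8_e4m3fnuz)
cache, kv_cache_dtype = normalize_fp8_kv_cache(cache)
assert cache.dtype == torch.float8_e4m3fn and kv_cache_dtype == "fp8_e4m3"
```

Tensor.view(dtype) with an equal-sized dtype reinterprets the existing bytes without copying. Since the fnuz and OCP encodings assign different values to the same bit pattern, this is a reinterpretation rather than a numeric conversion; dequantization is left to the kernel, which also receives the per-layer k_scale (defaulting to 1.0 in the hunk above).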