[feature] Ascend NPU graph support (#8027)

Co-authored-by: ronnie_zheng <zl19940307@163.com>
Co-authored-by: yezhifeng (D) <y00897525@china.huawei.com>
Co-authored-by: anon189Ty <Stari_Falcon@outlook.com>
Co-authored-by: Maksim <makcum888e@mail.ru>
Co-authored-by: ssshinigami <44640852+ssshinigami@users.noreply.github.com>

[feature] Ascend NPU graph support (#8027)
Co-authored-by: ronnie_zheng <zl19940307@163.com>
Co-authored-by: yezhifeng (D) <y00897525@china.huawei.com>
Co-authored-by: anon189Ty <Stari_Falcon@outlook.com>
Co-authored-by: Maksim <makcum888e@mail.ru>
Co-authored-by: ssshinigami <44640852+ssshinigami@users.noreply.github.com>
94371dbb · VDV1985 · GitHub · 740f0630 · 94371dbb · 94371dbb
Unverified Commit 94371dbb authored Aug 17, 2025 by VDV1985 Committed by GitHub Aug 16, 2025
18 changed files
--- a/benchmark/kernels/fused_moe_triton/benchmark_torch_compile_fused_moe.py
+++ b/benchmark/kernels/fused_moe_triton/benchmark_torch_compile_fused_moe.py
@@ -9,7 +9,7 @@ from transformers import AutoConfig
 from sglang.srt.layers.moe.fused_moe_triton.fused_moe import (
    fused_moe as fused_moe_triton,
 )
-from sglang.srt.model_executor.cuda_graph_runner import set_torch_compile_config
+from sglang.srt.model_executor.graph_runner import set_torch_compile_config
 def get_model_config(model_name: str, tp_size: int):

--- a/python/sglang/srt/distributed/parallel_state.py
+++ b/python/sglang/srt/distributed/parallel_state.py
@@ -55,7 +55,7 @@ _is_npu = is_npu()
 @dataclass
 class GraphCaptureContext:
-    stream: torch.cuda.Stream
+    stream: torch.cuda.Stream if not _is_npu else torch.npu.Stream
 TensorMetadata = namedtuple("TensorMetadata", ["device", "dtype", "size"])
@@ -252,9 +252,13 @@ class GroupCoordinator:
        if is_cuda_alike():
            self.device = torch.device(f"cuda:{local_rank}")
+        elif _is_npu:
+            self.device = torch.device(f"npu:{local_rank}")
        else:
            self.device = torch.device("cpu")
+        self.device_module = torch.get_device_module(self.device)
        self.use_pynccl = use_pynccl
        self.use_pymscclpp = use_pymscclpp
        self.use_custom_allreduce = use_custom_allreduce
@@ -402,7 +406,7 @@ class GroupCoordinator:
        self, graph_capture_context: Optional[GraphCaptureContext] = None
    ):
        if graph_capture_context is None:
-            stream = torch.cuda.Stream()
+            stream = self.device_module.Stream()
            graph_capture_context = GraphCaptureContext(stream)
        else:
            stream = graph_capture_context.stream
@@ -413,11 +417,11 @@ class GroupCoordinator:
        # ensure all initialization operations complete before attempting to
        # capture the graph on another stream
-        curr_stream = torch.cuda.current_stream()
+        curr_stream = self.device_module.current_stream()
        if curr_stream != stream:
            stream.wait_stream(curr_stream)
-        with torch.cuda.stream(stream), maybe_ca_context:
+        with self.device_module.stream(stream), maybe_ca_context:
            # In graph mode, we have to be very careful about the collective
            # operations. The current status is:
            #     allreduce \ Mode   |  Eager  |  Graph  |
@@ -1641,6 +1645,8 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
                )
        elif hasattr(torch, "xpu") and torch.xpu.is_available():
            torch.xpu.empty_cache()
+        elif hasattr(torch, "npu") and torch.npu.is_available():
+            torch.npu.empty_cache()
 def in_the_same_node_as(pg: ProcessGroup, source_rank: int = 0) -> List[bool]:

--- a/python/sglang/srt/layers/attention/ascend_backend.py
+++ b/python/sglang/srt/layers/attention/ascend_backend.py
 from __future__ import annotations
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING, List, Optional
 import torch
 import torch_npu
@@ -27,6 +27,7 @@ class ForwardMetadata:
    # seq len inputs
    extend_seq_lens_cpu_int: Optional[torch.Tensor] = None
    seq_lens_cpu_int: Optional[torch.Tensor] = None
+    seq_lens_cpu_list: Optional[List[int]] = None
 class AscendAttnBackend(AttentionBackend):
@@ -51,7 +52,7 @@ class AscendAttnBackend(AttentionBackend):
    def __init__(self, model_runner: ModelRunner):
        super().__init__()
-        self.forward_metadata = ForwardMetadata()
+        self.forward_metadata = None
        self.device = model_runner.device
        self.gen_attention_mask(128, model_runner.dtype)
        self.page_size = model_runner.page_size
@@ -60,9 +61,15 @@ class AscendAttnBackend(AttentionBackend):
            self.kv_lora_rank = model_runner.model_config.kv_lora_rank
            self.qk_rope_head_dim = model_runner.model_config.qk_rope_head_dim
            self.native_attn = TorchNativeAttnBackend(model_runner)
+        self.graph_metadata = {}
+        self.max_context_len = model_runner.model_config.context_len
+        self.req_to_token = model_runner.req_to_token_pool.req_to_token
+        self.graph_mode = False
    def init_forward_metadata(self, forward_batch: ForwardBatch):
        """Init the metadata for a forward pass."""
+        self.forward_metadata = ForwardMetadata()
        self.forward_metadata.block_tables = (
            forward_batch.req_to_token_pool.req_to_token[
                forward_batch.req_pool_indices, : forward_batch.seq_lens.max()
@@ -75,6 +82,63 @@ class AscendAttnBackend(AttentionBackend):
            )
        self.forward_metadata.seq_lens_cpu_int = forward_batch.seq_lens_cpu.int()
+        self.graph_mode = False
+    def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int):
+        """Allocate the block-table buffer shared by graph capture and replay.
+
+        Replaces ``self.graph_metadata`` with a dict holding a single
+        uninitialized int32 tensor under the key ``"block_tables"``, shaped
+        ``(max_bs, self.max_context_len // self.page_size)`` and placed on
+        ``self.device``. ``max_num_tokens`` is accepted but not read here.
+        """
+        self.graph_metadata = {
+            "block_tables": torch.empty(
+                (max_bs, self.max_context_len // self.page_size),
+                dtype=torch.int32,
+                device=self.device,
+            ),
+        }
+    def init_forward_metadata_capture_cuda_graph(
+        self,
+        bs: int,
+        num_tokens: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+    ):
+        """Build the attention metadata used while capturing a graph for ``bs``.
+
+        The new metadata's block table is the first ``bs`` rows of the shared
+        ``graph_metadata["block_tables"]`` buffer, and ``seq_lens`` is copied
+        to the host as a list of ints. The metadata is cached in
+        ``self.graph_metadata`` under the integer key ``bs`` (next to the
+        string key ``"block_tables"``), installed as ``self.forward_metadata``,
+        and ``self.graph_mode`` is switched on.
+
+        ``num_tokens``, ``req_pool_indices``, ``encoder_lens``,
+        ``forward_mode`` and ``spec_info`` are accepted but not read here.
+        """
+        # NOTE(review): ``Union`` is not in the ``typing`` import shown for this
+        # file in the diff. It is only used inside annotations, which stay
+        # unevaluated under ``from __future__ import annotations`` — confirm
+        # and add the import so linters and get_type_hints() keep working.
+        metadata = ForwardMetadata()
+        metadata.block_tables = self.graph_metadata["block_tables"][:bs, :]
+        metadata.seq_lens_cpu_list = seq_lens.cpu().int().tolist()
+        self.graph_metadata[bs] = metadata
+        self.forward_metadata = metadata
+        self.graph_mode = True
+    def init_forward_metadata_replay_cuda_graph(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+        seq_lens_cpu: Optional[torch.Tensor],
+    ):
+        """Refresh the cached metadata for batch size ``bs`` ahead of a replay.
+
+        Fetches the metadata stored under ``bs`` in ``self.graph_metadata``
+        and rewrites its block table in place from ``self.req_to_token``,
+        then installs it as ``self.forward_metadata`` and switches
+        ``self.graph_mode`` on. ``seq_lens_cpu_list`` on the metadata is not
+        updated here.
+
+        ``seq_lens``, ``seq_lens_sum``, ``encoder_lens``, ``forward_mode`` and
+        ``spec_info`` are accepted but not read here. ``seq_lens_cpu`` is
+        annotated as optional but is indexed unconditionally, so passing
+        ``None`` raises.
+        """
+        metadata = self.graph_metadata[bs]
+        # Longest sequence among the first ``bs`` entries, and the number of
+        # pages needed to hold it (ceiling division by the page size).
+        max_len = seq_lens_cpu[:bs].max().item()
+        max_seq_pages = (max_len + self.page_size - 1) // self.page_size
+        # Take every ``page_size``-th entry of each request's row and
+        # integer-divide it by ``page_size``; presumably this turns token
+        # slots into page indices — TODO confirm against the pool layout.
+        metadata.block_tables[:bs, :max_seq_pages].copy_(
+            self.req_to_token[req_pool_indices[:bs], :max_len][:, :: self.page_size]
+            // self.page_size
+        )
+        # Zero the columns beyond the pages that were just written.
+        metadata.block_tables[:bs, max_seq_pages:].fill_(0)
+        # NOTE(review): the capture path stores ``block_tables`` already sliced
+        # to ``[:bs, :]``, so this row selection looks empty — confirm whether
+        # the shared buffer's padding rows were meant to be cleared instead.
+        metadata.block_tables[bs:, :].fill_(0)
+        self.forward_metadata = metadata
+        self.graph_mode = True
    def get_cuda_graph_seq_len_fill_value(self):
        return 1
@@ -167,28 +231,74 @@ class AscendAttnBackend(AttentionBackend):
                layer, forward_batch.out_cache_loc, k, v
            )
        if not self.use_mla:
-            k_cache = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id)
+            if self.graph_mode:
-            v_cache = forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id)
+                k_cache = forward_batch.token_to_kv_pool.get_key_buffer(
+                    layer.layer_id
+                ).view(-1, self.page_size, layer.tp_k_head_num * layer.qk_head_dim)
+                v_cache = forward_batch.token_to_kv_pool.get_value_buffer(
+                    layer.layer_id
+                ).view(-1, self.page_size, layer.tp_v_head_num * layer.v_head_dim)
+                query = q.view(-1, 1, layer.tp_q_head_num * layer.qk_head_dim)
+                num_tokens = query.shape[0]
+                workspace = (
+                    torch_npu._npu_fused_infer_attention_score_get_max_workspace(
+                        query,
+                        k_cache,
+                        v_cache,
+                        block_table=self.forward_metadata.block_tables,
+                        block_size=self.page_size,
+                        num_heads=layer.tp_q_head_num,
+                        num_key_value_heads=layer.tp_k_head_num,
+                        input_layout="BSH",
+                        scale=layer.scaling,
+                        actual_seq_lengths_kv=self.forward_metadata.seq_lens_cpu_list,
+                    )
+                )
+                output = torch.empty(
+                    (num_tokens, 1, layer.tp_q_head_num * layer.v_head_dim),
+                    dtype=q.dtype,
+                    device=q.device,
+                )
+                softmax_lse = torch.empty(1, dtype=q.dtype, device=q.device)
+                torch_npu.npu_fused_infer_attention_score.out(
+                    query,
+                    k_cache,
+                    v_cache,
+                    block_table=self.forward_metadata.block_tables,
+                    block_size=self.page_size,
+                    num_heads=layer.tp_q_head_num,
+                    num_key_value_heads=layer.tp_k_head_num,
+                    input_layout="BSH",
+                    scale=layer.scaling,
+                    actual_seq_lengths_kv=self.forward_metadata.seq_lens_cpu_list,
+                    workspace=workspace,
+                    out=[output, softmax_lse],
+                )
+            else:
+                k_cache = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id)
+                v_cache = forward_batch.token_to_kv_pool.get_value_buffer(
+                    layer.layer_id
+                )
-            query = q.view(-1, layer.tp_q_head_num, layer.qk_head_dim)
+                query = q.view(-1, layer.tp_q_head_num, layer.qk_head_dim)
-            num_tokens = query.shape[0]
+                num_tokens = query.shape[0]
-            output = torch.empty(
+                output = torch.empty(
-                (num_tokens, layer.tp_q_head_num, layer.v_head_dim),
+                    (num_tokens, layer.tp_q_head_num, layer.v_head_dim),
-                dtype=query.dtype,
+                    dtype=query.dtype,
-                device=query.device,
+                    device=query.device,
-            )
+                )
-            torch_npu._npu_paged_attention(
+                torch_npu._npu_paged_attention(
-                query=query,
+                    query=query,
-                key_cache=k_cache,
+                    key_cache=k_cache,
-                value_cache=v_cache,
+                    value_cache=v_cache,
-                num_heads=layer.tp_q_head_num,
+                    num_heads=layer.tp_q_head_num,
-                num_kv_heads=layer.tp_k_head_num,
+                    num_kv_heads=layer.tp_k_head_num,
-                scale_value=layer.scaling,
+                    scale_value=layer.scaling,
-                block_table=self.forward_metadata.block_tables,
+                    block_table=self.forward_metadata.block_tables,
-                context_lens=self.forward_metadata.seq_lens_cpu_int,
+                    context_lens=self.forward_metadata.seq_lens_cpu_int,
-                out=output,
+                    out=output,
-            )
+                )
            return output.view(num_tokens, layer.tp_q_head_num * layer.v_head_dim)
        else:
            query = q.view(-1, layer.tp_q_head_num, layer.head_dim)
@@ -220,3 +330,6 @@ class AscendAttnBackend(AttentionBackend):
                out=attn_output,
            )
            return attn_output.view(num_tokens, layer.tp_q_head_num * self.kv_lora_rank)
+    def get_cuda_graph_seq_len_fill_value(self):
+        """Return 0, the fill value reported for graph sequence lengths."""
+        # NOTE(review): a method with this exact name that returns 1 is visible
+        # earlier in this diff as unchanged context. If both live in
+        # AscendAttnBackend, this later definition is the one that takes
+        # effect and the earlier one is dead code — confirm and remove it.
+        return 0
--- a/python/sglang/srt/mem_cache/memory_pool.py
+++ b/python/sglang/srt/mem_cache/memory_pool.py
@@ -376,7 +376,7 @@ class MHATokenToKVPool(KVCache):
        v_scale: Optional[float] = None,
        layer_id_override: Optional[int] = None,
    ):
-        from sglang.srt.model_executor.cuda_graph_runner import get_is_capture_mode
+        from sglang.srt.model_executor.graph_runner import get_is_capture_mode
        if layer_id_override is not None:
            layer_id = layer_id_override

--- a/python/sglang/srt/model_executor/cuda_graph_runner.py
+++ b/python/sglang/srt/model_executor/cuda_graph_runner.py
--- a/python/sglang/srt/model_executor/graph_runner.py
+++ b/python/sglang/srt/model_executor/graph_runner.py
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -91,6 +91,7 @@ from sglang.srt.mem_cache.memory_pool import (
 )
 from sglang.srt.model_executor.cuda_graph_runner import CudaGraphRunner
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_executor.npu_graph_runner import NPUGraphRunner
 from sglang.srt.model_loader import get_model
 from sglang.srt.model_loader.loader import DefaultModelLoader, get_model_loader
 from sglang.srt.model_loader.utils import set_default_torch_dtype
@@ -336,9 +337,12 @@ class ModelRunner:
        if self.device == "cuda":
            self.init_cublas()
            self.init_attention_backend()
-            self.init_cuda_graphs()
+            self.init_device_graphs()
+        elif self.device == "npu":
+            self.init_attention_backend()
+            self.init_device_graphs()
        else:
-            self.cuda_graph_runner = None
+            self.graph_runner = None
            self.cuda_graph_mem_usage = 0
            self.init_attention_backend()
@@ -912,7 +916,8 @@ class ModelRunner:
            )
        # We need to get device after patch otherwise the device would be wrong
-        infered_device = torch.cuda.current_device()
+        self.device_module = torch.get_device_module(self.device)
+        infered_device = self.device_module.current_device()
        named_tensors = [
            (name, _unwrap_tensor(tensor, tp_rank=self.tp_rank, device=infered_device))
@@ -1588,9 +1593,9 @@ class ModelRunner:
                .cuda()
            )
-    def init_cuda_graphs(self):
+    def init_device_graphs(self):
        """Capture cuda graphs."""
-        self.cuda_graph_runner = None
+        self.graph_runner = None
        self.cuda_graph_mem_usage = 0
        if not self.is_generation:
@@ -1605,8 +1610,9 @@ class ModelRunner:
        logger.info(
            f"Capture cuda graph begin. This can take up to several minutes. avail mem={before_mem:.2f} GB"
        )
-        self.cuda_graph_runner = CudaGraphRunner(self)
+        self.graph_runner = (
+            CudaGraphRunner(self) if not _is_npu else NPUGraphRunner(self)
+        )
        after_mem = get_available_gpu_memory(self.device, self.gpu_id)
        self.cuda_graph_mem_usage = before_mem - after_mem
        logger.info(
@@ -1758,11 +1764,11 @@ class ModelRunner:
    ) -> Tuple[Union[LogitsProcessorOutput, PPProxyTensors], bool]:
        can_run_cuda_graph = bool(
            forward_batch.forward_mode.is_cuda_graph()
-            and self.cuda_graph_runner
+            and self.graph_runner
-            and self.cuda_graph_runner.can_run(forward_batch)
+            and self.graph_runner.can_run(forward_batch)
        )
        if can_run_cuda_graph:
-            ret = self.cuda_graph_runner.replay(
+            ret = self.graph_runner.replay(
                forward_batch,
                skip_attn_backend_init=skip_attn_backend_init,
                pp_proxy_tensors=pp_proxy_tensors,

--- a/python/sglang/srt/model_executor/npu_graph_runner.py
+++ b/python/sglang/srt/model_executor/npu_graph_runner.py
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Run the model with npu graph and torch.compile."""
+from __future__ import annotations
+import logging
+import threading
+from typing import TYPE_CHECKING, Optional, Union
+import torch
+from sglang.srt.model_executor.graph_runner import GraphRunner
+logger = logging.getLogger(__name__)
+if TYPE_CHECKING:
+    from sglang.srt.model_executor.model_runner import ModelRunner
+from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+class NPUGraphRunner(GraphRunner):
+    """An NPUGraphRunner runs the forward pass of a model with npu graph and torch.compile.
+
+    Subclass of ``GraphRunner`` that supplies the ``torch.npu`` graph object,
+    the capture context, the cache-location dtype and the replay routine.
+    The small underscore-prefixed methods are presumably hooks invoked by
+    ``GraphRunner`` — the base class is not visible here, TODO confirm.
+    """
+    def __init__(self, model_runner: ModelRunner):
+        """Delegate all initialization to ``GraphRunner.__init__``."""
+        super().__init__(model_runner)
+    def _create_device_graph(self):
+        """Return a new ``torch.npu.NPUGraph`` instance."""
+        return torch.npu.NPUGraph()
+    def _capture_graph(self, graph, pool, stream, run_once_fn):
+        """Call ``run_once_fn`` inside a ``torch.npu.graph`` context and return its result.
+
+        ``graph``, ``pool`` and ``stream`` are forwarded to ``torch.npu.graph``
+        unchanged, together with ``auto_dispatch_capture=True``.
+        """
+        with torch.npu.graph(
+            graph,
+            pool=pool,
+            stream=stream,
+            auto_dispatch_capture=True,
+        ):
+            out = run_once_fn()
+        return out
+    def _update_inputs(self, seq_lens):
+        """Pass ``seq_lens`` to the graph stored for ``self.bs`` via ``update``.
+
+        The values are supplied as the ``actual_seq_lengths_kv`` entry of
+        ``cpu_update_input``. ``replay`` runs this method on a helper thread.
+        """
+        self.graphs[self.bs].update(
+            cpu_update_input=[{"actual_seq_lengths_kv": seq_lens}]
+        )
+    def _cache_loc_dtype(self):
+        """Return ``torch.int32`` as the cache-location dtype."""
+        return torch.int32
+    def replay(
+        self,
+        forward_batch: ForwardBatch,
+        skip_attn_backend_init: bool = False,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> Union[LogitsProcessorOutput, PPProxyTensors]:
+        """Replay the graph stored for ``self.bs`` and return its sliced output.
+
+        When ``skip_attn_backend_init`` is false, ``replay_prepare`` is called
+        first; otherwise only ``input_ids`` and ``positions`` are copied into
+        the runner's buffers. ``self.bs``, ``self.raw_bs`` and
+        ``self.raw_num_token`` are read but not assigned in this method —
+        presumably ``replay_prepare`` sets them, TODO confirm.
+
+        Returns a ``LogitsProcessorOutput`` whose tensors are truncated to
+        ``self.raw_num_token`` rows, or a ``PPProxyTensors`` whose tensors are
+        truncated to ``self.bs`` rows.
+        """
+        if not skip_attn_backend_init:
+            self.replay_prepare(forward_batch, pp_proxy_tensors)
+        else:
+            # In speculative decoding, these two fields are still needed.
+            self.input_ids[: self.raw_num_token].copy_(forward_batch.input_ids)
+            self.positions[: self.raw_num_token].copy_(forward_batch.positions)
+        # Replay
+        # Host-side sequence lengths, extended with one 0 per padding slot
+        # (``self.bs - self.raw_bs`` of them).
+        seq_lens = forward_batch.seq_lens.cpu().tolist() + [0] * (self.bs - self.raw_bs)
+        # The input update is started on a helper thread before the replay
+        # call and joined right after it.
+        thread = threading.Thread(target=self._update_inputs, args=(seq_lens,))
+        thread.start()
+        self.graphs[self.bs].replay()
+        thread.join()
+        output = self.output_buffers[self.bs]
+        if isinstance(output, LogitsProcessorOutput):
+            return LogitsProcessorOutput(
+                next_token_logits=output.next_token_logits[: self.raw_num_token],
+                hidden_states=(
+                    output.hidden_states[: self.raw_num_token]
+                    if output.hidden_states is not None
+                    else None
+                ),
+            )
+        else:
+            assert isinstance(output, PPProxyTensors)
+            return PPProxyTensors({k: v[: self.bs] for k, v in output.tensors.items()})
--- a/python/sglang/srt/models/deepseek_v2.py
+++ b/python/sglang/srt/models/deepseek_v2.py
@@ -1198,7 +1198,7 @@ class DeepseekV2AttentionMLA(nn.Module):
        forward_batch: ForwardBatch,
        zero_allocator: BumpAllocator,
    ):
-        from sglang.srt.model_executor.cuda_graph_runner import get_is_capture_mode
+        from sglang.srt.model_executor.graph_runner import get_is_capture_mode
        if self.q_lora_rank is not None:
            if hidden_states.shape[0] <= 16 and self.use_min_latency_fused_a_gemm:

--- a/python/sglang/srt/models/glm4_moe.py
+++ b/python/sglang/srt/models/glm4_moe.py
@@ -67,8 +67,8 @@ from sglang.srt.layers.vocab_parallel_embedding import (
    VocabParallelEmbedding,
 )
 from sglang.srt.managers.schedule_batch import global_server_args_dict
-from sglang.srt.model_executor.cuda_graph_runner import get_is_capture_mode
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_executor.graph_runner import get_is_capture_mode
 from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.models.deepseek_v2 import (
    DeepseekV2DecoderLayer,

--- a/python/sglang/srt/models/mllama.py
+++ b/python/sglang/srt/models/mllama.py
@@ -966,7 +966,7 @@ class MllamaForConditionalGeneration(nn.Module):
        positions: torch.Tensor,
        forward_batch: ForwardBatch,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
-        from sglang.srt.model_executor.cuda_graph_runner import get_is_capture_mode
+        from sglang.srt.model_executor.graph_runner import get_is_capture_mode
        batched_images, batched_ar_ids, batched_ar_mask, encoder_lens_need = (
            self._batch_image_inputs(forward_batch)

--- a/python/sglang/srt/models/qwen3.py
+++ b/python/sglang/srt/models/qwen3.py
@@ -22,8 +22,8 @@ from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.layers.rotary_embedding import get_rope
 from sglang.srt.layers.utils import PPMissingLayer, get_layer_id
 from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
-from sglang.srt.model_executor.cuda_graph_runner import get_is_capture_mode
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_executor.graph_runner import get_is_capture_mode
 from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.models.qwen2 import Qwen2MLP as Qwen3MLP
 from sglang.srt.models.qwen2 import Qwen2Model

--- a/python/sglang/srt/models/qwen3_moe.py
+++ b/python/sglang/srt/models/qwen3_moe.py
@@ -52,8 +52,8 @@ from sglang.srt.layers.rotary_embedding import get_rope
 from sglang.srt.layers.utils import get_layer_id
 from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
 from sglang.srt.managers.schedule_batch import global_server_args_dict
-from sglang.srt.model_executor.cuda_graph_runner import get_is_capture_mode
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_executor.graph_runner import get_is_capture_mode
 from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.models.qwen2_moe import Qwen2MoeMLP as Qwen3MoeMLP
 from sglang.srt.models.qwen2_moe import Qwen2MoeModel

--- a/python/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py
+++ b/python/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py
@@ -6,20 +6,20 @@ from typing import TYPE_CHECKING, Callable
 import torch
 from sglang.srt.layers.dp_attention import DpPaddingMode, set_dp_buffer_len
-from sglang.srt.model_executor.cuda_graph_runner import (
+from sglang.srt.model_executor.cuda_graph_runner import CudaGraphRunner
-    CUDA_GRAPH_CAPTURE_FAILED_MSG,
+from sglang.srt.model_executor.forward_batch_info import (
-    CudaGraphRunner,
+    CaptureHiddenMode,
+    ForwardBatch,
+    ForwardMode,
+)
+from sglang.srt.model_executor.graph_runner import (
+    GRAPH_CAPTURE_FAILED_MSG,
    get_batch_sizes_to_capture,
    get_global_graph_memory_pool,
    model_capture_mode,
    set_global_graph_memory_pool,
    set_torch_compile_config,
 )
-from sglang.srt.model_executor.forward_batch_info import (
-    CaptureHiddenMode,
-    ForwardBatch,
-    ForwardMode,
-)
 from sglang.srt.speculative.eagle_utils import EagleDraftInput
 from sglang.srt.utils import (
    require_attn_tp_gather,
@@ -121,7 +121,7 @@ class EAGLEDraftCudaGraphRunner:
                self.capture()
        except RuntimeError as e:
            raise Exception(
-                f"Capture cuda graph failed: {e}\n{CUDA_GRAPH_CAPTURE_FAILED_MSG}"
+                f"Capture cuda graph failed: {e}\n{GRAPH_CAPTURE_FAILED_MSG}"
            )
    def can_run(self, forward_batch: ForwardBatch):

--- a/python/sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py
+++ b/python/sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py
@@ -6,9 +6,14 @@ from typing import TYPE_CHECKING, Callable
 import torch
 from sglang.srt.layers.dp_attention import DpPaddingMode, set_dp_buffer_len
-from sglang.srt.model_executor.cuda_graph_runner import (
+from sglang.srt.model_executor.cuda_graph_runner import CudaGraphRunner
-    CUDA_GRAPH_CAPTURE_FAILED_MSG,
+from sglang.srt.model_executor.forward_batch_info import (
-    CudaGraphRunner,
+    CaptureHiddenMode,
+    ForwardBatch,
+    ForwardMode,
+)
+from sglang.srt.model_executor.graph_runner import (
+    GRAPH_CAPTURE_FAILED_MSG,
    LogitsProcessorOutput,
    get_batch_sizes_to_capture,
    get_global_graph_memory_pool,
@@ -16,11 +21,6 @@ from sglang.srt.model_executor.cuda_graph_runner import (
    set_global_graph_memory_pool,
    set_torch_compile_config,
 )
-from sglang.srt.model_executor.forward_batch_info import (
-    CaptureHiddenMode,
-    ForwardBatch,
-    ForwardMode,
-)
 from sglang.srt.speculative.eagle_utils import EagleDraftInput, fast_topk
 from sglang.srt.utils import (
    require_attn_tp_gather,
@@ -149,7 +149,7 @@ class EAGLEDraftExtendCudaGraphRunner:
                self.capture()
        except RuntimeError as e:
            raise Exception(
-                f"Capture cuda graph failed: {e}\n{CUDA_GRAPH_CAPTURE_FAILED_MSG}"
+                f"Capture cuda graph failed: {e}\n{GRAPH_CAPTURE_FAILED_MSG}"
            )
    def can_run(self, forward_batch: ForwardBatch):

--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -229,6 +229,17 @@ suite_amd = {
        TestFile("test_wave_attention_kernels.py", 2),
        TestFile("test_wave_attention_backend.py", 150),
    ],
+    "per-commit-1-ascend-npu": [
+        TestFile("test_ascend_tp1_bf16.py", 400),
+        TestFile("test_ascend_graph_tp1_bf16.py", 400),
+    ],
+    "per-commit-2-ascend-npu": [
+        TestFile("test_ascend_tp2_bf16.py", 400),
+        TestFile("test_ascend_graph_tp2_bf16.py", 400),
+    ],
+    "per-commit-4-ascend-npu": [
+        TestFile("test_ascend_mla_w8a8int8.py", 400),
+    ],
    "per-commit-2-gpu-amd": [
        TestFile("lora/test_lora_tp.py", 116),
        TestFile("rl/test_update_weights_from_distributed.py", 103),

--- a/test/srt/test_ascend_graph_tp1_bf16.py
+++ b/test/srt/test_ascend_graph_tp1_bf16.py
+import unittest
+from types import SimpleNamespace
+from urllib.parse import urlparse
+from sglang.srt.utils import kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    is_in_ci,
+    popen_launch_server,
+    run_bench_offline_throughput,
+)
+# Per-model thresholds used by the tests in this file, keyed by model name.
+# "accuracy" is the minimum GSM8K accuracy and "output_throughput" the lower
+# bound checked in CI; "latency" is not read by the tests shown here.
+TEST_MODEL_MATRIX = {
+    "Qwen/Qwen2.5-7B-Instruct": {
+        "accuracy": 0.85,
+        "latency": 150,
+        "output_throughput": 30,
+    },
+}
+class TestAscendGraphTp1Bf16(CustomTestCase):
+    """Accuracy and throughput checks for the ``ascend`` attention backend.
+
+    Every model in ``TEST_MODEL_MATRIX`` is exercised with the shared
+    ``common_args``; no tensor-parallel size is passed in this file.
+    """
+    @classmethod
+    def setUpClass(cls):
+        """Store the model names, the test URL and the shared server arguments."""
+        cls.models = TEST_MODEL_MATRIX.keys()
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.url = urlparse(DEFAULT_URL_FOR_TEST)
+        # Mixed str/float list; no flag that disables graph capture is passed.
+        cls.common_args = [
+            "--trust-remote-code",
+            "--mem-fraction-static",
+            0.8,
+            "--attention-backend",
+            "ascend",
+        ]
+    def test_a_gsm8k(self):
+        """Launch a server per model and require the few-shot GSM8K accuracy threshold."""
+        for model in self.models:
+            with self.subTest(model=model):
+                print(f"##=== Testing accuracy: {model} ===##")
+                process = popen_launch_server(
+                    model,
+                    self.base_url,
+                    timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+                    other_args=[
+                        *self.common_args,
+                    ],
+                )
+                # The server process tree is killed even if the evaluation or
+                # the assertion fails.
+                try:
+                    args = SimpleNamespace(
+                        num_shots=5,
+                        data_path=None,
+                        num_questions=1319,
+                        max_new_tokens=512,
+                        parallel=128,
+                        host=f"http://{self.url.hostname}",
+                        port=int(self.url.port),
+                    )
+                    metrics = run_eval_few_shot_gsm8k(args)
+                    self.assertGreaterEqual(
+                        metrics["accuracy"],
+                        TEST_MODEL_MATRIX[model]["accuracy"],
+                    )
+                finally:
+                    kill_process_tree(process.pid)
+    def test_b_throughput(self):
+        """Run the offline throughput benchmark per model and print the result."""
+        for model in self.models:
+            with self.subTest(model=model):
+                print(f"##=== Testing throughput: {model} ===##")
+                output_throughput = run_bench_offline_throughput(
+                    model,
+                    [
+                        *self.common_args,
+                    ],
+                )
+                print(f"##=== {model} throughput: {output_throughput} ===##")
+                # The lower bound is only enforced when running in CI.
+                if is_in_ci():
+                    self.assertGreater(
+                        output_throughput,
+                        TEST_MODEL_MATRIX[model]["output_throughput"],
+                    )
+if __name__ == "__main__":
+    unittest.main()
--- a/test/srt/test_ascend_graph_tp2_bf16.py
+++ b/test/srt/test_ascend_graph_tp2_bf16.py
+import unittest
+from types import SimpleNamespace
+from urllib.parse import urlparse
+from sglang.srt.utils import kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    is_in_ci,
+    popen_launch_server,
+    run_bench_offline_throughput,
+)
+# Per-model thresholds used by the tests in this file, keyed by model name.
+# "accuracy" is the minimum GSM8K accuracy and "output_throughput" the lower
+# bound checked in CI; "latency" is not read by the tests shown here.
+TEST_MODEL_MATRIX = {
+    "Qwen/Qwen2.5-7B-Instruct": {
+        "accuracy": 0.85,
+        "latency": 180,
+        "output_throughput": 20,
+    },
+}
+class TestAscendGraphTp2Bf16(CustomTestCase):
+    """Accuracy and throughput checks for the ``ascend`` attention backend with ``--tp-size 2``.
+
+    Every model in ``TEST_MODEL_MATRIX`` is exercised with the shared
+    ``common_args``.
+    """
+    @classmethod
+    def setUpClass(cls):
+        """Store the model names, the test URL and the shared server arguments."""
+        cls.models = TEST_MODEL_MATRIX.keys()
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.url = urlparse(DEFAULT_URL_FOR_TEST)
+        # Mixed str/float/int list; no flag that disables graph capture is
+        # passed.
+        cls.common_args = [
+            "--trust-remote-code",
+            "--mem-fraction-static",
+            0.8,
+            "--attention-backend",
+            "ascend",
+            "--tp-size",
+            2,
+        ]
+    def test_a_gsm8k(self):
+        """Launch a server per model and require the few-shot GSM8K accuracy threshold."""
+        for model in self.models:
+            with self.subTest(model=model):
+                print(f"##=== Testing accuracy: {model} ===##")
+                process = popen_launch_server(
+                    model,
+                    self.base_url,
+                    timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+                    other_args=[
+                        *self.common_args,
+                    ],
+                )
+                # The server process tree is killed even if the evaluation or
+                # the assertion fails.
+                try:
+                    args = SimpleNamespace(
+                        num_shots=5,
+                        data_path=None,
+                        num_questions=1319,
+                        max_new_tokens=512,
+                        parallel=128,
+                        host=f"http://{self.url.hostname}",
+                        port=int(self.url.port),
+                    )
+                    metrics = run_eval_few_shot_gsm8k(args)
+                    self.assertGreaterEqual(
+                        metrics["accuracy"],
+                        TEST_MODEL_MATRIX[model]["accuracy"],
+                    )
+                finally:
+                    kill_process_tree(process.pid)
+    def test_b_throughput(self):
+        """Run the offline throughput benchmark per model and print the result."""
+        for model in self.models:
+            with self.subTest(model=model):
+                print(f"##=== Testing throughput: {model} ===##")
+                output_throughput = run_bench_offline_throughput(
+                    model,
+                    [
+                        *self.common_args,
+                    ],
+                )
+                print(f"##=== {model} throughput: {output_throughput} ===##")
+                # The lower bound is only enforced when running in CI.
+                if is_in_ci():
+                    self.assertGreater(
+                        output_throughput,
+                        TEST_MODEL_MATRIX[model]["output_throughput"],
+                    )
+if __name__ == "__main__":
+    unittest.main()