Unverified Commit fee0ab0f authored by Even Zhou, committed by GitHub

[CI] Ascend NPU CI enhancement (#8294)


Co-authored-by: ronnie_zheng <zl19940307@163.com>
parent f57d2dc1
@@ -22,7 +22,7 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  unit-test-basic:
+  per-commit-1-ascend-npu:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
       github.event.pull_request.draft == false
     runs-on: linux-arm64-npu-1
@@ -44,13 +44,77 @@ jobs:
         timeout-minutes: 30
         env:
           SGLANG_USE_MODELSCOPE: true
+          SGLANG_IS_IN_CI: true
           HF_ENDPOINT: https://hf-mirror.com
+          TORCH_EXTENSIONS_DIR: /tmp/torch_extensions
         run: |
           cd test/srt
-          python3 run_suite.py --suite per-commit-npu
+          python3 run_suite.py --suite per-commit-1-ascend-npu
+
+  per-commit-2-ascend-npu:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+      github.event.pull_request.draft == false
+    runs-on: linux-arm64-npu-2
+    container:
+      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1.alpha003-910b-ubuntu22.04-py3.11
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          bash scripts/npu_ci_install_dependency.sh
+          # copy required file from our daily cache
+          cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
+          # download through proxy
+          curl -o /tmp/test.jsonl -L https://gh-proxy.test.osinfra.cn/https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl
+
+      - name: Run test
+        timeout-minutes: 30
+        env:
+          SGLANG_USE_MODELSCOPE: true
+          SGLANG_IS_IN_CI: true
+          HF_ENDPOINT: https://hf-mirror.com
+          TORCH_EXTENSIONS_DIR: /tmp/torch_extensions
+        run: |
+          cd test/srt
+          python3 run_suite.py --suite per-commit-2-ascend-npu
+
+  per-commit-4-ascend-npu:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+      github.event.pull_request.draft == false
+    runs-on: linux-arm64-npu-4
+    container:
+      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1.alpha003-910b-ubuntu22.04-py3.11
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          bash scripts/npu_ci_install_dependency.sh
+          # copy required file from our daily cache
+          cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
+          # download through proxy
+          curl -o /tmp/test.jsonl -L https://gh-proxy.test.osinfra.cn/https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl
+
+      - name: Run test
+        timeout-minutes: 30
+        env:
+          SGLANG_USE_MODELSCOPE: true
+          SGLANG_IS_IN_CI: true
+          HF_ENDPOINT: https://hf-mirror.com
+          TORCH_EXTENSIONS_DIR: /tmp/torch_extensions
+        run: |
+          cd test/srt
+          python3 run_suite.py --suite per-commit-4-ascend-npu --timeout-per-file 3600
+
   finish:
     if: always()
-    needs: [ unit-test-basic ]
+    needs:
+      - per-commit-1-ascend-npu
+      - per-commit-2-ascend-npu
+      - per-commit-4-ascend-npu
     runs-on: ubuntu-latest
     steps:
      - name: Check all dependent job statuses
...
@@ -398,8 +398,12 @@ def grouped_topk_gpu(
         .reshape(num_token, -1)
     )  # [n, e]
     tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0)  # [n, e]
+    # TODO: NPU can't support directly evaluating a comparison for now
     topk_weights, topk_ids = torch.topk(
-        tmp_scores, k=topk, dim=-1, sorted=num_fused_shared_experts > 0
+        tmp_scores,
+        k=topk,
+        dim=-1,
+        sorted=(True if num_fused_shared_experts > 0 else False),
     )
     if num_fused_shared_experts:
         topk_ids[:, -1] = torch.randint(
@@ -489,8 +493,12 @@ def biased_grouped_topk_impl(
     tmp_scores = scores_for_choice.masked_fill(
         ~score_mask.bool(), float("-inf")
     )  # [n, e]
+    # TODO: NPU can't support directly evaluating a comparison for now
     _, topk_ids = torch.topk(
-        tmp_scores, k=topk, dim=-1, sorted=num_fused_shared_experts > 0
+        tmp_scores,
+        k=topk,
+        dim=-1,
+        sorted=(True if num_fused_shared_experts > 0 else False),
     )
     topk_weights = scores.gather(1, topk_ids)
...
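Both hunks make the same change in the two routing paths: the sorted argument is spelled out as an explicit boolean rather than passing the comparison directly, per the TODO about current NPU limitations. For context, a minimal CPU-side sketch of the resulting torch.topk call; the shapes and the num_fused_shared_experts value here are illustrative, not taken from the diff.

import torch

# Illustrative values only; in grouped_topk_gpu these come from the router.
num_fused_shared_experts = 1
topk = 2
tmp_scores = torch.rand(4, 8)  # [n, e] masked router scores

topk_weights, topk_ids = torch.topk(
    tmp_scores,
    k=topk,
    dim=-1,
    # explicit True/False instead of evaluating the comparison inline,
    # mirroring the NPU workaround noted in the TODO above
    sorted=(True if num_fused_shared_experts > 0 else False),
)
print(topk_weights.shape, topk_ids.shape)  # torch.Size([4, 2]) torch.Size([4, 2])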
 #!/bin/bash
 set -euo pipefail
-# Install the required dependencies from cache
-sed -Ei 's@(ports|archive).ubuntu.com@cache-service.nginx-pypi-cache.svc.cluster.local:8081@g' /etc/apt/sources.list
-apt update -y
-apt install -y build-essential cmake python3-pip python3-dev wget net-tools zlib1g-dev lld clang software-properties-common curl
-# Setup pip cache
-pip config set global.index-url http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple
-pip config set global.trusted-host cache-service.nginx-pypi-cache.svc.cluster.local
-python3 -m pip install --upgrade pip
-pip uninstall sgl-kernel -y || true
+CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local"
+PIP_INSTALL="pip install --no-cache-dir"
+
+# Update apt & pip sources
+sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list
+pip config set global.index-url http://${CACHING_URL}/pypi/simple
+pip config set global.trusted-host ${CACHING_URL}
+
+# Install the required dependencies in CI.
+apt update -y && apt install -y \
+    build-essential \
+    cmake \
+    wget \
+    curl \
+    net-tools \
+    zlib1g-dev \
+    lld \
+    clang \
+    locales \
+    ccache \
+    ca-certificates
+update-ca-certificates
+
+python3 -m ${PIP_INSTALL} --upgrade pip
 
 ### Download MemFabricV2
 MF_WHL_NAME="mf_adapter-1.0.0-cp311-cp311-linux_aarch64.whl"
-MEMFABRIC_URL="https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com:443/sglang/${MF_WHL_NAME}"
-wget "${MEMFABRIC_URL}" && pip install "./${MF_WHL_NAME}"
+MEMFABRIC_URL="https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/sglang/${MF_WHL_NAME}"
+wget "${MEMFABRIC_URL}" && ${PIP_INSTALL} "./${MF_WHL_NAME}"
 
 ### Install vLLM
 VLLM_TAG=v0.8.5
 git clone --depth 1 https://github.com/vllm-project/vllm.git --branch $VLLM_TAG
-(cd vllm && VLLM_TARGET_DEVICE="empty" pip install -v -e .)
+(cd vllm && VLLM_TARGET_DEVICE="empty" ${PIP_INSTALL} -v -e .)
 
 ### Install PyTorch and PTA
 PYTORCH_VERSION=2.6.0
 TORCHVISION_VERSION=0.21.0
-PTA_VERSION=2.6.0rc1
-pip install torch==$PYTORCH_VERSION torchvision==$TORCHVISION_VERSION --index-url https://download.pytorch.org/whl/cpu
-pip install torch_npu==$PTA_VERSION
+PTA_VERSION=2.6.0
+${PIP_INSTALL} torch==$PYTORCH_VERSION torchvision==$TORCHVISION_VERSION --index-url https://download.pytorch.org/whl/cpu
+${PIP_INSTALL} torch_npu==$PTA_VERSION
 
 ### Install Triton-Ascend
-TRITON_ASCEND_VERSION=3.2.0rc2
-pip install attrs==24.2.0 numpy==1.26.4 scipy==1.13.1 decorator==5.1.1 psutil==6.0.0 pytest==8.3.2 pytest-xdist==3.6.1 pyyaml pybind11
-pip install triton-ascend==$TRITON_ASCEND_VERSION
-pip install -e "python[srt_npu]"
+TRITON_ASCEND_NAME="triton_ascend-3.2.0.dev20250729-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl"
+TRITON_ASCEND_URL="https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/sglang/${TRITON_ASCEND_NAME}"
+${PIP_INSTALL} attrs==24.2.0 numpy==1.26.4 scipy==1.13.1 decorator==5.1.1 psutil==6.0.0 pytest==8.3.2 pytest-xdist==3.6.1 pyyaml pybind11
+wget "${TRITON_ASCEND_URL}" && ${PIP_INSTALL} "./${TRITON_ASCEND_NAME}"
 
-### Modify PyTorch TODO: to be removed later
-TORCH_LOCATION=$(python3 -c 'import torch; print(torch.__path__[0])')
-sed -i 's/from triton.runtime.autotuner import OutOfResources/from triton.runtime.errors import OutOfResources/' "${TORCH_LOCATION}/_inductor/runtime/triton_heuristics.py"
+### Install SGLang
+${PIP_INSTALL} -v -e "python[srt_npu]"
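After scripts/npu_ci_install_dependency.sh finishes, a quick import check can confirm that the pinned torch / torch_npu pair is usable inside the CANN container. This is a hypothetical sanity check, not part of the script; it assumes torch_npu exposes the torch.npu namespace as documented.

# Hypothetical post-install sanity check, not part of the CI script.
import torch        # 2.6.0 CPU wheel pinned by the script
import torch_npu    # 2.6.0 Ascend adapter (PTA) pinned by the script

print("torch:", torch.__version__)
print("torch_npu:", torch_npu.__version__)
# torch_npu patches the torch.npu namespace; True only on a machine with an Ascend NPU.
print("npu available:", torch.npu.is_available())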
@@ -154,8 +154,14 @@ suites = {
         TestFile("test_rope_rocm.py", 3),
         TestFile("test_awq_dequant.py", 2),
     ],
-    "per-commit-npu": [
-        TestFile("test_ascend_attention_backend.py", 400),
+    "per-commit-1-ascend-npu": [
+        TestFile("test_ascend_tp1_bf16.py", 400),
+    ],
+    "per-commit-2-ascend-npu": [
+        TestFile("test_ascend_tp2_bf16.py", 400),
+    ],
+    "per-commit-4-ascend-npu": [
+        TestFile("test_ascend_mla_w8a8int8.py", 400),
     ],
     "per-commit-2-gpu": [
         TestFile("models/lora/test_lora_tp.py", 116),
...
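These three suites are what the workflow jobs above invoke. A minimal sketch of driving one of them locally the same way CI does, assuming the dependencies from scripts/npu_ci_install_dependency.sh are already installed:

# Sketch: run one of the new Ascend suites locally, mirroring the workflow's
# "cd test/srt && python3 run_suite.py --suite ..." step.
import subprocess

subprocess.run(
    ["python3", "run_suite.py", "--suite", "per-commit-1-ascend-npu"],
    cwd="test/srt",
    check=True,
)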
"""
Usage:
python3 -m unittest test_ascend_attention_backend.TestAscendAttnBackend.test_gsm8k
"""
import unittest
from types import SimpleNamespace
from urllib.parse import urlparse
from sglang.srt.utils import kill_process_tree
from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
DEFAULT_MODEL_NAME_FOR_TEST,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
is_in_ci,
popen_launch_server,
run_bench_offline_throughput,
)
DEFAULT_MODEL_NAME_FOR_TEST = "Qwen/Qwen2.5-7B-Instruct"
class TestAscendAttnBackend(CustomTestCase):
def test_gsm8k(self):
model = DEFAULT_MODEL_NAME_FOR_TEST
base_url = DEFAULT_URL_FOR_TEST
url = urlparse(base_url)
process = popen_launch_server(
model,
base_url,
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
other_args=[
"--attention-backend",
"ascend",
"--mem-fraction-static",
0.8,
],
)
try:
args = SimpleNamespace(
num_shots=5,
data_path=None,
num_questions=1319,
max_new_tokens=512,
parallel=128,
host=f"http://{url.hostname}",
port=int(url.port),
)
metrics = run_eval_few_shot_gsm8k(args)
self.assertGreaterEqual(metrics["accuracy"], 0.62)
self.assertLessEqual(metrics["latency"], 150)
finally:
kill_process_tree(process.pid)
if __name__ == "__main__":
unittest.main()
"""
Usage:
python3 -m unittest test_ascend_mla_backend.TestAscendMLABackend.test_gsm8k
"""
import os
import unittest
from types import SimpleNamespace
from urllib.parse import urlparse
from sglang.srt.utils import kill_process_tree
from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
DEFAULT_MLA_MODEL_NAME_FOR_TEST,
DEFAULT_MODEL_NAME_FOR_TEST,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
is_in_ci,
popen_launch_server,
run_bench_offline_throughput,
)
if "ASCEND_RT_VISIBLE_DEVICES" not in os.environ:
os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1,2,3"
DEFAULT_PORT_FOR_SRT_TEST_RUNNER = (
7000 + int(os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "0")[0]) * 100
)
DEFAULT_URL_FOR_TEST = f"http://127.0.0.1:{DEFAULT_PORT_FOR_SRT_TEST_RUNNER + 1000}"
DEFAULT_MODEL_NAME_FOR_TEST = "/models/DeepSeek-V2-Lite-Chat"
if not os.path.exists(DEFAULT_MODEL_NAME_FOR_TEST):
DEFAULT_MODEL_NAME_FOR_TEST = DEFAULT_MLA_MODEL_NAME_FOR_TEST
class TestAscendMLABackend(CustomTestCase):
def test_latency(self):
output_throughput = run_bench_offline_throughput(
DEFAULT_MODEL_NAME_FOR_TEST,
[
"--attention-backend",
"ascend",
"--mem-fraction-static",
0.7,
"--tp-size",
"4",
"--trust-remote-code",
"--disable-cuda-graph",
],
)
print(f"{output_throughput=}")
if is_in_ci():
self.assertGreater(output_throughput, 18)
def test_gsm8k(self):
model = DEFAULT_MODEL_NAME_FOR_TEST
base_url = DEFAULT_URL_FOR_TEST
url = urlparse(base_url)
process = popen_launch_server(
model,
base_url,
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
other_args=[
"--attention-backend",
"ascend",
"--mem-fraction-static",
0.7,
"--tp-size",
"4",
"--trust-remote-code",
"--disable-cuda-graph",
],
)
try:
args = SimpleNamespace(
num_shots=5,
data_path=None,
num_questions=128,
max_new_tokens=512,
parallel=128,
host=f"http://{url.hostname}",
port=int(url.port),
)
metrics = run_eval_few_shot_gsm8k(args)
self.assertGreaterEqual(metrics["accuracy"], 0.62)
self.assertGreaterEqual(metrics["output_throughput"], 50)
finally:
kill_process_tree(process.pid)
if __name__ == "__main__":
unittest.main()
import unittest
from types import SimpleNamespace
from urllib.parse import urlparse
from sglang.srt.utils import kill_process_tree
from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
from sglang.test.test_utils import (
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
is_in_ci,
popen_launch_server,
run_bench_offline_throughput,
)
TEST_MODEL_MATRIX = {
"/root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V2-Lite-W8A8": {
"accuracy": 0.34,
"latency": 1000,
"output_throughput": 6,
},
}
class TestAscendMlaW8A8Int8(CustomTestCase):
@classmethod
def setUpClass(cls):
cls.models = TEST_MODEL_MATRIX.keys()
cls.base_url = DEFAULT_URL_FOR_TEST
cls.url = urlparse(DEFAULT_URL_FOR_TEST)
cls.common_args = [
"--trust-remote-code",
"--disable-cuda-graph",
"--mem-fraction-static",
0.8,
"--attention-backend",
"ascend",
"--quantization",
"w8a8_int8",
"--tp-size",
4,
]
def test_a_gsm8k(self):
for model in self.models:
with self.subTest(model=model):
print(f"##=== Testing accuracy: {model} ===##")
process = popen_launch_server(
model,
self.base_url,
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
other_args=[
*self.common_args,
],
)
try:
args = SimpleNamespace(
num_shots=5,
data_path=None,
num_questions=1319,
max_new_tokens=512,
parallel=128,
host=f"http://{self.url.hostname}",
port=int(self.url.port),
)
metrics = run_eval_few_shot_gsm8k(args)
self.assertGreaterEqual(
metrics["accuracy"],
TEST_MODEL_MATRIX[model]["accuracy"],
)
finally:
kill_process_tree(process.pid)
def test_b_throughput(self):
for model in self.models:
with self.subTest(model=model):
print(f"##=== Testing throughput: {model} ===##")
output_throughput = run_bench_offline_throughput(
model,
[
*self.common_args,
],
)
print(f"##=== {model} throughput: {output_throughput} ===##")
if is_in_ci():
self.assertGreater(
output_throughput,
TEST_MODEL_MATRIX[model]["output_throughput"],
)
if __name__ == "__main__":
unittest.main()
import unittest
from types import SimpleNamespace
from urllib.parse import urlparse
from sglang.srt.utils import kill_process_tree
from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
from sglang.test.test_utils import (
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
is_in_ci,
popen_launch_server,
run_bench_offline_throughput,
)
TEST_MODEL_MATRIX = {
"Qwen/Qwen2.5-7B-Instruct": {
"accuracy": 0.85,
"latency": 150,
"output_throughput": 30,
},
}
class TestAscendTp1Bf16(CustomTestCase):
@classmethod
def setUpClass(cls):
cls.models = TEST_MODEL_MATRIX.keys()
cls.base_url = DEFAULT_URL_FOR_TEST
cls.url = urlparse(DEFAULT_URL_FOR_TEST)
cls.common_args = [
"--trust-remote-code",
"--disable-cuda-graph",
"--mem-fraction-static",
0.8,
"--attention-backend",
"ascend",
]
def test_a_gsm8k(self):
for model in self.models:
with self.subTest(model=model):
print(f"##=== Testing accuracy: {model} ===##")
process = popen_launch_server(
model,
self.base_url,
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
other_args=[
*self.common_args,
],
)
try:
args = SimpleNamespace(
num_shots=5,
data_path=None,
num_questions=1319,
max_new_tokens=512,
parallel=128,
host=f"http://{self.url.hostname}",
port=int(self.url.port),
)
metrics = run_eval_few_shot_gsm8k(args)
self.assertGreaterEqual(
metrics["accuracy"],
TEST_MODEL_MATRIX[model]["accuracy"],
)
finally:
kill_process_tree(process.pid)
def test_b_throughput(self):
for model in self.models:
with self.subTest(model=model):
print(f"##=== Testing throughput: {model} ===##")
output_throughput = run_bench_offline_throughput(
model,
[
*self.common_args,
],
)
print(f"##=== {model} throughput: {output_throughput} ===##")
if is_in_ci():
self.assertGreater(
output_throughput,
TEST_MODEL_MATRIX[model]["output_throughput"],
)
if __name__ == "__main__":
unittest.main()
import unittest
from types import SimpleNamespace
from urllib.parse import urlparse
from sglang.srt.utils import kill_process_tree
from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
from sglang.test.test_utils import (
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
is_in_ci,
popen_launch_server,
run_bench_offline_throughput,
)
TEST_MODEL_MATRIX = {
"Qwen/Qwen2.5-7B-Instruct": {
"accuracy": 0.85,
"latency": 180,
"output_throughput": 20,
},
}
class TestAscendTp2Bf16(CustomTestCase):
@classmethod
def setUpClass(cls):
cls.models = TEST_MODEL_MATRIX.keys()
cls.base_url = DEFAULT_URL_FOR_TEST
cls.url = urlparse(DEFAULT_URL_FOR_TEST)
cls.common_args = [
"--trust-remote-code",
"--disable-cuda-graph",
"--mem-fraction-static",
0.8,
"--attention-backend",
"ascend",
"--tp-size",
2,
]
def test_a_gsm8k(self):
for model in self.models:
with self.subTest(model=model):
print(f"##=== Testing accuracy: {model} ===##")
process = popen_launch_server(
model,
self.base_url,
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
other_args=[
*self.common_args,
],
)
try:
args = SimpleNamespace(
num_shots=5,
data_path=None,
num_questions=1319,
max_new_tokens=512,
parallel=128,
host=f"http://{self.url.hostname}",
port=int(self.url.port),
)
metrics = run_eval_few_shot_gsm8k(args)
self.assertGreaterEqual(
metrics["accuracy"],
TEST_MODEL_MATRIX[model]["accuracy"],
)
finally:
kill_process_tree(process.pid)
def test_b_throughput(self):
for model in self.models:
with self.subTest(model=model):
print(f"##=== Testing throughput: {model} ===##")
output_throughput = run_bench_offline_throughput(
model,
[
*self.common_args,
],
)
print(f"##=== {model} throughput: {output_throughput} ===##")
if is_in_ci():
self.assertGreater(
output_throughput,
TEST_MODEL_MATRIX[model]["output_throughput"],
)
if __name__ == "__main__":
unittest.main()
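The three new test modules share the same structure: a TEST_MODEL_MATRIX of per-model thresholds, a gsm8k accuracy check, and an offline throughput check. Assuming the module names registered in run_suite.py above, a single suite can also be loaded and run directly with unittest, for example:

# Sketch only; the module name is assumed from the run_suite.py diff above.
import unittest

from test_ascend_tp1_bf16 import TestAscendTp1Bf16

suite = unittest.TestLoader().loadTestsFromTestCase(TestAscendTp1Bf16)
unittest.TextTestRunner(verbosity=2).run(suite)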