".github/vscode:/vscode.git/clone" did not exist on "921ab01217c0903b4d4f3ed42924ef751ada10d3"
Unverified commit 7a5e6ce1, authored by kk, committed by GitHub

Fix GPU OOM (#6564)


Co-authored-by: michael <michael.zhang@amd.com>
parent 24c035f2
@@ -138,11 +138,6 @@ jobs:
       run: |
         bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size
 
-    - name: Benchmark online latency (EAGLE)
-      timeout-minutes: 15
-      run: |
-        bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_eagle
-
   performance-test-1-gpu-part-2-amd:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
       github.event.pull_request.draft == false
@@ -506,6 +506,7 @@ class AiterIndicesUpdaterPrefill:
                 spec_info.generate_attn_arg_prefill(
                     req_pool_indices,
                     paged_kernel_lens,
+                    None,
                     self.req_to_token,
                 )
             )
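The extra `None` tracks a signature change in `generate_attn_arg_prefill`, which now takes an additional argument between the kernel lengths and the token map. A minimal sketch of the receiving side, assuming the new parameter is optional prefix-length information; the parameter name and the fallback logic are assumptions, only the argument order and the `None` call site come from this diff:

```python
# Hypothetical sketch of the updated signature; prefix_lens is an invented
# name for the new third argument, not taken from the repo.
from typing import Optional

import torch


def generate_attn_arg_prefill(
    req_pool_indices: torch.Tensor,
    paged_kernel_lens: torch.Tensor,
    prefix_lens: Optional[torch.Tensor],  # new slot; None keeps old behavior
    req_to_token: torch.Tensor,
):
    if prefix_lens is None:
        # Callers that predate the new argument (like the aiter prefill
        # updater above) pass None and keep the original code path.
        prefix_lens = torch.zeros_like(paged_kernel_lens)
    ...
```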
@@ -412,6 +412,10 @@ class ModelRunner:
         if not server_args.disable_chunked_prefix_cache:
             logger.info("Chunked prefix cache is turned on.")
 
+        if server_args.attention_backend == "aiter":
+            if self.model_config.context_len > 8192:
+                self.mem_fraction_static *= 0.85
+
     def init_torch_distributed(self):
         logger.info("Init torch distributed begin.")
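This is the heart of the OOM fix: when the aiter attention backend serves a model whose context window exceeds 8192 tokens, the static memory pool is shrunk by 15%, presumably to leave headroom for the backend's long-context prefill workspace. A standalone sketch of the heuristic; the wrapper function is illustrative, only the condition and the factor come from the diff:

```python
# Illustrative wrapper around the new ModelRunner logic; the 0.85 factor and
# the 8192-token threshold are from the diff, everything else is a sketch.
def adjusted_mem_fraction(mem_fraction_static: float,
                          attention_backend: str,
                          context_len: int) -> float:
    # Long-context models on the aiter backend were running out of GPU memory,
    # so 15% of the static pool (weights + KV cache budget) is given back to
    # the allocator as headroom.
    if attention_backend == "aiter" and context_len > 8192:
        mem_fraction_static *= 0.85
    return mem_fraction_static


# e.g. a 0.9 static fraction drops to 0.765 for a 32k-context model on aiter
assert adjusted_mem_fraction(0.9, "aiter", 32768) == 0.9 * 0.85
assert adjusted_mem_fraction(0.9, "triton", 32768) == 0.9
```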
@@ -5,6 +5,7 @@ set -euo pipefail
 docker exec ci_sglang pip install --upgrade pip
 docker exec ci_sglang pip uninstall sgl-kernel -y || true
 docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install"
+docker exec ci_sglang pip install -e "python[dev_hip]"
 
 docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git
 docker exec -w /human-eval ci_sglang pip install -e .
@@ -62,6 +62,9 @@ class TestBenchOneBatch(CustomTestCase):
             f"### test_torch_compile_tp2_bs1 (Mixtral-8x7B)\n"
             f"output_throughput: {output_throughput:.2f} token/s\n"
         )
-        self.assertGreater(output_throughput, 220)
+        if os.getenv("SGLANG_AMD_CI") == "1":
+            self.assertGreater(output_throughput, 200)
+        else:
+            self.assertGreater(output_throughput, 220)
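The test now keys its pass floor off the `SGLANG_AMD_CI` environment variable instead of using a single threshold, the same pattern the serving benchmarks below follow. A condensed sketch of the pattern; `throughput_floor` is an invented helper, the floors are the ones from this diff:

```python
# Sketch of the env-gated threshold pattern used across these CI tests.
import os
import unittest


def throughput_floor(default_floor: float, amd_floor: float) -> float:
    # SGLANG_AMD_CI=1 is exported by the AMD CI runner (see the diffs above).
    return amd_floor if os.getenv("SGLANG_AMD_CI") == "1" else default_floor


class ExampleBench(unittest.TestCase):
    def test_output_throughput(self):
        output_throughput = 230.0  # stand-in for a measured value
        self.assertGreater(output_throughput, throughput_floor(220, 200))
```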
@@ -32,7 +32,7 @@ class TestBenchServing(CustomTestCase):
             f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
         )
         if os.getenv("SGLANG_AMD_CI") == "1":
-            self.assertGreater(res["output_throughput"], 3500)
+            self.assertGreater(res["output_throughput"], 3150)
         else:
             self.assertGreater(res["output_throughput"], 3800)
@@ -70,7 +70,7 @@ class TestBenchServing(CustomTestCase):
             f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
         )
         if os.getenv("SGLANG_AMD_CI") == "1":
-            self.assertGreater(res["output_throughput"], 3500)
+            self.assertGreater(res["output_throughput"], 3050)
         else:
             self.assertGreater(res["output_throughput"], 3800)
@@ -126,7 +126,7 @@ class TestBenchServing(CustomTestCase):
             f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
         )
         if os.getenv("SGLANG_AMD_CI") == "1":
-            self.assertGreater(res["output_throughput"], 4000)
+            self.assertGreater(res["output_throughput"], 3500)
         else:
             self.assertGreater(res["output_throughput"], 4300)
@@ -37,11 +37,6 @@ class TestEvalAccuracyLarge(CustomTestCase):
     def tearDownClass(cls):
         kill_process_tree(cls.process.pid)
 
-    def tearDown(self):
-        # Delay between tests to allow GPU memory cleanup
-        if os.getenv("SGLANG_AMD_CI") == "1":
-            time.sleep(180)
-
     def test_mmlu(self):
         args = SimpleNamespace(
             base_url=self.base_url,
@@ -90,9 +90,9 @@ class TestDeepseekV3MTP(CustomTestCase):
             "2",
             "--speculative-num-draft-tokens",
             "4",
-            "--mem-fraction-static",
-            "0.7",
         ]
+        if os.environ.get("SGLANG_AMD_CI") != "1":
+            other_args += ["--mem-frac", "0.7"]
         cls.process = popen_launch_server(
             cls.model,
             cls.base_url,
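Rather than always pinning the memory fraction, the test now adds it only off AMD CI; on the AMD runners the `ModelRunner` change above presumably provides the needed headroom already. A sketch of the resulting argument construction; the flag values are copied from the diff, the surrounding list is abbreviated:

```python
# Abbreviated sketch of the launch-argument construction after this change;
# only the conditional --mem-frac append is the new behavior.
import os

other_args = [
    "--speculative-num-draft-tokens", "4",
    # ... other speculative-decoding flags elided ...
]
if os.environ.get("SGLANG_AMD_CI") != "1":
    # Off AMD CI, keep the explicit 0.7 static memory fraction; on AMD the
    # backend-aware scaling in ModelRunner handles memory headroom instead.
    other_args += ["--mem-frac", "0.7"]
```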