Unverified Commit 1db649ac authored by JieXin Liang, committed by GitHub

[feat] apply deep_gemm compile_mode to skip launch (#9879)

parent a1e5d781
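
This change bumps sgl-kernel from 0.3.7.post1 to 0.3.8 in the Docker image, the pyproject pin, and the startup version check, and wraps the DeepGEMM warmup loop in compile mode 1 so that kernels are JIT-compiled without actually being launched, restoring the previous mode afterwards. A minimal sketch of that save/set/restore pattern, assuming deep_gemm is importable as it is used in the diff below; the context-manager wrapper itself is illustrative and not part of this commit:

    from contextlib import contextmanager

    import deep_gemm  # assumption: importable as used in the diff below

    @contextmanager
    def skip_launch_compile_mode():
        # Switch DeepGEMM to compile mode 1 (compile kernels, skip launches)
        # and restore the previous mode on exit, even if warmup raises.
        old_mode = deep_gemm.get_compile_mode()
        deep_gemm.set_compile_mode(1)
        try:
            yield
        finally:
            deep_gemm.set_compile_mode(old_mode)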
@@ -85,10 +85,10 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel html5li
     && python3 -m pip install --no-cache-dir nvidia-nccl-cu12==2.27.6 --force-reinstall --no-deps \
     && python3 -m flashinfer --download-cubin \
     && if [ "$CUDA_VERSION" = "12.8.1" ]; then \
-    python3 -m pip install --no-cache-dir https://github.com/sgl-project/whl/releases/download/v0.3.7.post1/sgl_kernel-0.3.7.post1+cu128-cp310-abi3-manylinux2014_x86_64.whl --force-reinstall --no-deps ; \
+    python3 -m pip install --no-cache-dir https://github.com/sgl-project/whl/releases/download/v0.3.8/sgl_kernel-0.3.8+cu128-cp310-abi3-manylinux2014_x86_64.whl --force-reinstall --no-deps ; \
     fi \
     && if [ "$CUDA_VERSION" = "12.9.1" ]; then \
-    python3 -m pip install --no-cache-dir https://github.com/sgl-project/whl/releases/download/v0.3.7.post1/sgl_kernel-0.3.7.post1+cu129-cp310-abi3-manylinux2014_x86_64.whl --force-reinstall --no-deps ; \
+    python3 -m pip install --no-cache-dir https://github.com/sgl-project/whl/releases/download/v0.3.8/sgl_kernel-0.3.8+cu129-cp310-abi3-manylinux2014_x86_64.whl --force-reinstall --no-deps ; \
     fi
 # Download source files
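Note: the Docker build installs the sgl-kernel wheel matching the image's CUDA toolkit (cu128 for CUDA 12.8.1, cu129 for CUDA 12.9.1); both branches are bumped to v0.3.8 in lockstep with the pyproject pin below.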
@@ -58,7 +58,7 @@ runtime_common = [
 srt = [
     "sglang[runtime_common]",
-    "sgl-kernel==0.3.7.post1",
+    "sgl-kernel==0.3.8",
     "torch==2.8.0",
     "torchaudio==2.8.0",
     "torchvision",
@@ -681,7 +681,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if _is_cuda and not get_bool_env_var("SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK"):
         assert_pkg_version(
             "sgl-kernel",
-            "0.3.7.post1",
+            "0.3.8",
             "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
         )
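For context, this check makes server startup fail fast on a stale sgl-kernel. A rough standalone equivalent, assuming an exact-version comparison (assert_pkg_version is sglang's own helper and may compare differently, e.g. it may allow newer versions):

    from importlib.metadata import PackageNotFoundError, version

    def check_sgl_kernel(expected: str = "0.3.8") -> None:
        # Hypothetical stand-in for assert_pkg_version, for illustration only.
        try:
            installed = version("sgl-kernel")
        except PackageNotFoundError:
            raise RuntimeError("sgl-kernel is not installed") from None
        if installed != expected:
            raise RuntimeError(
                f"sgl-kernel {installed} found but {expected} is required; "
                "reinstall with `pip install sgl-kernel --force-reinstall`"
            )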
@@ -132,9 +132,17 @@ def _compile_deep_gemm_one_type_all(
         kernel_type, max_m=max(m_list), n=n, k=k, num_groups=num_groups
     )
+    old_compile_mode = deep_gemm.get_compile_mode()
+    deep_gemm.set_compile_mode(1)
     # TODO: this loop could use multiple threads
     for m in tqdm(m_list, desc="DeepGEMM warmup"):
         executor.execute(m=m)
+    deep_gemm.set_compile_mode(old_compile_mode)
+
+    # Clean up the warmup input buffers.
+    torch.cuda.current_stream().synchronize()
+    del executor
+    torch.cuda.empty_cache()


 class _BaseWarmupExecutor:
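Design note on the last hunk: the cleanup order is deliberate. torch.cuda.current_stream().synchronize() waits until any still-queued warmup kernels have finished reading their input buffers; del executor then drops the last Python references to those tensors; torch.cuda.empty_cache() finally releases the now-unused cached blocks back to the CUDA driver. Restoring old_compile_mode ensures that subsequent DeepGEMM calls outside warmup launch kernels normally again.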