Unverified commit 0c3543d7, authored by Yineng Zhang, committed by GitHub
Browse files

chore: upgrade flashinfer 0.5.0 (#12523)


Co-authored-by: Baizhou Zhang <sobereddiezhang@gmail.com>
parent 6a3b9fd0
......@@ -26,7 +26,9 @@ dependencies = [
"datasets",
"einops",
"fastapi",
"flashinfer_python==0.4.1",
"flashinfer_python==0.5.0",
"flashinfer_cubin==0.5.0",
"flashinfer_jit_cache==0.5.0",
"gguf",
"hf_transfer",
"huggingface_hub",
......
......@@ -22,6 +22,8 @@ PACKAGE_LIST = [
"sglang",
"sgl_kernel",
"flashinfer_python",
"flashinfer_cubin",
"flashinfer_jit_cache",
"triton",
"transformers",
"torchao",
......
......@@ -712,7 +712,7 @@ def _set_envs_and_config(server_args: ServerArgs):
if server_args.attention_backend == "flashinfer":
assert_pkg_version(
"flashinfer_python",
"0.4.1",
"0.5.0",
"Please uninstall the old version and "
"reinstall the latest version by following the instructions "
"at https://docs.flashinfer.ai/installation.html.",
......
......@@ -2386,7 +2386,9 @@ def set_cuda_arch():
if is_flashinfer_available():
capability = torch.cuda.get_device_capability()
arch = f"{capability[0]}.{capability[1]}"
os.environ["TORCH_CUDA_ARCH_LIST"] = f"{arch}{'+PTX' if arch == '9.0' else ''}"
os.environ["FLASHINFER_CUDA_ARCH_LIST"] = (
f"{arch}{'a' if capability[0] >= 9 else ''}"
)
def next_power_of_2(n: int):
......
......@@ -23,6 +23,7 @@ echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-}"
# Clear torch compilation cache
python3 -c 'import os, shutil, tempfile, getpass; cache_dir = os.environ.get("TORCHINDUCTOR_CACHE_DIR") or os.path.join(tempfile.gettempdir(), "torchinductor_" + getpass.getuser()); shutil.rmtree(cache_dir, ignore_errors=True)'
rm -rf /root/.cache/flashinfer
pip3 uninstall flashinfer-python flashinfer-cubin flashinfer-jit-cache || true
# Install apt packages
apt install -y git libnuma-dev libssl-dev pkg-config
......@@ -93,7 +94,7 @@ else
fi
# Install the main package
$PIP_CMD install -e "python[dev]" --extra-index-url https://download.pytorch.org/whl/${CU_VERSION} $PIP_INSTALL_SUFFIX
$PIP_CMD install -e "python[dev]" --extra-index-url https://download.pytorch.org/whl/${CU_VERSION} --extra-index-url https://flashinfer.ai/whl/${CU_VERSION} $PIP_INSTALL_SUFFIX
# Install router for pd-disagg test
$PIP_CMD install sglang-router $PIP_INSTALL_SUFFIX
......
......@@ -147,7 +147,7 @@ docker run --rm \
ln -sv /usr/lib64/libibverbs.so.1 /usr/lib64/libibverbs.so && \
${PYTHON_ROOT_PATH}/bin/${TORCH_INSTALL} && \
${PYTHON_ROOT_PATH}/bin/pip install --no-cache-dir ninja setuptools==75.0.0 wheel==0.41.0 numpy uv scikit-build-core && \
export TORCH_CUDA_ARCH_LIST='8.0 8.9 9.0+PTX' && \
export FLASHINFER_CUDA_ARCH_LIST='8.0 8.9 9.0a 10.0a 12.0a' && \
export CUDA_VERSION=${CUDA_VERSION} && \
mkdir -p /usr/lib/${ARCH}-linux-gnu/ && \
ln -s /usr/local/cuda-${CUDA_VERSION}/targets/${LIBCUDA_ARCH}-linux/lib/stubs/libcuda.so /usr/lib/${ARCH}-linux-gnu/libcuda.so && \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment