feat: lmcache on CUDA 13 and ARM (#7534)

Signed-off-by: Dmitry Tokarev <dtokarev@nvidia.com>

feat: lmcache on CUDA 13 and ARM (#7534)
Signed-off-by: Dmitry Tokarev <dtokarev@nvidia.com>
21b44473 · Dmitry Tokarev · GitHub · 2263defc · 21b44473 · 21b44473
Unverified commit 21b44473, authored Apr 14, 2026 by Dmitry Tokarev; committed by GitHub on Apr 14, 2026
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 2 additions and 10 deletions

container/deps/vllm/install_vllm.sh +2 -2

tests/serve/test_vllm.py +0 -8

No files found.
--- a/container/deps/vllm/install_vllm.sh
+++ b/container/deps/vllm/install_vllm.sh
@@ -231,7 +231,7 @@ echo "\n=== Installing LMCache from source ==="
 # (undefined symbol: c10::cuda::c10_cuda_check_implementation).
 # Build from source AFTER vLLM so c_ops.so compiles against the installed PyTorch.
 # Ref: https://docs.lmcache.ai/getting_started/installation.html#install-latest-lmcache-from-source
-if [ "$DEVICE" = "cuda" ] && [[ "$CUDA_VERSION_MAJOR" == "12" ]] && [ "$ARCH" = "amd64" ]; then
+if [ "$DEVICE" = "cuda" ]; then
    git clone --depth 1 --branch v${LMCACHE_REF} https://github.com/LMCache/LMCache.git ${INSTALLATION_DIR}/lmcache
    cd ${INSTALLATION_DIR}/lmcache
    uv pip install -r requirements/build.txt
@@ -256,7 +256,7 @@ elif [ "$DEVICE" = "xpu" ] && [ "$ARCH" = "amd64" ]; then
    uv pip install lmcache==${LMCACHE_REF}
    echo "✓ LMCache ${LMCACHE_REF} installed from PyPI (XPU)"
 else
-    echo "⚠ Skipping LMCache (ARM64 or CUDA 13 not supported)"
+    echo "⚠ Skipping LMCache for DEVICE=${DEVICE} ARCH=${ARCH} (not supported)"
 fi
 if [ "$DEVICE" = "cuda" ]; then

--- a/tests/serve/test_vllm.py
+++ b/tests/serve/test_vllm.py
@@ -148,10 +148,6 @@ vllm_configs = {
            ),  # KV cache cap (2x safety over min=559_693_824)
            pytest.mark.timeout(360),  # ~7x observed 49.0s; old value before profiling
            pytest.mark.pre_merge,
-            pytest.mark.skipif(
-                _is_cuda13(),
-                reason="lmcache does not support CUDA 13 as of v0.3.11",
-            ),
        ],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
@@ -174,10 +170,6 @@ vllm_configs = {
            ),  # KV cache cap (2x safety over min=559_693_824)
            pytest.mark.timeout(360),  # ~7x observed 49.3s; old value before profiling
            pytest.mark.pre_merge,
-            pytest.mark.skipif(
-                _is_cuda13(),
-                reason="lmcache does not support CUDA 13 as of v0.3.11",
-            ),
        ],
        model="Qwen/Qwen3-0.6B",
        env={