# test_full_cudagraph.py

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import contextlib
import os
import weakref
from contextlib import ExitStack

import pytest

from tests.utils import wait_for_gpu_memory_to_clear
from vllm import LLM, SamplingParams
from vllm.config import CompilationConfig
from vllm.platforms import current_platform


@contextlib.contextmanager
def temporary_environ(env_vars):
    """
    Context manager that applies ``env_vars`` to ``os.environ`` for the
    duration of the ``with`` body and then puts the previous state back.

    Variables that already existed get their old value restored; variables
    that did not exist beforehand are removed again. Restoration also runs
    when the body raises.

    This is used instead of pytest's monkeypatch because monkeypatch doesn't
    work with "module" scoped fixtures.
    """
    # Snapshot the current values; None marks a variable that was not set.
    saved = {}
    for name in env_vars:
        saved[name] = os.environ.get(name)

    try:
        os.environ.update(env_vars)
        yield
    finally:
        for name, previous in saved.items():
            if previous is not None:
                os.environ[name] = previous
                continue
            # The variable was absent before we started, so drop it.
            os.environ.pop(name, None)


@pytest.fixture(scope="class")
def llm_pair(request):
    """
    Class-scoped fixture building two LLM engines for the model named by
    ``request.param``: one with ``CompilationConfig(full_cuda_graph=True)``
    and one with a default ``CompilationConfig()``.

    Both engines are constructed while VLLM_USE_V1=1 and
    VLLM_FLASH_ATTN_VERSION=3 are set in the environment.

    Yields:
        A ``(full, piecewise)`` tuple of weakref proxies, in that order.

    On teardown the strong references are deleted and
    ``wait_for_gpu_memory_to_clear`` is called for device 0.
    """
    engine_env = {
        "VLLM_USE_V1": "1",
        "VLLM_FLASH_ATTN_VERSION": "3",
    }
    # Settings shared by both engines; only the compilation config differs.
    common_kwargs = dict(
        model=request.param,
        gpu_memory_utilization=0.45,
        trust_remote_code=True,
        max_model_len=1024,
    )

    with temporary_environ(engine_env):
        full = LLM(
            **common_kwargs,
            compilation_config=CompilationConfig(full_cuda_graph=True),
        )
        piecewise = LLM(
            **common_kwargs,
            compilation_config=CompilationConfig(),
        )

    # PyTest caches the fixture values, so hand out weakref proxies only;
    # that way deleting the locals below is enough to enable GC.
    yield weakref.proxy(full), weakref.proxy(piecewise)

    del full, piecewise

    wait_for_gpu_memory_to_clear(devices=[0], threshold_ratio=0.1)


@pytest.fixture(scope="class")
def cutlass_mla_llm_pair(request):
    """
    Class-scoped fixture building two LLM engines for the model named by
    ``request.param`` with the Cutlass MLA attention backend selected via
    environment variables: one with full CUDA graph enabled (and an explicit
    list of capture sizes) and one with a default ``CompilationConfig()``.

    Yields:
        A ``(full, piecewise)`` tuple of weakref proxies, in that order.

    On teardown the strong references are deleted and
    ``wait_for_gpu_memory_to_clear`` is called for device 0.
    """
    # force V1 engine and Cutlass MLA backend
    backend_env = {
        "VLLM_USE_V1": "1",
        "VLLM_ATTENTION_BACKEND": "CUTLASS_MLA",
        # TODO: remove this when hang issue is fixed
        "FORCE_NUM_KV_SPLITS": "1",
    }
    # Settings shared by both engines; only the compilation config differs.
    common_kwargs = dict(
        model=request.param,
        gpu_memory_utilization=0.45,
        trust_remote_code=True,
        max_model_len=1024,
    )

    with temporary_environ(backend_env):
        full_graph_config = CompilationConfig(
            full_cuda_graph=True,
            cudagraph_capture_sizes=[16, 32, 64, 128, 256, 512],
        )
        full = LLM(**common_kwargs, compilation_config=full_graph_config)
        piecewise = LLM(
            **common_kwargs,
            compilation_config=CompilationConfig(),
        )

    # Only weakref proxies are handed out so that deleting the locals below
    # drops the last strong references held by this fixture.
    yield weakref.proxy(full), weakref.proxy(piecewise)

    del full, piecewise

    wait_for_gpu_memory_to_clear(devices=[0], threshold_ratio=0.1)


@pytest.mark.parametrize(
    "cutlass_mla_llm_pair",
    [
        # use an MLA model
        "deepseek-ai/DeepSeek-V2-Lite",
    ],
    indirect=True)
@pytest.mark.skipif(current_platform.get_device_capability() != (10, 0),
                    reason="Only Blackwell GPUs support Cutlass MLA")
class TestFullCUDAGraphCutlassMLA:
    """
    Validate full CUDA Graph with Cutlass MLA (decode-only capture).

    The full-CUDA-graph engine and the piecewise engine are given identical
    prompts with greedy sampling; their generated texts must match.
    """

    @pytest.mark.parametrize(("batch_size", "max_tokens"), [
        (8, 8),
    ])
    def test_full_cudagraph_sm100_cutlass_mla(
            self, batch_size, max_tokens, cutlass_mla_llm_pair: tuple[LLM,
                                                                      LLM]):
        """
        Generate ``max_tokens`` tokens for ``batch_size`` copies of the same
        prompt on both engines and require identical output text.
        """
        # The fixture yields (full, piecewise), in that order.
        full_cudagraph_llm, piecewise_llm = cutlass_mla_llm_pair

        prompts = ["Hello, my name is"] * batch_size
        # temperature=0.0 keeps sampling greedy so outputs are comparable.
        sampling_params = SamplingParams(temperature=0.0,
                                         max_tokens=max_tokens,
                                         top_p=0.95)

        piecewise_responses = piecewise_llm.generate(prompts, sampling_params)
        full_responses = full_cudagraph_llm.generate(prompts, sampling_params)

        # zip() stops at the shorter input, so check the lengths explicitly
        # to avoid silently skipping unmatched responses.
        assert len(piecewise_responses) == len(full_responses)

        for piecewise_res, full_res in zip(piecewise_responses,
                                           full_responses):
            assert piecewise_res.outputs[0].text == full_res.outputs[0].text


@pytest.mark.parametrize(
    "llm_pair",
    [
        # Model names for the llm_pair fixture
        "deepseek-ai/DeepSeek-V2-Lite",
        "Qwen/Qwen2-1.5B-Instruct"
    ],
    indirect=True)
@pytest.mark.skipif(current_platform.get_device_capability() != (9, 0),
                    reason="Only Hopper GPUs support FA3 and FlashMLA")
class TestFullCUDAGraph:
    """
    Use a class such that an llm pair is constructed once for all
    batch_size/max_tokens combinations and released immediately after.

    Module-scope fixtures would stick around the whole time,
    meaning there would be multiple LLM instances hogging memory simultaneously.
    """

    @pytest.mark.parametrize(("batch_size", "max_tokens"), [
        (1, 10),
        (7, 10),
        (16, 10),
        (25, 10),
        (32, 10),
        (45, 10),
        (64, 10),
        (123, 10),
        (8, 5),
        (8, 30),
    ])
    def test_full_cudagraph(self, batch_size, max_tokens,
                            llm_pair: tuple[LLM, LLM]):
        """
        Test various batch sizes and max_tokens to ensure that the
        full cudagraph compilation works for padded cases too.

        Both engines receive the same prompts with greedy sampling and must
        produce identical output text.
        """

        # The fixture yields (full, piecewise), in that order.
        full_cudagraph_llm, piecewise_llm = llm_pair

        prompts = ["Hello, my name is"] * batch_size
        # temperature=0.0 keeps sampling greedy so outputs are comparable.
        sampling_params = SamplingParams(temperature=0.0,
                                         max_tokens=max_tokens,
                                         top_p=0.95)

        piecewise_responses = piecewise_llm.generate(prompts, sampling_params)
        full_responses = full_cudagraph_llm.generate(prompts, sampling_params)

        # zip() stops at the shorter input, so check the lengths explicitly
        # to avoid silently skipping unmatched responses.
        assert len(piecewise_responses) == len(full_responses)

        # Check that all responses are the same
        for piecewise_res, full_res in zip(piecewise_responses,
                                           full_responses):
            assert piecewise_res.outputs[0].text == full_res.outputs[0].text


@pytest.mark.parametrize(
    "model, supported",
    [
        ("Qwen/Qwen2-1.5B-Instruct", True),
        # MLA does not support capturing CUDA Graphs with size > max_num_seqs
        ("deepseek-ai/DeepSeek-V2-Lite", False),
    ])
@pytest.mark.skipif(current_platform.get_device_capability() != (9, 0),
                    reason="Only Hopper GPUs support FA3 and FlashMLA")
def test_lower_max_num_seqs(model, supported):
    """
    Build a full-CUDA-graph engine with ``max_num_seqs=256`` and capture
    sizes that include a value above it (512), then run a short generation.

    When ``supported`` is False, a RuntimeError is expected somewhere during
    engine construction or generation.
    """
    engine_env = {
        "VLLM_USE_V1": "1",
        "VLLM_FLASH_ATTN_VERSION": "3",
    }

    with temporary_environ(engine_env):
        with ExitStack() as stack:
            # Only expect a failure for models flagged as unsupported.
            if not supported:
                stack.enter_context(pytest.raises(RuntimeError))

            graph_config = CompilationConfig(
                full_cuda_graph=True,
                cudagraph_capture_sizes=[64, 256, 512],
            )
            llm = LLM(
                model=model,
                max_num_seqs=256,
                trust_remote_code=True,
                max_model_len=1024,
                compilation_config=graph_config,
            )
            prompts = ["Hello, my name is"] * 10
            llm.generate(prompts)


@pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda")
def test_full_cudagraph_with_invalid_backend():
    """
    Constructing an LLM with ``full_cuda_graph=True`` while
    VLLM_FLASH_ATTN_VERSION is set to "2" must raise a RuntimeError.
    """
    invalid_env = {
        "VLLM_USE_V1": "1",
        # FA2 not supported with full_cuda_graph
        "VLLM_FLASH_ATTN_VERSION": "2",
    }

    with temporary_environ(invalid_env):
        with pytest.raises(RuntimeError):
            LLM(
                model="Qwen/Qwen2-1.5B-Instruct",
                compilation_config=CompilationConfig(full_cuda_graph=True),
            )