# test_async_spec_decode.py

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Test that verifies no implicit GPU-CPU synchronization occurs during
speculative decoding generation under expected conditions.
"""

import multiprocessing
import sys
import traceback

import pytest
import torch


@pytest.fixture
def sync_tracker():
    """
    Instrument CommonAttentionMetadata.seq_lens_cpu for the span of one test.

    While the fixture is active, the property getter is replaced by a wrapper
    that, whenever it is read while ``_seq_lens_cpu`` is still None, bumps a
    shared counter and dumps the current stack to stderr before delegating to
    the original getter. The fixture yields a small object exposing the
    counter (``count``) and an ``assert_no_sync`` helper. After the test the
    original property is reinstated and ``torch._dynamo`` is reset.
    """
    from vllm.v1.attention.backend import CommonAttentionMetadata

    # Counter backed by multiprocessing.Value so that increments made in
    # forked child processes are meant to be visible in this process too.
    lazy_init_hits = multiprocessing.Value("i", 0)
    banner = "=" * 60

    # Keep both the property object (for restoring) and its getter (for
    # delegating) before anything is replaced.
    pristine_property = CommonAttentionMetadata.seq_lens_cpu
    pristine_getter = pristine_property.fget

    class SyncTracker:
        """Read-only view over the lazy-init counter."""

        @property
        def count(self) -> int:
            return lazy_init_hits.value

        def assert_no_sync(self, msg: str = ""):
            observed = lazy_init_hits.value
            failure_text = (
                f"Unexpected GPU-CPU sync: seq_lens_cpu lazy init triggered "
                f"{observed} times. See stack traces above. {msg}"
            )
            assert observed == 0, failure_text

    def tracking_seq_lens_cpu(self):
        # Only an access that finds the cached value missing counts as a sync.
        if self._seq_lens_cpu is None:
            with lazy_init_hits.get_lock():
                lazy_init_hits.value += 1
                hit_number = lazy_init_hits.value
            # Report right away so the trace appears in subprocess output.
            print(f"\n{banner}", file=sys.stderr)
            print(
                f"SYNC #{hit_number}: seq_lens_cpu lazy init triggered!",
                file=sys.stderr,
            )
            print(banner, file=sys.stderr)
            traceback.print_stack(file=sys.stderr)
            print(f"{banner}\n", file=sys.stderr)
            sys.stderr.flush()
        return pristine_getter(self)

    # Install the instrumented getter on the class.
    tracked_property = property(tracking_seq_lens_cpu)
    CommonAttentionMetadata.seq_lens_cpu = tracked_property

    yield SyncTracker()

    # Teardown: put the untouched property back and clear dynamo state.
    CommonAttentionMetadata.seq_lens_cpu = pristine_property
    torch._dynamo.reset()


# Speculative-decoding cases, keyed by pytest id. Each value is the tuple
# (model, spec_model, method, num_spec_tokens) handed to the parametrized test.
SPEC_DECODE_CONFIGS = [
    pytest.param(*case_values, id=case_id)
    for case_id, case_values in {
        "eagle3-llama": (
            "meta-llama/Llama-3.2-1B-Instruct",
            "nm-testing/Llama3_2_1B_speculator.eagle3",
            "eagle3",
            2,
        ),
        "eagle-mla-deepseek": (
            "eagle618/deepseek-v3-random",
            "eagle618/eagle-deepseek-v3-random",
            "eagle",
            2,
        ),
    }.items()
]


@pytest.mark.parametrize(
    "model,spec_model,method,num_spec_tokens",
    SPEC_DECODE_CONFIGS,
)
def test_no_sync_with_spec_decode(
    sync_tracker,
    model: str,
    spec_model: str,
    method: str,
    num_spec_tokens: int,
):
    """
    Test that no implicit GPU-CPU sync occurs during speculative decoding
    generation.

    Builds an eager-mode LLM with async scheduling and the given speculative
    config, generates a short greedy completion for one prompt, checks that a
    non-empty completion came back, and finally asserts that the
    ``sync_tracker`` fixture recorded zero ``seq_lens_cpu`` lazy inits.

    Args:
        sync_tracker: Fixture object providing ``assert_no_sync``.
        model: Target model identifier passed to ``LLM``.
        spec_model: Draft model identifier for the speculative config.
        method: Speculative decoding method name.
        num_spec_tokens: Number of speculative tokens per step.
    """
    # Import vLLM AFTER sync_tracker fixture has applied the patch
    from vllm import LLM, SamplingParams
    from vllm.distributed import cleanup_dist_env_and_memory

    llm = None
    try:
        llm = LLM(
            model=model,
            max_model_len=256,
            speculative_config={
                "method": method,
                "num_speculative_tokens": num_spec_tokens,
                "model": spec_model,
            },
            enforce_eager=True,
            async_scheduling=True,
        )

        outputs = llm.generate(
            ["Hello, my name is"],
            SamplingParams(temperature=0, max_tokens=10),
        )

        assert len(outputs) == 1
        assert len(outputs[0].outputs[0].text) > 0
    finally:
        # Always release the engine and GPU/distributed state, even when
        # construction, generation, or an assertion above fails; otherwise
        # the next parametrized case inherits the leaked resources.
        del llm
        torch.cuda.empty_cache()
        cleanup_dist_env_and_memory()

    # Checked after cleanup, matching the original ordering.
    sync_tracker.assert_no_sync()