Unverified Commit 0247a91e authored by Andreas Karatzas's avatar Andreas Karatzas Committed by GitHub
Browse files

[ROCm][CI] Fix entrypoints tests and Python-only installation test on ROCm (#28979)


Signed-off-by: default avatarAndreas Karatzas <akaratza@amd.com>
parent 8ee90c83
...@@ -50,15 +50,15 @@ elif not (sys.platform.startswith("linux") or sys.platform.startswith("darwin")) ...@@ -50,15 +50,15 @@ elif not (sys.platform.startswith("linux") or sys.platform.startswith("darwin"))
sys.platform, sys.platform,
) )
VLLM_TARGET_DEVICE = "empty" VLLM_TARGET_DEVICE = "empty"
elif ( elif sys.platform.startswith("linux") and os.getenv("VLLM_TARGET_DEVICE") is None:
sys.platform.startswith("linux") if torch.version.hip is not None:
and torch.version.cuda is None VLLM_TARGET_DEVICE = "rocm"
and os.getenv("VLLM_TARGET_DEVICE") is None logger.info("Auto-detected ROCm")
and torch.version.hip is None elif torch.version.cuda is not None:
): VLLM_TARGET_DEVICE = "cuda"
# if cuda or hip is not available and VLLM_TARGET_DEVICE is not set, logger.info("Auto-detected CUDA")
# fallback to cpu else:
VLLM_TARGET_DEVICE = "cpu" VLLM_TARGET_DEVICE = "cpu"
def is_sccache_available() -> bool: def is_sccache_available() -> bool:
...@@ -108,20 +108,26 @@ class cmake_build_ext(build_ext): ...@@ -108,20 +108,26 @@ class cmake_build_ext(build_ext):
num_jobs = os.cpu_count() num_jobs = os.cpu_count()
nvcc_threads = None nvcc_threads = None
if _is_cuda() and get_nvcc_cuda_version() >= Version("11.2"): if _is_cuda() and CUDA_HOME is not None:
# `nvcc_threads` is either the value of the NVCC_THREADS try:
# environment variable (if defined) or 1. nvcc_version = get_nvcc_cuda_version()
# when it is set, we reduce `num_jobs` to avoid if nvcc_version >= Version("11.2"):
# overloading the system. # `nvcc_threads` is either the value of the NVCC_THREADS
nvcc_threads = envs.NVCC_THREADS # environment variable (if defined) or 1.
if nvcc_threads is not None: # when it is set, we reduce `num_jobs` to avoid
nvcc_threads = int(nvcc_threads) # overloading the system.
logger.info( nvcc_threads = envs.NVCC_THREADS
"Using NVCC_THREADS=%d as the number of nvcc threads.", nvcc_threads if nvcc_threads is not None:
) nvcc_threads = int(nvcc_threads)
else: logger.info(
nvcc_threads = 1 "Using NVCC_THREADS=%d as the number of nvcc threads.",
num_jobs = max(1, num_jobs // nvcc_threads) nvcc_threads,
)
else:
nvcc_threads = 1
num_jobs = max(1, num_jobs // nvcc_threads)
except Exception as e:
logger.warning("Failed to get NVCC version: %s", e)
return num_jobs, nvcc_threads return num_jobs, nvcc_threads
...@@ -199,9 +205,9 @@ class cmake_build_ext(build_ext): ...@@ -199,9 +205,9 @@ class cmake_build_ext(build_ext):
# Default build tool to whatever cmake picks. # Default build tool to whatever cmake picks.
build_tool = [] build_tool = []
# Make sure we use the nvcc from CUDA_HOME # Make sure we use the nvcc from CUDA_HOME
if _is_cuda(): if _is_cuda() and CUDA_HOME is not None:
cmake_args += [f"-DCMAKE_CUDA_COMPILER={CUDA_HOME}/bin/nvcc"] cmake_args += [f"-DCMAKE_CUDA_COMPILER={CUDA_HOME}/bin/nvcc"]
elif _is_hip(): elif _is_hip() and ROCM_HOME is not None:
cmake_args += [f"-DROCM_PATH={ROCM_HOME}"] cmake_args += [f"-DROCM_PATH={ROCM_HOME}"]
other_cmake_args = os.environ.get("CMAKE_ARGS") other_cmake_args = os.environ.get("CMAKE_ARGS")
...@@ -339,6 +345,89 @@ class precompiled_wheel_utils: ...@@ -339,6 +345,89 @@ class precompiled_wheel_utils:
wheels = json.loads(resp.read().decode("utf-8")) wheels = json.loads(resp.read().decode("utf-8"))
return wheels, repo_url return wheels, repo_url
@staticmethod
def is_rocm_system() -> bool:
"""Detect ROCm without relying on torch (for build environment)."""
if os.getenv("ROCM_PATH"):
return True
if os.path.isdir("/opt/rocm"):
return True
if which("rocminfo") is not None:
return True
try:
import torch
return torch.version.hip is not None
except ImportError:
return False
@staticmethod
def find_local_rocm_wheel() -> str | None:
"""Search for a local vllm wheel in common locations."""
import glob
for pattern in ["/vllm-workspace/dist/vllm-*.whl", "./dist/vllm-*.whl"]:
wheels = glob.glob(pattern)
if wheels:
return sorted(wheels)[-1]
return None
@staticmethod
def fetch_wheel_from_pypi_index(index_url: str, package: str = "vllm") -> str:
"""Fetch the latest wheel URL from a PyPI-style simple index."""
import platform
from html.parser import HTMLParser
from urllib.parse import urljoin
from urllib.request import urlopen
arch = platform.machine()
class WheelLinkParser(HTMLParser):
def __init__(self):
super().__init__()
self.wheels = []
def handle_starttag(self, tag, attrs):
if tag == "a":
for name, value in attrs:
if name == "href" and value.endswith(".whl"):
self.wheels.append(value)
simple_url = f"{index_url.rstrip('/')}/{package}/"
print(f"Fetching wheel list from {simple_url}")
with urlopen(simple_url) as resp:
html = resp.read().decode("utf-8")
parser = WheelLinkParser()
parser.feed(html)
for wheel in reversed(parser.wheels):
if arch in wheel:
if wheel.startswith("http"):
return wheel
return urljoin(simple_url, wheel)
raise ValueError(f"No compatible wheel found for {arch} at {simple_url}")
@staticmethod
def determine_wheel_url_rocm() -> tuple[str, str | None]:
"""Determine the precompiled wheel for ROCm."""
# Search for local wheel first
local_wheel = precompiled_wheel_utils.find_local_rocm_wheel()
if local_wheel is not None:
print(f"Found local ROCm wheel: {local_wheel}")
return local_wheel, None
# Fall back to AMD's PyPI index
index_url = os.getenv(
"VLLM_ROCM_WHEEL_INDEX", "https://pypi.amd.com/vllm-rocm/simple"
)
print(f"Fetching ROCm precompiled wheel from {index_url}")
wheel_url = precompiled_wheel_utils.fetch_wheel_from_pypi_index(index_url)
download_filename = wheel_url.split("/")[-1].split("#")[0]
print(f"Using ROCm precompiled wheel: {wheel_url}")
return wheel_url, download_filename
@staticmethod @staticmethod
def determine_wheel_url() -> tuple[str, str | None]: def determine_wheel_url() -> tuple[str, str | None]:
""" """
...@@ -359,6 +448,11 @@ class precompiled_wheel_utils: ...@@ -359,6 +448,11 @@ class precompiled_wheel_utils:
print(f"Using user-specified precompiled wheel location: {wheel_location}") print(f"Using user-specified precompiled wheel location: {wheel_location}")
return wheel_location, None return wheel_location, None
else: else:
# ROCm: use local wheel or AMD's PyPI index
# TODO: When we have ROCm nightly wheels, we can update this logic.
if precompiled_wheel_utils.is_rocm_system():
return precompiled_wheel_utils.determine_wheel_url_rocm()
import platform import platform
arch = platform.machine() arch = platform.machine()
...@@ -465,6 +559,8 @@ class precompiled_wheel_utils: ...@@ -465,6 +559,8 @@ class precompiled_wheel_utils:
"vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so", "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
"vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so", "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
"vllm/cumem_allocator.abi3.so", "vllm/cumem_allocator.abi3.so",
# ROCm-specific libraries
"vllm/_rocm_C.abi3.so",
] ]
flash_attn_regex = re.compile( flash_attn_regex = re.compile(
...@@ -601,6 +697,8 @@ def get_rocm_version(): ...@@ -601,6 +697,8 @@ def get_rocm_version():
# Get the Rocm version from the ROCM_HOME/bin/librocm-core.so # Get the Rocm version from the ROCM_HOME/bin/librocm-core.so
# see https://github.com/ROCm/rocm-core/blob/d11f5c20d500f729c393680a01fa902ebf92094b/rocm_version.cpp#L21 # see https://github.com/ROCm/rocm-core/blob/d11f5c20d500f729c393680a01fa902ebf92094b/rocm_version.cpp#L21
try: try:
if ROCM_HOME is None:
return None
librocm_core_file = Path(ROCM_HOME) / "lib" / "librocm-core.so" librocm_core_file = Path(ROCM_HOME) / "lib" / "librocm-core.so"
if not librocm_core_file.is_file(): if not librocm_core_file.is_file():
return None return None
...@@ -745,7 +843,9 @@ if _is_hip(): ...@@ -745,7 +843,9 @@ if _is_hip():
if _is_cuda(): if _is_cuda():
ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C")) ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C"))
if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.3"): if envs.VLLM_USE_PRECOMPILED or (
CUDA_HOME and get_nvcc_cuda_version() >= Version("12.3")
):
# FA3 requires CUDA 12.3 or later # FA3 requires CUDA 12.3 or later
ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C")) ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))
# Optional since this doesn't get built (produce an .so file) when # Optional since this doesn't get built (produce an .so file) when
......
...@@ -5,6 +5,30 @@ import pytest ...@@ -5,6 +5,30 @@ import pytest
from vllm.assets.audio import AudioAsset from vllm.assets.audio import AudioAsset
def add_attention_backend(server_args, attention_config):
"""Append attention backend CLI arg if specified.
Args:
server_args: List of server arguments to extend in-place.
attention_config: Dict with 'backend' key, or None.
"""
if attention_config and "backend" in attention_config:
server_args.extend(["--attention-backend", attention_config["backend"]])
@pytest.fixture(scope="module")
def rocm_aiter_fa_attention():
"""Return attention config for transcription/translation tests on ROCm.
On ROCm, audio tests require ROCM_AITER_FA attention backend.
"""
from vllm.platforms import current_platform
if current_platform.is_rocm():
return {"backend": "ROCM_AITER_FA"}
return None
@pytest.fixture @pytest.fixture
def mary_had_lamb(): def mary_had_lamb():
path = AudioAsset("mary_had_lamb").get_local_path() path = AudioAsset("mary_had_lamb").get_local_path()
......
...@@ -254,12 +254,11 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str): ...@@ -254,12 +254,11 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str):
{"role": "system", "content": "you are a helpful assistant"}, {"role": "system", "content": "you are a helpful assistant"},
{"role": "user", "content": "what is 1+1?"}, {"role": "user", "content": "what is 1+1?"},
] ]
# test single completion # test single completion
chat_completion = await client.chat.completions.create( chat_completion = await client.chat.completions.create(
model=model_name, model=model_name,
messages=messages, messages=messages,
max_completion_tokens=10, max_completion_tokens=5,
logprobs=True, logprobs=True,
top_logprobs=5, top_logprobs=5,
) )
...@@ -267,13 +266,14 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str): ...@@ -267,13 +266,14 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str):
assert len(chat_completion.choices) == 1 assert len(chat_completion.choices) == 1
choice = chat_completion.choices[0] choice = chat_completion.choices[0]
assert choice.finish_reason == "length" assert choice.finish_reason == "length"
assert chat_completion.usage == openai.types.CompletionUsage( assert chat_completion.usage == openai.types.CompletionUsage(
completion_tokens=10, prompt_tokens=37, total_tokens=47 completion_tokens=5, prompt_tokens=37, total_tokens=42
) )
message = choice.message message = choice.message
assert message.content is not None and len(message.content) >= 10 assert message.content is not None and len(message.content) >= 5
assert message.role == "assistant" assert message.role == "assistant"
messages.append({"role": "assistant", "content": message.content}) messages.append({"role": "assistant", "content": message.content})
...@@ -282,7 +282,7 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str): ...@@ -282,7 +282,7 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str):
chat_completion = await client.chat.completions.create( chat_completion = await client.chat.completions.create(
model=model_name, model=model_name,
messages=messages, messages=messages,
max_completion_tokens=10, max_completion_tokens=5,
) )
message = chat_completion.choices[0].message message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0 assert message.content is not None and len(message.content) >= 0
......
...@@ -39,6 +39,7 @@ def server(request: pytest.FixtureRequest): ...@@ -39,6 +39,7 @@ def server(request: pytest.FixtureRequest):
"2", "2",
*passed_params, *passed_params,
] ]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server yield remote_server
......
...@@ -504,7 +504,11 @@ async def test_web_search(client: OpenAI, model_name: str): ...@@ -504,7 +504,11 @@ async def test_web_search(client: OpenAI, model_name: str):
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_code_interpreter(client: OpenAI, model_name: str): async def test_code_interpreter(client: OpenAI, model_name: str):
response = await client.responses.create( # Code interpreter may need more time for container init + code execution
timeout_value = client.timeout * 3
client_with_timeout = client.with_options(timeout=timeout_value)
response = await client_with_timeout.responses.create(
model=model_name, model=model_name,
# TODO: Ideally should be able to set max tool calls # TODO: Ideally should be able to set max tool calls
# to prevent multi-turn, but it is not currently supported # to prevent multi-turn, but it is not currently supported
...@@ -868,6 +872,7 @@ async def test_output_messages_enabled(client: OpenAI, model_name: str, server): ...@@ -868,6 +872,7 @@ async def test_output_messages_enabled(client: OpenAI, model_name: str, server):
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.flaky(reruns=3)
async def test_function_call_with_previous_input_messages( async def test_function_call_with_previous_input_messages(
client: OpenAI, model_name: str client: OpenAI, model_name: str
): ):
......
...@@ -93,6 +93,7 @@ async def test_same_response_as_chat_completions(client, tokenizer, messages): ...@@ -93,6 +93,7 @@ async def test_same_response_as_chat_completions(client, tokenizer, messages):
add_generation_prompt=True, add_generation_prompt=True,
enable_thinking=False, # default with Qwen3 enable_thinking=False, # default with Qwen3
) )
for ignore_eos in [True, False]: for ignore_eos in [True, False]:
payload = { payload = {
"model": MODEL_NAME, "model": MODEL_NAME,
...@@ -108,9 +109,8 @@ async def test_same_response_as_chat_completions(client, tokenizer, messages): ...@@ -108,9 +109,8 @@ async def test_same_response_as_chat_completions(client, tokenizer, messages):
} }
generate_resp = await client.post(GEN_ENDPOINT, json=payload) generate_resp = await client.post(GEN_ENDPOINT, json=payload)
generate_data = generate_resp.json() generate_data = generate_resp.json()
generate_res = tokenizer.decode( gen_token_ids = generate_data["choices"][0]["token_ids"]
generate_data["choices"][0]["token_ids"], skip_special_tokens=True generate_res = tokenizer.decode(gen_token_ids, skip_special_tokens=True)
)
payload = { payload = {
"model": MODEL_NAME, "model": MODEL_NAME,
...@@ -119,12 +119,33 @@ async def test_same_response_as_chat_completions(client, tokenizer, messages): ...@@ -119,12 +119,33 @@ async def test_same_response_as_chat_completions(client, tokenizer, messages):
"temperature": 0.0, "temperature": 0.0,
"stream": False, "stream": False,
"ignore_eos": ignore_eos, "ignore_eos": ignore_eos,
"chat_template_kwargs": dict(enable_thinking=False), "chat_template_kwargs": {"enable_thinking": False},
} }
completions_resp = await client.post("/v1/chat/completions", json=payload) completions_resp = await client.post("/v1/chat/completions", json=payload)
completions_data = completions_resp.json() completions_data = completions_resp.json()
completions_res = completions_data["choices"][0]["message"]["content"] completions_res = completions_data["choices"][0]["message"]["content"]
if ignore_eos:
# When ignoring EOS, only compare up to the first EOS token
# Post-EOS generation is undefined and may differ
eos_tokens = {
tokenizer.eos_token_id,
*tokenizer.additional_special_tokens_ids,
}
# Find first EOS in generated tokens
eos_pos = None
for i, tid in enumerate(gen_token_ids):
if tid in eos_tokens:
eos_pos = i
break
if eos_pos is not None:
gen_token_ids_truncated = gen_token_ids[:eos_pos]
generate_res = tokenizer.decode(
gen_token_ids_truncated, skip_special_tokens=True
)
# Truncate completions_res to same length for comparison
completions_res = completions_res[: len(generate_res)]
assert generate_res == completions_res assert generate_res == completions_res
......
...@@ -9,10 +9,16 @@ import time ...@@ -9,10 +9,16 @@ import time
import openai import openai
import pytest import pytest
from vllm.platforms import current_platform
from vllm.utils.network_utils import get_open_port from vllm.utils.network_utils import get_open_port
MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM" MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
# GPU initialization might take take longer
_IS_ROCM = current_platform.is_rocm()
_SERVER_STARTUP_TIMEOUT = 120
_PROCESS_EXIT_TIMEOUT = 15
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_shutdown_on_engine_failure(): async def test_shutdown_on_engine_failure():
...@@ -45,9 +51,11 @@ async def test_shutdown_on_engine_failure(): ...@@ -45,9 +51,11 @@ async def test_shutdown_on_engine_failure():
"2", "2",
"--disable-frontend-multiprocessing", "--disable-frontend-multiprocessing",
], ],
stdout=subprocess.PIPE, # ROCm: Disable stdout/stderr pipe capture. Subprocess hangs when
stderr=subprocess.PIPE, # stdout/stderr pipes are enabled during ROCm GPU initialization.
text=True, stdout=None if _IS_ROCM else subprocess.PIPE,
stderr=None if _IS_ROCM else subprocess.PIPE,
text=None if _IS_ROCM else True,
preexec_fn=lambda: signal.signal(signal.SIGINT, signal.SIG_IGN), preexec_fn=lambda: signal.signal(signal.SIGINT, signal.SIG_IGN),
) )
...@@ -61,7 +69,7 @@ async def test_shutdown_on_engine_failure(): ...@@ -61,7 +69,7 @@ async def test_shutdown_on_engine_failure():
) )
# Poll until server is ready # Poll until server is ready
while time.time() - start_time < 30: while time.time() - start_time < _SERVER_STARTUP_TIMEOUT:
try: try:
await client.completions.create( await client.completions.create(
model=MODEL_NAME, prompt="Hello", max_tokens=1 model=MODEL_NAME, prompt="Hello", max_tokens=1
...@@ -70,14 +78,18 @@ async def test_shutdown_on_engine_failure(): ...@@ -70,14 +78,18 @@ async def test_shutdown_on_engine_failure():
except Exception: except Exception:
time.sleep(0.5) time.sleep(0.5)
if proc.poll() is not None: if proc.poll() is not None:
stdout, stderr = proc.communicate(timeout=1) if _IS_ROCM:
pytest.fail( pytest.fail(f"Server died during startup: {proc.returncode}")
f"Server died during startup. stdout: {stdout}, stderr: {stderr}" else:
) stdout, stderr = proc.communicate(timeout=1)
pytest.fail(
f"Server died during startup. "
f"stdout: {stdout}, stderr: {stderr}"
)
else: else:
proc.terminate() proc.terminate()
proc.wait(timeout=5) proc.wait(timeout=_PROCESS_EXIT_TIMEOUT)
pytest.fail("Server failed to start in 30 seconds") pytest.fail(f"Server failed to start in {_SERVER_STARTUP_TIMEOUT} seconds")
# Kill server to simulate crash # Kill server to simulate crash
proc.terminate() proc.terminate()
...@@ -89,5 +101,5 @@ async def test_shutdown_on_engine_failure(): ...@@ -89,5 +101,5 @@ async def test_shutdown_on_engine_failure():
model=MODEL_NAME, prompt="This should fail", max_tokens=1 model=MODEL_NAME, prompt="This should fail", max_tokens=1
) )
return_code = proc.wait(timeout=5) return_code = proc.wait(timeout=_PROCESS_EXIT_TIMEOUT)
assert return_code is not None assert return_code is not None
...@@ -7,6 +7,7 @@ import json ...@@ -7,6 +7,7 @@ import json
import pytest import pytest
from ...utils import RemoteOpenAIServer from ...utils import RemoteOpenAIServer
from .conftest import add_attention_backend
MISTRAL_FORMAT_ARGS = [ MISTRAL_FORMAT_ARGS = [
"--tokenizer_mode", "--tokenizer_mode",
...@@ -20,12 +21,14 @@ MISTRAL_FORMAT_ARGS = [ ...@@ -20,12 +21,14 @@ MISTRAL_FORMAT_ARGS = [
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("model_name", ["mistralai/Voxtral-Mini-3B-2507"]) @pytest.mark.parametrize("model_name", ["mistralai/Voxtral-Mini-3B-2507"])
async def test_basic_audio(mary_had_lamb, model_name): async def test_basic_audio(mary_had_lamb, model_name, rocm_aiter_fa_attention):
server_args = ["--enforce-eager"] server_args = ["--enforce-eager"]
if model_name.startswith("mistralai"): if model_name.startswith("mistralai"):
server_args += MISTRAL_FORMAT_ARGS server_args += MISTRAL_FORMAT_ARGS
add_attention_backend(server_args, rocm_aiter_fa_attention)
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb. # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
with RemoteOpenAIServer(model_name, server_args) as remote_server: with RemoteOpenAIServer(model_name, server_args) as remote_server:
client = remote_server.get_async_client() client = remote_server.get_async_client()
...@@ -44,8 +47,13 @@ async def test_basic_audio(mary_had_lamb, model_name): ...@@ -44,8 +47,13 @@ async def test_basic_audio(mary_had_lamb, model_name):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_basic_audio_with_lora(mary_had_lamb): async def test_basic_audio_with_lora(mary_had_lamb, rocm_aiter_fa_attention):
"""Ensure STT (transcribe) requests can pass LoRA through to generate.""" """Ensure STT (transcribe) requests can pass LoRA through to generate."""
# ROCm SPECIFIC CONFIGURATION:
# To ensure the test passes on ROCm, we modify the max model length to 512.
# We DO NOT apply this to other platforms to maintain strict upstream parity.
from vllm.platforms import current_platform
model_name = "ibm-granite/granite-speech-3.3-2b" model_name = "ibm-granite/granite-speech-3.3-2b"
lora_model_name = "speech" lora_model_name = "speech"
server_args = [ server_args = [
...@@ -56,11 +64,13 @@ async def test_basic_audio_with_lora(mary_had_lamb): ...@@ -56,11 +64,13 @@ async def test_basic_audio_with_lora(mary_had_lamb):
"--lora-modules", "--lora-modules",
f"{lora_model_name}={model_name}", f"{lora_model_name}={model_name}",
"--max-model-len", "--max-model-len",
"2048", "512" if current_platform.is_rocm() else "2048",
"--max-num-seqs", "--max-num-seqs",
"1", "1",
] ]
add_attention_backend(server_args, rocm_aiter_fa_attention)
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb. # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
with RemoteOpenAIServer(model_name, server_args) as remote_server: with RemoteOpenAIServer(model_name, server_args) as remote_server:
client = remote_server.get_async_client() client = remote_server.get_async_client()
...@@ -79,12 +89,14 @@ async def test_basic_audio_with_lora(mary_had_lamb): ...@@ -79,12 +89,14 @@ async def test_basic_audio_with_lora(mary_had_lamb):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_basic_audio_gemma(foscolo): async def test_basic_audio_gemma(foscolo, rocm_aiter_fa_attention):
# Gemma accuracy on some of the audio samples we use is particularly bad, # Gemma accuracy on some of the audio samples we use is particularly bad,
# hence we use a different one here. WER is evaluated separately. # hence we use a different one here. WER is evaluated separately.
model_name = "google/gemma-3n-E2B-it" model_name = "google/gemma-3n-E2B-it"
server_args = ["--enforce-eager"] server_args = ["--enforce-eager"]
add_attention_backend(server_args, rocm_aiter_fa_attention)
with RemoteOpenAIServer( with RemoteOpenAIServer(
model_name, server_args, max_wait_seconds=480 model_name, server_args, max_wait_seconds=480
) as remote_server: ) as remote_server:
......
...@@ -14,16 +14,26 @@ import pytest_asyncio ...@@ -14,16 +14,26 @@ import pytest_asyncio
import soundfile as sf import soundfile as sf
from ...utils import RemoteOpenAIServer from ...utils import RemoteOpenAIServer
from .conftest import add_attention_backend
SERVER_ARGS = ["--enforce-eager"] SERVER_ARGS = ["--enforce-eager"]
def _get_server_args(attention_config):
"""Get server args with attention backend if specified."""
args = SERVER_ARGS.copy()
add_attention_backend(args, attention_config)
return args
@pytest.fixture( @pytest.fixture(
scope="module", params=["openai/whisper-small", "google/gemma-3n-E2B-it"] scope="module", params=["openai/whisper-small", "google/gemma-3n-E2B-it"]
) )
def server(request): def server(request, rocm_aiter_fa_attention):
# Parametrize over model name # Parametrize over model name
with RemoteOpenAIServer(request.param, SERVER_ARGS) as remote_server: with RemoteOpenAIServer(
request.param, _get_server_args(rocm_aiter_fa_attention)
) as remote_server:
yield remote_server, request.param yield remote_server, request.param
...@@ -35,10 +45,12 @@ async def client_and_model(server): ...@@ -35,10 +45,12 @@ async def client_and_model(server):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_non_asr_model(foscolo): async def test_non_asr_model(foscolo, rocm_aiter_fa_attention):
# text to text model # text to text model
model_name = "JackFram/llama-68m" model_name = "JackFram/llama-68m"
with RemoteOpenAIServer(model_name, SERVER_ARGS) as remote_server: with RemoteOpenAIServer(
model_name, _get_server_args(rocm_aiter_fa_attention)
) as remote_server:
client = remote_server.get_async_client() client = remote_server.get_async_client()
res = await client.audio.translations.create( res = await client.audio.translations.create(
model=model_name, file=foscolo, temperature=0.0 model=model_name, file=foscolo, temperature=0.0
...@@ -49,8 +61,13 @@ async def test_non_asr_model(foscolo): ...@@ -49,8 +61,13 @@ async def test_non_asr_model(foscolo):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_basic_audio_with_lora(mary_had_lamb): async def test_basic_audio_with_lora(mary_had_lamb, rocm_aiter_fa_attention):
"""Ensure STT (translate) requests can pass LoRA through to generate.""" """Ensure STT (translate) requests can pass LoRA through to generate."""
# ROCm SPECIFIC CONFIGURATION:
# To ensure the test passes on ROCm, we modify the max model length to 512.
# We DO NOT apply this to other platforms to maintain strict upstream parity.
from vllm.platforms import current_platform
# NOTE - careful to call this test before the module scoped server # NOTE - careful to call this test before the module scoped server
# fixture, otherwise it'll OOMkill the CI # fixture, otherwise it'll OOMkill the CI
model_name = "ibm-granite/granite-speech-3.3-2b" model_name = "ibm-granite/granite-speech-3.3-2b"
...@@ -63,11 +80,13 @@ async def test_basic_audio_with_lora(mary_had_lamb): ...@@ -63,11 +80,13 @@ async def test_basic_audio_with_lora(mary_had_lamb):
"--lora-modules", "--lora-modules",
f"{lora_model_name}={model_name}", f"{lora_model_name}={model_name}",
"--max-model-len", "--max-model-len",
"2048", "512" if current_platform.is_rocm() else "2048",
"--max-num-seqs", "--max-num-seqs",
"1", "1",
] ]
add_attention_backend(server_args, rocm_aiter_fa_attention)
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb. # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
with RemoteOpenAIServer(model_name, server_args) as remote_server: with RemoteOpenAIServer(model_name, server_args) as remote_server:
client = remote_server.get_async_client() client = remote_server.get_async_client()
......
...@@ -8,6 +8,7 @@ import pytest ...@@ -8,6 +8,7 @@ import pytest
import pytest_asyncio import pytest_asyncio
from vllm.multimodal.utils import encode_video_url, fetch_video from vllm.multimodal.utils import encode_video_url, fetch_video
from vllm.platforms import current_platform
from ...utils import RemoteOpenAIServer from ...utils import RemoteOpenAIServer
...@@ -37,7 +38,16 @@ def server(): ...@@ -37,7 +38,16 @@ def server():
json.dumps({"video": MAXIMUM_VIDEOS}), json.dumps({"video": MAXIMUM_VIDEOS}),
] ]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: # ROCm: Increase timeouts to handle potential network delays and slower
# video processing when downloading multiple videos from external sources
env_overrides = {}
if current_platform.is_rocm():
env_overrides = {
"VLLM_VIDEO_FETCH_TIMEOUT": "120",
"VLLM_ENGINE_ITERATION_TIMEOUT_S": "300",
}
with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_overrides) as remote_server:
yield remote_server yield remote_server
...@@ -287,6 +297,11 @@ async def test_chat_streaming_video( ...@@ -287,6 +297,11 @@ async def test_chat_streaming_video(
@pytest.mark.parametrize( @pytest.mark.parametrize(
"video_urls", [TEST_VIDEO_URLS[:i] for i in range(2, len(TEST_VIDEO_URLS))] "video_urls", [TEST_VIDEO_URLS[:i] for i in range(2, len(TEST_VIDEO_URLS))]
) )
@pytest.mark.flaky(
reruns=2,
reruns_delay=5,
condition=current_platform.is_rocm(),
)
async def test_multi_video_input( async def test_multi_video_input(
client: openai.AsyncOpenAI, model_name: str, video_urls: list[str] client: openai.AsyncOpenAI, model_name: str, video_urls: list[str]
): ):
......
...@@ -10,6 +10,7 @@ from transformers import AutoProcessor ...@@ -10,6 +10,7 @@ from transformers import AutoProcessor
from vllm.multimodal.base import MediaWithBytes from vllm.multimodal.base import MediaWithBytes
from vllm.multimodal.utils import encode_image_url, fetch_image from vllm.multimodal.utils import encode_image_url, fetch_image
from vllm.platforms import current_platform
from ...utils import RemoteOpenAIServer from ...utils import RemoteOpenAIServer
...@@ -43,6 +44,27 @@ EXPECTED_MM_BEAM_SEARCH_RES = [ ...@@ -43,6 +44,27 @@ EXPECTED_MM_BEAM_SEARCH_RES = [
], ],
] ]
EXPECTED_MM_BEAM_SEARCH_RES_ROCM = [
# MultiHeadAttention attn_backend: FLASH_ATTN
# with Triton Attention backend
[
"The image shows a wooden boardwalk leading through a",
"The image shows a wooden boardwalk extending into a",
],
[
"The image shows two parrots perched on",
"The image shows two birds perched on a cur",
],
[
"The image shows a Venn diagram with three over",
"The image contains a Venn diagram with three over",
],
[
"This image displays a gradient of colors ranging from",
"This image displays a gradient of colors transitioning from",
],
]
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(): def server():
...@@ -59,7 +81,16 @@ def server(): ...@@ -59,7 +81,16 @@ def server():
json.dumps({"image": MAXIMUM_IMAGES}), json.dumps({"image": MAXIMUM_IMAGES}),
] ]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: # ROCm: Increase timeouts to handle potential network delays and slower
# video processing when downloading multiple videos from external sources
env_overrides = {}
if current_platform.is_rocm():
env_overrides = {
"VLLM_VIDEO_FETCH_TIMEOUT": "120",
"VLLM_ENGINE_ITERATION_TIMEOUT_S": "300",
}
with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_overrides) as remote_server:
yield remote_server yield remote_server
...@@ -288,9 +319,16 @@ async def test_single_chat_session_image_base64encoded_beamsearch( ...@@ -288,9 +319,16 @@ async def test_single_chat_session_image_base64encoded_beamsearch(
image_idx: int, image_idx: int,
url_encoded_image: dict[str, str], url_encoded_image: dict[str, str],
): ):
# ROCm: Switch expected results based on platform
from vllm.platforms import current_platform
# NOTE: This test also validates that we pass MM data through beam search # NOTE: This test also validates that we pass MM data through beam search
raw_image_url = TEST_IMAGE_ASSETS[image_idx] raw_image_url = TEST_IMAGE_ASSETS[image_idx]
expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx]
if current_platform.is_rocm():
expected_res = EXPECTED_MM_BEAM_SEARCH_RES_ROCM[image_idx]
else:
expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx]
messages = dummy_messages_from_image_url(url_encoded_image[raw_image_url]) messages = dummy_messages_from_image_url(url_encoded_image[raw_image_url])
......
...@@ -33,6 +33,7 @@ def _terratorch_dummy_messages(): ...@@ -33,6 +33,7 @@ def _terratorch_dummy_messages():
] ]
@pytest.mark.asyncio
@pytest.mark.parametrize( @pytest.mark.parametrize(
"model_name", ["ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"] "model_name", ["ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"]
) )
......
...@@ -9,11 +9,6 @@ from vllm import LLM, PoolingParams ...@@ -9,11 +9,6 @@ from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory from vllm.distributed import cleanup_dist_env_and_memory
from vllm.platforms import current_platform from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
MODEL_NAME = "intfloat/multilingual-e5-small" MODEL_NAME = "intfloat/multilingual-e5-small"
PROMPTS = [ PROMPTS = [
...@@ -35,6 +30,12 @@ TOKEN_IDS = [ ...@@ -35,6 +30,12 @@ TOKEN_IDS = [
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def llm(): def llm():
# ROCm: Use FLEX_ATTENTION backend as it's the only attention backend
# that supports encoder-only models on ROCm.
attention_config = None
if current_platform.is_rocm():
attention_config = {"backend": "FLEX_ATTENTION"}
# pytest caches the fixture so we use weakref.proxy to # pytest caches the fixture so we use weakref.proxy to
# enable garbage collection # enable garbage collection
llm = LLM( llm = LLM(
...@@ -44,6 +45,7 @@ def llm(): ...@@ -44,6 +45,7 @@ def llm():
gpu_memory_utilization=0.75, gpu_memory_utilization=0.75,
enforce_eager=True, enforce_eager=True,
seed=0, seed=0,
attention_config=attention_config,
) )
yield weakref.proxy(llm) yield weakref.proxy(llm)
......
...@@ -9,11 +9,6 @@ import pytest_asyncio ...@@ -9,11 +9,6 @@ import pytest_asyncio
from tests.utils import RemoteOpenAIServer from tests.utils import RemoteOpenAIServer
from vllm.platforms import current_platform from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
MODEL_NAME = "sentence-transformers/all-MiniLM-L12-v2" MODEL_NAME = "sentence-transformers/all-MiniLM-L12-v2"
max_model_len = 128 max_model_len = 128
...@@ -44,6 +39,10 @@ def server(): ...@@ -44,6 +39,10 @@ def server():
str(max_model_len), str(max_model_len),
] ]
# ROCm: Use Flex Attention to support encoder-only self-attention.
if current_platform.is_rocm():
args.extend(["--attention-backend", "FLEX_ATTENTION"])
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server yield remote_server
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Pytest configuration for vLLM pooling embed tests."""
import warnings
import torch
from vllm.platforms import current_platform
def pytest_collection_modifyitems(config, items):
"""Configure ROCm-specific settings based on collected tests."""
if not current_platform.is_rocm():
return
# Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
# accuracy issues: https://github.com/vllm-project/vllm/issues/30167
# TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_math_sdp(True)
warnings.warn(
"ROCm: Disabled flash_sdp and mem_efficient_sdp, enabled math_sdp "
"to avoid HuggingFace Transformers accuracy issues",
UserWarning,
stacklevel=1,
)
...@@ -13,11 +13,6 @@ from tests.models.language.pooling_mteb_test.mteb_utils import ( ...@@ -13,11 +13,6 @@ from tests.models.language.pooling_mteb_test.mteb_utils import (
from tests.utils import RemoteOpenAIServer from tests.utils import RemoteOpenAIServer
from vllm.platforms import current_platform from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
os.environ["VLLM_LOGGING_LEVEL"] = "WARNING" os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"
MODEL_NAME = "intfloat/e5-small" MODEL_NAME = "intfloat/e5-small"
...@@ -28,6 +23,10 @@ MAIN_SCORE = 0.7422994752439667 ...@@ -28,6 +23,10 @@ MAIN_SCORE = 0.7422994752439667
def server(): def server():
args = ["--runner", "pooling", "--enforce-eager", "--disable-uvicorn-access-log"] args = ["--runner", "pooling", "--enforce-eager", "--disable-uvicorn-access-log"]
# ROCm: Use Flex Attention to support encoder-only self-attention.
if current_platform.is_rocm():
args.extend(["--attention-backend", "FLEX_ATTENTION"])
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server yield remote_server
......
...@@ -11,11 +11,6 @@ from vllm import LLM, PoolingParams ...@@ -11,11 +11,6 @@ from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory from vllm.distributed import cleanup_dist_env_and_memory
from vllm.platforms import current_platform from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
MODEL_NAME = "intfloat/multilingual-e5-small" MODEL_NAME = "intfloat/multilingual-e5-small"
prompts = ["The chef prepared a delicious meal."] prompts = ["The chef prepared a delicious meal."]
...@@ -23,6 +18,12 @@ prompts = ["The chef prepared a delicious meal."] ...@@ -23,6 +18,12 @@ prompts = ["The chef prepared a delicious meal."]
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def llm(): def llm():
# ROCm: Use FLEX_ATTENTION backend as it's the only attention backend
# that supports encoder-only models on ROCm.
attention_config = None
if current_platform.is_rocm():
attention_config = {"backend": "FLEX_ATTENTION"}
# pytest caches the fixture so we use weakref.proxy to # pytest caches the fixture so we use weakref.proxy to
# enable garbage collection # enable garbage collection
llm = LLM( llm = LLM(
...@@ -32,6 +33,7 @@ def llm(): ...@@ -32,6 +33,7 @@ def llm():
gpu_memory_utilization=0.75, gpu_memory_utilization=0.75,
enforce_eager=True, enforce_eager=True,
seed=0, seed=0,
attention_config=attention_config,
) )
yield weakref.proxy(llm) yield weakref.proxy(llm)
......
...@@ -28,16 +28,20 @@ from vllm.utils.serial_utils import ( ...@@ -28,16 +28,20 @@ from vllm.utils.serial_utils import (
decode_pooling_output, decode_pooling_output,
) )
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
MODEL_NAME = "intfloat/multilingual-e5-small" MODEL_NAME = "intfloat/multilingual-e5-small"
DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501 DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501
DTYPE = "bfloat16" DTYPE = "bfloat16"
if current_platform.is_rocm():
# Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
# accuracy issues: https://github.com/vllm-project/vllm/issues/30167
# TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_math_sdp(True)
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(): def server():
args = [ args = [
...@@ -53,6 +57,10 @@ def server(): ...@@ -53,6 +57,10 @@ def server():
DUMMY_CHAT_TEMPLATE, DUMMY_CHAT_TEMPLATE,
] ]
# ROCm: Use Flex Attention to support encoder-only self-attention.
if current_platform.is_rocm():
args.extend(["--attention-backend", "FLEX_ATTENTION"])
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server yield remote_server
......
...@@ -14,11 +14,6 @@ from tests.utils import RemoteOpenAIServer ...@@ -14,11 +14,6 @@ from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
from vllm.platforms import current_platform from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
MODELS = [ MODELS = [
EmbedModelInfo("intfloat/multilingual-e5-small", is_matryoshka=False), EmbedModelInfo("intfloat/multilingual-e5-small", is_matryoshka=False),
EmbedModelInfo( EmbedModelInfo(
...@@ -62,6 +57,10 @@ def server(model_info, dtype: str): ...@@ -62,6 +57,10 @@ def server(model_info, dtype: str):
["--trust_remote_code", "--hf_overrides", '{"matryoshka_dimensions":[256]}'] ["--trust_remote_code", "--hf_overrides", '{"matryoshka_dimensions":[256]}']
) )
# ROCm: Use Flex Attention to support encoder-only self-attention.
if current_platform.is_rocm():
args.extend(["--attention-backend", "FLEX_ATTENTION"])
with RemoteOpenAIServer(model_info.name, args) as remote_server: with RemoteOpenAIServer(model_info.name, args) as remote_server:
yield remote_server yield remote_server
......
...@@ -18,11 +18,6 @@ from tests.utils import RemoteOpenAIServer ...@@ -18,11 +18,6 @@ from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
from vllm.platforms import current_platform from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
def _generate_random_text(word_count: int) -> str: def _generate_random_text(word_count: int) -> str:
"""Generate random text with approximately the specified word count.""" """Generate random text with approximately the specified word count."""
...@@ -228,6 +223,10 @@ def server_with_chunked_processing(): ...@@ -228,6 +223,10 @@ def server_with_chunked_processing():
"0.8", "0.8",
] ]
# ROCm: Use Flex Attention to support encoder-only self-attention.
if current_platform.is_rocm():
args.extend(["--attention-backend", "FLEX_ATTENTION"])
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server yield remote_server
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment