Unverified commit 4ff9b045, authored by Micah Williamson and committed by GitHub
Browse files

[ROCm][CI] Prep Tests For Change To ROCM_ATTN As New Default Backend On ROCm (#36025)


Signed-off-by: Micah Williamson <micah.williamson@amd.com>
parent 3fd03f1e
...@@ -13,9 +13,10 @@ import os ...@@ -13,9 +13,10 @@ import os
from contextlib import contextmanager from contextlib import contextmanager
import lm_eval import lm_eval
import numpy as np
import yaml import yaml
from vllm.platforms import current_platform
DEFAULT_RTOL = 0.08 DEFAULT_RTOL = 0.08
...@@ -63,6 +64,9 @@ def launch_lm_eval(eval_config, tp_size): ...@@ -63,6 +64,9 @@ def launch_lm_eval(eval_config, tp_size):
"allow_deprecated_quantization=True," "allow_deprecated_quantization=True,"
) )
if current_platform.is_rocm() and "Nemotron-3" in eval_config["model_name"]:
model_args += "attention_backend=TRITON_ATTN"
env_vars = eval_config.get("env_vars", None) env_vars = eval_config.get("env_vars", None)
with scoped_env_vars(env_vars): with scoped_env_vars(env_vars):
results = lm_eval.simple_evaluate( results = lm_eval.simple_evaluate(
...@@ -102,6 +106,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size): ...@@ -102,6 +106,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
f"ground_truth={ground_truth:.3f} | " f"ground_truth={ground_truth:.3f} | "
f"measured={measured_value:.3f} | rtol={rtol}" f"measured={measured_value:.3f} | rtol={rtol}"
) )
success = success and np.isclose(ground_truth, measured_value, rtol=rtol)
min_acceptable = ground_truth * (1 - rtol)
success = success and measured_value >= min_acceptable
assert success assert success
...@@ -24,7 +24,7 @@ if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH: ...@@ -24,7 +24,7 @@ if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:
BACKENDS=("allgather_reducescatter") BACKENDS=("allgather_reducescatter")
# Disable MOE padding for ROCm since it is causing eplb to fail # Disable MOE padding for ROCm since it is causing eplb to fail
export VLLM_ROCM_MOE_PADDING=0 export VLLM_ROCM_MOE_PADDING=0
PLATFORM_ARGS=("--no-async-scheduling") PLATFORM_ARGS=("--no-async-scheduling" "--attention-backend=TRITON_ATTN")
echo "Disabled async scheduling for ROCm platform due to issues with spec decode." echo "Disabled async scheduling for ROCm platform due to issues with spec decode."
else else
# Non-ROCm platform (CUDA/other) # Non-ROCm platform (CUDA/other)
......
...@@ -529,7 +529,7 @@ steps: ...@@ -529,7 +529,7 @@ steps:
commands: commands:
- pip install tensorizer # for tensorizer test - pip install tensorizer # for tensorizer test
# for basic # for basic
- python3 basic/offline_inference/chat.py - python3 basic/offline_inference/chat.py --attention-backend TRITON_ATTN
- python3 basic/offline_inference/generate.py --model facebook/opt-125m - python3 basic/offline_inference/generate.py --model facebook/opt-125m
- python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 - python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- python3 basic/offline_inference/classify.py - python3 basic/offline_inference/classify.py
...@@ -2208,7 +2208,7 @@ steps: ...@@ -2208,7 +2208,7 @@ steps:
commands: commands:
- pip install tensorizer # for tensorizer test - pip install tensorizer # for tensorizer test
# for basic # for basic
- python3 basic/offline_inference/chat.py - python3 basic/offline_inference/chat.py --attention-backend TRITON_ATTN
- python3 basic/offline_inference/generate.py --model facebook/opt-125m - python3 basic/offline_inference/generate.py --model facebook/opt-125m
- python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 - python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- python3 basic/offline_inference/classify.py - python3 basic/offline_inference/classify.py
......
...@@ -15,6 +15,7 @@ from vllm.model_executor.model_loader.tensorizer import ( ...@@ -15,6 +15,7 @@ from vllm.model_executor.model_loader.tensorizer import (
tensorize_lora_adapter, tensorize_lora_adapter,
tensorize_vllm_model, tensorize_vllm_model,
) )
from vllm.platforms import current_platform
from ...utils import RemoteOpenAIServer from ...utils import RemoteOpenAIServer
...@@ -74,6 +75,8 @@ def server(model_uri, tensorize_model_and_lora): ...@@ -74,6 +75,8 @@ def server(model_uri, tensorize_model_and_lora):
MODEL_NAME, MODEL_NAME,
"--enable-lora", "--enable-lora",
] ]
if current_platform.is_rocm():
args += ["--attention-backend", "TRITON_ATTN"]
model_dir = os.path.dirname(model_uri) model_dir = os.path.dirname(model_uri)
with RemoteOpenAIServer(model_dir, args) as remote_server: with RemoteOpenAIServer(model_dir, args) as remote_server:
......
...@@ -8,6 +8,7 @@ from tests.models.utils import ( ...@@ -8,6 +8,7 @@ from tests.models.utils import (
EmbedModelInfo, EmbedModelInfo,
RerankModelInfo, RerankModelInfo,
) )
from vllm.platforms import current_platform
from .mteb_embed_utils import mteb_test_embed_models from .mteb_embed_utils import mteb_test_embed_models
from .mteb_score_utils import mteb_test_rerank_models from .mteb_score_utils import mteb_test_rerank_models
...@@ -142,4 +143,9 @@ def test_embed_models_correctness( ...@@ -142,4 +143,9 @@ def test_embed_models_correctness(
@pytest.mark.parametrize("model_info", RERANK_MODELS) @pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None: def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
mteb_test_rerank_models(vllm_runner, model_info) vllm_extra_kwargs = {}
if current_platform.is_rocm():
vllm_extra_kwargs["attention_backend"] = "TRITON_ATTN"
mteb_test_rerank_models(
vllm_runner, model_info, vllm_extra_kwargs=vllm_extra_kwargs
)
...@@ -173,6 +173,9 @@ VLM_TEST_SETTINGS = { ...@@ -173,6 +173,9 @@ VLM_TEST_SETTINGS = {
marks=[ marks=[
pytest.mark.core_model, pytest.mark.core_model,
], ],
vllm_runner_kwargs={"attention_backend": "TRITON_ATTN"}
if current_platform.is_rocm()
else {},
), ),
"ultravox": VLMTestInfo( "ultravox": VLMTestInfo(
models=["fixie-ai/ultravox-v0_5-llama-3_2-1b"], models=["fixie-ai/ultravox-v0_5-llama-3_2-1b"],
......
...@@ -13,6 +13,7 @@ import pytest ...@@ -13,6 +13,7 @@ import pytest
import torch import torch
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.platforms import current_platform
@pytest.mark.skip(reason="In V1, we reject tokens > max_seq_len") @pytest.mark.skip(reason="In V1, we reject tokens > max_seq_len")
...@@ -65,7 +66,8 @@ def test_model_from_modelscope(monkeypatch: pytest.MonkeyPatch): ...@@ -65,7 +66,8 @@ def test_model_from_modelscope(monkeypatch: pytest.MonkeyPatch):
# Don't use HF_TOKEN for ModelScope repos, otherwise it will fail # Don't use HF_TOKEN for ModelScope repos, otherwise it will fail
# with 400 Client Error: Bad Request. # with 400 Client Error: Bad Request.
m.setenv("HF_TOKEN", "") m.setenv("HF_TOKEN", "")
llm = LLM(model="qwen/Qwen1.5-0.5B-Chat") attn_backend = "TRITON_ATTN" if current_platform.is_rocm() else "auto"
llm = LLM(model="qwen/Qwen1.5-0.5B-Chat", attention_backend=attn_backend)
prompts = [ prompts = [
"Hello, my name is", "Hello, my name is",
......
...@@ -91,6 +91,7 @@ def test_kv_sharing_fast_prefill( ...@@ -91,6 +91,7 @@ def test_kv_sharing_fast_prefill(
compilation_config=compilation_config, compilation_config=compilation_config,
seed=SEED, seed=SEED,
kv_sharing_fast_prefill=kv_sharing_fast_prefill, kv_sharing_fast_prefill=kv_sharing_fast_prefill,
attention_backend="TRITON_ATTN",
) )
responses = llm.generate(prompts, sampling_params) responses = llm.generate(prompts, sampling_params)
check_answers( check_answers(
......
...@@ -732,11 +732,13 @@ def test_mtp_correctness( ...@@ -732,11 +732,13 @@ def test_mtp_correctness(
method, model_name, tp_size = model_setup method, model_name, tp_size = model_setup
_skip_if_insufficient_gpus_for_tp(tp_size) _skip_if_insufficient_gpus_for_tp(tp_size)
attn_backend = "TRITON_ATTN" if current_platform.is_rocm() else "auto"
ref_llm = LLM( ref_llm = LLM(
model=model_name, model=model_name,
max_model_len=2048, max_model_len=2048,
tensor_parallel_size=tp_size, tensor_parallel_size=tp_size,
trust_remote_code=True, trust_remote_code=True,
attention_backend=attn_backend,
) )
ref_outputs = ref_llm.chat(test_prompts, sampling_config) ref_outputs = ref_llm.chat(test_prompts, sampling_config)
evaluate_llm_for_gsm8k( evaluate_llm_for_gsm8k(
...@@ -756,6 +758,7 @@ def test_mtp_correctness( ...@@ -756,6 +758,7 @@ def test_mtp_correctness(
"max_model_len": 2048, "max_model_len": 2048,
}, },
max_model_len=2048, max_model_len=2048,
attention_backend=attn_backend,
) )
evaluate_llm_for_gsm8k( evaluate_llm_for_gsm8k(
spec_llm, expected_accuracy_threshold=expected_accuracy_threshold spec_llm, expected_accuracy_threshold=expected_accuracy_threshold
......
...@@ -42,9 +42,7 @@ SAMPLE_PROMPT = BatchLogprobsComposition.SAMPLE_PROMPT ...@@ -42,9 +42,7 @@ SAMPLE_PROMPT = BatchLogprobsComposition.SAMPLE_PROMPT
# Force LLM instances into an identical, deterministic execution # Force LLM instances into an identical, deterministic execution
# mode so the test isolates spec-decode correctness only: # mode so the test isolates spec-decode correctness only:
ROCM_DETERMINISM_KWARGS: dict = ( ROCM_DETERMINISM_KWARGS: dict = (
dict( dict(max_num_seqs=1, attention_backend="TRITON_ATTN")
max_num_seqs=1,
)
if current_platform.is_rocm() if current_platform.is_rocm()
else {} else {}
) )
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment