Unverified commit 6afa587d, authored by Andreas Karatzas and committed by GitHub
Browse files

[ROCm][CI] Fix serving tokens test failures (#34047)


Signed-off-by: Andreas Karatzas <akaratza@amd.com>
parent 94ed6cf6
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import httpx import httpx
import pytest import pytest
import pytest_asyncio import pytest_asyncio
...@@ -46,6 +48,27 @@ def server(request): ...@@ -46,6 +48,27 @@ def server(request):
"--max-model-len", "--max-model-len",
"1024", "1024",
"--enforce-eager", "--enforce-eager",
# On ROCm (e.g. MI355X/gfx950), bf16 GEMM results can differ by
# 1 ULP when the batch dimension (M) changes, because different M
# values cause the Tensile backend to select different tile
# configurations with different fp32 accumulation orders. With
# prefix caching, cache-miss prefills compute all tokens in one
# pass (large M) while cache-hit requests compute only the
# uncached suffix (small M), seeding a divergence that amplifies
# through the residual stream and flips argmax tokens.
# See: https://github.com/vllm-project/vllm/issues/33123
#
# Either disable prefix caching entirely, or enable it with
# --deterministic-prefix-caching which forces cache-miss prefills
# to split at block boundaries so the suffix GEMM shape is always
# identical regardless of cache state.
#
# Option A: disable prefix caching
"--no-enable-prefix-caching",
#
# Option B: deterministic prefix caching
# "--enable-prefix-caching",
# "--deterministic-prefix-caching",
] ]
extra_args = getattr(request, "param", None) extra_args = getattr(request, "param", None)
...@@ -56,7 +79,11 @@ def server(request): ...@@ -56,7 +79,11 @@ def server(request):
else [str(extra_args)] else [str(extra_args)]
) )
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: envs = os.environ.copy()
# See: https://github.com/vllm-project/vllm/pull/33493#issuecomment-3888060787
envs["VLLM_ROCM_USE_SKINNY_GEMM"] = "0"
with RemoteOpenAIServer(MODEL_NAME, args, env_dict=envs) as remote_server:
yield remote_server yield remote_server
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment