"...git@developer.sourcefind.cn:2222/OpenDAS/vllm_cscc.git" did not exist on "d3c04b6a39df016504c28ec3fc27ea58ca802a28"
Unverified commit 6afa587d, authored by Andreas Karatzas, committed by GitHub
Browse files

[ROCm][CI] Fix serving tokens test failures (#34047)


Signed-off-by: Andreas Karatzas <akaratza@amd.com>
parent 94ed6cf6
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import httpx
import pytest
import pytest_asyncio
......@@ -46,6 +48,27 @@ def server(request):
"--max-model-len",
"1024",
"--enforce-eager",
# On ROCm (e.g. MI355X/gfx950), bf16 GEMM results can differ by
# 1 ULP when the batch dimension (M) changes, because different M
# values cause the Tensile backend to select different tile
# configurations with different fp32 accumulation orders. With
# prefix caching, cache-miss prefills compute all tokens in one
# pass (large M) while cache-hit requests compute only the
# uncached suffix (small M), seeding a divergence that amplifies
# through the residual stream and flips argmax tokens.
# See: https://github.com/vllm-project/vllm/issues/33123
#
# Either disable prefix caching entirely, or enable it together with
# --deterministic-prefix-caching, which forces cache-miss prefills
# to split at block boundaries so that the suffix GEMM shape is
# identical regardless of cache state.
#
# Option A: disable prefix caching
"--no-enable-prefix-caching",
#
# Option B: deterministic prefix caching
# "--enable-prefix-caching",
# "--deterministic-prefix-caching",
]
extra_args = getattr(request, "param", None)
......@@ -56,7 +79,11 @@ def server(request):
else [str(extra_args)]
)
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
envs = os.environ.copy()
# See: https://github.com/vllm-project/vllm/pull/33493#issuecomment-3888060787
envs["VLLM_ROCM_USE_SKINNY_GEMM"] = "0"
with RemoteOpenAIServer(MODEL_NAME, args, env_dict=envs) as remote_server:
yield remote_server
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment