Unverified commit fc168c33, authored by Zhewen Li, committed by GitHub
Browse files

[CI/Build] Fix test_torch_utils in AMD CI (#27317)


Signed-off-by: zhewenli <zhewenli@meta.com>
parent acc78aeb
...@@ -50,7 +50,7 @@ steps: ...@@ -50,7 +50,7 @@ steps:
- label: Async Engine, Inputs, Utils, Worker Test # 36min - label: Async Engine, Inputs, Utils, Worker Test # 36min
timeout_in_minutes: 50 timeout_in_minutes: 50
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1 agent_pool: mi325_1
# grade: Blocking # grade: Blocking
source_file_dependencies: source_file_dependencies:
......
...@@ -60,15 +60,10 @@ def test_common_broadcastable_dtype(dtypes, expected_result): ...@@ -60,15 +60,10 @@ def test_common_broadcastable_dtype(dtypes, expected_result):
assert common_broadcastable_dtype(dtypes) == expected_result assert common_broadcastable_dtype(dtypes) == expected_result
def test_current_stream_multithread(): def _test_stream_thread(main_expected_stream: torch.cuda.Stream):
import threading import threading
if not torch.cuda.is_available():
pytest.skip("CUDA not available")
main_default_stream = torch.cuda.current_stream()
child_stream = torch.cuda.Stream() child_stream = torch.cuda.Stream()
thread_stream_ready = threading.Event() thread_stream_ready = threading.Event()
thread_can_exit = threading.Event() thread_can_exit = threading.Event()
...@@ -90,15 +85,44 @@ def test_current_stream_multithread(): ...@@ -90,15 +85,44 @@ def test_current_stream_multithread():
assert main_current_stream != child_stream, ( assert main_current_stream != child_stream, (
"Main thread's current_stream was contaminated by child thread" "Main thread's current_stream was contaminated by child thread"
) )
assert main_current_stream == main_default_stream, ( assert main_current_stream == main_expected_stream, (
"Main thread's current_stream is not the default stream" f"Main thread's stream changed unexpectedly. "
f"Expected {main_expected_stream}, got {main_current_stream}"
) )
# Notify child thread it can exit
thread_can_exit.set() thread_can_exit.set()
finally: finally:
# Ensure child thread exits properly
child_thread.join(timeout=5) child_thread.join(timeout=5)
if child_thread.is_alive(): if child_thread.is_alive():
pytest.fail("Child thread failed to exit properly") pytest.fail("Child thread failed to exit properly")
def test_current_stream_multithread():
    """Check which stream `current_stream()` reports, then run the thread check.

    The test is skipped when `torch.cuda.is_available()` is false.

    On non-ROCm platforms the first `current_stream()` call must equal
    `torch.cuda.default_stream()`. On ROCm it must report a stream whose
    `cuda_stream` handle is non-zero, and a second call must return an
    equal stream. In both cases the expected main-thread stream is then
    passed to `_test_stream_thread`.
    """
    from vllm.platforms import current_platform

    if not torch.cuda.is_available():
        pytest.skip("CUDA not available")

    if not current_platform.is_rocm():
        expected_default = torch.cuda.default_stream()
        first_seen = current_stream()

        assert first_seen == expected_default, (
            "First call to current_stream should return default stream on CUDA"
        )

        _test_stream_thread(expected_default)
        return

    # ROCm path: a handle of 0 would mean the default stream was returned.
    dedicated = current_stream()
    assert dedicated.cuda_stream != 0, (
        "ROCm should create a dedicated stream, not use default stream (0x0)"
    )

    # Asking again must not hand back a different stream.
    repeated = current_stream()
    assert repeated == dedicated, (
        "Multiple calls to current_stream should return the same dedicated stream"
    )

    _test_stream_thread(dedicated)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment