Unverified commit d6b61e51, authored by Aaron Hao, committed by GitHub
Browse files

[BUG] Fix async rlhf tests (#35811)


Signed-off-by: ahao-anyscale <ahao@anyscale.com>
parent cf632499
...@@ -149,7 +149,7 @@ steps: ...@@ -149,7 +149,7 @@ steps:
num_devices: 2 num_devices: 2
commands: commands:
- pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py
- # - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py --- failing, need to re-enable
+ - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py
- VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
- pytest -v -s tests/v1/distributed/test_dbo.py - pytest -v -s tests/v1/distributed/test_dbo.py
......
...@@ -1006,6 +1006,10 @@ class Worker(WorkerBase): ...@@ -1006,6 +1006,10 @@ class Worker(WorkerBase):
load_weights=load_weights_direct, load_weights=load_weights_direct,
) )
# NCCL broadcast/packed path are asynchronous.
# Sync here so the next step uses the new weights.
torch.accelerator.synchronize()
def shutdown(self) -> None: def shutdown(self) -> None:
# has_kv_transfer_group can be None during interpreter shutdown. # has_kv_transfer_group can be None during interpreter shutdown.
if ensure_kv_transfer_shutdown is not None: if ensure_kv_transfer_shutdown is not None:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment