[ci][distributed] add tests for custom allreduce (#5689)

d571ca01 · youkaichao · GitHub · afed90a0 · d571ca01 · d571ca01
Unverified commit d571ca01, authored Jun 19, 2024 by youkaichao; committed by GitHub on Jun 19, 2024.
Show whitespace changes
Inline Side-by-side

Showing 2 changed files with 10 additions and 5 deletions

.buildkite/test-pipeline.yaml +6 -2

tests/distributed/test_custom_all_reduce.py +4 -3

No files found.
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -182,7 +182,11 @@ steps:
  - pip install -r requirements-docs.txt
  - SPHINXOPTS=\"-W\" make html
-- label: A100 status
+- label: Distributed Tests (A100)
  gpu: a100
  commands: 
-  - nvidia-smi
+  # NOTE: don't test llama model here, it seems hf implementation is buggy
+  # see https://github.com/vllm-project/vllm/pull/5689 for details
+  - pytest -v -s distributed/test_custom_all_reduce.py
+  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
+  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
--- a/tests/distributed/test_custom_all_reduce.py
+++ b/tests/distributed/test_custom_all_reduce.py
@@ -11,7 +11,8 @@ from vllm.distributed.communication_op import (  # noqa
 from vllm.distributed.parallel_state import (get_tensor_model_parallel_group,
                                             get_tp_group, graph_capture)
-from ..utils import (init_test_distributed_environment,
+from ..utils import (ensure_model_parallel_initialized,
+                     init_test_distributed_environment,
                     multi_process_tensor_parallel)
 random.seed(42)
@@ -27,8 +28,8 @@ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
    torch.cuda.set_device(device)
    init_test_distributed_environment(tp_size, pp_size, rank,
                                      distributed_init_port)
+    ensure_model_parallel_initialized(tp_size, pp_size)
-    group = get_tensor_model_parallel_group()
+    group = get_tensor_model_parallel_group().device_group
    # A small all_reduce for warmup.
    # this is needed because device communicators might be created lazily