[CI/Build] Test torchrun with 8 cards (#27548)

Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>

[CI/Build] Test torchrun with 8 cards (#27548)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
f7a66828 · 22quinn · GitHub · a9fe0793 · f7a66828 · f7a66828
Unverified Commit f7a66828 authored Oct 29, 2025 by 22quinn Committed by GitHub Oct 29, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 94 additions and 10 deletions

.buildkite/test-pipeline.yaml .buildkite/test-pipeline.yaml +20 -2

examples/offline_inference/torchrun_dp_example.py examples/offline_inference/torchrun_dp_example.py +74 -8

No files found.
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -205,6 +205,24 @@ steps:
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
  - popd
+- label: Distributed Tests (8 GPUs) # 4min
+  timeout_in_minutes: 10
+  gpu: h100
+  num_gpus: 8
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - examples/offline_inference/torchrun_dp_example.py
+  - vllm/config/parallel.py
+  - vllm/distributed/
+  - vllm/v1/engine/llm_engine.py
+  - vllm/v1/executor/uniproc_executor.py
+  - vllm/v1/worker/gpu_worker.py
+  commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
+  # test with torchrun tp=2 and dp=4 with ep
+  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
 - label: EPLB Algorithm Test # 5min
  timeout_in_minutes: 15
  working_dir: "/vllm-workspace/tests"
@@ -401,7 +419,7 @@ steps:
      --ignore=lora/test_deepseekv2_tp.py \
      --ignore=lora/test_gptoss.py \
      --ignore=lora/test_qwen3moe_tp.py
  parallelism: 4
 - label: PyTorch Compilation Unit Tests # 15min
@@ -1126,7 +1144,7 @@ steps:
  - tests/weight_loading
  commands:
    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
 - label: NixlConnector PD accuracy tests (Distributed) # 30min
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/tests"

--- a/examples/offline_inference/torchrun_dp_example.py
+++ b/examples/offline_inference/torchrun_dp_example.py
@@ -9,10 +9,76 @@ To run this example:
 ```bash
 $ torchrun --nproc-per-node=2 examples/offline_inference/torchrun_dp_example.py
 ```
+With custom parallelism settings:
+```bash
+$ torchrun --nproc-per-node=8 examples/offline_inference/torchrun_dp_example.py \
+    --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
+```
 """
+import argparse
 from vllm import LLM, SamplingParams
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Data-parallel inference with torchrun"
+    )
+    parser.add_argument(
+        "--tp-size",
+        type=int,
+        default=1,
+        help="Tensor parallel size (default: 1)",
+    )
+    parser.add_argument(
+        "--pp-size",
+        type=int,
+        default=1,
+        help="Pipeline parallel size (default: 1)",
+    )
+    parser.add_argument(
+        "--dp-size",
+        type=int,
+        default=2,
+        help="Data parallel size (default: 2)",
+    )
+    parser.add_argument(
+        "--enable-ep",
+        action="store_true",
+        help="Enable expert parallel (default: False)",
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="microsoft/Phi-mini-MoE-instruct",
+        help="Model name or path (default: microsoft/Phi-mini-MoE-instruct)",
+    )
+    parser.add_argument(
+        "--max-model-len",
+        type=int,
+        default=4096,
+        help="Maximum model length (default: 4096)",
+    )
+    parser.add_argument(
+        "--gpu-memory-utilization",
+        type=float,
+        default=0.6,
+        help="GPU memory utilization (default: 0.6)",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=1,
+        help="Random seed (default: 1)",
+    )
+    return parser.parse_args()
+args = parse_args()
 # Create prompts, the same across all ranks
 prompts = [
    "Hello, my name is",
@@ -30,15 +96,15 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 # all ranks have the same random seed, so that sampling can be
 # deterministic across ranks.
 llm = LLM(
-    model="microsoft/Phi-mini-MoE-instruct",
+    model=args.model,
-    tensor_parallel_size=1,
+    tensor_parallel_size=args.tp_size,
-    data_parallel_size=2,
+    data_parallel_size=args.dp_size,
-    pipeline_parallel_size=1,
+    pipeline_parallel_size=args.pp_size,
-    enable_expert_parallel=False,
+    enable_expert_parallel=args.enable_ep,
    distributed_executor_backend="external_launcher",
-    max_model_len=4096,
+    max_model_len=args.max_model_len,
-    gpu_memory_utilization=0.6,
+    gpu_memory_utilization=args.gpu_memory_utilization,
-    seed=1,
+    seed=args.seed,
 )
 dp_rank = llm.llm_engine.vllm_config.parallel_config.data_parallel_rank