Unverified commit 1e1c0678, authored by Bradley D, committed by GitHub
Browse files

[ci][amd] fix EPLB execution test (#28742)


Signed-off-by: Bradley Davis <bradleyhd@meta.com>
parent 7218f839
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import multiprocessing
import os import os
import random import random
import pytest import pytest
import torch import torch
import torch.distributed import torch.distributed
import torch.multiprocessing as mp
from vllm.distributed.eplb.rebalance_execute import rearrange_expert_weights_inplace from vllm.distributed.eplb.rebalance_execute import rearrange_expert_weights_inplace
from vllm.distributed.parallel_state import ( from vllm.distributed.parallel_state import (
...@@ -17,10 +17,12 @@ from vllm.distributed.parallel_state import ( ...@@ -17,10 +17,12 @@ from vllm.distributed.parallel_state import (
) )
from vllm.utils.system_utils import update_environment_variables from vllm.utils.system_utils import update_environment_variables
mp.set_start_method("spawn", force=True)
def distributed_run(fn, world_size):
def distributed_run(fn, world_size, *args):
number_of_processes = world_size number_of_processes = world_size
processes: list[multiprocessing.Process] = [] processes: list[mp.Process] = []
for i in range(number_of_processes): for i in range(number_of_processes):
env: dict[str, str] = {} env: dict[str, str] = {}
env["RANK"] = str(i) env["RANK"] = str(i)
...@@ -29,7 +31,7 @@ def distributed_run(fn, world_size): ...@@ -29,7 +31,7 @@ def distributed_run(fn, world_size):
env["LOCAL_WORLD_SIZE"] = str(number_of_processes) env["LOCAL_WORLD_SIZE"] = str(number_of_processes)
env["MASTER_ADDR"] = "localhost" env["MASTER_ADDR"] = "localhost"
env["MASTER_PORT"] = "12345" env["MASTER_PORT"] = "12345"
p = multiprocessing.Process(target=fn, args=(env,)) p = mp.Process(target=fn, args=(env, world_size, *args))
processes.append(p) processes.append(p)
p.start() p.start()
...@@ -40,11 +42,7 @@ def distributed_run(fn, world_size): ...@@ -40,11 +42,7 @@ def distributed_run(fn, world_size):
assert p.exitcode == 0 assert p.exitcode == 0
def worker_fn_wrapper(fn): def set_env_vars_and_device(env: dict[str, str]) -> None:
# `multiprocessing.Process` cannot accept environment variables directly
# so we need to pass the environment variables as arguments
# and update the environment variables in the function
def wrapped_fn(env):
update_environment_variables(env) update_environment_variables(env)
local_rank = os.environ["LOCAL_RANK"] local_rank = os.environ["LOCAL_RANK"]
device = torch.device(f"cuda:{local_rank}") device = torch.device(f"cuda:{local_rank}")
...@@ -55,10 +53,6 @@ def worker_fn_wrapper(fn): ...@@ -55,10 +53,6 @@ def worker_fn_wrapper(fn):
random.seed(42) random.seed(42)
torch.manual_seed(42) torch.manual_seed(42)
fn()
return wrapped_fn
def create_expert_indices_with_redundancy( def create_expert_indices_with_redundancy(
num_layers: int, num_layers: int,
...@@ -275,41 +269,12 @@ def verify_redundant_experts_have_same_weights( ...@@ -275,41 +269,12 @@ def verify_redundant_experts_have_same_weights(
) )
@pytest.mark.parametrize( def _test_rearrange_expert_weights_with_redundancy(
"world_size,num_layers,num_local_experts,num_logical_experts", env, world_size, num_layers, num_local_experts, num_logical_experts
[ ) -> None:
# 2 GPU, 2 experts per GPU
# 3 logical experts, 4 physical experts, 1 redundant experts
(2, 1, 2, 3),
# 2 GPU, 3 experts per GPU
# 4 logical experts, 6 physical experts, 2 redundant experts
(2, 2, 3, 4),
# 2 GPU, 8 experts per GPU
# 16 logical experts, 16 physical experts, 0 redundant experts
(2, 4, 8, 16),
# 4 GPU, 2 experts per GPU
# 6 logical experts, 8 physical experts, 2 redundant experts
(4, 1, 2, 6),
# 4 GPU, 2 experts per GPU
# 5 logical experts, 8 physical experts, 3 redundant experts
(4, 2, 2, 5),
# 4 GPU, 8 experts per GPU
# 16 logical experts, 32 physical experts, 16 redundant experts
(4, 8, 8, 16),
],
)
def test_rearrange_expert_weights_with_redundancy(
world_size, num_layers, num_local_experts, num_logical_experts
):
"""Test the functionality of rearranging expert weights with redundancy."""
if torch.cuda.device_count() < world_size:
pytest.skip(f"Need at least {world_size} GPUs to run the test")
@worker_fn_wrapper
def worker_fn():
# Initialize model parallel (using tensor parallel as an entrypoint # Initialize model parallel (using tensor parallel as an entrypoint
# to expert parallel) # to expert parallel)
set_env_vars_and_device(env)
ensure_model_parallel_initialized( ensure_model_parallel_initialized(
tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1 tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
) )
...@@ -376,21 +341,48 @@ def test_rearrange_expert_weights_with_redundancy( ...@@ -376,21 +341,48 @@ def test_rearrange_expert_weights_with_redundancy(
num_local_experts, num_local_experts,
) )
distributed_run(worker_fn, world_size)
@pytest.mark.parametrize(
@pytest.mark.parametrize("world_size", [2, 4]) "world_size,num_layers,num_local_experts,num_logical_experts",
def test_rearrange_expert_weights_no_change(world_size): [
""" # 2 GPU, 2 experts per GPU
Test that when the indices do not change, the weights should remain # 3 logical experts, 4 physical experts, 1 redundant experts
unchanged. (2, 1, 2, 3),
""" # 2 GPU, 3 experts per GPU
# 4 logical experts, 6 physical experts, 2 redundant experts
(2, 2, 3, 4),
# 2 GPU, 8 experts per GPU
# 16 logical experts, 16 physical experts, 0 redundant experts
(2, 4, 8, 16),
# 4 GPU, 2 experts per GPU
# 6 logical experts, 8 physical experts, 2 redundant experts
(4, 1, 2, 6),
# 4 GPU, 2 experts per GPU
# 5 logical experts, 8 physical experts, 3 redundant experts
(4, 2, 2, 5),
# 4 GPU, 8 experts per GPU
# 16 logical experts, 32 physical experts, 16 redundant experts
(4, 8, 8, 16),
],
)
def test_rearrange_expert_weights_with_redundancy(
world_size, num_layers, num_local_experts, num_logical_experts
):
"""Test the functionality of rearranging expert weights with redundancy."""
if torch.cuda.device_count() < world_size: if torch.cuda.device_count() < world_size:
pytest.skip(f"Need at least {world_size} GPUs to run the test") pytest.skip(f"Need at least {world_size} GPUs to run the test")
distributed_run(
_test_rearrange_expert_weights_with_redundancy,
world_size,
num_layers,
num_local_experts,
num_logical_experts,
)
@worker_fn_wrapper def _test_rearrange_expert_weights_no_change(env, world_size) -> None:
def worker_fn(): set_env_vars_and_device(env)
ensure_model_parallel_initialized( ensure_model_parallel_initialized(
tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1 tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
) )
...@@ -440,21 +432,25 @@ def test_rearrange_expert_weights_no_change(world_size): ...@@ -440,21 +432,25 @@ def test_rearrange_expert_weights_no_change(world_size):
torch.testing.assert_close( torch.testing.assert_close(
expert_weights[layer][weight_idx], expert_weights[layer][weight_idx],
original_weights[layer][weight_idx], original_weights[layer][weight_idx],
msg=f"Layer {layer}, weight {weight_idx} should remain unchanged", msg=f"""Layer {layer}, weight {weight_idx}
should remain unchanged""",
) )
distributed_run(worker_fn, world_size)
@pytest.mark.parametrize("world_size", [2, 4]) @pytest.mark.parametrize("world_size", [2, 4])
def test_rearrange_expert_weights_profile_mode(world_size): def test_rearrange_expert_weights_no_change(world_size):
"""Test profile mode (should not copy actual weights)""" """
Test that when the indices do not change, the weights should remain
unchanged.
"""
if torch.cuda.device_count() < world_size: if torch.cuda.device_count() < world_size:
pytest.skip(f"Need at least {world_size} GPUs to run the test") pytest.skip(f"Need at least {world_size} GPUs to run the test")
distributed_run(_test_rearrange_expert_weights_no_change, world_size)
@worker_fn_wrapper
def worker_fn(): def _test_rearrange_expert_weights_profile_mode(env, world_size) -> None:
set_env_vars_and_device(env)
ensure_model_parallel_initialized( ensure_model_parallel_initialized(
tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1 tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
) )
...@@ -514,4 +510,11 @@ def test_rearrange_expert_weights_profile_mode(world_size): ...@@ -514,4 +510,11 @@ def test_rearrange_expert_weights_profile_mode(world_size):
msg="In profile mode, the weights should remain unchanged", msg="In profile mode, the weights should remain unchanged",
) )
distributed_run(worker_fn, world_size)
@pytest.mark.parametrize("world_size", [2, 4])
def test_rearrange_expert_weights_profile_mode(world_size):
"""Test profile mode (should not copy actual weights)"""
if torch.cuda.device_count() < world_size:
pytest.skip(f"Need at least {world_size} GPUs to run the test")
distributed_run(_test_rearrange_expert_weights_profile_mode, world_size)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment