[MISC] Introduce pipeline parallelism partition strategies (#6920)

Co-authored-by: youkaichao <youkaichao@126.com>

[MISC] Introduce pipeline parallelism partition strategies (#6920)
Co-authored-by: youkaichao <youkaichao@126.com>
bd700134 · Cody Yu · GitHub · 2ee8d3ba · bd700134 · bd700134
Unverified Commit bd700134 authored Jul 31, 2024 by Cody Yu Committed by GitHub Jul 31, 2024
Showing with 66 additions and 5 deletions

tests/distributed/test_pipeline_partition.py tests/distributed/test_pipeline_partition.py +34 -0

vllm/distributed/utils.py vllm/distributed/utils.py +27 -5

vllm/envs.py vllm/envs.py +5 -0

No files found.
--- a/tests/distributed/test_pipeline_partition.py
+++ b/tests/distributed/test_pipeline_partition.py
+import os
+import pytest
+from vllm.distributed.utils import get_pp_indices
+def test_custom_layer_partition():
+    """Check get_pp_indices against VLLM_PP_LAYER_PARTITION overrides."""
+    def _verify(partition_str, num_layers, pp_size, goldens):
+        """Set the env var, compare each rank's result, then restore it."""
+        bak = os.environ.get("VLLM_PP_LAYER_PARTITION", None)
+        os.environ["VLLM_PP_LAYER_PARTITION"] = partition_str
+        try:
+            for pp_rank, golden in enumerate(goldens):
+                assert get_pp_indices(num_layers, pp_rank, pp_size) == golden
+        finally:
+            # Always undo the override, including when get_pp_indices raises
+            # (the pytest.raises cases below) or when the variable was unset
+            # beforehand, so the setting does not leak into later checks.
+            if bak is None:
+                os.environ.pop("VLLM_PP_LAYER_PARTITION", None)
+            else:
+                os.environ["VLLM_PP_LAYER_PARTITION"] = bak
+    # Even partition
+    _verify("5,5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
+    # Balanced partition
+    _verify("4,6,6,4", 20, 4, [(0, 4), (4, 10), (10, 16), (16, 20)])
+    # Put remainder somewhere
+    _verify("5,6,5,6", 22, 4, [(0, 5), (5, 11), (11, 16), (16, 22)])
+    # Invalid partition strings (trailing comma, non-integer entry)
+    with pytest.raises(ValueError):
+        _verify("5,5,5,5,", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
+    with pytest.raises(ValueError):
+        _verify("5,5,5,a", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
+    # Wrong number of partitions
+    with pytest.raises(ValueError):
+        _verify("5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
+    # Wrong number of layers
+    with pytest.raises(ValueError):
+        _verify("5,5,5,5", 21, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
--- a/vllm/distributed/utils.py
+++ b/vllm/distributed/utils.py
@@ -6,6 +6,11 @@ from typing import Sequence, Tuple
 import torch
+import vllm.envs as envs
+from vllm.logger import init_logger
+logger = init_logger(__name__)
 def ensure_divisibility(numerator, denominator):
    """Ensure that numerator is divisible by the denominator."""
@@ -54,6 +59,23 @@ def get_pp_indices(num_hidden_layers: int, pp_rank: int,
    If the number of layers is not divisible by the number of partitions,
    the last partition will have the remaining layers.
    """
+    partition_list_str = envs.VLLM_PP_LAYER_PARTITION
+    if partition_list_str is not None:
+        try:
+            partitions = [
+                int(layer) for layer in partition_list_str.split(",")
+            ]
+        except ValueError as err:
+            raise ValueError("Invalid partition string: {}".format(
+                partition_list_str)) from err
+        if len(partitions) != pp_size:
+            raise ValueError(f"{len(partitions)=} does not match {pp_size=}.")
+        if sum(partitions) != num_hidden_layers:
+            raise ValueError(
+                f"{sum(partitions)=} does not match {num_hidden_layers=}.")
+        start_layer = sum(partitions[:pp_rank])
+        end_layer = start_layer + partitions[pp_rank]
+    else:
        layers_per_partition = num_hidden_layers // pp_size
        start_layer = pp_rank * layers_per_partition
        end_layer = start_layer + layers_per_partition

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -28,6 +28,7 @@ if TYPE_CHECKING:
    VLLM_LOGGING_CONFIG_PATH: Optional[str] = None
    VLLM_TRACE_FUNCTION: int = 0
    VLLM_ATTENTION_BACKEND: Optional[str] = None
+    VLLM_PP_LAYER_PARTITION: Optional[str] = None
    VLLM_CPU_KVCACHE_SPACE: int = 0
    VLLM_CPU_OMP_THREADS_BIND: str = ""
    VLLM_OPENVINO_KVCACHE_SPACE: int = 0
@@ -242,6 +243,10 @@ environment_variables: Dict[str, Callable[[], Any]] = {
    "VLLM_ATTENTION_BACKEND":
    lambda: os.getenv("VLLM_ATTENTION_BACKEND", None),
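+    # Pipeline stage partition strategy: a comma-separated list giving the
+    # number of layers for each pipeline stage (e.g. "5,6,5,6"). The list
+    # length must equal the pipeline parallel size and the entries must sum
+    # to the number of hidden layers.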
+    "VLLM_PP_LAYER_PARTITION":
+    lambda: os.getenv("VLLM_PP_LAYER_PARTITION", None),
    # (CPU backend only) CPU key-value cache space.
    # default is 4GB
    "VLLM_CPU_KVCACHE_SPACE":