async_worker.py 4.44 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
The async worker that transfers experts in the background.
"""

import asyncio
import threading
from typing import TYPE_CHECKING

import torch
from torch.distributed import ProcessGroup

from vllm.distributed.parallel_state import get_ep_group
from vllm.logger import init_logger

from .rebalance_execute import transfer_layer

if TYPE_CHECKING:
    from .eplb_state import EplbState

logger = init_logger(__name__)


def start_async_worker(
    state: "EplbState",
    rank_mapping: dict[int, int] | None = None,
    is_profile: bool = False,
) -> threading.Thread:
    ep_group = get_ep_group().device_group
    rank = ep_group.rank()
    device_index = state.cuda_device_index
33
    assert state.is_async
34
35
36
37
38
39
40
41
42
43
44
45

    def thread_target() -> None:
        assert device_index is not None
        torch.cuda.set_device(device_index)
        cuda_stream = torch.cuda.Stream(device=device_index)
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            loop.run_until_complete(
                transfer_run_periodically(
                    state=state,
                    ep_group=ep_group,
46
                    cuda_stream=cuda_stream,
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
                    is_profile=is_profile,
                    rank_mapping=rank_mapping,
                )
            )
        except Exception as exc:  # pragma: no cover - diagnostic path
            logger.exception("async loop error (Rank %d): %s", rank, str(exc))
        finally:
            loop.close()

    thread = threading.Thread(target=thread_target, daemon=True)
    thread.start()
    return thread


async def transfer_run_periodically(
    state: "EplbState",
    ep_group: ProcessGroup,
64
    cuda_stream: torch.cuda.Stream,
65
66
67
68
69
70
71
    is_profile: bool = False,
    rank_mapping: dict[int, int] | None = None,
) -> None:
    while True:
        await asyncio.to_thread(state.rearrange_event.wait)
        logger.info("async worker woke up for EPLB transfer")

72
        assert state.is_async
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
        for model_state in state.model_states.values():
            current_num_layers = model_state.model.num_moe_layers
            while (
                model_state.rebalanced
                and model_state.layer_to_transfer < current_num_layers
            ):
                if (
                    not model_state.ep_buffer_ready
                    and model_state.rebalanced
                    and model_state.new_physical_to_logical_map is not None
                ):
                    await asyncio.to_thread(model_state.buffer_lock.acquire)
                    try:
                        if model_state.layer_to_transfer >= current_num_layers:
                            break

89
90
91
92
93
94
                        # Wait for the main thread to finish consuming the buffer
                        # before overwriting it
                        if model_state.buffer_consumed_event is not None:
                            cuda_stream.wait_event(model_state.buffer_consumed_event)
                            model_state.buffer_consumed_event = None

95
96
97
                        (
                            model_state.is_unchanged,
                            model_state.is_received_locally,
98
                            model_state.recv_metadata,
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
                        ) = await transfer_layer(
                            old_global_expert_indices=model_state.physical_to_logical_map,
                            new_global_expert_indices=model_state.new_physical_to_logical_map,
                            expert_weights=model_state.model.expert_weights,
                            expert_weights_buffer=model_state.expert_buffer,
                            ep_group=ep_group,
                            is_profile=is_profile,
                            layer=model_state.layer_to_transfer,
                            cuda_stream=cuda_stream,
                            rank_mapping=rank_mapping,
                        )
                        event = torch.cuda.Event(blocking=False)
                        cuda_stream.record_event(event)
                        model_state.buffer_ready_event = event
                        model_state.ep_buffer_ready = 1
                    finally:
                        model_state.buffer_lock.release()
                else:
                    if not model_state.rebalanced:
                        break
                    await asyncio.sleep(0.001)

        state.rearrange_event.clear()