ray_utils.py 6.57 KB
Newer Older
1
from typing import List, Optional, Tuple, Union
2

3
4
import msgspec

5
from vllm.config import ParallelConfig
6
from vllm.executor.msgspec_utils import decode_hook, encode_hook
7
from vllm.logger import init_logger
8
from vllm.platforms import current_platform
9
from vllm.sequence import ExecuteModelRequest, IntermediateTensors
10
from vllm.utils import get_ip, is_hip, is_xpu
11
from vllm.worker.worker_base import WorkerWrapperBase
12
13

# Module-level logger created through vLLM's init_logger helper.
logger = init_logger(__name__)
14
15
16

try:
    import ray

    class RayWorkerWrapper(WorkerWrapperBase):
        """Ray wrapper for vllm.worker.Worker, allowing Worker to be
        lazily initialized after Ray sets CUDA_VISIBLE_DEVICES."""

        def __init__(self, *args, **kwargs) -> None:
            """Forward all arguments to WorkerWrapperBase and set up the
            state used by execute_model_spmd."""
            super().__init__(*args, **kwargs)
            # The compiled DAG runs its main execution loop on a different
            # thread, and that thread has to call cuda.set_device itself.
            # This flag records whether set_device has already been called
            # on that thread.
            self.compiled_dag_cuda_device_set = False

            # msgspec codecs: requests arrive msgpack-encoded and the final
            # output is msgpack-encoded before it is returned.
            self.input_decoder = msgspec.msgpack.Decoder(ExecuteModelRequest,
                                                         dec_hook=decode_hook)
            self.output_encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook)

        def get_node_ip(self) -> str:
            """Return the IP address reported by get_ip()."""
            return get_ip()

        def get_node_and_gpu_ids(self) -> Tuple[str, List[int]]:
            """Return the Ray node ID from the runtime context together with
            the GPU IDs reported by ray.get_gpu_ids()."""
            node_id = ray.get_runtime_context().get_node_id()
            gpu_ids = ray.get_gpu_ids()
            return node_id, gpu_ids

        def execute_model_spmd(
            self, req_or_tuple: Union[bytes,
                                      Tuple[bytes,
                                            Optional[IntermediateTensors]]]
        ) -> Union[bytes, Tuple[bytes, IntermediateTensors]]:
            """Execute model in SPMD fashion: used only when SPMD worker and
            compiled DAG are both enabled.

            Args:
                req_or_tuple: Either the request alone, or a tuple of the
                    request and intermediate tensors. The intermediate
                    tensors are None unless they are supplied because this
                    worker is a pipeline stage > 0. The request is
                    serialized by msgspec.

            Returns:
                If the worker produced IntermediateTensors, a tuple of the
                still-serialized request and those tensors, to be passed to
                the next pipeline stage. Otherwise the worker's output,
                serialized by msgspec.
            """
            if isinstance(req_or_tuple, bytes):
                serialized_req, intermediate_tensors = req_or_tuple, None
            else:
                serialized_req, intermediate_tensors = req_or_tuple

            execute_model_req = self.input_decoder.decode(serialized_req)

            # TODO(swang): This is needed right now because Ray aDAG executes
            # on a background thread, so we need to reset torch's current
            # device.
            import torch
            if not self.compiled_dag_cuda_device_set:
                torch.cuda.set_device(self.worker.device)
                self.compiled_dag_cuda_device_set = True

            output = self.worker._execute_model_spmd(execute_model_req,
                                                     intermediate_tensors)
            # Pipeline model request and output to the next pipeline stage.
            if isinstance(output, IntermediateTensors):
                # Forward the request bytes unchanged so the next stage can
                # decode them itself.
                output = serialized_req, output
            else:
                output = self.output_encoder.encode(output)

            return output

    # Ray imported successfully, so there is no import error to report.
    ray_import_err = None

except ImportError as e:
    # Ray is optional: keep the module importable and remember the error so
    # assert_ray_available() can chain it into its exception.
    ray = None  # type: ignore
    ray_import_err = e
    RayWorkerWrapper = None  # type: ignore
87
88


89
90
91
92
93
94
95
96
97
98
99
100
def ray_is_available() -> bool:
    """Report whether the module-level ``ray`` name holds an imported module.

    Returns:
        False when ``ray`` is None, True otherwise.
    """
    if ray is None:
        return False
    return True


def assert_ray_available():
    """Ensure Ray was imported; otherwise fail with an install hint.

    Raises:
        ValueError: If the module-level ``ray`` is None. The exception is
            chained from ``ray_import_err``.
    """
    if ray is not None:
        return

    message = ("Failed to import Ray, please install Ray with "
               "`pip install ray`.")
    raise ValueError(message) from ray_import_err


101
def initialize_ray_cluster(
    parallel_config: ParallelConfig,
    ray_address: Optional[str] = None,
):
    """Initialize the distributed cluster with Ray.

    It will connect to the Ray cluster and create a placement group
    for the workers, which includes the specification of the resources
    for each distributed worker. If ``parallel_config`` already has a
    placement group, it is left untouched. If the caller is already running
    inside a placement group, that group is validated and reused instead of
    creating a new one.

    Args:
        parallel_config: The configurations for parallel execution. Its
            ``placement_group`` attribute is set by this function.
        ray_address: The address of the Ray cluster. If None, uses
            the default Ray cluster address.

    Raises:
        ValueError: If Ray is not available, if a bundle of the current
            placement group requests more than one device, or if
            ``parallel_config.world_size`` exceeds the number of devices
            available in the current placement group or in the cluster.
    """
    assert_ray_available()

    # Connect to a ray cluster.
    if is_hip() or is_xpu():
        # On these platforms the GPU count is passed to ray.init explicitly.
        ray.init(address=ray_address,
                 ignore_reinit_error=True,
                 num_gpus=parallel_config.world_size)
    else:
        ray.init(address=ray_address, ignore_reinit_error=True)

    if parallel_config.placement_group:
        # Placement group is already set.
        return

    # Resource key used in bundle specs and cluster resources.
    device_str = "GPU" if not current_platform.is_tpu() else "TPU"
    # Create placement group for worker processes
    current_placement_group = ray.util.get_current_placement_group()
    if current_placement_group:
        # We are in a placement group
        bundles = current_placement_group.bundle_specs
        # Verify that we can use the placement group: count the bundles
        # that request a device, rejecting any that request more than one.
        device_bundles = 0
        for bundle in bundles:
            bundle_devices = bundle.get(device_str, 0)
            if bundle_devices > 1:
                raise ValueError(
                    "Placement group bundle cannot have more than 1 "
                    f"{device_str}.")
            if bundle_devices:
                device_bundles += 1
        if parallel_config.world_size > device_bundles:
            raise ValueError(
                f"The number of required {device_str}s exceeds the total "
                f"number of available {device_str}s in the placement group. "
                f"Required number of devices: {parallel_config.world_size}. "
                f"Total number of devices: {device_bundles}.")
    else:
        # No enclosing placement group: check the request against the
        # resources of the whole cluster before reserving anything.
        num_devices_in_cluster = ray.cluster_resources().get(device_str, 0)
        if parallel_config.world_size > num_devices_in_cluster:
            raise ValueError(
                f"The number of required {device_str}s exceeds the total "
                f"number of available {device_str}s in the cluster. "
                f"Required number of devices: {parallel_config.world_size}. "
                f"Total number of devices: {num_devices_in_cluster}.")
        # Create a new placement group with one single-device bundle per
        # worker.
        placement_group_specs = ([{
            device_str: 1
        }] * parallel_config.world_size)
        current_placement_group = ray.util.placement_group(
            placement_group_specs)
        # Wait until PG is ready - this will block until all
        # requested resources are available, and will timeout
        # if they cannot be provisioned.
        ray.get(current_placement_group.ready(), timeout=1800)

    # Set the placement group in the parallel config
    parallel_config.placement_group = current_placement_group