Offload port selection to OS (#467)

6d7d95a7 · Zhangir Azerbayev · GitHub · 96853af5 · 6d7d95a7
Unverified Commit 6d7d95a7 authored Jul 16, 2023 by Zhangir Azerbayev Committed by GitHub Jul 15, 2023
Hide whitespace changes
Inline Side-by-side

Showing with 9 additions and 3 deletions

vllm/engine/ray_utils.py vllm/engine/ray_utils.py +9 -3

No files found.
--- a/vllm/engine/ray_utils.py
+++ b/vllm/engine/ray_utils.py
-import random
+import socket
 from typing import List, Optional, Tuple

 try:
@@ -12,6 +12,12 @@ from vllm.config import ParallelConfig
 DeviceID = Tuple[int, Optional[str], int]


+def get_open_port() -> int:
+    """Return a TCP port number that the OS reports as currently free.
+
+    A throwaway socket is bound to port 0 so that the OS picks an unused
+    port. The socket is closed before this function returns, so the port is
+    only known to be free on the local machine at the time of the call;
+    another process may claim it before the caller binds to it.
+
+    Returns:
+        The port number assigned by the OS.
+
+    Raises:
+        OSError: If neither an IPv4 nor an IPv6 socket can be bound.
+    """
+    try:
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+            s.bind(("", 0))
+            return s.getsockname()[1]
+    except OSError:
+        # IPv4 may be unavailable (e.g. on IPv6-only hosts); retry with IPv6
+        # instead of failing cluster initialization outright.
+        with socket.socket(socket.AF_INET6, socket.SOCK_STREAM) as s:
+            s.bind(("", 0))
+            return s.getsockname()[1]
+
+
 def initialize_cluster(
    parallel_config: ParallelConfig,
    engine_use_ray: bool = False,
@@ -42,7 +48,7 @@ def initialize_cluster(

    if not parallel_config.worker_use_ray:
        # Initialize cluster locally.
-        port = random.randint(10000, 20000)
+        port = get_open_port()
        # We need to set up the distributed init method to make sure
        # the distributed megatron code (e.g., get world size) works correctly.
        distributed_init_method = f"tcp://localhost:{port}"
@@ -96,7 +102,7 @@ def initialize_cluster(
            stage_devices.append((rank, node_resource, current_device_id))
            if distributed_init_method is None:
                ip = node_resource.split("node:")[-1]
-                port = random.randint(10000, 20000)
+                port = get_open_port()
                distributed_init_method = f"tcp://{ip}:{port}"
            rank += 1
            current_device_id += 1