Commit 6d7d95a7 (unverified), authored by Zhangir Azerbayev, committed by GitHub
Browse files

Offload port selection to OS (#467)

parent 96853af5
import random
import socket
from typing import List, Optional, Tuple
try:
......@@ -12,6 +12,12 @@ from vllm.config import ParallelConfig
DeviceID = Tuple[int, Optional[str], int]
def get_open_port() -> int:
    """Return a TCP port number that the OS currently reports as free.

    The port is chosen by binding a throwaway socket to port 0, which asks
    the operating system to pick an unused port. IPv4 is tried first; if the
    IPv4 socket cannot be created or bound (for example on an IPv6-only
    host), the same probe is repeated over IPv6.

    Returns:
        The port number the OS assigned to the probe socket.

    Raises:
        OSError: If neither an IPv4 nor an IPv6 socket could be bound.

    Note:
        The probe socket is closed before this function returns, so the port
        is not reserved; another process may claim it before the caller
        binds to it.
    """
    try:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.bind(("", 0))
            return s.getsockname()[1]
    except OSError:
        # IPv4 is unavailable on this host; fall back to IPv6.
        with socket.socket(socket.AF_INET6, socket.SOCK_STREAM) as s:
            s.bind(("", 0))
            # For AF_INET6 getsockname() returns a 4-tuple; the port is
            # still at index 1.
            return s.getsockname()[1]
def initialize_cluster(
parallel_config: ParallelConfig,
engine_use_ray: bool = False,
......@@ -42,7 +48,7 @@ def initialize_cluster(
if not parallel_config.worker_use_ray:
# Initialize cluster locally.
port = random.randint(10000, 20000)
port = get_open_port()
# We need to setup the distributed init method to make sure
# the distributed megatron code (e.g., get world size) works correctly.
distributed_init_method = f"tcp://localhost:{port}"
......@@ -96,7 +102,7 @@ def initialize_cluster(
stage_devices.append((rank, node_resource, current_device_id))
if distributed_init_method is None:
ip = node_resource.split("node:")[-1]
port = random.randint(10000, 20000)
port = get_open_port()
distributed_init_method = f"tcp://{ip}:{port}"
rank += 1
current_device_id += 1
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment