Unverified Commit 5465f9ea authored by Biswa Panda's avatar Biswa Panda Committed by GitHub
Browse files

fix: handle error in port reserve and cleanup allocator (#1536)

parent a67e682b
......@@ -233,8 +233,7 @@ class ResourceAllocator:
f"GPU {stat['index']} ({stat['name']}): "
f"Memory: {format_memory_gb(stat['free_memory'])} free / "
f"{format_memory_gb(stat['total_memory'])} total, "
f"Utilization: {stat['gpu_utilization']}%, "
f"Temperature: {stat['temperature']}°C"
f"Utilization: {stat['gpu_utilization']}% "
)
except Exception as e:
logger.debug(f"Failed to get GPU stats: {e}")
......
......@@ -104,7 +104,7 @@ def get_env_or_reserved_port(env_var):
if port_env:
return int(port_env)
else:
with reserve_free_port() as port:
with reserve_free_port() as port: # type: ignore
return port
......
......@@ -23,9 +23,8 @@ import json
import logging
import os
import pathlib
import random
import socket
from typing import Any, DefaultDict, Dict, Iterator, Optional, Protocol, TextIO, Union
from typing import Any, DefaultDict, Dict, Iterator, Protocol, TextIO, Union
import typer
import yaml
......@@ -59,47 +58,46 @@ class ServiceProtocol(Protocol):
...
class PortReserver:
def __init__(self, host: str = "localhost"):
self.host = host
self.socket: socket.socket | None = None
self.port: int | None = None
def __enter__(self) -> int:
try:
self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
self.socket.bind((self.host, 0))
_, self.port = self.socket.getsockname()
return self.port
except socket.error as e:
self.close_socket()
logger.warning(f"Failed to reserve port on {self.host}: {str(e)}")
raise
def __exit__(self, exc_type, exc_val, exc_tb):
self.close_socket()
def close_socket(self):
try:
if self.socket:
self.socket.close()
except socket.error as e:
logger.warning(f"Error while closing socket: {str(e)}")
# Don't re-raise the exception as this is cleanup code
return True
@contextlib.contextmanager
def reserve_free_port(
host: str = "localhost",
port: int | None = None,
prefix: Optional[str] = None,
max_retry: int = 50,
enable_so_reuseport: bool = False,
) -> Iterator[int]:
"""
detect free port and reserve until exit the context
Detect free port and reserve until exit the context.
Returns a context manager that yields the reserved port.
"""
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
if enable_so_reuseport:
sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1)
if sock.getsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT) == 0:
raise RuntimeError("Failed to set SO_REUSEPORT.") from None
if prefix is not None:
prefix_num = int(prefix) * 10 ** (5 - len(prefix))
suffix_range = min(65535 - prefix_num, 10 ** (5 - len(prefix)))
for _ in range(max_retry):
suffix = random.randint(0, suffix_range)
port = int(f"{prefix_num + suffix}")
try:
sock.bind((host, port))
break
except OSError:
continue
else:
raise RuntimeError(
f"Cannot find free port with prefix {prefix} after {max_retry} retries."
) from None
else:
if port:
sock.bind((host, port))
else:
sock.bind((host, 0))
try:
yield sock.getsockname()[1]
finally:
sock.close()
with PortReserver(host) as port:
yield port
def save_dynamo_state(
......
......@@ -67,7 +67,6 @@ class GPUInfo:
self.name = name
self.uuid = uuid
self.available = True # Can be set to False if GPU is reserved/in use
self.temperature = 0 # in Celsius
self.utilization = 0 # in percent (0-100)
self.processes: list[GPUProcess] = []
......@@ -142,14 +141,6 @@ class GPUManager:
index=i, total_memory=memory_info.total, name=name, uuid=uuid
)
# Get additional GPU information if available
try:
gpu_info.temperature = pynvml.nvmlDeviceGetTemperature(
handle, pynvml.NVML_TEMPERATURE_GPU
)
except pynvml.NVMLError:
logger.debug(f"Could not get temperature for GPU {i}")
try:
utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
gpu_info.utilization = utilization.gpu
......@@ -173,7 +164,7 @@ class GPUManager:
logger.warning(f"Error discovering GPUs: {e}")
def update_gpu_stats(self):
"""Update GPU statistics (utilization, memory, temperature, etc.)."""
"""Update GPU statistics (utilization, memory etc.)."""
if not self._initialized:
return
......@@ -185,14 +176,6 @@ class GPUManager:
memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
gpu.total_memory = memory_info.total
# Update temperature
try:
gpu.temperature = pynvml.nvmlDeviceGetTemperature(
handle, pynvml.NVML_TEMPERATURE_GPU
)
except pynvml.NVMLError:
pass
# Update utilization
try:
utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
......@@ -242,97 +225,6 @@ class GPUManager:
logger.warning(f"Error getting GPU memory for GPU {index}: {e}")
return (0, 0)
def get_gpu_utilization(self, index: int) -> int:
"""
Return GPU utilization percentage for a specific GPU.
Args:
index: GPU index
Returns:
GPU utilization percentage (0-100)
"""
if not self._initialized or index >= len(self.gpus):
return 0
try:
handle = pynvml.nvmlDeviceGetHandleByIndex(index)
utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
return utilization.gpu # Returns GPU utilization percentage (0-100)
except pynvml.NVMLError as e:
logger.warning(f"Error getting GPU utilization for GPU {index}: {e}")
return 0
def get_gpu_temperature(self, index: int) -> int:
"""
Return GPU temperature for a specific GPU.
Args:
index: GPU index
Returns:
GPU temperature in Celsius
"""
if not self._initialized or index >= len(self.gpus):
return 0
try:
handle = pynvml.nvmlDeviceGetHandleByIndex(index)
return pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
except pynvml.NVMLError as e:
logger.warning(f"Error getting GPU temperature for GPU {index}: {e}")
return 0
def get_gpu_processes(self, index: int) -> list[GPUProcess]:
"""
Return processes running on a specific GPU.
Args:
index: GPU index
Returns:
List of processes running on the GPU
"""
if not self._initialized or index >= len(self.gpus):
return []
try:
handle = pynvml.nvmlDeviceGetHandleByIndex(index)
processes = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
return [
GPUProcess(pid=p.pid, used_memory=p.usedGpuMemory) for p in processes
]
except pynvml.NVMLError as e:
logger.warning(f"Error getting GPU processes for GPU {index}: {e}")
return []
def get_best_gpu_for_memory(self, required_memory: int) -> int:
"""
Return the index of the GPU with the most available memory that meets the requirement.
Args:
required_memory: Required memory in bytes
Returns:
GPU index, or -1 if no suitable GPU was found
"""
if not self._initialized:
return -1
best_gpu = -1
max_free = 0
for gpu in self.gpus:
if not gpu.available:
continue
_, free = self.get_gpu_memory(gpu.index)
if free > required_memory and free > max_free:
max_free = free
best_gpu = gpu.index
return best_gpu
def reset_allocations(self):
"""Reset all GPU allocations."""
self._gpu_fractions = []
......@@ -365,7 +257,6 @@ class GPUManager:
if total_memory > 0
else 0,
"gpu_utilization": gpu.utilization,
"temperature": gpu.temperature,
"process_count": len(gpu.processes),
"processes": [
{
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment