"lib/bindings/vscode:/vscode.git/clone" did not exist on "d9657b34e4594c04a29ea4e56dbe142aa65d34b2"
Unverified Commit 5465f9ea authored by Biswa Panda's avatar Biswa Panda Committed by GitHub
Browse files

fix: handle error in port reserve and cleanup allocator (#1536)

parent a67e682b
...@@ -233,8 +233,7 @@ class ResourceAllocator: ...@@ -233,8 +233,7 @@ class ResourceAllocator:
f"GPU {stat['index']} ({stat['name']}): " f"GPU {stat['index']} ({stat['name']}): "
f"Memory: {format_memory_gb(stat['free_memory'])} free / " f"Memory: {format_memory_gb(stat['free_memory'])} free / "
f"{format_memory_gb(stat['total_memory'])} total, " f"{format_memory_gb(stat['total_memory'])} total, "
f"Utilization: {stat['gpu_utilization']}%, " f"Utilization: {stat['gpu_utilization']}% "
f"Temperature: {stat['temperature']}°C"
) )
except Exception as e: except Exception as e:
logger.debug(f"Failed to get GPU stats: {e}") logger.debug(f"Failed to get GPU stats: {e}")
......
...@@ -104,7 +104,7 @@ def get_env_or_reserved_port(env_var): ...@@ -104,7 +104,7 @@ def get_env_or_reserved_port(env_var):
if port_env: if port_env:
return int(port_env) return int(port_env)
else: else:
with reserve_free_port() as port: with reserve_free_port() as port: # type: ignore
return port return port
......
...@@ -23,9 +23,8 @@ import json ...@@ -23,9 +23,8 @@ import json
import logging import logging
import os import os
import pathlib import pathlib
import random
import socket import socket
from typing import Any, DefaultDict, Dict, Iterator, Optional, Protocol, TextIO, Union from typing import Any, DefaultDict, Dict, Iterator, Protocol, TextIO, Union
import typer import typer
import yaml import yaml
...@@ -59,47 +58,46 @@ class ServiceProtocol(Protocol): ...@@ -59,47 +58,46 @@ class ServiceProtocol(Protocol):
... ...
class PortReserver:
def __init__(self, host: str = "localhost"):
self.host = host
self.socket: socket.socket | None = None
self.port: int | None = None
def __enter__(self) -> int:
try:
self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
self.socket.bind((self.host, 0))
_, self.port = self.socket.getsockname()
return self.port
except socket.error as e:
self.close_socket()
logger.warning(f"Failed to reserve port on {self.host}: {str(e)}")
raise
def __exit__(self, exc_type, exc_val, exc_tb):
self.close_socket()
def close_socket(self):
try:
if self.socket:
self.socket.close()
except socket.error as e:
logger.warning(f"Error while closing socket: {str(e)}")
# Don't re-raise the exception as this is cleanup code
return True
@contextlib.contextmanager @contextlib.contextmanager
def reserve_free_port( def reserve_free_port(
host: str = "localhost", host: str = "localhost",
port: int | None = None,
prefix: Optional[str] = None,
max_retry: int = 50,
enable_so_reuseport: bool = False,
) -> Iterator[int]: ) -> Iterator[int]:
""" """
detect free port and reserve until exit the context Detect free port and reserve until exit the context.
Returns a context manager that yields the reserved port.
""" """
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) with PortReserver(host) as port:
if enable_so_reuseport: yield port
sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1)
if sock.getsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT) == 0:
raise RuntimeError("Failed to set SO_REUSEPORT.") from None
if prefix is not None:
prefix_num = int(prefix) * 10 ** (5 - len(prefix))
suffix_range = min(65535 - prefix_num, 10 ** (5 - len(prefix)))
for _ in range(max_retry):
suffix = random.randint(0, suffix_range)
port = int(f"{prefix_num + suffix}")
try:
sock.bind((host, port))
break
except OSError:
continue
else:
raise RuntimeError(
f"Cannot find free port with prefix {prefix} after {max_retry} retries."
) from None
else:
if port:
sock.bind((host, port))
else:
sock.bind((host, 0))
try:
yield sock.getsockname()[1]
finally:
sock.close()
def save_dynamo_state( def save_dynamo_state(
......
...@@ -67,7 +67,6 @@ class GPUInfo: ...@@ -67,7 +67,6 @@ class GPUInfo:
self.name = name self.name = name
self.uuid = uuid self.uuid = uuid
self.available = True # Can be set to False if GPU is reserved/in use self.available = True # Can be set to False if GPU is reserved/in use
self.temperature = 0 # in Celsius
self.utilization = 0 # in percent (0-100) self.utilization = 0 # in percent (0-100)
self.processes: list[GPUProcess] = [] self.processes: list[GPUProcess] = []
...@@ -142,14 +141,6 @@ class GPUManager: ...@@ -142,14 +141,6 @@ class GPUManager:
index=i, total_memory=memory_info.total, name=name, uuid=uuid index=i, total_memory=memory_info.total, name=name, uuid=uuid
) )
# Get additional GPU information if available
try:
gpu_info.temperature = pynvml.nvmlDeviceGetTemperature(
handle, pynvml.NVML_TEMPERATURE_GPU
)
except pynvml.NVMLError:
logger.debug(f"Could not get temperature for GPU {i}")
try: try:
utilization = pynvml.nvmlDeviceGetUtilizationRates(handle) utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
gpu_info.utilization = utilization.gpu gpu_info.utilization = utilization.gpu
...@@ -173,7 +164,7 @@ class GPUManager: ...@@ -173,7 +164,7 @@ class GPUManager:
logger.warning(f"Error discovering GPUs: {e}") logger.warning(f"Error discovering GPUs: {e}")
def update_gpu_stats(self): def update_gpu_stats(self):
"""Update GPU statistics (utilization, memory, temperature, etc.).""" """Update GPU statistics (utilization, memory etc.)."""
if not self._initialized: if not self._initialized:
return return
...@@ -185,14 +176,6 @@ class GPUManager: ...@@ -185,14 +176,6 @@ class GPUManager:
memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle) memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
gpu.total_memory = memory_info.total gpu.total_memory = memory_info.total
# Update temperature
try:
gpu.temperature = pynvml.nvmlDeviceGetTemperature(
handle, pynvml.NVML_TEMPERATURE_GPU
)
except pynvml.NVMLError:
pass
# Update utilization # Update utilization
try: try:
utilization = pynvml.nvmlDeviceGetUtilizationRates(handle) utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
...@@ -242,97 +225,6 @@ class GPUManager: ...@@ -242,97 +225,6 @@ class GPUManager:
logger.warning(f"Error getting GPU memory for GPU {index}: {e}") logger.warning(f"Error getting GPU memory for GPU {index}: {e}")
return (0, 0) return (0, 0)
def get_gpu_utilization(self, index: int) -> int:
"""
Return GPU utilization percentage for a specific GPU.
Args:
index: GPU index
Returns:
GPU utilization percentage (0-100)
"""
if not self._initialized or index >= len(self.gpus):
return 0
try:
handle = pynvml.nvmlDeviceGetHandleByIndex(index)
utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
return utilization.gpu # Returns GPU utilization percentage (0-100)
except pynvml.NVMLError as e:
logger.warning(f"Error getting GPU utilization for GPU {index}: {e}")
return 0
def get_gpu_temperature(self, index: int) -> int:
"""
Return GPU temperature for a specific GPU.
Args:
index: GPU index
Returns:
GPU temperature in Celsius
"""
if not self._initialized or index >= len(self.gpus):
return 0
try:
handle = pynvml.nvmlDeviceGetHandleByIndex(index)
return pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
except pynvml.NVMLError as e:
logger.warning(f"Error getting GPU temperature for GPU {index}: {e}")
return 0
def get_gpu_processes(self, index: int) -> list[GPUProcess]:
"""
Return processes running on a specific GPU.
Args:
index: GPU index
Returns:
List of processes running on the GPU
"""
if not self._initialized or index >= len(self.gpus):
return []
try:
handle = pynvml.nvmlDeviceGetHandleByIndex(index)
processes = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
return [
GPUProcess(pid=p.pid, used_memory=p.usedGpuMemory) for p in processes
]
except pynvml.NVMLError as e:
logger.warning(f"Error getting GPU processes for GPU {index}: {e}")
return []
def get_best_gpu_for_memory(self, required_memory: int) -> int:
"""
Return the index of the GPU with the most available memory that meets the requirement.
Args:
required_memory: Required memory in bytes
Returns:
GPU index, or -1 if no suitable GPU was found
"""
if not self._initialized:
return -1
best_gpu = -1
max_free = 0
for gpu in self.gpus:
if not gpu.available:
continue
_, free = self.get_gpu_memory(gpu.index)
if free > required_memory and free > max_free:
max_free = free
best_gpu = gpu.index
return best_gpu
def reset_allocations(self): def reset_allocations(self):
"""Reset all GPU allocations.""" """Reset all GPU allocations."""
self._gpu_fractions = [] self._gpu_fractions = []
...@@ -365,7 +257,6 @@ class GPUManager: ...@@ -365,7 +257,6 @@ class GPUManager:
if total_memory > 0 if total_memory > 0
else 0, else 0,
"gpu_utilization": gpu.utilization, "gpu_utilization": gpu.utilization,
"temperature": gpu.temperature,
"process_count": len(gpu.processes), "process_count": len(gpu.processes),
"processes": [ "processes": [
{ {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment