fix: handle error in port reserve and cleanup allocator (#1536)

5465f9ea · Biswa Panda · GitHub · a67e682b · 5465f9ea · 5465f9ea
Unverified Commit 5465f9ea authored Jun 16, 2025 by Biswa Panda Committed by GitHub Jun 16, 2025
4 changed files
--- a/deploy/sdk/src/dynamo/sdk/cli/allocator.py
+++ b/deploy/sdk/src/dynamo/sdk/cli/allocator.py
@@ -233,8 +233,7 @@ class ResourceAllocator:
                                f"GPU {stat['index']} ({stat['name']}): "
                                f"Memory: {format_memory_gb(stat['free_memory'])} free / "
                                f"{format_memory_gb(stat['total_memory'])} total, "
-                                f"Utilization: {stat['gpu_utilization']}%, "
+                                f"Utilization: {stat['gpu_utilization']}% "
-                                f"Temperature: {stat['temperature']}°C"
                            )
                    except Exception as e:
                        logger.debug(f"Failed to get GPU stats: {e}")

--- a/deploy/sdk/src/dynamo/sdk/cli/circus.py
+++ b/deploy/sdk/src/dynamo/sdk/cli/circus.py
@@ -104,7 +104,7 @@ def get_env_or_reserved_port(env_var):
    if port_env:
        return int(port_env)
    else:
-        with reserve_free_port() as port:
+        with reserve_free_port() as port:  # type: ignore
            return port

--- a/deploy/sdk/src/dynamo/sdk/cli/utils.py
+++ b/deploy/sdk/src/dynamo/sdk/cli/utils.py
@@ -23,9 +23,8 @@ import json
 import logging
 import os
 import pathlib
-import random
 import socket
-from typing import Any, DefaultDict, Dict, Iterator, Optional, Protocol, TextIO, Union
+from typing import Any, DefaultDict, Dict, Iterator, Protocol, TextIO, Union
 import typer
 import yaml
@@ -59,47 +58,46 @@ class ServiceProtocol(Protocol):
        ...
+class PortReserver:
+    def __init__(self, host: str = "localhost"):
+        self.host = host
+        self.socket: socket.socket | None = None
+        self.port: int | None = None
+    def __enter__(self) -> int:
+        try:
+            self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+            self.socket.bind((self.host, 0))
+            _, self.port = self.socket.getsockname()
+            return self.port
+        except socket.error as e:
+            self.close_socket()
+            logger.warning(f"Failed to reserve port on {self.host}: {str(e)}")
+            raise
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.close_socket()
+    def close_socket(self):
+        try:
+            if self.socket:
+                self.socket.close()
+        except socket.error as e:
+            logger.warning(f"Error while closing socket: {str(e)}")
+            # Don't re-raise the exception as this is cleanup code
+            return True
 @contextlib.contextmanager
 def reserve_free_port(
    host: str = "localhost",
-    port: int | None = None,
-    prefix: Optional[str] = None,
-    max_retry: int = 50,
-    enable_so_reuseport: bool = False,
 ) -> Iterator[int]:
    """
-    detect free port and reserve until exit the context
+    Detect free port and reserve until exit the context.
+    Returns a context manager that yields the reserved port.
    """
-    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    with PortReserver(host) as port:
-    if enable_so_reuseport:
+        yield port
-        sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1)
-        if sock.getsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT) == 0:
-            raise RuntimeError("Failed to set SO_REUSEPORT.") from None
-    if prefix is not None:
-        prefix_num = int(prefix) * 10 ** (5 - len(prefix))
-        suffix_range = min(65535 - prefix_num, 10 ** (5 - len(prefix)))
-        for _ in range(max_retry):
-            suffix = random.randint(0, suffix_range)
-            port = int(f"{prefix_num + suffix}")
-            try:
-                sock.bind((host, port))
-                break
-            except OSError:
-                continue
-        else:
-            raise RuntimeError(
-                f"Cannot find free port with prefix {prefix} after {max_retry} retries."
-            ) from None
-    else:
-        if port:
-            sock.bind((host, port))
-        else:
-            sock.bind((host, 0))
-    try:
-        yield sock.getsockname()[1]
-    finally:
-        sock.close()
 def save_dynamo_state(

--- a/deploy/sdk/src/dynamo/sdk/lib/resource.py
+++ b/deploy/sdk/src/dynamo/sdk/lib/resource.py
@@ -67,7 +67,6 @@ class GPUInfo:
        self.name = name
        self.uuid = uuid
        self.available = True  # Can be set to False if GPU is reserved/in use
-        self.temperature = 0  # in Celsius
        self.utilization = 0  # in percent (0-100)
        self.processes: list[GPUProcess] = []
@@ -142,14 +141,6 @@ class GPUManager:
                    index=i, total_memory=memory_info.total, name=name, uuid=uuid
                )
-                # Get additional GPU information if available
-                try:
-                    gpu_info.temperature = pynvml.nvmlDeviceGetTemperature(
-                        handle, pynvml.NVML_TEMPERATURE_GPU
-                    )
-                except pynvml.NVMLError:
-                    logger.debug(f"Could not get temperature for GPU {i}")
                try:
                    utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
                    gpu_info.utilization = utilization.gpu
@@ -173,7 +164,7 @@ class GPUManager:
            logger.warning(f"Error discovering GPUs: {e}")
    def update_gpu_stats(self):
-        """Update GPU statistics (utilization, memory, temperature, etc.)."""
+        """Update GPU statistics (utilization, memory etc.)."""
        if not self._initialized:
            return
@@ -185,14 +176,6 @@ class GPUManager:
                memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
                gpu.total_memory = memory_info.total
-                # Update temperature
-                try:
-                    gpu.temperature = pynvml.nvmlDeviceGetTemperature(
-                        handle, pynvml.NVML_TEMPERATURE_GPU
-                    )
-                except pynvml.NVMLError:
-                    pass
                # Update utilization
                try:
                    utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
@@ -242,97 +225,6 @@ class GPUManager:
            logger.warning(f"Error getting GPU memory for GPU {index}: {e}")
            return (0, 0)
-    def get_gpu_utilization(self, index: int) -> int:
-        """
-        Return GPU utilization percentage for a specific GPU.
-        Args:
-            index: GPU index
-        Returns:
-            GPU utilization percentage (0-100)
-        """
-        if not self._initialized or index >= len(self.gpus):
-            return 0
-        try:
-            handle = pynvml.nvmlDeviceGetHandleByIndex(index)
-            utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
-            return utilization.gpu  # Returns GPU utilization percentage (0-100)
-        except pynvml.NVMLError as e:
-            logger.warning(f"Error getting GPU utilization for GPU {index}: {e}")
-            return 0
-    def get_gpu_temperature(self, index: int) -> int:
-        """
-        Return GPU temperature for a specific GPU.
-        Args:
-            index: GPU index
-        Returns:
-            GPU temperature in Celsius
-        """
-        if not self._initialized or index >= len(self.gpus):
-            return 0
-        try:
-            handle = pynvml.nvmlDeviceGetHandleByIndex(index)
-            return pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
-        except pynvml.NVMLError as e:
-            logger.warning(f"Error getting GPU temperature for GPU {index}: {e}")
-            return 0
-    def get_gpu_processes(self, index: int) -> list[GPUProcess]:
-        """
-        Return processes running on a specific GPU.
-        Args:
-            index: GPU index
-        Returns:
-            List of processes running on the GPU
-        """
-        if not self._initialized or index >= len(self.gpus):
-            return []
-        try:
-            handle = pynvml.nvmlDeviceGetHandleByIndex(index)
-            processes = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
-            return [
-                GPUProcess(pid=p.pid, used_memory=p.usedGpuMemory) for p in processes
-            ]
-        except pynvml.NVMLError as e:
-            logger.warning(f"Error getting GPU processes for GPU {index}: {e}")
-            return []
-    def get_best_gpu_for_memory(self, required_memory: int) -> int:
-        """
-        Return the index of the GPU with the most available memory that meets the requirement.
-        Args:
-            required_memory: Required memory in bytes
-        Returns:
-            GPU index, or -1 if no suitable GPU was found
-        """
-        if not self._initialized:
-            return -1
-        best_gpu = -1
-        max_free = 0
-        for gpu in self.gpus:
-            if not gpu.available:
-                continue
-            _, free = self.get_gpu_memory(gpu.index)
-            if free > required_memory and free > max_free:
-                max_free = free
-                best_gpu = gpu.index
-        return best_gpu
    def reset_allocations(self):
        """Reset all GPU allocations."""
        self._gpu_fractions = []
@@ -365,7 +257,6 @@ class GPUManager:
                    if total_memory > 0
                    else 0,
                    "gpu_utilization": gpu.utilization,
-                    "temperature": gpu.temperature,
                    "process_count": len(gpu.processes),
                    "processes": [
                        {