Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
5465f9ea
Unverified
Commit
5465f9ea
authored
Jun 16, 2025
by
Biswa Panda
Committed by
GitHub
Jun 16, 2025
Browse files
fix: handle error in port reserve and cleanup allocator (#1536)
parent
a67e682b
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
38 additions
and
150 deletions
+38
-150
deploy/sdk/src/dynamo/sdk/cli/allocator.py
deploy/sdk/src/dynamo/sdk/cli/allocator.py
+1
-2
deploy/sdk/src/dynamo/sdk/cli/circus.py
deploy/sdk/src/dynamo/sdk/cli/circus.py
+1
-1
deploy/sdk/src/dynamo/sdk/cli/utils.py
deploy/sdk/src/dynamo/sdk/cli/utils.py
+35
-37
deploy/sdk/src/dynamo/sdk/lib/resource.py
deploy/sdk/src/dynamo/sdk/lib/resource.py
+1
-110
No files found.
deploy/sdk/src/dynamo/sdk/cli/allocator.py
View file @
5465f9ea
...
@@ -233,8 +233,7 @@ class ResourceAllocator:
...
@@ -233,8 +233,7 @@ class ResourceAllocator:
f
"GPU
{
stat
[
'index'
]
}
(
{
stat
[
'name'
]
}
): "
f
"GPU
{
stat
[
'index'
]
}
(
{
stat
[
'name'
]
}
): "
f
"Memory:
{
format_memory_gb
(
stat
[
'free_memory'
])
}
free / "
f
"Memory:
{
format_memory_gb
(
stat
[
'free_memory'
])
}
free / "
f
"
{
format_memory_gb
(
stat
[
'total_memory'
])
}
total, "
f
"
{
format_memory_gb
(
stat
[
'total_memory'
])
}
total, "
f
"Utilization:
{
stat
[
'gpu_utilization'
]
}
%, "
f
"Utilization:
{
stat
[
'gpu_utilization'
]
}
% "
f
"Temperature:
{
stat
[
'temperature'
]
}
°C"
)
)
except
Exception
as
e
:
except
Exception
as
e
:
logger
.
debug
(
f
"Failed to get GPU stats:
{
e
}
"
)
logger
.
debug
(
f
"Failed to get GPU stats:
{
e
}
"
)
...
...
deploy/sdk/src/dynamo/sdk/cli/circus.py
View file @
5465f9ea
...
@@ -104,7 +104,7 @@ def get_env_or_reserved_port(env_var):
...
@@ -104,7 +104,7 @@ def get_env_or_reserved_port(env_var):
if
port_env
:
if
port_env
:
return
int
(
port_env
)
return
int
(
port_env
)
else
:
else
:
with
reserve_free_port
()
as
port
:
with
reserve_free_port
()
as
port
:
# type: ignore
return
port
return
port
...
...
deploy/sdk/src/dynamo/sdk/cli/utils.py
View file @
5465f9ea
...
@@ -23,9 +23,8 @@ import json
...
@@ -23,9 +23,8 @@ import json
import
logging
import
logging
import
os
import
os
import
pathlib
import
pathlib
import
random
import
socket
import
socket
from
typing
import
Any
,
DefaultDict
,
Dict
,
Iterator
,
Optional
,
Protocol
,
TextIO
,
Union
from
typing
import
Any
,
DefaultDict
,
Dict
,
Iterator
,
Protocol
,
TextIO
,
Union
import
typer
import
typer
import
yaml
import
yaml
...
@@ -59,47 +58,46 @@ class ServiceProtocol(Protocol):
...
@@ -59,47 +58,46 @@ class ServiceProtocol(Protocol):
...
...
class
PortReserver
:
def
__init__
(
self
,
host
:
str
=
"localhost"
):
self
.
host
=
host
self
.
socket
:
socket
.
socket
|
None
=
None
self
.
port
:
int
|
None
=
None
def
__enter__
(
self
)
->
int
:
try
:
self
.
socket
=
socket
.
socket
(
socket
.
AF_INET
,
socket
.
SOCK_STREAM
)
self
.
socket
.
bind
((
self
.
host
,
0
))
_
,
self
.
port
=
self
.
socket
.
getsockname
()
return
self
.
port
except
socket
.
error
as
e
:
self
.
close_socket
()
logger
.
warning
(
f
"Failed to reserve port on
{
self
.
host
}
:
{
str
(
e
)
}
"
)
raise
def
__exit__
(
self
,
exc_type
,
exc_val
,
exc_tb
):
self
.
close_socket
()
def
close_socket
(
self
):
try
:
if
self
.
socket
:
self
.
socket
.
close
()
except
socket
.
error
as
e
:
logger
.
warning
(
f
"Error while closing socket:
{
str
(
e
)
}
"
)
# Don't re-raise the exception as this is cleanup code
return
True
@
contextlib
.
contextmanager
@
contextlib
.
contextmanager
def
reserve_free_port
(
def
reserve_free_port
(
host
:
str
=
"localhost"
,
host
:
str
=
"localhost"
,
port
:
int
|
None
=
None
,
prefix
:
Optional
[
str
]
=
None
,
max_retry
:
int
=
50
,
enable_so_reuseport
:
bool
=
False
,
)
->
Iterator
[
int
]:
)
->
Iterator
[
int
]:
"""
"""
detect free port and reserve until exit the context
Detect free port and reserve until exit the context.
Returns a context manager that yields the reserved port.
"""
"""
sock
=
socket
.
socket
(
socket
.
AF_INET
,
socket
.
SOCK_STREAM
)
with
PortReserver
(
host
)
as
port
:
if
enable_so_reuseport
:
yield
port
sock
.
setsockopt
(
socket
.
SOL_SOCKET
,
socket
.
SO_REUSEPORT
,
1
)
if
sock
.
getsockopt
(
socket
.
SOL_SOCKET
,
socket
.
SO_REUSEPORT
)
==
0
:
raise
RuntimeError
(
"Failed to set SO_REUSEPORT."
)
from
None
if
prefix
is
not
None
:
prefix_num
=
int
(
prefix
)
*
10
**
(
5
-
len
(
prefix
))
suffix_range
=
min
(
65535
-
prefix_num
,
10
**
(
5
-
len
(
prefix
)))
for
_
in
range
(
max_retry
):
suffix
=
random
.
randint
(
0
,
suffix_range
)
port
=
int
(
f
"
{
prefix_num
+
suffix
}
"
)
try
:
sock
.
bind
((
host
,
port
))
break
except
OSError
:
continue
else
:
raise
RuntimeError
(
f
"Cannot find free port with prefix
{
prefix
}
after
{
max_retry
}
retries."
)
from
None
else
:
if
port
:
sock
.
bind
((
host
,
port
))
else
:
sock
.
bind
((
host
,
0
))
try
:
yield
sock
.
getsockname
()[
1
]
finally
:
sock
.
close
()
def
save_dynamo_state
(
def
save_dynamo_state
(
...
...
deploy/sdk/src/dynamo/sdk/lib/resource.py
View file @
5465f9ea
...
@@ -67,7 +67,6 @@ class GPUInfo:
...
@@ -67,7 +67,6 @@ class GPUInfo:
self
.
name
=
name
self
.
name
=
name
self
.
uuid
=
uuid
self
.
uuid
=
uuid
self
.
available
=
True
# Can be set to False if GPU is reserved/in use
self
.
available
=
True
# Can be set to False if GPU is reserved/in use
self
.
temperature
=
0
# in Celsius
self
.
utilization
=
0
# in percent (0-100)
self
.
utilization
=
0
# in percent (0-100)
self
.
processes
:
list
[
GPUProcess
]
=
[]
self
.
processes
:
list
[
GPUProcess
]
=
[]
...
@@ -142,14 +141,6 @@ class GPUManager:
...
@@ -142,14 +141,6 @@ class GPUManager:
index
=
i
,
total_memory
=
memory_info
.
total
,
name
=
name
,
uuid
=
uuid
index
=
i
,
total_memory
=
memory_info
.
total
,
name
=
name
,
uuid
=
uuid
)
)
# Get additional GPU information if available
try
:
gpu_info
.
temperature
=
pynvml
.
nvmlDeviceGetTemperature
(
handle
,
pynvml
.
NVML_TEMPERATURE_GPU
)
except
pynvml
.
NVMLError
:
logger
.
debug
(
f
"Could not get temperature for GPU
{
i
}
"
)
try
:
try
:
utilization
=
pynvml
.
nvmlDeviceGetUtilizationRates
(
handle
)
utilization
=
pynvml
.
nvmlDeviceGetUtilizationRates
(
handle
)
gpu_info
.
utilization
=
utilization
.
gpu
gpu_info
.
utilization
=
utilization
.
gpu
...
@@ -173,7 +164,7 @@ class GPUManager:
...
@@ -173,7 +164,7 @@ class GPUManager:
logger
.
warning
(
f
"Error discovering GPUs:
{
e
}
"
)
logger
.
warning
(
f
"Error discovering GPUs:
{
e
}
"
)
def
update_gpu_stats
(
self
):
def
update_gpu_stats
(
self
):
"""Update GPU statistics (utilization, memory
, temperature,
etc.)."""
"""Update GPU statistics (utilization, memory etc.)."""
if
not
self
.
_initialized
:
if
not
self
.
_initialized
:
return
return
...
@@ -185,14 +176,6 @@ class GPUManager:
...
@@ -185,14 +176,6 @@ class GPUManager:
memory_info
=
pynvml
.
nvmlDeviceGetMemoryInfo
(
handle
)
memory_info
=
pynvml
.
nvmlDeviceGetMemoryInfo
(
handle
)
gpu
.
total_memory
=
memory_info
.
total
gpu
.
total_memory
=
memory_info
.
total
# Update temperature
try
:
gpu
.
temperature
=
pynvml
.
nvmlDeviceGetTemperature
(
handle
,
pynvml
.
NVML_TEMPERATURE_GPU
)
except
pynvml
.
NVMLError
:
pass
# Update utilization
# Update utilization
try
:
try
:
utilization
=
pynvml
.
nvmlDeviceGetUtilizationRates
(
handle
)
utilization
=
pynvml
.
nvmlDeviceGetUtilizationRates
(
handle
)
...
@@ -242,97 +225,6 @@ class GPUManager:
...
@@ -242,97 +225,6 @@ class GPUManager:
logger
.
warning
(
f
"Error getting GPU memory for GPU
{
index
}
:
{
e
}
"
)
logger
.
warning
(
f
"Error getting GPU memory for GPU
{
index
}
:
{
e
}
"
)
return
(
0
,
0
)
return
(
0
,
0
)
def
get_gpu_utilization
(
self
,
index
:
int
)
->
int
:
"""
Return GPU utilization percentage for a specific GPU.
Args:
index: GPU index
Returns:
GPU utilization percentage (0-100)
"""
if
not
self
.
_initialized
or
index
>=
len
(
self
.
gpus
):
return
0
try
:
handle
=
pynvml
.
nvmlDeviceGetHandleByIndex
(
index
)
utilization
=
pynvml
.
nvmlDeviceGetUtilizationRates
(
handle
)
return
utilization
.
gpu
# Returns GPU utilization percentage (0-100)
except
pynvml
.
NVMLError
as
e
:
logger
.
warning
(
f
"Error getting GPU utilization for GPU
{
index
}
:
{
e
}
"
)
return
0
def
get_gpu_temperature
(
self
,
index
:
int
)
->
int
:
"""
Return GPU temperature for a specific GPU.
Args:
index: GPU index
Returns:
GPU temperature in Celsius
"""
if
not
self
.
_initialized
or
index
>=
len
(
self
.
gpus
):
return
0
try
:
handle
=
pynvml
.
nvmlDeviceGetHandleByIndex
(
index
)
return
pynvml
.
nvmlDeviceGetTemperature
(
handle
,
pynvml
.
NVML_TEMPERATURE_GPU
)
except
pynvml
.
NVMLError
as
e
:
logger
.
warning
(
f
"Error getting GPU temperature for GPU
{
index
}
:
{
e
}
"
)
return
0
def
get_gpu_processes
(
self
,
index
:
int
)
->
list
[
GPUProcess
]:
"""
Return processes running on a specific GPU.
Args:
index: GPU index
Returns:
List of processes running on the GPU
"""
if
not
self
.
_initialized
or
index
>=
len
(
self
.
gpus
):
return
[]
try
:
handle
=
pynvml
.
nvmlDeviceGetHandleByIndex
(
index
)
processes
=
pynvml
.
nvmlDeviceGetComputeRunningProcesses
(
handle
)
return
[
GPUProcess
(
pid
=
p
.
pid
,
used_memory
=
p
.
usedGpuMemory
)
for
p
in
processes
]
except
pynvml
.
NVMLError
as
e
:
logger
.
warning
(
f
"Error getting GPU processes for GPU
{
index
}
:
{
e
}
"
)
return
[]
def
get_best_gpu_for_memory
(
self
,
required_memory
:
int
)
->
int
:
"""
Return the index of the GPU with the most available memory that meets the requirement.
Args:
required_memory: Required memory in bytes
Returns:
GPU index, or -1 if no suitable GPU was found
"""
if
not
self
.
_initialized
:
return
-
1
best_gpu
=
-
1
max_free
=
0
for
gpu
in
self
.
gpus
:
if
not
gpu
.
available
:
continue
_
,
free
=
self
.
get_gpu_memory
(
gpu
.
index
)
if
free
>
required_memory
and
free
>
max_free
:
max_free
=
free
best_gpu
=
gpu
.
index
return
best_gpu
def
reset_allocations
(
self
):
def
reset_allocations
(
self
):
"""Reset all GPU allocations."""
"""Reset all GPU allocations."""
self
.
_gpu_fractions
=
[]
self
.
_gpu_fractions
=
[]
...
@@ -365,7 +257,6 @@ class GPUManager:
...
@@ -365,7 +257,6 @@ class GPUManager:
if
total_memory
>
0
if
total_memory
>
0
else
0
,
else
0
,
"gpu_utilization"
:
gpu
.
utilization
,
"gpu_utilization"
:
gpu
.
utilization
,
"temperature"
:
gpu
.
temperature
,
"process_count"
:
len
(
gpu
.
processes
),
"process_count"
:
len
(
gpu
.
processes
),
"processes"
:
[
"processes"
:
[
{
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment