Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
5465f9ea
"lib/bindings/vscode:/vscode.git/clone" did not exist on "d9657b34e4594c04a29ea4e56dbe142aa65d34b2"
Unverified
Commit
5465f9ea
authored
Jun 16, 2025
by
Biswa Panda
Committed by
GitHub
Jun 16, 2025
Browse files
fix: handle error in port reserve and cleanup allocator (#1536)
parent
a67e682b
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
38 additions
and
150 deletions
+38
-150
deploy/sdk/src/dynamo/sdk/cli/allocator.py
deploy/sdk/src/dynamo/sdk/cli/allocator.py
+1
-2
deploy/sdk/src/dynamo/sdk/cli/circus.py
deploy/sdk/src/dynamo/sdk/cli/circus.py
+1
-1
deploy/sdk/src/dynamo/sdk/cli/utils.py
deploy/sdk/src/dynamo/sdk/cli/utils.py
+35
-37
deploy/sdk/src/dynamo/sdk/lib/resource.py
deploy/sdk/src/dynamo/sdk/lib/resource.py
+1
-110
No files found.
deploy/sdk/src/dynamo/sdk/cli/allocator.py
View file @
5465f9ea
...
@@ -233,8 +233,7 @@ class ResourceAllocator:
...
@@ -233,8 +233,7 @@ class ResourceAllocator:
f
"GPU
{
stat
[
'index'
]
}
(
{
stat
[
'name'
]
}
): "
f
"GPU
{
stat
[
'index'
]
}
(
{
stat
[
'name'
]
}
): "
f
"Memory:
{
format_memory_gb
(
stat
[
'free_memory'
])
}
free / "
f
"Memory:
{
format_memory_gb
(
stat
[
'free_memory'
])
}
free / "
f
"
{
format_memory_gb
(
stat
[
'total_memory'
])
}
total, "
f
"
{
format_memory_gb
(
stat
[
'total_memory'
])
}
total, "
f
"Utilization:
{
stat
[
'gpu_utilization'
]
}
%, "
f
"Utilization:
{
stat
[
'gpu_utilization'
]
}
% "
f
"Temperature:
{
stat
[
'temperature'
]
}
°C"
)
)
except
Exception
as
e
:
except
Exception
as
e
:
logger
.
debug
(
f
"Failed to get GPU stats:
{
e
}
"
)
logger
.
debug
(
f
"Failed to get GPU stats:
{
e
}
"
)
...
...
deploy/sdk/src/dynamo/sdk/cli/circus.py
View file @
5465f9ea
...
@@ -104,7 +104,7 @@ def get_env_or_reserved_port(env_var):
...
@@ -104,7 +104,7 @@ def get_env_or_reserved_port(env_var):
if
port_env
:
if
port_env
:
return
int
(
port_env
)
return
int
(
port_env
)
else
:
else
:
with
reserve_free_port
()
as
port
:
with
reserve_free_port
()
as
port
:
# type: ignore
return
port
return
port
...
...
deploy/sdk/src/dynamo/sdk/cli/utils.py
View file @
5465f9ea
...
@@ -23,9 +23,8 @@ import json
...
@@ -23,9 +23,8 @@ import json
import
logging
import
logging
import
os
import
os
import
pathlib
import
pathlib
import
random
import
socket
import
socket
from
typing
import
Any
,
DefaultDict
,
Dict
,
Iterator
,
Optional
,
Protocol
,
TextIO
,
Union
from
typing
import
Any
,
DefaultDict
,
Dict
,
Iterator
,
Protocol
,
TextIO
,
Union
import
typer
import
typer
import
yaml
import
yaml
...
@@ -59,47 +58,46 @@ class ServiceProtocol(Protocol):
...
@@ -59,47 +58,46 @@ class ServiceProtocol(Protocol):
...
...
class PortReserver:
    """Context manager that reserves a free TCP port by holding a bound socket.

    On entry an IPv4 stream socket is bound to ``(host, 0)`` so the operating
    system picks the port; that port number is the value of the ``with``
    target. The socket is closed when the context exits.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, host: str = "localhost"):
        # Host/interface the reservation socket is bound to.
        self.host = host
        # Socket holding the reservation; None whenever nothing is held.
        self.socket: socket.socket | None = None
        # Port number obtained by the most recent successful ``__enter__``.
        self.port: int | None = None

    def __enter__(self) -> int:
        """Bind a socket to an OS-assigned port on ``self.host``.

        Returns:
            The port number reported by ``getsockname()``.

        Raises:
            socket.error: if creating or binding the socket fails. The
                partially set-up socket is closed and a warning is logged
                before the exception is re-raised.
        """
        try:
            self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            self.socket.bind((self.host, 0))
            _, self.port = self.socket.getsockname()
            return self.port
        except socket.error as e:
            # Release whatever was created before propagating the failure.
            self.close_socket()
            logger.warning(f"Failed to reserve port on {self.host}: {str(e)}")
            raise

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Release the reservation; exceptions from the body are not suppressed."""
        self.close_socket()

    def close_socket(self):
        """Close the reservation socket, if any, and forget it.

        Errors raised while closing are logged and swallowed because this is
        cleanup code.

        Returns:
            True if closing raised and the error was swallowed, otherwise
            None.
        """
        # Drop the reference first so the instance never keeps pointing at a
        # closed socket, even if close() itself fails.
        held, self.socket = self.socket, None
        if not held:
            return None
        try:
            held.close()
        except socket.error as e:
            logger.warning(f"Error while closing socket: {str(e)}")
            # Don't re-raise the exception as this is cleanup code
            return True
        return None
@
contextlib
.
contextmanager
@
contextlib
.
contextmanager
def
reserve_free_port
(
def
reserve_free_port
(
host
:
str
=
"localhost"
,
host
:
str
=
"localhost"
,
port
:
int
|
None
=
None
,
prefix
:
Optional
[
str
]
=
None
,
max_retry
:
int
=
50
,
enable_so_reuseport
:
bool
=
False
,
)
->
Iterator
[
int
]:
)
->
Iterator
[
int
]:
"""
"""
detect free port and reserve until exit the context
Detect free port and reserve until exit the context.
Returns a context manager that yields the reserved port.
"""
"""
sock
=
socket
.
socket
(
socket
.
AF_INET
,
socket
.
SOCK_STREAM
)
with
PortReserver
(
host
)
as
port
:
if
enable_so_reuseport
:
yield
port
sock
.
setsockopt
(
socket
.
SOL_SOCKET
,
socket
.
SO_REUSEPORT
,
1
)
if
sock
.
getsockopt
(
socket
.
SOL_SOCKET
,
socket
.
SO_REUSEPORT
)
==
0
:
raise
RuntimeError
(
"Failed to set SO_REUSEPORT."
)
from
None
if
prefix
is
not
None
:
prefix_num
=
int
(
prefix
)
*
10
**
(
5
-
len
(
prefix
))
suffix_range
=
min
(
65535
-
prefix_num
,
10
**
(
5
-
len
(
prefix
)))
for
_
in
range
(
max_retry
):
suffix
=
random
.
randint
(
0
,
suffix_range
)
port
=
int
(
f
"
{
prefix_num
+
suffix
}
"
)
try
:
sock
.
bind
((
host
,
port
))
break
except
OSError
:
continue
else
:
raise
RuntimeError
(
f
"Cannot find free port with prefix
{
prefix
}
after
{
max_retry
}
retries."
)
from
None
else
:
if
port
:
sock
.
bind
((
host
,
port
))
else
:
sock
.
bind
((
host
,
0
))
try
:
yield
sock
.
getsockname
()[
1
]
finally
:
sock
.
close
()
def
save_dynamo_state
(
def
save_dynamo_state
(
...
...
deploy/sdk/src/dynamo/sdk/lib/resource.py
View file @
5465f9ea
...
@@ -67,7 +67,6 @@ class GPUInfo:
...
@@ -67,7 +67,6 @@ class GPUInfo:
self
.
name
=
name
self
.
name
=
name
self
.
uuid
=
uuid
self
.
uuid
=
uuid
self
.
available
=
True
# Can be set to False if GPU is reserved/in use
self
.
available
=
True
# Can be set to False if GPU is reserved/in use
self
.
temperature
=
0
# in Celsius
self
.
utilization
=
0
# in percent (0-100)
self
.
utilization
=
0
# in percent (0-100)
self
.
processes
:
list
[
GPUProcess
]
=
[]
self
.
processes
:
list
[
GPUProcess
]
=
[]
...
@@ -142,14 +141,6 @@ class GPUManager:
...
@@ -142,14 +141,6 @@ class GPUManager:
index
=
i
,
total_memory
=
memory_info
.
total
,
name
=
name
,
uuid
=
uuid
index
=
i
,
total_memory
=
memory_info
.
total
,
name
=
name
,
uuid
=
uuid
)
)
# Get additional GPU information if available
try
:
gpu_info
.
temperature
=
pynvml
.
nvmlDeviceGetTemperature
(
handle
,
pynvml
.
NVML_TEMPERATURE_GPU
)
except
pynvml
.
NVMLError
:
logger
.
debug
(
f
"Could not get temperature for GPU
{
i
}
"
)
try
:
try
:
utilization
=
pynvml
.
nvmlDeviceGetUtilizationRates
(
handle
)
utilization
=
pynvml
.
nvmlDeviceGetUtilizationRates
(
handle
)
gpu_info
.
utilization
=
utilization
.
gpu
gpu_info
.
utilization
=
utilization
.
gpu
...
@@ -173,7 +164,7 @@ class GPUManager:
...
@@ -173,7 +164,7 @@ class GPUManager:
logger
.
warning
(
f
"Error discovering GPUs:
{
e
}
"
)
logger
.
warning
(
f
"Error discovering GPUs:
{
e
}
"
)
def
update_gpu_stats
(
self
):
def
update_gpu_stats
(
self
):
"""Update GPU statistics (utilization, memory
, temperature,
etc.)."""
"""Update GPU statistics (utilization, memory etc.)."""
if
not
self
.
_initialized
:
if
not
self
.
_initialized
:
return
return
...
@@ -185,14 +176,6 @@ class GPUManager:
...
@@ -185,14 +176,6 @@ class GPUManager:
memory_info
=
pynvml
.
nvmlDeviceGetMemoryInfo
(
handle
)
memory_info
=
pynvml
.
nvmlDeviceGetMemoryInfo
(
handle
)
gpu
.
total_memory
=
memory_info
.
total
gpu
.
total_memory
=
memory_info
.
total
# Update temperature
try
:
gpu
.
temperature
=
pynvml
.
nvmlDeviceGetTemperature
(
handle
,
pynvml
.
NVML_TEMPERATURE_GPU
)
except
pynvml
.
NVMLError
:
pass
# Update utilization
# Update utilization
try
:
try
:
utilization
=
pynvml
.
nvmlDeviceGetUtilizationRates
(
handle
)
utilization
=
pynvml
.
nvmlDeviceGetUtilizationRates
(
handle
)
...
@@ -242,97 +225,6 @@ class GPUManager:
...
@@ -242,97 +225,6 @@ class GPUManager:
logger
.
warning
(
f
"Error getting GPU memory for GPU
{
index
}
:
{
e
}
"
)
logger
.
warning
(
f
"Error getting GPU memory for GPU
{
index
}
:
{
e
}
"
)
return
(
0
,
0
)
return
(
0
,
0
)
def get_gpu_utilization(self, index: int) -> int:
    """
    Return GPU utilization percentage for a specific GPU.

    Args:
        index: GPU index

    Returns:
        GPU utilization percentage (0-100). 0 is also returned when the
        manager is not initialized, when ``index`` is outside
        ``0 <= index < len(self.gpus)``, or when NVML raises an error.
    """
    # Bound the index on both sides: a negative value would slip past a
    # plain upper-bound check and be handed to NVML.
    if not self._initialized or not 0 <= index < len(self.gpus):
        return 0

    try:
        handle = pynvml.nvmlDeviceGetHandleByIndex(index)
        rates = pynvml.nvmlDeviceGetUtilizationRates(handle)
        return rates.gpu  # Returns GPU utilization percentage (0-100)
    except pynvml.NVMLError as e:
        logger.warning(f"Error getting GPU utilization for GPU {index}: {e}")
        return 0
def get_gpu_temperature(self, index: int) -> int:
    """
    Return GPU temperature for a specific GPU.

    Args:
        index: GPU index

    Returns:
        GPU temperature in Celsius. 0 is also returned when the manager is
        not initialized, when ``index`` is outside
        ``0 <= index < len(self.gpus)``, or when NVML raises an error.
    """
    # Bound the index on both sides: a negative value would slip past a
    # plain upper-bound check and be handed to NVML.
    if not self._initialized or not 0 <= index < len(self.gpus):
        return 0

    try:
        handle = pynvml.nvmlDeviceGetHandleByIndex(index)
        return pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
    except pynvml.NVMLError as e:
        logger.warning(f"Error getting GPU temperature for GPU {index}: {e}")
        return 0
def get_gpu_processes(self, index: int) -> list[GPUProcess]:
    """
    Return processes running on a specific GPU.

    The list is built from NVML's compute-running-process query; each entry
    carries the process id and the GPU memory it uses.

    Args:
        index: GPU index

    Returns:
        List of processes running on the GPU. An empty list is also returned
        when the manager is not initialized, when ``index`` is outside
        ``0 <= index < len(self.gpus)``, or when NVML raises an error.
    """
    # Bound the index on both sides: a negative value would slip past a
    # plain upper-bound check and be handed to NVML.
    if not self._initialized or not 0 <= index < len(self.gpus):
        return []

    try:
        handle = pynvml.nvmlDeviceGetHandleByIndex(index)
        running = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
        return [
            GPUProcess(pid=proc.pid, used_memory=proc.usedGpuMemory)
            for proc in running
        ]
    except pynvml.NVMLError as e:
        logger.warning(f"Error getting GPU processes for GPU {index}: {e}")
        return []
def get_best_gpu_for_memory(self, required_memory: int) -> int:
    """
    Return the index of the GPU with the most available memory that meets the requirement.

    Only GPUs whose ``available`` flag is set are considered. Free memory is
    taken from the second element of ``self.get_gpu_memory(gpu.index)``.

    Args:
        required_memory: Required memory in bytes

    Returns:
        GPU index, or -1 if no suitable GPU was found (including when the
        manager is not initialized)
    """
    if not self._initialized:
        return -1

    best_gpu = -1
    # Starting at 0 with a strict comparison below means a GPU reporting
    # 0 free bytes is never selected.
    max_free = 0

    for gpu in self.gpus:
        if not gpu.available:
            continue
        _, free = self.get_gpu_memory(gpu.index)
        # ">=": a GPU with exactly the requested amount free meets the
        # requirement and must not be rejected.
        if free >= required_memory and free > max_free:
            max_free = free
            best_gpu = gpu.index

    return best_gpu
def
reset_allocations
(
self
):
def
reset_allocations
(
self
):
"""Reset all GPU allocations."""
"""Reset all GPU allocations."""
self
.
_gpu_fractions
=
[]
self
.
_gpu_fractions
=
[]
...
@@ -365,7 +257,6 @@ class GPUManager:
...
@@ -365,7 +257,6 @@ class GPUManager:
if
total_memory
>
0
if
total_memory
>
0
else
0
,
else
0
,
"gpu_utilization"
:
gpu
.
utilization
,
"gpu_utilization"
:
gpu
.
utilization
,
"temperature"
:
gpu
.
temperature
,
"process_count"
:
len
(
gpu
.
processes
),
"process_count"
:
len
(
gpu
.
processes
),
"processes"
:
[
"processes"
:
[
{
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment