Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
ac9fb732
Unverified
Commit
ac9fb732
authored
Jul 17, 2025
by
Eric Curtin
Committed by
GitHub
Jul 17, 2025
Browse files
On environments where numa cannot be detected we get 0 (#21115)
Signed-off-by:
Eric Curtin
<
ecurtin@redhat.com
>
parent
a3a6c695
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
111 additions
and
77 deletions
+111
-77
vllm/v1/worker/cpu_worker.py
vllm/v1/worker/cpu_worker.py
+111
-77
No files found.
vllm/v1/worker/cpu_worker.py
View file @
ac9fb732
...
@@ -13,12 +13,20 @@ from vllm.logger import init_logger
...
@@ -13,12 +13,20 @@ from vllm.logger import init_logger
from
vllm.model_executor.utils
import
set_random_seed
from
vllm.model_executor.utils
import
set_random_seed
from
vllm.platforms
import
CpuArchEnum
,
current_platform
from
vllm.platforms
import
CpuArchEnum
,
current_platform
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
from
vllm.utils
import
PlaceholderModule
from
vllm.v1.core.sched.output
import
SchedulerOutput
from
vllm.v1.core.sched.output
import
SchedulerOutput
from
vllm.v1.outputs
import
ModelRunnerOutput
from
vllm.v1.outputs
import
ModelRunnerOutput
from
vllm.v1.worker.cpu_model_runner
import
CPUModelRunner
from
vllm.v1.worker.cpu_model_runner
import
CPUModelRunner
from
vllm.v1.worker.gpu_worker
import
(
Worker
,
from
vllm.v1.worker.gpu_worker
import
(
Worker
,
init_worker_distributed_environment
)
init_worker_distributed_environment
)
try
:
import
psutil
from
numa
import
info
except
ImportError
:
psutil
=
PlaceholderModule
(
"psutil"
)
# type: ignore[assignment]
numa
=
PlaceholderModule
(
"numa"
)
# type: ignore[assignment]
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
...
@@ -37,6 +45,8 @@ class CPUWorker(Worker):
...
@@ -37,6 +45,8 @@ class CPUWorker(Worker):
is_driver_worker
=
is_driver_worker
)
is_driver_worker
=
is_driver_worker
)
self
.
parallel_config
.
disable_custom_all_reduce
=
True
self
.
parallel_config
.
disable_custom_all_reduce
=
True
self
.
manually_bind_threads_suggestion
=
(
"To get better performance, please try to manually bind threads."
)
def
init_device
(
self
):
def
init_device
(
self
):
# Setup OpenMP threads affinity.
# Setup OpenMP threads affinity.
...
@@ -112,25 +122,43 @@ class CPUWorker(Worker):
...
@@ -112,25 +122,43 @@ class CPUWorker(Worker):
assert
isinstance
(
output
,
ModelRunnerOutput
)
assert
isinstance
(
output
,
ModelRunnerOutput
)
return
output
if
self
.
is_driver_worker
else
None
return
output
if
self
.
is_driver_worker
else
None
def
get_cpus_id_binding_based_on_numa_nodes
(
self
)
->
str
:
def
warn_inability_to_detect_numa
(
self
)
->
None
:
"""Return CPUs id binding based on NUMA nodes.
logger
.
warning
(
"""
"Auto thread-binding failed due to the "
rank_to_cpus
=
self
.
local_omp_cpuid
"inability to detect numa nodes. %s"
,
# Setup OpenMP thread affinity based on NUMA nodes automatically
self
.
manually_bind_threads_suggestion
)
world_size
=
self
.
vllm_config
.
parallel_config
.
world_size
libnuma_found
=
util
.
find_spec
(
"numa"
)
is
not
None
def
warn_lack_of_numa_and_psutil
(
self
)
->
None
:
psutil_found
=
util
.
find_spec
(
"psutil"
)
is
not
None
logger
.
warning
(
if
libnuma_found
and
psutil_found
:
"Auto thread-binding failed due to "
import
psutil
"the lack of package numa and psutil. %s"
,
from
numa
import
info
self
.
manually_bind_threads_suggestion
)
cpu_count
=
psutil
.
cpu_count
(
logical
=
False
)
def
warn_world_size_too_large
(
self
,
world_size
:
int
,
node_to_cpus_len
:
int
)
->
None
:
logger
.
warning
(
"Auto thread-binding failed due to "
"world size: %d being larger than "
"allowed NUMA nodes number: %d. %s"
,
world_size
,
node_to_cpus_len
,
self
.
manually_bind_threads_suggestion
)
def
get_cpus_allow_list_and_numa_size
(
self
):
cpus_allow_list
=
psutil
.
Process
().
cpu_affinity
()
cpus_allow_list
=
psutil
.
Process
().
cpu_affinity
()
numa_size
=
info
.
get_num_configured_nodes
()
numa_size
=
info
.
get_num_configured_nodes
()
return
cpus_allow_list
,
numa_size
def
auto_thread_binding_based_on_numa_nodes
(
self
,
world_size
:
int
,
rank_to_cpus
:
str
)
->
str
:
cpu_count
=
psutil
.
cpu_count
(
logical
=
False
)
cpus_allow_list
,
numa_size
=
self
.
get_cpus_allow_list_and_numa_size
()
if
not
numa_size
:
self
.
warn_inability_to_detect_numa
()
return
rank_to_cpus
cpu_count_per_numa
=
cpu_count
//
numa_size
cpu_count_per_numa
=
cpu_count
//
numa_size
num_of_reserved_cpu
=
min
(
envs
.
VLLM_CPU_NUM_OF_RESERVED_CPU
,
num_of_reserved_cpu
=
min
(
envs
.
VLLM_CPU_NUM_OF_RESERVED_CPU
,
cpu_count_per_numa
//
2
)
cpu_count_per_numa
//
2
)
# check allow node_to_cpus list
node_to_cpus
=
[]
node_to_cpus
=
[]
for
i
in
range
(
numa_size
):
for
i
in
range
(
numa_size
):
node_intersect
=
set
(
node_intersect
=
set
(
...
@@ -138,46 +166,45 @@ class CPUWorker(Worker):
...
@@ -138,46 +166,45 @@ class CPUWorker(Worker):
if
bool
(
node_intersect
):
if
bool
(
node_intersect
):
node_to_cpus
.
append
(
list
(
node_intersect
))
node_to_cpus
.
append
(
list
(
node_intersect
))
if
world_size
>
len
(
node_to_cpus
):
node_to_cpus_len
=
len
(
node_to_cpus
)
logger
.
error
(
if
world_size
>
node_to_cpus_len
:
"Auto thread-binding failed due to "
self
.
warn_world_size_too_large
(
world_size
,
node_to_cpus_len
)
"world size: %d is larger than "
"allowed NUMA nodes number: %d."
"Please try to bind threads manually."
,
world_size
,
len
(
node_to_cpus
))
else
:
else
:
end
=
cpu_count_per_numa
-
num_of_reserved_cpu
end
=
cpu_count_per_numa
-
num_of_reserved_cpu
rank_to_cpus_list
=
node_to_cpus
[
self
.
rank
][:
end
]
rank_to_cpus_list
=
node_to_cpus
[
self
.
rank
][:
end
]
rank_to_cpus
=
','
.
join
(
str
(
x
)
for
x
in
rank_to_cpus_list
)
rank_to_cpus
=
','
.
join
(
str
(
x
)
for
x
in
rank_to_cpus_list
)
logger
.
info
(
"auto thread-binding list: %s"
,
rank_to_cpus
)
logger
.
info
(
"auto thread-binding list: %s"
,
rank_to_cpus
)
else
:
logger
.
warning
(
"Auto thread-binding is not supported due to "
"the lack of package numa and psutil,"
"fallback to no thread-binding. To get better performance,"
"please try to manually bind threads."
)
return
rank_to_cpus
return
rank_to_cpus
def
get_cpus_id_binding_based_on_numa_nodes_ppc64le
(
self
)
->
str
:
def
libnuma_and_psutil_found
(
self
)
->
bool
:
"""
libnuma_found
=
util
.
find_spec
(
"numa"
)
is
not
None
Power (ppc64le) specific: Selects a subset of threads per core for
psutil_found
=
util
.
find_spec
(
"psutil"
)
is
not
None
each NUMA node.This is robust to SMT mode (SMT-8, SMT-4, etc)
because the OS only exposes available threads.This maximizes
performance by avoiding oversubscription of logical CPUs on Power.
"""
def
select_threads_per_power_core
(
node_cpu_ids
):
return
libnuma_found
and
psutil_found
return
[
cpu
for
cpu
in
node_cpu_ids
if
cpu
%
8
<
4
]
def
get_cpus_id_binding_based_on_numa_nodes
(
self
)
->
str
:
"""Return CPUs id binding based on NUMA nodes.
"""
rank_to_cpus
=
self
.
local_omp_cpuid
rank_to_cpus
=
self
.
local_omp_cpuid
# Setup OpenMP thread affinity based on NUMA nodes automatically
world_size
=
self
.
vllm_config
.
parallel_config
.
world_size
world_size
=
self
.
vllm_config
.
parallel_config
.
world_size
libnuma_found
=
util
.
find_spec
(
"numa"
)
is
not
None
if
self
.
libnuma_and_psutil_found
():
psutil_found
=
util
.
find_spec
(
"psutil"
)
is
not
None
rank_to_cpus
=
self
.
auto_thread_binding_based_on_numa_nodes
(
if
libnuma_found
and
psutil_found
:
world_size
,
rank_to_cpus
)
import
psutil
else
:
from
numa
import
info
self
.
warn_lack_of_numa_and_psutil
()
cpus_allow_list
=
psutil
.
Process
().
cpu_affinity
()
return
rank_to_cpus
numa_size
=
info
.
get_num_configured_nodes
()
def
select_threads_per_power_core
(
self
,
node_cpu_ids
:
list
[
int
])
->
list
[
int
]:
return
[
cpu
for
cpu
in
node_cpu_ids
if
cpu
%
8
<
4
]
def
auto_thread_binding_based_on_numa_nodes_ppc64le
(
self
,
world_size
:
int
,
rank_to_cpus
:
str
)
->
str
:
cpus_allow_list
,
numa_size
=
self
.
get_cpus_allow_list_and_numa_size
()
if
not
numa_size
:
self
.
warn_inability_to_detect_numa
()
return
rank_to_cpus
node_to_cpus
=
[]
node_to_cpus
=
[]
for
i
in
range
(
numa_size
):
for
i
in
range
(
numa_size
):
...
@@ -186,16 +213,12 @@ class CPUWorker(Worker):
...
@@ -186,16 +213,12 @@ class CPUWorker(Worker):
if
bool
(
node_intersect
):
if
bool
(
node_intersect
):
node_to_cpus
.
append
(
sorted
(
list
(
node_intersect
)))
node_to_cpus
.
append
(
sorted
(
list
(
node_intersect
)))
if
world_size
>
len
(
node_to_cpus
):
node_to_cpus_len
=
len
(
node_to_cpus
)
logger
.
error
(
if
world_size
>
node_to_cpus_len
:
"Auto thread-binding failed due to "
self
.
warn_world_size_too_large
(
world_size
,
node_to_cpus_len
)
"world size: %d is larger than "
"allowed NUMA nodes number: %d."
"Please try to bind threads manually."
,
world_size
,
len
(
node_to_cpus
))
else
:
else
:
node_cpus_this_rank
=
node_to_cpus
[
self
.
rank
]
node_cpus_this_rank
=
node_to_cpus
[
self
.
rank
]
node_cpus_this_rank
=
select_threads_per_power_core
(
node_cpus_this_rank
=
self
.
select_threads_per_power_core
(
node_cpus_this_rank
)
node_cpus_this_rank
)
cpu_count_per_numa
=
len
(
node_cpus_this_rank
)
cpu_count_per_numa
=
len
(
node_cpus_this_rank
)
num_of_reserved_cpu
=
min
(
envs
.
VLLM_CPU_NUM_OF_RESERVED_CPU
,
num_of_reserved_cpu
=
min
(
envs
.
VLLM_CPU_NUM_OF_RESERVED_CPU
,
...
@@ -204,10 +227,21 @@ class CPUWorker(Worker):
...
@@ -204,10 +227,21 @@ class CPUWorker(Worker):
rank_to_cpus_list
=
node_cpus_this_rank
[:
end
]
rank_to_cpus_list
=
node_cpus_this_rank
[:
end
]
rank_to_cpus
=
','
.
join
(
str
(
x
)
for
x
in
rank_to_cpus_list
)
rank_to_cpus
=
','
.
join
(
str
(
x
)
for
x
in
rank_to_cpus_list
)
logger
.
info
(
"ppc64le thread-binding list: %s"
,
rank_to_cpus
)
logger
.
info
(
"ppc64le thread-binding list: %s"
,
rank_to_cpus
)
return
rank_to_cpus
def
get_cpus_id_binding_based_on_numa_nodes_ppc64le
(
self
)
->
str
:
"""
Power (ppc64le) specific: Selects a subset of threads per core for
each NUMA node.This is robust to SMT mode (SMT-8, SMT-4, etc)
because the OS only exposes available threads.This maximizes
performance by avoiding oversubscription of logical CPUs on Power.
"""
rank_to_cpus
=
self
.
local_omp_cpuid
world_size
=
self
.
vllm_config
.
parallel_config
.
world_size
if
self
.
libnuma_and_psutil_found
():
rank_to_cpus
=
self
.
auto_thread_binding_based_on_numa_nodes_ppc64le
(
world_size
,
rank_to_cpus
)
else
:
else
:
logger
.
warning
(
self
.
warn_lack_of_numa_and_psutil
()
"Auto thread-binding is not supported due to "
"the lack of package numa and psutil,"
"fallback to no thread-binding. To get better performance,"
"please try to manually bind threads."
)
return
rank_to_cpus
return
rank_to_cpus
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment