Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
34d317dc
Unverified
Commit
34d317dc
authored
Mar 25, 2026
by
Fadi Arafeh
Committed by
GitHub
Mar 25, 2026
Browse files
[CPU][UX][Perf] Enable tcmalloc by default (#37607)
Signed-off-by:
Fadi Arafeh
<
fadi.arafeh@arm.com
>
parent
7ac48fd3
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
94 additions
and
3 deletions
+94
-3
setup.py
setup.py
+65
-0
vllm/platforms/cpu.py
vllm/platforms/cpu.py
+29
-3
No files found.
setup.py
View file @
34d317dc
...
@@ -82,6 +82,66 @@ def is_freethreaded():
...
@@ -82,6 +82,66 @@ def is_freethreaded():
return
bool
(
sysconfig
.
get_config_var
(
"Py_GIL_DISABLED"
))
return
bool
(
sysconfig
.
get_config_var
(
"Py_GIL_DISABLED"
))
def should_bundle_tcmalloc() -> bool:
    """Tell whether tcmalloc should be bundled for the current build.

    Returns:
        True only when the build targets the ``cpu`` device, the host
        platform is Linux, and the machine type is ``aarch64`` or
        ``x86_64``; False otherwise.
    """
    import platform

    # Only CPU builds bundle tcmalloc.
    if VLLM_TARGET_DEVICE != "cpu":
        return False
    # Bundling is limited to Linux hosts.
    if not sys.platform.startswith("linux"):
        return False
    # Finally, restrict to the two supported machine types.
    return platform.machine() in ("aarch64", "x86_64")
def
find_tcmalloc
()
->
Path
|
None
:
try
:
# get all shared libs the dynamic loader knows about
output
=
subprocess
.
check_output
(
[
"ldconfig"
,
"-p"
],
text
=
True
,
stderr
=
subprocess
.
DEVNULL
,
)
except
Exception
:
return
None
# search for libtcmalloc and libtcmalloc_minimal
for
library_pattern
in
(
r
"\blibtcmalloc_minimal\.so\.(\d+)\b"
,
r
"\blibtcmalloc\.so\.(\d+)\b"
,
):
candidates
:
list
[
tuple
[
int
,
Path
]]
=
[]
for
line
in
output
.
splitlines
():
match
=
re
.
search
(
library_pattern
,
line
)
if
match
is
None
or
"=>"
not
in
line
:
continue
candidate
=
Path
(
line
.
split
(
"=>"
)[
1
].
strip
())
if
candidate
.
exists
():
candidates
.
append
((
int
(
match
.
group
(
1
)),
candidate
))
if
candidates
:
# if multiple candidates are found, pick the one with the highest
# version number
return
max
(
candidates
,
key
=
lambda
item
:
item
[
0
])[
1
]
return
None
def bundle_tcmalloc(build_lib: str) -> None:
    """Copy the system tcmalloc library into ``<build_lib>/vllm/libs``.

    The library is looked up with ``find_tcmalloc()``. When none is found
    a warning with an installation hint is logged and nothing is copied.

    Args:
        build_lib: Build output directory; the library is placed in its
            ``vllm/libs`` subdirectory, which is created if missing.
    """
    system_library = find_tcmalloc()
    if system_library is not None:
        libs_dir = os.path.join(build_lib, "vllm", "libs")
        os.makedirs(libs_dir, exist_ok=True)
        # Keep the original file name so the bundled copy is recognisable.
        destination = os.path.join(libs_dir, system_library.name)
        shutil.copy2(system_library, destination)
        logger.info("Bundled tcmalloc into wheel: %s", destination)
        return

    # Nothing to bundle: tell the user how to obtain tcmalloc.
    logger.warning(
        "Failed to locate tcmalloc. For best performance, "
        "please install tcmalloc (e.g. `sudo apt-get "
        "install -y --no-install-recommends libtcmalloc-minimal4`)"
    )
class
CMakeExtension
(
Extension
):
class
CMakeExtension
(
Extension
):
def
__init__
(
self
,
name
:
str
,
cmake_lists_dir
:
str
=
"."
,
**
kwa
)
->
None
:
def
__init__
(
self
,
name
:
str
,
cmake_lists_dir
:
str
=
"."
,
**
kwa
)
->
None
:
super
().
__init__
(
name
,
sources
=
[],
py_limited_api
=
not
is_freethreaded
(),
**
kwa
)
super
().
__init__
(
name
,
sources
=
[],
py_limited_api
=
not
is_freethreaded
(),
**
kwa
)
...
@@ -285,6 +345,10 @@ class cmake_build_ext(build_ext):
...
@@ -285,6 +345,10 @@ class cmake_build_ext(build_ext):
# First, run the standard build_ext command to compile the extensions
# First, run the standard build_ext command to compile the extensions
super
().
run
()
super
().
run
()
# bundle tcmalloc into CPU wheels for best OOB perf
if
should_bundle_tcmalloc
():
bundle_tcmalloc
(
self
.
build_lib
)
# copy vllm/vllm_flash_attn/**/*.py from self.build_lib to current
# copy vllm/vllm_flash_attn/**/*.py from self.build_lib to current
# directory so that they can be included in the editable build
# directory so that they can be included in the editable build
import
glob
import
glob
...
@@ -944,6 +1008,7 @@ if _build_custom_ops():
...
@@ -944,6 +1008,7 @@ if _build_custom_ops():
package_data
=
{
package_data
=
{
"vllm"
:
[
"vllm"
:
[
"py.typed"
,
"py.typed"
,
"libs/*.so*"
,
"model_executor/layers/fused_moe/configs/*.json"
,
"model_executor/layers/fused_moe/configs/*.json"
,
"model_executor/layers/quantization/utils/configs/*.json"
,
"model_executor/layers/quantization/utils/configs/*.json"
,
"entrypoints/serve/instrumentator/static/*.js"
,
"entrypoints/serve/instrumentator/static/*.js"
,
...
...
vllm/platforms/cpu.py
View file @
34d317dc
...
@@ -284,8 +284,9 @@ class CpuPlatform(Platform):
...
@@ -284,8 +284,9 @@ class CpuPlatform(Platform):
# Avoid inductor generates num_thread() and breaks the thread binding
# Avoid inductor generates num_thread() and breaks the thread binding
os
.
environ
[
"TORCHINDUCTOR_CPP_DYNAMIC_THREADS"
]
=
"1"
os
.
environ
[
"TORCHINDUCTOR_CPP_DYNAMIC_THREADS"
]
=
"1"
# Intel OpenMP setting
ld_preload_str
=
os
.
getenv
(
"LD_PRELOAD"
,
""
)
ld_preload_str
=
os
.
getenv
(
"LD_PRELOAD"
,
""
)
# Intel OpenMP setting
if
"libiomp5.so"
in
ld_preload_str
:
if
"libiomp5.so"
in
ld_preload_str
:
# The time(milliseconds) that a thread should wait after
# The time(milliseconds) that a thread should wait after
# completing the execution of a parallel region, before sleeping.
# completing the execution of a parallel region, before sleeping.
...
@@ -297,10 +298,35 @@ class CpuPlatform(Platform):
...
@@ -297,10 +298,35 @@ class CpuPlatform(Platform):
os
.
environ
[
"KMP_PLAIN_BARRIER_PATTERN"
]
=
"dist,dist"
os
.
environ
[
"KMP_PLAIN_BARRIER_PATTERN"
]
=
"dist,dist"
os
.
environ
[
"KMP_REDUCTION_BARRIER_PATTERN"
]
=
"dist,dist"
os
.
environ
[
"KMP_REDUCTION_BARRIER_PATTERN"
]
=
"dist,dist"
cpu_architecture
=
Platform
.
get_cpu_architecture
()
# LD_PRELOAD libtcmalloc, bundled under vllm/libs to reduce
# memory allocation overhead
if
(
platform
.
system
()
==
"Linux"
and
cpu_architecture
in
(
CpuArchEnum
.
ARM
,
CpuArchEnum
.
X86
)
and
"libtcmalloc"
not
in
ld_preload_str
):
vllm_pkg
=
os
.
path
.
dirname
(
os
.
path
.
dirname
(
__file__
))
tcmalloc_so
=
None
for
pattern
in
(
"libtcmalloc_minimal*.so*"
,
"libtcmalloc.so*"
):
tcmalloc_so_candidates
=
glob
.
glob
(
os
.
path
.
join
(
vllm_pkg
,
"libs"
,
pattern
)
)
if
tcmalloc_so_candidates
:
tcmalloc_so
=
tcmalloc_so_candidates
[
0
]
break
if
tcmalloc_so
is
not
None
:
if
ld_preload_str
:
ld_preload_str
=
f
"
{
tcmalloc_so
}
:
{
ld_preload_str
}
"
else
:
ld_preload_str
=
tcmalloc_so
os
.
environ
[
"LD_PRELOAD"
]
=
ld_preload_str
if
(
if
(
platform
.
system
()
==
"Linux"
platform
.
system
()
==
"Linux"
and
Platform
.
get_cpu_architecture
()
and
cpu_architecture
in
(
CpuArchEnum
.
ARM
,
CpuArchEnum
.
POWERPC
)
in
(
CpuArchEnum
.
ARM
,
CpuArchEnum
.
POWERPC
)
and
not
(
"libomp"
in
ld_preload_str
or
"libgomp"
in
ld_preload_str
)
and
not
(
"libomp"
in
ld_preload_str
or
"libgomp"
in
ld_preload_str
)
):
):
# We need to LD_PRELOAD PyTorch's libgomp, otherwise only
# We need to LD_PRELOAD PyTorch's libgomp, otherwise only
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment