Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
81d7a50f
Unverified
Commit
81d7a50f
authored
Jul 05, 2024
by
Yuan
Committed by
GitHub
Jul 04, 2024
Browse files
[Hardware][Intel CPU] Adding intel openmp tunings in Docker file (#6008)
Signed-off-by:
Yuan Zhou
<
yuan.zhou@intel.com
>
parent
27902d42
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
37 additions
and
5 deletions
+37
-5
.buildkite/run-cpu-test.sh
.buildkite/run-cpu-test.sh
+4
-2
Dockerfile.cpu
Dockerfile.cpu
+8
-2
vllm/utils.py
vllm/utils.py
+21
-0
vllm/worker/cpu_worker.py
vllm/worker/cpu_worker.py
+4
-1
No files found.
.buildkite/run-cpu-test.sh
View file @
81d7a50f
...
...
@@ -12,8 +12,10 @@ trap remove_docker_container EXIT
remove_docker_container
# Run the image
docker run
-itd
-v
~/.cache/huggingface:/root/.cache/huggingface
--cpuset-cpus
=
48-95
--cpuset-mems
=
1
--network
host
-e
HF_TOKEN
--env
VLLM_CPU_KVCACHE_SPACE
=
4
--name
cpu-test cpu-test
docker run
-itd
-v
~/.cache/huggingface:/root/.cache/huggingface
--cpuset-cpus
=
48-95
--cpuset-mems
=
1
--network
host
-e
HF_TOKEN
--env
VLLM_CPU_KVCACHE_SPACE
=
4
--name
cpu-test-avx2 cpu-test-avx2
docker run
-itd
--entrypoint
/bin/bash
-v
~/.cache/huggingface:/root/.cache/huggingface
--cpuset-cpus
=
48-95
\
--cpuset-mems
=
1
--network
host
-e
HF_TOKEN
--env
VLLM_CPU_KVCACHE_SPACE
=
4
--name
cpu-test cpu-test
docker run
-itd
--entrypoint
/bin/bash
-v
~/.cache/huggingface:/root/.cache/huggingface
--cpuset-cpus
=
48-95
\
--cpuset-mems
=
1
--network
host
-e
HF_TOKEN
--env
VLLM_CPU_KVCACHE_SPACE
=
4
--name
cpu-test-avx2 cpu-test-avx2
# offline inference
docker
exec
cpu-test bash
-c
"python3 examples/offline_inference.py"
...
...
Dockerfile.cpu
View file @
81d7a50f
...
...
@@ -6,7 +6,13 @@ RUN apt-get update -y \
&& apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
RUN echo 'export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD' >> ~/.bashrc
# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
# intel-openmp provides additional performance improvement vs. openmp
# tcmalloc provides better memory allocation efficiency, e.g., by holding memory in caches to speed up access to commonly used objects.
RUN pip install intel-openmp
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so:$LD_PRELOAD"
RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.3.100%2Bgit0eb3473-cp310-cp310-linux_x86_64.whl
...
...
@@ -31,4 +37,4 @@ WORKDIR /workspace/
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
CMD ["/bin/bash
"]
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server
"]
vllm/utils.py
View file @
81d7a50f
...
...
@@ -398,6 +398,27 @@ def update_environment_variables(envs: Dict[str, str]):
os
.
environ
[
k
]
=
v
def init_kmp_env():
    """Export Intel OpenMP (``KMP_*``) tuning variables into ``os.environ``.

    Does nothing unless ``is_cpu()`` is true and the ``LD_PRELOAD``
    environment variable mentions ``libiomp5.so``. When both hold, the
    variables below are written unconditionally, replacing any value
    already present in the environment.
    """
    if not is_cpu():
        return

    preloaded_libs = os.environ.get("LD_PRELOAD", "")
    if "libiomp5.so" not in preloaded_libs:
        return

    kmp_tunings = {
        # The time (milliseconds) that a thread should wait after completing
        # the execution of a parallel region, before sleeping.
        'KMP_BLOCKTIME': "1",
        # Dump settings on start up.
        'KMP_SETTINGS': "1",
        # Prevents the CPU from dropping into a low performance state.
        'KMP_TPAUSE': "0",
        # Provides fine granularity parallelism.
        'KMP_FORKJOIN_BARRIER_PATTERN': "dist,dist",
        'KMP_PLAIN_BARRIER_PATTERN': "dist,dist",
        'KMP_REDUCTION_BARRIER_PATTERN': "dist,dist",
    }
    os.environ.update(kmp_tunings)
def chunk_list(lst: List[T], chunk_size: int) -> List[List[T]]:
    """Return ``lst`` split into consecutive slices of ``chunk_size`` items.

    The slices are taken in order starting at index 0; the final slice may
    hold fewer than ``chunk_size`` items. The result is a fully built list
    of lists, not a generator.
    """
    pieces: List[List[T]] = []
    for start in range(0, len(lst), chunk_size):
        stop = start + chunk_size
        pieces.append(lst[start:stop])
    return pieces
...
...
vllm/worker/cpu_worker.py
View file @
81d7a50f
...
...
@@ -13,7 +13,7 @@ from vllm.distributed import (ensure_model_parallel_initialized,
from
vllm.logger
import
init_logger
from
vllm.model_executor
import
set_random_seed
from
vllm.sequence
import
ExecuteModelRequest
from
vllm.utils
import
STR_DTYPE_TO_TORCH_DTYPE
from
vllm.utils
import
STR_DTYPE_TO_TORCH_DTYPE
,
init_kmp_env
from
vllm.worker.cpu_model_runner
import
CPUModelRunner
from
vllm.worker.worker_base
import
(
LocalOrDistributedWorkerBase
,
LoraNotSupportedWorkerBase
,
WorkerInput
)
...
...
@@ -150,6 +150,9 @@ class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
if
self
.
is_driver_worker
:
assert
self
.
rank
==
0
,
"The driver worker must have rank 0."
# try to initialize intel openmp optimized tunings
init_kmp_env
()
if
self
.
model_config
.
trust_remote_code
:
# note: lazy import to avoid importing torch before initializing
from
vllm.utils
import
init_cached_hf_modules
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment