OpenDAS / text-generation-inference · Commit 59922f9b (unverified)

Authored Aug 13, 2024 by Wang, Yi; committed by GitHub on Aug 13, 2024.

add numa to improve cpu inference perf (#2330)

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>

Parent: cd9b15d1
Showing 2 changed files with 35 additions and 8 deletions:

Dockerfile_intel (+4, -8)
server/text_generation_server/models/flash_causal_lm.py (+31, -0)
Dockerfile_intel
@@ -106,7 +106,8 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
     g++ \
     git \
     wget \
-    cmake
+    cmake \
+    libnuma-dev

 ENV HUGGINGFACE_HUB_CACHE=/data \
     HF_HUB_ENABLE_HF_TRANSFER=1 \

@@ -135,7 +136,7 @@ RUN conda install -c conda-forge gperftools mkl
 RUN pip install https://download.pytorch.org/whl/nightly/cpu/torch-2.4.0.dev20240612%2Bcpu-cp310-cp310-linux_x86_64.whl
 RUN pip install https://download.pytorch.org/whl/nightly/cpu/torchvision-0.19.0.dev20240612%2Bcpu-cp310-cp310-linux_x86_64.whl
 RUN pip install https://download.pytorch.org/whl/nightly/cpu/torchaudio-2.4.0.dev20240612%2Bcpu-cp310-cp310-linux_x86_64.whl
-RUN pip install triton
+RUN pip install triton numa

 WORKDIR /usr/src

@@ -147,16 +148,11 @@ RUN cd intel-extension-for-pytorch && git submodule sync && git submodule update
 RUN cd torch-ccl && git submodule sync && git submodule update --init --recursive && pip install .

-ENV LD_PRELOAD=/opt/conda/lib/libtcmalloc.so:/opt/conda/lib/libiomp5.so
+ENV LD_PRELOAD=/opt/conda/lib/libtcmalloc.so
 ENV CCL_ROOT=/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch
 ENV I_MPI_ROOT=/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch
 ENV FI_PROVIDER_PATH=/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch/opt/mpi/libfabric/lib/prov:/usr/lib64/libfabric
 ENV LD_LIBRARY_PATH=/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch/opt/mpi/libfabric/lib:/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch/lib
-ENV KMP_BLOCKTIME=1
-ENV KMP_TPAUSE=0
-ENV KMP_FORKJOIN_BARRIER_PATTERN=dist,dist
-ENV KMP_PLAIN_BARRIER_PATTERN=dist,dist
-ENV KMP_REDUCTION_BARRIER_PATTERN=dist,dist

 # Install server
 COPY proto proto
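The pip package numa appears to be a Python binding over libnuma, which is why libnuma-dev is added to the apt dependencies in the same change. A minimal way to confirm the optional NUMA tooling is importable inside the built image (an illustrative snippet, not part of this commit; it mirrors the importlib guard used in flash_causal_lm.py below):

# Illustrative sanity check (not part of this commit): confirm the optional
# NUMA tooling installed above is importable, mirroring the importlib guard
# that flash_causal_lm.py uses before touching any NUMA APIs.
import importlib.util

for pkg in ("numa", "psutil"):
    status = "found" if importlib.util.find_spec(pkg) is not None else "missing"
    print(f"{pkg}: {status}")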
server/text_generation_server/models/flash_causal_lm.py
@@ -74,6 +74,36 @@ def get_sliding_windows() -> int:
     return SLIDING_WINDOW


+def init_cpu_threads_env(rank_id: int, world_size: int):
+    import importlib.util
+
+    if importlib.util.find_spec("numa") is not None:
+        import numa
+        import psutil
+
+        nodes = numa.get_max_node() + 1
+        rank_per_node = math.ceil(world_size / nodes)
+        num_cpus_per_nodes = int(psutil.cpu_count(logical=False) / nodes)
+        node_id = int(rank_id / rank_per_node)
+        rank_offset_per_node = rank_id % rank_per_node
+        if os.getenv("OMP_NUM_THREADS") is None:
+            num_cpus_per_rank = max(int(num_cpus_per_nodes / rank_per_node), 1)
+        else:
+            num_cpus_per_rank = int(os.getenv("OMP_NUM_THREADS"))
+        if len(numa.get_membind()) == nodes:
+            numa.set_membind([node_id])
+        torch.set_num_threads(num_cpus_per_rank)
+        if len(numa.get_affinity(0)) == psutil.cpu_count(logical=True):
+            cpu_start = num_cpus_per_rank * rank_offset_per_node
+            numa.set_affinity(
+                0,
+                list(numa.node_to_cpus(node_id))[
+                    cpu_start : cpu_start + num_cpus_per_rank
+                ],
+            )
+        logger.info(f"affinity={numa.get_affinity(0)}, membind = {numa.get_membind()}")
+
+
 @dataclass
 class FlashCausalLMBatch(Batch):
     batch_id: int
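To make the partitioning arithmetic in init_cpu_threads_env concrete, here is a standalone sketch of the same computation for a hypothetical machine with 2 NUMA nodes and 32 physical cores running world_size=4 ranks; the topology numbers are assumptions for illustration, not values taken from the commit:

import math

# Hypothetical topology (assumed values, for illustration only):
world_size = 4       # number of tensor-parallel ranks
nodes = 2            # NUMA nodes, i.e. numa.get_max_node() + 1
physical_cpus = 32   # psutil.cpu_count(logical=False)

rank_per_node = math.ceil(world_size / nodes)                   # 2 ranks per node
num_cpus_per_node = physical_cpus // nodes                      # 16 cores per node
num_cpus_per_rank = max(num_cpus_per_node // rank_per_node, 1)  # 8 cores per rank

for rank_id in range(world_size):
    node_id = rank_id // rank_per_node           # node this rank is bound to
    rank_offset = rank_id % rank_per_node        # rank's position within its node
    cpu_start = num_cpus_per_rank * rank_offset  # offset into the node's CPU list
    print(f"rank {rank_id}: node {node_id}, "
          f"node-local cores {cpu_start}..{cpu_start + num_cpus_per_rank - 1}")

Each rank is bound to a single node and receives a disjoint slice of that node's physical cores, avoiding the remote-memory accesses that NUMA-oblivious scheduling would incur, which is the CPU inference performance problem this commit targets.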
@@ -854,6 +884,7 @@ class FlashCausalLM(Model):
             device = torch.device("cpu")
             # Float16 doesn't exist on target.
             dtype = torch.bfloat16 if dtype is None else dtype
+            init_cpu_threads_env(rank_id=rank, world_size=world_size)
         else:
             raise NotImplementedError(f"{model_class} is only available on GPU")
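Note the guards in init_cpu_threads_env: the whole body is skipped when the optional numa package is absent, memory is rebound only if the current membind still spans every node, and CPU affinity is pinned only if the process may still run on every logical CPU. An explicit OMP_NUM_THREADS likewise overrides the computed per-rank thread count, so prior pinning (for example via numactl or a container runtime) is respected rather than clobbered.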