OpenDAS / text-generation-inference · Commit 59922f9b (unverified)

Authored Aug 13, 2024 by Wang, Yi; committed by GitHub on Aug 13, 2024.

add numa to improve cpu inference perf (#2330)

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>

Parent: cd9b15d1
Showing 2 changed files with 35 additions and 8 deletions:

Dockerfile_intel (+4, -8)
server/text_generation_server/models/flash_causal_lm.py (+31, -0)
Dockerfile_intel
@@ -106,7 +106,8 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
     g++ \
     git \
     wget \
-    cmake
+    cmake \
+    libnuma-dev

 ENV HUGGINGFACE_HUB_CACHE=/data \
     HF_HUB_ENABLE_HF_TRANSFER=1 \

@@ -135,7 +136,7 @@ RUN conda install -c conda-forge gperftools mkl
 RUN pip install https://download.pytorch.org/whl/nightly/cpu/torch-2.4.0.dev20240612%2Bcpu-cp310-cp310-linux_x86_64.whl
 RUN pip install https://download.pytorch.org/whl/nightly/cpu/torchvision-0.19.0.dev20240612%2Bcpu-cp310-cp310-linux_x86_64.whl
 RUN pip install https://download.pytorch.org/whl/nightly/cpu/torchaudio-2.4.0.dev20240612%2Bcpu-cp310-cp310-linux_x86_64.whl
-RUN pip install triton
+RUN pip install triton numa

 WORKDIR /usr/src

@@ -147,16 +148,11 @@ RUN cd intel-extension-for-pytorch && git submodule sync && git submodule update
 RUN cd torch-ccl && git submodule sync && git submodule update --init --recursive && pip install .

-ENV LD_PRELOAD=/opt/conda/lib/libtcmalloc.so:/opt/conda/lib/libiomp5.so
+ENV LD_PRELOAD=/opt/conda/lib/libtcmalloc.so
 ENV CCL_ROOT=/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch
 ENV I_MPI_ROOT=/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch
 ENV FI_PROVIDER_PATH=/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch/opt/mpi/libfabric/lib/prov:/usr/lib64/libfabric
 ENV LD_LIBRARY_PATH=/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch/opt/mpi/libfabric/lib:/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch/lib
-ENV KMP_BLOCKTIME=1
-ENV KMP_TPAUSE=0
-ENV KMP_FORKJOIN_BARRIER_PATTERN=dist,dist
-ENV KMP_PLAIN_BARRIER_PATTERN=dist,dist
-ENV KMP_REDUCTION_BARRIER_PATTERN=dist,dist

 # Install server
 COPY proto proto
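The pip package numa appears to be a Python binding over libnuma, which is why libnuma-dev is added to the apt dependencies in the same change. A minimal way to confirm the optional NUMA tooling is importable inside the built image (an illustrative snippet, not part of this commit; it mirrors the importlib guard used in flash_causal_lm.py below):

# Illustrative sanity check (not part of this commit): confirm the optional
# NUMA tooling installed above is importable, mirroring the importlib guard
# that flash_causal_lm.py uses before touching any NUMA APIs.
import importlib.util

for pkg in ("numa", "psutil"):
    status = "found" if importlib.util.find_spec(pkg) is not None else "missing"
    print(f"{pkg}: {status}")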
server/text_generation_server/models/flash_causal_lm.py
@@ -74,6 +74,36 @@ def get_sliding_windows() -> int:
     return SLIDING_WINDOW


+def init_cpu_threads_env(rank_id: int, world_size: int):
+    import importlib.util
+
+    if importlib.util.find_spec("numa") is not None:
+        import numa
+        import psutil
+
+        nodes = numa.get_max_node() + 1
+        rank_per_node = math.ceil(world_size / nodes)
+        num_cpus_per_nodes = int(psutil.cpu_count(logical=False) / nodes)
+        node_id = int(rank_id / rank_per_node)
+        rank_offset_per_node = rank_id % rank_per_node
+        if os.getenv("OMP_NUM_THREADS") is None:
+            num_cpus_per_rank = max(int(num_cpus_per_nodes / rank_per_node), 1)
+        else:
+            num_cpus_per_rank = int(os.getenv("OMP_NUM_THREADS"))
+        if len(numa.get_membind()) == nodes:
+            numa.set_membind([node_id])
+        torch.set_num_threads(num_cpus_per_rank)
+        if len(numa.get_affinity(0)) == psutil.cpu_count(logical=True):
+            cpu_start = num_cpus_per_rank * rank_offset_per_node
+            numa.set_affinity(
+                0,
+                list(numa.node_to_cpus(node_id))[
+                    cpu_start : cpu_start + num_cpus_per_rank
+                ],
+            )
+        logger.info(f"affinity={numa.get_affinity(0)}, membind = {numa.get_membind()}")
+
+
 @dataclass
 class FlashCausalLMBatch(Batch):
     batch_id: int
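To make the partitioning arithmetic in init_cpu_threads_env concrete, here is a standalone sketch of the same computation for a hypothetical machine with 2 NUMA nodes and 32 physical cores running world_size=4 ranks; the topology numbers are assumptions for illustration, not values taken from the commit:

import math

# Hypothetical topology (assumed values, for illustration only):
world_size = 4       # number of tensor-parallel ranks
nodes = 2            # NUMA nodes, i.e. numa.get_max_node() + 1
physical_cpus = 32   # psutil.cpu_count(logical=False)

rank_per_node = math.ceil(world_size / nodes)                   # 2 ranks per node
num_cpus_per_node = physical_cpus // nodes                      # 16 cores per node
num_cpus_per_rank = max(num_cpus_per_node // rank_per_node, 1)  # 8 cores per rank

for rank_id in range(world_size):
    node_id = rank_id // rank_per_node           # node this rank is bound to
    rank_offset = rank_id % rank_per_node        # rank's position within its node
    cpu_start = num_cpus_per_rank * rank_offset  # offset into the node's CPU list
    print(f"rank {rank_id}: node {node_id}, "
          f"node-local cores {cpu_start}..{cpu_start + num_cpus_per_rank - 1}")

Each rank is bound to a single node and receives a disjoint slice of that node's physical cores, avoiding the remote-memory accesses that NUMA-oblivious scheduling would incur, which is the CPU inference performance problem this commit targets.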
@@ -854,6 +884,7 @@ class FlashCausalLM(Model):
             device = torch.device("cpu")
             # Float16 doesn't exist on target.
             dtype = torch.bfloat16 if dtype is None else dtype
+            init_cpu_threads_env(rank_id=rank, world_size=world_size)
         else:
             raise NotImplementedError(f"{model_class} is only available on GPU")
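Note the guards in init_cpu_threads_env: the whole body is skipped when the optional numa package is absent, memory is rebound only if the current membind still spans every node, and CPU affinity is pinned only if the process may still run on every logical CPU. An explicit OMP_NUM_THREADS likewise overrides the computed per-rank thread count, so prior pinning (for example via numactl or a container runtime) is respected rather than clobbered.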