Unverified Commit 9e2fdf57 authored by Nicolas Patry, committed by GitHub

Removing IPEX_AVAIL. (#2115)

* Removing IPEX_AVAIL.

CPU and XPU are now unified under `ipex`: most of the code is identical between the two, with only a few exceptions (see the device-selection sketch below).

Most of the remaining special cases are the kv-cache layout and the flash_xxx.py model files. Since those files should soon be factored away and removed, the duplication there is acceptable for now.

* Forgot a few places.

* Unrelated change.

* Fixing HF_TOKEN.

* HF_TOKEN
parent 3f3b7ffd
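
For readers skimming the diff, here is a minimal sketch of the device/dtype selection that this change applies across the flash_* model classes below. The helper name `pick_device_and_dtype` is invented for illustration; in the repository this logic lives inline in each model's `__init__`, and `SYSTEM` comes from `text_generation_server.utils.import_utils`.

```python
import torch

# SYSTEM normally comes from text_generation_server.utils.import_utils;
# it is hard-coded here only to keep the sketch self-contained.
SYSTEM = "ipex"


def pick_device_and_dtype(rank: int, dtype=None):
    # Mirrors the branch structure used in the flash_* models in this diff.
    if torch.cuda.is_available():
        device = torch.device(f"cuda:{rank}")
        dtype = torch.float16 if dtype is None else dtype
    elif SYSTEM == "ipex":
        # A single branch now covers both Intel GPUs (XPU) and CPU with IPEX.
        if hasattr(torch, "xpu") and torch.xpu.is_available():
            device = torch.device(f"xpu:{rank}")
        else:
            device = torch.device("cpu")
        dtype = torch.float16 if dtype is None else dtype
    else:
        raise NotImplementedError("Flash models are only available on GPU")
    return device, dtype
```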
@@ -178,6 +178,6 @@ jobs:
           export DOCKER_VOLUME=/mnt/cache
           export DOCKER_IMAGE=${{ needs.build-and-push.outputs.docker_image }}
           export DOCKER_DEVICES=${{ needs.build-and-push.outputs.docker_devices }}
-          export HF_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+          export HF_TOKEN=${{ secrets.HF_TOKEN }}
           echo $DOCKER_IMAGE
           pytest -s -vv integration-tests
@@ -22,5 +22,5 @@ jobs:
       - name: Run tests
         run: |
           pip install pytest pytest-asyncio
-          export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+          export HF_TOKEN=${{ secrets.HF_TOKEN }}
           make python-client-tests
@@ -37,5 +37,5 @@ jobs:
           export DOCKER_VOLUME=/mnt/cache
           export DOCKER_IMAGE=${{ inputs.docker_image }}
           export DOCKER_DEVICES=${{ inputs.docker_devices }}
-          export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+          export HF_TOKEN=${{ secrets.HF_TOKEN }}
           pytest -s -vv integration-tests
@@ -28,7 +28,7 @@ jobs:
       - name: Start starcoder
         run: |
-          docker run --name tgi-starcoder --rm --gpus all -p 3000:80 -v /mnt/cache:/data -e HF_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} --pull always -d ghcr.io/huggingface/text-generation-inference:latest --model-id bigcode/starcoder --num-shard 2 --max-batch-total-tokens 32768
+          docker run --name tgi-starcoder --rm --gpus all -p 3000:80 -v /mnt/cache:/data -e HF_TOKEN=${{ secrets.HF_TOKEN }} --pull always -d ghcr.io/huggingface/text-generation-inference:latest --model-id bigcode/starcoder --num-shard 2 --max-batch-total-tokens 32768
           sleep 10
           wget --timeout 10 --retry-on-http-error --waitretry=1 --tries=240 http://localhost:3000/health
...
@@ -72,7 +72,7 @@ jobs:
       - name: Run server tests
         run: |
           pip install pytest
-          export HF_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+          export HF_TOKEN=${{ secrets.HF_TOKEN }}
           pytest -s -vv server/tests
       - name: Pre-commit checks
         run: |
...
@@ -455,6 +455,6 @@ class DeployedModel(BaseModel):
     # Disable warning for use of `model_` prefix in `model_id`. Be mindful about adding members
     # with model_ prefixes, since this disables guardrails for colliding fields:
     # https://github.com/pydantic/pydantic/issues/9177
     model_config = ConfigDict(protected_namespaces=())
     model_id: str
     sha: str
-from text_generation_server.utils.import_utils import SYSTEM, IPEX_AVAIL
+from text_generation_server.utils.import_utils import SYSTEM
 import os
 if os.getenv("USE_FLASH_ATTENTION", "").lower() == "false":
@@ -7,7 +7,7 @@ if SYSTEM == "cuda":
     from .cuda import attention, paged_attention, reshape_and_cache, SUPPORTS_WINDOWING
 elif SYSTEM == "rocm":
     from .rocm import attention, paged_attention, reshape_and_cache, SUPPORTS_WINDOWING
-elif IPEX_AVAIL:
-    from .xpu import attention, paged_attention, reshape_and_cache, SUPPORTS_WINDOWING
+elif SYSTEM == "ipex":
+    from .ipex import attention, paged_attention, reshape_and_cache, SUPPORTS_WINDOWING
 else:
     raise ImportError(f"System {SYSTEM} doesn't support flash/paged attention")
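
The `import_utils.py` side of this change is not part of the excerpt above. The sketch below is an assumption about how `SYSTEM` is presumably resolved to "ipex" (covering both XPU and CPU) once `IPEX_AVAIL` is gone; it is not the literal file contents.

```python
import torch


def is_ipex_available() -> bool:
    # Assumed helper: detection presumably just probes for the IPEX package.
    try:
        import intel_extension_for_pytorch  # noqa: F401
    except ImportError:
        return False
    return True


# Assumed resolution order: ROCm/CUDA first, then IPEX (XPU or CPU), then plain CPU.
if torch.version.hip is not None:
    SYSTEM = "rocm"
elif torch.version.cuda is not None and torch.cuda.is_available():
    SYSTEM = "cuda"
elif is_ipex_available():
    SYSTEM = "ipex"
else:
    SYSTEM = "cpu"
```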
@@ -3,7 +3,6 @@ from torch import nn
 from accelerate import init_empty_weights
 from text_generation_server.utils.import_utils import (
     SYSTEM,
-    IPEX_AVAIL,
 )
@@ -83,7 +82,7 @@ elif SYSTEM == "rocm":
             return super().forward(hidden_states), residual
-elif IPEX_AVAIL:
+elif SYSTEM == "ipex":
     import intel_extension_for_pytorch as ipex
     class FastLayerNorm(nn.LayerNorm):
@@ -112,7 +111,7 @@ class FastRMSNorm(nn.Module):
         return cls(weight, eps)
     def forward(self, hidden_states, residual=None):
-        if IPEX_AVAIL:
+        if SYSTEM == "ipex":
            out = ipex.llm.functional.add_rms_norm(
                residual,
                hidden_states,
...
@@ -2,14 +2,14 @@ import os
 import torch
 from torch import nn
-from text_generation_server.utils.import_utils import SYSTEM, IPEX_AVAIL
+from text_generation_server.utils.import_utils import SYSTEM
 if SYSTEM == "cuda":
     from flash_attn.layers.rotary import RotaryEmbedding
     import rotary_emb
 elif SYSTEM == "rocm":
     from vllm._C import ops
-elif IPEX_AVAIL:
+elif SYSTEM == "ipex":
     import intel_extension_for_pytorch as ipex
@@ -69,7 +69,7 @@ class PositionRotaryEmbedding(nn.Module):
             # Inplace operation, updating query and key.
             ops.rotary_embedding(query, key, head_size, cos, sin, True)
-        elif IPEX_AVAIL:
+        elif SYSTEM == "ipex":
             ipex.llm.functional.rotary_embedding(
                 query, key, sin, cos, query.size(-1), True
             )
...
@@ -3,9 +3,9 @@ from torch.nn import functional as F
 from typing import Iterable, List
 from text_generation_server.layers.linear import get_linear, FastLinear
 from text_generation_server.layers.exl2 import Exl2Weight
-from text_generation_server.utils.import_utils import IPEX_AVAIL
+from text_generation_server.utils.import_utils import SYSTEM
-if IPEX_AVAIL:
+if SYSTEM == "ipex":
     import intel_extension_for_pytorch as ipex
@@ -100,7 +100,7 @@ class TensorParallelHead(SuperLayer):
                 local_out = gather_input.T
             torch.mm(input, self.linear.weight.T, out=local_out)
-            if IPEX_AVAIL:
+            if SYSTEM == "ipex":
                 ipex.distributed.all_gather_into_tensor(
                     world_out, gather_input, group=self.process_group
                 )
@@ -117,7 +117,7 @@ class TensorParallelHead(SuperLayer):
         world_output = [
             torch.empty_like(output) for _ in range(self.process_group.size())
         ]
-        if IPEX_AVAIL:
+        if SYSTEM == "ipex":
             ipex.distributed.all_gather(world_output, output, group=self.process_group)
         else:
             torch.distributed.all_gather(world_output, output, group=self.process_group)
@@ -217,7 +217,7 @@ class TensorParallelRowLinear(SuperLayer):
     def forward(self, input: torch.Tensor, reduce: bool = True) -> torch.Tensor:
         out = super().forward(input)
         if self.process_group.size() > 1 and reduce:
-            if IPEX_AVAIL:
+            if SYSTEM == "ipex":
                 ipex.distributed.all_reduce(out, group=self.process_group)
             else:
                 torch.distributed.all_reduce(out, group=self.process_group)
@@ -257,7 +257,7 @@ class TensorParallelEmbedding(torch.nn.Module):
         )
         out = torch.nn.functional.embedding(input, self.weight)
         if self.reduce and self.process_group.size() > 1:
-            if IPEX_AVAIL:
+            if SYSTEM == "ipex":
                 ipex.distributed.all_reduce(out, group=self.process_group)
             else:
                 torch.distributed.all_reduce(out, group=self.process_group)
...
@@ -20,9 +20,9 @@ from torch import nn
 from transformers.activations import ACT2FN
 from transformers.configuration_utils import PretrainedConfig
 from typing import Optional, List, Tuple, Any
-from text_generation_server.utils.import_utils import IPEX_AVAIL
+from text_generation_server.utils.import_utils import SYSTEM
-if not IPEX_AVAIL:
+if SYSTEM != "ipex":
     from vllm.model_executor.layers.fused_moe import fused_moe
 from text_generation_server.layers.attention import (
...
@@ -24,9 +24,9 @@ import torch.distributed
 import numpy as np
 from torch import nn
-from text_generation_server.utils.import_utils import IPEX_AVAIL
+from text_generation_server.utils.import_utils import SYSTEM
-if not IPEX_AVAIL:
+if SYSTEM != "ipex":
     from vllm.model_executor.layers.fused_moe import fused_moe
 from transformers.activations import ACT2FN
 from transformers.configuration_utils import PretrainedConfig
...
@@ -15,7 +15,7 @@ from typing import Iterable, Optional, Tuple, List, Type, Dict
 from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
 from text_generation_server.utils.chunks import concat_text_chunks
-from text_generation_server.utils.import_utils import SYSTEM, IPEX_AVAIL
+from text_generation_server.utils.import_utils import SYSTEM
 from text_generation_server.models import Model
 from text_generation_server.utils.tokens import batch_top_tokens
 from text_generation_server.utils.dist import RANK
@@ -768,12 +768,9 @@ class FlashCausalLM(Model):
         empty_cache()
         element_size = torch.tensor([], dtype=dtype).element_size()
-        if SYSTEM == "xpu":
-            x = 1
-        else:
-            x = BLOCK_SIZE // element_size
+        x = BLOCK_SIZE // element_size
-        if IPEX_AVAIL and SYSTEM == "cpu":
+        if SYSTEM == "ipex" and device == torch.device("cpu"):
             self.kv_cache = [
                 (
                     torch.empty(
...
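
As a companion to the truncated hunk above, here is a hedged sketch of the kv-cache allocation difference that the commit message calls out as the biggest remaining special case. The tensor shapes are an assumption based on the vLLM-style paged layout used by the flash attention kernels; the actual allocation code in `flash_causal_lm.py` is elided in this excerpt.

```python
import torch

BLOCK_SIZE = 16  # assumed paged-attention block size


def allocate_kv_cache(num_blocks, num_layers, num_heads, head_size, dtype, device, system):
    # Shapes are illustrative assumptions, not the verbatim repository code.
    element_size = torch.tensor([], dtype=dtype).element_size()
    x = BLOCK_SIZE // element_size
    if system == "ipex" and device == torch.device("cpu"):
        # IPEX on CPU keeps a plain contiguous layout for both key and value blocks.
        key_shape = value_shape = (num_blocks, num_heads, BLOCK_SIZE, head_size)
    else:
        # CUDA/ROCm/XPU use the split paged layout expected by the kernels.
        key_shape = (num_blocks, num_heads, head_size // x, BLOCK_SIZE, x)
        value_shape = (num_blocks, num_heads, head_size, BLOCK_SIZE)
    return [
        (
            torch.empty(key_shape, dtype=dtype, device=device),
            torch.empty(value_shape, dtype=dtype, device=device),
        )
        for _ in range(num_layers)
    ]
```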
@@ -15,7 +15,7 @@ from text_generation_server.utils import (
     weight_files,
     Weights,
 )
-from text_generation_server.utils.import_utils import SYSTEM, IPEX_AVAIL
+from text_generation_server.utils.import_utils import SYSTEM
 tracer = trace.get_tracer(__name__)
@@ -34,12 +34,12 @@ class FlashGPT2(FlashCausalLM):
         if torch.cuda.is_available():
             device = torch.device(f"cuda:{rank}")
             dtype = torch.float16 if dtype is None else dtype
-        elif SYSTEM == "xpu":
-            device = torch.device(f"xpu:{rank}")
+        elif SYSTEM == "ipex":
+            if hasattr(torch, "xpu") and torch.xpu.is_available():
+                device = torch.device(f"xpu:{rank}")
+            else:
+                device = torch.device("cpu")
             dtype = torch.float16 if dtype is None else dtype
-        elif IPEX_AVAIL:
-            device = torch.device("cpu")
-            dtype = torch.bfloat16 if dtype is None else dtype
         else:
             raise NotImplementedError("FlashGPT2 is only available on GPU")
...
@@ -17,7 +17,7 @@ from text_generation_server.utils import (
 tracer = trace.get_tracer(__name__)
-from text_generation_server.utils.import_utils import SYSTEM, IPEX_AVAIL
+from text_generation_server.utils.import_utils import SYSTEM
 class FlashLlama(FlashCausalLM):
@@ -34,12 +34,12 @@ class FlashLlama(FlashCausalLM):
         if torch.cuda.is_available():
             device = torch.device(f"cuda:{rank}")
             dtype = torch.float16 if dtype is None else dtype
-        elif SYSTEM == "xpu":
-            device = torch.device(f"xpu:{rank}")
+        elif SYSTEM == "ipex":
+            if hasattr(torch, "xpu") and torch.xpu.is_available():
+                device = torch.device(f"xpu:{rank}")
+            else:
+                device = torch.device("cpu")
             dtype = torch.float16 if dtype is None else dtype
-        elif IPEX_AVAIL:
-            device = torch.device("cpu")
-            dtype = torch.bfloat16 if dtype is None else dtype
         else:
             raise NotImplementedError("FlashLlama is only available on GPU")
...
@@ -16,7 +16,7 @@ from text_generation_server.utils import (
     weight_files,
     Weights,
 )
-from text_generation_server.utils.import_utils import SYSTEM, IPEX_AVAIL
+from text_generation_server.utils.import_utils import SYSTEM
 tracer = trace.get_tracer(__name__)
@@ -38,12 +38,12 @@ class BaseFlashMistral(FlashCausalLM):
         if torch.cuda.is_available():
             device = torch.device(f"cuda:{rank}")
             dtype = torch.float16 if dtype is None else dtype
-        elif SYSTEM == "xpu":
-            device = torch.device(f"xpu:{rank}")
+        elif SYSTEM == "ipex":
+            if hasattr(torch, "xpu") and torch.xpu.is_available():
+                device = torch.device(f"xpu:{rank}")
+            else:
+                device = torch.device("cpu")
             dtype = torch.float16 if dtype is None else dtype
-        elif IPEX_AVAIL:
-            device = torch.device("cpu")
-            dtype = torch.bfloat16 if dtype is None else dtype
         else:
             raise NotImplementedError("FlashMistral is only available on GPU")
...
@@ -14,7 +14,7 @@ from text_generation_server.utils import (
     weight_files,
     Weights,
 )
-from text_generation_server.utils.import_utils import SYSTEM, IPEX_AVAIL
+from text_generation_server.utils.import_utils import SYSTEM
 tracer = trace.get_tracer(__name__)
@@ -33,12 +33,12 @@ class FlashNeoXSharded(FlashCausalLM):
         if torch.cuda.is_available():
             device = torch.device(f"cuda:{rank}")
             dtype = torch.float16 if dtype is None else dtype
-        elif SYSTEM == "xpu":
-            device = torch.device(f"xpu:{rank}")
+        elif SYSTEM == "ipex":
+            if hasattr(torch, "xpu") and torch.xpu.is_available():
+                device = torch.device(f"xpu:{rank}")
+            else:
+                device = torch.device("cpu")
             dtype = torch.float16 if dtype is None else dtype
-        elif IPEX_AVAIL:
-            device = torch.device("cpu")
-            dtype = torch.bfloat16 if dtype is None else dtype
         else:
             raise NotImplementedError("FlashNeoX is only available on GPU")
...
@@ -15,7 +15,7 @@ from text_generation_server.utils import (
     weight_files,
     Weights,
 )
-from text_generation_server.utils.import_utils import SYSTEM, IPEX_AVAIL
+from text_generation_server.utils.import_utils import SYSTEM
 tracer = trace.get_tracer(__name__)
@@ -34,12 +34,12 @@ class FlashRWSharded(FlashCausalLM):
         if torch.cuda.is_available():
             device = torch.device(f"cuda:{rank}")
             dtype = torch.float16 if dtype is None else dtype
-        elif SYSTEM == "xpu":
-            device = torch.device(f"xpu:{rank}")
+        elif SYSTEM == "ipex":
+            if hasattr(torch, "xpu") and torch.xpu.is_available():
+                device = torch.device(f"xpu:{rank}")
+            else:
+                device = torch.device("cpu")
             dtype = torch.float16 if dtype is None else dtype
-        elif IPEX_AVAIL:
-            device = torch.device("cpu")
-            dtype = torch.bfloat16 if dtype is None else dtype
         else:
             raise NotImplementedError("FlashRW is only available on GPU")
...
@@ -18,7 +18,7 @@ from text_generation_server.utils import (
     Weights,
 )
-from text_generation_server.utils.import_utils import SYSTEM, IPEX_AVAIL
+from text_generation_server.utils.import_utils import SYSTEM
 tracer = trace.get_tracer(__name__)
@@ -37,12 +37,12 @@ class FlashSantacoderSharded(FlashCausalLM):
         if torch.cuda.is_available():
             device = torch.device(f"cuda:{rank}")
             dtype = torch.float16 if dtype is None else dtype
-        elif SYSTEM == "xpu":
-            device = torch.device(f"xpu:{rank}")
+        elif SYSTEM == "ipex":
+            if hasattr(torch, "xpu") and torch.xpu.is_available():
+                device = torch.device(f"xpu:{rank}")
+            else:
+                device = torch.device("cpu")
             dtype = torch.float16 if dtype is None else dtype
-        elif IPEX_AVAIL:
-            device = torch.device("cpu")
-            dtype = torch.bfloat16 if dtype is None else dtype
         else:
             raise NotImplementedError("FlashSantacoderSharded is only available on GPU")
...