[Bugfix] Only require XGrammar on x86 (#10865)

Signed-off-by: mgoin <michael@neuralmagic.com>

[Bugfix] Only require XGrammar on x86 (#10865)
Signed-off-by: mgoin <michael@neuralmagic.com>
7090c27b · Michael Goin · GitHub · 2f2cdc74 · 7090c27b · 7090c27b
Unverified commit 7090c27b, authored Dec 03, 2024 by Michael Goin; committed by GitHub on Dec 03, 2024.
4 changed files
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -19,7 +19,7 @@ prometheus-fastapi-instrumentator >= 7.0.0
 tiktoken >= 0.6.0  # Required for DBRX tokenizer
 lm-format-enforcer >= 0.10.9, < 0.11
 outlines >= 0.0.43, < 0.1
-xgrammar
+xgrammar >= 0.1.5; platform_machine == "x86_64"
 typing_extensions >= 4.10
 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
 partial-json-parser # used for parsing partial JSON outputs

--- a/vllm/model_executor/guided_decoding/__init__.py
+++ b/vllm/model_executor/guided_decoding/__init__.py
@@ -3,6 +3,7 @@ from __future__ import annotations
 from typing import TYPE_CHECKING
 from vllm.logger import init_logger
+from vllm.platforms import CpuArchEnum, current_platform
 if TYPE_CHECKING:
    from transformers import PreTrainedTokenizer
@@ -25,6 +26,12 @@ def maybe_backend_fallback(
        guided_params.backend = "xgrammar"
    if guided_params.backend == "xgrammar":
+        # xgrammar only has x86 wheels for linux, fallback to outlines
+        if current_platform.get_cpu_architecture() is not CpuArchEnum.X86:
+            logger.warning("xgrammar is only supported on x86 CPUs. "
+                           "Falling back to use outlines instead.")
+            guided_params.backend = "outlines"
        # xgrammar doesn't support regex or choice, fallback to outlines
        if guided_params.regex is not None or guided_params.choice is not None:
            logger.warning(

--- a/vllm/platforms/__init__.py
+++ b/vllm/platforms/__init__.py
 from .interface import _Backend  # noqa: F401
-from .interface import Platform, PlatformEnum, UnspecifiedPlatform
+from .interface import CpuArchEnum, Platform, PlatformEnum, UnspecifiedPlatform
 current_platform: Platform
@@ -120,4 +120,4 @@ elif is_openvino:
 else:
    current_platform = UnspecifiedPlatform()
-__all__ = ['Platform', 'PlatformEnum', 'current_platform']
+__all__ = ['Platform', 'PlatformEnum', 'current_platform', 'CpuArchEnum']
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
 import enum
+import platform
 import random
 from typing import TYPE_CHECKING, NamedTuple, Optional, Tuple, Union
@@ -37,6 +38,14 @@ class PlatformEnum(enum.Enum):
    UNSPECIFIED = enum.auto()
+class CpuArchEnum(enum.Enum):
+    """CPU architecture families.
+
+    Members are returned by ``Platform.get_cpu_architecture()``, which
+    classifies the lower-cased ``platform.machine()`` string.
+    """
+    # Machine string is one of "x86_64", "amd64", "i386" or "i686".
+    X86 = enum.auto()
+    # Machine string starts with "arm" or "aarch".
+    ARM = enum.auto()
+    # Machine string starts with "ppc".
+    POWERPC = enum.auto()
+    # Any other non-empty machine string.
+    OTHER = enum.auto()
+    # The machine string was empty.
+    UNKNOWN = enum.auto()
 class DeviceCapability(NamedTuple):
    major: int
    minor: int
@@ -184,6 +193,23 @@ class Platform:
                f"{quant} quantization is currently not supported in "
                f"{cls.device_name}.")
+    @classmethod
+    def get_cpu_architecture(cls) -> CpuArchEnum:
+        """
+        Classify the host CPU into a CpuArchEnum family.
+
+        The lower-cased value of platform.machine() is compared against
+        the known x86 names and the "arm" / "aarch" / "ppc" prefixes.
+        Any other non-empty value maps to CpuArchEnum.OTHER, and an
+        empty value maps to CpuArchEnum.UNKNOWN.
+        """
+        arch = platform.machine().lower()
+        if not arch:
+            # An empty machine string cannot be classified.
+            return CpuArchEnum.UNKNOWN
+        if arch in ("x86_64", "amd64", "i386", "i686"):
+            return CpuArchEnum.X86
+        if arch.startswith(("arm", "aarch")):
+            return CpuArchEnum.ARM
+        if arch.startswith("ppc"):
+            return CpuArchEnum.POWERPC
+        return CpuArchEnum.OTHER
 class UnspecifiedPlatform(Platform):
    _enum = PlatformEnum.UNSPECIFIED