[Core] Update outlines and increase its threadpool size (#11140)

Signed-off-by: Russell Bryant <rbryant@redhat.com>

[Core] Update outlines and increase its threadpool size (#11140)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
48259264 · Russell Bryant · GitHub · 24a3d12b · 48259264 · 48259264
Unverified Commit 48259264 authored Dec 14, 2024 by Russell Bryant Committed by GitHub Dec 14, 2024
Hide whitespace changes
Inline Side-by-side

Showing with 11 additions and 2 deletions

requirements-common.txt requirements-common.txt +1 -1

vllm/model_executor/guided_decoding/outlines_decoding.py vllm/model_executor/guided_decoding/outlines_decoding.py +10 -1

No files found.
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -19,7 +19,7 @@ prometheus_client >= 0.18.0
 prometheus-fastapi-instrumentator >= 7.0.0
 tiktoken >= 0.6.0  # Required for DBRX tokenizer
 lm-format-enforcer >= 0.10.9, < 0.11
-outlines == 0.1.9
+outlines == 0.1.11
 xgrammar >= 0.1.6; platform_machine == "x86_64"
 typing_extensions >= 4.10
 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317

--- a/vllm/model_executor/guided_decoding/outlines_decoding.py
+++ b/vllm/model_executor/guided_decoding/outlines_decoding.py
 import asyncio
 import concurrent.futures
+import os
 from enum import Enum
 from json import dumps as json_dumps
 from re import escape as regex_escape
@@ -48,6 +49,11 @@ pair   : UNESCAPED_STRING ":" value

 global_thread_pool = None  # used for generating logits processor fsm

+# It's not yet clear that using more workers provides a benefit, and it could
+# potentially starve other processes on the machine. We'll cap this for now and
+# adjust later if testing proves that it helps overcome a bottleneck.
+_MAX_THREADPOOL_WORKERS = 16
+

 async def get_outlines_guided_decoding_logits_processor(
    guided_params: GuidedDecodingParams, tokenizer: PreTrainedTokenizerBase
@@ -65,8 +71,11 @@ async def get_outlines_guided_decoding_logits_processor(
        return None

    if global_thread_pool is None:
+        max_workers = os.cpu_count() or 2
+        if max_workers > _MAX_THREADPOOL_WORKERS:
+            max_workers = _MAX_THREADPOOL_WORKERS
        global_thread_pool = concurrent.futures.ThreadPoolExecutor(
-            max_workers=2)
+            max_workers=max_workers)
    loop = asyncio.get_running_loop()

    return await loop.run_in_executor(global_thread_pool,