Disable outlines cache by default (#14837)

776dcec8 · Russell Bryant · GitHub · ccf02fcb · 776dcec8 · 776dcec8
Unverified Commit 776dcec8 authored Mar 14, 2025 by Russell Bryant Committed by GitHub Mar 15, 2025
Show whitespace changes
Inline Side-by-side

Showing with 16 additions and 1 deletion

vllm/envs.py vllm/envs.py +7 -0

vllm/model_executor/guided_decoding/outlines_logits_processors.py ...el_executor/guided_decoding/outlines_logits_processors.py +9 -1

No files found.
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -95,6 +95,7 @@ if TYPE_CHECKING:
    VLLM_DP_MASTER_IP: str = ""
    VLLM_DP_MASTER_PORT: int = 0
    VLLM_MARLIN_USE_ATOMIC_ADD: bool = False
+    VLLM_V0_USE_OUTLINES_CACHE: bool = False
 def get_default_cache_root():
@@ -623,6 +624,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
    # Whether to use atomicAdd reduce in gptq/awq marlin kernel.
    "VLLM_MARLIN_USE_ATOMIC_ADD":
    lambda: os.environ.get("VLLM_MARLIN_USE_ATOMIC_ADD", "0") == "1",
+    # Whether to turn on the outlines cache for V0
+    # This cache is unbounded and on disk, so it's not safe to use in
+    # an environment with potentially malicious users.
+    "VLLM_V0_USE_OUTLINES_CACHE":
+    lambda: os.environ.get("VLLM_V0_USE_OUTLINES_CACHE", "0") == "1",
 }
 # end-env-vars-definition

--- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py
+++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py
@@ -24,7 +24,7 @@ from typing import Callable, DefaultDict, Dict, List, Optional, Union
 import numpy as np
 import torch
 from outlines import grammars
-from outlines.caching import cache
+from outlines.caching import cache, disable_cache
 from outlines.fsm.guide import (CFGGuide, CFGState, Generate, Guide,
                                RegexGuide, Write)
 from outlines.fsm.parsing import PartialLark
@@ -32,12 +32,20 @@ from outlines_core.fsm.json_schema import build_regex_from_schema
 from pydantic import BaseModel
 from transformers import PreTrainedTokenizerBase
+import vllm.envs as envs
 from vllm.logger import init_logger
 from vllm.model_executor.guided_decoding.reasoner import Reasoner
 from vllm.platforms import current_platform
 logger = init_logger(__name__)
+if envs.VLLM_V0_USE_OUTLINES_CACHE:
+    logger.warning("Enabling outlines cache. This is an unbounded on-disk "
+                   "cache. It may consume a lot of disk space and should "
+                   "not be used with untrusted clients.")
+else:
+    disable_cache()
 class BaseLogitsProcessor: