[Bugfix] Actually disable processing cache when API server is scaled out (#21839)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>

[Bugfix] Actually disable processing cache when API server is scaled out (#21839)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
44bc46da · Cyrus Leung · GitHub · b7b23da4 · 44bc46da
Unverified Commit 44bc46da authored Jul 30, 2025 by Cyrus Leung Committed by GitHub Jul 29, 2025
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 8 additions and 5 deletions

vllm/entrypoints/cli/serve.py +8 -5

No files found.
--- a/vllm/entrypoints/cli/serve.py
+++ b/vllm/entrypoints/cli/serve.py
@@ -140,11 +140,16 @@ def run_multi_api_server(args: argparse.Namespace):
    num_api_servers = args.api_server_count
    assert num_api_servers > 0
+    orig_disable_mm_preprocessor_cache = args.disable_mm_preprocessor_cache
    # set_process_title("ProcManager")
    if num_api_servers > 1:
        setup_multiprocess_prometheus()
+        # Not compatible with API server scale-out
+        args.disable_mm_preprocessor_cache = True
    listen_address, sock = setup_server(args)
    engine_args = vllm.AsyncEngineArgs.from_cli_args(args)
@@ -161,11 +166,9 @@ def run_multi_api_server(args: argparse.Namespace):
                             "with api_server_count > 1")
        if model_config.is_multimodal_model and not (
-                model_config.disable_mm_preprocessor_cache):
+                orig_disable_mm_preprocessor_cache):
-            logger.warning(
+            logger.warning("Multi-model preprocessor cache will be disabled "
-                "Multi-model preprocessor cache will be disabled for"
+                           "for api_server_count > 1")
-                " api_server_count > 1")
-            model_config.disable_mm_preprocessor_cache = True
    executor_class = Executor.get_class(vllm_config)
    log_stats = not engine_args.disable_log_stats