[Bugfix] Actually disable processing cache when API server is scaled out (#21839)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>

[Bugfix] Actually disable processing cache when API server is scaled out (#21839)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
44bc46da · Cyrus Leung · GitHub · b7b23da4 · 44bc46da
Unverified commit 44bc46da, authored Jul 30, 2025 by Cyrus Leung; committed by GitHub Jul 29, 2025
Show whitespace changes
Inline Side-by-side

Showing 1 changed file with 8 additions and 5 deletions

vllm/entrypoints/cli/serve.py vllm/entrypoints/cli/serve.py +8 -5

No files found.
--- a/vllm/entrypoints/cli/serve.py
+++ b/vllm/entrypoints/cli/serve.py
@@ -140,11 +140,16 @@ def run_multi_api_server(args: argparse.Namespace):
    num_api_servers = args.api_server_count
    assert num_api_servers > 0

+    orig_disable_mm_preprocessor_cache = args.disable_mm_preprocessor_cache
+
    # set_process_title("ProcManager")

    if num_api_servers > 1:
        setup_multiprocess_prometheus()

+        # Not compatible with API server scale-out
+        args.disable_mm_preprocessor_cache = True
+
    listen_address, sock = setup_server(args)

    engine_args = vllm.AsyncEngineArgs.from_cli_args(args)
@@ -161,11 +166,9 @@ def run_multi_api_server(args: argparse.Namespace):
                             "with api_server_count > 1")

        if model_config.is_multimodal_model and not (
-                model_config.disable_mm_preprocessor_cache):
-            logger.warning(
-                "Multi-model preprocessor cache will be disabled for"
-                " api_server_count > 1")
-            model_config.disable_mm_preprocessor_cache = True
+                orig_disable_mm_preprocessor_cache):
+            logger.warning("Multi-model preprocessor cache will be disabled "
+                           "for api_server_count > 1")

    executor_class = Executor.get_class(vllm_config)
    log_stats = not engine_args.disable_log_stats