[UX] Defer some imports on CLI paths to save ~2s (#40056)

Signed-off-by: mgoin <mgoin64@gmail.com>

[UX] Defer some imports on CLI paths to save ~2s (#40056)
Signed-off-by: mgoin <mgoin64@gmail.com>
1948d0c4 · Michael Goin · GitHub · 4c47710b
Unverified commit 1948d0c4, authored Apr 16, 2026 by Michael Goin; committed by GitHub on Apr 16, 2026.
5 changed files
--- a/vllm/benchmarks/sweep/plot.py
+++ b/vllm/benchmarks/sweep/plot.py
@@ -8,7 +8,7 @@ from dataclasses import dataclass
 from functools import partial
 from pathlib import Path
 from types import TracebackType
-from typing import ClassVar
+from typing import TYPE_CHECKING, ClassVar
 from typing_extensions import Self, override
@@ -17,20 +17,8 @@ from vllm.utils.import_utils import PlaceholderModule
 from .utils import sanitize_filename
-try:
+if TYPE_CHECKING:
-    import matplotlib.pyplot as plt
-except ImportError:
-    plt = PlaceholderModule("matplotlib").placeholder_attr("pyplot")
-try:
    import pandas as pd
-except ImportError:
-    pd = PlaceholderModule("pandas")
-try:
-    import seaborn as sns
-except ImportError:
-    seaborn = PlaceholderModule("seaborn")
 @dataclass
@@ -265,6 +253,20 @@ def _plot_fig(
    fig_height: float,
    fig_dpi: int,
 ):
+    # Lazy-import matplotlib/pandas/seaborn
+    try:
+        import matplotlib.pyplot as plt
+    except ImportError:
+        plt = PlaceholderModule("matplotlib").placeholder_attr("pyplot")
+    try:
+        import pandas as pd
+    except ImportError:
+        pd = PlaceholderModule("pandas")
+    try:
+        import seaborn as sns
+    except ImportError:
+        sns = PlaceholderModule("seaborn")
    fig_group, fig_data = fig_group_data
    row_groups = full_groupby(

--- a/vllm/benchmarks/sweep/plot_pareto.py
+++ b/vllm/benchmarks/sweep/plot_pareto.py
@@ -6,7 +6,7 @@ from concurrent.futures import ProcessPoolExecutor
 from dataclasses import dataclass
 from functools import partial
 from pathlib import Path
-from typing import ClassVar
+from typing import TYPE_CHECKING, ClassVar
 from vllm.utils.collection_utils import full_groupby
 from vllm.utils.import_utils import PlaceholderModule
@@ -14,20 +14,8 @@ from vllm.utils.import_utils import PlaceholderModule
 from .plot import DummyExecutor, _json_load_bytes
 from .utils import sanitize_filename
-try:
+if TYPE_CHECKING:
-    import matplotlib.pyplot as plt
-except ImportError:
-    plt = PlaceholderModule("matplotlib").placeholder_attr("pyplot")
-try:
    import pandas as pd
-except ImportError:
-    pd = PlaceholderModule("pandas")
-try:
-    import seaborn as sns
-except ImportError:
-    seaborn = PlaceholderModule("seaborn")
 def _first_present(run_data: dict[str, object], keys: list[str]):
@@ -195,6 +183,20 @@ def _plot_fig(
        print("[END FIGURE]")
        return
+    # Lazy-import matplotlib/pandas/seaborn
+    try:
+        import matplotlib.pyplot as plt
+    except ImportError:
+        plt = PlaceholderModule("matplotlib").placeholder_attr("pyplot")
+    try:
+        import pandas as pd
+    except ImportError:
+        pd = PlaceholderModule("pandas")
+    try:
+        import seaborn as sns
+    except ImportError:
+        sns = PlaceholderModule("seaborn")
    df = pd.DataFrame.from_records(fig_data)
    df = df.dropna(subset=["tokens_per_user", "tokens_per_gpu"])

--- a/vllm/entrypoints/cli/__init__.py
+++ b/vllm/entrypoints/cli/__init__.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from vllm.entrypoints.cli.benchmark.latency import BenchmarkLatencySubcommand
-from vllm.entrypoints.cli.benchmark.mm_processor import (
-    BenchmarkMMProcessorSubcommand,
-)
-from vllm.entrypoints.cli.benchmark.serve import BenchmarkServingSubcommand
-from vllm.entrypoints.cli.benchmark.startup import BenchmarkStartupSubcommand
-from vllm.entrypoints.cli.benchmark.sweep import BenchmarkSweepSubcommand
-from vllm.entrypoints.cli.benchmark.throughput import BenchmarkThroughputSubcommand
-__all__: list[str] = [
-    "BenchmarkLatencySubcommand",
-    "BenchmarkMMProcessorSubcommand",
-    "BenchmarkServingSubcommand",
-    "BenchmarkStartupSubcommand",
-    "BenchmarkSweepSubcommand",
-    "BenchmarkThroughputSubcommand",
-]
--- a/vllm/entrypoints/cli/benchmark/main.py
+++ b/vllm/entrypoints/cli/benchmark/main.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import argparse
+import sys
 import typing
 from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase
@@ -14,6 +15,17 @@ else:
    FlexibleArgumentParser = argparse.ArgumentParser
+def _import_bench_subcommand_modules() -> None:
+    # Imported lazily so `BenchmarkSubcommandBase` subclasses register only
+    # when `vllm bench` is actually invoked.
+    import vllm.entrypoints.cli.benchmark.latency  # noqa: F401
+    import vllm.entrypoints.cli.benchmark.mm_processor  # noqa: F401
+    import vllm.entrypoints.cli.benchmark.serve  # noqa: F401
+    import vllm.entrypoints.cli.benchmark.startup  # noqa: F401
+    import vllm.entrypoints.cli.benchmark.sweep  # noqa: F401
+    import vllm.entrypoints.cli.benchmark.throughput  # noqa: F401
 class BenchmarkSubcommand(CLISubcommand):
    """The `bench` subcommand for the vLLM CLI."""
@@ -38,18 +50,28 @@ class BenchmarkSubcommand(CLISubcommand):
        )
        bench_subparsers = bench_parser.add_subparsers(required=True, dest="bench_type")
-        for cmd_cls in BenchmarkSubcommandBase.__subclasses__():
+        # Only build the nested bench subparsers when the user is actually
-            cmd_subparser = bench_subparsers.add_parser(
+        # invoking `bench`; otherwise we'd drag in imports
-                cmd_cls.name,
+        # unnecessarily on every `vllm --help` and `vllm serve`.
-                help=cmd_cls.help,
+        # Scan for the first positional arg so global flags (e.g. `-v`)
-                description=cmd_cls.help,
+        # before the subcommand don't break detection.
-                usage=f"vllm {self.name} {cmd_cls.name} [options]",
+        first_positional = next(
-            )
+            (arg for arg in sys.argv[1:] if not arg.startswith("-")), None
-            cmd_subparser.set_defaults(dispatch_function=cmd_cls.cmd)
+        )
-            cmd_cls.add_cli_args(cmd_subparser)
+        if first_positional == self.name:
-            cmd_subparser.epilog = VLLM_SUBCMD_PARSER_EPILOG.format(
+            _import_bench_subcommand_modules()
-                subcmd=f"{self.name} {cmd_cls.name}"
+            for cmd_cls in BenchmarkSubcommandBase.__subclasses__():
-            )
+                cmd_subparser = bench_subparsers.add_parser(
+                    cmd_cls.name,
+                    help=cmd_cls.help,
+                    description=cmd_cls.help,
+                    usage=f"vllm {self.name} {cmd_cls.name} [options]",
+                )
+                cmd_subparser.set_defaults(dispatch_function=cmd_cls.cmd)
+                cmd_cls.add_cli_args(cmd_subparser)
+                cmd_subparser.epilog = VLLM_SUBCMD_PARSER_EPILOG.format(
+                    subcmd=f"{self.name} {cmd_cls.name}"
+                )
        return bench_parser

--- a/vllm/env_override.py
+++ b/vllm/env_override.py
@@ -100,10 +100,9 @@ logger = init_logger(__name__)
 # it avoids unintentional cuda initialization from torch.cuda.is_available()
 os.environ["PYTORCH_NVML_BASED_CUDA_CHECK"] = "1"
-# see https://github.com/vllm-project/vllm/issues/10480
+# see https://github.com/vllm-project/vllm/issues/10480 and
+# https://github.com/vllm-project/vllm/issues/10619.
 os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
-# see https://github.com/vllm-project/vllm/issues/10619
-torch._inductor.config.compile_threads = 1
 # Enable Triton autotuning result caching to disk by default.
 # Without this, Triton re-runs autotuning on every process restart,