Correct capitalisation: `VLLM` -> `vLLM` (#14562)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>

Correct capitalisation: `VLLM` -> `vLLM` (#14562)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
3b352a2f · Harry Mellor · GitHub · dea985ae · 3b352a2f · 3b352a2f
Unverified Commit 3b352a2f authored Mar 10, 2025 by Harry Mellor Committed by GitHub Mar 10, 2025
18 changed files
--- a/benchmarks/kernels/benchmark_rmsnorm.py
+++ b/benchmarks/kernels/benchmark_rmsnorm.py
@@ -139,7 +139,7 @@ def calculate_diff(batch_size, seq_len, hidden_size, use_residual=True):
    print(f"Naive output={output_naive}")
    print(f"FlashInfer output={output_flashinfer}")
-    print(f"VLLM output={output_vllm}")
+    print(f"vLLM output={output_vllm}")
    if torch.allclose(output_naive, output_flashinfer, atol=1e-2,
                      rtol=1e-2) and torch.allclose(

--- a/docs/source/contributing/vulnerability_management.md
+++ b/docs/source/contributing/vulnerability_management.md
@@ -37,7 +37,7 @@ you may contact the following individuals:
 ## Slack Discussion
-You may use the `#security` channel in the [VLLM Slack](https://slack.vllm.ai)
+You may use the `#security` channel in the [vLLM Slack](https://slack.vllm.ai)
 to discuss security-related topics. However, please do not disclose any
 vulnerabilities in this channel. If you need to report a vulnerability, please
 use the GitHub security advisory system or contact a VMT member privately.

--- a/docs/source/design/v1/metrics.md
+++ b/docs/source/design/v1/metrics.md
@@ -509,7 +509,7 @@ cache to complete other requests), we swap kv cache blocks out to CPU
 memory. This is also known as "KV cache offloading" and is configured
 with `--swap-space` and `--preemption-mode`.
-In v0, [VLLM has long supported beam
+In v0, [vLLM has long supported beam
 search](gh-issue:6226). The
 SequenceGroup encapsulated the idea of N Sequences which
 all shared the same prompt kv blocks. This enabled KV cache block

--- a/examples/offline_inference/disaggregated_prefill_lmcache.py
+++ b/examples/offline_inference/disaggregated_prefill_lmcache.py
@@ -5,7 +5,7 @@ with LMCache.
 We will launch 2 vllm instances (GPU 0 for prefill and GPU 1 for decode),
 and launch an additional LMCache server.
 KV cache is transferred in the following manner: 
-VLLM prefill node -> LMCache server -> VLLM decode node.
+vLLM prefill node -> LMCache server -> vLLM decode node.
 Note that `pip install lmcache` is needed to run this example.
 Learn more about LMCache in https://github.com/LMCache/LMCache.

--- a/tests/tpu/test_quantization_accuracy.py
+++ b/tests/tpu/test_quantization_accuracy.py
@@ -25,7 +25,7 @@ ACCURACY_CONFIGS = [
    GSM8KAccuracyTestConfig(
        model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
        excepted_value=0.76),  # no bias
-    # NOTE(rob): We cannot re-initialize VLLM in the same process for TPU,
+    # NOTE(rob): We cannot re-initialize vLLM in the same process for TPU,
    # so only one of these tests can run in a single call to pytest. As
    # a follow up, move this into the LM-EVAL section of the CI.
    # GSM8KAccuracyTestConfig(

--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -51,7 +51,7 @@ def get_env_variable_attn_backend() -> Optional[_Backend]:
 # (default behavior if this variable is None)
 #
 # THIS SELECTION TAKES PRECEDENCE OVER THE
-# VLLM ATTENTION BACKEND ENVIRONMENT VARIABLE
+# VLLM_ATTENTION_BACKEND ENVIRONMENT VARIABLE
 forced_attn_backend: Optional[_Backend] = None

--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -278,7 +278,7 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter):
 class VllmBackend:
-    """The compilation backend for `torch.compile` with VLLM.
+    """The compilation backend for `torch.compile` with vLLM.
    It is used for compilation level of `CompilationLevel.PIECEWISE`,
    where we customize the compilation.

--- a/vllm/compilation/compiler_interface.py
+++ b/vllm/compilation/compiler_interface.py
@@ -31,7 +31,7 @@ class CompilerInterface:
    def compute_hash(self, vllm_config: VllmConfig) -> str:
        """
-        Gather all the relevant information from the VLLM config,
+        Gather all the relevant information from the vLLM config,
        to compute a hash so that we can cache the compiled model.
        See :meth:`VllmConfig.compute_hash` to check what information

--- a/vllm/config.py
+++ b/vllm/config.py
@@ -3572,11 +3572,11 @@ _current_vllm_config: Optional[VllmConfig] = None
 @contextmanager
 def set_current_vllm_config(vllm_config: VllmConfig, check_compile=False):
    """
-    Temporarily set the current VLLM config.
+    Temporarily set the current vLLM config.
    Used during model initialization.
-    We save the current VLLM config in a global variable,
+    We save the current vLLM config in a global variable,
    so that all modules can access it, e.g. custom ops
-    can access the VLLM config to determine how to dispatch.
+    can access the vLLM config to determine how to dispatch.
    """
    global _current_vllm_config
    old_vllm_config = _current_vllm_config
@@ -3611,7 +3611,7 @@ def get_current_vllm_config() -> VllmConfig:
        # in ci, usually when we test custom ops/modules directly,
        # we don't set the vllm config. In that case, we set a default
        # config.
-        logger.warning("Current VLLM config is not set.")
+        logger.warning("Current vLLM config is not set.")
        from vllm.config import VllmConfig
        return VllmConfig()
    return _current_vllm_config
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -237,7 +237,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
    tool_choice: Optional[Union[Literal["none"], Literal["auto"],
                                ChatCompletionNamedToolChoiceParam]] = "none"
-    # NOTE this will be ignored by VLLM -- the model determines the behavior
+    # NOTE this will be ignored by vLLM -- the model determines the behavior
    parallel_tool_calls: Optional[bool] = False
    user: Optional[str] = None

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -164,7 +164,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VERBOSE":
    lambda: bool(int(os.getenv('VERBOSE', '0'))),
-    # Root directory for VLLM configuration files
+    # Root directory for vLLM configuration files
    # Defaults to `~/.config/vllm` unless `XDG_CONFIG_HOME` is set
    # Note that this not only affects how vllm finds its configuration files
    # during runtime, but also affects how vllm installs its configuration
@@ -178,7 +178,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
    # ================== Runtime Env Vars ==================
-    # Root directory for VLLM cache files
+    # Root directory for vLLM cache files
    # Defaults to `~/.cache/vllm` unless `XDG_CACHE_HOME` is set
    "VLLM_CACHE_ROOT":
    lambda: os.path.expanduser(
@@ -260,7 +260,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_ENGINE_ITERATION_TIMEOUT_S":
    lambda: int(os.environ.get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60")),
-    # API key for VLLM API server
+    # API key for vLLM API server
    "VLLM_API_KEY":
    lambda: os.environ.get("VLLM_API_KEY", None),

--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -1414,7 +1414,7 @@ def cat_with_pad(tensors, dim, padding_value=0):
 @INPUT_REGISTRY.register_input_processor(input_processor_for_phi4mm)
 class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
    """
-    Implements the Phi-4-multimodal-instruct model in VLLM.
+    Implements the Phi-4-multimodal-instruct model in vLLM.
    """
    packed_modules_mapping = {
        "qkv_proj": [

--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -119,7 +119,7 @@ class CudaPlatformBase(Platform):
                if envs.VLLM_USE_V1:
                    raise NotImplementedError(
                        "Multi-step scheduling is not supported (and not "
-                        "needed) on VLLM V1. Please launch without "
+                        "needed) on vLLM V1. Please launch without "
                        "--num-scheduler-steps.")
                else:
                    parallel_config.worker_cls = \

--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -173,7 +173,7 @@ class RocmPlatform(Platform):
                if envs.VLLM_USE_V1:
                    raise NotImplementedError(
                        "Multi-step scheduling is not supported (and not "
-                        "needed) on VLLM V1. Please launch without "
+                        "needed) on vLLM V1. Please launch without "
                        "--num-scheduler-steps.")
                else:
                    parallel_config.worker_cls = \
@@ -181,7 +181,7 @@ class RocmPlatform(Platform):
            elif vllm_config.speculative_config:
                if envs.VLLM_USE_V1:
                    raise NotImplementedError(
-                        "Speculative decoding is not yet supported on VLLM V1."
+                        "Speculative decoding is not yet supported on vLLM V1."
                    )
                else:
                    parallel_config.worker_cls = \

--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/transformers_utils/tokenizers/mistral.py
@@ -249,7 +249,7 @@ class MistralTokenizer(TokenizerBase):
                                         revision=revision)
        return tokenizer_file
-    # the following attributes are set to fit VLLM's design and are used
+    # the following attributes are set to fit vLLM's design and are used
    # by the guided structured output backends.
    @property
    def all_special_tokens_extended(self) -> List[str]:

--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -255,7 +255,7 @@ class MPClient(EngineCoreClient):
        # TODO(rob): rather than killing the main process, we should
        # figure out how to raise an AsyncEngineDeadError and
        # handle at the API server level so we can return a better
-        # error code to the clients calling VLLM.
+        # error code to the clients calling vLLM.
        def sigusr1_handler(signum, frame):
            logger.fatal("Got fatal signal from worker processes, shutting "
                         "down. See stack trace above for root cause issue.")

--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py
@@ -248,7 +248,7 @@ class OutputProcessor:
        ****************** NOTE FOR DEVELOPERS ******************
-        VLLM V1 minimizes the number of python loops over the full
+        vLLM V1 minimizes the number of python loops over the full
        batch to ensure system overheads are minimized. This is the 
        only function that should loop over EngineCoreOutputs.

--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -93,10 +93,10 @@ class Processor:
    ) -> None:
        # Best of not yet supported.
        if params.best_of is not None and params.best_of > 1:
-            raise ValueError("VLLM V1 does not yet support best_of.")
+            raise ValueError("vLLM V1 does not yet support best_of.")
        # Logits processors not supported.
        if params.logits_processors:
-            raise ValueError("VLLM V1 does not support per request "
+            raise ValueError("vLLM V1 does not support per request "
                             "user provided logits processors.")
    def _validate_params(