Commit 3b352a2f (unverified), authored by Harry Mellor, committed by GitHub
Browse files

Correct capitalisation: `VLLM` -> `vLLM` (#14562)


Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent dea985ae
...@@ -139,7 +139,7 @@ def calculate_diff(batch_size, seq_len, hidden_size, use_residual=True): ...@@ -139,7 +139,7 @@ def calculate_diff(batch_size, seq_len, hidden_size, use_residual=True):
print(f"Naive output={output_naive}") print(f"Naive output={output_naive}")
print(f"FlashInfer output={output_flashinfer}") print(f"FlashInfer output={output_flashinfer}")
print(f"VLLM output={output_vllm}") print(f"vLLM output={output_vllm}")
if torch.allclose(output_naive, output_flashinfer, atol=1e-2, if torch.allclose(output_naive, output_flashinfer, atol=1e-2,
rtol=1e-2) and torch.allclose( rtol=1e-2) and torch.allclose(
......
...@@ -37,7 +37,7 @@ you may contact the following individuals: ...@@ -37,7 +37,7 @@ you may contact the following individuals:
## Slack Discussion ## Slack Discussion
You may use the `#security` channel in the [VLLM Slack](https://slack.vllm.ai) You may use the `#security` channel in the [vLLM Slack](https://slack.vllm.ai)
to discuss security-related topics. However, please do not disclose any to discuss security-related topics. However, please do not disclose any
vulnerabilities in this channel. If you need to report a vulnerability, please vulnerabilities in this channel. If you need to report a vulnerability, please
use the GitHub security advisory system or contact a VMT member privately. use the GitHub security advisory system or contact a VMT member privately.
......
...@@ -509,7 +509,7 @@ cache to complete other requests), we swap kv cache blocks out to CPU ...@@ -509,7 +509,7 @@ cache to complete other requests), we swap kv cache blocks out to CPU
memory. This is also known as "KV cache offloading" and is configured memory. This is also known as "KV cache offloading" and is configured
with `--swap-space` and `--preemption-mode`. with `--swap-space` and `--preemption-mode`.
In v0, [VLLM has long supported beam In v0, [vLLM has long supported beam
search](gh-issue:6226). The search](gh-issue:6226). The
SequenceGroup encapsulated the idea of N Sequences which SequenceGroup encapsulated the idea of N Sequences which
all shared the same prompt kv blocks. This enabled KV cache block all shared the same prompt kv blocks. This enabled KV cache block
......
...@@ -5,7 +5,7 @@ with LMCache. ...@@ -5,7 +5,7 @@ with LMCache.
We will launch 2 vllm instances (GPU 0 for prefill and GPU 1 for decode), We will launch 2 vllm instances (GPU 0 for prefill and GPU 1 for decode),
and launch an additional LMCache server. and launch an additional LMCache server.
KV cache is transferred in the following manner: KV cache is transferred in the following manner:
VLLM prefill node -> LMCache server -> VLLM decode node. vLLM prefill node -> LMCache server -> vLLM decode node.
Note that `pip install lmcache` is needed to run this example. Note that `pip install lmcache` is needed to run this example.
Learn more about LMCache in https://github.com/LMCache/LMCache. Learn more about LMCache in https://github.com/LMCache/LMCache.
......
...@@ -25,7 +25,7 @@ ACCURACY_CONFIGS = [ ...@@ -25,7 +25,7 @@ ACCURACY_CONFIGS = [
GSM8KAccuracyTestConfig( GSM8KAccuracyTestConfig(
model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
excepted_value=0.76), # no bias excepted_value=0.76), # no bias
# NOTE(rob): We cannot re-initialize VLLM in the same process for TPU, # NOTE(rob): We cannot re-initialize vLLM in the same process for TPU,
# so only one of these tests can run in a single call to pytest. As # so only one of these tests can run in a single call to pytest. As
# a follow up, move this into the LM-EVAL section of the CI. # a follow up, move this into the LM-EVAL section of the CI.
# GSM8KAccuracyTestConfig( # GSM8KAccuracyTestConfig(
......
...@@ -51,7 +51,7 @@ def get_env_variable_attn_backend() -> Optional[_Backend]: ...@@ -51,7 +51,7 @@ def get_env_variable_attn_backend() -> Optional[_Backend]:
# (default behavior if this variable is None) # (default behavior if this variable is None)
# #
# THIS SELECTION TAKES PRECEDENCE OVER THE # THIS SELECTION TAKES PRECEDENCE OVER THE
# VLLM ATTENTION BACKEND ENVIRONMENT VARIABLE # VLLM_ATTENTION_BACKEND ENVIRONMENT VARIABLE
forced_attn_backend: Optional[_Backend] = None forced_attn_backend: Optional[_Backend] = None
......
...@@ -278,7 +278,7 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter): ...@@ -278,7 +278,7 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter):
class VllmBackend: class VllmBackend:
"""The compilation backend for `torch.compile` with VLLM. """The compilation backend for `torch.compile` with vLLM.
It is used for compilation level of `CompilationLevel.PIECEWISE`, It is used for compilation level of `CompilationLevel.PIECEWISE`,
where we customize the compilation. where we customize the compilation.
......
...@@ -31,7 +31,7 @@ class CompilerInterface: ...@@ -31,7 +31,7 @@ class CompilerInterface:
def compute_hash(self, vllm_config: VllmConfig) -> str: def compute_hash(self, vllm_config: VllmConfig) -> str:
""" """
Gather all the relevant information from the VLLM config, Gather all the relevant information from the vLLM config,
to compute a hash so that we can cache the compiled model. to compute a hash so that we can cache the compiled model.
See :meth:`VllmConfig.compute_hash` to check what information See :meth:`VllmConfig.compute_hash` to check what information
......
...@@ -3572,11 +3572,11 @@ _current_vllm_config: Optional[VllmConfig] = None ...@@ -3572,11 +3572,11 @@ _current_vllm_config: Optional[VllmConfig] = None
@contextmanager @contextmanager
def set_current_vllm_config(vllm_config: VllmConfig, check_compile=False): def set_current_vllm_config(vllm_config: VllmConfig, check_compile=False):
""" """
Temporarily set the current VLLM config. Temporarily set the current vLLM config.
Used during model initialization. Used during model initialization.
We save the current VLLM config in a global variable, We save the current vLLM config in a global variable,
so that all modules can access it, e.g. custom ops so that all modules can access it, e.g. custom ops
can access the VLLM config to determine how to dispatch. can access the vLLM config to determine how to dispatch.
""" """
global _current_vllm_config global _current_vllm_config
old_vllm_config = _current_vllm_config old_vllm_config = _current_vllm_config
...@@ -3611,7 +3611,7 @@ def get_current_vllm_config() -> VllmConfig: ...@@ -3611,7 +3611,7 @@ def get_current_vllm_config() -> VllmConfig:
# in ci, usually when we test custom ops/modules directly, # in ci, usually when we test custom ops/modules directly,
# we don't set the vllm config. In that case, we set a default # we don't set the vllm config. In that case, we set a default
# config. # config.
logger.warning("Current VLLM config is not set.") logger.warning("Current vLLM config is not set.")
from vllm.config import VllmConfig from vllm.config import VllmConfig
return VllmConfig() return VllmConfig()
return _current_vllm_config return _current_vllm_config
...@@ -237,7 +237,7 @@ class ChatCompletionRequest(OpenAIBaseModel): ...@@ -237,7 +237,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
tool_choice: Optional[Union[Literal["none"], Literal["auto"], tool_choice: Optional[Union[Literal["none"], Literal["auto"],
ChatCompletionNamedToolChoiceParam]] = "none" ChatCompletionNamedToolChoiceParam]] = "none"
# NOTE this will be ignored by VLLM -- the model determines the behavior # NOTE this will be ignored by vLLM -- the model determines the behavior
parallel_tool_calls: Optional[bool] = False parallel_tool_calls: Optional[bool] = False
user: Optional[str] = None user: Optional[str] = None
......
...@@ -164,7 +164,7 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -164,7 +164,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VERBOSE": "VERBOSE":
lambda: bool(int(os.getenv('VERBOSE', '0'))), lambda: bool(int(os.getenv('VERBOSE', '0'))),
# Root directory for VLLM configuration files # Root directory for vLLM configuration files
# Defaults to `~/.config/vllm` unless `XDG_CONFIG_HOME` is set # Defaults to `~/.config/vllm` unless `XDG_CONFIG_HOME` is set
# Note that this not only affects how vllm finds its configuration files # Note that this not only affects how vllm finds its configuration files
# during runtime, but also affects how vllm installs its configuration # during runtime, but also affects how vllm installs its configuration
...@@ -178,7 +178,7 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -178,7 +178,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
# ================== Runtime Env Vars ================== # ================== Runtime Env Vars ==================
# Root directory for VLLM cache files # Root directory for vLLM cache files
# Defaults to `~/.cache/vllm` unless `XDG_CACHE_HOME` is set # Defaults to `~/.cache/vllm` unless `XDG_CACHE_HOME` is set
"VLLM_CACHE_ROOT": "VLLM_CACHE_ROOT":
lambda: os.path.expanduser( lambda: os.path.expanduser(
...@@ -260,7 +260,7 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -260,7 +260,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_ENGINE_ITERATION_TIMEOUT_S": "VLLM_ENGINE_ITERATION_TIMEOUT_S":
lambda: int(os.environ.get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60")), lambda: int(os.environ.get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60")),
# API key for VLLM API server # API key for vLLM API server
"VLLM_API_KEY": "VLLM_API_KEY":
lambda: os.environ.get("VLLM_API_KEY", None), lambda: os.environ.get("VLLM_API_KEY", None),
......
...@@ -1414,7 +1414,7 @@ def cat_with_pad(tensors, dim, padding_value=0): ...@@ -1414,7 +1414,7 @@ def cat_with_pad(tensors, dim, padding_value=0):
@INPUT_REGISTRY.register_input_processor(input_processor_for_phi4mm) @INPUT_REGISTRY.register_input_processor(input_processor_for_phi4mm)
class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal): class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
""" """
Implements the Phi-4-multimodal-instruct model in VLLM. Implements the Phi-4-multimodal-instruct model in vLLM.
""" """
packed_modules_mapping = { packed_modules_mapping = {
"qkv_proj": [ "qkv_proj": [
......
...@@ -119,7 +119,7 @@ class CudaPlatformBase(Platform): ...@@ -119,7 +119,7 @@ class CudaPlatformBase(Platform):
if envs.VLLM_USE_V1: if envs.VLLM_USE_V1:
raise NotImplementedError( raise NotImplementedError(
"Multi-step scheduling is not supported (and not " "Multi-step scheduling is not supported (and not "
"needed) on VLLM V1. Please launch without " "needed) on vLLM V1. Please launch without "
"--num-scheduler-steps.") "--num-scheduler-steps.")
else: else:
parallel_config.worker_cls = \ parallel_config.worker_cls = \
......
...@@ -173,7 +173,7 @@ class RocmPlatform(Platform): ...@@ -173,7 +173,7 @@ class RocmPlatform(Platform):
if envs.VLLM_USE_V1: if envs.VLLM_USE_V1:
raise NotImplementedError( raise NotImplementedError(
"Multi-step scheduling is not supported (and not " "Multi-step scheduling is not supported (and not "
"needed) on VLLM V1. Please launch without " "needed) on vLLM V1. Please launch without "
"--num-scheduler-steps.") "--num-scheduler-steps.")
else: else:
parallel_config.worker_cls = \ parallel_config.worker_cls = \
...@@ -181,7 +181,7 @@ class RocmPlatform(Platform): ...@@ -181,7 +181,7 @@ class RocmPlatform(Platform):
elif vllm_config.speculative_config: elif vllm_config.speculative_config:
if envs.VLLM_USE_V1: if envs.VLLM_USE_V1:
raise NotImplementedError( raise NotImplementedError(
"Speculative decoding is not yet supported on VLLM V1." "Speculative decoding is not yet supported on vLLM V1."
) )
else: else:
parallel_config.worker_cls = \ parallel_config.worker_cls = \
......
...@@ -249,7 +249,7 @@ class MistralTokenizer(TokenizerBase): ...@@ -249,7 +249,7 @@ class MistralTokenizer(TokenizerBase):
revision=revision) revision=revision)
return tokenizer_file return tokenizer_file
# the following attributes are set to fit VLLM's design and are used # the following attributes are set to fit vLLM's design and are used
# by the guided structured output backends. # by the guided structured output backends.
@property @property
def all_special_tokens_extended(self) -> List[str]: def all_special_tokens_extended(self) -> List[str]:
......
...@@ -255,7 +255,7 @@ class MPClient(EngineCoreClient): ...@@ -255,7 +255,7 @@ class MPClient(EngineCoreClient):
# TODO(rob): rather than killing the main process, we should # TODO(rob): rather than killing the main process, we should
# figure out how to raise an AsyncEngineDeadError and # figure out how to raise an AsyncEngineDeadError and
# handle at the API server level so we can return a better # handle at the API server level so we can return a better
# error code to the clients calling VLLM. # error code to the clients calling vLLM.
def sigusr1_handler(signum, frame): def sigusr1_handler(signum, frame):
logger.fatal("Got fatal signal from worker processes, shutting " logger.fatal("Got fatal signal from worker processes, shutting "
"down. See stack trace above for root cause issue.") "down. See stack trace above for root cause issue.")
......
...@@ -248,7 +248,7 @@ class OutputProcessor: ...@@ -248,7 +248,7 @@ class OutputProcessor:
****************** NOTE FOR DEVELOPERS ****************** ****************** NOTE FOR DEVELOPERS ******************
VLLM V1 minimizes the number of python loops over the full vLLM V1 minimizes the number of python loops over the full
batch to ensure system overheads are minimized. This is the batch to ensure system overheads are minimized. This is the
only function that should loop over EngineCoreOutputs. only function that should loop over EngineCoreOutputs.
......
...@@ -93,10 +93,10 @@ class Processor: ...@@ -93,10 +93,10 @@ class Processor:
) -> None: ) -> None:
# Best of not yet supported. # Best of not yet supported.
if params.best_of is not None and params.best_of > 1: if params.best_of is not None and params.best_of > 1:
raise ValueError("VLLM V1 does not yet support best_of.") raise ValueError("vLLM V1 does not yet support best_of.")
# Logits processors not supported. # Logits processors not supported.
if params.logits_processors: if params.logits_processors:
raise ValueError("VLLM V1 does not support per request " raise ValueError("vLLM V1 does not support per request "
"user provided logits processors.") "user provided logits processors.")
def _validate_params( def _validate_params(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment