Commit 30a14b03 (unverified), authored by wangxiyuan, committed by GitHub
Browse files

[V0 deprecation] Remove VLLM_USE_V1 usage in platform and v1 module (#27798)


Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent 799ce45c
...@@ -276,17 +276,12 @@ class CudaPlatformBase(Platform): ...@@ -276,17 +276,12 @@ class CudaPlatformBase(Platform):
"FLASHMLA, FLASH_ATTN_MLA, or TRITON_MLA. Alternatively, set " "FLASHMLA, FLASH_ATTN_MLA, or TRITON_MLA. Alternatively, set "
"VLLM_MLA_DISABLE=1 to disable MLA for this model." "VLLM_MLA_DISABLE=1 to disable MLA for this model."
) )
if not use_v1:
raise RuntimeError(
"MLA attention backends require the V1 engine. "
"Set VLLM_USE_V1=1 to enable them."
)
from vllm.attention.ops.flashmla import is_flashmla_dense_supported from vllm.attention.ops.flashmla import is_flashmla_dense_supported
from vllm.attention.utils.fa_utils import flash_attn_supports_mla from vllm.attention.utils.fa_utils import flash_attn_supports_mla
if use_sparse: if use_sparse:
logger.info_once("Using Sparse MLA backend on V1 engine.") logger.info_once("Using Sparse MLA backend.")
return ( return (
"vllm.v1.attention.backends.mla.flashmla_sparse." "vllm.v1.attention.backends.mla.flashmla_sparse."
"FlashMLASparseBackend" "FlashMLASparseBackend"
...@@ -313,15 +308,13 @@ class CudaPlatformBase(Platform): ...@@ -313,15 +308,13 @@ class CudaPlatformBase(Platform):
) )
if use_cutlassmla: if use_cutlassmla:
logger.info_once( logger.info_once("Using Cutlass MLA backend.", scope="local")
"Using Cutlass MLA backend on V1 engine.", scope="local"
)
return "vllm.v1.attention.backends.mla.cutlass_mla.CutlassMLABackend" return "vllm.v1.attention.backends.mla.cutlass_mla.CutlassMLABackend"
if use_flashinfermla: if use_flashinfermla:
from vllm.v1.attention.backends.utils import set_kv_cache_layout from vllm.v1.attention.backends.utils import set_kv_cache_layout
set_kv_cache_layout("HND") set_kv_cache_layout("HND")
logger.info_once("Using FlashInfer MLA backend on V1 engine.") logger.info_once("Using FlashInfer MLA backend.")
return ( return (
"vllm.v1.attention.backends.mla.flashinfer_mla.FlashInferMLABackend" "vllm.v1.attention.backends.mla.flashinfer_mla.FlashInferMLABackend"
) )
...@@ -333,116 +326,107 @@ class CudaPlatformBase(Platform): ...@@ -333,116 +326,107 @@ class CudaPlatformBase(Platform):
block_size, block_size,
) )
else: else:
logger.info_once("Using FlashMLA backend on V1 engine.") logger.info_once("Using FlashMLA backend.")
return "vllm.v1.attention.backends.mla.flashmla.FlashMLABackend" return "vllm.v1.attention.backends.mla.flashmla.FlashMLABackend"
if use_flashattn: if use_flashattn:
logger.info_once("Using FlashAttention MLA backend on V1 engine.") logger.info_once("Using FlashAttention MLA backend.")
return ( return (
"vllm.v1.attention.backends.mla.flashattn_mla.FlashAttnMLABackend" "vllm.v1.attention.backends.mla.flashattn_mla.FlashAttnMLABackend"
) )
if use_triton: if use_triton:
logger.info_once("Using Triton MLA backend on V1 engine.") logger.info_once("Using Triton MLA backend.")
return "vllm.v1.attention.backends.mla.triton_mla.TritonMLABackend" return "vllm.v1.attention.backends.mla.triton_mla.TritonMLABackend"
if use_v1:
FLASHINFER_V1 = "vllm.v1.attention.backends.flashinfer.FlashInferBackend" # noqa: E501
FLEX_ATTENTION_V1 = (
"vllm.v1.attention.backends.flex_attention.FlexAttentionBackend" # noqa: E501
)
TRITON_ATTN = (
"vllm.v1.attention.backends.triton_attn.TritonAttentionBackend" # noqa: E501
)
FLASH_ATTN_V1 = (
"vllm.v1.attention.backends.flash_attn.FlashAttentionBackend" # noqa: E501
)
TREE_ATTN_V1 = "vllm.v1.attention.backends.tree_attn.TreeAttentionBackend" # noqa: E501
XFORMERS_V1 = "vllm.v1.attention.backends.xformers.XFormersAttentionBackend" # noqa: E501
use_fp8_kv_cache = kv_cache_dtype is not None and kv_cache_dtype.startswith( FLASHINFER_V1 = "vllm.v1.attention.backends.flashinfer.FlashInferBackend" # noqa: E501
"fp8" FLEX_ATTENTION_V1 = (
) "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend" # noqa: E501
)
TRITON_ATTN = "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend" # noqa: E501
FLASH_ATTN_V1 = "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend" # noqa: E501
TREE_ATTN_V1 = "vllm.v1.attention.backends.tree_attn.TreeAttentionBackend" # noqa: E501
XFORMERS_V1 = "vllm.v1.attention.backends.xformers.XFormersAttentionBackend" # noqa: E501
if selected_backend == _Backend.FLASHINFER: use_fp8_kv_cache = kv_cache_dtype is not None and kv_cache_dtype.startswith(
logger.info_once("Using FlashInfer backend on V1 engine.") "fp8"
if cls.has_device_capability(100): )
from vllm.v1.attention.backends.utils import set_kv_cache_layout
set_kv_cache_layout("HND") if selected_backend == _Backend.FLASHINFER:
return FLASHINFER_V1 logger.info_once("Using FlashInfer backend.")
elif selected_backend == _Backend.FLEX_ATTENTION: if cls.has_device_capability(100):
logger.info_once("Using FlexAttention backend on V1 engine.") from vllm.v1.attention.backends.utils import set_kv_cache_layout
return FLEX_ATTENTION_V1
elif selected_backend == _Backend.TRITON_ATTN:
logger.info_once("Using Triton backend on V1 engine.")
return TRITON_ATTN
elif selected_backend == _Backend.FLASH_ATTN:
logger.info_once("Using Flash Attention backend on V1 engine.")
return FLASH_ATTN_V1
elif selected_backend == _Backend.TREE_ATTN:
logger.info_once("Using Tree Attention backend on V1 engine.")
return TREE_ATTN_V1
elif selected_backend == _Backend.XFORMERS:
logger.info_once("Using XFormers backend on V1 engine.")
return XFORMERS_V1
from vllm.attention.selector import is_attn_backend_supported set_kv_cache_layout("HND")
return FLASHINFER_V1
elif selected_backend == _Backend.FLEX_ATTENTION:
logger.info_once("Using FlexAttention backend.")
return FLEX_ATTENTION_V1
elif selected_backend == _Backend.TRITON_ATTN:
logger.info_once("Using Triton backend.")
return TRITON_ATTN
elif selected_backend == _Backend.FLASH_ATTN:
logger.info_once("Using Flash Attention backend.")
return FLASH_ATTN_V1
elif selected_backend == _Backend.TREE_ATTN:
logger.info_once("Using Tree Attention backend.")
return TREE_ATTN_V1
elif selected_backend == _Backend.XFORMERS:
logger.info_once("Using XFormers backend.")
return XFORMERS_V1
from vllm.attention.selector import is_attn_backend_supported
# Default backends for V1 engine
# Prefer FlashInfer for Blackwell GPUs if installed
if cls.is_device_capability(100):
if is_default_backend_supported := is_attn_backend_supported(
FLASHINFER_V1, head_size, dtype
):
from vllm.v1.attention.backends.utils import set_kv_cache_layout
# Default backends for V1 engine logger.info_once(
# Prefer FlashInfer for Blackwell GPUs if installed "Using FlashInfer backend with HND KV cache layout on "
if cls.is_device_capability(100): "V1 engine by default for Blackwell (SM 10.0) GPUs."
if is_default_backend_supported := is_attn_backend_supported( )
FLASHINFER_V1, head_size, dtype set_kv_cache_layout("HND")
):
from vllm.v1.attention.backends.utils import set_kv_cache_layout
logger.info_once(
"Using FlashInfer backend with HND KV cache layout on "
"V1 engine by default for Blackwell (SM 10.0) GPUs."
)
set_kv_cache_layout("HND")
return FLASHINFER_V1 return FLASHINFER_V1
if not is_default_backend_supported.can_import: if not is_default_backend_supported.can_import:
logger.warning_once( logger.warning_once(
"FlashInfer failed to import for V1 engine on " "FlashInfer failed to import on Blackwell (SM 10.0) GPUs; "
"Blackwell (SM 10.0) GPUs; it is recommended to " "it is recommended to install FlashInfer for better "
"install FlashInfer for better performance." "performance."
) )
# FlashAttention is the default for SM 8.0+ GPUs # FlashAttention is the default for SM 8.0+ GPUs
if cls.has_device_capability(80): if cls.has_device_capability(80):
if (has_sink or use_fp8_kv_cache) and not cls.is_device_capability(90): if (has_sink or use_fp8_kv_cache) and not cls.is_device_capability(90):
logger.info_once("Using Triton backend on V1 engine.") logger.info_once("Using Triton backend.")
return TRITON_ATTN return TRITON_ATTN
elif is_default_backend_supported := is_attn_backend_supported( elif is_default_backend_supported := is_attn_backend_supported(
FLASH_ATTN_V1, head_size, dtype, allow_import_error=False FLASH_ATTN_V1, head_size, dtype, allow_import_error=False
): ):
logger.info_once("Using Flash Attention backend on V1 engine.") logger.info_once("Using Flash Attention backend.")
return FLASH_ATTN_V1 return FLASH_ATTN_V1
# FlexAttention is the default for older GPUs
else:
logger.info_once("Using FlexAttention backend on V1 engine.")
return FLEX_ATTENTION_V1
assert not is_default_backend_supported # FlexAttention is the default for older GPUs
else:
logger.info_once("Using FlexAttention backend.")
return FLEX_ATTENTION_V1
use_flex_attention_reason = {} assert not is_default_backend_supported
if not is_default_backend_supported.head_size:
use_flex_attention_reason["head_size"] = head_size
if not is_default_backend_supported.dtype:
use_flex_attention_reason["dtype"] = dtype
logger.info_once( use_flex_attention_reason = {}
"Using FlexAttention backend for %s on V1 engine.", if not is_default_backend_supported.head_size:
", ".join(f"{k}={v}" for k, v in use_flex_attention_reason.items()), use_flex_attention_reason["head_size"] = head_size
) if not is_default_backend_supported.dtype:
return FLEX_ATTENTION_V1 use_flex_attention_reason["dtype"] = dtype
raise RuntimeError( logger.info_once(
"V0 attention backends have been removed. Set VLLM_USE_V1=1 " "Using FlexAttention backend for %s.",
"to select a supported backend." ", ".join(f"{k}={v}" for k, v in use_flex_attention_reason.items()),
) )
return FLEX_ATTENTION_V1
@classmethod @classmethod
def get_punica_wrapper(cls) -> str: def get_punica_wrapper(cls) -> str:
......
...@@ -467,14 +467,7 @@ class Platform: ...@@ -467,14 +467,7 @@ class Platform:
""" """
Whether to use allgather in LogitsProcessor to gather the logits. Whether to use allgather in LogitsProcessor to gather the logits.
""" """
import vllm.envs as envs return True
from vllm.config import get_current_vllm_config
parallel_config = get_current_vllm_config().parallel_config
return (
envs.VLLM_USE_V1
or parallel_config.distributed_executor_backend == "external_launcher"
)
@classmethod @classmethod
def use_custom_allreduce(cls) -> bool: def use_custom_allreduce(cls) -> bool:
......
...@@ -149,7 +149,7 @@ def use_rocm_custom_paged_attention( ...@@ -149,7 +149,7 @@ def use_rocm_custom_paged_attention(
# disabled due to observed numerical discrepancy. # disabled due to observed numerical discrepancy.
if ON_GFX9: if ON_GFX9:
return ( return (
(not envs.VLLM_USE_V1 or sliding_window == 0 or sliding_window == (-1, -1)) (sliding_window == 0 or sliding_window == (-1, -1))
and (qtype == torch.half or qtype == torch.bfloat16) and (qtype == torch.half or qtype == torch.bfloat16)
and (head_size == 64 or head_size == 128) and (head_size == 64 or head_size == 128)
and (block_size == 16 or block_size == 32) and (block_size == 16 or block_size == 32)
...@@ -163,11 +163,7 @@ def use_rocm_custom_paged_attention( ...@@ -163,11 +163,7 @@ def use_rocm_custom_paged_attention(
else: else:
return ( return (
ON_GFX11_GFX12 ON_GFX11_GFX12
and ( and (sliding_window == 0 or sliding_window == (-1, -1))
not envs.VLLM_USE_V1
or sliding_window == 0
or sliding_window == (-1, -1)
)
and (qtype == torch.half or qtype == torch.bfloat16) and (qtype == torch.half or qtype == torch.bfloat16)
and head_size == 128 and head_size == 128
and block_size == 16 and block_size == 16
...@@ -236,12 +232,6 @@ class RocmPlatform(Platform): ...@@ -236,12 +232,6 @@ class RocmPlatform(Platform):
if use_sparse: if use_sparse:
raise NotImplementedError("Sparse Attention is not supported on ROCm.") raise NotImplementedError("Sparse Attention is not supported on ROCm.")
if use_mla: if use_mla:
if not use_v1:
raise RuntimeError(
"MLA attention backends require the V1 engine. "
"Set VLLM_USE_V1=1 to enable them."
)
from vllm.v1.attention.backends.mla.rocm_aiter_mla import ( from vllm.v1.attention.backends.mla.rocm_aiter_mla import (
is_aiter_mla_enabled, is_aiter_mla_enabled,
) )
...@@ -255,7 +245,7 @@ class RocmPlatform(Platform): ...@@ -255,7 +245,7 @@ class RocmPlatform(Platform):
if selected_backend == _Backend.TRITON_MLA: if selected_backend == _Backend.TRITON_MLA:
if block_size != 1: if block_size != 1:
logger.info_once("Using Triton MLA backend on V1 engine.") logger.info_once("Using Triton MLA backend.")
return "vllm.v1.attention.backends.mla.triton_mla.TritonMLABackend" return "vllm.v1.attention.backends.mla.triton_mla.TritonMLABackend"
raise ValueError( raise ValueError(
f" The selected backend, {selected_backend.name}," f" The selected backend, {selected_backend.name},"
...@@ -263,7 +253,7 @@ class RocmPlatform(Platform): ...@@ -263,7 +253,7 @@ class RocmPlatform(Platform):
) )
if selected_backend == _Backend.ROCM_AITER_MLA: if selected_backend == _Backend.ROCM_AITER_MLA:
if block_size == 1: if block_size == 1:
logger.info("Using AITER MLA backend on V1 engine.") logger.info("Using AITER MLA backend.")
return ( return (
"vllm.v1.attention.backends.mla.rocm_aiter_mla.AiterMLABackend" # noqa: E501 "vllm.v1.attention.backends.mla.rocm_aiter_mla.AiterMLABackend" # noqa: E501
) )
...@@ -277,41 +267,33 @@ class RocmPlatform(Platform): ...@@ -277,41 +267,33 @@ class RocmPlatform(Platform):
f"is not MLA type while requested for MLA backend." f"is not MLA type while requested for MLA backend."
) )
if envs.VLLM_USE_V1: if selected_backend == _Backend.FLEX_ATTENTION:
if selected_backend == _Backend.FLEX_ATTENTION: logger.info("Using FlexAttention backend.")
logger.info("Using FlexAttention backend on V1 engine.") return "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend"
return "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend" if (
if ( envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_MHA and on_gfx9()
envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_MHA and on_gfx9() ) or selected_backend == _Backend.ROCM_AITER_FA:
) or selected_backend == _Backend.ROCM_AITER_FA: logger.info("Using Aiter Flash Attention backend.")
logger.info("Using Aiter Flash Attention backend on V1 engine.") return "vllm.v1.attention.backends.rocm_aiter_fa.AiterFlashAttentionBackend"
return ( if (
"vllm.v1.attention.backends." envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION
"rocm_aiter_fa.AiterFlashAttentionBackend" ) or selected_backend == _Backend.ROCM_AITER_UNIFIED_ATTN:
) logger.info("Using Aiter Unified Attention backend.")
if ( return (
envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION "vllm.v1.attention.backends."
) or selected_backend == _Backend.ROCM_AITER_UNIFIED_ATTN: "rocm_aiter_unified_attn.RocmAiterUnifiedAttentionBackend"
logger.info("Using Aiter Unified Attention backend on V1 engine.") )
return ( if (
"vllm.v1.attention.backends." envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION
"rocm_aiter_unified_attn.RocmAiterUnifiedAttentionBackend" or selected_backend == _Backend.ROCM_ATTN
) ):
if ( # rocm specific backend, with aiter and/or
envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION # triton prefix-prefill
or selected_backend == _Backend.ROCM_ATTN logger.info("Using Rocm Attention backend.")
): return "vllm.v1.attention.backends.rocm_attn.RocmAttentionBackend"
# rocm specific backend, with aiter and/or # default case, using triton unified attention
# triton prefix-prefill logger.info("Using Triton Attention backend.")
logger.info("Using Rocm Attention backend on V1 engine.") return "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend"
return "vllm.v1.attention.backends.rocm_attn.RocmAttentionBackend"
# default case, using triton unified attention
logger.info("Using Triton Attention backend on V1 engine.")
return "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend"
raise RuntimeError(
"V0 attention backends have been removed. Set VLLM_USE_V1=1 "
"to select a supported backend."
)
@classmethod @classmethod
def set_device(cls, device: torch.device) -> None: def set_device(cls, device: torch.device) -> None:
...@@ -372,7 +354,6 @@ class RocmPlatform(Platform): ...@@ -372,7 +354,6 @@ class RocmPlatform(Platform):
parallel_config = vllm_config.parallel_config parallel_config = vllm_config.parallel_config
is_eager_execution = compilation_config == CUDAGraphMode.NONE is_eager_execution = compilation_config == CUDAGraphMode.NONE
use_v1 = envs.VLLM_USE_V1
use_aiter_rms_norm = ( use_aiter_rms_norm = (
envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_RMSNORM envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_RMSNORM
) )
...@@ -384,8 +365,7 @@ class RocmPlatform(Platform): ...@@ -384,8 +365,7 @@ class RocmPlatform(Platform):
parallel_config.worker_cls = "vllm.v1.worker.gpu_worker.Worker" parallel_config.worker_cls = "vllm.v1.worker.gpu_worker.Worker"
# Aiter rms norm perform best when CUDA Graph capture is enabled. # Aiter rms norm perform best when CUDA Graph capture is enabled.
if ( if (
use_v1 use_aiter_rms_norm
and use_aiter_rms_norm
and not is_eager_execution and not is_eager_execution
and "-rms_norm" not in compilation_config.custom_ops and "-rms_norm" not in compilation_config.custom_ops
): ):
......
...@@ -204,10 +204,6 @@ class TpuPlatform(Platform): ...@@ -204,10 +204,6 @@ class TpuPlatform(Platform):
def get_device_communicator_cls(cls) -> str: def get_device_communicator_cls(cls) -> str:
return "vllm.distributed.device_communicators.tpu_communicator.TpuCommunicator" # noqa return "vllm.distributed.device_communicators.tpu_communicator.TpuCommunicator" # noqa
@classmethod
def use_all_gather(cls) -> bool:
return True
@classmethod @classmethod
def validate_request( def validate_request(
cls, cls,
......
...@@ -66,16 +66,13 @@ class XPUPlatform(Platform): ...@@ -66,16 +66,13 @@ class XPUPlatform(Platform):
if use_sparse: if use_sparse:
raise NotImplementedError("Sparse Attention is not supported on XPU.") raise NotImplementedError("Sparse Attention is not supported on XPU.")
use_v1 = envs.VLLM_USE_V1
if not use_v1:
raise ValueError("XPU backend only supports V1.")
TRITON_ATTN = "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend" # noqa: E501 TRITON_ATTN = "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend" # noqa: E501
FLASH_ATTN = "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend" # noqa: E501 FLASH_ATTN = "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend" # noqa: E501
if selected_backend == _Backend.TRITON_ATTN: if selected_backend == _Backend.TRITON_ATTN:
logger.info_once("Using Triton backend on V1 engine.") logger.info_once("Using Triton backend.")
return TRITON_ATTN return TRITON_ATTN
elif selected_backend == _Backend.FLASH_ATTN: elif selected_backend == _Backend.FLASH_ATTN:
logger.info_once("Using Flash Attention backend on V1 engine.") logger.info_once("Using Flash Attention backend.")
return FLASH_ATTN return FLASH_ATTN
elif selected_backend: elif selected_backend:
raise ValueError( raise ValueError(
...@@ -83,7 +80,7 @@ class XPUPlatform(Platform): ...@@ -83,7 +80,7 @@ class XPUPlatform(Platform):
f"with use_v1: {use_v1} use_mla: {use_mla}" f"with use_v1: {use_v1} use_mla: {use_mla}"
) )
logger.info("Using Flash Attention backend on V1 engine.") logger.info("Using Flash Attention backend.")
return "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend" return "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend"
@classmethod @classmethod
......
...@@ -88,14 +88,6 @@ class AsyncLLM(EngineClient): ...@@ -88,14 +88,6 @@ class AsyncLLM(EngineClient):
Returns: Returns:
None None
""" """
if not envs.VLLM_USE_V1:
raise ValueError(
"Using V1 AsyncLLMEngine, but envs.VLLM_USE_V1=False. "
"This should not happen. As a workaround, try using "
"AsyncLLMEngine.from_vllm_config(...) or explicitly set "
"VLLM_USE_V1=0 or 1 and report this issue on Github."
)
# Ensure we can serialize custom transformer configs # Ensure we can serialize custom transformer configs
maybe_register_config_serialize_by_value() maybe_register_config_serialize_by_value()
...@@ -206,14 +198,6 @@ class AsyncLLM(EngineClient): ...@@ -206,14 +198,6 @@ class AsyncLLM(EngineClient):
client_index: int = 0, client_index: int = 0,
disable_log_requests: bool = True, # Deprecated, will be removed disable_log_requests: bool = True, # Deprecated, will be removed
) -> "AsyncLLM": ) -> "AsyncLLM":
if not envs.VLLM_USE_V1:
raise ValueError(
"Using V1 AsyncLLMEngine, but envs.VLLM_USE_V1=False. "
"This should not happen. As a workaround, try using "
"AsyncLLMEngine.from_vllm_config(...) or explicitly set "
"VLLM_USE_V1=0 or 1 and report this issue on Github."
)
# Create the LLMEngine. # Create the LLMEngine.
return cls( return cls(
vllm_config=vllm_config, vllm_config=vllm_config,
......
...@@ -58,18 +58,9 @@ class LLMEngine: ...@@ -58,18 +58,9 @@ class LLMEngine:
use_cached_outputs: bool = False, use_cached_outputs: bool = False,
multiprocess_mode: bool = False, multiprocess_mode: bool = False,
) -> None: ) -> None:
if not envs.VLLM_USE_V1:
raise ValueError(
"Using V1 LLMEngine, but envs.VLLM_USE_V1=False. "
"This should not happen. As a workaround, try using "
"LLMEngine.from_vllm_config(...) or explicitly set "
"VLLM_USE_V1=0 or 1 and report this issue on Github."
)
if stat_loggers is not None: if stat_loggers is not None:
raise NotImplementedError( raise NotImplementedError(
"Passing StatLoggers to LLMEngine in V1 is not yet supported. " "Passing StatLoggers to LLMEngine is not yet supported."
"Set VLLM_USE_V1=0 and file and issue on Github."
) )
self.vllm_config = vllm_config self.vllm_config = vllm_config
......
...@@ -124,11 +124,10 @@ class ExecutorWithExternalLauncher(UniProcExecutor): ...@@ -124,11 +124,10 @@ class ExecutorWithExternalLauncher(UniProcExecutor):
def _init_executor(self) -> None: def _init_executor(self) -> None:
"""Initialize the worker and load the model.""" """Initialize the worker and load the model."""
if envs.VLLM_USE_V1: assert not envs.VLLM_ENABLE_V1_MULTIPROCESSING, (
assert not envs.VLLM_ENABLE_V1_MULTIPROCESSING, ( "To get deterministic execution, "
"To get deterministic execution in V1, " "please set VLLM_ENABLE_V1_MULTIPROCESSING=0"
"please set VLLM_ENABLE_V1_MULTIPROCESSING=0" )
)
super()._init_executor() super()._init_executor()
def _distributed_args(self) -> tuple[str, int, int]: def _distributed_args(self) -> tuple[str, int, int]:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment