"""Lazily evaluated registry of the environment variables this module reads.

Each entry of ``environment_variables`` maps a name to a zero-argument
callable that reads ``os.environ`` at call time.  The module-level
``__getattr__`` exposes every entry as a module attribute, so a value is
re-read from the environment on every attribute access instead of being
frozen at import time.
"""
import os
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional

if TYPE_CHECKING:
    # Static declarations for type checkers / IDEs only; never executed.
    # The runtime values come from ``environment_variables`` below.
    VLLM_HOST_IP: str = ""
    VLLM_USE_MODELSCOPE: bool = False
    VLLM_INSTANCE_ID: Optional[str] = None
    CUDA_HOME: Optional[str] = None
    VLLM_NCCL_SO_PATH: Optional[str] = None
    LD_LIBRARY_PATH: Optional[str] = None
    VLLM_USE_TRITON_FLASH_ATTN: bool = True
    LOCAL_RANK: int = 0
    CUDA_VISIBLE_DEVICES: Optional[str] = None
    VLLM_ENGINE_ITERATION_TIMEOUT_S: int = 60
    VLLM_API_KEY: Optional[str] = None
    S3_ACCESS_KEY_ID: Optional[str] = None
    S3_SECRET_ACCESS_KEY: Optional[str] = None
    S3_ENDPOINT_URL: Optional[str] = None
    VLLM_CONFIG_ROOT: str = ""
    VLLM_USAGE_STATS_SERVER: str = "https://stats.vllm.ai"
    VLLM_NO_USAGE_STATS: bool = False
    VLLM_DO_NOT_TRACK: bool = False
    VLLM_USAGE_SOURCE: str = "production"
    VLLM_CONFIGURE_LOGGING: int = 1
    VLLM_LOGGING_CONFIG_PATH: Optional[str] = None
    VLLM_TRACE_FUNCTION: int = 0
    VLLM_ATTENTION_BACKEND: Optional[str] = None
    VLLM_CPU_KVCACHE_SPACE: int = 0
    VLLM_USE_RAY_COMPILED_DAG: bool = False
    VLLM_WORKER_MULTIPROC_METHOD: str = "spawn"

# The begin-* and end* here are used by the documentation generator
# to extract the used env vars.

# begin-env-vars-definition

environment_variables: Dict[str, Callable[[], Any]] = {
    # used in distributed environment to determine the master address
    'VLLM_HOST_IP':
    lambda: os.getenv('VLLM_HOST_IP', "") or os.getenv("HOST_IP", ""),

    # If true, will load models from ModelScope instead of Hugging Face Hub.
    # note that the value is true or false, not numbers
    "VLLM_USE_MODELSCOPE":
    lambda: os.environ.get("VLLM_USE_MODELSCOPE", "False").lower() == "true",

    # Instance id represents an instance of the VLLM. All processes in the same
    # instance should have the same instance id.
    "VLLM_INSTANCE_ID":
    lambda: os.environ.get("VLLM_INSTANCE_ID", None),

    # path to cudatoolkit home directory, under which should be bin, include,
    # and lib directories.
    "CUDA_HOME":
    lambda: os.environ.get("CUDA_HOME", None),

    # Path to the NCCL library file. It is needed because nccl>=2.19 brought
    # by PyTorch contains a bug: https://github.com/NVIDIA/nccl/issues/1234
    "VLLM_NCCL_SO_PATH":
    lambda: os.environ.get("VLLM_NCCL_SO_PATH", None),

    # when `VLLM_NCCL_SO_PATH` is not set, vllm will try to find the nccl
    # library file in the locations specified by `LD_LIBRARY_PATH`
    "LD_LIBRARY_PATH":
    lambda: os.environ.get("LD_LIBRARY_PATH", None),

    # flag to control if vllm should use triton flash attention
    "VLLM_USE_TRITON_FLASH_ATTN":
    lambda: (os.environ.get("VLLM_USE_TRITON_FLASH_ATTN", "True").lower() in
             ("true", "1")),

    # local rank of the process in the distributed setting, used to determine
    # the GPU device id
    "LOCAL_RANK":
    lambda: int(os.environ.get("LOCAL_RANK", "0")),

    # used to control the visible devices in the distributed setting
    "CUDA_VISIBLE_DEVICES":
    lambda: os.environ.get("CUDA_VISIBLE_DEVICES", None),

    # timeout for each iteration in the engine
    "VLLM_ENGINE_ITERATION_TIMEOUT_S":
    lambda: int(os.environ.get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60")),

    # API key for VLLM API server
    "VLLM_API_KEY":
    lambda: os.environ.get("VLLM_API_KEY", None),

    # S3 access information, used for tensorizer to load model from S3
    # `S3_ACCESS_KEY_ID` is read first; the legacy name `S3_ACCESS_KEY`
    # is still honoured as a fallback for backward compatibility.
    "S3_ACCESS_KEY_ID":
    lambda: (os.environ.get("S3_ACCESS_KEY_ID", None) or os.environ.get(
        "S3_ACCESS_KEY", None)),
    "S3_SECRET_ACCESS_KEY":
    lambda: os.environ.get("S3_SECRET_ACCESS_KEY", None),
    "S3_ENDPOINT_URL":
    lambda: os.environ.get("S3_ENDPOINT_URL", None),

    # Root directory for VLLM configuration files
    # Note that this not only affects how vllm finds its configuration files
    # during runtime, but also affects how vllm installs its configuration
    # files during **installation**.
    "VLLM_CONFIG_ROOT":
    lambda: os.environ.get("VLLM_CONFIG_ROOT", None) or os.getenv(
        "XDG_CONFIG_HOME", None) or os.path.expanduser("~/.config"),

    # Usage stats collection
    "VLLM_USAGE_STATS_SERVER":
    lambda: os.environ.get("VLLM_USAGE_STATS_SERVER", "https://stats.vllm.ai"),
    "VLLM_NO_USAGE_STATS":
    lambda: os.environ.get("VLLM_NO_USAGE_STATS", "0") == "1",
    "VLLM_DO_NOT_TRACK":
    lambda: (os.environ.get("VLLM_DO_NOT_TRACK", None) or os.environ.get(
        "DO_NOT_TRACK", None) or "0") == "1",
    "VLLM_USAGE_SOURCE":
    lambda: os.environ.get("VLLM_USAGE_SOURCE", "production"),

    # Logging configuration
    # If set to 0, vllm will not configure logging
    # If set to 1, vllm will configure logging using the default configuration
    #    or the configuration file specified by VLLM_LOGGING_CONFIG_PATH
    "VLLM_CONFIGURE_LOGGING":
    lambda: int(os.getenv("VLLM_CONFIGURE_LOGGING", "1")),
    "VLLM_LOGGING_CONFIG_PATH":
    lambda: os.getenv("VLLM_LOGGING_CONFIG_PATH"),

    # Trace function calls
    # If set to 1, vllm will trace function calls
    # Useful for debugging
    "VLLM_TRACE_FUNCTION":
    lambda: int(os.getenv("VLLM_TRACE_FUNCTION", "0")),

    # Backend for attention computation
    # Available options:
    # - "TORCH_SDPA": use torch.nn.MultiheadAttention
    # - "FLASH_ATTN": use FlashAttention
    # - "XFORMERS": use XFormers
    # - "ROCM_FLASH": use ROCmFlashAttention
    "VLLM_ATTENTION_BACKEND":
    lambda: os.getenv("VLLM_ATTENTION_BACKEND", None),

    # CPU key-value cache space
    # The value returned when unset is 0; presumably the consumer treats 0
    # as "use the built-in default" (previously documented here as 4GB) --
    # verify against the caller.
    "VLLM_CPU_KVCACHE_SPACE":
    lambda: int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0")),

    # If the env var is set, it uses the Ray's compiled DAG API
    # which optimizes the control plane overhead.
    # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
    # Unset, empty, "0" and "false" (case-insensitive) disable it; a plain
    # bool() of the raw string would treat "0" as enabled.
    "VLLM_USE_RAY_COMPILED_DAG":
    lambda: (os.getenv("VLLM_USE_RAY_COMPILED_DAG", "0").strip().lower()
             not in ("", "0", "false")),

    # Use dedicated multiprocess context for workers.
    # Both spawn and fork work
    "VLLM_WORKER_MULTIPROC_METHOD":
    lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn"),
}

# end-env-vars-definition


def __getattr__(name: str) -> Any:
    """Resolve ``name`` as a lazily evaluated environment variable.

    Called by Python only when normal module attribute lookup fails.  The
    matching getter in ``environment_variables`` is invoked on every access,
    so the result reflects the current process environment.

    Raises:
        AttributeError: if ``name`` is not a key of ``environment_variables``.
    """
    # lazy evaluation of environment variables
    if name in environment_variables:
        return environment_variables[name]()
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


def __dir__() -> List[str]:
    """Return the names of all registered environment variables."""
    return list(environment_variables.keys())