Merge tag 'v0.7.2' into v0.7.2-dev

66b809cc · zhuwenwen · 37b63c24 · 0408efc6 · 66b809cc · 66b809cc
Commit 66b809cc authored Feb 08, 2025 by zhuwenwen
20 changed files
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
+# SPDX-License-Identifier: Apache-2.0
 import json
 from concurrent.futures.thread import ThreadPoolExecutor
 from http import HTTPStatus

--- a/vllm/entrypoints/openai/serving_models.py
+++ b/vllm/entrypoints/openai/serving_models.py
+# SPDX-License-Identifier: Apache-2.0
 import json
 import pathlib
 from dataclasses import dataclass

--- a/vllm/entrypoints/openai/serving_pooling.py
+++ b/vllm/entrypoints/openai/serving_pooling.py
+# SPDX-License-Identifier: Apache-2.0
 import asyncio
 import base64
 import time

--- a/vllm/entrypoints/openai/serving_rerank.py
+++ b/vllm/entrypoints/openai/serving_rerank.py
+# SPDX-License-Identifier: Apache-2.0
 import asyncio
 from typing import Any, AsyncGenerator, Dict, List, Optional, Union, cast

--- a/vllm/entrypoints/openai/serving_score.py
+++ b/vllm/entrypoints/openai/serving_score.py
+# SPDX-License-Identifier: Apache-2.0
 import asyncio
 import time
 from typing import Any, AsyncGenerator, Dict, List, Optional, Union, cast

--- a/vllm/entrypoints/openai/serving_tokenization.py
+++ b/vllm/entrypoints/openai/serving_tokenization.py
+# SPDX-License-Identifier: Apache-2.0
 from typing import Final, List, Optional, Union
 from fastapi import Request

--- a/vllm/entrypoints/openai/tool_parsers/__init__.py
+++ b/vllm/entrypoints/openai/tool_parsers/__init__.py
+# SPDX-License-Identifier: Apache-2.0
 from .abstract_tool_parser import ToolParser, ToolParserManager
 from .granite_20b_fc_tool_parser import Granite20bFCToolParser
 from .granite_tool_parser import GraniteToolParser

--- a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
+# SPDX-License-Identifier: Apache-2.0
 import os
 from functools import cached_property
 from typing import Callable, Dict, List, Optional, Sequence, Type, Union

--- a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
+# SPDX-License-Identifier: Apache-2.0
 import json
 import re
 from json import JSONDecoder

--- a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
+# SPDX-License-Identifier: Apache-2.0
 import json
 from typing import Dict, Sequence, Union

--- a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
+# SPDX-License-Identifier: Apache-2.0
 import json
 import re
 from typing import Dict, List, Sequence, Union

--- a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
+# SPDX-License-Identifier: Apache-2.0
 import json
 from typing import Dict, Sequence, Union

--- a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py
+# SPDX-License-Identifier: Apache-2.0
 import json
 import re
 from typing import Dict, List, Sequence, Union

--- a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
+# SPDX-License-Identifier: Apache-2.0
 import json
 import re
 from json import JSONDecoder

--- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
+# SPDX-License-Identifier: Apache-2.0
 import json
 import re
 from random import choices

--- a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py
+# SPDX-License-Identifier: Apache-2.0
 import ast
 import json
 import re

--- a/vllm/entrypoints/openai/tool_parsers/utils.py
+++ b/vllm/entrypoints/openai/tool_parsers/utils.py
+# SPDX-License-Identifier: Apache-2.0
 import json
 from json import JSONDecodeError, JSONDecoder
 from typing import Any, List, Tuple

--- a/vllm/entrypoints/utils.py
+++ b/vllm/entrypoints/utils.py
+# SPDX-License-Identifier: Apache-2.0
 import asyncio
 import functools

--- a/vllm/envs.py
+++ b/vllm/envs.py
+# SPDX-License-Identifier: Apache-2.0
 import os
 import tempfile
 from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional
@@ -34,6 +36,7 @@ if TYPE_CHECKING:
    VLLM_LOGGING_LEVEL: str = "INFO"
    VLLM_LOGGING_PREFIX: str = ""
    VLLM_LOGGING_CONFIG_PATH: Optional[str] = None
+    VLLM_LOGITS_PROCESSOR_THREADS: Optional[int] = None
    VLLM_TRACE_FUNCTION: int = 0
    VLLM_ATTENTION_BACKEND: Optional[str] = None
    VLLM_USE_FLASHINFER_SAMPLER: Optional[bool] = None
@@ -86,6 +89,10 @@ if TYPE_CHECKING:
    VLLM_MLA_DISABLE: bool = False
    VLLM_MLA_PERFORM_MATRIX_ABSORPTION: bool = True
    VLLM_MLA_DISABLE_REQUANTIZATION: bool = False
+    VLLM_MLA_CUDA_MEM_ALIGN_KV_CACHE: bool = True
+    VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = False
+    VLLM_RAY_PER_WORKER_GPUS: float = 1.0
+    VLLM_RAY_BUNDLE_INDICES: str = ""
 def get_default_cache_root():
@@ -309,6 +316,14 @@ environment_variables: Dict[str, Callable[[], Any]] = {
    "VLLM_LOGGING_PREFIX":
    lambda: os.getenv("VLLM_LOGGING_PREFIX", ""),
+    # If set, vLLM will call logits processors in a thread pool with this many
+    # threads. This is useful when using custom logits processors that either
+    # (a) launch additional CUDA kernels or (b) do significant CPU-bound work
+    # while not holding the Python GIL, or both.
+    "VLLM_LOGITS_PROCESSOR_THREADS":
+    lambda: int(os.getenv("VLLM_LOGITS_PROCESSOR_THREADS", "0"))
+    if "VLLM_LOGITS_PROCESSOR_THREADS" in os.environ else None,
    # Trace function calls
    # If set to 1, vllm will trace function calls
    # Useful for debugging
@@ -565,7 +580,34 @@ environment_variables: Dict[str, Callable[[], Any]] = {
    # matrices to match the activation type. This can lead to higher memory and
    # compute usage but better preserves the accuracy of the original model.
    "VLLM_MLA_DISABLE_REQUANTIZATION":
-    lambda: bool(int(os.getenv("VLLM_MLA_DISABLE_REQUANTIZATION", "0")))
+    lambda: bool(int(os.getenv("VLLM_MLA_DISABLE_REQUANTIZATION", "0"))),
+    # If set, vLLM will use the Triton implementation of moe_align_block_size,
+    # i.e. moe_align_block_size_triton in fused_moe.py.
+    "VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON":
+    lambda: bool(int(os.getenv("VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON", "0"))
+                 ),
+    # Number of GPUs per worker in Ray. If it is set to a fraction,
+    # it allows Ray to schedule multiple actors on a single GPU,
+    # so that users can colocate other actors on the same GPUs as vLLM.
+    "VLLM_RAY_PER_WORKER_GPUS":
+    lambda: float(os.getenv("VLLM_RAY_PER_WORKER_GPUS", "1.0")),
+    # Bundle indices for Ray. If set, it controls precisely
+    # which indices are used for the Ray bundle, for every worker.
+    # Format: comma-separated list of integers, e.g. "0,1,2,3"
+    "VLLM_RAY_BUNDLE_INDICES":
+    lambda: os.getenv("VLLM_RAY_BUNDLE_INDICES", ""),
+    # On NVIDIA GPUs, align each single entry (within a page) to 256 bytes for
+    # better performance, at the cost of increased cache memory usage. 256
+    # bytes matches the alignment the CUDA runtime uses for all allocations.
+    # Currently this primarily affects MLA, whose entries are not naturally
+    # 256-byte aligned; for most other models the entries are already
+    # naturally aligned to 256 bytes.
+    "VLLM_CUDA_MEM_ALIGN_KV_CACHE":
+    lambda: bool(int(os.getenv("VLLM_CUDA_MEM_ALIGN_KV_CACHE", "1"))),
 }
 # end-env-vars-definition

--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
+# SPDX-License-Identifier: Apache-2.0
 import asyncio
 from abc import ABC, abstractmethod
 from typing import (Any, Awaitable, Callable, Dict, List, Optional, Set, Tuple,