Commit 66b809cc authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.7.2' into v0.7.2-dev

parents 37b63c24 0408efc6
# SPDX-License-Identifier: Apache-2.0
import json import json
from concurrent.futures.thread import ThreadPoolExecutor from concurrent.futures.thread import ThreadPoolExecutor
from http import HTTPStatus from http import HTTPStatus
......
# SPDX-License-Identifier: Apache-2.0
import json import json
import pathlib import pathlib
from dataclasses import dataclass from dataclasses import dataclass
......
# SPDX-License-Identifier: Apache-2.0
import asyncio import asyncio
import base64 import base64
import time import time
......
# SPDX-License-Identifier: Apache-2.0
import asyncio import asyncio
from typing import Any, AsyncGenerator, Dict, List, Optional, Union, cast from typing import Any, AsyncGenerator, Dict, List, Optional, Union, cast
......
# SPDX-License-Identifier: Apache-2.0
import asyncio import asyncio
import time import time
from typing import Any, AsyncGenerator, Dict, List, Optional, Union, cast from typing import Any, AsyncGenerator, Dict, List, Optional, Union, cast
......
# SPDX-License-Identifier: Apache-2.0
from typing import Final, List, Optional, Union from typing import Final, List, Optional, Union
from fastapi import Request from fastapi import Request
......
# SPDX-License-Identifier: Apache-2.0
from .abstract_tool_parser import ToolParser, ToolParserManager from .abstract_tool_parser import ToolParser, ToolParserManager
from .granite_20b_fc_tool_parser import Granite20bFCToolParser from .granite_20b_fc_tool_parser import Granite20bFCToolParser
from .granite_tool_parser import GraniteToolParser from .granite_tool_parser import GraniteToolParser
......
# SPDX-License-Identifier: Apache-2.0
import os import os
from functools import cached_property from functools import cached_property
from typing import Callable, Dict, List, Optional, Sequence, Type, Union from typing import Callable, Dict, List, Optional, Sequence, Type, Union
......
# SPDX-License-Identifier: Apache-2.0
import json import json
import re import re
from json import JSONDecoder from json import JSONDecoder
......
# SPDX-License-Identifier: Apache-2.0
import json import json
from typing import Dict, Sequence, Union from typing import Dict, Sequence, Union
......
# SPDX-License-Identifier: Apache-2.0
import json import json
import re import re
from typing import Dict, List, Sequence, Union from typing import Dict, List, Sequence, Union
......
# SPDX-License-Identifier: Apache-2.0
import json import json
from typing import Dict, Sequence, Union from typing import Dict, Sequence, Union
......
# SPDX-License-Identifier: Apache-2.0
import json import json
import re import re
from typing import Dict, List, Sequence, Union from typing import Dict, List, Sequence, Union
......
# SPDX-License-Identifier: Apache-2.0
import json import json
import re import re
from json import JSONDecoder from json import JSONDecoder
......
# SPDX-License-Identifier: Apache-2.0
import json import json
import re import re
from random import choices from random import choices
......
# SPDX-License-Identifier: Apache-2.0
import ast import ast
import json import json
import re import re
......
# SPDX-License-Identifier: Apache-2.0
import json import json
from json import JSONDecodeError, JSONDecoder from json import JSONDecodeError, JSONDecoder
from typing import Any, List, Tuple from typing import Any, List, Tuple
......
# SPDX-License-Identifier: Apache-2.0
import asyncio import asyncio
import functools import functools
......
# SPDX-License-Identifier: Apache-2.0
import os import os
import tempfile import tempfile
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional
...@@ -34,6 +36,7 @@ if TYPE_CHECKING: ...@@ -34,6 +36,7 @@ if TYPE_CHECKING:
VLLM_LOGGING_LEVEL: str = "INFO" VLLM_LOGGING_LEVEL: str = "INFO"
VLLM_LOGGING_PREFIX: str = "" VLLM_LOGGING_PREFIX: str = ""
VLLM_LOGGING_CONFIG_PATH: Optional[str] = None VLLM_LOGGING_CONFIG_PATH: Optional[str] = None
VLLM_LOGITS_PROCESSOR_THREADS: Optional[int] = None
VLLM_TRACE_FUNCTION: int = 0 VLLM_TRACE_FUNCTION: int = 0
VLLM_ATTENTION_BACKEND: Optional[str] = None VLLM_ATTENTION_BACKEND: Optional[str] = None
VLLM_USE_FLASHINFER_SAMPLER: Optional[bool] = None VLLM_USE_FLASHINFER_SAMPLER: Optional[bool] = None
...@@ -86,6 +89,10 @@ if TYPE_CHECKING: ...@@ -86,6 +89,10 @@ if TYPE_CHECKING:
VLLM_MLA_DISABLE: bool = False VLLM_MLA_DISABLE: bool = False
VLLM_MLA_PERFORM_MATRIX_ABSORPTION: bool = True VLLM_MLA_PERFORM_MATRIX_ABSORPTION: bool = True
VLLM_MLA_DISABLE_REQUANTIZATION: bool = False VLLM_MLA_DISABLE_REQUANTIZATION: bool = False
# Type-checking declaration for the "VLLM_CUDA_MEM_ALIGN_KV_CACHE" entry of
# environment_variables; the name must match that key exactly (no "MLA_").
VLLM_CUDA_MEM_ALIGN_KV_CACHE: bool = True
VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = False
VLLM_RAY_PER_WORKER_GPUS: float = 1.0
VLLM_RAY_BUNDLE_INDICES: str = ""
def get_default_cache_root(): def get_default_cache_root():
...@@ -309,6 +316,14 @@ environment_variables: Dict[str, Callable[[], Any]] = { ...@@ -309,6 +316,14 @@ environment_variables: Dict[str, Callable[[], Any]] = {
"VLLM_LOGGING_PREFIX": "VLLM_LOGGING_PREFIX":
lambda: os.getenv("VLLM_LOGGING_PREFIX", ""), lambda: os.getenv("VLLM_LOGGING_PREFIX", ""),
# If set, vLLM will call logits processors in a thread pool with this many
# threads. This is useful when using custom logits processors that either
# (a) launch additional CUDA kernels or (b) do significant CPU-bound work
# while not holding the Python GIL, or both. If unset, the value is None.
"VLLM_LOGITS_PROCESSOR_THREADS":
lambda: int(os.getenv("VLLM_LOGITS_PROCESSOR_THREADS", "0"))
if "VLLM_LOGITS_PROCESSOR_THREADS" in os.environ else None,
# Trace function calls # Trace function calls
# If set to 1, vllm will trace function calls # If set to 1, vllm will trace function calls
# Useful for debugging # Useful for debugging
...@@ -565,7 +580,34 @@ environment_variables: Dict[str, Callable[[], Any]] = { ...@@ -565,7 +580,34 @@ environment_variables: Dict[str, Callable[[], Any]] = {
# matrices to match the activation type. This can lead to higher memory and # matrices to match the activation type. This can lead to higher memory and
# compute usage but better preserves the accuracy of the original model. # compute usage but better preserves the accuracy of the original model.
"VLLM_MLA_DISABLE_REQUANTIZATION": "VLLM_MLA_DISABLE_REQUANTIZATION":
lambda: bool(int(os.getenv("VLLM_MLA_DISABLE_REQUANTIZATION", "0"))) lambda: bool(int(os.getenv("VLLM_MLA_DISABLE_REQUANTIZATION", "0"))),
# If set, vLLM will use the Triton implementation of moe_align_block_size,
# i.e. moe_align_block_size_triton in fused_moe.py.
"VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON":
lambda: bool(int(os.getenv("VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON", "0"))
),
# Number of GPUs per worker in Ray. If it is set to a fraction, Ray can
# schedule multiple actors on a single GPU, so that users can colocate
# other actors on the same GPUs as vLLM. Defaults to 1.0.
"VLLM_RAY_PER_WORKER_GPUS":
lambda: float(os.getenv("VLLM_RAY_PER_WORKER_GPUS", "1.0")),
# Bundle indices for Ray. If set, it controls precisely which indices
# are used for the Ray bundle, for every worker.
# Format: comma-separated list of integers, e.g. "0,1,2,3"
"VLLM_RAY_BUNDLE_INDICES":
lambda: os.getenv("VLLM_RAY_BUNDLE_INDICES", ""),
# On NVIDIA GPUs, align single entries (within a page) of the KV cache to
# 256 bytes for better performance, at the cost of increased cache memory
# usage. This matches the alignment the CUDA runtime uses for all
# allocations. Currently this primarily affects MLA, which results in
# entries that are not 256-byte aligned; for most other models the entries
# are already naturally aligned to 256 bytes. Enabled by default.
"VLLM_CUDA_MEM_ALIGN_KV_CACHE":
lambda: bool(int(os.getenv("VLLM_CUDA_MEM_ALIGN_KV_CACHE", "1"))),
} }
# end-env-vars-definition # end-env-vars-definition
......
# SPDX-License-Identifier: Apache-2.0
import asyncio import asyncio
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import (Any, Awaitable, Callable, Dict, List, Optional, Set, Tuple, from typing import (Any, Awaitable, Callable, Dict, List, Optional, Set, Tuple,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment