Commit 66b809cc authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.7.2' into v0.7.2-dev

parents 37b63c24 0408efc6
# SPDX-License-Identifier: Apache-2.0
import json
from concurrent.futures.thread import ThreadPoolExecutor
from http import HTTPStatus
......
# SPDX-License-Identifier: Apache-2.0
import json
import pathlib
from dataclasses import dataclass
......
# SPDX-License-Identifier: Apache-2.0
import asyncio
import base64
import time
......
# SPDX-License-Identifier: Apache-2.0
import asyncio
from typing import Any, AsyncGenerator, Dict, List, Optional, Union, cast
......
# SPDX-License-Identifier: Apache-2.0
import asyncio
import time
from typing import Any, AsyncGenerator, Dict, List, Optional, Union, cast
......
# SPDX-License-Identifier: Apache-2.0
from typing import Final, List, Optional, Union
from fastapi import Request
......
# SPDX-License-Identifier: Apache-2.0
from .abstract_tool_parser import ToolParser, ToolParserManager
from .granite_20b_fc_tool_parser import Granite20bFCToolParser
from .granite_tool_parser import GraniteToolParser
......
# SPDX-License-Identifier: Apache-2.0
import os
from functools import cached_property
from typing import Callable, Dict, List, Optional, Sequence, Type, Union
......
# SPDX-License-Identifier: Apache-2.0
import json
import re
from json import JSONDecoder
......
# SPDX-License-Identifier: Apache-2.0
import json
from typing import Dict, Sequence, Union
......
# SPDX-License-Identifier: Apache-2.0
import json
import re
from typing import Dict, List, Sequence, Union
......
# SPDX-License-Identifier: Apache-2.0
import json
from typing import Dict, Sequence, Union
......
# SPDX-License-Identifier: Apache-2.0
import json
import re
from typing import Dict, List, Sequence, Union
......
# SPDX-License-Identifier: Apache-2.0
import json
import re
from json import JSONDecoder
......
# SPDX-License-Identifier: Apache-2.0
import json
import re
from random import choices
......
# SPDX-License-Identifier: Apache-2.0
import ast
import json
import re
......
# SPDX-License-Identifier: Apache-2.0
import json
from json import JSONDecodeError, JSONDecoder
from typing import Any, List, Tuple
......
# SPDX-License-Identifier: Apache-2.0
import asyncio
import functools
......
# SPDX-License-Identifier: Apache-2.0
import os
import tempfile
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional
......@@ -34,6 +36,7 @@ if TYPE_CHECKING:
VLLM_LOGGING_LEVEL: str = "INFO"
VLLM_LOGGING_PREFIX: str = ""
VLLM_LOGGING_CONFIG_PATH: Optional[str] = None
VLLM_LOGITS_PROCESSOR_THREADS: Optional[int] = None
VLLM_TRACE_FUNCTION: int = 0
VLLM_ATTENTION_BACKEND: Optional[str] = None
VLLM_USE_FLASHINFER_SAMPLER: Optional[bool] = None
......@@ -86,6 +89,10 @@ if TYPE_CHECKING:
VLLM_MLA_DISABLE: bool = False
VLLM_MLA_PERFORM_MATRIX_ABSORPTION: bool = True
VLLM_MLA_DISABLE_REQUANTIZATION: bool = False
VLLM_MLA_CUDA_MEM_ALIGN_KV_CACHE: bool = True
VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = False
VLLM_RAY_PER_WORKER_GPUS: float = 1.0
VLLM_RAY_BUNDLE_INDICES: str = ""
def get_default_cache_root():
......@@ -309,6 +316,14 @@ environment_variables: Dict[str, Callable[[], Any]] = {
"VLLM_LOGGING_PREFIX":
lambda: os.getenv("VLLM_LOGGING_PREFIX", ""),
# If set, vLLM will call logits processors in a thread pool with this many
# threads. This is useful when using custom logits processors that either
# (a) launch additional CUDA kernels or (b) do significant CPU-bound work
# while not holding the Python GIL, or both.
"VLLM_LOGITS_PROCESSOR_THREADS":
lambda: int(os.getenv("VLLM_LOGITS_PROCESSOR_THREADS", "0"))
if "VLLM_LOGITS_PROCESSOR_THREADS" in os.environ else None,
# Trace function calls
# If set to 1, vllm will trace function calls
# Useful for debugging
......@@ -565,7 +580,34 @@ environment_variables: Dict[str, Callable[[], Any]] = {
# matrices to match the activation type. This can lead to higher memory and
# compute usage but better preserves the accuracy of the original model.
"VLLM_MLA_DISABLE_REQUANTIZATION":
lambda: bool(int(os.getenv("VLLM_MLA_DISABLE_REQUANTIZATION", "0")))
lambda: bool(int(os.getenv("VLLM_MLA_DISABLE_REQUANTIZATION", "0"))),
# If set, vLLM will use the Triton implementation of moe_align_block_size,
# i.e. moe_align_block_size_triton in fused_moe.py.
"VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON":
lambda: bool(int(os.getenv("VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON", "0"))
),
# Number of GPUs per worker in Ray. If it is set to a fraction,
# it allows Ray to schedule multiple actors on a single GPU,
# so that users can colocate other actors on the same GPUs as vLLM.
"VLLM_RAY_PER_WORKER_GPUS":
lambda: float(os.getenv("VLLM_RAY_PER_WORKER_GPUS", "1.0")),
# Bundle indices for Ray. If set, it controls precisely
# which indices are used for the Ray bundle, for every worker.
# Format: comma-separated list of integers, e.g. "0,1,2,3"
"VLLM_RAY_BUNDLE_INDICES":
lambda: os.getenv("VLLM_RAY_BUNDLE_INDICES", ""),
# When on an Nvidia GPU, align single entries (within a page) to 256 bytes
# for better performance; this increases the memory usage of the cache.
# This matches the alignment the CUDA runtime uses for all allocations.
# Currently this primarily affects MLA, which results in non-256-byte
# aligned entries; for most other models the entries are already naturally
# aligned to 256 bytes.
"VLLM_CUDA_MEM_ALIGN_KV_CACHE":
lambda: bool(int(os.getenv("VLLM_CUDA_MEM_ALIGN_KV_CACHE", "1"))),
}
# end-env-vars-definition
......
# SPDX-License-Identifier: Apache-2.0
import asyncio
from abc import ABC, abstractmethod
from typing import (Any, Awaitable, Callable, Dict, List, Optional, Set, Tuple,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment