Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
66b809cc
Commit
66b809cc
authored
Feb 08, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.7.2' into v0.7.2-dev
parents
37b63c24
0408efc6
Changes
1000
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
81 additions
and
1 deletion
+81
-1
vllm/entrypoints/openai/serving_engine.py
vllm/entrypoints/openai/serving_engine.py
+2
-0
vllm/entrypoints/openai/serving_models.py
vllm/entrypoints/openai/serving_models.py
+2
-0
vllm/entrypoints/openai/serving_pooling.py
vllm/entrypoints/openai/serving_pooling.py
+2
-0
vllm/entrypoints/openai/serving_rerank.py
vllm/entrypoints/openai/serving_rerank.py
+2
-0
vllm/entrypoints/openai/serving_score.py
vllm/entrypoints/openai/serving_score.py
+2
-0
vllm/entrypoints/openai/serving_tokenization.py
vllm/entrypoints/openai/serving_tokenization.py
+2
-0
vllm/entrypoints/openai/tool_parsers/__init__.py
vllm/entrypoints/openai/tool_parsers/__init__.py
+2
-0
vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
+2
-0
vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
...ypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
+2
-0
vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
+2
-0
vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
+2
-0
vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
.../entrypoints/openai/tool_parsers/internlm2_tool_parser.py
+2
-0
vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py
vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py
+2
-0
vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
+2
-0
vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
+2
-0
vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py
vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py
+2
-0
vllm/entrypoints/openai/tool_parsers/utils.py
vllm/entrypoints/openai/tool_parsers/utils.py
+2
-0
vllm/entrypoints/utils.py
vllm/entrypoints/utils.py
+2
-0
vllm/envs.py
vllm/envs.py
+43
-1
vllm/executor/executor_base.py
vllm/executor/executor_base.py
+2
-0
No files found.
Too many changes to show.
To preserve performance only
1000 of 1000+
files are displayed.
Plain diff
Email patch
vllm/entrypoints/openai/serving_engine.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
json
from
concurrent.futures.thread
import
ThreadPoolExecutor
from
http
import
HTTPStatus
...
...
vllm/entrypoints/openai/serving_models.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
json
import
pathlib
from
dataclasses
import
dataclass
...
...
vllm/entrypoints/openai/serving_pooling.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
asyncio
import
base64
import
time
...
...
vllm/entrypoints/openai/serving_rerank.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
asyncio
from
typing
import
Any
,
AsyncGenerator
,
Dict
,
List
,
Optional
,
Union
,
cast
...
...
vllm/entrypoints/openai/serving_score.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
asyncio
import
time
from
typing
import
Any
,
AsyncGenerator
,
Dict
,
List
,
Optional
,
Union
,
cast
...
...
vllm/entrypoints/openai/serving_tokenization.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
typing
import
Final
,
List
,
Optional
,
Union
from
fastapi
import
Request
...
...
vllm/entrypoints/openai/tool_parsers/__init__.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
.abstract_tool_parser
import
ToolParser
,
ToolParserManager
from
.granite_20b_fc_tool_parser
import
Granite20bFCToolParser
from
.granite_tool_parser
import
GraniteToolParser
...
...
vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
os
from
functools
import
cached_property
from
typing
import
Callable
,
Dict
,
List
,
Optional
,
Sequence
,
Type
,
Union
...
...
vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
json
import
re
from
json
import
JSONDecoder
...
...
vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
json
from
typing
import
Dict
,
Sequence
,
Union
...
...
vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
json
import
re
from
typing
import
Dict
,
List
,
Sequence
,
Union
...
...
vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
json
from
typing
import
Dict
,
Sequence
,
Union
...
...
vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
json
import
re
from
typing
import
Dict
,
List
,
Sequence
,
Union
...
...
vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
json
import
re
from
json
import
JSONDecoder
...
...
vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
json
import
re
from
random
import
choices
...
...
vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
ast
import
json
import
re
...
...
vllm/entrypoints/openai/tool_parsers/utils.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
json
from
json
import
JSONDecodeError
,
JSONDecoder
from
typing
import
Any
,
List
,
Tuple
...
...
vllm/entrypoints/utils.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
asyncio
import
functools
...
...
vllm/envs.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
os
import
tempfile
from
typing
import
TYPE_CHECKING
,
Any
,
Callable
,
Dict
,
List
,
Optional
...
...
@@ -34,6 +36,7 @@ if TYPE_CHECKING:
VLLM_LOGGING_LEVEL
:
str
=
"INFO"
VLLM_LOGGING_PREFIX
:
str
=
""
VLLM_LOGGING_CONFIG_PATH
:
Optional
[
str
]
=
None
VLLM_LOGITS_PROCESSOR_THREADS
:
Optional
[
int
]
=
None
VLLM_TRACE_FUNCTION
:
int
=
0
VLLM_ATTENTION_BACKEND
:
Optional
[
str
]
=
None
VLLM_USE_FLASHINFER_SAMPLER
:
Optional
[
bool
]
=
None
...
...
@@ -86,6 +89,10 @@ if TYPE_CHECKING:
VLLM_MLA_DISABLE
:
bool
=
False
VLLM_MLA_PERFORM_MATRIX_ABSORPTION
:
bool
=
True
VLLM_MLA_DISABLE_REQUANTIZATION
:
bool
=
False
VLLM_MLA_CUDA_MEM_ALIGN_KV_CACHE
:
bool
=
True
VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON
:
bool
=
False
VLLM_RAY_PER_WORKER_GPUS
:
float
=
1.0
VLLM_RAY_BUNDLE_INDICES
:
str
=
""
def
get_default_cache_root
():
...
...
@@ -309,6 +316,14 @@ environment_variables: Dict[str, Callable[[], Any]] = {
"VLLM_LOGGING_PREFIX"
:
lambda
:
os
.
getenv
(
"VLLM_LOGGING_PREFIX"
,
""
),
# If set, vLLM will call logits processors in a thread pool with this many
# threads. This is useful when using custom logits processors that either
# (a) launch additional CUDA kernels or (b) do significant CPU-bound work
# while not holding the Python GIL, or both.
"VLLM_LOGITS_PROCESSOR_THREADS"
:
lambda
:
int
(
os
.
getenv
(
"VLLM_LOGITS_PROCESSOR_THREADS"
,
"0"
))
if
"VLLM_LOGITS_PROCESSOR_THREADS"
in
os
.
environ
else
None
,
# Trace function calls
# If set to 1, vllm will trace function calls
# Useful for debugging
...
...
@@ -565,7 +580,34 @@ environment_variables: Dict[str, Callable[[], Any]] = {
# matrices to match the activation type. This can lead to higher memory and
# compute usage but better preserves the accuracy of the original model.
"VLLM_MLA_DISABLE_REQUANTIZATION"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_MLA_DISABLE_REQUANTIZATION"
,
"0"
)))
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_MLA_DISABLE_REQUANTIZATION"
,
"0"
))),
# If set, vLLM will use the Triton implementation of moe_align_block_size,
# i.e. moe_align_block_size_triton in fused_moe.py.
"VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON"
,
"0"
))
),
# Number of GPUs per worker in Ray. If it is set to a fraction, Ray can
# schedule multiple actors on a single GPU, so that users can colocate
# other actors on the same GPUs as vLLM.
"VLLM_RAY_PER_WORKER_GPUS"
:
lambda
:
float
(
os
.
getenv
(
"VLLM_RAY_PER_WORKER_GPUS"
,
"1.0"
)),
# Bundle indices for Ray. If set, this controls precisely which bundle
# indices are used for each worker.
# Format: comma-separated list of integers, e.g. "0,1,2,3"
"VLLM_RAY_BUNDLE_INDICES"
:
lambda
:
os
.
getenv
(
"VLLM_RAY_BUNDLE_INDICES"
,
""
),
# On an NVIDIA GPU, aligns single entries (within a page) to 256 bytes
# for better performance, at the cost of increased cache memory usage.
# This matches the alignment the CUDA runtime uses for all allocations.
# Currently this primarily affects MLA, whose entries are not otherwise
# 256-byte aligned; for most other models the entries are already
# naturally aligned to 256 bytes.
"VLLM_CUDA_MEM_ALIGN_KV_CACHE"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_CUDA_MEM_ALIGN_KV_CACHE"
,
"1"
))),
}
# end-env-vars-definition
...
...
vllm/executor/executor_base.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
asyncio
from
abc
import
ABC
,
abstractmethod
from
typing
import
(
Any
,
Awaitable
,
Callable
,
Dict
,
List
,
Optional
,
Set
,
Tuple
,
...
...
Prev
1
…
28
29
30
31
32
33
34
35
36
…
50
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment