Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
fcfc474d
Commit
fcfc474d
authored
Apr 09, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.8.3' into v0.8.3-dev
parents
bb94d2e5
296c6572
Changes
503
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
282 additions
and
150 deletions
+282
-150
vllm/env_override.py
vllm/env_override.py
+21
-0
vllm/envs.py
vllm/envs.py
+72
-7
vllm/executor/executor_base.py
vllm/executor/executor_base.py
+20
-5
vllm/executor/ray_distributed_executor.py
vllm/executor/ray_distributed_executor.py
+17
-10
vllm/inputs/parse.py
vllm/inputs/parse.py
+14
-8
vllm/inputs/preprocess.py
vllm/inputs/preprocess.py
+8
-6
vllm/inputs/registry.py
vllm/inputs/registry.py
+23
-16
vllm/lora/models.py
vllm/lora/models.py
+3
-1
vllm/lora/ops/triton_ops/kernel_utils.py
vllm/lora/ops/triton_ops/kernel_utils.py
+1
-1
vllm/lora/ops/triton_ops/lora_expand.py
vllm/lora/ops/triton_ops/lora_expand.py
+12
-1
vllm/lora/ops/triton_ops/lora_kernel_metadata.py
vllm/lora/ops/triton_ops/lora_kernel_metadata.py
+36
-6
vllm/lora/ops/triton_ops/lora_shrink.py
vllm/lora/ops/triton_ops/lora_shrink.py
+12
-1
vllm/model_executor/guided_decoding/__init__.py
vllm/model_executor/guided_decoding/__init__.py
+11
-10
vllm/model_executor/guided_decoding/guidance_decoding.py
vllm/model_executor/guided_decoding/guidance_decoding.py
+10
-2
vllm/model_executor/guided_decoding/outlines_decoding.py
vllm/model_executor/guided_decoding/outlines_decoding.py
+4
-4
vllm/model_executor/guided_decoding/outlines_logits_processors.py
...el_executor/guided_decoding/outlines_logits_processors.py
+7
-7
vllm/model_executor/guided_decoding/reasoner/__init__.py
vllm/model_executor/guided_decoding/reasoner/__init__.py
+4
-0
vllm/model_executor/guided_decoding/reasoner/deepseek_reasoner.py
...el_executor/guided_decoding/reasoner/deepseek_reasoner.py
+0
-38
vllm/model_executor/guided_decoding/reasoner/reasoner.py
vllm/model_executor/guided_decoding/reasoner/reasoner.py
+0
-23
vllm/model_executor/guided_decoding/xgrammar_decoding.py
vllm/model_executor/guided_decoding/xgrammar_decoding.py
+7
-4
No files found.
vllm/env_override.py
0 → 100644
View file @
fcfc474d
# SPDX-License-Identifier: Apache-2.0
import
os
import
torch
# set some common config/environment variables that should be set
# for all processes created by vllm and all processes
# that interact with vllm workers.
# they are executed whenever `import vllm` is called.
# see https://github.com/NVIDIA/nccl/issues/1234
os
.
environ
[
'NCCL_CUMEM_ENABLE'
]
=
'0'
# see https://github.com/vllm-project/vllm/pull/15951
# it avoids unintentional cuda initialization from torch.cuda.is_available()
os
.
environ
[
'PYTORCH_NVML_BASED_CUDA_CHECK'
]
=
'1'
# see https://github.com/vllm-project/vllm/issues/10480
os
.
environ
[
'TORCHINDUCTOR_COMPILE_THREADS'
]
=
'1'
# see https://github.com/vllm-project/vllm/issues/10619
# torch._inductor.config.compile_threads = 1
vllm/envs.py
View file @
fcfc474d
...
...
@@ -2,6 +2,7 @@
import
hashlib
import
os
import
sys
import
tempfile
from
typing
import
TYPE_CHECKING
,
Any
,
Callable
,
Optional
...
...
@@ -29,6 +30,7 @@ if TYPE_CHECKING:
S3_ACCESS_KEY_ID
:
Optional
[
str
]
=
None
S3_SECRET_ACCESS_KEY
:
Optional
[
str
]
=
None
S3_ENDPOINT_URL
:
Optional
[
str
]
=
None
VLLM_MODEL_REDIRECT_PATH
:
Optional
[
str
]
=
None
VLLM_CACHE_ROOT
:
str
=
os
.
path
.
expanduser
(
"~/.cache/vllm"
)
VLLM_CONFIG_ROOT
:
str
=
os
.
path
.
expanduser
(
"~/.config/vllm"
)
VLLM_USAGE_STATS_SERVER
:
str
=
"https://stats.vllm.ai"
...
...
@@ -53,7 +55,7 @@ if TYPE_CHECKING:
VLLM_FUSED_MOE_CHUNK_SIZE
:
int
=
64
*
1024
VLLM_USE_RAY_SPMD_WORKER
:
bool
=
False
VLLM_USE_RAY_COMPILED_DAG
:
bool
=
False
VLLM_USE_RAY_COMPILED_DAG_
NCCL_
CHANNEL
:
bool
=
True
VLLM_USE_RAY_COMPILED_DAG_CHANNEL
_TYPE
:
str
=
"auto"
VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM
:
bool
=
False
VLLM_WORKER_MULTIPROC_METHOD
:
str
=
"spawn"
VLLM_ASSETS_CACHE
:
str
=
os
.
path
.
join
(
VLLM_CACHE_ROOT
,
"assets"
)
...
...
@@ -81,9 +83,13 @@ if TYPE_CHECKING:
VLLM_DISABLED_KERNELS
:
list
[
str
]
=
[]
VLLM_USE_V1
:
bool
=
True
VLLM_ROCM_USE_AITER
:
bool
=
False
VLLM_ROCM_USE_AITER_LINEAR
:
bool
=
True
VLLM_ROCM_USE_AITER_MOE
:
bool
=
True
VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE
:
bool
=
False
VLLM_ROCM_USE_AITER_RMSNORM
:
bool
=
True
VLLM_ROCM_FP8_PADDING
:
bool
=
True
VLLM_ROCM_MOE_PADDING
:
bool
=
True
VLLM_ROCM_CUSTOM_PAGED_ATTN
:
bool
=
True
VLLM_ENABLE_V1_MULTIPROCESSING
:
bool
=
True
VLLM_LOG_BATCHSIZE_INTERVAL
:
float
=
-
1
VLLM_DISABLE_COMPILE_CACHE
:
bool
=
False
...
...
@@ -99,12 +105,15 @@ if TYPE_CHECKING:
VLLM_CUDART_SO_PATH
:
Optional
[
str
]
=
None
VLLM_USE_HPU_CONTIGUOUS_CACHE_FETCH
:
bool
=
True
VLLM_DP_RANK
:
int
=
0
VLLM_DP_RANK_LOCAL
:
int
=
-
1
VLLM_DP_SIZE
:
int
=
1
VLLM_DP_MASTER_IP
:
str
=
""
VLLM_DP_MASTER_PORT
:
int
=
0
VLLM_MARLIN_USE_ATOMIC_ADD
:
bool
=
False
VLLM_V0_USE_OUTLINES_CACHE
:
bool
=
False
VLLM_TPU_DISABLE_TOPK_TOPP_OPTIMIZATION
:
bool
=
False
VLLM_TPU_BUCKET_PADDING_GAP
:
int
=
0
VLLM_USE_DEEP_GEMM
:
bool
=
False
def
get_default_cache_root
():
...
...
@@ -302,6 +311,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_API_KEY"
:
lambda
:
os
.
environ
.
get
(
"VLLM_API_KEY"
,
None
),
# Whether to log responses from API Server for debugging
"VLLM_DEBUG_LOG_API_SERVER_RESPONSE"
:
lambda
:
os
.
environ
.
get
(
"VLLM_DEBUG_LOG_API_SERVER_RESPONSE"
,
"False"
).
lower
()
==
"true"
,
# S3 access information, used for tensorizer to load model from S3
"S3_ACCESS_KEY_ID"
:
lambda
:
os
.
environ
.
get
(
"S3_ACCESS_KEY_ID"
,
None
),
...
...
@@ -404,15 +418,21 @@ environment_variables: dict[str, Callable[[], Any]] = {
# (previously known as ADAG) API which optimizes the
# control plane overhead.
# Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
# Note that this variable is set to 1 in V1 by default
# when ray distributed executor is used.
"VLLM_USE_RAY_COMPILED_DAG"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_USE_RAY_COMPILED_DAG"
,
"0"
))),
# If the env var is set, it uses NCCL for communication in
# Ray's Compiled Graph. This flag is ignored if
# VLLM_USE_RAY_COMPILED_DAG is not set.
"VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL"
,
"1"
))
),
# If the env var is set, Ray Compiled Graph uses the specified
# channel type to communicate between workers belonging to
# different pipeline-parallel stages.
# Available options:
# - "auto": use the default channel type
# - "nccl": use NCCL for communication
# - "shm": use shared memory and gRPC for communication
# This flag is ignored if VLLM_USE_RAY_COMPILED_DAG is not set.
"VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE"
:
lambda
:
os
.
getenv
(
"VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE"
,
"auto"
),
# If the env var is set, it enables GPU communication overlap
# (experimental feature) in Ray's Compiled Graph. This flag is ignored if
...
...
@@ -554,6 +574,26 @@ environment_variables: dict[str, Callable[[], Any]] = {
lambda
:
(
os
.
getenv
(
"VLLM_ROCM_USE_AITER"
,
"False"
).
lower
()
in
(
"true"
,
"1"
)),
# use aiter linear op if aiter ops are enabled
# This covers the following related ops:
# - scaled_mm (per-tensor / rowwise)
"VLLM_ROCM_USE_AITER_LINEAR"
:
lambda
:
(
os
.
getenv
(
"VLLM_ROCM_USE_AITER_LINEAR"
,
"True"
).
lower
()
in
(
"true"
,
"1"
)),
# Whether to use aiter moe ops.
# By default is enabled.
"VLLM_ROCM_USE_AITER_MOE"
:
lambda
:
(
os
.
getenv
(
"VLLM_ROCM_USE_AITER_MOE"
,
"True"
).
lower
()
in
(
"true"
,
"1"
)),
# Whether to use aiter block scaled moe kernel.
# By default this is disabled.
"VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE"
:
lambda
:
(
os
.
getenv
(
"VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE"
,
"false"
).
lower
()
in
(
"true"
,
"1"
)),
# use aiter rms norm op if aiter ops are enabled.
"VLLM_ROCM_USE_AITER_RMSNORM"
:
lambda
:
(
os
.
getenv
(
"VLLM_ROCM_USE_AITER_RMSNORM"
,
"True"
).
lower
()
in
...
...
@@ -567,6 +607,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_ROCM_MOE_PADDING"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_ROCM_MOE_PADDING"
,
"1"
))),
# custom paged attention kernel for MI3* cards
"VLLM_ROCM_CUSTOM_PAGED_ATTN"
:
lambda
:
(
os
.
getenv
(
"VLLM_ROCM_CUSTOM_PAGED_ATTN"
,
"True"
).
lower
()
in
(
"true"
,
"1"
)),
# Divisor for dynamic query scale factor calculation for FP8 KV Cache
"Q_SCALE_CONSTANT"
:
lambda
:
int
(
os
.
getenv
(
"Q_SCALE_CONSTANT"
,
"200"
)),
...
...
@@ -643,6 +688,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_DP_RANK"
:
lambda
:
int
(
os
.
getenv
(
"VLLM_DP_RANK"
,
"0"
)),
# Local rank of the process in the data parallel setting.
# Defaults to VLLM_DP_RANK when not set.
"VLLM_DP_RANK_LOCAL"
:
lambda
:
int
(
os
.
getenv
(
"VLLM_DP_RANK_LOCAL"
,
sys
.
modules
[
__name__
].
VLLM_DP_RANK
)),
# World size of the data parallel setting
"VLLM_DP_SIZE"
:
lambda
:
int
(
os
.
getenv
(
"VLLM_DP_SIZE"
,
"1"
)),
...
...
@@ -659,6 +710,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_CI_USE_S3"
:
lambda
:
os
.
environ
.
get
(
"VLLM_CI_USE_S3"
,
"0"
)
==
"1"
,
# Use model_redirect to redirect the model name to a local folder.
"VLLM_MODEL_REDIRECT_PATH"
:
lambda
:
os
.
environ
.
get
(
"VLLM_MODEL_REDIRECT_PATH"
,
None
),
# Whether to use atomicAdd reduce in gptq/awq marlin kernel.
"VLLM_MARLIN_USE_ATOMIC_ADD"
:
lambda
:
os
.
environ
.
get
(
"VLLM_MARLIN_USE_ATOMIC_ADD"
,
"0"
)
==
"1"
,
...
...
@@ -673,6 +728,16 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_TPU_DISABLE_TOPK_TOPP_OPTIMIZATION"
:
lambda
:
bool
(
int
(
os
.
environ
[
"VLLM_TPU_DISABLE_TOPK_TOPP_OPTIMIZATION"
]))
if
"VLLM_TPU_DISABLE_TOPK_TOPP_OPTIMIZATION"
in
os
.
environ
else
None
,
# Gap between padding buckets for the forward pass. So if the gap is
# 8, we will run forward pass with [16, 24, 32, ...].
"VLLM_TPU_BUCKET_PADDING_GAP"
:
lambda
:
int
(
os
.
environ
[
"VLLM_TPU_BUCKET_PADDING_GAP"
])
if
"VLLM_TPU_BUCKET_PADDING_GAP"
in
os
.
environ
else
0
,
# Allow use of DeepGemm kernels for fused moe ops.
"VLLM_USE_DEEP_GEMM"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_USE_DEEP_GEMM"
,
"0"
))),
}
# end-env-vars-definition
...
...
vllm/executor/executor_base.py
View file @
fcfc474d
...
...
@@ -51,6 +51,7 @@ class ExecutorBase(ABC):
self
.
observability_config
=
vllm_config
.
observability_config
self
.
_init_executor
()
self
.
is_sleeping
=
False
self
.
sleeping_tags
:
set
[
str
]
=
set
()
@
abstractmethod
def
_init_executor
(
self
)
->
None
:
...
...
@@ -204,20 +205,34 @@ class ExecutorBase(ABC):
time_before_sleep
=
time
.
perf_counter
()
self
.
collective_rpc
(
"sleep"
,
kwargs
=
dict
(
level
=
level
))
time_after_sleep
=
time
.
perf_counter
()
self
.
sleeping_tags
=
{
"weights"
,
"kv_cache"
}
self
.
is_sleeping
=
True
logger
.
info
(
"It took %.6f seconds to fall asleep."
,
time_after_sleep
-
time_before_sleep
)
def
wake_up
(
self
):
def
wake_up
(
self
,
tags
:
Optional
[
list
[
str
]]
=
None
):
if
not
self
.
is_sleeping
:
logger
.
warning
(
"Executor is not sleeping."
)
return
if
tags
:
for
tag
in
tags
:
if
tag
not
in
self
.
sleeping_tags
:
logger
.
warning
(
"Tag %s is not in sleeping tags %s"
,
tag
,
self
.
sleeping_tags
)
return
time_before_wakeup
=
time
.
perf_counter
()
self
.
collective_rpc
(
"wake_up"
)
self
.
collective_rpc
(
"wake_up"
,
kwargs
=
dict
(
tags
=
tags
)
)
time_after_wakeup
=
time
.
perf_counter
()
self
.
is_sleeping
=
False
logger
.
info
(
"It took %.6f seconds to wake up."
,
time_after_wakeup
-
time_before_wakeup
)
logger
.
info
(
"It took %.6f seconds to wake up tags %s."
,
time_after_wakeup
-
time_before_wakeup
,
tags
if
tags
is
not
None
else
self
.
sleeping_tags
)
if
tags
:
for
tag
in
tags
:
self
.
sleeping_tags
.
remove
(
tag
)
else
:
self
.
sleeping_tags
.
clear
()
if
not
self
.
sleeping_tags
:
self
.
is_sleeping
=
False
def
save_sharded_state
(
self
,
...
...
vllm/executor/ray_distributed_executor.py
View file @
fcfc474d
...
...
@@ -79,7 +79,7 @@ class RayDistributedExecutor(DistributedExecutorBase):
# For TPU, avoid compiling NVIDIA's NCCL
if
current_platform
.
is_tpu
():
os
.
environ
[
"VLLM_USE_RAY_COMPILED_DAG_
NCCL_
CHANNEL"
]
=
"
0
"
os
.
environ
[
"VLLM_USE_RAY_COMPILED_DAG_CHANNEL
_TYPE
"
]
=
"
shm
"
# If the env var is set, it uses the Ray's compiled DAG API
# which optimizes the control plane overhead.
...
...
@@ -546,10 +546,11 @@ class RayDistributedExecutor(DistributedExecutorBase):
"Run `pip install ray[cgraph]` to install it."
)
cupy_spec
=
importlib
.
util
.
find_spec
(
"cupy"
)
if
cupy_spec
is
None
and
envs
.
VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL
:
if
(
cupy_spec
is
None
and
envs
.
VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE
==
"nccl"
):
raise
ValueError
(
"cupy is not installed but required since "
"VLLM_USE_RAY_COMPILED_DAG_
NCCL_
CHANNEL is set. "
"VLLM_USE_RAY_COMPILED_DAG_CHANNEL
_TYPE
is set
to 'nccl'
. "
"Run `pip install ray[cgraph]` and check cupy installation."
)
def
_compiled_ray_dag
(
self
,
enable_asyncio
:
bool
):
...
...
@@ -557,10 +558,17 @@ class RayDistributedExecutor(DistributedExecutorBase):
self
.
_check_ray_cgraph_installation
()
from
ray.dag
import
InputNode
,
MultiOutputNode
logger
.
info
(
"VLLM_USE_RAY_COMPILED_DAG_
NCCL_
CHANNEL = %s"
,
envs
.
VLLM_USE_RAY_COMPILED_DAG_
NCCL_
CHANNEL
)
logger
.
info
(
"VLLM_USE_RAY_COMPILED_DAG_CHANNEL
_TYPE
= %s"
,
envs
.
VLLM_USE_RAY_COMPILED_DAG_CHANNEL
_TYPE
)
logger
.
info
(
"VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM = %s"
,
envs
.
VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM
)
channel_type
=
envs
.
VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE
if
channel_type
not
in
(
"auto"
,
"nccl"
,
"shm"
):
raise
ValueError
(
"Invalid value for VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: "
f
"
{
channel_type
}
. Valid values are: 'auto', 'nccl', or 'shm'."
)
# Enlarge the default value of "RAY_CGRAPH_get_timeout" to 300 seconds
# (it is 10 seconds by default). This is a Ray environment variable to
# control the timeout of getting result from a compiled graph execution,
...
...
@@ -605,13 +613,12 @@ class RayDistributedExecutor(DistributedExecutorBase):
]
last_pp_rank
=
len
(
self
.
pp_tp_workers
)
-
1
if
pp_rank
<
last_pp_rank
:
if
(
pp_rank
<
last_pp_rank
and
envs
.
VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE
!=
"shm"
):
# Specify how intermediate tensors should be passed
# between pp stages, no need to specify for the last
# pp stage.
transport
=
"nccl"
\
if
envs
.
VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL
\
else
"auto"
# pp stage or when using shared memory (the default).
transport
=
envs
.
VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE
outputs
=
[
output
.
with_tensor_transport
(
transport
=
transport
)
for
output
in
outputs
...
...
vllm/inputs/parse.py
View file @
fcfc474d
# SPDX-License-Identifier: Apache-2.0
from
collections.abc
import
Sequence
from
typing
import
Literal
,
TypedDict
,
Union
,
cast
,
overload
from
typing
import
Literal
,
Optional
,
TypedDict
,
Union
,
cast
,
overload
from
typing_extensions
import
TypeIs
from
vllm.utils
import
is_list_of
from
.data
import
(
EncoderDecoderInputs
,
ExplicitEncoderDecoderPrompt
,
ProcessorInputs
,
PromptType
,
SingletonPrompt
,
TextPrompt
,
TokensPrompt
)
from
.data
import
(
ExplicitEncoderDecoderPrompt
,
ProcessorInputs
,
PromptType
,
SingletonInputs
,
SingletonPrompt
,
TextPrompt
,
TokensPrompt
)
class
ParsedText
(
TypedDict
):
...
...
@@ -110,6 +108,14 @@ def is_explicit_encoder_decoder_prompt(
return
isinstance
(
prompt
,
dict
)
and
"encoder_prompt"
in
prompt
def
is_encoder_decoder_inputs
(
inputs
:
ProcessorInputs
)
->
TypeIs
[
EncoderDecoderInputs
]:
return
"encoder"
in
inputs
and
"decoder"
in
inputs
def
split_enc_dec_inputs
(
inputs
:
ProcessorInputs
,
)
->
tuple
[
Optional
[
SingletonInputs
],
SingletonInputs
]:
if
"encoder"
in
inputs
and
"decoder"
in
inputs
:
# NOTE: This passes pyright but not mypy
return
(
inputs
[
"encoder"
],
# type: ignore[typeddict-item]
inputs
[
"decoder"
],
# type: ignore[typeddict-item]
)
return
None
,
inputs
vllm/inputs/preprocess.py
View file @
fcfc474d
...
...
@@ -261,13 +261,13 @@ class InputPreprocessor:
# initialized without a tokenizer while using also multi-modal
# input.
if
not
self
.
tokenizer
:
tokenizer
=
None
tokenizer
=
object
()
# Dummy
else
:
tokenizer_group
=
self
.
get_tokenizer_group
()
tokenizer
=
tokenizer_group
.
get_lora_tokenizer
(
lora_request
)
mm_processor
=
self
.
mm_registry
.
create_processor
(
self
.
model_config
,
tokenizer
)
mm_processor
=
self
.
mm_registry
.
create_processor
(
self
.
model_config
,
tokenizer
=
tokenizer
)
if
mm_processor_kwargs
is
None
:
mm_processor_kwargs
=
{}
...
...
@@ -288,14 +288,14 @@ class InputPreprocessor:
# initialized without a tokenizer while using also multi-modal
# input.
if
not
self
.
tokenizer
:
tokenizer
=
None
tokenizer
=
object
()
# Dummy
else
:
tokenizer_group
=
self
.
get_tokenizer_group
()
tokenizer
=
await
tokenizer_group
.
get_lora_tokenizer_async
(
lora_request
)
mm_processor
=
self
.
mm_registry
.
create_processor
(
self
.
model_config
,
tokenizer
)
mm_processor
=
self
.
mm_registry
.
create_processor
(
self
.
model_config
,
tokenizer
=
tokenizer
)
if
mm_processor_kwargs
is
None
:
mm_processor_kwargs
=
{}
...
...
@@ -528,6 +528,7 @@ class InputPreprocessor:
prompt_token_ids
=
decoder_inputs_to_override
[
"prompt_token_ids"
],
mm_kwargs
=
inputs
[
"mm_kwargs"
],
mm_hashes
=
inputs
[
"mm_hashes"
],
mm_placeholders
=
inputs
[
"mm_placeholders"
],
)
else
:
...
...
@@ -536,6 +537,7 @@ class InputPreprocessor:
prompt
=
inputs
[
"prompt"
],
prompt_token_ids
=
inputs
[
"prompt_token_ids"
],
mm_kwargs
=
inputs
[
"mm_kwargs"
],
mm_hashes
=
inputs
[
"mm_hashes"
],
mm_placeholders
=
inputs
[
"mm_placeholders"
],
)
elif
inputs
[
"type"
]
==
"token"
:
...
...
vllm/inputs/registry.py
View file @
fcfc474d
...
...
@@ -13,13 +13,12 @@ from typing_extensions import TypeVar, assert_never
from
vllm.logger
import
init_logger
from
vllm.transformers_utils.processor
import
cached_processor_from_config
from
vllm.transformers_utils.tokenizer
import
(
AnyTokenizer
,
cached_tokenizer_from_config
)
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
from
vllm.utils
import
(
ClassRegistry
,
get_allowed_kwarg_only_overrides
,
resolve_mm_processor_kwargs
)
from
.data
import
ProcessorInputs
,
SingletonInputs
from
.parse
import
is_encoder_decoder
_inputs
from
.parse
import
split_enc_dec
_inputs
if
TYPE_CHECKING
:
from
vllm.config
import
ModelConfig
...
...
@@ -329,17 +328,27 @@ class InputRegistry:
from
vllm.model_executor.model_loader
import
get_model_architecture
from
vllm.multimodal
import
MultiModalKwargs
from
vllm.multimodal.profiling
import
MultiModalProfiler
from
vllm.sequence
import
SequenceData
if
mm_registry
.
has_processor
(
model_config
):
tokenizer
=
cached_tokenizer_from_config
(
model_config
)
processor
=
mm_registry
.
create_processor
(
model_config
,
tokenizer
,
disable_cache
=
True
)
profiler
=
MultiModalProfiler
(
processor
)
dummy_data_factory
=
(
profiler
.
get_encoder_dummy_data
if
is_encoder_data
else
profiler
.
get_decoder_dummy_data
)
dummy_data
=
dummy_data_factory
(
seq_len
)
dummy_data_v1
=
(
profiler
.
get_encoder_dummy_data
(
seq_len
)
if
is_encoder_data
else
profiler
.
get_decoder_dummy_data
(
seq_len
))
_seq_data
=
SequenceData
.
from_seqs
(
dummy_data_v1
.
prompt_token_ids
)
# type: ignore[attr-defined]
dummy_data
=
DummyData
(
seq_data
=
_seq_data
,
multi_modal_data
=
getattr
(
dummy_data_v1
,
"multi_modal_data"
,
None
),
multi_modal_placeholders
=
getattr
(
dummy_data_v1
,
"multi_modal_placeholders"
,
None
),
)
else
:
model_cls
,
_
=
get_model_architecture
(
model_config
)
if
is_encoder_data
:
...
...
@@ -462,13 +471,11 @@ class InputRegistry:
**
mm_processor_kwargs
,
)
if
is_encoder_decoder_inputs
(
processed_inputs
):
self
.
_ensure_mm_kwargs
(
processed_inputs
[
"encoder"
],
mm_processor_kwargs
)
self
.
_ensure_mm_kwargs
(
processed_inputs
[
"decoder"
],
mm_processor_kwargs
)
else
:
self
.
_ensure_mm_kwargs
(
processed_inputs
,
mm_processor_kwargs
)
encoder_inputs
,
decoder_inputs
=
split_enc_dec_inputs
(
processed_inputs
)
if
encoder_inputs
is
not
None
:
self
.
_ensure_mm_kwargs
(
encoder_inputs
,
mm_processor_kwargs
)
if
decoder_inputs
is
not
None
:
self
.
_ensure_mm_kwargs
(
decoder_inputs
,
mm_processor_kwargs
)
return
processed_inputs
...
...
vllm/lora/models.py
View file @
fcfc474d
...
...
@@ -272,7 +272,9 @@ class LoRAModel(AdapterModel):
f
" target modules in
{
expected_lora_modules
}
"
f
" but received
{
unexpected_modules
}
."
f
" Please verify that the loaded LoRA module is correct"
)
tensors
=
torch
.
load
(
lora_bin_file_path
,
map_location
=
device
)
tensors
=
torch
.
load
(
lora_bin_file_path
,
map_location
=
device
,
weights_only
=
True
)
else
:
raise
ValueError
(
f
"
{
lora_dir
}
doesn't contain tensors"
)
...
...
vllm/lora/ops/triton_ops/kernel_utils.py
View file @
fcfc474d
...
...
@@ -130,7 +130,7 @@ def do_expand_kernel(
# Identify A and B block pointers
offset_k
=
tl
.
arange
(
0
,
BLOCK_K
)
a_ptr
=
(
cur_input_ptr
+
ram
[:,
None
]
*
input_d1_stride
+
offset_k
[
None
,
:]
*
input_d2_stride
,
)
offset_k
[
None
,
:]
*
input_d2_stride
)
b_ptr
=
(
cur_lora_ptr
+
cur_lora_d0_stride
*
lora_index
+
offset_k
[:,
None
]
*
cur_lora_d2_stride
+
rbn
[
None
,
:]
*
cur_lora_d1_stride
)
...
...
vllm/lora/ops/triton_ops/lora_expand.py
View file @
fcfc474d
...
...
@@ -136,6 +136,7 @@ def _lora_expand(
num_tokens_per_lora
:
torch
.
Tensor
,
# shape [max-loras + 1]
lora_token_start_loc
:
torch
.
Tensor
,
# shape [max-loras + 2]
lora_ids
:
torch
.
Tensor
,
# shape [max-loras + 1]
no_lora_flag_cpu
:
torch
.
Tensor
,
# shape [1]
offset_start
:
int
=
0
,
add_inputs
:
bool
=
False
,
)
->
None
:
...
...
@@ -157,11 +158,19 @@ def _lora_expand(
identifies the region in token_indices_sorted_by_lora_ids that
LoRA lora_ids[i] should process.
lora_ids (torch.Tensor): LoRA ids to process.
no_lora_flag_cpu (torch.Tensor): A CPU tensor of size 1 that is True
when none of the requests require LoRA.
offset_start (int, optional): Offset start for output_tensor.
Defaults to 0.
add_inputs (bool, optional): Whether to add the input tensor to the
output tensor. Defaults to False.
"""
assert
no_lora_flag_cpu
.
numel
()
==
1
if
no_lora_flag_cpu
.
item
():
# None of the inputs require LoRA.
return
assert
inputs
.
dtype
in
[
torch
.
float16
,
torch
.
bfloat16
,
torch
.
float32
]
for
weight
in
lora_b_weights
:
assert
weight
.
dtype
in
[
torch
.
float16
,
torch
.
bfloat16
]
...
...
@@ -170,6 +179,8 @@ def _lora_expand(
assert
output_tensor
.
is_contiguous
()
# metadata sanity check.
M
=
inputs
.
size
(
1
)
assert
token_lora_mapping
.
size
(
0
)
==
M
assert
token_lora_mapping
.
size
(
0
)
==
token_indices_sorted_by_lora_ids
.
size
(
0
)
assert
lora_ids
.
size
(
0
)
==
num_tokens_per_lora
.
size
(
0
)
...
...
@@ -181,7 +192,6 @@ def _lora_expand(
inputs
.
device
)
K
=
lora_b_weights
[
0
].
shape
[
-
1
]
# K= rank
M
=
inputs
.
size
(
1
)
ADD_INPUTS
=
add_inputs
MAX_LORAS
=
lora_ids
.
size
(
0
)
CAST_TYPE
=
False
...
...
@@ -263,6 +273,7 @@ def _lora_expand_fake(
num_tokens_per_lora
:
torch
.
Tensor
,
lora_token_start_loc
:
torch
.
Tensor
,
lora_ids
:
torch
.
Tensor
,
no_lora_flag_cpu
:
torch
.
Tensor
,
offset_start
:
int
=
0
,
add_inputs
:
bool
=
False
,
)
->
None
:
...
...
vllm/lora/ops/triton_ops/lora_kernel_metadata.py
View file @
fcfc474d
...
...
@@ -17,6 +17,17 @@ class LoRAKernelMeta:
num_tokens_per_lora
:
torch
.
Tensor
lora_token_start_loc
:
torch
.
Tensor
# The V1 architecture uses the traced torch.compile graphs to execute
# a forward pass. Things to note about this process,
# 1. The tracing infers all python scalar datatype objects into a constant
# value.
# 2. The tracing cannot handle dynamic control flow. (dynamic control flow
# is an experimental feature in pytorch)
# 3. The internals of torch.ops functions are not traced.
# We disguise the "no_lora" flag as a cpu tensor and leverage point number 3
# to early exit from inside the lora_expand / lora_shrink torch operation.
no_lora_flag_cpu
:
torch
.
Tensor
@
staticmethod
def
make
(
max_loras
:
int
,
max_num_tokens
:
int
,
device
:
Union
[
torch
.
device
,
str
])
->
"LoRAKernelMeta"
:
...
...
@@ -47,17 +58,24 @@ class LoRAKernelMeta:
lora_token_start_loc
=
torch
.
zeros
(
max_loras
+
2
,
dtype
=
torch
.
int32
,
device
=
device
)
no_lora_flag_cpu
=
torch
.
tensor
([
False
],
dtype
=
torch
.
bool
,
device
=
'cpu'
)
return
LoRAKernelMeta
(
token_lora_mapping
=
token_lora_mapping
,
token_indices_sorted_by_lora_ids
=
token_indices_sorted_by_lora_ids
,
active_lora_ids
=
active_lora_ids
,
num_tokens_per_lora
=
num_tokens_per_lora
,
lora_token_start_loc
=
lora_token_start_loc
)
lora_token_start_loc
=
lora_token_start_loc
,
no_lora_flag_cpu
=
no_lora_flag_cpu
)
def
_reset
(
self
):
self
.
active_lora_ids
.
fill_
(
-
1
)
self
.
num_tokens_per_lora
.
fill_
(
0
)
self
.
lora_token_start_loc
.
fill_
(
0
)
self
.
no_lora_flag_cpu
.
fill_
(
False
)
def
prepare_tensors
(
self
,
token_lora_mapping
:
torch
.
Tensor
)
->
None
:
"""
...
...
@@ -70,6 +88,14 @@ class LoRAKernelMeta:
self
.
_reset
()
# Check and record no-lora case.
no_lora
=
torch
.
all
(
token_lora_mapping
==
-
1
)
self
.
no_lora_flag_cpu
[
0
]
=
no_lora
if
no_lora
:
# Early exit. LoRA kernels will not be run.
return
num_tokens
=
token_lora_mapping
.
size
(
0
)
# copy token lora mapping
...
...
@@ -100,7 +126,7 @@ class LoRAKernelMeta:
def
meta_args
(
self
,
token_nums
:
int
)
->
Tuple
[
torch
.
Tensor
,
torch
.
Tensor
,
torch
.
Tensor
,
torch
.
Tensor
,
torch
.
Tensor
]:
torch
.
Tensor
,
torch
.
Tensor
]:
"""
This function returns the kernel metadata required for the current
forward pass execution of the kernel. The function returns all the
...
...
@@ -111,7 +137,11 @@ class LoRAKernelMeta:
token_nums (int): Number of input tokens in the current forward
pass.
"""
return
(
self
.
token_lora_mapping
[:
token_nums
],
self
.
token_indices_sorted_by_lora_ids
[:
token_nums
],
self
.
num_tokens_per_lora
,
self
.
lora_token_start_loc
,
self
.
active_lora_ids
)
return
(
self
.
token_lora_mapping
[:
token_nums
],
self
.
token_indices_sorted_by_lora_ids
[:
token_nums
],
self
.
num_tokens_per_lora
,
self
.
lora_token_start_loc
,
self
.
active_lora_ids
,
self
.
no_lora_flag_cpu
,
)
vllm/lora/ops/triton_ops/lora_shrink.py
View file @
fcfc474d
...
...
@@ -106,6 +106,7 @@ def _lora_shrink(
num_tokens_per_lora
:
torch
.
Tensor
,
# shape [max-loras + 1]
lora_token_start_loc
:
torch
.
Tensor
,
# shape [max-loras + 2]
lora_ids
:
torch
.
Tensor
,
# shape [max-loras + 1]
no_lora_flag_cpu
:
torch
.
Tensor
,
# shape [1]
scaling
:
float
,
)
->
None
:
"""
...
...
@@ -126,8 +127,16 @@ def _lora_shrink(
identifies the region in token_indices_sorted_by_lora_ids that
LoRA lora_ids[i] should process.
lora_ids (torch.Tensor): LoRA ids to process.
no_lora_flag_cpu (torch.Tensor): A CPU tensor of size 1 that is True
when none of the requests require LoRA.
scaling (float): Scaling factor.
"""
assert
no_lora_flag_cpu
.
numel
()
==
1
if
no_lora_flag_cpu
.
item
():
# None of the inputs require LoRA.
return
assert
inputs
.
dtype
==
lora_a_weights
[
0
].
dtype
assert
inputs
.
dtype
in
[
torch
.
float16
,
torch
.
bfloat16
]
for
weight
in
lora_a_weights
:
...
...
@@ -138,6 +147,8 @@ def _lora_shrink(
assert
output_tensor
.
is_contiguous
()
# metadata sanity check
M
=
inputs
.
size
(
0
)
assert
token_lora_mapping
.
size
(
0
)
==
M
assert
token_lora_mapping
.
size
(
0
)
==
token_indices_sorted_by_lora_ids
.
size
(
0
)
assert
lora_ids
.
size
(
0
)
==
num_tokens_per_lora
.
size
(
0
)
...
...
@@ -146,7 +157,6 @@ def _lora_shrink(
(
lora_ptr_tensor
,
lora_strides_d0
,
lora_strides_d1
,
lora_strides_d2
)
=
_get_lora_a_ptr
(
lora_a_weights
,
inputs
.
device
)
N
,
K
=
lora_a_weights
[
0
].
shape
[
-
2
:]
# K=hidden_size,N=rank
M
=
inputs
.
size
(
0
)
NUM_SLICES
=
len
(
lora_a_weights
)
MAX_LORAS
=
lora_ids
.
size
(
0
)
...
...
@@ -218,6 +228,7 @@ def _lora_shrink_fake(
num_tokens_per_lora
:
torch
.
Tensor
,
lora_token_start_loc
:
torch
.
Tensor
,
lora_ids
:
torch
.
Tensor
,
no_lora_flag_cpu
:
torch
.
Tensor
,
scaling
:
float
,
)
->
None
:
return
...
...
vllm/model_executor/guided_decoding/__init__.py
View file @
fcfc474d
...
...
@@ -5,10 +5,10 @@ from __future__ import annotations
from
typing
import
TYPE_CHECKING
from
vllm.logger
import
init_logger
from
vllm.model_executor.guided_decoding.reasoner
import
get_reasoner
from
vllm.model_executor.guided_decoding.utils
import
(
convert_lark_to_gbnf
,
grammar_is_likely_lark
,
has_lmf_unsupported_json_features
,
has_xgrammar_unsupported_json_features
)
from
vllm.reasoning
import
ReasoningParserManager
if
TYPE_CHECKING
:
from
transformers
import
PreTrainedTokenizer
...
...
@@ -79,12 +79,6 @@ def maybe_backend_fallback(
"xgrammar does not support Lark grammars and the "
"grammar failed to convert to GBNF."
,
"outlines"
)
elif
guided_params
.
json_object
:
# https://github.com/mlc-ai/xgrammar/issues/256
fallback_or_error
(
guided_params
,
"xgrammar does not support json_object."
,
"guidance"
)
# If the xgrammar module cannot be imported successfully,
# we should still allow users to use guided decoding with a fallback.
elif
not
xgr_installed
:
...
...
@@ -107,7 +101,11 @@ async def get_guided_decoding_logits_processor(
model_config
:
ModelConfig
,
reasoning_backend
:
str
|
None
=
None
)
->
LogitsProcessor
|
None
:
reasoner
=
get_reasoner
(
tokenizer
,
reasoning_backend
)
reasoner
=
None
if
reasoning_backend
is
not
None
:
reasoner_class
=
ReasoningParserManager
.
get_reasoning_parser
(
reasoning_backend
)
reasoner
=
reasoner_class
(
tokenizer
)
guided_params
=
maybe_backend_fallback
(
guided_params
)
...
...
@@ -146,8 +144,11 @@ def get_local_guided_decoding_logits_processor(
reasoning_backend
:
str
|
None
=
None
)
->
LogitsProcessor
|
None
:
guided_params
=
maybe_backend_fallback
(
guided_params
)
# Get the reasoner if needed; it will be None if reasoning_backend is None.
reasoner
=
get_reasoner
(
tokenizer
,
reasoning_backend
)
reasoner
=
None
if
reasoning_backend
is
not
None
:
reasoner_class
=
ReasoningParserManager
.
get_reasoning_parser
(
reasoning_backend
)
reasoner
=
reasoner_class
(
tokenizer
)
# CFG grammar not supported by LMFE, so we use outlines instead
if
guided_params
.
backend_name
==
'outlines'
:
...
...
vllm/model_executor/guided_decoding/guidance_decoding.py
View file @
fcfc474d
...
...
@@ -18,14 +18,22 @@ def get_local_guidance_guided_decoding_logits_processor(
"""
grm
=
""
any_whitespace
=
'disable-any-whitespace'
not
in
\
guided_params
.
backend_options
()
if
guided_params
.
json
:
grm
=
llguidance
.
LLMatcher
.
grammar_from_json_schema
(
guided_params
.
json
,
overrides
=
{
"whitespace_pattern"
:
guided_params
.
whitespace_pattern
})
overrides
=
{
"whitespace_pattern"
:
guided_params
.
whitespace_pattern
},
defaults
=
{
"whitespace_flexible"
:
any_whitespace
,
})
elif
guided_params
.
json_object
:
grm
=
llguidance
.
LLMatcher
.
grammar_from_json_schema
(
'{"type": "object"}'
,
overrides
=
{
"whitespace_pattern"
:
guided_params
.
whitespace_pattern
})
overrides
=
{
"whitespace_pattern"
:
guided_params
.
whitespace_pattern
},
defaults
=
{
"whitespace_flexible"
:
any_whitespace
,
})
elif
guided_params
.
regex
:
grm
=
llguidance
.
grammar_from
(
"regex"
,
guided_params
.
regex
)
elif
guided_params
.
choice
:
...
...
vllm/model_executor/guided_decoding/outlines_decoding.py
View file @
fcfc474d
...
...
@@ -12,7 +12,7 @@ from transformers import PreTrainedTokenizerBase
from
vllm.model_executor.guided_decoding.outlines_logits_processors
import
(
CFGLogitsProcessor
,
JSONLogitsProcessor
,
RegexLogitsProcessor
)
from
vllm.
model_executor.guided_decoding.reasoner
import
Reasoner
from
vllm.
reasoning
import
Reason
ingPars
er
from
vllm.sampling_params
import
GuidedDecodingParams
...
...
@@ -61,7 +61,7 @@ _MAX_THREADPOOL_WORKERS = 16
async
def
get_outlines_guided_decoding_logits_processor
(
guided_params
:
GuidedDecodingParams
,
tokenizer
:
PreTrainedTokenizerBase
,
reasoner
:
Optional
[
Reasoner
],
reasoner
:
Optional
[
Reason
ingPars
er
],
)
->
Union
[
JSONLogitsProcessor
,
RegexLogitsProcessor
,
CFGLogitsProcessor
,
None
]:
"""
...
...
@@ -92,7 +92,7 @@ async def get_outlines_guided_decoding_logits_processor(
def
get_local_outlines_guided_decoding_logits_processor
(
guided_params
:
GuidedDecodingParams
,
tokenizer
:
PreTrainedTokenizerBase
,
reasoner
:
Optional
[
Reasoner
],
reasoner
:
Optional
[
Reason
ingPars
er
],
)
->
Union
[
JSONLogitsProcessor
,
RegexLogitsProcessor
,
CFGLogitsProcessor
,
None
]:
"""
...
...
@@ -141,7 +141,7 @@ def _get_logits_processor(
tokenizer
:
PreTrainedTokenizerBase
,
mode
:
GuidedDecodingMode
,
whitespace_pattern
:
Union
[
str
,
None
],
reasoner
:
Optional
[
Reasoner
],
reasoner
:
Optional
[
Reason
ingPars
er
],
)
->
Union
[
JSONLogitsProcessor
,
RegexLogitsProcessor
,
CFGLogitsProcessor
]:
if
mode
==
GuidedDecodingMode
.
JSON
:
return
JSONLogitsProcessor
(
guide
,
tokenizer
,
whitespace_pattern
,
...
...
vllm/model_executor/guided_decoding/outlines_logits_processors.py
View file @
fcfc474d
...
...
@@ -34,8 +34,8 @@ from transformers import PreTrainedTokenizerBase
import
vllm.envs
as
envs
from
vllm.logger
import
init_logger
from
vllm.model_executor.guided_decoding.reasoner
import
Reasoner
from
vllm.platforms
import
current_platform
from
vllm.reasoning
import
ReasoningParser
logger
=
init_logger
(
__name__
)
...
...
@@ -49,9 +49,9 @@ else:
class
BaseLogitsProcessor
:
def
__init__
(
self
,
guide
:
Guide
,
reasoner
:
Optional
[
Reasoner
]):
def
__init__
(
self
,
guide
:
Guide
,
reasoner
:
Optional
[
Reason
ingPars
er
]):
self
.
_guide
:
Guide
=
guide
self
.
_reasoner
:
Optional
[
Reasoner
]
=
reasoner
self
.
_reasoner
:
Optional
[
Reason
ingPars
er
]
=
reasoner
# CFGState is used for the FSM state for CFGGuide
self
.
_fsm_state
:
DefaultDict
[
int
,
Union
[
int
,
CFGState
]]
=
defaultdict
(
int
)
...
...
@@ -69,7 +69,7 @@ class BaseLogitsProcessor:
# Remove the reasoning tokens from the input_ids
# We need this because our implementation relies on the
# hash of the input_ids to store the FSM state.
input_ids
=
self
.
_reasoner
.
extract_content
(
input_ids
)
input_ids
=
self
.
_reasoner
.
extract_content
_ids
(
input_ids
)
seq_id
=
hash
(
tuple
(
input_ids
))
...
...
@@ -142,7 +142,7 @@ class RegexLogitsProcessor(BaseLogitsProcessor):
self
,
regex_string
:
str
,
tokenizer
:
PreTrainedTokenizerBase
,
reasoner
:
Optional
[
Reasoner
],
reasoner
:
Optional
[
Reason
ingPars
er
],
):
"""Compile the FSM that drives the regex-structured generation.
...
...
@@ -163,7 +163,7 @@ class JSONLogitsProcessor(RegexLogitsProcessor):
def
__init__
(
self
,
schema
:
Union
[
str
,
Dict
,
BaseModel
],
tokenizer
:
PreTrainedTokenizerBase
,
whitespace_pattern
:
Union
[
str
,
None
],
reasoner
:
Optional
[
Reasoner
]):
reasoner
:
Optional
[
Reason
ingPars
er
]):
"""Compile the FSM that drives the JSON-guided generation.
Parameters
...
...
@@ -203,7 +203,7 @@ class CFGLogitsProcessor(BaseLogitsProcessor):
return
CFGGuide
(
cfg
,
tokenizer
)
def
__init__
(
self
,
cfg
:
str
,
tokenizer
:
PreTrainedTokenizerBase
,
reasoner
:
Optional
[
Reasoner
]):
reasoner
:
Optional
[
Reason
ingPars
er
]):
"""Compile the FSM that drives the context free grammar generation.
Parameters
...
...
vllm/model_executor/guided_decoding/reasoner/__init__.py
View file @
fcfc474d
...
...
@@ -19,6 +19,10 @@ def get_reasoner(tokenizer: PreTrainedTokenizer,
return
None
elif
reasoning_backend
==
"deepseek_r1"
:
return
DeepSeekReasoner
.
from_tokenizer
(
tokenizer
)
elif
reasoning_backend
==
"granite"
:
logger
.
warning
(
"Granite reasoner not yet implemented for structured outputs"
)
return
None
else
:
# Raise a warning for unknown reasoning backend and return None
# We cannot raise an error here because some reasoning models
...
...
vllm/model_executor/guided_decoding/reasoner/deepseek_reasoner.py
deleted
100644 → 0
View file @
bb94d2e5
# SPDX-License-Identifier: Apache-2.0
from
dataclasses
import
dataclass
from
transformers
import
PreTrainedTokenizer
from
vllm.model_executor.guided_decoding.reasoner.reasoner
import
Reasoner
@dataclass
class DeepSeekReasoner(Reasoner):
    """
    Reasoner for DeepSeek R series models.

    The reasoning section is delimited by ``start_token`` / ``end_token``;
    their token ids are stored in ``start_token_id`` / ``end_token_id``.
    """
    # Token ids of the reasoning delimiters (see ``from_tokenizer``).
    start_token_id: int
    end_token_id: int

    # Textual form of the reasoning delimiters.
    start_token: str = "<think>"
    end_token: str = "</think>"

    @classmethod
    def from_tokenizer(cls, tokenizer: PreTrainedTokenizer) -> Reasoner:
        """
        Build a reasoner by encoding the delimiter strings with `tokenizer`.

        The delimiters are read from the class-level defaults rather than
        repeated as literals, so the strings are defined in one place only.
        """
        # Only the first id of each encoding is kept.
        # NOTE(review): assumes each delimiter encodes to a single token
        # for the given tokenizer -- confirm.
        start_ids = tokenizer.encode(cls.start_token,
                                     add_special_tokens=False)
        end_ids = tokenizer.encode(cls.end_token, add_special_tokens=False)
        return cls(start_token_id=start_ids[0], end_token_id=end_ids[0])

    def is_reasoning_end(self, input_ids: list[int]) -> bool:
        """Return True if the end-of-reasoning token id is in `input_ids`."""
        return self.end_token_id in input_ids

    def extract_content(self, input_ids: list[int]) -> list[int]:
        """
        Extract the content after the end tokens

        Returns the ids following the first occurrence of ``end_token_id``,
        or an empty list when the end token is absent or is the last id.
        """
        # One scan of the list instead of a membership test followed by two
        # separate ``index`` calls.
        try:
            end_index = input_ids.index(self.end_token_id)
        except ValueError:
            return []
        # When the end token is the final id this slice is already empty.
        return input_ids[end_index + 1:]
vllm/model_executor/guided_decoding/reasoner/reasoner.py
deleted
100644 → 0
View file @
bb94d2e5
# SPDX-License-Identifier: Apache-2.0
from
__future__
import
annotations
from
abc
import
ABC
,
abstractmethod
from
dataclasses
import
dataclass
from
transformers
import
PreTrainedTokenizer
@dataclass
class Reasoner(ABC):
    """Abstract interface for reasoners used by guided decoding.

    Concrete subclasses provide a tokenizer-based constructor, a check for
    whether reasoning has ended, and extraction of the content token ids.
    """

    # ``@classmethod`` must be the outer decorator so the abstract
    # declaration matches how subclasses implement and call it
    # (``SubClass.from_tokenizer(tokenizer)``).
    @classmethod
    @abstractmethod
    def from_tokenizer(cls, tokenizer: PreTrainedTokenizer) -> Reasoner:
        """Alternate constructor: build a reasoner from `tokenizer`."""
        pass

    @abstractmethod
    def is_reasoning_end(self, input_ids: list[int]) -> bool:
        """Return whether `input_ids` indicate that reasoning has ended."""
        pass

    @abstractmethod
    def extract_content(self, input_ids: list[int]) -> list[int]:
        """Return the content token ids extracted from `input_ids`."""
        pass
vllm/model_executor/guided_decoding/xgrammar_decoding.py
View file @
fcfc474d
...
...
@@ -27,7 +27,7 @@ if TYPE_CHECKING:
from
transformers
import
PreTrainedTokenizer
from
vllm.config
import
ModelConfig
from
vllm.
model_executor.guided_decoding.reasoner
import
Reasoner
from
vllm.
reasoning
import
Reason
ingPars
er
from
vllm.sampling_params
import
GuidedDecodingParams
logger
=
init_logger
(
__name__
)
...
...
@@ -37,7 +37,7 @@ def get_local_xgrammar_guided_decoding_logits_processor(
guided_params
:
GuidedDecodingParams
,
tokenizer
:
PreTrainedTokenizer
,
model_config
:
ModelConfig
,
reasoner
:
Reasoner
|
None
,
reasoner
:
Reason
ingPars
er
|
None
,
max_threads
:
int
=
8
):
config
=
GrammarConfig
.
from_guided_params
(
guided_params
=
guided_params
,
model_config
=
model_config
,
...
...
@@ -280,7 +280,7 @@ class GrammarConfig:
class
XGrammarLogitsProcessor
:
"""Wrapper class to support pickle protocol"""
config
:
GrammarConfig
reasoner
:
Reasoner
|
None
=
None
reasoner
:
Reason
ingPars
er
|
None
=
None
ctx
:
xgr
.
CompiledGrammar
|
None
=
None
tokenizer_info
:
xgr
.
TokenizerInfo
=
None
# type: ignore[assignment]
...
...
@@ -320,7 +320,10 @@ class XGrammarLogitsProcessor:
elif
self
.
config
.
grammar_str
is
not
None
:
self
.
ctx
=
compiler
.
compile_grammar
(
self
.
config
.
grammar_str
)
elif
self
.
config
.
json_object
:
self
.
ctx
=
compiler
.
compile_builtin_json_grammar
()
any_whitespace
=
self
.
config
.
any_whitespace
self
.
ctx
=
compiler
\
.
compile_json_schema
(
'{"type": "object"}'
,
any_whitespace
=
any_whitespace
)
else
:
raise
ValueError
(
"Invalid configuration for xgrammar logits processor"
)
...
...
Prev
1
…
13
14
15
16
17
18
19
20
21
…
26
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment