Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
31f6b24f
Commit
31f6b24f
authored
Mar 26, 2025
by
zhuwenwen
Browse files
Merge remote-tracking branch 'mirror/v0.8.2' into v0.8.2-ori
parents
89d1dd57
25f560a6
Changes
88
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
261 additions
and
34 deletions
+261
-34
vllm/v1/structured_output/__init__.py
vllm/v1/structured_output/__init__.py
+3
-0
vllm/v1/structured_output/backend_guidance.py
vllm/v1/structured_output/backend_guidance.py
+164
-0
vllm/v1/structured_output/request.py
vllm/v1/structured_output/request.py
+26
-21
vllm/v1/structured_output/utils.py
vllm/v1/structured_output/utils.py
+1
-1
vllm/v1/worker/gpu_input_batch.py
vllm/v1/worker/gpu_input_batch.py
+5
-0
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+40
-10
vllm/vllm_flash_attn/fa_utils.py
vllm/vllm_flash_attn/fa_utils.py
+6
-0
vllm/worker/hpu_model_runner.py
vllm/worker/hpu_model_runner.py
+16
-2
No files found.
vllm/v1/structured_output/__init__.py
View file @
31f6b24f
...
...
@@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Optional
from
vllm.config
import
VllmConfig
from
vllm.logger
import
init_logger
from
vllm.v1.structured_output.backend_guidance
import
GuidanceBackend
from
vllm.v1.structured_output.backend_types
import
(
StructuredOutputBackend
,
StructuredOutputGrammar
)
...
...
@@ -50,6 +51,8 @@ class StructuredOutputManager:
XgrammarBackend
)
self
.
backend
=
XgrammarBackend
(
self
.
vllm_config
)
elif
backend_name
==
"guidance"
:
self
.
backend
=
GuidanceBackend
(
self
.
vllm_config
)
else
:
raise
ValueError
(
f
"Unsupported structured output backend:
{
backend_name
}
"
)
...
...
vllm/v1/structured_output/backend_guidance.py
0 → 100644
View file @
31f6b24f
# SPDX-License-Identifier: Apache-2.0
import
os
from
dataclasses
import
dataclass
from
typing
import
TYPE_CHECKING
,
Optional
import
torch
from
vllm.config
import
VllmConfig
from
vllm.logger
import
init_logger
from
vllm.sampling_params
import
SamplingParams
from
vllm.transformers_utils.tokenizer_group
import
init_tokenizer_from_configs
from
vllm.utils
import
LazyLoader
from
vllm.v1.structured_output.backend_types
import
(
StructuredOutputBackend
,
StructuredOutputGrammar
,
StructuredOutputOptions
)
from
vllm.v1.structured_output.request
import
get_structured_output_key
if
TYPE_CHECKING
:
import
llguidance
import
llguidance.hf
as
llguidance_hf
import
llguidance.torch
as
llguidance_torch
else
:
llguidance
=
LazyLoader
(
"llguidance"
,
globals
(),
"llguidance"
)
llguidance_hf
=
LazyLoader
(
"llguidance.hf"
,
globals
(),
"llguidance.hf"
)
llguidance_torch
=
LazyLoader
(
"llguidance.torch"
,
globals
(),
"llguidance.torch"
)
logger
=
init_logger
(
__name__
)
class GuidanceBackend(StructuredOutputBackend):
    """Structured-output backend that builds llguidance matchers."""

    def __init__(self, vllm_config: VllmConfig):
        """Set up the llguidance tokenizer from the engine configuration.

        Args:
            vllm_config: Engine configuration; its model, scheduler,
                parallel and LoRA sub-configs are used to initialise the
                tokenizer group.
        """
        self.vllm_config = vllm_config
        tokenizer_group = init_tokenizer_from_configs(
            model_config=vllm_config.model_config,
            scheduler_config=vllm_config.scheduler_config,
            parallel_config=vllm_config.parallel_config,
            lora_config=vllm_config.lora_config)  # type: ignore[arg-type]
        tokenizer_group.ping()
        self.vocab_size = vllm_config.model_config.get_vocab_size()

        # NOTE(review): passing None presumably selects the base (non-LoRA)
        # tokenizer -- confirm against the tokenizer group implementation.
        tokenizer = tokenizer_group.get_lora_tokenizer(None)
        self.ll_tokenizer = llguidance_hf.from_tokenizer(tokenizer, None)

    def compile_grammar(self, request_type: StructuredOutputOptions,
                        grammar_spec: str) -> StructuredOutputGrammar:
        """Compile a grammar specification into a ``GuidanceGrammar``.

        The serialized grammar is also kept on ``self.serialized_grammar``.
        The matcher log level is read from the ``LLGUIDANCE_LOG_LEVEL``
        environment variable (defaults to ``"1"``).

        Args:
            request_type: Kind of structured output being requested.
            grammar_spec: Specification string for that kind.

        Returns:
            A ``GuidanceGrammar`` wrapping a freshly created matcher. Any
            matcher error present after construction is logged via
            ``check_error``.
        """
        self.serialized_grammar = serialize_guidance_grammar(
            request_type, grammar_spec)

        ll_matcher = llguidance.LLMatcher(
            self.ll_tokenizer,
            self.serialized_grammar,
            log_level=int(os.environ.get("LLGUIDANCE_LOG_LEVEL", "1")),
        )

        r = GuidanceGrammar(
            ll_matcher=ll_matcher,
            ll_tokenizer=self.ll_tokenizer,
            vocab_size=self.vocab_size,
        )

        r.check_error()
        return r

    def allocate_token_bitmask(self, max_num_seqs: int):
        """Allocate a token bitmask for up to ``max_num_seqs`` sequences.

        The bitmask width is taken from the llguidance tokenizer's
        ``vocab_size`` rather than from ``self.vocab_size``.
        """
        return llguidance_torch.allocate_token_bitmask(
            max_num_seqs, self.ll_tokenizer.vocab_size)
@dataclass
class GuidanceGrammar(StructuredOutputGrammar):
    """Per-request grammar state backed by an llguidance matcher."""

    ll_matcher: llguidance.LLMatcher
    ll_tokenizer: llguidance.LLTokenizer
    vocab_size: int
    # Set once a matcher error has been logged, so it is reported only once.
    printed_error: bool = False
    # Set once the tokenizer's EOS token has been seen in accepted tokens.
    terminated: bool = False

    def check_error(self):
        """Log the matcher's error, if any, the first time it is observed."""
        if self.printed_error:
            return
        err = self.ll_matcher.get_error()
        if not err:
            return
        self.printed_error = True
        logger.warning("LLMatcher error: %s", err)

    def accept_tokens(self, request_id: str, tokens: list[int]) -> bool:
        """Feed ``tokens`` to the matcher and advance the parser.

        Marks the grammar as terminated when the EOS token is among
        ``tokens``. If the matcher is already stopped, nothing is consumed
        and True is returned.

        Returns True if the parser was advanced successfully and False if
        it failed to advance.
        """
        saw_eos = self.ll_tokenizer.eos_token in tokens
        if saw_eos:
            self.terminated = True

        if self.ll_matcher.is_stopped():
            return True

        # TODO - Add jump decoding support in the future:
        # self.ll_matcher.compute_ff_bytes() - this should always work
        # self.ll_matcher.compute_ff_tokens() - this only works for
        # "canonical" tokenizers
        # For conversion between the two, see
        # https://github.com/guidance-ai/llguidance/blob/main/docs/fast_forward.md

        advanced = self.ll_matcher.consume_tokens(tokens)
        self.check_error()
        return advanced

    def fill_bitmask(self, bitmask: torch.Tensor, idx: int) -> None:
        """Write the next-token mask for this grammar into ``bitmask[idx]``."""
        # this will automatically return [EOS] mask if the matcher is stopped
        # or otherwise in an error state
        llguidance_torch.fill_next_token_bitmask(
            self.ll_matcher,
            bitmask,
            idx,
        )
        self.check_error()

    def is_terminated(self) -> bool:
        """Report whether the EOS token has been accepted."""
        return self.terminated

    def reset(self):
        """Reset the underlying matcher."""
        # This method may be not needed anymore? TODO
        self.ll_matcher.reset()
def serialize_guidance_grammar(request_type: StructuredOutputOptions,
                               grammar_spec: str) -> str:
    """Turn a structured-output request into an llguidance grammar string.

    Args:
        request_type: Kind of structured output requested.
        grammar_spec: Specification for that kind. Ignored for
            ``JSON_OBJECT``, which always uses a generic object schema.

    Returns:
        The serialized grammar produced by llguidance.

    Raises:
        ValueError: If ``request_type`` is not one of the supported kinds.
    """
    # TODO: make whitespace_flexible configurable
    json_defaults = {
        "whitespace_flexible": True,
    }

    if request_type == StructuredOutputOptions.JSON:
        return llguidance.LLMatcher.grammar_from_json_schema(
            grammar_spec, defaults=json_defaults)

    if request_type == StructuredOutputOptions.JSON_OBJECT:
        return llguidance.LLMatcher.grammar_from_json_schema(
            '{"type": "object"}', defaults=json_defaults)

    # The remaining kinds map directly onto an llguidance grammar type name.
    grammar_kinds = (
        (StructuredOutputOptions.REGEX, "regex"),
        (StructuredOutputOptions.GRAMMAR, "grammar"),
        (StructuredOutputOptions.CHOICE, "choice"),
    )
    for option, tp in grammar_kinds:
        if request_type == option:
            return llguidance.grammar_from(tp, grammar_spec)

    logger.error("Validation should have already occurred. "
                 "Please file an issue.")
    raise ValueError("grammar is not of valid supported types. "
                     f"({request_type!s})")
def validate_guidance_grammar(
        sampling_params: SamplingParams,
        tokenizer: Optional[llguidance.LLTokenizer] = None) -> None:
    """Check that the request's grammar is accepted by llguidance.

    Args:
        sampling_params: Sampling parameters from which the structured
            output key (kind and spec) is derived.
        tokenizer: Optional llguidance tokenizer forwarded to the
            validation call.

    Raises:
        ValueError: If llguidance reports an error for the grammar.
    """
    option, spec = get_structured_output_key(sampling_params)
    serialized = serialize_guidance_grammar(option, spec)
    err = llguidance.LLMatcher.validate_grammar(serialized,
                                                tokenizer=tokenizer)
    if not err:
        return
    raise ValueError(f"Grammar error: {err}")
vllm/v1/structured_output/request.py
View file @
31f6b24f
...
...
@@ -53,25 +53,30 @@ class StructuredOutputRequest:
@
functools
.
cached_property
def
structured_output_key
(
self
)
->
StructuredOutputKey
:
params
=
self
.
sampling_params
.
guided_decoding
assert
params
is
not
None
,
"params can't be None."
if
params
.
json
is
not
None
:
if
not
isinstance
(
params
.
json
,
str
):
json_str
=
json
.
dumps
(
params
.
json
)
else
:
json_str
=
params
.
json
return
(
StructuredOutputOptions
.
JSON
,
json_str
)
el
if
params
.
json
_object
:
return
(
StructuredOutputOptions
.
JSON_OBJECT
,
""
)
el
if
params
.
regex
is
not
Non
e
:
return
(
StructuredOutputOptions
.
REGEX
,
params
.
regex
)
elif
params
.
choice
is
not
None
:
if
not
isinstance
(
params
.
choice
,
str
)
:
json_str
=
json
.
dumps
(
params
.
choice
)
els
e
:
json_str
=
params
.
choice
return
(
StructuredOutputOptions
.
CHOICE
,
json_str
)
el
if
params
.
grammar
is
not
None
:
return
(
StructuredOutputOptions
.
GRAMMAR
,
params
.
grammar
)
return
get_structured_output_key
(
self
.
sampling_params
)
def
get_structured_output_key
(
sampling_params
:
SamplingParams
)
->
StructuredOutputKey
:
params
=
sampling_params
.
guided_decoding
assert
params
is
not
None
,
"params can't be None."
if
params
.
json
is
not
None
:
if
not
isinstance
(
params
.
json
,
str
)
:
json_str
=
json
.
dumps
(
params
.
json
)
el
s
e
:
json_str
=
params
.
json
return
(
StructuredOutputOptions
.
JSON
,
json_str
)
elif
params
.
json_object
:
return
(
StructuredOutputOptions
.
JSON_OBJECT
,
""
)
elif
params
.
regex
is
not
Non
e
:
return
(
StructuredOutputOptions
.
REGEX
,
params
.
regex
)
elif
params
.
choice
is
not
None
:
if
not
isinstance
(
params
.
choice
,
str
)
:
json_str
=
json
.
dumps
(
params
.
choice
)
else
:
raise
ValueError
(
"No valid structured output parameter found"
)
json_str
=
params
.
choice
return
(
StructuredOutputOptions
.
CHOICE
,
json_str
)
elif
params
.
grammar
is
not
None
:
return
(
StructuredOutputOptions
.
GRAMMAR
,
params
.
grammar
)
else
:
raise
ValueError
(
"No valid structured output parameter found"
)
vllm/v1/structured_output/utils.py
View file @
31f6b24f
...
...
@@ -239,7 +239,7 @@ def choice_as_grammar(choice: list[str]) -> str:
return
grammar
def
validate_structured_output_request
(
def
validate_structured_output_request
_xgrammar
(
sampling_params
:
SamplingParams
)
->
None
:
"""Validate that the request is supported by structured output.
...
...
vllm/v1/worker/gpu_input_batch.py
View file @
31f6b24f
...
...
@@ -11,6 +11,7 @@ from vllm.lora.request import LoRARequest
from
vllm.multimodal
import
MultiModalKwargs
from
vllm.sampling_params
import
SamplingParams
,
SamplingType
from
vllm.utils
import
swap_dict_values
from
vllm.v1.outputs
import
LogprobsTensors
from
vllm.v1.sample.metadata
import
SamplingMetadata
from
vllm.v1.utils
import
copy_slice
from
vllm.v1.worker.block_table
import
BlockTable
...
...
@@ -197,6 +198,9 @@ class InputBatch:
# that are currently in the prefill phase.
self
.
num_prompt_logprobs
:
dict
[
str
,
int
]
=
{}
# To accumulate prompt logprobs tensor chunks across prefill steps.
self
.
in_progress_prompt_logprobs_cpu
:
dict
[
str
,
LogprobsTensors
]
=
{}
self
.
logit_bias
:
list
[
Optional
[
dict
[
int
,
float
]]]
=
[
None
]
*
max_num_reqs
self
.
has_allowed_token_ids
:
set
[
str
]
=
set
()
...
...
@@ -362,6 +366,7 @@ class InputBatch:
self
.
generators
.
pop
(
req_index
,
None
)
self
.
num_logprobs
.
pop
(
req_id
,
None
)
self
.
num_prompt_logprobs
.
pop
(
req_id
,
None
)
self
.
in_progress_prompt_logprobs_cpu
.
pop
(
req_id
,
None
)
# LoRA
lora_id
=
self
.
request_lora_mapping
[
req_index
]
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
31f6b24f
...
...
@@ -1059,7 +1059,10 @@ class GPUModelRunner(LoRAModelRunnerMixin):
sampling_metadata
=
sampling_metadata
,
)
else
:
# TODO(woosuk): Optimize the memory usage.
# When indexing with a tensor (bonus_logits_indices), PyTorch
# creates a new tensor with separate storage from the original
# logits tensor. This means any in-place operations on bonus_logits
# won't affect the original logits tensor.
bonus_logits
=
logits
[
spec_decode_metadata
.
bonus_logits_indices
]
sampler_output
=
self
.
model
.
sample
(
logits
=
bonus_logits
,
...
...
@@ -1067,7 +1070,9 @@ class GPUModelRunner(LoRAModelRunnerMixin):
)
bonus_token_ids
=
sampler_output
.
sampled_token_ids
# TODO(woosuk): Optimize the memory usage.
# Just like `bonus_logits`, `target_logits` is a new tensor with
# separate storage from the original `logits` tensor. Therefore,
# it is safe to update `target_logits` in place.
target_logits
=
logits
[
spec_decode_metadata
.
target_logits_indices
]
output_token_ids
=
self
.
rejection_sampler
(
spec_decode_metadata
,
...
...
@@ -1191,6 +1196,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
if
not
num_prompt_logprobs_dict
:
return
{}
in_progress_dict
=
self
.
input_batch
.
in_progress_prompt_logprobs_cpu
prompt_logprobs_dict
:
dict
[
str
,
Optional
[
LogprobsTensors
]]
=
{}
# Since prompt logprobs are a rare feature, prioritize simple,
...
...
@@ -1206,16 +1212,36 @@ class GPUModelRunner(LoRAModelRunnerMixin):
prompt_token_ids
=
torch
.
tensor
(
request
.
prompt_token_ids
).
to
(
self
.
device
,
non_blocking
=
True
)
# Set up target LogprobsTensors object.
logprobs_tensors
=
in_progress_dict
.
get
(
req_id
)
if
not
logprobs_tensors
:
# Create empty logprobs CPU tensors for the entire prompt.
# If chunked, we'll copy in slice by slice.
logprobs_tensors
=
LogprobsTensors
.
empty_cpu
(
num_prompt_tokens
-
1
,
num_prompt_logprobs
+
1
)
in_progress_dict
[
req_id
]
=
logprobs_tensors
# Determine number of logits to retrieve.
start_tok
=
request
.
num_computed_tokens
+
1
start_idx
=
request
.
num_computed_tokens
start_tok
=
start_idx
+
1
num_remaining_tokens
=
num_prompt_tokens
-
start_tok
if
num_tokens
<
num_remaining_tokens
:
if
num_tokens
<
=
num_remaining_tokens
:
# This is a chunk, more tokens remain.
# In the == case, there are no more prompt logprobs to produce
# but we want to defer returning them to the next step where we
# have new generated tokens to return.
num_logits
=
num_tokens
else
:
# This is the last chunk of prompt tokens to return.
num_logits
=
num_remaining_tokens
completed_prefill_reqs
.
append
(
req_id
)
prompt_logprobs_dict
[
req_id
]
=
logprobs_tensors
if
num_logits
<=
0
:
# This can happen for the final chunk if we prefilled exactly
# (num_prompt_tokens - 1) tokens for this request in the prior
# step. There are no more prompt logprobs to produce.
continue
# Get the logits corresponding to this req's prompt tokens.
# If this is a partial request (i.e. chunked prefill),
...
...
@@ -1236,19 +1262,23 @@ class GPUModelRunner(LoRAModelRunnerMixin):
logprobs
,
num_prompt_logprobs
,
tgt_token_ids
)
# Transfer GPU->CPU async.
prompt_logprobs_dict
[
req_id
]
=
LogprobsTensors
(
token_ids
.
to
(
"cpu"
,
non_blocking
=
True
),
logprobs
.
to
(
"cpu"
,
non_blocking
=
True
),
ranks
.
to
(
"cpu"
,
non_blocking
=
True
),
)
chunk_slice
=
slice
(
start_idx
,
start_idx
+
num_logits
)
logprobs_tensors
.
logprob_token_ids
[
chunk_slice
].
copy_
(
token_ids
,
non_blocking
=
True
)
logprobs_tensors
.
logprobs
[
chunk_slice
].
copy_
(
logprobs
,
non_blocking
=
True
)
logprobs_tensors
.
selected_token_ranks
[
chunk_slice
].
copy_
(
ranks
,
non_blocking
=
True
)
# Remove requests that have completed prefill from the batch
# num_prompt_logprobs_dict.
for
req_id
in
completed_prefill_reqs
:
del
num_prompt_logprobs_dict
[
req_id
]
del
in_progress_dict
[
req_id
]
# Must synchronize the non-blocking GPU->CPU transfers.
torch
.
cuda
.
synchronize
()
if
prompt_logprobs_dict
:
torch
.
cuda
.
synchronize
()
return
prompt_logprobs_dict
...
...
vllm/fa_utils.py
→
vllm/
vllm_flash_attn/
fa_utils.py
View file @
31f6b24f
...
...
@@ -46,3 +46,9 @@ def get_flash_attn_version(requires_alibi: bool = False) -> Optional[int]:
return
fa_version
except
(
ImportError
,
AssertionError
):
return
None
def flash_attn_supports_fp8() -> bool:
    """Report whether FlashAttention FP8 is usable on this platform.

    True only when the detected FlashAttention version is 3 and the current
    device's compute-capability major version is 9.
    """
    # Imported lazily inside the function, as in the original code.
    from vllm.platforms import current_platform

    if get_flash_attn_version() != 3:
        # Device capability is only queried when the version matches.
        return False
    capability = current_platform.get_device_capability()
    return capability.major == 9
vllm/worker/hpu_model_runner.py
View file @
31f6b24f
...
...
@@ -376,8 +376,22 @@ class HpuModelAdapter:
mask
=
mask
>=
metadata
.
block_usage
.
unsqueeze
(
-
1
)
attn_bias
=
(
torch
.
zeros_like
(
mask
,
dtype
=
dtype
).
masked_fill_
(
mask
,
-
math
.
inf
))
block_mapping
=
torch
.
nn
.
functional
.
one_hot
(
metadata
.
block_groups
,
num_classes
=
batch_size
)
if
os
.
environ
.
get
(
'VLLM_USE_FAKE_HPU'
,
'0'
)
==
'0'
and
htorch
.
utils
.
internal
.
is_lazy
():
block_mapping
=
torch
.
nn
.
functional
.
one_hot
(
metadata
.
block_groups
,
num_classes
=
batch_size
)
else
:
# Unfortunately one_hot on CPU/torch.compile mode/eager mode
# doesn't handle out of bounds classes so we need to convert
# all negative values to 0 (block_mapping) or bs (block_groups)
block_groups
=
metadata
.
block_groups
.
to
(
torch
.
long
)
block_mapping
=
torch
.
nn
.
functional
.
relu
(
block_groups
)
block_mapping
=
torch
.
nn
.
functional
.
one_hot
(
block_mapping
,
num_classes
=
batch_size
)
oob_values
=
block_groups
.
lt
(
0
)
block_mapping
.
masked_fill_
(
oob_values
.
unsqueeze
(
-
1
),
0
)
block_groups
.
masked_fill_
(
oob_values
,
batch_size
)
metadata
=
metadata
.
_replace
(
block_groups
=
block_groups
)
block_mapping
=
block_mapping
.
to
(
dtype
)
metadata
=
metadata
.
_replace
(
block_mapping
=
block_mapping
,
attn_bias
=
attn_bias
)
...
...
Prev
1
2
3
4
5
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment