Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
8d6cd32b
Unverified
Commit
8d6cd32b
authored
Mar 05, 2025
by
Lu Fang
Committed by
GitHub
Mar 05, 2025
Browse files
[Bugfix][V1] Fix allowed_token_ids for v1 Sampler (#14169)
Signed-off-by:
Lu Fang
<
lufang@fb.com
>
parent
ec79b67c
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
12 additions
and
4 deletions
+12
-4
vllm/v1/engine/processor.py
vllm/v1/engine/processor.py
+5
-3
vllm/v1/worker/gpu_input_batch.py
vllm/v1/worker/gpu_input_batch.py
+7
-1
No files found.
vllm/v1/engine/processor.py
View file @
8d6cd32b
...
@@ -92,10 +92,12 @@ class Processor:
...
@@ -92,10 +92,12 @@ class Processor:
return
return
if
params
.
allowed_token_ids
is
None
:
if
params
.
allowed_token_ids
is
None
:
return
return
if
not
all
(
0
<=
tid
<
self
.
model_config
.
vocab_size
if
not
params
.
allowed_token_ids
:
for
tid
in
params
.
allowed_token_ids
):
raise
ValueError
(
"allowed_token_ids is not None and empty!"
)
vocab_size
=
self
.
model_config
.
get_vocab_size
()
if
not
all
(
0
<=
tid
<
vocab_size
for
tid
in
params
.
allowed_token_ids
):
raise
ValueError
(
raise
ValueError
(
"allowed_token_ids contains out-of-vocab token id"
)
"allowed_token_ids contains out-of-vocab token id
!
"
)
def
process_inputs
(
def
process_inputs
(
self
,
self
,
...
...
vllm/v1/worker/gpu_input_batch.py
View file @
8d6cd32b
...
@@ -199,6 +199,8 @@ class InputBatch:
...
@@ -199,6 +199,8 @@ class InputBatch:
self
.
logit_bias
:
list
[
Optional
[
dict
[
int
,
self
.
logit_bias
:
list
[
Optional
[
dict
[
int
,
float
]]]
=
[
None
]
*
max_num_reqs
float
]]]
=
[
None
]
*
max_num_reqs
self
.
has_allowed_token_ids
:
set
[
str
]
=
set
()
self
.
has_allowed_token_ids
:
set
[
str
]
=
set
()
# NOTE(lufang): In the mask tensor, if the corresponding token allowed,
# the value is False. Since we use masked_fill_ to set -inf.
self
.
allowed_token_ids_mask
:
Optional
[
torch
.
Tensor
]
=
None
self
.
allowed_token_ids_mask
:
Optional
[
torch
.
Tensor
]
=
None
self
.
allowed_token_ids_mask_cpu_tensor
:
Optional
[
torch
.
Tensor
]
=
None
self
.
allowed_token_ids_mask_cpu_tensor
:
Optional
[
torch
.
Tensor
]
=
None
...
@@ -300,6 +302,7 @@ class InputBatch:
...
@@ -300,6 +302,7 @@ class InputBatch:
self
.
has_allowed_token_ids
.
add
(
req_id
)
self
.
has_allowed_token_ids
.
add
(
req_id
)
if
self
.
allowed_token_ids_mask_cpu_tensor
is
None
:
if
self
.
allowed_token_ids_mask_cpu_tensor
is
None
:
# Lazy allocation for this tensor, which can be large.
# Lazy allocation for this tensor, which can be large.
# False means we don't fill with -inf.
self
.
allowed_token_ids_mask
=
torch
.
zeros
(
self
.
max_num_reqs
,
self
.
allowed_token_ids_mask
=
torch
.
zeros
(
self
.
max_num_reqs
,
self
.
vocab_size
,
self
.
vocab_size
,
dtype
=
torch
.
bool
,
dtype
=
torch
.
bool
,
...
@@ -309,8 +312,10 @@ class InputBatch:
...
@@ -309,8 +312,10 @@ class InputBatch:
self
.
vocab_size
,
self
.
vocab_size
,
dtype
=
torch
.
bool
,
dtype
=
torch
.
bool
,
device
=
"cpu"
)
device
=
"cpu"
)
self
.
allowed_token_ids_mask_cpu_tensor
[
req_index
]
=
True
# False means we don't fill with -inf.
self
.
allowed_token_ids_mask_cpu_tensor
[
req_index
][
self
.
allowed_token_ids_mask_cpu_tensor
[
req_index
][
sampling_params
.
allowed_token_ids
]
=
Tru
e
sampling_params
.
allowed_token_ids
]
=
Fals
e
# Add request lora ID
# Add request lora ID
if
request
.
lora_request
:
if
request
.
lora_request
:
...
@@ -359,6 +364,7 @@ class InputBatch:
...
@@ -359,6 +364,7 @@ class InputBatch:
self
.
logit_bias
[
req_index
]
=
None
self
.
logit_bias
[
req_index
]
=
None
self
.
has_allowed_token_ids
.
discard
(
req_id
)
self
.
has_allowed_token_ids
.
discard
(
req_id
)
if
self
.
allowed_token_ids_mask_cpu_tensor
is
not
None
:
if
self
.
allowed_token_ids_mask_cpu_tensor
is
not
None
:
# False means we don't fill with -inf.
self
.
allowed_token_ids_mask_cpu_tensor
[
req_index
].
fill_
(
False
)
self
.
allowed_token_ids_mask_cpu_tensor
[
req_index
].
fill_
(
False
)
return
req_index
return
req_index
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment