Unverified commit 057fc94c, authored by Willy Hardy, committed by GitHub
Browse files

[Bugfix] Fix structured output crash on CPU due to pin_memory=True (#37706)


Signed-off-by: Willy Hardy <whardy@redhat.com>
Signed-off-by: Will Hardy <whardy@redhat.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
parent b58c5f28
...@@ -17,6 +17,7 @@ from diskcache import Cache ...@@ -17,6 +17,7 @@ from diskcache import Cache
import vllm.envs as envs import vllm.envs as envs
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.utils.import_utils import LazyLoader from vllm.utils.import_utils import LazyLoader
from vllm.utils.platform_utils import is_pin_memory_available
from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
if TYPE_CHECKING: if TYPE_CHECKING:
...@@ -106,28 +107,33 @@ def apply_grammar_bitmask( ...@@ -106,28 +107,33 @@ def apply_grammar_bitmask(
# since the bitmask is already aligned with the logits. # since the bitmask is already aligned with the logits.
skip_out_indices = len(out_indices) == logits.shape[0] skip_out_indices = len(out_indices) == logits.shape[0]
if not logits.is_cpu:
index_tensor = None index_tensor = None
if not skip_out_indices: if not skip_out_indices:
# xgrammar expects a python list of indices but it will actually work with # xgrammar expects a python list of indices but it will actually work with
# a tensor. If we copy the tensor ourselves here we can do it in a non_blocking # a tensor. If we copy the tensor ourselves here we can do it in a
# manner and there should be no cpu sync within xgrammar. # non_blocking manner and there should be no cpu sync within xgrammar.
pin_memory = is_pin_memory_available()
index_tensor = torch.tensor( index_tensor = torch.tensor(
out_indices, dtype=torch.int32, device="cpu", pin_memory=True out_indices, dtype=torch.int32, device="cpu", pin_memory=pin_memory
) )
index_tensor = index_tensor.to(logits.device, non_blocking=True) index_tensor = index_tensor.to(logits.device, non_blocking=True)
xgr.apply_token_bitmask_inplace(logits, grammar_bitmask, indices=index_tensor)
return
# CPU case, use list for indices.
indices = None if skip_out_indices else out_indices
# Handle dtype conversion for CPU (older xgrammar CPU kernels require float32) # Handle dtype conversion for CPU (older xgrammar CPU kernels require float32)
# See: https://github.com/vllm-project/vllm/issues/31901 # See: https://github.com/vllm-project/vllm/issues/31901
if logits.device.type == "cpu" and logits.dtype != torch.float32: if logits.dtype != torch.float32:
# Convert to float32, apply bitmask, then convert back # Convert to float32, apply bitmask, then convert back
logits_float32 = logits.to(torch.float32) logits_fp32 = logits.to(torch.float32)
xgr.apply_token_bitmask_inplace( xgr.apply_token_bitmask_inplace(logits_fp32, grammar_bitmask, indices=indices)
logits_float32, grammar_bitmask, indices=index_tensor
)
# Copy the modified values back to the original tensor # Copy the modified values back to the original tensor
logits.copy_(logits_float32.to(logits.dtype)) logits.copy_(logits_fp32.to(logits.dtype))
else: else:
xgr.apply_token_bitmask_inplace(logits, grammar_bitmask, indices=index_tensor) xgr.apply_token_bitmask_inplace(logits, grammar_bitmask, indices=indices)
class OutlinesVocabulary: class OutlinesVocabulary:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment