Unverified commit 821fde2d, authored by Karan Bansal, committed by GitHub
Browse files

[Bugfix] Fix xgrammar dtype mismatch on macOS CPU inference (#32384)


Signed-off-by: Karan Bansal <karanb192@gmail.com>
Co-authored-by: Inokinoki <inoki@inoki.cc>
parent 8c29042b
...@@ -116,6 +116,17 @@ def apply_grammar_bitmask( ...@@ -116,6 +116,17 @@ def apply_grammar_bitmask(
) )
index_tensor = index_tensor.to(logits.device, non_blocking=True) index_tensor = index_tensor.to(logits.device, non_blocking=True)
# Handle dtype conversion for CPU (older xgrammar CPU kernels require float32)
# See: https://github.com/vllm-project/vllm/issues/31901
if logits.device.type == "cpu" and logits.dtype != torch.float32:
# Convert to float32, apply bitmask, then convert back
logits_float32 = logits.to(torch.float32)
xgr.apply_token_bitmask_inplace(
logits_float32, grammar_bitmask, indices=index_tensor
)
# Copy the modified values back to the original tensor
logits.copy_(logits_float32.to(logits.dtype))
else:
xgr.apply_token_bitmask_inplace(logits, grammar_bitmask, indices=index_tensor) xgr.apply_token_bitmask_inplace(logits, grammar_bitmask, indices=index_tensor)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment