[BUGFIX] Move scores to float32 in case of running xgrammar on cpu (#12152)

Signed-off-by: Michal Adamczyk <madamczyk@habana.ai>

[BUGFIX] Move scores to float32 in case of running xgrammar on cpu (#12152)
Signed-off-by: Michal Adamczyk <madamczyk@habana.ai>
4e94951b · Michal Adamczyk · GitHub · 7a8a48d5 · 4e94951b
Unverified Commit 4e94951b authored Jan 19, 2025 by Michal Adamczyk Committed by GitHub Jan 19, 2025
Show whitespace changes
Inline Side-by-side

Showing with 5 additions and 2 deletions

vllm/model_executor/guided_decoding/xgrammar_decoding.py vllm/model_executor/guided_decoding/xgrammar_decoding.py +5 -2

No files found.
--- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py
+++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
@@ -298,8 +298,11 @@ class XGrammarLogitsProcessor:
        # token_bitmask is a CPU tensor for use with accept_token and
        # fill_next_token_bitmask so we move it to the device of scores
        device_type = scores.device.type
+        dtype = scores.dtype
        if device_type != "cuda":
-            scores = scores.to("cpu").unsqueeze(0)
+            # the xgrammar CPU kernel only supports float32 scores, so upcast
+            # see: https://github.com/mlc-ai/xgrammar/blob/c1b64920cad24f44f235778c1c00bb52d57da01a/python/xgrammar/kernels/apply_token_bitmask_inplace_cpu.py#L22
+            scores = scores.to("cpu").float().unsqueeze(0)

        # Note: In this method, if the tensors have different dimensions
        # on CPU device fails, but on GPU it runs without error. Hence the
@@ -307,7 +310,7 @@ class XGrammarLogitsProcessor:
        xgr.apply_token_bitmask_inplace(scores,
                                        self.token_bitmask.to(scores.device))
        if device_type != "cuda":
-            scores = scores.to(device_type).squeeze()
+            scores = scores.to(dtype).to(device_type).squeeze()

        return scores