[Bugfix] use float32 precision in samplers/test_logprobs.py for comparing with HF (#6409)

Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>

[Bugfix] use float32 precision in samplers/test_logprobs.py for comparing with HF (#6409)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
4ef95b0f · Thomas Parnell · GitHub · eaec4b91 · 4ef95b0f · 4ef95b0f
Unverified commit 4ef95b0f, authored Jul 15, 2024 by Thomas Parnell; committed by GitHub on Jul 15, 2024
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 8 additions and 1 deletion

tests/samplers/test_logprobs.py tests/samplers/test_logprobs.py +2 -1

vllm/attention/ops/prefix_prefill.py vllm/attention/ops/prefix_prefill.py +6 -0

No files found.
--- a/tests/samplers/test_logprobs.py
+++ b/tests/samplers/test_logprobs.py
@@ -11,7 +11,8 @@ MODELS = ["facebook/opt-125m"]
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("dtype",
+                         ["float"])  # needed for comparing logprobs with HF
 @pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16, -1])
 @pytest.mark.parametrize("num_top_logprobs", [6])  # 32000 == vocab_size
 @pytest.mark.parametrize("detokenize", [True, False])

--- a/vllm/attention/ops/prefix_prefill.py
+++ b/vllm/attention/ops/prefix_prefill.py
@@ -687,6 +687,12 @@ if triton.__version__ >= "2.1.0":
        cap = current_platform.get_device_capability()
        BLOCK = 128 if cap[0] >= 8 else 64
+        # need to reduce num. blocks when using fp32
+        # due to increased use of GPU shared memory
+        if q.dtype is torch.float32:
+            BLOCK = BLOCK // 2
        # shape constraints
        Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
        assert Lq == Lk and Lk == Lv