Unverified Commit 21ec66e5 authored by Lianmin Zheng, committed by GitHub

Minor follow-up fixes for the logprob refactor (#2670)

parent c5210dfa
@@ -35,21 +35,21 @@ from sglang.srt.model_executor.forward_batch_info import (
 @dataclasses.dataclass
 class LogitsProcessorOutput:
-    ## First part. This part will be returned by python/sglang/srt/layers/logits_processor.py::LogitsProcessor.
+    ## Part 1: This part will be assigned in python/sglang/srt/layers/logits_processor.py::LogitsProcessor
     # The logits of the next tokens. shape: [#seq, vocab_size]
     next_token_logits: torch.Tensor
     # Used by speculative decoding (EAGLE)
     # The last hidden layers
     hidden_states: Optional[torch.Tensor] = None
-    ## Second part. This part will be returned by python/sglang/srt/layers/sampler.py::Sampler.
+    ## Part 2: This part will be assigned in python/sglang/srt/layers/sampler.py::Sampler
     # The logprobs of the next tokens. shape: [#seq]
     next_token_logprobs: Optional[torch.Tensor] = None
     # The logprobs and ids of the top-k tokens in output positions. shape: [#seq, k]
     next_token_top_logprobs_val: Optional[List] = None
     next_token_top_logprobs_idx: Optional[List] = None
-    ## Third part. This part will be returned by python/sglang/srt/layers/logits_processor.py::LogitsProcessor. Prefill-only.
+    ## Part 3: Prefill-only. This part will be assigned in python/sglang/srt/layers/logits_processor.py::LogitsProcessor
     # The normlaized logprobs of prompts. shape: [#seq]
     normalized_prompt_logprobs: torch.Tensor = None
     # The logprobs of input tokens. shape: [#token]
...
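Note: for orientation, a minimal sketch of how the three parts above get populated in sequence (the shapes and the toy sampling step are illustrative assumptions, not the real LogitsProcessor/Sampler code):

```python
import torch

from sglang.srt.layers.logits_processor import LogitsProcessorOutput

# Part 1: the logits processor constructs the output with next-token logits.
logits = torch.randn(2, 32000)  # [#seq, vocab_size], toy values
out = LogitsProcessorOutput(next_token_logits=logits)

# Part 2: the sampler later attaches the logprobs of the tokens it sampled.
probs = torch.softmax(out.next_token_logits, dim=-1)
token_ids = torch.multinomial(probs, num_samples=1).squeeze(-1)  # [#seq]
out.next_token_logprobs = torch.log(
    probs[torch.arange(probs.size(0)), token_ids]
)  # [#seq]
```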
@@ -56,7 +56,9 @@ class Sampler(nn.Module):
         if global_server_args_dict["sampling_backend"] == "flashinfer":
             if return_logprob:
-                # NOTE: the top_p_renorm_prob from flashinfer has numerical problems
+                # NOTE: the top_p_renorm_prob from flashinfer has numerical problems,
+                # https://github.com/flashinfer-ai/flashinfer/issues/708
+                # so we use the torch implementation.
                 logprobs = torch.log(
                     top_p_normalize_probs_torch(probs, sampling_info.top_ps)
                 )
...
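Note: the torch path above renormalizes the probabilities under top-p and then takes the log. A rough sketch of what such a renormalization does (an illustration only, not the actual `top_p_normalize_probs_torch` implementation):

```python
import torch

def top_p_renormalize_sketch(probs: torch.Tensor, top_ps: torch.Tensor) -> torch.Tensor:
    """Zero out tokens outside the nucleus and rescale each row to sum to 1.

    Assumes probs is [#seq, vocab_size] and top_ps is [#seq].
    """
    sorted_probs, sorted_idx = probs.sort(dim=-1, descending=True)
    cumsum = sorted_probs.cumsum(dim=-1)
    # Drop tokens whose preceding cumulative mass already exceeds top_p.
    outside_nucleus = (cumsum - sorted_probs) > top_ps.unsqueeze(-1)
    sorted_probs = sorted_probs.masked_fill(outside_nucleus, 0.0)
    # Scatter back to vocabulary order and renormalize.
    renorm = torch.empty_like(probs).scatter_(-1, sorted_idx, sorted_probs)
    return renorm / renorm.sum(dim=-1, keepdim=True)
```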
@@ -36,7 +36,7 @@ from sglang.srt.layers.attention.flashinfer_backend import FlashInferAttnBackend
 from sglang.srt.layers.attention.torch_native_backend import TorchNativeAttnBackend
 from sglang.srt.layers.attention.triton_backend import TritonAttnBackend
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
-from sglang.srt.layers.sampler import Sampler, get_top_logprobs
+from sglang.srt.layers.sampler import Sampler
 from sglang.srt.layers.torchao_utils import apply_torchao_config_to_model
 from sglang.srt.lora.lora_manager import LoRAManager
 from sglang.srt.managers.schedule_batch import global_server_args_dict
@@ -191,10 +191,9 @@ class ModelRunner:
         torch.get_device_module(self.device).set_device(self.gpu_id)
         if self.device == "cuda":
             backend = "nccl"
-        # TODO(liangan1):Just use gloo to bypass the initilization fail
-        # Need to use xccl for xpu backend in the future
         elif self.device == "xpu":
+            # TODO(liangan1):Just use gloo to bypass the initilization fail
+            # Need to use xccl for xpu backend in the future
             backend = "gloo"
         elif self.device == "hpu":
             backend = "hccl"
...
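Note: this hunk only moves the TODO comments into the xpu branch; the backend selection itself is unchanged and is ultimately what gets passed to torch.distributed. A small sketch of that flow (the helper name and the init arguments are assumptions for illustration, not ModelRunner code):

```python
import torch.distributed as dist

def pick_dist_backend(device: str) -> str:
    # Same selection logic as the hunk above, extracted for illustration.
    if device == "cuda":
        return "nccl"
    elif device == "xpu":
        # gloo as a stopgap until an xccl backend is available for XPU.
        return "gloo"
    elif device == "hpu":
        return "hccl"
    return "gloo"

# Hypothetical usage: initialize the process group with the chosen backend.
dist.init_process_group(
    backend=pick_dist_backend("cuda"),
    init_method="tcp://127.0.0.1:29500",  # assumed rendezvous address
    rank=0,
    world_size=1,
)
```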
@@ -244,7 +244,7 @@ class SamplingBatchInfo:
         # repetition
         if self.scaling_penalties is not None:
-            logits = torch.where(
+            logits[:] = torch.where(
                 logits > 0,
                 logits / self.scaling_penalties,
                 logits * self.scaling_penalties,
@@ -253,5 +253,3 @@ class SamplingBatchInfo:
         # Apply regex vocab_mask
         if self.vocab_mask is not None:
             self.apply_mask(logits=logits, vocab_mask=self.vocab_mask)
-        return logits
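Note: `logits[:] = torch.where(...)` works together with dropping `return logits`: a slice assignment writes through the caller's tensor, whereas plain rebinding only changes the local name. A toy example of the difference (names are illustrative):

```python
import torch

def scale_rebind(logits: torch.Tensor) -> None:
    # Rebinds the local name only; the caller's tensor is left untouched.
    logits = torch.where(logits > 0, logits / 2, logits * 2)

def scale_in_place(logits: torch.Tensor) -> None:
    # Writes into the existing storage; the caller sees the update,
    # so no return value is needed.
    logits[:] = torch.where(logits > 0, logits / 2, logits * 2)

x = torch.tensor([2.0, -2.0])
scale_rebind(x)
print(x)  # tensor([ 2., -2.])  -- unchanged
scale_in_place(x)
print(x)  # tensor([ 1., -4.])  -- modified in place
```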
@@ -227,7 +227,7 @@ class TestSRTEndpoint(unittest.TestCase):
                 "regex": "( Yes| No)",
             },
             "return_logprob": True,
-            "top_logprobs_num": 5,
+            "top_logprobs_num": 5,  # The grammar constraint allows all prefix tokens so we need to use a larger top_k.
             "return_text_in_logprobs": True,
         },
     )
...
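Note: a hedged sketch of what such a request could look like against a locally running server (the URL, prompt, and printed output are assumptions, not taken from the test file):

```python
import requests

response = requests.post(
    "http://127.0.0.1:30000/generate",  # assumed local server address
    json={
        "text": "Is 2 + 2 equal to 4? Answer:",
        "sampling_params": {
            "max_new_tokens": 1,
            "regex": "( Yes| No)",
        },
        "return_logprob": True,
        # Several tokens are valid prefixes under the grammar (" Yes", " No",
        # " Y", ...), so request enough top logprobs to cover them.
        "top_logprobs_num": 5,
        "return_text_in_logprobs": True,
    },
)
print(response.json())
```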