Unverified Commit 55e03b10 authored by Lianmin Zheng, committed by GitHub

Fix a bug in BatchTokenIDOut & Misc style and dependency updates (#7457)

parent 8aa68ed5
@@ -113,6 +113,10 @@ jobs:
github.event.pull_request.draft == false
needs: [unit-test-frontend, unit-test-backend-2-gpu]
runs-on: 8-gpu-runner
strategy:
fail-fast: false
matrix:
part: [0, 1]
steps:
- name: Checkout code
uses: actions/checkout@v4
@@ -125,7 +129,7 @@ jobs:
timeout-minutes: 20
run: |
cd test/srt
python3 run_suite.py --suite per-commit-8-gpu
python3 run_suite.py --suite per-commit-8-gpu --auto-partition-id ${{ matrix.part }} --auto-partition-size 2
performance-test-1-gpu-part-1:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
......
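The workflow change above splits the 8-GPU suite across two runners via `--auto-partition-id` / `--auto-partition-size`. A minimal sketch of how such deterministic partitioning could work (the real run_suite.py logic may differ; the function and test names here are illustrative only):

```python
# Hypothetical illustration: split a stably sorted test list into N contiguous
# chunks so that partition i always gets the same subset on every runner.
def auto_partition(tests, partition_id, partition_size):
    tests = sorted(tests)  # stable order across runners
    chunk = (len(tests) + partition_size - 1) // partition_size
    return tests[partition_id * chunk : (partition_id + 1) * chunk]

# Runner 0 and runner 1 together cover the whole suite exactly once.
suite = ["test_a.py", "test_b.py", "test_c.py", "test_d.py", "test_e.py"]
print(auto_partition(suite, 0, 2))  # ['test_a.py', 'test_b.py', 'test_c.py']
print(auto_partition(suite, 1, 2))  # ['test_d.py', 'test_e.py']
```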
@@ -29,6 +29,7 @@ runtime_common = [
"msgspec",
"ninja",
"orjson",
"outlines==0.1.11",
"packaging",
"partial_json_parser",
"pillow",
@@ -50,13 +51,12 @@ runtime_common = [
srt = [
"sglang[runtime_common]",
"sgl-kernel==0.1.9",
"flashinfer_python==0.2.6.post1",
"torch==2.7.1",
"torchaudio==2.7.1",
"torchvision==0.22.1",
"cuda-python",
"outlines>=0.0.44,<=0.1.11",
"einops",
"flashinfer_python==0.2.6.post1",
]
blackwell = [
@@ -66,7 +66,6 @@ blackwell = [
"torchaudio==2.7.1",
"torchvision==0.22.1",
"cuda-python",
"outlines>=0.0.44,<=0.1.11",
"einops",
"flashinfer_python==0.2.6.post1",
]
@@ -77,23 +76,22 @@ srt_hip = [
"sglang[runtime_common]",
"torch",
"vllm==0.6.7.dev2",
"outlines==0.1.11"
]
# xpu is not enabled in public vllm and torch whl,
# need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.html to install vllm
srt_xpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11"]
srt_xpu = ["sglang[runtime_common]"]
# For Intel Gaudi(device : hpu) follow the installation guide
# https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html
srt_hpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11"]
srt_hpu = ["sglang[runtime_common]"]
# CPU: currently, there are no pre-built vllm wheels for CPU.
# To install vllm for CPU, please follow the instruction here:
# https://docs.vllm.ai/en/latest/getting_started/installation/cpu/index.html
srt_cpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11", "einops"]
srt_cpu = ["sglang[runtime_common]", "einops"]
# https://vllm-ascend.readthedocs.io/en/latest/installation.html
srt_npu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11"]
srt_npu = ["sglang[runtime_common]"]
openai = ["openai>=1.0", "tiktoken"]
anthropic = ["anthropic>=0.20.0"]
......
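With `outlines==0.1.11` pinned once in `runtime_common` and the per-target `outlines>=0.0.44,<=0.1.11` ranges dropped, every extra now resolves the same version. A quick way to confirm what an installed environment actually picked up:

```python
from importlib.metadata import version

# Should report 0.1.11 for an environment built from this pyproject revision.
print(version("outlines"))
```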
@@ -788,6 +788,7 @@ class Req:
self.multimodal_inputs = None
self.grammar = None
self.origin_input_ids = [0] # set it to one token to skip the long prefill
self.return_logprob = False
self.finished_reason = FINISH_ABORT(
error_msg, HTTPStatus.BAD_REQUEST, "BadRequestError"
)
......
@@ -1374,7 +1374,14 @@ class Scheduler(
)
raise ValueError(msg)
if len(self.req_to_token_pool.free_slots) != self.req_to_token_pool.size:
if self.disaggregation_mode == DisaggregationMode.DECODE:
req_total_size = (
self.req_to_token_pool.size + self.req_to_token_pool.pre_alloc_size
)
else:
req_total_size = self.req_to_token_pool.size
if len(self.req_to_token_pool.free_slots) != req_total_size:
msg = (
"req_to_token_pool memory leak detected!"
f"available_size={len(self.req_to_token_pool.free_slots)}, "
......
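The scheduler hunk above fixes a false positive in the memory-leak check: in decode disaggregation mode, slots pre-allocated for requests arriving from the prefill workers are legitimately absent from `free_slots`, so the expected total must include `pre_alloc_size`. A stripped-down sketch of the invariant (attribute names follow the hunk; the surrounding class is simplified):

```python
def check_req_pool_leak(pool, disaggregation_mode):
    # In decode mode, pre-allocated slots are held for in-flight transfers
    # from the prefill side, so they do not count as leaked.
    expected = pool.size
    if disaggregation_mode == "decode":
        expected += pool.pre_alloc_size
    if len(pool.free_slots) != expected:
        raise RuntimeError(
            f"req_to_token_pool memory leak detected! "
            f"available_size={len(pool.free_slots)}, expected={expected}"
        )
```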
@@ -1226,7 +1226,7 @@ class TokenizerManager:
state.last_output_offset = len(state.output_ids)
else:
state.output_ids.extend(recv_obj.output_ids[i])
output_token_ids = state.output_ids
output_token_ids = state.output_ids.copy()
out_dict = {
"output_ids": output_token_ids,
......
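This one-line change is the `BatchTokenIDOut` bug named in the commit title: `state.output_ids` is mutated on every later decode step, so handing the list itself to `out_dict` means an earlier response aliases a list that keeps growing. Copying snapshots the token ids at the moment the response is built. A minimal reproduction of the aliasing pattern:

```python
state_output_ids = [1, 2]

# Buggy: the dict references the live list, so later extends leak into it.
out_without_copy = {"output_ids": state_output_ids}
# Fixed: the dict gets a snapshot taken at response-build time.
out_with_copy = {"output_ids": state_output_ids.copy()}

state_output_ids.extend([3, 4])  # next decode step appends more tokens

print(out_without_copy["output_ids"])  # [1, 2, 3, 4]  -- unexpectedly grew
print(out_with_copy["output_ids"])     # [1, 2]        -- stable snapshot
```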
@@ -1723,9 +1723,8 @@ class PortArgs:
dist_init_host, dist_init_port = dist_init_addr
port_base = int(dist_init_port) + 1
if dp_rank is None:
scheduler_input_port = (
port_base + 3
) # TokenizerManager to DataParallelController
# TokenizerManager to DataParallelController
scheduler_input_port = port_base + 3
else:
scheduler_input_port = port_base + 3 + 1 + dp_rank
......
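The reshuffled comment does not change behavior; the port layout stays the same: without data parallelism the scheduler input port is `port_base + 3`, and with it each DP rank gets its own port starting at `port_base + 4`. A worked example of the arithmetic from the hunk:

```python
def scheduler_input_port(port_base, dp_rank=None):
    if dp_rank is None:
        # TokenizerManager to DataParallelController
        return port_base + 3
    return port_base + 3 + 1 + dp_rank

print(scheduler_input_port(20000))     # 20003
print(scheduler_input_port(20000, 0))  # 20004
print(scheduler_input_port(20000, 1))  # 20005
```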
@@ -1917,13 +1917,6 @@ def configure_ipv6(dist_init_addr):
return port, host
def rank0_log(msg: str):
from sglang.srt.distributed import get_tensor_model_parallel_rank
if get_tensor_model_parallel_rank() == 0:
logger.info(msg)
def rank0_print(msg: str):
from sglang.srt.distributed import get_tensor_model_parallel_rank
@@ -1931,6 +1924,9 @@ def rank0_print(msg: str):
print(msg, flush=True)
rank0_log = rank0_print
def get_cuda_version():
if torch.version.cuda:
return tuple(map(int, torch.version.cuda.split(".")))
......
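The utils hunk removes the separate `rank0_log` (which went through `logger.info`) and aliases it to `rank0_print`, so existing call sites keep working while all rank-0 output goes through a single code path. The pattern in isolation (a hypothetical standalone version; the real helpers look up the tensor-parallel rank internally):

```python
def rank0_print(msg: str, rank: int = 0) -> None:
    # Only the first rank emits the message; other ranks stay silent.
    if rank == 0:
        print(msg, flush=True)

# Backwards-compatible alias: old rank0_log call sites need no changes.
rank0_log = rank0_print

rank0_log("model loaded", rank=0)  # prints
rank0_log("model loaded", rank=1)  # silent
```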
@@ -39,14 +39,6 @@ find_package(Torch REQUIRED)
# clean Torch Flag
clear_cuda_arches(CMAKE_FLAG)
if("${CUDA_VERSION}" VERSION_EQUAL "12.8")
set(DeepGEMM_REPO "https://github.com/sgl-project/DeepGEMM")
set(DeepGEMM_TAG "blackwell")
else()
set(DeepGEMM_REPO "https://github.com/deepseek-ai/DeepGEMM")
set(DeepGEMM_TAG "8dfa3298274bfe6b242f6f8a3e6f3eff2707dd9f")
endif()
include(FetchContent)
# cutlass
@@ -57,7 +49,16 @@ FetchContent_Declare(
GIT_SHALLOW OFF
)
FetchContent_Populate(repo-cutlass)
# DeepGEMM
if("${CUDA_VERSION}" VERSION_EQUAL "12.8")
set(DeepGEMM_REPO "https://github.com/sgl-project/DeepGEMM")
set(DeepGEMM_TAG "blackwell")
else()
set(DeepGEMM_REPO "https://github.com/deepseek-ai/DeepGEMM")
set(DeepGEMM_TAG "8dfa3298274bfe6b242f6f8a3e6f3eff2707dd9f")
endif()
FetchContent_Declare(
repo-deepgemm
GIT_REPOSITORY ${DeepGEMM_REPO}
@@ -107,7 +108,6 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
message(STATUS "For aarch64, disable gencode below SM90 by default")
endif()
include_directories(
${PROJECT_SOURCE_DIR}/include
${PROJECT_SOURCE_DIR}/csrc
@@ -247,8 +247,8 @@ set(SOURCES
"csrc/moe/ep_moe_reorder_kernel.cu"
"csrc/moe/ep_moe_silu_and_mul_kernel.cu"
"csrc/speculative/eagle_utils.cu"
"csrc/speculative/speculative_sampling.cu"
"csrc/speculative/packbit.cu"
"csrc/speculative/speculative_sampling.cu"
"csrc/grammar/apply_token_bitmask_inplace_cuda.cu"
"csrc/common_extension.cc"
"${repo-flashinfer_SOURCE_DIR}/csrc/norm.cu"
......
from typing import Optional, Union
import torch
from sgl_kernel.utils import _to_tensor_scalar_tuple, get_cuda_stream
from sgl_kernel.utils import _to_tensor_scalar_tuple
def _top_k_renorm_probs_internal(
......