Unverified Commit 55e03b10 authored by Lianmin Zheng, committed by GitHub

Fix a bug in BatchTokenIDOut & Misc style and dependency updates (#7457)

parent 8aa68ed5
@@ -113,6 +113,10 @@ jobs:
       github.event.pull_request.draft == false
     needs: [unit-test-frontend, unit-test-backend-2-gpu]
     runs-on: 8-gpu-runner
+    strategy:
+      fail-fast: false
+      matrix:
+        part: [0, 1]
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -125,7 +129,7 @@ jobs:
         timeout-minutes: 20
         run: |
           cd test/srt
-          python3 run_suite.py --suite per-commit-8-gpu
+          python3 run_suite.py --suite per-commit-8-gpu --auto-partition-id ${{ matrix.part }} --auto-partition-size 2
 
   performance-test-1-gpu-part-1:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
......
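Note on the CI change above: the 8-GPU backend job now runs as a two-way matrix, and each matrix job executes only its share of the per-commit-8-gpu suite via --auto-partition-id ${{ matrix.part }} / --auto-partition-size 2. The snippet below is a minimal sketch of how such deterministic partitioning can work; the helper name and the round-robin split are assumptions for illustration, not the actual run_suite.py implementation.

# Minimal sketch of deterministic test partitioning (assumed behavior of the
# --auto-partition-* flags; not the actual run_suite.py code).
from typing import List


def auto_partition(files: List[str], partition_id: int, partition_size: int) -> List[str]:
    """Return the subset of test files handled by one CI matrix job."""
    if not 0 <= partition_id < partition_size:
        raise ValueError("partition_id must be in [0, partition_size)")
    # A round-robin split keeps the partitions roughly balanced in file count.
    return [f for i, f in enumerate(sorted(files)) if i % partition_size == partition_id]


if __name__ == "__main__":
    suite = ["test_a.py", "test_b.py", "test_c.py", "test_d.py"]
    print(auto_partition(suite, 0, 2))  # ['test_a.py', 'test_c.py']
    print(auto_partition(suite, 1, 2))  # ['test_b.py', 'test_d.py']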
@@ -29,6 +29,7 @@ runtime_common = [
     "msgspec",
     "ninja",
     "orjson",
+    "outlines==0.1.11",
     "packaging",
     "partial_json_parser",
     "pillow",
@@ -50,13 +51,12 @@ runtime_common = [
 srt = [
     "sglang[runtime_common]",
     "sgl-kernel==0.1.9",
-    "flashinfer_python==0.2.6.post1",
     "torch==2.7.1",
     "torchaudio==2.7.1",
     "torchvision==0.22.1",
     "cuda-python",
-    "outlines>=0.0.44,<=0.1.11",
     "einops",
+    "flashinfer_python==0.2.6.post1",
 ]
 
 blackwell = [
@@ -66,7 +66,6 @@ blackwell = [
     "torchaudio==2.7.1",
     "torchvision==0.22.1",
     "cuda-python",
-    "outlines>=0.0.44,<=0.1.11",
     "einops",
     "flashinfer_python==0.2.6.post1",
 ]
@@ -77,23 +76,22 @@ srt_hip = [
     "sglang[runtime_common]",
     "torch",
     "vllm==0.6.7.dev2",
-    "outlines==0.1.11"
 ]
 
 # xpu is not enabled in public vllm and torch whl,
 # need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.htmlinstall vllm
-srt_xpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11"]
+srt_xpu = ["sglang[runtime_common]"]
 
 # For Intel Gaudi(device : hpu) follow the installation guide
 # https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html
-srt_hpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11"]
+srt_hpu = ["sglang[runtime_common]"]
 
 # CPU: currently, there are no pre-built vllm wheels for CPU.
 # To install vllm for CPU, please follow the instruction here:
 # https://docs.vllm.ai/en/latest/getting_started/installation/cpu/index.html
-srt_cpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11", "einops"]
+srt_cpu = ["sglang[runtime_common]", "einops"]
 
 # https://vllm-ascend.readthedocs.io/en/latest/installation.html
-srt_npu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11"]
+srt_npu = ["sglang[runtime_common]"]
 
 openai = ["openai>=1.0", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
......
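Dependency note on the pyproject.toml changes above: outlines is now pinned once as outlines==0.1.11 in runtime_common instead of being repeated with the looser >=0.0.44,<=0.1.11 range in every hardware-specific extra, and flashinfer_python moves to the end of the srt list without a version change. A quick post-install sanity check, shown as a hypothetical snippet (not part of the repo), verifies that the single pin is what actually gets resolved:

# Hypothetical post-install check (not part of the repo): any extra that pulls in
# sglang[runtime_common] should now resolve outlines to the single 0.1.11 pin.
from importlib.metadata import version

assert version("outlines") == "0.1.11", f"unexpected outlines version: {version('outlines')}"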
@@ -788,6 +788,7 @@ class Req:
         self.multimodal_inputs = None
         self.grammar = None
         self.origin_input_ids = [0]  # set it to one token to skip the long prefill
+        self.return_logprob = False
         self.finished_reason = FINISH_ABORT(
             error_msg, HTTPStatus.BAD_REQUEST, "BadRequestError"
         )
......
@@ -1374,7 +1374,14 @@ class Scheduler(
             )
             raise ValueError(msg)
 
-        if len(self.req_to_token_pool.free_slots) != self.req_to_token_pool.size:
+        if self.disaggregation_mode == DisaggregationMode.DECODE:
+            req_total_size = (
+                self.req_to_token_pool.size + self.req_to_token_pool.pre_alloc_size
+            )
+        else:
+            req_total_size = self.req_to_token_pool.size
+
+        if len(self.req_to_token_pool.free_slots) != req_total_size:
             msg = (
                 "req_to_token_pool memory leak detected!"
                 f"available_size={len(self.req_to_token_pool.free_slots)}, "
......
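The scheduler change above adjusts the idle-time memory-leak check for PD disaggregation: in DECODE mode, slots reserved ahead of time by the pre-allocator are tracked separately, so the number of free slots expected when the pool is idle is size + pre_alloc_size rather than size alone, and those reserved slots are no longer reported as leaks. Below is a minimal sketch of the invariant being checked, under the assumption that pre_alloc_size counts the extra slots held by the decode pre-allocator; it is not the actual Scheduler code.

# Minimal sketch of the idle-time leak-check invariant (assumed semantics of
# pre_alloc_size; not the actual Scheduler implementation).
def expected_idle_free_slots(pool_size: int, pre_alloc_size: int, is_decode: bool) -> int:
    """Free req_to_token_pool slots expected when no request is in flight."""
    return pool_size + pre_alloc_size if is_decode else pool_size


def has_req_pool_leak(num_free_slots: int, pool_size: int, pre_alloc_size: int, is_decode: bool) -> bool:
    # Any mismatch means some request slot was never returned to the pool.
    return num_free_slots != expected_idle_free_slots(pool_size, pre_alloc_size, is_decode)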
@@ -1226,7 +1226,7 @@ class TokenizerManager:
                 state.last_output_offset = len(state.output_ids)
             else:
                 state.output_ids.extend(recv_obj.output_ids[i])
-                output_token_ids = state.output_ids
+                output_token_ids = state.output_ids.copy()
 
             out_dict = {
                 "output_ids": output_token_ids,
......
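The tokenizer_manager change above is most likely the BatchTokenIDOut bug referenced in the commit title: state.output_ids is extended in place on every decode step, so handing the same list object to out_dict lets later extend() calls retroactively mutate results that were already handed back to the caller; .copy() snapshots the token ids at the moment the output is built. A standalone illustration of the aliasing problem (plain Python, independent of the sglang classes) follows.

# Standalone illustration of the list-aliasing bug that .copy() avoids
# (plain Python; not the TokenizerManager code itself).
state_output_ids = [1, 2]

aliased = state_output_ids           # old behavior: same list object is returned
snapshot = state_output_ids.copy()   # new behavior: independent snapshot

state_output_ids.extend([3, 4])      # the next decode step appends more tokens

print(aliased)   # [1, 2, 3, 4]  <- already-returned output silently changed
print(snapshot)  # [1, 2]        <- stable, as intended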
@@ -1723,9 +1723,8 @@ class PortArgs:
             dist_init_host, dist_init_port = dist_init_addr
             port_base = int(dist_init_port) + 1
             if dp_rank is None:
-                scheduler_input_port = (
-                    port_base + 3
-                )  # TokenizerManager to DataParallelController
+                # TokenizerManager to DataParallelController
+                scheduler_input_port = port_base + 3
             else:
                 scheduler_input_port = port_base + 3 + 1 + dp_rank
......
@@ -1917,13 +1917,6 @@ def configure_ipv6(dist_init_addr):
     return port, host
 
 
-def rank0_log(msg: str):
-    from sglang.srt.distributed import get_tensor_model_parallel_rank
-
-    if get_tensor_model_parallel_rank() == 0:
-        logger.info(msg)
-
-
 def rank0_print(msg: str):
     from sglang.srt.distributed import get_tensor_model_parallel_rank
@@ -1931,6 +1924,9 @@ def rank0_print(msg: str):
         print(msg, flush=True)
 
 
+rank0_log = rank0_print
+
+
 def get_cuda_version():
     if torch.version.cuda:
         return tuple(map(int, torch.version.cuda.split(".")))
......
@@ -39,14 +39,6 @@ find_package(Torch REQUIRED)
 # clean Torch Flag
 clear_cuda_arches(CMAKE_FLAG)
 
-if("${CUDA_VERSION}" VERSION_EQUAL "12.8")
-    set(DeepGEMM_REPO "https://github.com/sgl-project/DeepGEMM")
-    set(DeepGEMM_TAG "blackwell")
-else()
-    set(DeepGEMM_REPO "https://github.com/deepseek-ai/DeepGEMM")
-    set(DeepGEMM_TAG "8dfa3298274bfe6b242f6f8a3e6f3eff2707dd9f")
-endif()
-
 include(FetchContent)
 
 # cutlass
@@ -57,7 +49,16 @@ FetchContent_Declare(
     GIT_SHALLOW OFF
 )
 FetchContent_Populate(repo-cutlass)
 
 # DeepGEMM
+if("${CUDA_VERSION}" VERSION_EQUAL "12.8")
+    set(DeepGEMM_REPO "https://github.com/sgl-project/DeepGEMM")
+    set(DeepGEMM_TAG "blackwell")
+else()
+    set(DeepGEMM_REPO "https://github.com/deepseek-ai/DeepGEMM")
+    set(DeepGEMM_TAG "8dfa3298274bfe6b242f6f8a3e6f3eff2707dd9f")
+endif()
+
 FetchContent_Declare(
     repo-deepgemm
     GIT_REPOSITORY ${DeepGEMM_REPO}
@@ -107,7 +108,6 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
     message(STATUS "For aarch64, disable gencode below SM90 by default")
 endif()
 
-
 include_directories(
     ${PROJECT_SOURCE_DIR}/include
     ${PROJECT_SOURCE_DIR}/csrc
@@ -247,8 +247,8 @@ set(SOURCES
     "csrc/moe/ep_moe_reorder_kernel.cu"
     "csrc/moe/ep_moe_silu_and_mul_kernel.cu"
     "csrc/speculative/eagle_utils.cu"
-    "csrc/speculative/speculative_sampling.cu"
     "csrc/speculative/packbit.cu"
+    "csrc/speculative/speculative_sampling.cu"
     "csrc/grammar/apply_token_bitmask_inplace_cuda.cu"
     "csrc/common_extension.cc"
     "${repo-flashinfer_SOURCE_DIR}/csrc/norm.cu"
......
 from typing import Optional, Union
 
 import torch
 
-from sgl_kernel.utils import _to_tensor_scalar_tuple, get_cuda_stream
+from sgl_kernel.utils import _to_tensor_scalar_tuple
 
 
 def _top_k_renorm_probs_internal(
......