"vscode:/vscode.git/clone" did not exist on "1a04812439c82a9dd318d14a800bb04e84dbbfc0"
Unverified Commit d77caa2b authored by Seungduk Kim's avatar Seungduk Kim Committed by GitHub
Browse files

[#2812] Make the decode status dict capacity adjustable by a CLI param (#2839)

parent 8b6a4486
......@@ -15,6 +15,7 @@
import dataclasses
import logging
import os
import signal
from collections import OrderedDict
from typing import Dict, List, Union
......@@ -35,6 +36,12 @@ from sglang.utils import find_printable_text, get_exception_traceback
logger = logging.getLogger(__name__)
# Upper bound on the number of request states the detokenizer can hold at once.
# When the bound is exceeded, the oldest request states are evicted.
# Read from the SGLANG_DETOKENIZER_MAX_STATES environment variable;
# falls back to 65536 (1 << 16) when the variable is not set.
# Prefer power-of-2 values (recommended for better memory allocation).
# For more details, see: https://github.com/sgl-project/sglang/issues/2812
DETOKENIZER_MAX_STATES = int(os.getenv("SGLANG_DETOKENIZER_MAX_STATES", 1 << 16))
@dataclasses.dataclass
class DecodeStatus:
......@@ -74,7 +81,7 @@ class DetokenizerManager:
revision=server_args.revision,
)
self.decode_status = LimitedCapacityDict()
self.decode_status = LimitedCapacityDict(capacity=DETOKENIZER_MAX_STATES)
def trim_matched_stop(
self, output: Union[str, List[int]], finished_reason: Dict, no_stop_trim: bool
......@@ -156,7 +163,17 @@ class DetokenizerManager:
# Incremental decoding
output_strs = []
for i in range(bs):
s = self.decode_status[recv_obj.rids[i]]
try:
s = self.decode_status[recv_obj.rids[i]]
except KeyError:
raise RuntimeError(
f"Decode status not found for request {recv_obj.rids[i]}. "
"It may be due to the request being evicted from the decode status due to memory pressure. "
"Please increase the maximum number of requests by setting "
"the SGLANG_DETOKENIZER_MAX_STATES environment variable to a bigger value than the default value. "
f"The current value is {DETOKENIZER_MAX_STATES}. "
"For more details, see: https://github.com/sgl-project/sglang/issues/2812"
)
new_text = read_texts[i][len(surr_texts[i]) :]
if recv_obj.finished_reasons[i] is None:
# Streaming chunk: update the decode status
......@@ -197,7 +214,7 @@ class DetokenizerManager:
class LimitedCapacityDict(OrderedDict):
def __init__(self, capacity=1 << 15, *args, **kwargs):
def __init__(self, capacity: int, *args, **kwargs):
super().__init__(*args, **kwargs)
self.capacity = capacity
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment