Unverified Commit d77caa2b authored by Seungduk Kim's avatar Seungduk Kim Committed by GitHub
Browse files

[#2812] Make the decode status dict capacity adjustable by a CLI param (#2839)

parent 8b6a4486
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
import dataclasses import dataclasses
import logging import logging
import os
import signal import signal
from collections import OrderedDict from collections import OrderedDict
from typing import Dict, List, Union from typing import Dict, List, Union
...@@ -35,6 +36,12 @@ from sglang.utils import find_printable_text, get_exception_traceback ...@@ -35,6 +36,12 @@ from sglang.utils import find_printable_text, get_exception_traceback
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# Maximum number of request states that detokenizer can hold. When exceeded,
# oldest request states will be evicted. Default: 65536 (1<<16).
# For more details, see: https://github.com/sgl-project/sglang/issues/2812
# Use power of 2 values for better memory allocation.
DETOKENIZER_MAX_STATES = int(os.environ.get("SGLANG_DETOKENIZER_MAX_STATES", 1 << 16))
@dataclasses.dataclass @dataclasses.dataclass
class DecodeStatus: class DecodeStatus:
...@@ -74,7 +81,7 @@ class DetokenizerManager: ...@@ -74,7 +81,7 @@ class DetokenizerManager:
revision=server_args.revision, revision=server_args.revision,
) )
self.decode_status = LimitedCapacityDict() self.decode_status = LimitedCapacityDict(capacity=DETOKENIZER_MAX_STATES)
def trim_matched_stop( def trim_matched_stop(
self, output: Union[str, List[int]], finished_reason: Dict, no_stop_trim: bool self, output: Union[str, List[int]], finished_reason: Dict, no_stop_trim: bool
...@@ -156,7 +163,17 @@ class DetokenizerManager: ...@@ -156,7 +163,17 @@ class DetokenizerManager:
# Incremental decoding # Incremental decoding
output_strs = [] output_strs = []
for i in range(bs): for i in range(bs):
try:
s = self.decode_status[recv_obj.rids[i]] s = self.decode_status[recv_obj.rids[i]]
except KeyError:
raise RuntimeError(
f"Decode status not found for request {recv_obj.rids[i]}. "
"It may be due to the request being evicted from the decode status due to memory pressure. "
"Please increase the maximum number of requests by setting "
"the SGLANG_DETOKENIZER_MAX_STATES environment variable to a bigger value than the default value. "
f"The current value is {DETOKENIZER_MAX_STATES}. "
"For more details, see: https://github.com/sgl-project/sglang/issues/2812"
)
new_text = read_texts[i][len(surr_texts[i]) :] new_text = read_texts[i][len(surr_texts[i]) :]
if recv_obj.finished_reasons[i] is None: if recv_obj.finished_reasons[i] is None:
# Streaming chunk: update the decode status # Streaming chunk: update the decode status
...@@ -197,7 +214,7 @@ class DetokenizerManager: ...@@ -197,7 +214,7 @@ class DetokenizerManager:
class LimitedCapacityDict(OrderedDict): class LimitedCapacityDict(OrderedDict):
def __init__(self, capacity=1 << 15, *args, **kwargs): def __init__(self, capacity: int, *args, **kwargs):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
self.capacity = capacity self.capacity = capacity
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment