Unverified commit 0305c505 authored by Wenxuan Tan, committed by GitHub

Reduce memory accumulation in long-running server (#8306)


Co-authored-by: Liangsheng Yin <hnyls2002@gmail.com>
parent 8675bdf2
@@ -30,7 +30,11 @@ class GlobalConfig:
        self.default_new_token_ratio_decay_steps = float(
            os.environ.get("SGLANG_NEW_TOKEN_RATIO_DECAY_STEPS", 600)
        )
        self.torch_empty_cache_interval = float(
            os.environ.get(
                "SGLANG_EMPTY_CACHE_INTERVAL", -1
            )  # in seconds. Set if you observe high memory accumulation over a long serving period.
        )
        # Runtime constants: others
        self.retract_decode_steps = 20
        self.flashinfer_workspace_size = os.environ.get(
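Usage sketch (not part of the diff): the new interval is read from the environment when GlobalConfig is constructed, so it has to be exported before the server process starts. The 300-second value and the import path below are illustrative assumptions, not values taken from this commit.

    import os

    # Flush the CUDA caching allocator after roughly 5 minutes of idle time.
    # The default of -1 (see the hunk above) leaves the periodic flush disabled.
    os.environ["SGLANG_EMPTY_CACHE_INTERVAL"] = "300"

    from sglang.global_config import global_config  # assumed module path

    print(global_config.torch_empty_cache_interval)  # 300.0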
@@ -2362,11 +2362,19 @@ class IdleSleeper:
    def __init__(self, sockets):
        self.poller = zmq.Poller()
        self.last_empty_time = time.time()

        for s in sockets:
            self.poller.register(s, zmq.POLLIN)

    def maybe_sleep(self):
        self.poller.poll(1000)
        if (
            global_config.torch_empty_cache_interval > 0
            and time.time() - self.last_empty_time
            > global_config.torch_empty_cache_interval
        ):
            self.last_empty_time = time.time()
            torch.cuda.empty_cache()


def is_health_check_generate_req(recv_req):
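For context, a minimal illustration of the effect maybe_sleep relies on: torch.cuda.empty_cache() returns cached-but-unused blocks to the driver, which is what keeps reserved memory from accumulating while the server sits idle. This snippet is not part of the commit and assumes a CUDA device is available; the 256 MiB allocation is an arbitrary example.

    import torch

    if torch.cuda.is_available():
        x = torch.empty(256 * 1024 * 1024, dtype=torch.uint8, device="cuda")
        del x  # freed tensors go back to the caching allocator, not the driver
        print("reserved before:", torch.cuda.memory_reserved())
        torch.cuda.empty_cache()  # hand the cached blocks back to the driver
        print("reserved after: ", torch.cuda.memory_reserved())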