Unverified commit 55dfb539, authored by Lianmin Zheng and committed by GitHub
Browse files

[Auto Sync] Update scheduler_metrics_mixin.py, collector.py (20251104) (#12647)


Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: SangBin Cho <rkooo567@gmail.com>
parent 42889acb
...@@ -164,6 +164,8 @@ class SchedulerMetricsMixin: ...@@ -164,6 +164,8 @@ class SchedulerMetricsMixin:
self.stats.token_usage = token_usage self.stats.token_usage = token_usage
if self.is_hybrid: if self.is_hybrid:
self.stats.swa_token_usage = swa_token_usage self.stats.swa_token_usage = swa_token_usage
if self.is_hybrid_gdn:
self.stats.mamba_usage = mamba_usage
self.stats.num_queue_reqs = len(self.waiting_queue) self.stats.num_queue_reqs = len(self.waiting_queue)
self.stats.num_grammar_queue_reqs = len(self.grammar_queue) self.stats.num_grammar_queue_reqs = len(self.grammar_queue)
self.stats.cache_hit_rate = cache_hit_rate self.stats.cache_hit_rate = cache_hit_rate
...@@ -306,6 +308,8 @@ class SchedulerMetricsMixin: ...@@ -306,6 +308,8 @@ class SchedulerMetricsMixin:
self.stats.token_usage = token_usage self.stats.token_usage = token_usage
if self.is_hybrid: if self.is_hybrid:
self.stats.swa_token_usage = swa_token_usage self.stats.swa_token_usage = swa_token_usage
if self.is_hybrid_gdn:
self.stats.mamba_usage = mamba_usage
self.stats.gen_throughput = self.last_gen_throughput self.stats.gen_throughput = self.last_gen_throughput
self.stats.num_queue_reqs = len(self.waiting_queue) self.stats.num_queue_reqs = len(self.waiting_queue)
self.stats.num_grammar_queue_reqs = len(self.grammar_queue) self.stats.num_grammar_queue_reqs = len(self.grammar_queue)
......
...@@ -150,6 +150,7 @@ class SchedulerStats: ...@@ -150,6 +150,7 @@ class SchedulerStats:
token_usage: float = 0.0 token_usage: float = 0.0
pending_prealloc_token_usage: float = 0.0 pending_prealloc_token_usage: float = 0.0
swa_token_usage: float = 0.0 swa_token_usage: float = 0.0
mamba_usage: float = 0.0
gen_throughput: float = 0.0 gen_throughput: float = 0.0
num_queue_reqs: int = 0 num_queue_reqs: int = 0
num_grammar_queue_reqs: int = 0 num_grammar_queue_reqs: int = 0
...@@ -225,6 +226,12 @@ class SchedulerMetricsCollector: ...@@ -225,6 +226,12 @@ class SchedulerMetricsCollector:
labelnames=labels.keys(), labelnames=labels.keys(),
multiprocess_mode="mostrecent", multiprocess_mode="mostrecent",
) )
self.mamba_usage = Gauge(
name="sglang:mamba_usage",
documentation="The token usage for Mamba layers.",
labelnames=labels.keys(),
multiprocess_mode="mostrecent",
)
self.gen_throughput = Gauge( self.gen_throughput = Gauge(
name="sglang:gen_throughput", name="sglang:gen_throughput",
documentation="The generation throughput (token/s).", documentation="The generation throughput (token/s).",
...@@ -581,6 +588,7 @@ class SchedulerMetricsCollector: ...@@ -581,6 +588,7 @@ class SchedulerMetricsCollector:
self.pending_prealloc_token_usage, stats.pending_prealloc_token_usage self.pending_prealloc_token_usage, stats.pending_prealloc_token_usage
) )
self._log_gauge(self.swa_token_usage, stats.swa_token_usage) self._log_gauge(self.swa_token_usage, stats.swa_token_usage)
self._log_gauge(self.mamba_usage, stats.mamba_usage)
self._log_gauge(self.gen_throughput, stats.gen_throughput) self._log_gauge(self.gen_throughput, stats.gen_throughput)
self._log_gauge(self.num_queue_reqs, stats.num_queue_reqs) self._log_gauge(self.num_queue_reqs, stats.num_queue_reqs)
self._log_gauge(self.num_grammar_queue_reqs, stats.num_grammar_queue_reqs) self._log_gauge(self.num_grammar_queue_reqs, stats.num_grammar_queue_reqs)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment