Unverified Commit 5e36a0b4 authored by Yingchun Lai's avatar Yingchun Lai Committed by GitHub
Browse files

[metrics][EPLB]: Support selected count of physical experts on each GPU (#9825)

parent 0297773a
......@@ -6,14 +6,15 @@ SGLang supports various environment variables that can be used to configure its
## General Configuration
| Environment Variable | Description | Default Value |
| --- | --- | --- |
| `SGLANG_USE_MODELSCOPE` | Enable using models from ModelScope | `false` |
| `SGLANG_HOST_IP` | Host IP address for the server | `0.0.0.0` |
| `SGLANG_PORT` | Port for the server | auto-detected |
| `SGLANG_LOGGING_CONFIG_PATH` | Custom logging configuration path | Not set |
| `SGLANG_DISABLE_REQUEST_LOGGING` | Disable request logging | `false` |
| `SGLANG_HEALTH_CHECK_TIMEOUT` | Timeout for health check in seconds | `20` |
| Environment Variable | Description | Default Value |
|-------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------|---------------|
| `SGLANG_USE_MODELSCOPE` | Enable using models from ModelScope | `false` |
| `SGLANG_HOST_IP` | Host IP address for the server | `0.0.0.0` |
| `SGLANG_PORT` | Port for the server | auto-detected |
| `SGLANG_LOGGING_CONFIG_PATH` | Custom logging configuration path | Not set |
| `SGLANG_DISABLE_REQUEST_LOGGING` | Disable request logging | `false` |
| `SGLANG_HEALTH_CHECK_TIMEOUT` | Timeout for health check in seconds | `20` |
| `SGLANG_EPLB_HEATMAP_COLLECTION_INTERVAL` | The interval, in forward passes, at which to collect the metric of the selected count of physical experts on each layer and GPU rank. `0` means disabled. | `0` |
## Performance Tuning
......
......@@ -28,6 +28,7 @@ import torch
import torch.distributed
from sglang.srt.environ import envs
from sglang.srt.metrics.collector import ExpertDispatchCollector
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
from sglang.srt.server_args import ServerArgs
from sglang.srt.utils import Withable, is_npu
......@@ -661,6 +662,10 @@ class _UtilizationRateAccumulatorMixin(_Accumulator):
self.window_sizes = [10, 100, 1000]
self._history = _DequeCollection(maxlens=self.window_sizes)
self._rank = torch.distributed.get_rank()
self._expert_dispatch_collector = ExpertDispatchCollector(
self._expert_location_metadata.ep_size
)
self._collection_counter = 0
def append(
self,
......@@ -692,6 +697,8 @@ class _UtilizationRateAccumulatorMixin(_Accumulator):
)
if self._rank == 0:
self._collect_metrics_if_needed(gpu_physical_count)
utilization_rate_tensor = compute_utilization_rate(gpu_physical_count)
utilization_rate = torch.mean(utilization_rate_tensor).item()
self._history.append(utilization_rate)
......@@ -707,6 +714,31 @@ class _UtilizationRateAccumulatorMixin(_Accumulator):
# f"current_pass_per_layer={[round(x, 2) for x in utilization_rate_tensor.cpu().tolist()]}"
)
def _collect_metrics_if_needed(self, gpu_physical_count: torch.Tensor):
    # Export per-(layer, GPU-rank) selected-physical-expert counts to the
    # `sglang:eplb_gpu_physical_count` Prometheus histogram, one labelled
    # histogram child per layer.
    #
    # `gpu_physical_count` is indexed as [layer_idx, gpu_rank] (see the nested
    # loops below); assumes the entries are non-negative integer counts —
    # TODO confirm against the caller in `append`.
    #
    # sglang:eplb_gpu_physical_count metric is disabled if SGLANG_EPLB_HEATMAP_COLLECTION_INTERVAL <= 0
    if (
        envs.SGLANG_EPLB_HEATMAP_COLLECTION_INTERVAL > 0
        and self._collection_counter % envs.SGLANG_EPLB_HEATMAP_COLLECTION_INTERVAL
        == 0
    ):
        for layer_idx in range(self._expert_location_metadata.num_layers):
            count_of_layer = (
                self._expert_dispatch_collector.eplb_gpu_physical_count.labels(
                    layer=str(layer_idx)
                )
            )
            # The histogram's finite buckets are used as a fixed-size array
            # keyed by GPU rank: bucket i holds the dispatch count for rank i.
            # Exclude the +Inf bucket.
            assert (
                self._expert_location_metadata.ep_size
                == len(count_of_layer._buckets) - 1
            ), f"{self._expert_location_metadata.ep_size=}, {len(count_of_layer._buckets)=}"
            for gpu_rank in range(self._expert_location_metadata.ep_size):
                count = gpu_physical_count[layer_idx, gpu_rank]
                if count > 0:
                    # NOTE(review): this pokes prometheus_client private
                    # internals (`_sum`, `_buckets`) — presumably because
                    # Histogram.observe() cannot record a weighted sample.
                    # Each "observation" has value `gpu_rank` with weight
                    # `count`, so the running sum advances by count * gpu_rank
                    # to stay consistent with the bucket increment.
                    count_of_layer._sum.inc(count * gpu_rank)
                    count_of_layer._buckets[gpu_rank].inc(count)
    # Incremented on every call (not only when collection fires) so the
    # env-configured interval gates collection to every N-th pass.
    self._collection_counter += 1
class _DequeCollection:
def __init__(self, maxlens: List[int]):
......
......@@ -999,3 +999,16 @@ class StorageMetricsCollector:
self._log_histogram(self.histogram_prefetch_bandwidth, v)
for v in storage_metrics.backup_bandwidth:
self._log_histogram(self.histogram_backup_bandwidth, v)
class ExpertDispatchCollector:
    """Prometheus collector for EPLB expert-dispatch metrics.

    Exposes ``sglang:eplb_gpu_physical_count``, a per-layer histogram whose
    finite buckets (one per GPU rank, ``0 .. ep_size - 1``) record how many
    physical experts were selected on each rank.
    """

    def __init__(self, ep_size: int) -> None:
        """Create the histogram with one finite bucket per GPU rank.

        Args:
            ep_size: Expert-parallel world size, i.e. the number of GPU ranks.
        """
        # Imported lazily so prometheus_client is only required when the
        # collector is actually instantiated.
        from prometheus_client import Histogram

        # One finite bucket per GPU rank; prometheus_client appends the +Inf
        # bucket itself, so consumers see ep_size + 1 buckets in total.
        ep_size_buckets = list(range(ep_size))
        self.eplb_gpu_physical_count = Histogram(
            name="sglang:eplb_gpu_physical_count",
            documentation="The selected count of physical experts on each layer and GPU rank.",
            # A sequence (not a set) keeps label ordering deterministic.
            labelnames=["layer"],
            buckets=ep_size_buckets,
        )
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment