Unverified Commit 5e36a0b4 authored by Yingchun Lai's avatar Yingchun Lai Committed by GitHub
Browse files

[metrics][EPLB]: Support selected count of physical experts on each GPU (#9825)

parent 0297773a
......@@ -6,14 +6,15 @@ SGLang supports various environment variables that can be used to configure its
## General Configuration
| Environment Variable | Description | Default Value |
| --- | --- | --- |
| `SGLANG_USE_MODELSCOPE` | Enable using models from ModelScope | `false` |
| `SGLANG_HOST_IP` | Host IP address for the server | `0.0.0.0` |
| `SGLANG_PORT` | Port for the server | auto-detected |
| `SGLANG_LOGGING_CONFIG_PATH` | Custom logging configuration path | Not set |
| `SGLANG_DISABLE_REQUEST_LOGGING` | Disable request logging | `false` |
| `SGLANG_HEALTH_CHECK_TIMEOUT` | Timeout for health check in seconds | `20` |
| Environment Variable | Description | Default Value |
|-------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------|---------------|
| `SGLANG_USE_MODELSCOPE` | Enable using models from ModelScope | `false` |
| `SGLANG_HOST_IP` | Host IP address for the server | `0.0.0.0` |
| `SGLANG_PORT` | Port for the server | auto-detected |
| `SGLANG_LOGGING_CONFIG_PATH` | Custom logging configuration path | Not set |
| `SGLANG_DISABLE_REQUEST_LOGGING` | Disable request logging | `false` |
| `SGLANG_HEALTH_CHECK_TIMEOUT` | Timeout for health check in seconds | `20` |
| `SGLANG_EPLB_HEATMAP_COLLECTION_INTERVAL` | The interval, in forward passes, at which to collect the metric of the selected count of physical experts on each layer and GPU rank. `0` means disabled. | `0` |
## Performance Tuning
......
......@@ -28,6 +28,7 @@ import torch
import torch.distributed
from sglang.srt.environ import envs
from sglang.srt.metrics.collector import ExpertDispatchCollector
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
from sglang.srt.server_args import ServerArgs
from sglang.srt.utils import Withable, is_npu
......@@ -661,6 +662,10 @@ class _UtilizationRateAccumulatorMixin(_Accumulator):
self.window_sizes = [10, 100, 1000]
self._history = _DequeCollection(maxlens=self.window_sizes)
self._rank = torch.distributed.get_rank()
self._expert_dispatch_collector = ExpertDispatchCollector(
self._expert_location_metadata.ep_size
)
self._collection_counter = 0
def append(
self,
......@@ -692,6 +697,8 @@ class _UtilizationRateAccumulatorMixin(_Accumulator):
)
if self._rank == 0:
self._collect_metrics_if_needed(gpu_physical_count)
utilization_rate_tensor = compute_utilization_rate(gpu_physical_count)
utilization_rate = torch.mean(utilization_rate_tensor).item()
self._history.append(utilization_rate)
......@@ -707,6 +714,31 @@ class _UtilizationRateAccumulatorMixin(_Accumulator):
# f"current_pass_per_layer={[round(x, 2) for x in utilization_rate_tensor.cpu().tolist()]}"
)
def _collect_metrics_if_needed(self, gpu_physical_count: torch.Tensor):
    # Export per-(layer, GPU-rank) selected-physical-expert counts to the
    # `sglang:eplb_gpu_physical_count` Prometheus histogram, one labelled
    # histogram child per layer.
    #
    # `gpu_physical_count` is indexed as [layer_idx, gpu_rank] (see the nested
    # loops below); assumes the entries are non-negative integer counts —
    # TODO confirm against the caller in `append`.
    #
    # sglang:eplb_gpu_physical_count metric is disabled if SGLANG_EPLB_HEATMAP_COLLECTION_INTERVAL <= 0
    if (
        envs.SGLANG_EPLB_HEATMAP_COLLECTION_INTERVAL > 0
        and self._collection_counter % envs.SGLANG_EPLB_HEATMAP_COLLECTION_INTERVAL
        == 0
    ):
        for layer_idx in range(self._expert_location_metadata.num_layers):
            count_of_layer = (
                self._expert_dispatch_collector.eplb_gpu_physical_count.labels(
                    layer=str(layer_idx)
                )
            )
            # The histogram's finite buckets are used as a fixed-size array
            # keyed by GPU rank: bucket i holds the dispatch count for rank i.
            # Exclude the +Inf bucket.
            assert (
                self._expert_location_metadata.ep_size
                == len(count_of_layer._buckets) - 1
            ), f"{self._expert_location_metadata.ep_size=}, {len(count_of_layer._buckets)=}"
            for gpu_rank in range(self._expert_location_metadata.ep_size):
                count = gpu_physical_count[layer_idx, gpu_rank]
                if count > 0:
                    # NOTE(review): this pokes prometheus_client private
                    # internals (`_sum`, `_buckets`) — presumably because
                    # Histogram.observe() cannot record a weighted sample.
                    # Each "observation" has value `gpu_rank` with weight
                    # `count`, so the running sum advances by count * gpu_rank
                    # to stay consistent with the bucket increment.
                    count_of_layer._sum.inc(count * gpu_rank)
                    count_of_layer._buckets[gpu_rank].inc(count)
    # Incremented on every call (not only when collection fires) so the
    # env-configured interval gates collection to every N-th pass.
    self._collection_counter += 1
class _DequeCollection:
def __init__(self, maxlens: List[int]):
......
......@@ -999,3 +999,16 @@ class StorageMetricsCollector:
self._log_histogram(self.histogram_prefetch_bandwidth, v)
for v in storage_metrics.backup_bandwidth:
self._log_histogram(self.histogram_backup_bandwidth, v)
class ExpertDispatchCollector:
    """Prometheus collector for EPLB expert-dispatch metrics.

    Exposes ``sglang:eplb_gpu_physical_count``, a per-layer histogram whose
    finite buckets (one per GPU rank, ``0 .. ep_size - 1``) record how many
    physical experts were selected on each rank.
    """

    def __init__(self, ep_size: int) -> None:
        """Create the histogram with one finite bucket per GPU rank.

        Args:
            ep_size: Expert-parallel world size, i.e. the number of GPU ranks.
        """
        # Imported lazily so prometheus_client is only required when the
        # collector is actually instantiated.
        from prometheus_client import Histogram

        # One finite bucket per GPU rank; prometheus_client appends the +Inf
        # bucket itself, so consumers see ep_size + 1 buckets in total.
        ep_size_buckets = list(range(ep_size))
        self.eplb_gpu_physical_count = Histogram(
            name="sglang:eplb_gpu_physical_count",
            documentation="The selected count of physical experts on each layer and GPU rank.",
            # A sequence (not a set) keeps label ordering deterministic.
            labelnames=["layer"],
            buckets=ep_size_buckets,
        )
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment