"vscode:/vscode.git/clone" did not exist on "95fadb089f6eab102403542fd503764c8fcb0baa"
Unverified Commit 5e36a0b4 authored by Yingchun Lai, committed by GitHub

[metrics][EPLB]: Support selected count of physical experts on each GPU (#9825)

parent 0297773a
@@ -7,13 +7,14 @@ SGLang supports various environment variables that can be used to configure its
## General Configuration

| Environment Variable | Description | Default Value |
| --- | --- | --- |
| `SGLANG_USE_MODELSCOPE` | Enable using models from ModelScope | `false` |
| `SGLANG_HOST_IP` | Host IP address for the server | `0.0.0.0` |
| `SGLANG_PORT` | Port for the server | auto-detected |
| `SGLANG_LOGGING_CONFIG_PATH` | Custom logging configuration path | Not set |
| `SGLANG_DISABLE_REQUEST_LOGGING` | Disable request logging | `false` |
| `SGLANG_HEALTH_CHECK_TIMEOUT` | Timeout for health check in seconds | `20` |
| `SGLANG_EPLB_HEATMAP_COLLECTION_INTERVAL` | Interval, in forward passes, at which to collect the selected count of physical experts on each layer and GPU rank. `0` disables the metric. | `0` |
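As a hedged illustration (the variable's semantics come from the table above; the exact launch flow is an assumption), the new heatmap metric could be enabled by exporting the variable before the server process starts:

```python
import os

# Assumption: SGLang reads this variable at startup via sglang.srt.environ.envs.
# Collect the per-layer, per-rank expert dispatch counts every 100 passes.
os.environ["SGLANG_EPLB_HEATMAP_COLLECTION_INTERVAL"] = "100"
```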
## Performance Tuning
...
@@ -28,6 +28,7 @@ import torch
import torch.distributed

from sglang.srt.environ import envs
from sglang.srt.metrics.collector import ExpertDispatchCollector
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
from sglang.srt.server_args import ServerArgs
from sglang.srt.utils import Withable, is_npu
@@ -661,6 +662,10 @@ class _UtilizationRateAccumulatorMixin(_Accumulator):
        self.window_sizes = [10, 100, 1000]
        self._history = _DequeCollection(maxlens=self.window_sizes)
        self._rank = torch.distributed.get_rank()
        self._expert_dispatch_collector = ExpertDispatchCollector(
            self._expert_location_metadata.ep_size
        )
        self._collection_counter = 0

    def append(
        self,
@@ -692,6 +697,8 @@ class _UtilizationRateAccumulatorMixin(_Accumulator):
        )

        if self._rank == 0:
            self._collect_metrics_if_needed(gpu_physical_count)

            utilization_rate_tensor = compute_utilization_rate(gpu_physical_count)
            utilization_rate = torch.mean(utilization_rate_tensor).item()
            self._history.append(utilization_rate)
@@ -707,6 +714,31 @@ class _UtilizationRateAccumulatorMixin(_Accumulator):
# f"current_pass_per_layer={[round(x, 2) for x in utilization_rate_tensor.cpu().tolist()]}" # f"current_pass_per_layer={[round(x, 2) for x in utilization_rate_tensor.cpu().tolist()]}"
) )
    def _collect_metrics_if_needed(self, gpu_physical_count: torch.Tensor):
        # The sglang:eplb_gpu_physical_count metric is disabled if SGLANG_EPLB_HEATMAP_COLLECTION_INTERVAL <= 0.
        if (
            envs.SGLANG_EPLB_HEATMAP_COLLECTION_INTERVAL > 0
            and self._collection_counter % envs.SGLANG_EPLB_HEATMAP_COLLECTION_INTERVAL
            == 0
        ):
            for layer_idx in range(self._expert_location_metadata.num_layers):
                count_of_layer = (
                    self._expert_dispatch_collector.eplb_gpu_physical_count.labels(
                        layer=str(layer_idx)
                    )
                )
                # Exclude the +Inf bucket that prometheus_client appends automatically.
                assert (
                    self._expert_location_metadata.ep_size
                    == len(count_of_layer._buckets) - 1
                ), f"{self._expert_location_metadata.ep_size=}, {len(count_of_layer._buckets)=}"
                for gpu_rank in range(self._expert_location_metadata.ep_size):
                    count = gpu_physical_count[layer_idx, gpu_rank]
                    if count > 0:
                        # The bucket index doubles as the GPU rank; each bucket is
                        # incremented independently rather than cumulatively.
                        count_of_layer._sum.inc(count * gpu_rank)
                        count_of_layer._buckets[gpu_rank].inc(count)
        self._collection_counter += 1
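The method above repurposes a Prometheus Histogram as a per-rank counter vector: the bucket index stands for the GPU rank, and each bucket counter is bumped on its own rather than cumulatively, which is what lets a heatmap panel render dispatch counts per rank over time. Below is a minimal standalone sketch of that trick, assuming only `prometheus_client`; the demo metric name, `ep_size`, and counts are hypothetical, not SGLang code:

```python
from prometheus_client import Histogram, generate_latest

ep_size = 4  # hypothetical EP world size
hist = Histogram(
    name="demo_eplb_gpu_physical_count",
    documentation="Selected count of physical experts per GPU rank (demo).",
    labelnames=("layer",),
    buckets=list(range(ep_size)),  # one boundary per rank; +Inf is appended
)

child = hist.labels(layer="0")
per_rank_counts = [5, 0, 3, 2]  # hypothetical counts for ranks 0..3
for gpu_rank, count in enumerate(per_rank_counts):
    if count > 0:
        child._sum.inc(count * gpu_rank)     # private API, mirrors the patch
        child._buckets[gpu_rank].inc(count)  # bucket index doubles as rank

print(generate_latest().decode())  # inspect the exposed bucket series
```

Note that this deliberately breaks the cumulative `le` semantics of a normal histogram: each `le="K"` series must be read as a standalone counter for rank `K`, and `_sum`/`_buckets` are private `prometheus_client` attributes rather than public API.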
class _DequeCollection:
    def __init__(self, maxlens: List[int]):
...
@@ -999,3 +999,16 @@ class StorageMetricsCollector:
            self._log_histogram(self.histogram_prefetch_bandwidth, v)
        for v in storage_metrics.backup_bandwidth:
            self._log_histogram(self.histogram_backup_bandwidth, v)

class ExpertDispatchCollector:
    def __init__(self, ep_size: int) -> None:
        from prometheus_client import Histogram

        # One bucket boundary per GPU rank; prometheus_client appends +Inf itself.
        ep_size_buckets = [i for i in range(ep_size)]
        self.eplb_gpu_physical_count = Histogram(
            name="sglang:eplb_gpu_physical_count",
            documentation="The selected count of physical experts on each layer and GPU rank.",
            labelnames={"layer"},
            buckets=ep_size_buckets,
        )
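For completeness, a hedged usage sketch of the new collector; the `ep_size` value here is hypothetical:

```python
# One collector per process; one labeled child per MoE layer.
collector = ExpertDispatchCollector(ep_size=8)
layer0 = collector.eplb_gpu_physical_count.labels(layer="0")

# prometheus_client appends a +Inf bucket, hence the `- 1` in the assertion
# inside _collect_metrics_if_needed: eight rank buckets plus +Inf.
assert len(layer0._buckets) - 1 == 8
```

On the scrape side, each `sglang:eplb_gpu_physical_count_bucket` series for a given `layer` then carries the dispatch count of the rank encoded in its `le` label, per the bucket-per-rank encoding described above.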