Unverified Commit cd493b5a authored by Lianmin Zheng, committed by GitHub

Improve metrics, logging, and importing orders (#2992)

parent 61f42b57
......@@ -52,7 +52,7 @@ jobs:
runs-on: 1-gpu-runner
strategy:
matrix:
range: [0-6, 6-16, 16-23, 23-30, 30-38, 38-100]
range: [0-6, 6-15, 15-22, 22-32, 32-37, 37-100]
steps:
- name: Checkout code
uses: actions/checkout@v3
......
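The matrix entries behave like half-open slices over the ordered test list, which is why the end of one shard reappears as the start of the next. A hypothetical sketch of how such a `begin-end` string could pick a shard (the actual parsing in the test runner may differ):

```python
# Hypothetical sketch: slice an ordered test list with a "begin-end" range
# string such as "6-15". Assumes half-open [begin, end) semantics, which is
# why adjacent CI shards share a boundary number.
def select_shard(all_tests: list[str], range_str: str) -> list[str]:
    begin, end = (int(x) for x in range_str.split("-"))
    return all_tests[begin:end]
```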
"""
Usage:
python3 offline_batch_inference.py --model meta-llama/Llama-3.1-8B-Instruct
"""
import argparse
import dataclasses
......
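For reference, a minimal offline batch inference run with the engine might look like the sketch below; the argument names follow common SGLang examples and should be checked against the installed version.

```python
# Minimal sketch of offline batch inference, assuming the sgl.Engine API
# shown in SGLang examples (model_path, generate(prompts, sampling_params)).
import sglang as sgl

if __name__ == "__main__":
    llm = sgl.Engine(model_path="meta-llama/Llama-3.1-8B-Instruct")
    prompts = ["The capital of France is", "1 + 1 ="]
    sampling_params = {"temperature": 0.0, "max_new_tokens": 16}
    outputs = llm.generate(prompts, sampling_params)
    for prompt, out in zip(prompts, outputs):
        print(prompt, "->", out["text"])
    llm.shutdown()
```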
# SGL API Components
# SGLang public APIs
# Frontend Language APIs
from sglang.api import (
Engine,
Runtime,
......@@ -23,16 +24,26 @@ from sglang.api import (
user_end,
video,
)
from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
from sglang.lang.choices import (
greedy_token_selection,
token_length_normalized,
unconditional_likelihood_normalized,
)
from sglang.utils import LazyImport
Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI")
# Other configs
from sglang.global_config import global_config
from sglang.version import __version__
# SGLang DSL APIs
__all__ = [
"Runtime",
"Engine",
"Runtime",
"assistant",
"assistant_begin",
"assistant_end",
......@@ -52,27 +63,14 @@ __all__ = [
"user_begin",
"user_end",
"video",
"RuntimeEndpoint",
"greedy_token_selection",
"token_length_normalized",
"unconditional_likelihood_normalized",
"Anthropic",
"LiteLLM",
"OpenAI",
"VertexAI",
"global_config",
"__version__",
]
# Global Configurations
from sglang.global_config import global_config
__all__ += ["global_config"]
from sglang.version import __version__
__all__ += ["__version__"]
# SGLang Backends
from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
from sglang.utils import LazyImport
Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI")
__all__ += ["Anthropic", "LiteLLM", "OpenAI", "VertexAI", "RuntimeEndpoint"]
......@@ -19,9 +19,6 @@ from sglang.lang.ir import (
REGEX_STR,
SglSamplingParams,
)
from sglang.srt.hf_transformers_utils import get_tokenizer
from sglang.srt.server_args import ServerArgs
from sglang.srt.utils import is_port_available, kill_process_tree
from sglang.utils import http_request
......@@ -342,7 +339,7 @@ class Runtime:
using the command line interface.
It is mainly used for the frontend language.
You should use the Engine class if you want to do normal offline processing.
You should use the Engine class if you want to do normal offline processing without the frontend language.
"""
def __init__(
......@@ -352,13 +349,14 @@ class Runtime:
**kwargs,
):
"""See the arguments in server_args.py::ServerArgs"""
# We delay the import of any `sglang.srt` components in `sglang.lang`, so users can run
# client code without installing the SRT server and its dependencies if they want.
from sglang.srt.server import launch_server
from sglang.srt.server_args import ServerArgs
from sglang.srt.utils import is_port_available
self.server_args = ServerArgs(*args, log_level=log_level, **kwargs)
# before python program terminates, call shutdown implicitly. Therefore, users don't have to explicitly call .shutdown()
atexit.register(self.shutdown)
# Pre-allocate ports
for port in range(self.server_args.port, 40000):
if is_port_available(port):
......@@ -380,6 +378,10 @@ class Runtime:
pipe_writer.close()
self.pid = proc.pid
# Before python program terminates, call shutdown implicitly. Therefore, users don't have to explicitly call .shutdown()
atexit.register(self.shutdown)
# TODO: remove this pipe_writer mechanism and use `/health_generate` instead.
try:
init_state = pipe_reader.recv()
except EOFError:
......@@ -394,6 +396,8 @@ class Runtime:
self.endpoint = RuntimeEndpoint(self.url)
def shutdown(self):
from sglang.srt.utils import kill_process_tree
if self.pid is not None:
kill_process_tree(self.pid)
self.pid = None
......@@ -402,6 +406,8 @@ class Runtime:
self.endpoint.cache_prefix(prefix)
def get_tokenizer(self):
from sglang.srt.hf_transformers_utils import get_tokenizer
return get_tokenizer(
self.server_args.tokenizer_path,
tokenizer_mode=self.server_args.tokenizer_mode,
......
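Moving the `sglang.srt` imports inside the methods keeps the frontend-language package importable on machines that only run client code. A small, generic illustration of the deferred-import pattern (the `torch` dependency here is just an example, not part of this change):

```python
# Illustration of the deferred-import pattern used above: the optional heavy
# dependency (torch, as an example) is imported only when the method that
# needs it runs, so client-only installs can still import the package.
class RuntimeSketch:
    def launch(self):
        import torch  # deferred: fails only if this method is actually used

        return torch.cuda.is_available()
```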
......@@ -785,8 +785,9 @@ class Scheduler:
f"gen throughput (token/s): {gen_throughput:.2f}, "
f"#queue-req: {len(self.waiting_queue)}"
)
spec_accept_length = 0
else:
accept_length = (
spec_accept_length = (
self.spec_num_total_accepted_tokens / self.spec_num_total_forward_ct
)
self.spec_num_total_accepted_tokens = self.spec_num_total_forward_ct = 0
......@@ -795,7 +796,7 @@ class Scheduler:
f"#running-req: {num_running_reqs}, "
f"#token: {num_used}, "
f"token usage: {num_used / self.max_total_num_tokens:.2f}, "
f"accept len: {accept_length:.2f}, "
f"accept len: {spec_accept_length:.2f}, "
f"gen throughput (token/s): {gen_throughput:.2f}, "
f"#queue-req: {len(self.waiting_queue)}"
)
......@@ -807,6 +808,7 @@ class Scheduler:
self.stats.token_usage = num_used / self.max_total_num_tokens
self.stats.gen_throughput = gen_throughput
self.stats.num_queue_reqs = len(self.waiting_queue)
self.stats.spec_accept_length = spec_accept_length
self.metrics_collector.log_stats(self.stats)
def check_memory(self):
......
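The scheduler keeps two running counters, total accepted tokens and total forward passes, and reports their ratio as the average speculative-decoding acceptance length before resetting both. A standalone sketch of that bookkeeping:

```python
# Standalone sketch of the acceptance-length bookkeeping shown above:
# accumulate accepted tokens and forward passes, report the ratio at each
# logging interval, then reset the counters.
class SpecAcceptTracker:
    def __init__(self):
        self.num_total_accepted_tokens = 0
        self.num_total_forward_ct = 0

    def record(self, accepted_tokens: int) -> None:
        self.num_total_accepted_tokens += accepted_tokens
        self.num_total_forward_ct += 1

    def report_and_reset(self) -> float:
        if self.num_total_forward_ct == 0:
            return 0.0
        avg = self.num_total_accepted_tokens / self.num_total_forward_ct
        self.num_total_accepted_tokens = self.num_total_forward_ct = 0
        return avg
```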
......@@ -25,6 +25,7 @@ class SchedulerStats:
gen_throughput: float = 0.0
num_queue_reqs: int = 0
cache_hit_rate: float = 0.0
spec_accept_length: float = 0.0
class SchedulerMetricsCollector:
......@@ -37,42 +38,49 @@ class SchedulerMetricsCollector:
self.num_running_reqs = Gauge(
name="sglang:num_running_reqs",
documentation="The number of running requests",
documentation="The number of running requests.",
labelnames=labels.keys(),
multiprocess_mode="sum",
)
self.num_used_tokens = Gauge(
name="sglang:num_used_tokens",
documentation="The number of used tokens",
documentation="The number of used tokens.",
labelnames=labels.keys(),
multiprocess_mode="sum",
)
self.token_usage = Gauge(
name="sglang:token_usage",
documentation="The token usage",
documentation="The token usage.",
labelnames=labels.keys(),
multiprocess_mode="mostrecent",
)
self.gen_throughput = Gauge(
name="sglang:gen_throughput",
documentation="The generate throughput (token/s)",
documentation="The generation throughput (token/s).",
labelnames=labels.keys(),
multiprocess_mode="sum",
)
self.num_queue_reqs = Gauge(
name="sglang:num_queue_reqs",
documentation="The number of requests in the waiting queue",
documentation="The number of requests in the waiting queue.",
labelnames=labels.keys(),
multiprocess_mode="sum",
)
self.cache_hit_rate = Gauge(
name="sglang:cache_hit_rate",
documentation="The cache hit rate",
documentation="The prefix cache hit rate.",
labelnames=labels.keys(),
multiprocess_mode="mostrecent",
)
self.spec_accept_length = Gauge(
name="sglang:spec_accept_length",
documentation="The average acceptance length of speculative decoding.",
labelnames=labels.keys(),
multiprocess_mode="mostrecent",
)
......@@ -88,6 +96,7 @@ class SchedulerMetricsCollector:
self._log_gauge(self.gen_throughput, stats.gen_throughput)
self._log_gauge(self.num_queue_reqs, stats.num_queue_reqs)
self._log_gauge(self.cache_hit_rate, stats.cache_hit_rate)
self._log_gauge(self.spec_accept_length, stats.spec_accept_length)
class TokenizerMetricsCollector:
......
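The new gauge follows the same pattern as the existing ones: a `prometheus_client.Gauge` keyed by the scheduler labels, with `multiprocess_mode="mostrecent"` so the latest sample wins across worker processes. A minimal sketch of registering and updating such a gauge (the label names here are illustrative, not necessarily the exact labels SGLang uses; `mostrecent` requires a reasonably recent prometheus_client):

```python
# Minimal sketch using prometheus_client directly, mirroring how the new
# spec_accept_length gauge is registered and updated. The "model_name" label
# is an assumption for illustration.
from prometheus_client import Gauge

labels = {"model_name": "llama-3.1-8b-instruct"}

spec_accept_length = Gauge(
    name="sglang:spec_accept_length",
    documentation="The average acceptance length of speculative decoding.",
    labelnames=labels.keys(),
    multiprocess_mode="mostrecent",
)

# Roughly what the collector's _log_gauge(gauge, value) call boils down to.
spec_accept_length.labels(**labels).set(2.37)
```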
# a lightweight wrapper on the router with argument types and comments
from sglang_router_rs import PolicyType
# no wrapper on policy type => direct export
from .router import Router
__all__ = ["Router", "PolicyType"]
from sglang_router.router import Router
from sglang_router.version import __version__
from sglang_router_rs import PolicyType
__all__ += ["__version__"]
__all__ = ["Router", "PolicyType", "__version__"]
......@@ -42,8 +42,7 @@ suites = {
"test_srt_endpoint.py",
"test_torch_compile.py",
"test_torch_compile_moe.py",
# Temporarily disable this because it requires PyTorch >= 2.5
# "test_torch_native_attention_backend.py",
"test_torch_native_attention_backend.py",
"test_torchao.py",
"test_triton_attention_kernels.py",
"test_triton_attention_backend.py",
......