Unverified commit cd493b5a, authored by Lianmin Zheng and committed by GitHub

Improve metrics, logging, and importing orders (#2992)

parent 61f42b57
@@ -52,7 +52,7 @@ jobs:
    runs-on: 1-gpu-runner
    strategy:
      matrix:
-        range: [0-6, 6-16, 16-23, 23-30, 30-38, 38-100]
+        range: [0-6, 6-15, 15-22, 22-32, 32-37, 37-100]
    steps:
      - name: Checkout code
        uses: actions/checkout@v3
...
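For readers outside the CI setup: each matrix entry such as `0-6` is one shard of the per-commit test suite, and the re-balanced ranges above only move the shard boundaries. A minimal sketch of how such a range string can be turned into a slice of test files (the `parse_range` helper and the toy file list are illustrative, not the repository's actual interface):

def parse_range(spec: str) -> tuple[int, int]:
    # "0-2" -> (0, 2): run the test files with indices [0, 2) of the sorted suite.
    begin, end = spec.split("-")
    return int(begin), int(end)


# Illustrative suite; the real per-commit suite lives in test/srt/run_suite.py.
suite_files = sorted(["test_srt_endpoint.py", "test_torchao.py", "test_torch_compile.py"])
begin, end = parse_range("0-2")
print(suite_files[begin:end])  # the first two files of this toy suite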
"""
Usage:
python3 offline_batch_inference.py --model meta-llama/Llama-3.1-8B-Instruct
"""
import argparse import argparse
import dataclasses import dataclasses
......
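The new module docstring documents how the example is invoked. For context, offline batch inference through the Engine API looks roughly like the sketch below; the sampling-parameter dict and the `output["text"]` field follow the public Engine interface, but the example script's exact code may differ.

import argparse

import sglang as sgl


def main(model_path: str):
    # Engine runs the model in-process; no HTTP server is launched.
    llm = sgl.Engine(model_path=model_path)
    prompts = ["Hello, my name is", "The capital of France is"]
    sampling_params = {"temperature": 0.8, "top_p": 0.95}
    for prompt, output in zip(prompts, llm.generate(prompts, sampling_params)):
        print(f"{prompt!r} -> {output['text']!r}")
    llm.shutdown()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default="meta-llama/Llama-3.1-8B-Instruct")
    main(parser.parse_args().model)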
-# SGL API Components
+# SGLang public APIs
+# Frontend Language APIs
from sglang.api import (
    Engine,
    Runtime,
@@ -23,16 +24,26 @@ from sglang.api import (
    user_end,
    video,
)
+from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
from sglang.lang.choices import (
    greedy_token_selection,
    token_length_normalized,
    unconditional_likelihood_normalized,
)
+from sglang.utils import LazyImport
+Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
+LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
+OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
+VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI")
+# Other configs
+from sglang.global_config import global_config
+from sglang.version import __version__
-# SGLang DSL APIs
__all__ = [
-    "Runtime",
    "Engine",
+    "Runtime",
    "assistant",
    "assistant_begin",
    "assistant_end",
@@ -52,27 +63,14 @@ __all__ = [
    "user_begin",
    "user_end",
    "video",
+    "RuntimeEndpoint",
    "greedy_token_selection",
    "token_length_normalized",
    "unconditional_likelihood_normalized",
+    "Anthropic",
+    "LiteLLM",
+    "OpenAI",
+    "VertexAI",
+    "global_config",
+    "__version__",
]
-# Global Configurations
-from sglang.global_config import global_config
-__all__ += ["global_config"]
-from sglang.version import __version__
-__all__ += ["__version__"]
-# SGLang Backends
-from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
-from sglang.utils import LazyImport
-Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
-LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
-OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
-VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI")
-__all__ += ["Anthropic", "LiteLLM", "OpenAI", "VertexAI", "RuntimeEndpoint"]
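The reordered `__init__.py` binds the optional backends through `LazyImport`, so `import sglang` no longer pulls in the `anthropic`, `litellm`, `openai`, or `vertexai` packages until one of those classes is actually used. A rough illustration of how such a lazy-import shim can work (an independent sketch, not the actual implementation in `sglang.utils`):

import importlib


class LazyImport:
    """Defer `from module import name` until the object is first used."""

    def __init__(self, module_name: str, class_name: str):
        self.module_name = module_name
        self.class_name = class_name

    def _load(self):
        module = importlib.import_module(self.module_name)  # the real import happens here
        return getattr(module, self.class_name)

    def __getattr__(self, attr):
        return getattr(self._load(), attr)

    def __call__(self, *args, **kwargs):
        return self._load()(*args, **kwargs)


OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")  # nothing imported yet
# backend = OpenAI("gpt-4o-mini")  # first use would trigger the import of the openai backend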
@@ -19,9 +19,6 @@ from sglang.lang.ir import (
    REGEX_STR,
    SglSamplingParams,
)
-from sglang.srt.hf_transformers_utils import get_tokenizer
-from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import is_port_available, kill_process_tree
from sglang.utils import http_request
@@ -342,7 +339,7 @@ class Runtime:
    using the commond line interface.

    It is mainly used for the frontend language.
-    You should use the Engine class if you want to do normal offline processing.
+    You should use the Engine class if you want to do normal offline processing without the frontend language.
    """

    def __init__(
@@ -352,13 +349,14 @@ class Runtime:
        **kwargs,
    ):
        """See the arguments in server_args.py::ServerArgs"""
+        # We delay the import of any `sglang.srt` components in `sglang.lang`, so users can run
+        # client code without installing SRT server and its dependency if they want.
        from sglang.srt.server import launch_server
+        from sglang.srt.server_args import ServerArgs
+        from sglang.srt.utils import is_port_available

        self.server_args = ServerArgs(*args, log_level=log_level, **kwargs)
-        # before python program terminates, call shutdown implicitly. Therefore, users don't have to explicitly call .shutdown()
-        atexit.register(self.shutdown)
        # Pre-allocate ports
        for port in range(self.server_args.port, 40000):
            if is_port_available(port):
@@ -380,6 +378,10 @@ class Runtime:
        pipe_writer.close()
        self.pid = proc.pid
+        # Before python program terminates, call shutdown implicitly. Therefore, users don't have to explicitly call .shutdown()
+        atexit.register(self.shutdown)
+        # TODO: remove this pipe_writer mechanism and use `/health_generate` instead.
        try:
            init_state = pipe_reader.recv()
        except EOFError:
@@ -394,6 +396,8 @@ class Runtime:
        self.endpoint = RuntimeEndpoint(self.url)

    def shutdown(self):
+        from sglang.srt.utils import kill_process_tree
        if self.pid is not None:
            kill_process_tree(self.pid)
            self.pid = None
@@ -402,6 +406,8 @@ class Runtime:
        self.endpoint.cache_prefix(prefix)

    def get_tokenizer(self):
+        from sglang.srt.hf_transformers_utils import get_tokenizer
        return get_tokenizer(
            self.server_args.tokenizer_path,
            tokenizer_mode=self.server_args.tokenizer_mode,
...
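Together with the sharper docstring, the deferred `sglang.srt` imports mean a client that only drives the frontend language never needs the server-side dependencies installed until a local `Runtime` is actually launched. In rough terms the split between the two entry points looks like this (a sketch using the documented DSL primitives; `Engine` usage for offline processing is shown earlier, and argument names follow `ServerArgs`):

import sglang as sgl

# Frontend-language programs: back them with a local Runtime (or a remote RuntimeEndpoint).
runtime = sgl.Runtime(model_path="meta-llama/Llama-3.1-8B-Instruct")
sgl.set_default_backend(runtime)


@sgl.function
def qa(s, question):
    s += sgl.user(question)
    s += sgl.assistant(sgl.gen("answer", max_tokens=64))


state = qa.run(question="What is speculative decoding?")
print(state["answer"])

# atexit already registers shutdown, but calling it explicitly is still fine.
runtime.shutdown()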
@@ -785,8 +785,9 @@ class Scheduler:
                f"gen throughput (token/s): {gen_throughput:.2f}, "
                f"#queue-req: {len(self.waiting_queue)}"
            )
+            spec_accept_length = 0
        else:
-            accept_length = (
+            spec_accept_length = (
                self.spec_num_total_accepted_tokens / self.spec_num_total_forward_ct
            )
            self.spec_num_total_accepted_tokens = self.spec_num_total_forward_ct = 0
@@ -795,7 +796,7 @@ class Scheduler:
                f"#running-req: {num_running_reqs}, "
                f"#token: {num_used}, "
                f"token usage: {num_used / self.max_total_num_tokens:.2f}, "
-                f"accept len: {accept_length:.2f}, "
+                f"accept len: {spec_accept_length:.2f}, "
                f"gen throughput (token/s): {gen_throughput:.2f}, "
                f"#queue-req: {len(self.waiting_queue)}"
            )
@@ -807,6 +808,7 @@ class Scheduler:
            self.stats.token_usage = num_used / self.max_total_num_tokens
            self.stats.gen_throughput = gen_throughput
            self.stats.num_queue_reqs = len(self.waiting_queue)
+            self.stats.spec_accept_length = spec_accept_length
            self.metrics_collector.log_stats(self.stats)

    def check_memory(self):
...
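The renamed `spec_accept_length` is the mean number of tokens accepted per speculative-decoding forward pass over the logging window, and it is now set to 0 on the non-speculative path so the stats object always carries a defined value. A small worked example with made-up numbers:

spec_num_total_accepted_tokens = 240  # tokens kept across the logging window
spec_num_total_forward_ct = 80        # target-model forward passes in the same window
spec_accept_length = spec_num_total_accepted_tokens / spec_num_total_forward_ct
print(f"accept len: {spec_accept_length:.2f}")  # accept len: 3.00
# Both counters are then reset to 0 so the next log line covers a fresh window.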
@@ -25,6 +25,7 @@ class SchedulerStats:
    gen_throughput: float = 0.0
    num_queue_reqs: int = 0
    cache_hit_rate: float = 0.0
+    spec_accept_length: float = 0.0


class SchedulerMetricsCollector:
@@ -37,42 +38,49 @@ class SchedulerMetricsCollector:
        self.num_running_reqs = Gauge(
            name="sglang:num_running_reqs",
-            documentation="The number of running requests",
+            documentation="The number of running requests.",
            labelnames=labels.keys(),
            multiprocess_mode="sum",
        )
        self.num_used_tokens = Gauge(
            name="sglang:num_used_tokens",
-            documentation="The number of used tokens",
+            documentation="The number of used tokens.",
            labelnames=labels.keys(),
            multiprocess_mode="sum",
        )
        self.token_usage = Gauge(
            name="sglang:token_usage",
-            documentation="The token usage",
+            documentation="The token usage.",
            labelnames=labels.keys(),
            multiprocess_mode="mostrecent",
        )
        self.gen_throughput = Gauge(
            name="sglang:gen_throughput",
-            documentation="The generate throughput (token/s)",
+            documentation="The generation throughput (token/s).",
            labelnames=labels.keys(),
            multiprocess_mode="sum",
        )
        self.num_queue_reqs = Gauge(
            name="sglang:num_queue_reqs",
-            documentation="The number of requests in the waiting queue",
+            documentation="The number of requests in the waiting queue.",
            labelnames=labels.keys(),
            multiprocess_mode="sum",
        )
        self.cache_hit_rate = Gauge(
            name="sglang:cache_hit_rate",
-            documentation="The cache hit rate",
+            documentation="The prefix cache hit rate.",
            labelnames=labels.keys(),
            multiprocess_mode="mostrecent",
        )
+        self.spec_accept_length = Gauge(
+            name="sglang:spec_accept_length",
+            documentation="The average acceptance length of speculative decoding.",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
@@ -88,6 +96,7 @@ class SchedulerMetricsCollector:
        self._log_gauge(self.gen_throughput, stats.gen_throughput)
        self._log_gauge(self.num_queue_reqs, stats.num_queue_reqs)
        self._log_gauge(self.cache_hit_rate, stats.cache_hit_rate)
+        self._log_gauge(self.spec_accept_length, stats.spec_accept_length)


class TokenizerMetricsCollector:
...
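Each stat is exported as a labeled `prometheus_client` Gauge; `multiprocess_mode` controls how values from multiple scheduler processes are merged ("sum" for additive counts, "mostrecent" for ratios such as token usage and the new acceptance length). A minimal sketch of the write path, assuming `_log_gauge` simply attaches the label set and records the value:

from prometheus_client import Gauge

labels = {"model_name": "meta-llama/Llama-3.1-8B-Instruct"}

spec_accept_length = Gauge(
    name="sglang:spec_accept_length",
    documentation="The average acceptance length of speculative decoding.",
    labelnames=labels.keys(),
    multiprocess_mode="mostrecent",  # only takes effect when PROMETHEUS_MULTIPROC_DIR is set
)


def _log_gauge(gauge, value):
    # Mirrors the collector's helper: pick the labeled child, then set the value.
    gauge.labels(**labels).set(value)


_log_gauge(spec_accept_length, 3.0)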
# a lightweihgt wrapper on router with argument type and comments
-from .router import Router
+from sglang_router.router import Router
from sglang_router.version import __version__

# no wrapper on policy type => direct export
from sglang_router_rs import PolicyType

-__all__ = ["Router", "PolicyType"]
-__all__ += ["__version__"]
+__all__ = ["Router", "PolicyType", "__version__"]
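For context, the consolidated exports are what a launch script typically imports. A hedged usage sketch; the constructor arguments and the `start()` call are assumptions about the `sglang_router` API rather than something shown in this diff:

from sglang_router import PolicyType, Router

# Assumed usage: balance requests across two already-running SGLang workers.
router = Router(
    worker_urls=["http://localhost:30000", "http://localhost:30001"],
    policy=PolicyType.CacheAware,
)
router.start()  # serves a single endpoint that routes to the workers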
@@ -42,8 +42,7 @@ suites = {
        "test_srt_endpoint.py",
        "test_torch_compile.py",
        "test_torch_compile_moe.py",
-        # Temporarily disable this because it requires PyTorch >= 2.5
-        # "test_torch_native_attention_backend.py",
+        "test_torch_native_attention_backend.py",
        "test_torchao.py",
        "test_triton_attention_kernels.py",
        "test_triton_attention_backend.py",
...