"tests/python/common/vscode:/vscode.git/clone" did not exist on "74c9d27d16c75a1f67e73561bfca964b8502bd3b"
Unverified commit 22ec7bc2, authored by Lianmin Zheng and committed by GitHub
Browse files

Expose more arguments to control the scheduling policy (#32)

parent c0454b32
...@@ -4,9 +4,9 @@ from typing import Callable, List, Optional, Union ...@@ -4,9 +4,9 @@ from typing import Callable, List, Optional, Union
from sglang.backend.anthropic import Anthropic from sglang.backend.anthropic import Anthropic
from sglang.backend.base_backend import BaseBackend from sglang.backend.base_backend import BaseBackend
from sglang.backend.vertexai import VertexAI
from sglang.backend.openai import OpenAI from sglang.backend.openai import OpenAI
from sglang.backend.runtime_endpoint import RuntimeEndpoint from sglang.backend.runtime_endpoint import RuntimeEndpoint
from sglang.backend.vertexai import VertexAI
from sglang.global_config import global_config from sglang.global_config import global_config
from sglang.lang.ir import ( from sglang.lang.ir import (
SglExpr, SglExpr,
......
...@@ -54,7 +54,9 @@ class SglSamplingParams: ...@@ -54,7 +54,9 @@ class SglSamplingParams:
def to_vertexai_kwargs(self): def to_vertexai_kwargs(self):
if self.regex is not None: if self.regex is not None:
warnings.warn("Regular expression is not supported in the VertexAI backend.") warnings.warn(
"Regular expression is not supported in the VertexAI backend."
)
return { return {
"candidate_count": 1, "candidate_count": 1,
"max_output_tokens": self.max_new_tokens, "max_output_tokens": self.max_new_tokens,
...@@ -67,7 +69,9 @@ class SglSamplingParams: ...@@ -67,7 +69,9 @@ class SglSamplingParams:
def to_anthropic_kwargs(self): def to_anthropic_kwargs(self):
# Anthropic does not support frequency_penalty or presence_penalty, so we drop it here # Anthropic does not support frequency_penalty or presence_penalty, so we drop it here
if self.regex is not None: if self.regex is not None:
warnings.warn("Regular expression is not supported in the Anthropic backend.") warnings.warn(
"Regular expression is not supported in the Anthropic backend."
)
return { return {
"max_tokens_to_sample": self.max_new_tokens, "max_tokens_to_sample": self.max_new_tokens,
"stop_sequences": self.stop, "stop_sequences": self.stop,
......
...@@ -45,6 +45,7 @@ class ModelRpcServer(rpyc.Service): ...@@ -45,6 +45,7 @@ class ModelRpcServer(rpyc.Service):
self.tp_rank = tp_rank self.tp_rank = tp_rank
self.tp_size = server_args.tp_size self.tp_size = server_args.tp_size
self.schedule_heuristic = server_args.schedule_heuristic self.schedule_heuristic = server_args.schedule_heuristic
self.schedule_conservativeness = server_args.schedule_conservativeness
# Init model and tokenizer # Init model and tokenizer
self.model_config = ModelConfig( self.model_config = ModelConfig(
...@@ -248,7 +249,9 @@ class ModelRpcServer(rpyc.Service): ...@@ -248,7 +249,9 @@ class ModelRpcServer(rpyc.Service):
available_size = ( available_size = (
self.token_to_kv_pool.available_size() + self.tree_cache.evictable_size() self.token_to_kv_pool.available_size() + self.tree_cache.evictable_size()
) )
new_ratio = self.scheduler.new_token_estimation_ratio() new_ratio = (
self.scheduler.new_token_estimation_ratio() * self.schedule_conservativeness
)
if self.running_batch: if self.running_batch:
available_size -= sum( available_size -= sum(
[ [
......
...@@ -16,6 +16,7 @@ class ServerArgs: ...@@ -16,6 +16,7 @@ class ServerArgs:
tp_size: int = 1 tp_size: int = 1
model_mode: List[str] = () model_mode: List[str] = ()
schedule_heuristic: str = "lpm" schedule_heuristic: str = "lpm"
schedule_conservativeness: float = 1.0
random_seed: int = 42 random_seed: int = 42
stream_interval: int = 2 stream_interval: int = 2
disable_log_stats: bool = False disable_log_stats: bool = False
...@@ -85,7 +86,7 @@ class ServerArgs: ...@@ -85,7 +86,7 @@ class ServerArgs:
"--mem-fraction-static", "--mem-fraction-static",
type=float, type=float,
default=ServerArgs.mem_fraction_static, default=ServerArgs.mem_fraction_static,
help="The fraction of the memory used for static allocation (model weights and KV cache memory pool)", help="The fraction of the memory used for static allocation (model weights and KV cache memory pool). Use a smaller value if you see out-of-memory errors.",
) )
parser.add_argument( parser.add_argument(
"--tp-size", "--tp-size",
...@@ -107,6 +108,12 @@ class ServerArgs: ...@@ -107,6 +108,12 @@ class ServerArgs:
default=ServerArgs.schedule_heuristic, default=ServerArgs.schedule_heuristic,
help="Schudule mode: [lpm, weight, random, fcfs]", help="Schudule mode: [lpm, weight, random, fcfs]",
) )
parser.add_argument(
"--schedule-conservativeness",
type=float,
default=ServerArgs.schedule_conservativeness,
help="How conservative the schedule policy is. A larger value means more conservative scheduling. Use a larger value if you see out-of-memory errors.",
)
parser.add_argument( parser.add_argument(
"--random-seed", "--random-seed",
type=int, type=int,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment