Unverified Commit 641b7d0a authored by Lianmin Zheng, committed by GitHub

[Minor] Improve code style (#2422)

parent 0ce091a8
@@ -50,6 +50,7 @@ all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 all_hip = ["sglang[srt_hip]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 all_xpu = ["sglang[srt_xpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 all_hpu = ["sglang[srt_hpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 dev = ["sglang[all]", "sglang[test]"]
 dev_hip = ["sglang[all_hip]", "sglang[test]"]
 dev_xpu = ["sglang[all_xpu]", "sglang[test]"]
......
@@ -285,7 +285,7 @@ def throughput_test(
     else:
         raise ValueError('Please set backend to either "engine" or "runtime"')
-    tokenizer_id = server_args.model_path
+    tokenizer_id = server_args.tokenizer_path or server_args.model_path
     tokenizer = get_tokenizer(tokenizer_id)
     # Set global environmnets
......
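A note on the benchmark change above: `server_args.tokenizer_path or server_args.model_path` prefers an explicitly configured tokenizer path and falls back to the model path otherwise. A minimal sketch of that fallback (the values below are made up for illustration):

# Hypothetical values; in the real code they come from ServerArgs.
tokenizer_path = None                # no dedicated tokenizer configured
model_path = "my-org/my-model"       # placeholder model identifier

tokenizer_id = tokenizer_path or model_path  # same `or` fallback as in the diff
print(tokenizer_id)  # -> "my-org/my-model"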
@@ -117,6 +117,9 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
         key_type, key_string = key
         if key_type == "json":
             try:
+                if key_string == "$$ANY$$":
+                    ctx = self.grammar_compiler.compile_builtin_json_grammar()
+                else:
-                ctx = self.grammar_compiler.compile_json_schema(schema=key_string)
+                    ctx = self.grammar_compiler.compile_json_schema(schema=key_string)
             except RuntimeError as e:
                 logging.warning(
......
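A rough illustration of the dispatch added above, written as a free function rather than the real backend method; it assumes only the two compiler calls that appear in the diff:

def compile_json_key(grammar_compiler, key_string: str):
    # "$$ANY$$" is a sentinel meaning "any valid JSON document", so use the
    # built-in JSON grammar instead of compiling a specific schema.
    if key_string == "$$ANY$$":
        return grammar_compiler.compile_builtin_json_grammar()
    # Otherwise key_string is treated as a JSON schema to compile.
    return grammar_compiler.compile_json_schema(schema=key_string)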
@@ -5,7 +5,6 @@ from typing import TYPE_CHECKING
 import torch
 from sglang.srt.layers.attention import AttentionBackend
-from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 if TYPE_CHECKING:
......
@@ -48,7 +48,14 @@ class RadixAttention(nn.Module):
         self.sliding_window_size = sliding_window_size or -1
         self.is_cross_attention = is_cross_attention
-    def forward(self, q, k, v, forward_batch: ForwardBatch, save_kv_cache=True):
+    def forward(
+        self,
+        q,
+        k,
+        v,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+    ):
         if k is not None:
             # For cross-layer sharing, kv can be None
             assert v is not None
......
@@ -484,7 +484,7 @@ bid = 0
 @dataclasses.dataclass
 class ScheduleBatch:
-    """Store all inforamtion of a batch on the scheduler."""
+    """Store all information of a batch on the scheduler."""
     # Request, memory pool, and cache
     reqs: List[Req]
......
@@ -22,7 +22,7 @@ import signal
 import sys
 import time
 import uuid
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Union
 import fastapi
 import uvloop
......
@@ -127,7 +127,7 @@ class CudaGraphRunner:
         # Batch sizes to capture
         if model_runner.server_args.disable_cuda_graph_padding:
-            self.capture_bs = list(range(1, 32)) + [64, 128]
+            self.capture_bs = list(range(1, 33)) + [64, 128]
         else:
             self.capture_bs = [1, 2, 4] + [i * 8 for i in range(1, 21)]
......
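The change above bumps the exclusive end of `range` from 32 to 33, so batch size 32 is now captured in the padding-disabled branch. A quick check of both branches in plain Python:

# Padding-disabled branch: every batch size 1..32, plus 64 and 128.
capture_bs = list(range(1, 33)) + [64, 128]
print(capture_bs[-3:])  # [32, 64, 128]

# Default branch (unchanged): 1, 2, 4, then multiples of 8 up to 160.
capture_bs_default = [1, 2, 4] + [i * 8 for i in range(1, 21)]
print(capture_bs_default[:6], capture_bs_default[-1])  # [1, 2, 4, 8, 16, 24] 160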
@@ -242,20 +242,22 @@ class ModelRunner:
         if torch.cuda.get_device_capability()[1] < 5:
             raise RuntimeError("SGLang only supports sm75 and above.")
-        # Prepare the vllm model config
+        # Prepare the model config
         self.load_config = LoadConfig(
             load_format=self.server_args.load_format,
             download_dir=self.server_args.download_dir,
         )
         if self.server_args.load_format == "gguf":
             monkey_patch_vllm_gguf_config()
+        # Load the model
         self.model = get_model(
             model_config=self.model_config,
             load_config=self.load_config,
             device_config=DeviceConfig(self.device),
         )
+        # Parse other args
         self.sliding_window_size = (
             self.model.get_attention_sliding_window_size()
             if hasattr(self.model, "get_attention_sliding_window_size")
@@ -270,8 +272,10 @@ class ModelRunner:
             f"avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
         )
-    def update_weights_from_disk(self, model_path: str, load_format: str):
-        """Update engine weights online from disk."""
+    def update_weights_from_disk(
+        self, model_path: str, load_format: str
+    ) -> tuple[bool, str]:
+        """Update engine weights in-place from the disk."""
         from sglang.srt.model_loader.loader import (
             DefaultModelLoader,
             device_loading_context,
......
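The new `-> tuple[bool, str]` annotation documents the existing (success, message) return value rather than changing behavior. A hedged caller-side sketch; `model_runner` is assumed to be an already-initialized ModelRunner, and the path and load format are placeholders:

success, message = model_runner.update_weights_from_disk(
    model_path="/path/to/updated/checkpoint",  # placeholder path
    load_format="auto",                        # assumed load-format value
)
if not success:
    print(f"weight update failed: {message}")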
@@ -32,7 +32,6 @@ class Gemma2ForSequenceClassification(nn.Module):
     ) -> None:
         super().__init__()
         self.config = config
-        self.torchao_config = None
         self.quant_config = quant_config
         self.num_labels = config.num_labels
         self.model = Gemma2Model(config, quant_config=quant_config)
......
@@ -33,7 +33,6 @@ class LlamaForClassification(nn.Module):
     ) -> None:
         super().__init__()
         self.config = config
-        self.torchao_config = None
         self.quant_config = quant_config
         self.model = LlamaModel(config, quant_config=quant_config)
......
@@ -21,7 +21,6 @@ from transformers import LlamaConfig
 from sglang.srt.layers.pooler import EmbeddingPoolerOutput, Pooler, PoolingType
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
-from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.models.llama import LlamaForCausalLM, LlamaModel
@@ -33,7 +32,6 @@ class LlamaForSequenceClassification(nn.Module):
     ) -> None:
         super().__init__()
         self.config = config
-        self.torchao_config = None
         self.quant_config = quant_config
         self.num_labels = config.num_labels
         self.model = LlamaModel(config, quant_config=quant_config)
......
@@ -196,7 +196,7 @@ async def stop_profile_async():
 @app.post("/update_weights_from_disk")
 @time_func_latency
 async def update_weights_from_disk(obj: UpdateWeightFromDiskReqInput, request: Request):
-    """Update the weights from disk inplace without re-launching the server."""
+    """Update the weights from disk in-place without re-launching the server."""
     success, message = await tokenizer_manager.update_weights_from_disk(obj, request)
     content = {"success": success, "message": message}
     if success:
......
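For reference, a rough client-side call to this endpoint. The request body is assumed to mirror the `model_path` / `load_format` parameters of `ModelRunner.update_weights_from_disk` above, and the host and port are placeholders:

import requests

resp = requests.post(
    "http://localhost:30000/update_weights_from_disk",   # placeholder address
    json={"model_path": "/path/to/updated/checkpoint"},  # field name assumed
)
print(resp.json())  # expected shape: {"success": ..., "message": ...}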
@@ -169,7 +169,7 @@ def calculate_time(show=False, min_cost_ms=0.0):
     return wrapper
-def get_available_gpu_memory(device, gpu_id, distributed=False):
+def get_available_gpu_memory(device, gpu_id, distributed=False, empty_cache=True):
     """
     Get available memory for cuda:gpu_id device.
     When distributed is True, the available memory is the minimum available memory of all GPUs.
@@ -184,6 +184,7 @@ def get_available_gpu_memory(device, gpu_id, distributed=False):
             "which may cause useless memory allocation for torch CUDA context.",
         )
-        torch.cuda.empty_cache()
+        if empty_cache:
+            torch.cuda.empty_cache()
         free_gpu_memory, _ = torch.cuda.mem_get_info(gpu_id)
@@ -196,6 +197,8 @@ def get_available_gpu_memory(device, gpu_id, distributed=False):
             f"WARNING: current device is not {gpu_id}, but {torch.xpu.current_device()}, ",
             "which may cause useless memory allocation for torch XPU context.",
         )
-        torch.xpu.empty_cache()
+        if empty_cache:
+            torch.xpu.empty_cache()
         used_memory = torch.xpu.memory_allocated()
         total_gpu_memory = torch.xpu.get_device_properties(gpu_id).total_memory
......
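The new `empty_cache` flag defaults to True, so existing callers keep the old behavior; passing False skips the `torch.cuda.empty_cache()` / `torch.xpu.empty_cache()` call when only a quick reading is needed. A hedged usage sketch (the import path is assumed from context; the GB unit matches the `avail mem=... GB` log line in the ModelRunner hunk above):

from sglang.srt.utils import get_available_gpu_memory  # module path assumed

# Default: flush the allocator cache first, same behavior as before this change.
free_gb = get_available_gpu_memory("cuda", gpu_id=0)

# Skip the cache flush, e.g. when polling memory repeatedly (assumed use case).
free_gb_no_flush = get_available_gpu_memory("cuda", gpu_id=0, empty_cache=False)
print(f"{free_gb:.2f} GB vs {free_gb_no_flush:.2f} GB")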
@@ -15,7 +15,6 @@ suites = {
         "test_double_sparsity.py",
         "test_embedding_openai_server.py",
         "test_eval_accuracy_mini.py",
-        "test_fused_moe.py",
         "test_get_weights_by_name.py",
         "test_gguf.py",
         "test_input_embeddings.py",
......