Unverified Commit 641b7d0a authored by Lianmin Zheng, committed by GitHub

[Minor] Improve code style (#2422)

parent 0ce091a8
@@ -50,6 +50,7 @@ all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 all_hip = ["sglang[srt_hip]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 all_xpu = ["sglang[srt_xpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 all_hpu = ["sglang[srt_hpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 dev = ["sglang[all]", "sglang[test]"]
 dev_hip = ["sglang[all_hip]", "sglang[test]"]
 dev_xpu = ["sglang[all_xpu]", "sglang[test]"]
......
@@ -285,7 +285,7 @@ def throughput_test(
     else:
         raise ValueError('Please set backend to either "engine" or "runtime"')
-    tokenizer_id = server_args.model_path
+    tokenizer_id = server_args.tokenizer_path or server_args.model_path
     tokenizer = get_tokenizer(tokenizer_id)
     # Set global environmnets
......
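A note on the benchmark change above: `server_args.tokenizer_path or server_args.model_path` prefers an explicitly configured tokenizer path and falls back to the model path otherwise. A minimal sketch of that fallback (the values below are made up for illustration):

# Hypothetical values; in the real code they come from ServerArgs.
tokenizer_path = None                # no dedicated tokenizer configured
model_path = "my-org/my-model"       # placeholder model identifier

tokenizer_id = tokenizer_path or model_path  # same `or` fallback as in the diff
print(tokenizer_id)  # -> "my-org/my-model"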
@@ -117,6 +117,9 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
         key_type, key_string = key
         if key_type == "json":
             try:
+                if key_string == "$$ANY$$":
+                    ctx = self.grammar_compiler.compile_builtin_json_grammar()
+                else:
-                ctx = self.grammar_compiler.compile_json_schema(schema=key_string)
+                    ctx = self.grammar_compiler.compile_json_schema(schema=key_string)
             except RuntimeError as e:
                 logging.warning(
......
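A rough illustration of the dispatch added above, written as a free function rather than the real backend method; it assumes only the two compiler calls that appear in the diff:

def compile_json_key(grammar_compiler, key_string: str):
    # "$$ANY$$" is a sentinel meaning "any valid JSON document", so use the
    # built-in JSON grammar instead of compiling a specific schema.
    if key_string == "$$ANY$$":
        return grammar_compiler.compile_builtin_json_grammar()
    # Otherwise key_string is treated as a JSON schema to compile.
    return grammar_compiler.compile_json_schema(schema=key_string)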
@@ -5,7 +5,6 @@ from typing import TYPE_CHECKING
 import torch
 from sglang.srt.layers.attention import AttentionBackend
-from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 if TYPE_CHECKING:
......
@@ -48,7 +48,14 @@ class RadixAttention(nn.Module):
         self.sliding_window_size = sliding_window_size or -1
         self.is_cross_attention = is_cross_attention
-    def forward(self, q, k, v, forward_batch: ForwardBatch, save_kv_cache=True):
+    def forward(
+        self,
+        q,
+        k,
+        v,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+    ):
         if k is not None:
             # For cross-layer sharing, kv can be None
             assert v is not None
......
@@ -484,7 +484,7 @@ bid = 0
 @dataclasses.dataclass
 class ScheduleBatch:
-    """Store all inforamtion of a batch on the scheduler."""
+    """Store all information of a batch on the scheduler."""
     # Request, memory pool, and cache
     reqs: List[Req]
......
@@ -22,7 +22,7 @@ import signal
 import sys
 import time
 import uuid
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Union
 import fastapi
 import uvloop
......
@@ -127,7 +127,7 @@ class CudaGraphRunner:
         # Batch sizes to capture
         if model_runner.server_args.disable_cuda_graph_padding:
-            self.capture_bs = list(range(1, 32)) + [64, 128]
+            self.capture_bs = list(range(1, 33)) + [64, 128]
         else:
             self.capture_bs = [1, 2, 4] + [i * 8 for i in range(1, 21)]
......
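The change above bumps the exclusive end of `range` from 32 to 33, so batch size 32 is now captured in the padding-disabled branch. A quick check of both branches in plain Python:

# Padding-disabled branch: every batch size 1..32, plus 64 and 128.
capture_bs = list(range(1, 33)) + [64, 128]
print(capture_bs[-3:])  # [32, 64, 128]

# Default branch (unchanged): 1, 2, 4, then multiples of 8 up to 160.
capture_bs_default = [1, 2, 4] + [i * 8 for i in range(1, 21)]
print(capture_bs_default[:6], capture_bs_default[-1])  # [1, 2, 4, 8, 16, 24] 160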
@@ -242,20 +242,22 @@ class ModelRunner:
         if torch.cuda.get_device_capability()[1] < 5:
             raise RuntimeError("SGLang only supports sm75 and above.")
-        # Prepare the vllm model config
+        # Prepare the model config
         self.load_config = LoadConfig(
             load_format=self.server_args.load_format,
             download_dir=self.server_args.download_dir,
         )
         if self.server_args.load_format == "gguf":
             monkey_patch_vllm_gguf_config()
+        # Load the model
         self.model = get_model(
             model_config=self.model_config,
             load_config=self.load_config,
             device_config=DeviceConfig(self.device),
         )
+        # Parse other args
         self.sliding_window_size = (
             self.model.get_attention_sliding_window_size()
             if hasattr(self.model, "get_attention_sliding_window_size")
@@ -270,8 +272,10 @@ class ModelRunner:
             f"avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
         )
-    def update_weights_from_disk(self, model_path: str, load_format: str):
-        """Update engine weights online from disk."""
+    def update_weights_from_disk(
+        self, model_path: str, load_format: str
+    ) -> tuple[bool, str]:
+        """Update engine weights in-place from the disk."""
         from sglang.srt.model_loader.loader import (
             DefaultModelLoader,
             device_loading_context,
......
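The new `-> tuple[bool, str]` annotation documents the existing (success, message) return value rather than changing behavior. A hedged caller-side sketch; `model_runner` is assumed to be an already-initialized ModelRunner, and the path and load format are placeholders:

success, message = model_runner.update_weights_from_disk(
    model_path="/path/to/updated/checkpoint",  # placeholder path
    load_format="auto",                        # assumed load-format value
)
if not success:
    print(f"weight update failed: {message}")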
@@ -32,7 +32,6 @@ class Gemma2ForSequenceClassification(nn.Module):
     ) -> None:
         super().__init__()
         self.config = config
-        self.torchao_config = None
         self.quant_config = quant_config
         self.num_labels = config.num_labels
         self.model = Gemma2Model(config, quant_config=quant_config)
......
@@ -33,7 +33,6 @@ class LlamaForClassification(nn.Module):
     ) -> None:
         super().__init__()
         self.config = config
-        self.torchao_config = None
         self.quant_config = quant_config
         self.model = LlamaModel(config, quant_config=quant_config)
......
@@ -21,7 +21,6 @@ from transformers import LlamaConfig
 from sglang.srt.layers.pooler import EmbeddingPoolerOutput, Pooler, PoolingType
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
-from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.models.llama import LlamaForCausalLM, LlamaModel
@@ -33,7 +32,6 @@ class LlamaForSequenceClassification(nn.Module):
     ) -> None:
         super().__init__()
         self.config = config
-        self.torchao_config = None
         self.quant_config = quant_config
         self.num_labels = config.num_labels
         self.model = LlamaModel(config, quant_config=quant_config)
......
@@ -196,7 +196,7 @@ async def stop_profile_async():
 @app.post("/update_weights_from_disk")
 @time_func_latency
 async def update_weights_from_disk(obj: UpdateWeightFromDiskReqInput, request: Request):
-    """Update the weights from disk inplace without re-launching the server."""
+    """Update the weights from disk in-place without re-launching the server."""
     success, message = await tokenizer_manager.update_weights_from_disk(obj, request)
     content = {"success": success, "message": message}
     if success:
......
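For reference, a rough client-side call to this endpoint. The request body is assumed to mirror the `model_path` / `load_format` parameters of `ModelRunner.update_weights_from_disk` above, and the host and port are placeholders:

import requests

resp = requests.post(
    "http://localhost:30000/update_weights_from_disk",   # placeholder address
    json={"model_path": "/path/to/updated/checkpoint"},  # field name assumed
)
print(resp.json())  # expected shape: {"success": ..., "message": ...}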
@@ -169,7 +169,7 @@ def calculate_time(show=False, min_cost_ms=0.0):
     return wrapper
-def get_available_gpu_memory(device, gpu_id, distributed=False):
+def get_available_gpu_memory(device, gpu_id, distributed=False, empty_cache=True):
     """
     Get available memory for cuda:gpu_id device.
     When distributed is True, the available memory is the minimum available memory of all GPUs.
@@ -184,6 +184,7 @@ def get_available_gpu_memory(device, gpu_id, distributed=False):
             "which may cause useless memory allocation for torch CUDA context.",
         )
-        torch.cuda.empty_cache()
+        if empty_cache:
+            torch.cuda.empty_cache()
         free_gpu_memory, _ = torch.cuda.mem_get_info(gpu_id)
@@ -196,6 +197,8 @@ def get_available_gpu_memory(device, gpu_id, distributed=False):
             f"WARNING: current device is not {gpu_id}, but {torch.xpu.current_device()}, ",
             "which may cause useless memory allocation for torch XPU context.",
         )
-        torch.xpu.empty_cache()
+        if empty_cache:
+            torch.xpu.empty_cache()
         used_memory = torch.xpu.memory_allocated()
         total_gpu_memory = torch.xpu.get_device_properties(gpu_id).total_memory
......
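The new `empty_cache` flag defaults to True, so existing callers keep the old behavior; passing False skips the `torch.cuda.empty_cache()` / `torch.xpu.empty_cache()` call when only a quick reading is needed. A hedged usage sketch (the import path is assumed from context; the GB unit matches the `avail mem=... GB` log line in the ModelRunner hunk above):

from sglang.srt.utils import get_available_gpu_memory  # module path assumed

# Default: flush the allocator cache first, same behavior as before this change.
free_gb = get_available_gpu_memory("cuda", gpu_id=0)

# Skip the cache flush, e.g. when polling memory repeatedly (assumed use case).
free_gb_no_flush = get_available_gpu_memory("cuda", gpu_id=0, empty_cache=False)
print(f"{free_gb:.2f} GB vs {free_gb_no_flush:.2f} GB")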
@@ -15,7 +15,6 @@ suites = {
         "test_double_sparsity.py",
         "test_embedding_openai_server.py",
         "test_eval_accuracy_mini.py",
-        "test_fused_moe.py",
         "test_get_weights_by_name.py",
         "test_gguf.py",
         "test_input_embeddings.py",
......