Commit 11616fc6 in sglang (unverified)

Minor fix in compiler & format (#545)

Authored by sglang on Jun 29, 2024; committed via GitHub on Jun 29, 2024.
Parent commit: 9ce89bc1
Showing 12 changed files with 28 additions and 33 deletions.
benchmark/latency_throughput/bench_serving.py         +0 -1
benchmark/line_retrieval/gen_data.py                  +3 -3
python/sglang/backend/litellm.py                      +0 -1
python/sglang/lang/compiler.py                        +2 -2
python/sglang/launch_server_llavavid.py               +0 -1
python/sglang/srt/managers/controller/infer_batch.py  +4 -3
python/sglang/srt/managers/controller/tp_worker.py    +8 -7
python/sglang/srt/managers/io_struct.py               +0 -1
python/sglang/srt/managers/tokenizer_manager.py       +8 -8
python/sglang/srt/models/chatglm.py                   +0 -2
python/sglang/srt/models/grok.py                      +0 -1
python/sglang/srt/models/llama2.py                    +3 -3
benchmark/latency_throughput/bench_serving.py

@@ -38,7 +38,6 @@ def sample_requests(
     num_requests: int,
     tokenizer: AutoTokenizer,
 ) -> List[Tuple[str, int, int]]:
     def load_dataset():
         with open(dataset_path, encoding="utf-8") as f:
             dataset = json.load(f)
...
benchmark/line_retrieval/gen_data.py

@@ -48,9 +48,9 @@ def generate_lines(random_words, num_lines, redirect_ratio):
     )
     for i in redirect_indices:
         target_idx = np.random.choice(min(i * 2 + 100, num_lines))
-        lines[i] = (
-            f"Line {indices[i]}: The REGISTER_CONTENT is the same as Line {indices[target_idx]}."
-        )
+        lines[
+            i
+        ] = f"Line {indices[i]}: The REGISTER_CONTENT is the same as Line {indices[target_idx]}."
         redirects[i] = target_idx

     # Build links and find sources
...
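Note on the context line above: np.random.choice with a scalar argument n samples uniformly from range(n), so each redirect target lands strictly below min(i * 2 + 100, num_lines). A minimal sketch with illustrative values (not taken from the benchmark):

import numpy as np

# choice(160) draws an integer uniformly from [0, 160)
i, num_lines = 30, 1000
target_idx = np.random.choice(min(i * 2 + 100, num_lines))
assert 0 <= target_idx < 160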
python/sglang/backend/litellm.py

@@ -13,7 +13,6 @@ except ImportError as e:
 class LiteLLM(BaseBackend):
     def __init__(
         self,
         model_name,
...
python/sglang/lang/compiler.py

@@ -4,7 +4,7 @@ from queue import Queue
 from typing import List, Union

 from sglang.global_config import global_config
-from sglang.lang.interpreter import ProgramState, StreamExecutor, pin_program
+from sglang.lang.interpreter import ProgramState, StreamExecutor, cache_program
 from sglang.lang.ir import (
     SglArgument,
     SglConstantText,

@@ -184,7 +184,7 @@ class CompiledFunction:
         # Extract prefix by tracing and cache it
         if len(batch_kwargs) > 1:
-            pin_program(self.function, backend)
+            cache_program(self.function, backend)

         # Run all programs
         if num_threads == "auto":
...
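The functional change in this file swaps a reference to the removed pin_program helper for cache_program, which traces the function once and caches its shared prompt prefix before a batch run. A rough usage sketch, assuming the sglang frontend API of that era (the decorator, compile(), and run_batch() are not shown in this diff, so treat their exact signatures as illustrative):

import sglang as sgl

@sgl.function
def qa(s, question):
    s += "Q: " + question + "\n"
    s += "A: " + sgl.gen("answer", max_tokens=32)

# run_batch with more than one kwargs dict hits the len(batch_kwargs) > 1
# branch above, which now calls cache_program instead of pin_program
compiled = qa.compile()
states = compiled.run_batch(
    [{"question": "What is 1 + 1?"}, {"question": "Name a prime."}],
    num_threads="auto",
)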
python/sglang/launch_server_llavavid.py

@@ -6,7 +6,6 @@ import multiprocessing as mp
 from sglang.srt.server import ServerArgs, launch_server

 if __name__ == "__main__":
     model_overide_args = {}
     model_overide_args["mm_spatial_pool_stride"] = 2
...
python/sglang/srt/managers/controller/infer_batch.py

@@ -498,9 +498,10 @@ class Batch:
                 req.output_ids = cur_output_ids
                 continue

-            jump_forward_str, next_state = (
-                req.jump_forward_map.jump_forward_symbol(cur_state)
-            )
+            (
+                jump_forward_str,
+                next_state,
+            ) = req.jump_forward_map.jump_forward_symbol(cur_state)

             # Make the incrementally decoded text part of jump_forward_str
             # so that the UTF-8 will not corrupt
...
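The rewrite above only changes how the assignment is wrapped; a parenthesized tuple target unpacks exactly like a bare one. A runnable sketch with a toy stand-in for jump_forward_symbol:

def jump_forward_symbol(cur_state):
    # toy stand-in for JumpForwardMap.jump_forward_symbol, not the real one
    return "abc", cur_state + 1

(
    jump_forward_str,
    next_state,
) = jump_forward_symbol(0)
assert (jump_forward_str, next_state) == ("abc", 1)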
python/sglang/srt/managers/controller/tp_worker.py

@@ -283,13 +283,14 @@ class ModelTpServer:
                 (recv_req.image_hash >> 64) % self.model_config.vocab_size,
             ]
             req.image_size = recv_req.image_size
-            req.origin_input_ids, req.image_offset = (
-                self.model_runner.model.pad_input_ids(
+            (
+                req.origin_input_ids,
+                req.image_offset,
+            ) = self.model_runner.model.pad_input_ids(
                 req.origin_input_ids_unpadded,
                 req.pad_value,
                 req.pixel_values.shape,
                 req.image_size,
-                )
             )
             req.sampling_params = recv_req.sampling_params
             req.return_logprob = recv_req.return_logprob
...
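For context, the pad values above are derived from the image hash so that the padded token ids stay inside the vocabulary. A minimal sketch of that arithmetic (the hash value and vocabulary size are illustrative, not from this diff):

# shifting selects a different 64-bit slice of the hash; the modulo maps it
# into [0, vocab_size)
vocab_size = 32000
image_hash = 0x0123_4567_89AB_CDEF_FEDC_BA98_7654_3210
pad_token = (image_hash >> 64) % vocab_size
assert 0 <= pad_token < vocab_size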
python/sglang/srt/managers/io_struct.py

@@ -35,7 +35,6 @@ class GenerateReqInput:
     stream: bool = False

     def post_init(self):
         if (self.text is None and self.input_ids is None) or (
             self.text is not None and self.input_ids is not None
         ):
...
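The condition shown above enforces that exactly one of text and input_ids is supplied. A small sketch of the same check in isolation (the error message is illustrative, not taken from this diff):

def check(text, input_ids):
    # reject "neither" and "both"; accept exactly one
    if (text is None and input_ids is None) or (
        text is not None and input_ids is not None
    ):
        raise ValueError("Provide either text or input_ids, not both or neither.")

check("hello", None)    # ok
check(None, [1, 2, 3])  # ok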
python/sglang/srt/managers/tokenizer_manager.py

@@ -334,15 +334,15 @@ class TokenizerManager:
                 ret["meta_info"]["decode_token_logprobs"], return_text_in_logprobs
             )
             if top_logprobs_num > 0:
-                ret["meta_info"]["prefill_top_logprobs"] = (
-                    self.detokenize_top_logprobs_tokens(
-                        ret["meta_info"]["prefill_top_logprobs"], return_text_in_logprobs
-                    )
-                )
-                ret["meta_info"]["decode_top_logprobs"] = (
-                    self.detokenize_top_logprobs_tokens(
-                        ret["meta_info"]["decode_top_logprobs"], return_text_in_logprobs
-                    )
-                )
+                ret["meta_info"][
+                    "prefill_top_logprobs"
+                ] = self.detokenize_top_logprobs_tokens(
+                    ret["meta_info"]["prefill_top_logprobs"], return_text_in_logprobs
+                )
+                ret["meta_info"][
+                    "decode_top_logprobs"
+                ] = self.detokenize_top_logprobs_tokens(
+                    ret["meta_info"]["decode_top_logprobs"], return_text_in_logprobs
+                )
         return ret
...
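The reshaped calls above feed per-position (logprob, token_id) pairs through detokenize_top_logprobs_tokens so callers can get decoded text back. A hypothetical stand-in showing the shape of that transformation (not the real sglang implementation):

def detokenize_top_logprobs_tokens(top_logprobs, return_text_in_logprobs):
    toy_vocab = {7: "hello", 9: "world"}  # toy "tokenizer"
    if not return_text_in_logprobs:
        return top_logprobs
    return [
        [(logprob, token_id, toy_vocab.get(token_id, "<unk>"))
         for logprob, token_id in position]
        for position in top_logprobs
    ]

meta_info = {"prefill_top_logprobs": [[(-0.1, 7), (-2.3, 9)]]}
meta_info["prefill_top_logprobs"] = detokenize_top_logprobs_tokens(
    meta_info["prefill_top_logprobs"], True
)
assert meta_info["prefill_top_logprobs"][0][0] == (-0.1, 7, "hello")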
python/sglang/srt/models/chatglm.py

@@ -36,7 +36,6 @@ LoraConfig = None
 class GLMAttention(nn.Module):
     def __init__(
         self,
         config,
...

@@ -294,7 +293,6 @@ class GLMTransformer(nn.Module):
 class ChatGLMModel(nn.Module):
     def __init__(
         self,
         config,
...
python/sglang/srt/models/grok.py

@@ -521,7 +521,6 @@ class Grok1DecoderLayer(nn.Module):
         hidden_states: torch.Tensor,
         input_metadata: InputMetadata,
     ) -> torch.Tensor:
         hidden_states = (
             self.post_attn_norm(
                 self.self_attn(
...
python/sglang/srt/models/llama2.py

@@ -160,9 +160,9 @@ class LlamaDecoderLayer(nn.Module):
         if rope_scaling is not None and getattr(
             config, "original_max_position_embeddings", None
         ):
-            rope_scaling["original_max_position_embeddings"] = (
-                config.original_max_position_embeddings
-            )
+            rope_scaling[
+                "original_max_position_embeddings"
+            ] = config.original_max_position_embeddings
         max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
         self.self_attn = LlamaAttention(
             hidden_size=self.hidden_size,
...
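The guarded copy above only fires when the config actually defines original_max_position_embeddings, as some rope-scaled checkpoints do. A minimal sketch, with Cfg standing in for the HF model config and illustrative rope_scaling contents:

class Cfg:
    # stand-in for a HF config that defines the original context length
    original_max_position_embeddings = 4096

config = Cfg()
rope_scaling = {"type": "yarn", "factor": 4.0}
if rope_scaling is not None and getattr(
    config, "original_max_position_embeddings", None
):
    rope_scaling[
        "original_max_position_embeddings"
    ] = config.original_max_position_embeddings
assert rope_scaling["original_max_position_embeddings"] == 4096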