".github/vscode:/vscode.git/clone" did not exist on "7009080014c65aeef4616d5fd9c2459c916e2e7a"
Unverified Commit 53ca1552 authored by Chang Su's avatar Chang Su Committed by GitHub
Browse files

Implement Standalone gRPC Server for SGLang Python Scheduler (#10283)

parent a23bdeaf
...@@ -22,17 +22,19 @@ repos: ...@@ -22,17 +22,19 @@ repos:
rev: 5.13.2 rev: 5.13.2
hooks: hooks:
- id: isort - id: isort
exclude: '^python/sglang/srt/grpc/.*_pb2\.py$|^python/sglang/srt/grpc/.*_pb2_grpc\.py$|^python/sglang/srt/grpc/.*_pb2\.pyi$|^python/sglang/srt/grpc/.*_pb2_grpc\.pyi$'
- repo: https://github.com/astral-sh/ruff-pre-commit - repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.11.7 rev: v0.11.7
hooks: hooks:
- id: ruff - id: ruff
args: [--select=F401, --fixable=F401] args: [--select=F401, --fixable=F401]
files: ^(benchmark/|docs/|examples/) files: ^(benchmark/|docs/|examples/)
exclude: \.ipynb$ exclude: \.ipynb$|^python/sglang/srt/grpc/.*_pb2\.py$|^python/sglang/srt/grpc/.*_pb2_grpc\.py$|^python/sglang/srt/grpc/.*_pb2\.pyi$|^python/sglang/srt/grpc/.*_pb2_grpc\.pyi$
- repo: https://github.com/psf/black - repo: https://github.com/psf/black
rev: 24.10.0 rev: 24.10.0
hooks: hooks:
- id: black-jupyter - id: black-jupyter
exclude: '^python/sglang/srt/grpc/.*_pb2\.py$|^python/sglang/srt/grpc/.*_pb2_grpc\.py$|^python/sglang/srt/grpc/.*_pb2\.pyi$|^python/sglang/srt/grpc/.*_pb2_grpc\.pyi$'
- repo: https://github.com/codespell-project/codespell - repo: https://github.com/codespell-project/codespell
rev: v2.4.1 rev: v2.4.1
hooks: hooks:
...@@ -42,7 +44,11 @@ repos: ...@@ -42,7 +44,11 @@ repos:
exclude: | exclude: |
(?x)^( (?x)^(
test/srt/test_reasoning_parser\.py| test/srt/test_reasoning_parser\.py|
docs/advanced_features/vlm_query\.ipynb docs/advanced_features/vlm_query\.ipynb|
python/sglang/srt/grpc/.*_pb2\.py|
python/sglang/srt/grpc/.*_pb2_grpc\.py|
python/sglang/srt/grpc/.*_pb2\.pyi|
python/sglang/srt/grpc/.*_pb2_grpc\.pyi
)$ )$
- repo: https://github.com/pre-commit/mirrors-clang-format - repo: https://github.com/pre-commit/mirrors-clang-format
rev: v18.1.8 rev: v18.1.8
......
This diff is collapsed.
This diff is collapsed.
syntax = "proto3";
package sglang.grpc.scheduler;
import "google/protobuf/timestamp.proto";
import "google/protobuf/struct.proto";
// Service definition for SGLang scheduler communication.
// This protocol bridges the Rust router and Python scheduler.
service SglangScheduler {
  // Submit a generation request. The server streams GenerateResponse
  // messages: incremental chunks, then a final complete or error payload.
  rpc Generate(GenerateRequest) returns (stream GenerateResponse);

  // Submit an embedding request (unary).
  rpc Embed(EmbedRequest) returns (EmbedResponse);

  // Health check and metrics. Carries a tokenized probe input
  // (see HealthCheckRequest).
  rpc HealthCheck(HealthCheckRequest) returns (HealthCheckResponse);

  // Abort a running request identified by AbortRequest.request_id.
  rpc Abort(AbortRequest) returns (AbortResponse);
}
// =====================
// Common Types
// =====================
// Sampling parameters matching SGLang's SamplingParams.
// NOTE(review): proto3 scalars have implicit presence — 0/""/false is
// indistinguishable from "unset" (e.g. temperature = 0.0 vs. not provided).
// Consider `optional` for new fields where "not set" must be detectable.
message SamplingParams {
  float temperature = 1;
  float top_p = 2;
  int32 top_k = 3;
  float min_p = 4;
  float frequency_penalty = 5;
  float presence_penalty = 6;
  float repetition_penalty = 7;
  int32 max_new_tokens = 8;
  // Stop strings; generation halts when one is produced.
  repeated string stop = 9;
  // Token IDs that terminate generation when emitted.
  repeated int32 stop_token_ids = 10;
  bool skip_special_tokens = 11;
  bool spaces_between_special_tokens = 12;

  // Structured generation constraint (at most one may be set).
  oneof constraint {
    string regex = 13;
    string json_schema = 14;
    string ebnf_grammar = 15;
  }

  // LoRA adapter
  string lora_path = 16;

  // Number of samples to return.
  // NOTE(review): upstream comment said "Speculative decoding" but the field
  // comment calls n the sample count — confirm intended semantics.
  int32 n = 17; // Number of samples

  // Token healing
  bool token_healing = 18;

  // Additional parameters
  int32 min_new_tokens = 19;
  bool ignore_eos = 20;
  bool no_stop_trim = 21;
  int32 stream_interval = 22;
  // Per-token bias added to logits. Map keys cannot be numeric floats in
  // protobuf; key here is a string — presumably the token id rendered as
  // text. TODO(review): confirm key format against the scheduler.
  map<string, float> logit_bias = 23;
  string structural_tag = 24;

  // Custom parameters for extensibility (untyped JSON-like payload).
  google.protobuf.Struct custom_params = 25;
}
// Disaggregated serving parameters (prefill/decode split).
message DisaggregatedParams {
  // Bootstrap endpoint host used for the disaggregated handoff.
  string bootstrap_host = 1;
  int32 bootstrap_port = 2;
  // NOTE(review): room id as int32 — confirm ids cannot exceed 2^31-1;
  // widening later to int64 is varint-compatible but callers must re-check.
  int32 bootstrap_room = 3;
}
// =====================
// Generate Request
// =====================
message GenerateRequest {
  // Unique id for this request; also used by Abort and response correlation.
  string request_id = 1;
  // Input must be tokenized (no raw text).
  TokenizedInput tokenized = 2;
  // Multimodal inputs
  MultimodalInputs mm_inputs = 3;
  // Generation parameters
  SamplingParams sampling_params = 4;
  // Return options
  bool return_logprob = 5;
  int32 logprob_start_len = 6;
  int32 top_logprobs_num = 7;
  repeated int32 token_ids_logprob = 8;
  bool return_hidden_states = 9;
  // For disaggregated serving
  DisaggregatedParams disaggregated_params = 10;
  // Custom logit processor (serialized)
  string custom_logit_processor = 11;
  // Request metadata
  google.protobuf.Timestamp timestamp = 12;
  bool log_metrics = 13;
  // Input embeddings (alternative to text/tokens). Flattened float list;
  // NOTE(review): the embedding shape must be agreed out-of-band — confirm.
  repeated float input_embeds = 14;
  // LoRA adapter ID (if pre-loaded)
  string lora_id = 15;
  // Data parallel routing
  int32 data_parallel_rank = 16;
  // For load balancing
  int32 dp_balance_id = 17;
}
// Pre-tokenized input: the token ids are authoritative; the original text
// is carried only for reference/debugging.
message TokenizedInput {
  string original_text = 1; // For reference
  repeated int32 input_ids = 2;
}
message MultimodalInputs {
  // Simplified multimodal handling - actual data processed by tokenizer.
  // URLs and raw bytes are alternative transports for the same media;
  // NOTE(review): whether urls and *_data may be mixed is not specified
  // here — confirm with the consumer.
  repeated string image_urls = 1;
  repeated string video_urls = 2;
  repeated string audio_urls = 3;
  // Pre-processed multimodal features (if available)
  google.protobuf.Struct processed_features = 4;
  // Raw data for direct processing
  repeated bytes image_data = 5;
  repeated bytes video_data = 6;
  repeated bytes audio_data = 7;
  // Modality metadata
  repeated string modalities = 8;
}
// =====================
// Generate Response
// =====================
// One element of the Generate response stream. Each message carries exactly
// one of: an incremental chunk, the terminal completion, or an error.
message GenerateResponse {
  string request_id = 1;

  // Response type
  oneof response {
    GenerateStreamChunk chunk = 2;
    GenerateComplete complete = 3;
    GenerateError error = 4;
  }
}
message GenerateStreamChunk {
  // Generated token
  int32 token_id = 1;
  string text = 2;
  // Cumulative counts
  int32 prompt_tokens = 3;
  int32 completion_tokens = 4;
  int32 cached_tokens = 5;
  // Logprobs (if requested)
  LogProbs logprobs = 6;
  // Hidden states (if requested)
  repeated float hidden_states = 7;
  // Metadata.
  // NOTE(review): units are unspecified for both timing fields — presumably
  // seconds for generation_time and ms for queue_time; confirm and consider
  // unit-suffixed names (e.g. *_ms) for any new fields.
  float generation_time = 8; // Time to generate this token
  int32 queue_time = 9; // Time spent in queue
}
message GenerateComplete {
  // Final output
  repeated int32 output_ids = 1;
  string output_text = 2;

  // Finish reason.
  // NOTE(review): best practice is a <TYPE>_UNSPECIFIED = 0 value and
  // type-prefixed names (FINISH_REASON_STOP, ...); here 0 carries business
  // meaning (STOP) and values are unprefixed. This is wire-frozen now —
  // changing it would break the checked-in generated code and clients.
  enum FinishReason {
    // The model generated a stop sequence.
    STOP = 0;
    // The model reached the maximum generation length.
    LENGTH = 1;
    // The model generated an end-of-sequence (EOS) token.
    EOS_TOKEN = 2;
    // The model generated a user-provided stop string.
    STOP_STR = 3;
    // The request was aborted by the user or system.
    ABORT = 4;
  }
  FinishReason finish_reason = 3;

  // All logprobs if requested.
  // NOTE(review): field numbers 4-10 are unused (jump to 11). If they were
  // ever released in an earlier revision, add `reserved 4 to 10;`.
  repeated LogProbs all_logprobs = 11;
  // All hidden states if requested
  repeated HiddenStates all_hidden_states = 12;
}
message GenerateError {
  string message = 1;
  // NOTE(review): HTTP status carried as a string rather than int32 —
  // confirm whether non-numeric values are expected; otherwise int32 would
  // be more natural (but changing the type now is wire-breaking).
  string http_status_code = 2;
  string details = 3;
}
message LogProbs {
  // NOTE(review): these appear to be parallel arrays indexed by output
  // position (token_logprobs[i] belongs to token_ids[i] / token_texts[i]);
  // confirm — a repeated message of per-position entries would be the
  // conventional shape for new APIs.
  repeated float token_logprobs = 1;
  repeated int32 token_ids = 2;
  // Top logprobs at each position
  repeated TopLogProbs top_logprobs = 3;
  // Decoded text for tokens
  repeated string token_texts = 4;
}

// Top-k alternatives at a single output position (parallel arrays).
message TopLogProbs {
  repeated float values = 1;
  repeated int32 token_ids = 2;
  repeated string token_texts = 3;
}

// Hidden-state vector for one layer/position.
message HiddenStates {
  repeated float values = 1;
  int32 layer = 2;
  int32 position = 3;
}
// =====================
// Embedding Request
// =====================
message EmbedRequest {
  string request_id = 1;
  // Input must be tokenized (no raw text).
  // NOTE(review): field number 3 is skipped (tokenized=2, mm_inputs=4).
  // If 3 was used in an earlier release, declare `reserved 3;` to prevent
  // accidental reuse.
  TokenizedInput tokenized = 2;
  // Multimodal inputs
  MultimodalInputs mm_inputs = 4;
  // Dummy sampling params for compatibility.
  // EmbedRequest doesn't use sampling_params.
  SamplingParams sampling_params = 5;
  bool log_metrics = 6;
  // Token type IDs for models that require them
  repeated int32 token_type_ids = 7;
  // Data parallel routing
  int32 data_parallel_rank = 8;
  // For cross-encoder requests
  bool is_cross_encoder = 9;
  repeated string texts = 10; // For cross-encoder batch
}
message EmbedResponse {
  string request_id = 1;

  oneof response {
    EmbedComplete complete = 2;
    EmbedError error = 3;
  }
}

message EmbedComplete {
  // Single-input result: flattened embedding vector.
  repeated float embedding = 1;
  int32 prompt_tokens = 2;
  int32 cached_tokens = 3;
  // Additional metadata
  int32 embedding_dim = 4;
  // NOTE(review): units unspecified — presumably seconds; confirm.
  float generation_time = 5;
  // For batch embeddings (used instead of `embedding` for batched input —
  // TODO(review): confirm the two fields are mutually exclusive).
  repeated Embedding batch_embeddings = 6;
}

// One embedding of a batch, tagged with its input index.
message Embedding {
  repeated float values = 1;
  int32 index = 2;
}

message EmbedError {
  string message = 1;
  string code = 2;
  string details = 3;
}
// =====================
// Management Operations
// =====================
message HealthCheckRequest {
  // Input for health test generation (must be tokenized). The scheduler
  // runs this probe input to verify end-to-end liveness.
  TokenizedInput tokenized = 1;
}

message HealthCheckResponse {
  bool healthy = 1;
  string message = 2;
}

message AbortRequest {
  // Id of the in-flight GenerateRequest to cancel.
  string request_id = 1;
  string reason = 2;
}

message AbortResponse {
  bool success = 1;
  string message = 2;
}
// =====================
// Additional Operations (Future)
// =====================
// NOTE(review): none of the messages below are referenced by any RPC in
// SglangScheduler yet ("Future" operations). They are safe to evolve until
// a service method ships with them.

// Load LoRA adapter
message LoadLoRARequest {
  string adapter_id = 1;
  string adapter_path = 2;
  int32 rank = 3;
}

message LoadLoRAResponse {
  bool success = 1;
  string adapter_id = 2;
  string message = 3;
}

// Unload LoRA adapter
message UnloadLoRARequest {
  string adapter_id = 1;
}

message UnloadLoRAResponse {
  bool success = 1;
  string message = 2;
}

// Update weights
message UpdateWeightsRequest {
  // Exactly one weight source may be set.
  oneof source {
    string disk_path = 1;
    bytes tensor_data = 2;
    string remote_url = 3;
  }
  // NOTE(review): tensor_data embeds the full tensor in one message;
  // protobuf caps messages (2 GB hard, 64 MB default in many runtimes) —
  // consider a streaming/chunked RPC before shipping this.
  string weight_name = 4;
}

message UpdateWeightsResponse {
  bool success = 1;
  string message = 2;
}

// Get internal state for debugging
message GetInternalStateRequest {
  repeated string state_keys = 1;
}

message GetInternalStateResponse {
  google.protobuf.Struct state = 1;
}

// Set internal state for testing
message SetInternalStateRequest {
  google.protobuf.Struct state = 1;
}

message SetInternalStateResponse {
  bool success = 1;
  string message = 2;
}
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# NO CHECKED-IN PROTOBUF GENCODE
# source: sglang_scheduler.proto
# Protobuf Python Version: 6.31.1
"""Generated protocol buffer code."""
from google.protobuf import descriptor as _descriptor
from google.protobuf import descriptor_pool as _descriptor_pool
from google.protobuf import runtime_version as _runtime_version
from google.protobuf import symbol_database as _symbol_database
from google.protobuf.internal import builder as _builder
# Fail fast at import time if the installed protobuf runtime is older than
# the gencode version (6.31.1) that produced this module.
_runtime_version.ValidateProtobufRuntimeVersion(
    _runtime_version.Domain.PUBLIC,
    6,
    31,
    1,
    '',
    'sglang_scheduler.proto'
)
# @@protoc_insertion_point(imports)

_sym_db = _symbol_database.Default()

# Dependencies of sglang_scheduler.proto (Timestamp, Struct well-known types).
from google.protobuf import timestamp_pb2 as google_dot_protobuf_dot_timestamp__pb2
from google.protobuf import struct_pb2 as google_dot_protobuf_dot_struct__pb2
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x16sglang_scheduler.proto\x12\x15sglang.grpc.scheduler\x1a\x1fgoogle/protobuf/timestamp.proto\x1a\x1cgoogle/protobuf/struct.proto\"\xc7\x05\n\x0eSamplingParams\x12\x13\n\x0btemperature\x18\x01 \x01(\x02\x12\r\n\x05top_p\x18\x02 \x01(\x02\x12\r\n\x05top_k\x18\x03 \x01(\x05\x12\r\n\x05min_p\x18\x04 \x01(\x02\x12\x19\n\x11\x66requency_penalty\x18\x05 \x01(\x02\x12\x18\n\x10presence_penalty\x18\x06 \x01(\x02\x12\x1a\n\x12repetition_penalty\x18\x07 \x01(\x02\x12\x16\n\x0emax_new_tokens\x18\x08 \x01(\x05\x12\x0c\n\x04stop\x18\t \x03(\t\x12\x16\n\x0estop_token_ids\x18\n \x03(\x05\x12\x1b\n\x13skip_special_tokens\x18\x0b \x01(\x08\x12%\n\x1dspaces_between_special_tokens\x18\x0c \x01(\x08\x12\x0f\n\x05regex\x18\r \x01(\tH\x00\x12\x15\n\x0bjson_schema\x18\x0e \x01(\tH\x00\x12\x16\n\x0c\x65\x62nf_grammar\x18\x0f \x01(\tH\x00\x12\x11\n\tlora_path\x18\x10 \x01(\t\x12\t\n\x01n\x18\x11 \x01(\x05\x12\x15\n\rtoken_healing\x18\x12 \x01(\x08\x12\x16\n\x0emin_new_tokens\x18\x13 \x01(\x05\x12\x12\n\nignore_eos\x18\x14 \x01(\x08\x12\x14\n\x0cno_stop_trim\x18\x15 \x01(\x08\x12\x17\n\x0fstream_interval\x18\x16 \x01(\x05\x12H\n\nlogit_bias\x18\x17 \x03(\x0b\x32\x34.sglang.grpc.scheduler.SamplingParams.LogitBiasEntry\x12\x16\n\x0estructural_tag\x18\x18 \x01(\t\x12.\n\rcustom_params\x18\x19 \x01(\x0b\x32\x17.google.protobuf.Struct\x1a\x30\n\x0eLogitBiasEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\x02:\x02\x38\x01\x42\x0c\n\nconstraint\"]\n\x13\x44isaggregatedParams\x12\x16\n\x0e\x62ootstrap_host\x18\x01 \x01(\t\x12\x16\n\x0e\x62ootstrap_port\x18\x02 \x01(\x05\x12\x16\n\x0e\x62ootstrap_room\x18\x03 \x01(\x05\"\xe9\x04\n\x0fGenerateRequest\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12\x38\n\ttokenized\x18\x02 \x01(\x0b\x32%.sglang.grpc.scheduler.TokenizedInput\x12:\n\tmm_inputs\x18\x03 \x01(\x0b\x32\'.sglang.grpc.scheduler.MultimodalInputs\x12>\n\x0fsampling_params\x18\x04 
\x01(\x0b\x32%.sglang.grpc.scheduler.SamplingParams\x12\x16\n\x0ereturn_logprob\x18\x05 \x01(\x08\x12\x19\n\x11logprob_start_len\x18\x06 \x01(\x05\x12\x18\n\x10top_logprobs_num\x18\x07 \x01(\x05\x12\x19\n\x11token_ids_logprob\x18\x08 \x03(\x05\x12\x1c\n\x14return_hidden_states\x18\t \x01(\x08\x12H\n\x14\x64isaggregated_params\x18\n \x01(\x0b\x32*.sglang.grpc.scheduler.DisaggregatedParams\x12\x1e\n\x16\x63ustom_logit_processor\x18\x0b \x01(\t\x12-\n\ttimestamp\x18\x0c \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12\x13\n\x0blog_metrics\x18\r \x01(\x08\x12\x14\n\x0cinput_embeds\x18\x0e \x03(\x02\x12\x0f\n\x07lora_id\x18\x0f \x01(\t\x12\x1a\n\x12\x64\x61ta_parallel_rank\x18\x10 \x01(\x05\x12\x15\n\rdp_balance_id\x18\x11 \x01(\x05\":\n\x0eTokenizedInput\x12\x15\n\roriginal_text\x18\x01 \x01(\t\x12\x11\n\tinput_ids\x18\x02 \x03(\x05\"\xd3\x01\n\x10MultimodalInputs\x12\x12\n\nimage_urls\x18\x01 \x03(\t\x12\x12\n\nvideo_urls\x18\x02 \x03(\t\x12\x12\n\naudio_urls\x18\x03 \x03(\t\x12\x33\n\x12processed_features\x18\x04 \x01(\x0b\x32\x17.google.protobuf.Struct\x12\x12\n\nimage_data\x18\x05 \x03(\x0c\x12\x12\n\nvideo_data\x18\x06 \x03(\x0c\x12\x12\n\naudio_data\x18\x07 \x03(\x0c\x12\x12\n\nmodalities\x18\x08 \x03(\t\"\xe3\x01\n\x10GenerateResponse\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12;\n\x05\x63hunk\x18\x02 \x01(\x0b\x32*.sglang.grpc.scheduler.GenerateStreamChunkH\x00\x12;\n\x08\x63omplete\x18\x03 \x01(\x0b\x32\'.sglang.grpc.scheduler.GenerateCompleteH\x00\x12\x35\n\x05\x65rror\x18\x04 \x01(\x0b\x32$.sglang.grpc.scheduler.GenerateErrorH\x00\x42\n\n\x08response\"\xf5\x01\n\x13GenerateStreamChunk\x12\x10\n\x08token_id\x18\x01 \x01(\x05\x12\x0c\n\x04text\x18\x02 \x01(\t\x12\x15\n\rprompt_tokens\x18\x03 \x01(\x05\x12\x19\n\x11\x63ompletion_tokens\x18\x04 \x01(\x05\x12\x15\n\rcached_tokens\x18\x05 \x01(\x05\x12\x31\n\x08logprobs\x18\x06 \x01(\x0b\x32\x1f.sglang.grpc.scheduler.LogProbs\x12\x15\n\rhidden_states\x18\x07 \x03(\x02\x12\x17\n\x0fgeneration_time\x18\x08 
\x01(\x02\x12\x12\n\nqueue_time\x18\t \x01(\x05\"\xcd\x02\n\x10GenerateComplete\x12\x12\n\noutput_ids\x18\x01 \x03(\x05\x12\x13\n\x0boutput_text\x18\x02 \x01(\t\x12K\n\rfinish_reason\x18\x03 \x01(\x0e\x32\x34.sglang.grpc.scheduler.GenerateComplete.FinishReason\x12\x35\n\x0c\x61ll_logprobs\x18\x0b \x03(\x0b\x32\x1f.sglang.grpc.scheduler.LogProbs\x12>\n\x11\x61ll_hidden_states\x18\x0c \x03(\x0b\x32#.sglang.grpc.scheduler.HiddenStates\"L\n\x0c\x46inishReason\x12\x08\n\x04STOP\x10\x00\x12\n\n\x06LENGTH\x10\x01\x12\r\n\tEOS_TOKEN\x10\x02\x12\x0c\n\x08STOP_STR\x10\x03\x12\t\n\x05\x41\x42ORT\x10\x04\"K\n\rGenerateError\x12\x0f\n\x07message\x18\x01 \x01(\t\x12\x18\n\x10http_status_code\x18\x02 \x01(\t\x12\x0f\n\x07\x64\x65tails\x18\x03 \x01(\t\"\x84\x01\n\x08LogProbs\x12\x16\n\x0etoken_logprobs\x18\x01 \x03(\x02\x12\x11\n\ttoken_ids\x18\x02 \x03(\x05\x12\x38\n\x0ctop_logprobs\x18\x03 \x03(\x0b\x32\".sglang.grpc.scheduler.TopLogProbs\x12\x13\n\x0btoken_texts\x18\x04 \x03(\t\"E\n\x0bTopLogProbs\x12\x0e\n\x06values\x18\x01 \x03(\x02\x12\x11\n\ttoken_ids\x18\x02 \x03(\x05\x12\x13\n\x0btoken_texts\x18\x03 \x03(\t\"?\n\x0cHiddenStates\x12\x0e\n\x06values\x18\x01 \x03(\x02\x12\r\n\x05layer\x18\x02 \x01(\x05\x12\x10\n\x08position\x18\x03 \x01(\x05\"\xca\x02\n\x0c\x45mbedRequest\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12\x38\n\ttokenized\x18\x02 \x01(\x0b\x32%.sglang.grpc.scheduler.TokenizedInput\x12:\n\tmm_inputs\x18\x04 \x01(\x0b\x32\'.sglang.grpc.scheduler.MultimodalInputs\x12>\n\x0fsampling_params\x18\x05 \x01(\x0b\x32%.sglang.grpc.scheduler.SamplingParams\x12\x13\n\x0blog_metrics\x18\x06 \x01(\x08\x12\x16\n\x0etoken_type_ids\x18\x07 \x03(\x05\x12\x1a\n\x12\x64\x61ta_parallel_rank\x18\x08 \x01(\x05\x12\x18\n\x10is_cross_encoder\x18\t \x01(\x08\x12\r\n\x05texts\x18\n \x03(\t\"\x9d\x01\n\rEmbedResponse\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12\x38\n\x08\x63omplete\x18\x02 \x01(\x0b\x32$.sglang.grpc.scheduler.EmbedCompleteH\x00\x12\x32\n\x05\x65rror\x18\x03 
\x01(\x0b\x32!.sglang.grpc.scheduler.EmbedErrorH\x00\x42\n\n\x08response\"\xbc\x01\n\rEmbedComplete\x12\x11\n\tembedding\x18\x01 \x03(\x02\x12\x15\n\rprompt_tokens\x18\x02 \x01(\x05\x12\x15\n\rcached_tokens\x18\x03 \x01(\x05\x12\x15\n\rembedding_dim\x18\x04 \x01(\x05\x12\x17\n\x0fgeneration_time\x18\x05 \x01(\x02\x12:\n\x10\x62\x61tch_embeddings\x18\x06 \x03(\x0b\x32 .sglang.grpc.scheduler.Embedding\"*\n\tEmbedding\x12\x0e\n\x06values\x18\x01 \x03(\x02\x12\r\n\x05index\x18\x02 \x01(\x05\"<\n\nEmbedError\x12\x0f\n\x07message\x18\x01 \x01(\t\x12\x0c\n\x04\x63ode\x18\x02 \x01(\t\x12\x0f\n\x07\x64\x65tails\x18\x03 \x01(\t\"N\n\x12HealthCheckRequest\x12\x38\n\ttokenized\x18\x01 \x01(\x0b\x32%.sglang.grpc.scheduler.TokenizedInput\"7\n\x13HealthCheckResponse\x12\x0f\n\x07healthy\x18\x01 \x01(\x08\x12\x0f\n\x07message\x18\x02 \x01(\t\"2\n\x0c\x41\x62ortRequest\x12\x12\n\nrequest_id\x18\x01 \x01(\t\x12\x0e\n\x06reason\x18\x02 \x01(\t\"1\n\rAbortResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x12\x0f\n\x07message\x18\x02 \x01(\t\"I\n\x0fLoadLoRARequest\x12\x12\n\nadapter_id\x18\x01 \x01(\t\x12\x14\n\x0c\x61\x64\x61pter_path\x18\x02 \x01(\t\x12\x0c\n\x04rank\x18\x03 \x01(\x05\"H\n\x10LoadLoRAResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x12\x12\n\nadapter_id\x18\x02 \x01(\t\x12\x0f\n\x07message\x18\x03 \x01(\t\"\'\n\x11UnloadLoRARequest\x12\x12\n\nadapter_id\x18\x01 \x01(\t\"6\n\x12UnloadLoRAResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x12\x0f\n\x07message\x18\x02 \x01(\t\"w\n\x14UpdateWeightsRequest\x12\x13\n\tdisk_path\x18\x01 \x01(\tH\x00\x12\x15\n\x0btensor_data\x18\x02 \x01(\x0cH\x00\x12\x14\n\nremote_url\x18\x03 \x01(\tH\x00\x12\x13\n\x0bweight_name\x18\x04 \x01(\tB\x08\n\x06source\"9\n\x15UpdateWeightsResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x12\x0f\n\x07message\x18\x02 \x01(\t\"-\n\x17GetInternalStateRequest\x12\x12\n\nstate_keys\x18\x01 \x03(\t\"B\n\x18GetInternalStateResponse\x12&\n\x05state\x18\x01 
\x01(\x0b\x32\x17.google.protobuf.Struct\"A\n\x17SetInternalStateRequest\x12&\n\x05state\x18\x01 \x01(\x0b\x32\x17.google.protobuf.Struct\"<\n\x18SetInternalStateResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x12\x0f\n\x07message\x18\x02 \x01(\t2\xfe\x02\n\x0fSglangScheduler\x12]\n\x08Generate\x12&.sglang.grpc.scheduler.GenerateRequest\x1a\'.sglang.grpc.scheduler.GenerateResponse0\x01\x12R\n\x05\x45mbed\x12#.sglang.grpc.scheduler.EmbedRequest\x1a$.sglang.grpc.scheduler.EmbedResponse\x12\x64\n\x0bHealthCheck\x12).sglang.grpc.scheduler.HealthCheckRequest\x1a*.sglang.grpc.scheduler.HealthCheckResponse\x12R\n\x05\x41\x62ort\x12#.sglang.grpc.scheduler.AbortRequest\x1a$.sglang.grpc.scheduler.AbortResponseb\x06proto3')
# Materialize message/enum classes from the serialized descriptor into this
# module's namespace.
_globals = globals()
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'sglang_scheduler_pb2', _globals)
if not _descriptor._USE_C_DESCRIPTORS:
  # Pure-Python descriptors: attach serialized options and record each
  # entity's byte offsets within the serialized file descriptor above.
  DESCRIPTOR._loaded_options = None
  _globals['_SAMPLINGPARAMS_LOGITBIASENTRY']._loaded_options = None
  _globals['_SAMPLINGPARAMS_LOGITBIASENTRY']._serialized_options = b'8\001'
  _globals['_SAMPLINGPARAMS']._serialized_start=113
  _globals['_SAMPLINGPARAMS']._serialized_end=824
  _globals['_SAMPLINGPARAMS_LOGITBIASENTRY']._serialized_start=762
  _globals['_SAMPLINGPARAMS_LOGITBIASENTRY']._serialized_end=810
  _globals['_DISAGGREGATEDPARAMS']._serialized_start=826
  _globals['_DISAGGREGATEDPARAMS']._serialized_end=919
  _globals['_GENERATEREQUEST']._serialized_start=922
  _globals['_GENERATEREQUEST']._serialized_end=1539
  _globals['_TOKENIZEDINPUT']._serialized_start=1541
  _globals['_TOKENIZEDINPUT']._serialized_end=1599
  _globals['_MULTIMODALINPUTS']._serialized_start=1602
  _globals['_MULTIMODALINPUTS']._serialized_end=1813
  _globals['_GENERATERESPONSE']._serialized_start=1816
  _globals['_GENERATERESPONSE']._serialized_end=2043
  _globals['_GENERATESTREAMCHUNK']._serialized_start=2046
  _globals['_GENERATESTREAMCHUNK']._serialized_end=2291
  _globals['_GENERATECOMPLETE']._serialized_start=2294
  _globals['_GENERATECOMPLETE']._serialized_end=2627
  _globals['_GENERATECOMPLETE_FINISHREASON']._serialized_start=2551
  _globals['_GENERATECOMPLETE_FINISHREASON']._serialized_end=2627
  _globals['_GENERATEERROR']._serialized_start=2629
  _globals['_GENERATEERROR']._serialized_end=2704
  _globals['_LOGPROBS']._serialized_start=2707
  _globals['_LOGPROBS']._serialized_end=2839
  _globals['_TOPLOGPROBS']._serialized_start=2841
  _globals['_TOPLOGPROBS']._serialized_end=2910
  _globals['_HIDDENSTATES']._serialized_start=2912
  _globals['_HIDDENSTATES']._serialized_end=2975
  _globals['_EMBEDREQUEST']._serialized_start=2978
  _globals['_EMBEDREQUEST']._serialized_end=3308
  _globals['_EMBEDRESPONSE']._serialized_start=3311
  _globals['_EMBEDRESPONSE']._serialized_end=3468
  _globals['_EMBEDCOMPLETE']._serialized_start=3471
  _globals['_EMBEDCOMPLETE']._serialized_end=3659
  _globals['_EMBEDDING']._serialized_start=3661
  _globals['_EMBEDDING']._serialized_end=3703
  _globals['_EMBEDERROR']._serialized_start=3705
  _globals['_EMBEDERROR']._serialized_end=3765
  _globals['_HEALTHCHECKREQUEST']._serialized_start=3767
  _globals['_HEALTHCHECKREQUEST']._serialized_end=3845
  _globals['_HEALTHCHECKRESPONSE']._serialized_start=3847
  _globals['_HEALTHCHECKRESPONSE']._serialized_end=3902
  _globals['_ABORTREQUEST']._serialized_start=3904
  _globals['_ABORTREQUEST']._serialized_end=3954
  _globals['_ABORTRESPONSE']._serialized_start=3956
  _globals['_ABORTRESPONSE']._serialized_end=4005
  _globals['_LOADLORAREQUEST']._serialized_start=4007
  _globals['_LOADLORAREQUEST']._serialized_end=4080
  _globals['_LOADLORARESPONSE']._serialized_start=4082
  _globals['_LOADLORARESPONSE']._serialized_end=4154
  _globals['_UNLOADLORAREQUEST']._serialized_start=4156
  _globals['_UNLOADLORAREQUEST']._serialized_end=4195
  _globals['_UNLOADLORARESPONSE']._serialized_start=4197
  _globals['_UNLOADLORARESPONSE']._serialized_end=4251
  _globals['_UPDATEWEIGHTSREQUEST']._serialized_start=4253
  _globals['_UPDATEWEIGHTSREQUEST']._serialized_end=4372
  _globals['_UPDATEWEIGHTSRESPONSE']._serialized_start=4374
  _globals['_UPDATEWEIGHTSRESPONSE']._serialized_end=4431
  _globals['_GETINTERNALSTATEREQUEST']._serialized_start=4433
  _globals['_GETINTERNALSTATEREQUEST']._serialized_end=4478
  _globals['_GETINTERNALSTATERESPONSE']._serialized_start=4480
  _globals['_GETINTERNALSTATERESPONSE']._serialized_end=4546
  _globals['_SETINTERNALSTATEREQUEST']._serialized_start=4548
  _globals['_SETINTERNALSTATEREQUEST']._serialized_end=4613
  _globals['_SETINTERNALSTATERESPONSE']._serialized_start=4615
  _globals['_SETINTERNALSTATERESPONSE']._serialized_end=4675
  _globals['_SGLANGSCHEDULER']._serialized_start=4678
  _globals['_SGLANGSCHEDULER']._serialized_end=5060
# @@protoc_insertion_point(module_scope)
This diff is collapsed.
# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
"""Client and server classes corresponding to protobuf-defined services."""
import grpc
import warnings

from . import sglang_scheduler_pb2 as sglang__scheduler__pb2

# grpcio-tools version that produced this module; the installed grpcio
# runtime must be at least this new for registered-method support.
GRPC_GENERATED_VERSION = '1.74.0'
GRPC_VERSION = grpc.__version__
_version_not_supported = False

try:
    from grpc._utilities import first_version_is_lower
    _version_not_supported = first_version_is_lower(GRPC_VERSION, GRPC_GENERATED_VERSION)
except ImportError:
    # grpcio too old to even carry the comparison helper.
    _version_not_supported = True

if _version_not_supported:
    raise RuntimeError(
        f'The grpc package installed is at version {GRPC_VERSION},'
        + f' but the generated code in sglang_scheduler_pb2_grpc.py depends on'
        + f' grpcio>={GRPC_GENERATED_VERSION}.'
        + f' Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}'
        + f' or downgrade your generated code using grpcio-tools<={GRPC_VERSION}.'
    )
class SglangSchedulerStub(object):
    """Service definition for SGLang scheduler communication
    This protocol bridges the Rust router and Python scheduler
    """

    def __init__(self, channel):
        """Constructor.

        Args:
            channel: A grpc.Channel.
        """
        # Generate is server-streaming; Embed/HealthCheck/Abort are unary.
        self.Generate = channel.unary_stream(
                '/sglang.grpc.scheduler.SglangScheduler/Generate',
                request_serializer=sglang__scheduler__pb2.GenerateRequest.SerializeToString,
                response_deserializer=sglang__scheduler__pb2.GenerateResponse.FromString,
                _registered_method=True)
        self.Embed = channel.unary_unary(
                '/sglang.grpc.scheduler.SglangScheduler/Embed',
                request_serializer=sglang__scheduler__pb2.EmbedRequest.SerializeToString,
                response_deserializer=sglang__scheduler__pb2.EmbedResponse.FromString,
                _registered_method=True)
        self.HealthCheck = channel.unary_unary(
                '/sglang.grpc.scheduler.SglangScheduler/HealthCheck',
                request_serializer=sglang__scheduler__pb2.HealthCheckRequest.SerializeToString,
                response_deserializer=sglang__scheduler__pb2.HealthCheckResponse.FromString,
                _registered_method=True)
        self.Abort = channel.unary_unary(
                '/sglang.grpc.scheduler.SglangScheduler/Abort',
                request_serializer=sglang__scheduler__pb2.AbortRequest.SerializeToString,
                response_deserializer=sglang__scheduler__pb2.AbortResponse.FromString,
                _registered_method=True)
class SglangSchedulerServicer(object):
    """Service definition for SGLang scheduler communication
    This protocol bridges the Rust router and Python scheduler

    Base class: subclass, override the methods, and register the instance
    via add_SglangSchedulerServicer_to_server. Each default implementation
    reports UNIMPLEMENTED to the caller.
    """

    def Generate(self, request, context):
        """Submit a generation request (supports streaming)
        """
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def Embed(self, request, context):
        """Submit an embedding request
        """
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def HealthCheck(self, request, context):
        """Health check and metrics
        """
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def Abort(self, request, context):
        """Abort a running request
        """
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
def add_SglangSchedulerServicer_to_server(servicer, server):
    """Register a SglangSchedulerServicer's handlers on a grpc.Server."""
    rpc_method_handlers = {
            'Generate': grpc.unary_stream_rpc_method_handler(
                    servicer.Generate,
                    request_deserializer=sglang__scheduler__pb2.GenerateRequest.FromString,
                    response_serializer=sglang__scheduler__pb2.GenerateResponse.SerializeToString,
            ),
            'Embed': grpc.unary_unary_rpc_method_handler(
                    servicer.Embed,
                    request_deserializer=sglang__scheduler__pb2.EmbedRequest.FromString,
                    response_serializer=sglang__scheduler__pb2.EmbedResponse.SerializeToString,
            ),
            'HealthCheck': grpc.unary_unary_rpc_method_handler(
                    servicer.HealthCheck,
                    request_deserializer=sglang__scheduler__pb2.HealthCheckRequest.FromString,
                    response_serializer=sglang__scheduler__pb2.HealthCheckResponse.SerializeToString,
            ),
            'Abort': grpc.unary_unary_rpc_method_handler(
                    servicer.Abort,
                    request_deserializer=sglang__scheduler__pb2.AbortRequest.FromString,
                    response_serializer=sglang__scheduler__pb2.AbortResponse.SerializeToString,
            ),
    }
    generic_handler = grpc.method_handlers_generic_handler(
            'sglang.grpc.scheduler.SglangScheduler', rpc_method_handlers)
    server.add_generic_rpc_handlers((generic_handler,))
    # Also register by fully-qualified method name for the faster
    # registered-method dispatch path (grpcio >= GRPC_GENERATED_VERSION).
    server.add_registered_method_handlers('sglang.grpc.scheduler.SglangScheduler', rpc_method_handlers)
# This class is part of an EXPERIMENTAL API.
class SglangScheduler(object):
    """Service definition for SGLang scheduler communication
    This protocol bridges the Rust router and Python scheduler

    Channel-less convenience API: each static method creates/reuses a channel
    to `target` per call via grpc.experimental.
    """

    @staticmethod
    def Generate(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_stream(
            request,
            target,
            '/sglang.grpc.scheduler.SglangScheduler/Generate',
            sglang__scheduler__pb2.GenerateRequest.SerializeToString,
            sglang__scheduler__pb2.GenerateResponse.FromString,
            options,
            channel_credentials,
            insecure,
            call_credentials,
            compression,
            wait_for_ready,
            timeout,
            metadata,
            _registered_method=True)

    @staticmethod
    def Embed(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(
            request,
            target,
            '/sglang.grpc.scheduler.SglangScheduler/Embed',
            sglang__scheduler__pb2.EmbedRequest.SerializeToString,
            sglang__scheduler__pb2.EmbedResponse.FromString,
            options,
            channel_credentials,
            insecure,
            call_credentials,
            compression,
            wait_for_ready,
            timeout,
            metadata,
            _registered_method=True)

    @staticmethod
    def HealthCheck(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(
            request,
            target,
            '/sglang.grpc.scheduler.SglangScheduler/HealthCheck',
            sglang__scheduler__pb2.HealthCheckRequest.SerializeToString,
            sglang__scheduler__pb2.HealthCheckResponse.FromString,
            options,
            channel_credentials,
            insecure,
            call_credentials,
            compression,
            wait_for_ready,
            timeout,
            metadata,
            _registered_method=True)

    @staticmethod
    def Abort(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(
            request,
            target,
            '/sglang.grpc.scheduler.SglangScheduler/Abort',
            sglang__scheduler__pb2.AbortRequest.SerializeToString,
            sglang__scheduler__pb2.AbortResponse.FromString,
            options,
            channel_credentials,
            insecure,
            call_credentials,
            compression,
            wait_for_ready,
            timeout,
            metadata,
            _registered_method=True)
...@@ -2238,6 +2238,7 @@ class ServerArgs: ...@@ -2238,6 +2238,7 @@ class ServerArgs:
args.pp_size = args.pipeline_parallel_size args.pp_size = args.pipeline_parallel_size
args.dp_size = args.data_parallel_size args.dp_size = args.data_parallel_size
args.ep_size = args.expert_parallel_size args.ep_size = args.expert_parallel_size
attrs = [attr.name for attr in dataclasses.fields(cls)] attrs = [attr.name for attr in dataclasses.fields(cls)]
return cls(**{attr: getattr(args, attr) for attr in attrs}) return cls(**{attr: getattr(args, attr) for attr in attrs})
......
...@@ -37,21 +37,6 @@ impl SglangSchedulerClient { ...@@ -37,21 +37,6 @@ impl SglangSchedulerClient {
Ok(Self { client }) Ok(Self { client })
} }
/// Initialize the connection
pub async fn initialize(
&mut self,
client_id: String,
) -> Result<proto::InitializeResponse, Box<dyn std::error::Error>> {
let request = Request::new(proto::InitializeRequest {
client_id,
client_version: "0.1.0".to_string(),
mode: proto::initialize_request::Mode::Regular as i32,
});
let response = self.client.initialize(request).await?;
Ok(response.into_inner())
}
/// Submit a generation request (returns streaming response) /// Submit a generation request (returns streaming response)
pub async fn generate_stream( pub async fn generate_stream(
&mut self, &mut self,
...@@ -68,7 +53,10 @@ impl SglangSchedulerClient { ...@@ -68,7 +53,10 @@ impl SglangSchedulerClient {
) -> Result<proto::HealthCheckResponse, Box<dyn std::error::Error>> { ) -> Result<proto::HealthCheckResponse, Box<dyn std::error::Error>> {
debug!("Sending health check request"); debug!("Sending health check request");
let request = Request::new(proto::HealthCheckRequest { let request = Request::new(proto::HealthCheckRequest {
include_detailed_metrics: false, tokenized: Some(proto::TokenizedInput {
original_text: "Hello".to_string(),
input_ids: vec![9906], // Mock token ID for "Hello"
}),
}); });
let response = self.client.health_check(request).await?; let response = self.client.health_check(request).await?;
...@@ -87,21 +75,6 @@ impl SglangSchedulerClient { ...@@ -87,21 +75,6 @@ impl SglangSchedulerClient {
self.client.abort(request).await?; self.client.abort(request).await?;
Ok(()) Ok(())
} }
/// Flush cache
pub async fn flush_cache(
&mut self,
flush_all: bool,
session_ids: &[String],
) -> Result<proto::FlushCacheResponse, Box<dyn std::error::Error>> {
let request = Request::new(proto::FlushCacheRequest {
flush_all,
session_ids: session_ids.to_vec(),
});
let response = self.client.flush_cache(request).await?;
Ok(response.into_inner())
}
} }
#[cfg(test)] #[cfg(test)]
...@@ -111,14 +84,13 @@ mod tests { ...@@ -111,14 +84,13 @@ mod tests {
#[test] #[test]
fn test_proto_types_compilation() { fn test_proto_types_compilation() {
// Test that protobuf types can be constructed // Test that protobuf types can be constructed
let init_req = proto::InitializeRequest { let health_req = proto::HealthCheckRequest {
client_id: "test-client".to_string(), tokenized: Some(proto::TokenizedInput {
client_version: "0.1.0".to_string(), original_text: "test".to_string(),
mode: 0, input_ids: vec![1296],
}),
}; };
assert_eq!(init_req.client_id, "test-client"); assert!(health_req.tokenized.is_some());
assert_eq!(init_req.client_version, "0.1.0");
assert_eq!(init_req.mode, 0);
} }
#[test] #[test]
...@@ -134,9 +106,10 @@ mod tests { ...@@ -134,9 +106,10 @@ mod tests {
let gen_req = proto::GenerateRequest { let gen_req = proto::GenerateRequest {
request_id: "test-req-123".to_string(), request_id: "test-req-123".to_string(),
input: Some(proto::generate_request::Input::Text( tokenized: Some(proto::TokenizedInput {
"Hello world".to_string(), original_text: "Hello world".to_string(),
)), input_ids: vec![9906, 1917], // Mock token IDs for "Hello world"
}),
sampling_params: Some(sampling_params), sampling_params: Some(sampling_params),
return_logprob: true, return_logprob: true,
logprob_start_len: 0, logprob_start_len: 0,
...@@ -145,8 +118,8 @@ mod tests { ...@@ -145,8 +118,8 @@ mod tests {
}; };
assert_eq!(gen_req.request_id, "test-req-123"); assert_eq!(gen_req.request_id, "test-req-123");
if let Some(proto::generate_request::Input::Text(text)) = &gen_req.input { if let Some(ref tokenized) = &gen_req.tokenized {
assert_eq!(text, "Hello world"); assert_eq!(tokenized.original_text, "Hello world");
} }
assert!(gen_req.return_logprob); assert!(gen_req.return_logprob);
assert_eq!(gen_req.top_logprobs_num, 5); assert_eq!(gen_req.top_logprobs_num, 5);
...@@ -160,9 +133,12 @@ mod tests { ...@@ -160,9 +133,12 @@ mod tests {
#[test] #[test]
fn test_health_check_request() { fn test_health_check_request() {
let health_req = proto::HealthCheckRequest { let health_req = proto::HealthCheckRequest {
include_detailed_metrics: true, tokenized: Some(proto::TokenizedInput {
original_text: "test".to_string(),
input_ids: vec![1296], // Mock token ID for "test"
}),
}; };
assert!(health_req.include_detailed_metrics); assert!(health_req.tokenized.is_some());
} }
#[test] #[test]
...@@ -175,17 +151,6 @@ mod tests { ...@@ -175,17 +151,6 @@ mod tests {
assert_eq!(abort_req.reason, "User canceled"); assert_eq!(abort_req.reason, "User canceled");
} }
#[test]
fn test_flush_cache_request() {
let flush_req = proto::FlushCacheRequest {
flush_all: true,
session_ids: vec!["session1".to_string(), "session2".to_string()],
};
assert!(flush_req.flush_all);
assert_eq!(flush_req.session_ids.len(), 2);
assert_eq!(flush_req.session_ids[0], "session1");
}
#[test] #[test]
fn test_sampling_params_defaults() { fn test_sampling_params_defaults() {
let params = proto::SamplingParams::default(); let params = proto::SamplingParams::default();
...@@ -214,38 +179,29 @@ mod tests { ...@@ -214,38 +179,29 @@ mod tests {
assert_eq!(mm_inputs.modalities[0], "image"); assert_eq!(mm_inputs.modalities[0], "image");
} }
#[test] // TODO: SessionParams not in current proto - skip test
fn test_session_params() { // #[test]
let session_params = proto::SessionParams { // fn test_session_params() { ... }
session_id: "sess-789".to_string(),
request_id: "req-101".to_string(),
offset: 100,
replace: true,
drop_previous_output: false,
};
assert_eq!(session_params.session_id, "sess-789");
assert_eq!(session_params.request_id, "req-101");
assert_eq!(session_params.offset, 100);
assert!(session_params.replace);
assert!(!session_params.drop_previous_output);
}
#[test] #[test]
fn test_embed_request() { fn test_embed_request() {
let embed_req = proto::EmbedRequest { let embed_req = proto::EmbedRequest {
request_id: "embed-req-202".to_string(), request_id: "embed-req-202".to_string(),
input: Some(proto::embed_request::Input::Text( tokenized: Some(proto::TokenizedInput {
"This is a test sentence for embedding".to_string(), original_text: "This is a test sentence for embedding".to_string(),
)), input_ids: vec![2028, 374, 264, 1296, 11914, 369, 28537], // Mock token IDs
}),
log_metrics: true, log_metrics: true,
data_parallel_rank: 0, data_parallel_rank: 0,
..Default::default() ..Default::default()
}; };
assert_eq!(embed_req.request_id, "embed-req-202"); assert_eq!(embed_req.request_id, "embed-req-202");
if let Some(proto::embed_request::Input::Text(text)) = &embed_req.input { if let Some(ref tokenized) = &embed_req.tokenized {
assert_eq!(text, "This is a test sentence for embedding"); assert_eq!(
tokenized.original_text,
"This is a test sentence for embedding"
);
} }
assert!(embed_req.log_metrics); assert!(embed_req.log_metrics);
assert_eq!(embed_req.data_parallel_rank, 0); assert_eq!(embed_req.data_parallel_rank, 0);
...@@ -292,36 +248,7 @@ mod tests { ...@@ -292,36 +248,7 @@ mod tests {
assert_eq!(chunk.queue_time, 10); assert_eq!(chunk.queue_time, 10);
} }
#[test] // TODO: ModelInfo not in current proto - skip test
fn test_model_info() { // #[test]
let model_info = proto::ModelInfo { // fn test_model_info() { ... }
model_name: "Meta-Llama-3-8B-Instruct".to_string(),
max_context_length: 8192,
vocab_size: 128256,
supports_tool_calling: true,
supports_vision: false,
special_tokens: vec![
"<|begin_of_text|>".to_string(),
"<|end_of_text|>".to_string(),
],
model_type: "llama".to_string(),
num_layers: 32,
hidden_size: 4096,
num_attention_heads: 32,
num_key_value_heads: 8,
tokenizer_type: "llama".to_string(),
eos_token_ids: vec![128001, 128009],
pad_token_id: 128001,
bos_token_id: 128000,
};
assert_eq!(model_info.model_name, "Meta-Llama-3-8B-Instruct");
assert_eq!(model_info.max_context_length, 8192);
assert_eq!(model_info.vocab_size, 128256);
assert!(model_info.supports_tool_calling);
assert!(!model_info.supports_vision);
assert_eq!(model_info.special_tokens.len(), 2);
assert_eq!(model_info.num_layers, 32);
assert_eq!(model_info.eos_token_ids, vec![128001, 128009]);
}
} }
...@@ -8,9 +8,6 @@ import "google/protobuf/struct.proto"; ...@@ -8,9 +8,6 @@ import "google/protobuf/struct.proto";
// Service definition for SGLang scheduler communication // Service definition for SGLang scheduler communication
// This protocol bridges the Rust router and Python scheduler // This protocol bridges the Rust router and Python scheduler
service SglangScheduler { service SglangScheduler {
// Initialize connection and get model info
rpc Initialize(InitializeRequest) returns (InitializeResponse);
// Submit a generation request (supports streaming) // Submit a generation request (supports streaming)
rpc Generate(GenerateRequest) returns (stream GenerateResponse); rpc Generate(GenerateRequest) returns (stream GenerateResponse);
...@@ -23,8 +20,6 @@ service SglangScheduler { ...@@ -23,8 +20,6 @@ service SglangScheduler {
// Abort a running request // Abort a running request
rpc Abort(AbortRequest) returns (AbortResponse); rpc Abort(AbortRequest) returns (AbortResponse);
// Flush KV cache
rpc FlushCache(FlushCacheRequest) returns (FlushCacheResponse);
} }
// ===================== // =====================
...@@ -75,14 +70,6 @@ message SamplingParams { ...@@ -75,14 +70,6 @@ message SamplingParams {
google.protobuf.Struct custom_params = 25; google.protobuf.Struct custom_params = 25;
} }
// Session parameters for continual prompting
message SessionParams {
string session_id = 1;
string request_id = 2;
int32 offset = 3;
bool replace = 4;
bool drop_previous_output = 5;
}
// Disaggregated serving parameters // Disaggregated serving parameters
message DisaggregatedParams { message DisaggregatedParams {
...@@ -91,87 +78,6 @@ message DisaggregatedParams { ...@@ -91,87 +78,6 @@ message DisaggregatedParams {
int32 bootstrap_room = 3; int32 bootstrap_room = 3;
} }
// =====================
// Initialize
// =====================
message InitializeRequest {
string client_id = 1;
string client_version = 2;
// Operating mode
enum Mode {
REGULAR = 0; // Normal mode with local scheduler
PREFILL = 1; // Prefill-only mode for disaggregated serving
DECODE = 2; // Decode-only mode for disaggregated serving
}
Mode mode = 3;
}
message InitializeResponse {
bool success = 1;
string scheduler_version = 2;
// Model information
ModelInfo model_info = 3;
// Server capabilities
ServerCapabilities capabilities = 4;
// Error message if success is false
string error_message = 5;
}
message ModelInfo {
string model_name = 1;
int32 max_context_length = 2;
int32 vocab_size = 3;
bool supports_tool_calling = 4;
bool supports_vision = 5;
repeated string special_tokens = 6;
// Additional model metadata
string model_type = 7;
int32 num_layers = 8;
int32 hidden_size = 9;
int32 num_attention_heads = 10;
int32 num_key_value_heads = 11;
// Tokenizer info
string tokenizer_type = 12;
repeated int32 eos_token_ids = 13;
int32 pad_token_id = 14;
int32 bos_token_id = 15;
}
message ServerCapabilities {
bool continuous_batching = 1;
bool disaggregated_serving = 2;
bool speculative_decoding = 3;
int32 max_batch_size = 4;
int32 max_num_batched_tokens = 5;
int32 max_prefill_tokens = 6;
string attention_backend = 7; // "flashinfer", "triton", "torch"
// Additional capabilities
bool supports_lora = 8;
bool supports_grammar = 9;
bool supports_multimodal = 10;
repeated string supported_modalities = 11; // ["image", "video", "audio"]
bool supports_custom_logit_processor = 12;
bool supports_session = 13;
// Hardware info
int32 num_gpus = 14;
string gpu_type = 15;
int64 total_gpu_memory = 16;
// Parallelism info
int32 tensor_parallel_size = 17;
int32 pipeline_parallel_size = 18;
int32 data_parallel_size = 19;
}
// ===================== // =====================
// Generate Request // Generate Request
// ===================== // =====================
...@@ -179,49 +85,43 @@ message ServerCapabilities { ...@@ -179,49 +85,43 @@ message ServerCapabilities {
message GenerateRequest { message GenerateRequest {
string request_id = 1; string request_id = 1;
// Input can be either text or tokenized // Input must be tokenized (no raw text)
oneof input { TokenizedInput tokenized = 2;
string text = 2;
TokenizedInput tokenized = 3;
}
// Multimodal inputs // Multimodal inputs
MultimodalInputs mm_inputs = 4; MultimodalInputs mm_inputs = 3;
// Generation parameters // Generation parameters
SamplingParams sampling_params = 5; SamplingParams sampling_params = 4;
// Return options // Return options
bool return_logprob = 6; bool return_logprob = 5;
int32 logprob_start_len = 7; int32 logprob_start_len = 6;
int32 top_logprobs_num = 8; int32 top_logprobs_num = 7;
repeated int32 token_ids_logprob = 9; repeated int32 token_ids_logprob = 8;
bool return_hidden_states = 10; bool return_hidden_states = 9;
// Session management
SessionParams session_params = 11;
// For disaggregated serving // For disaggregated serving
DisaggregatedParams disaggregated_params = 12; DisaggregatedParams disaggregated_params = 10;
// Custom logit processor (serialized) // Custom logit processor (serialized)
string custom_logit_processor = 13; string custom_logit_processor = 11;
// Request metadata // Request metadata
google.protobuf.Timestamp timestamp = 14; google.protobuf.Timestamp timestamp = 12;
bool log_metrics = 15; bool log_metrics = 13;
// Input embeddings (alternative to text/tokens) // Input embeddings (alternative to text/tokens)
repeated float input_embeds = 16; repeated float input_embeds = 14;
// LoRA adapter ID (if pre-loaded) // LoRA adapter ID (if pre-loaded)
string lora_id = 17; string lora_id = 15;
// Data parallel routing // Data parallel routing
int32 data_parallel_rank = 18; int32 data_parallel_rank = 16;
// For load balancing // For load balancing
int32 dp_balance_id = 19; int32 dp_balance_id = 17;
} }
message TokenizedInput { message TokenizedInput {
...@@ -303,19 +203,6 @@ message GenerateComplete { ...@@ -303,19 +203,6 @@ message GenerateComplete {
} }
FinishReason finish_reason = 3; FinishReason finish_reason = 3;
// Final counts
int32 prompt_tokens = 4;
int32 completion_tokens = 5;
int32 cached_tokens = 6;
// Performance metrics
float total_generation_time = 7;
float time_to_first_token = 8;
float tokens_per_second = 9;
// Spec decode metrics
int32 spec_verify_count = 10;
// All logprobs if requested // All logprobs if requested
repeated LogProbs all_logprobs = 11; repeated LogProbs all_logprobs = 11;
...@@ -359,10 +246,8 @@ message HiddenStates { ...@@ -359,10 +246,8 @@ message HiddenStates {
message EmbedRequest { message EmbedRequest {
string request_id = 1; string request_id = 1;
oneof input { // Input must be tokenized (no raw text)
string text = 2; TokenizedInput tokenized = 2;
TokenizedInput tokenized = 3;
}
// Multimodal inputs // Multimodal inputs
MultimodalInputs mm_inputs = 4; MultimodalInputs mm_inputs = 4;
...@@ -422,39 +307,13 @@ message EmbedError { ...@@ -422,39 +307,13 @@ message EmbedError {
// ===================== // =====================
message HealthCheckRequest { message HealthCheckRequest {
bool include_detailed_metrics = 1; // Input for health test generation (must be tokenized)
TokenizedInput tokenized = 1;
} }
message HealthCheckResponse { message HealthCheckResponse {
bool healthy = 1; bool healthy = 1;
string message = 2;
// Current load metrics
int32 num_requests_running = 2;
int32 num_requests_waiting = 3;
float gpu_cache_usage = 4;
float gpu_memory_usage = 5;
// KV cache metrics
int32 kv_cache_total_blocks = 6;
int32 kv_cache_used_blocks = 7;
float kv_cache_hit_rate = 8;
// Additional metrics
int32 num_grammar_queue_requests = 9;
float generation_throughput = 10; // tokens/sec
float average_queue_time = 11; // seconds
float average_generation_time = 12; // seconds
// System metrics
float cpu_usage = 13;
int64 memory_usage = 14;
// Disaggregation metrics
int32 num_prefill_requests = 15;
int32 num_decode_requests = 16;
// Detailed metrics (optional)
google.protobuf.Struct detailed_metrics = 17;
} }
message AbortRequest { message AbortRequest {
...@@ -467,17 +326,6 @@ message AbortResponse { ...@@ -467,17 +326,6 @@ message AbortResponse {
string message = 2; string message = 2;
} }
message FlushCacheRequest {
bool flush_all = 1;
repeated string session_ids = 2; // Flush specific sessions
}
message FlushCacheResponse {
bool success = 1;
int32 num_entries_flushed = 2;
int64 memory_freed = 3; // bytes
string message = 4;
}
// ===================== // =====================
// Additional Operations (Future) // Additional Operations (Future)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment