"sgl-router/vscode:/vscode.git/clone" did not exist on "336e9a6058daffafa735c61d019e15e31f7700ff"
Unverified commit 37f3325b, authored by Chang Su, committed by GitHub

[router][grpc] Support E2E non-stream chat completions (#10980)

parent bd95944c
@@ -13,7 +13,7 @@ import sys
 import threading
 import time
 import uuid
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, AsyncGenerator, Dict, List, Optional, Union

 import grpc
 import zmq
@@ -156,7 +156,7 @@ class GrpcRequestManager:
         obj: TokenizedGenerateReqInput,
         request_id: Optional[str] = None,
         grpc_context: Optional[grpc.aio.ServicerContext] = None,
-    ):
+    ) -> AsyncGenerator[Union[Dict, List[Dict]], None]:
        """
        Submit a generation request to the scheduler with n>1 parallel sampling support.
......
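Note: generate_request is now an async-generator function rather than a coroutine that returns a queue, so callers iterate instead of awaiting output_queue.get(). A minimal consumption sketch (hypothetical manager and tokenized_req objects standing in for the real ones):

async def consume(manager, tokenized_req):
    # Not awaited at call time: calling an async-generator function
    # returns the generator immediately.
    gen = manager.generate_request(tokenized_req, request_id="req-1")
    # Each item is a Dict for a single sample, or a List[Dict] when n > 1,
    # per the AsyncGenerator[Union[Dict, List[Dict]], None] annotation.
    async for output in gen:
        print(output.get("meta_info", {}))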
@@ -321,14 +321,14 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer):
         logger.info(f"Sending health check request to request manager...")

         # Submit and wait for response
-        output_queue = await self.request_manager.generate_request(
+        output_generator = self.request_manager.generate_request(
             health_request, request_id=rid
         )

         try:
-            # Wait for response with configurable timeout
+            # Get first response with timeout
             response = await asyncio.wait_for(
-                output_queue.get(), timeout=HEALTH_CHECK_TIMEOUT
+                output_generator.__anext__(), timeout=HEALTH_CHECK_TIMEOUT
             )

             # Clean up
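The health check consumes only the first generator item; __anext__ pairs naturally with asyncio.wait_for for the timeout. A sketch of that pattern (hypothetical helper, assuming the generator should be closed once the first item is taken):

import asyncio

async def first_item_with_timeout(agen, timeout: float):
    try:
        return await asyncio.wait_for(agen.__anext__(), timeout=timeout)
    finally:
        # Close the generator so the underlying request state is released.
        await agen.aclose()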
@@ -492,13 +492,32 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer):
     ) -> sglang_scheduler_pb2.GenerateResponse:
         """Create a completion response."""

-        # Determine finish reason
-        finish_reason = sglang_scheduler_pb2.GenerateComplete.STOP
+        # Extract meta info and finish reason details
         meta_info = output.get("meta_info", {})
-        if meta_info.get("finish_reason") == "length":
-            finish_reason = sglang_scheduler_pb2.GenerateComplete.LENGTH
-        elif meta_info.get("finish_reason") == "eos_token":
-            finish_reason = sglang_scheduler_pb2.GenerateComplete.EOS_TOKEN
+        finish_reason_data = meta_info.get("finish_reason")
+
+        # Determine finish reason, default is stop
+        finish_reason = "stop"
+        if finish_reason_data:
+            if isinstance(finish_reason_data, dict):
+                finish_reason_type = finish_reason_data.get("type")
+            else:
+                # Handle legacy string format
+                finish_reason_type = finish_reason_data
+            if finish_reason_type == "length":
+                finish_reason = "length"
+            elif finish_reason_type == "abort":
+                finish_reason = "abort"
+
+        # Extract matched_stop information
+        matched_stop_kwargs = {}
+        if isinstance(finish_reason_data, dict) and "matched" in finish_reason_data:
+            matched = finish_reason_data["matched"]
+            if isinstance(matched, int):
+                matched_stop_kwargs["matched_token_id"] = matched
+            elif isinstance(matched, str):
+                matched_stop_kwargs["matched_stop_str"] = matched

         return sglang_scheduler_pb2.GenerateResponse(
             request_id=request_id,
@@ -510,6 +529,7 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer):
                 "completion_tokens", len(output.get("token_ids", []))
             ),
             cached_tokens=meta_info.get("cached_tokens", 0),
+            **matched_stop_kwargs,
         ),
     )
......
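The handler accepts both the scheduler's structured finish reason (a dict with a "type" and an optional "matched" value) and the legacy plain-string form. The same mapping as a standalone sketch, for illustration only:

def map_finish_reason(finish_reason_data):
    # Default is "stop"; "length" and "abort" map through, and anything
    # else (e.g. the legacy "eos_token" string) falls back to "stop".
    finish_reason = "stop"
    if finish_reason_data:
        if isinstance(finish_reason_data, dict):
            finish_reason_type = finish_reason_data.get("type")
        else:
            finish_reason_type = finish_reason_data  # legacy string format
        if finish_reason_type == "length":
            finish_reason = "length"
        elif finish_reason_type == "abort":
            finish_reason = "abort"

    matched_stop_kwargs = {}
    if isinstance(finish_reason_data, dict) and "matched" in finish_reason_data:
        matched = finish_reason_data["matched"]
        if isinstance(matched, int):
            matched_stop_kwargs["matched_token_id"] = matched
        elif isinstance(matched, str):
            matched_stop_kwargs["matched_stop_str"] = matched
    return finish_reason, matched_stop_kwargs

# map_finish_reason({"type": "stop", "matched": 128009})
# -> ("stop", {"matched_token_id": 128009})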
@@ -185,20 +185,8 @@ message GenerateComplete {
   // Final output
   repeated uint32 output_ids = 1;

-  // Finish reason
-  enum FinishReason {
-    // The model generated a stop sequence.
-    STOP = 0;
-    // The model reached the maximum generation length.
-    LENGTH = 1;
-    // The model generated an end-of-sequence (EOS) token.
-    EOS_TOKEN = 2;
-    // The model generated a user-provided stop string.
-    STOP_STR = 3;
-    // The request was aborted by the user or system.
-    ABORT = 4;
-  }
-  FinishReason finish_reason = 2;
+  // Finish reason as OpenAI-compatible string ("stop", "length", "abort")
+  string finish_reason = 2;

   // Token usage counts
   int32 prompt_tokens = 3;
@@ -210,6 +198,12 @@ message GenerateComplete {
   // All hidden states if requested
   repeated HiddenStates all_hidden_states = 7;

+  // Matched stop information (for stop sequences)
+  oneof matched_stop {
+    uint32 matched_token_id = 8;
+    string matched_stop_str = 9;
+  }
 }

 message GenerateError {
......
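Because matched_stop is a oneof, at most one of the two fields is populated, and the generated Python code reports which via WhichOneof. A reader sketch (the import path is an assumption):

from sglang.srt.grpc import sglang_scheduler_pb2  # assumed module path

def matched_stop_of(complete):
    which = complete.WhichOneof("matched_stop")  # field name or None
    if which == "matched_token_id":
        return complete.matched_token_id  # uint32
    if which == "matched_stop_str":
        return complete.matched_stop_str  # str
    return None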
@@ -29,7 +29,7 @@ from google.protobuf import timestamp_pb2 as google_dot_protobuf_dot_timestamp__pb2
 from google.protobuf import struct_pb2 as google_dot_protobuf_dot_struct__pb2

-DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x16sglang_scheduler.proto\x12\x15sglang.grpc.scheduler...b\x06proto3')  # old serialized descriptor (enum FinishReason); full byte blob elided
+DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x16sglang_scheduler.proto\x12\x15sglang.grpc.scheduler...b\x06proto3')  # regenerated descriptor (string finish_reason, matched_stop oneof); full byte blob elided

 _globals = globals()
 _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
@@ -55,55 +55,53 @@ if not _descriptor._USE_C_DESCRIPTORS:
   _globals['_GENERATESTREAMCHUNK']._serialized_start=2088
   _globals['_GENERATESTREAMCHUNK']._serialized_end=2275
   _globals['_GENERATECOMPLETE']._serialized_start=2278
-  _globals['_GENERATECOMPLETE']._serialized_end=2663
-  _globals['_GENERATECOMPLETE_FINISHREASON']._serialized_start=2587
-  _globals['_GENERATECOMPLETE_FINISHREASON']._serialized_end=2663
-  _globals['_GENERATEERROR']._serialized_start=2665
-  _globals['_GENERATEERROR']._serialized_end=2740
-  _globals['_LOGPROBS']._serialized_start=2743
-  _globals['_LOGPROBS']._serialized_end=2875
-  _globals['_TOPLOGPROBS']._serialized_start=2877
-  _globals['_TOPLOGPROBS']._serialized_end=2946
-  _globals['_HIDDENSTATES']._serialized_start=2948
-  _globals['_HIDDENSTATES']._serialized_end=3011
-  _globals['_EMBEDREQUEST']._serialized_start=3014
-  _globals['_EMBEDREQUEST']._serialized_end=3344
-  _globals['_EMBEDRESPONSE']._serialized_start=3347
-  _globals['_EMBEDRESPONSE']._serialized_end=3504
-  _globals['_EMBEDCOMPLETE']._serialized_start=3507
-  _globals['_EMBEDCOMPLETE']._serialized_end=3670
-  _globals['_EMBEDDING']._serialized_start=3672
-  _globals['_EMBEDDING']._serialized_end=3714
-  _globals['_EMBEDERROR']._serialized_start=3716
-  _globals['_EMBEDERROR']._serialized_end=3776
-  _globals['_HEALTHCHECKREQUEST']._serialized_start=3778
-  _globals['_HEALTHCHECKREQUEST']._serialized_end=3856
-  _globals['_HEALTHCHECKRESPONSE']._serialized_start=3858
-  _globals['_HEALTHCHECKRESPONSE']._serialized_end=3913
-  _globals['_ABORTREQUEST']._serialized_start=3915
-  _globals['_ABORTREQUEST']._serialized_end=3965
-  _globals['_ABORTRESPONSE']._serialized_start=3967
-  _globals['_ABORTRESPONSE']._serialized_end=4016
-  _globals['_LOADLORAREQUEST']._serialized_start=4018
-  _globals['_LOADLORAREQUEST']._serialized_end=4091
-  _globals['_LOADLORARESPONSE']._serialized_start=4093
-  _globals['_LOADLORARESPONSE']._serialized_end=4165
-  _globals['_UNLOADLORAREQUEST']._serialized_start=4167
-  _globals['_UNLOADLORAREQUEST']._serialized_end=4206
-  _globals['_UNLOADLORARESPONSE']._serialized_start=4208
-  _globals['_UNLOADLORARESPONSE']._serialized_end=4262
-  _globals['_UPDATEWEIGHTSREQUEST']._serialized_start=4264
-  _globals['_UPDATEWEIGHTSREQUEST']._serialized_end=4383
-  _globals['_UPDATEWEIGHTSRESPONSE']._serialized_start=4385
-  _globals['_UPDATEWEIGHTSRESPONSE']._serialized_end=4442
-  _globals['_GETINTERNALSTATEREQUEST']._serialized_start=4444
-  _globals['_GETINTERNALSTATEREQUEST']._serialized_end=4489
-  _globals['_GETINTERNALSTATERESPONSE']._serialized_start=4491
-  _globals['_GETINTERNALSTATERESPONSE']._serialized_end=4557
-  _globals['_SETINTERNALSTATEREQUEST']._serialized_start=4559
-  _globals['_SETINTERNALSTATEREQUEST']._serialized_end=4624
-  _globals['_SETINTERNALSTATERESPONSE']._serialized_start=4626
-  _globals['_SETINTERNALSTATERESPONSE']._serialized_end=4686
-  _globals['_SGLANGSCHEDULER']._serialized_start=4689
-  _globals['_SGLANGSCHEDULER']._serialized_end=5071
+  _globals['_GENERATECOMPLETE']._serialized_end=2603
+  _globals['_GENERATEERROR']._serialized_start=2605
+  _globals['_GENERATEERROR']._serialized_end=2680
+  _globals['_LOGPROBS']._serialized_start=2683
+  _globals['_LOGPROBS']._serialized_end=2815
+  _globals['_TOPLOGPROBS']._serialized_start=2817
+  _globals['_TOPLOGPROBS']._serialized_end=2886
+  _globals['_HIDDENSTATES']._serialized_start=2888
+  _globals['_HIDDENSTATES']._serialized_end=2951
+  _globals['_EMBEDREQUEST']._serialized_start=2954
+  _globals['_EMBEDREQUEST']._serialized_end=3284
+  _globals['_EMBEDRESPONSE']._serialized_start=3287
+  _globals['_EMBEDRESPONSE']._serialized_end=3444
+  _globals['_EMBEDCOMPLETE']._serialized_start=3447
+  _globals['_EMBEDCOMPLETE']._serialized_end=3610
+  _globals['_EMBEDDING']._serialized_start=3612
+  _globals['_EMBEDDING']._serialized_end=3654
+  _globals['_EMBEDERROR']._serialized_start=3656
+  _globals['_EMBEDERROR']._serialized_end=3716
+  _globals['_HEALTHCHECKREQUEST']._serialized_start=3718
+  _globals['_HEALTHCHECKREQUEST']._serialized_end=3796
+  _globals['_HEALTHCHECKRESPONSE']._serialized_start=3798
+  _globals['_HEALTHCHECKRESPONSE']._serialized_end=3853
+  _globals['_ABORTREQUEST']._serialized_start=3855
+  _globals['_ABORTREQUEST']._serialized_end=3905
+  _globals['_ABORTRESPONSE']._serialized_start=3907
+  _globals['_ABORTRESPONSE']._serialized_end=3956
+  _globals['_LOADLORAREQUEST']._serialized_start=3958
+  _globals['_LOADLORAREQUEST']._serialized_end=4031
+  _globals['_LOADLORARESPONSE']._serialized_start=4033
+  _globals['_LOADLORARESPONSE']._serialized_end=4105
+  _globals['_UNLOADLORAREQUEST']._serialized_start=4107
+  _globals['_UNLOADLORAREQUEST']._serialized_end=4146
+  _globals['_UNLOADLORARESPONSE']._serialized_start=4148
+  _globals['_UNLOADLORARESPONSE']._serialized_end=4202
+  _globals['_UPDATEWEIGHTSREQUEST']._serialized_start=4204
+  _globals['_UPDATEWEIGHTSREQUEST']._serialized_end=4323
+  _globals['_UPDATEWEIGHTSRESPONSE']._serialized_start=4325
+  _globals['_UPDATEWEIGHTSRESPONSE']._serialized_end=4382
+  _globals['_GETINTERNALSTATEREQUEST']._serialized_start=4384
+  _globals['_GETINTERNALSTATEREQUEST']._serialized_end=4429
+  _globals['_GETINTERNALSTATERESPONSE']._serialized_start=4431
+  _globals['_GETINTERNALSTATERESPONSE']._serialized_end=4497
+  _globals['_SETINTERNALSTATEREQUEST']._serialized_start=4499
+  _globals['_SETINTERNALSTATEREQUEST']._serialized_end=4564
+  _globals['_SETINTERNALSTATERESPONSE']._serialized_start=4566
+  _globals['_SETINTERNALSTATERESPONSE']._serialized_end=4626
+  _globals['_SGLANGSCHEDULER']._serialized_start=4629
+  _globals['_SGLANGSCHEDULER']._serialized_end=5011
 # @@protoc_insertion_point(module_scope)
@@ -3,7 +3,6 @@ import datetime
 from google.protobuf import timestamp_pb2 as _timestamp_pb2
 from google.protobuf import struct_pb2 as _struct_pb2
 from google.protobuf.internal import containers as _containers
-from google.protobuf.internal import enum_type_wrapper as _enum_type_wrapper
 from google.protobuf import descriptor as _descriptor
 from google.protobuf import message as _message
 from collections.abc import Iterable as _Iterable, Mapping as _Mapping
@@ -179,19 +178,7 @@ class GenerateStreamChunk(_message.Message):
     def __init__(self, token_ids: _Optional[_Iterable[int]] = ..., prompt_tokens: _Optional[int] = ..., completion_tokens: _Optional[int] = ..., cached_tokens: _Optional[int] = ..., logprobs: _Optional[_Union[LogProbs, _Mapping]] = ..., hidden_states: _Optional[_Iterable[float]] = ...) -> None: ...

 class GenerateComplete(_message.Message):
-    __slots__ = ("output_ids", "finish_reason", "prompt_tokens", "completion_tokens", "cached_tokens", "all_logprobs", "all_hidden_states")
-    class FinishReason(int, metaclass=_enum_type_wrapper.EnumTypeWrapper):
-        __slots__ = ()
-        STOP: _ClassVar[GenerateComplete.FinishReason]
-        LENGTH: _ClassVar[GenerateComplete.FinishReason]
-        EOS_TOKEN: _ClassVar[GenerateComplete.FinishReason]
-        STOP_STR: _ClassVar[GenerateComplete.FinishReason]
-        ABORT: _ClassVar[GenerateComplete.FinishReason]
-    STOP: GenerateComplete.FinishReason
-    LENGTH: GenerateComplete.FinishReason
-    EOS_TOKEN: GenerateComplete.FinishReason
-    STOP_STR: GenerateComplete.FinishReason
-    ABORT: GenerateComplete.FinishReason
+    __slots__ = ("output_ids", "finish_reason", "prompt_tokens", "completion_tokens", "cached_tokens", "all_logprobs", "all_hidden_states", "matched_token_id", "matched_stop_str")
     OUTPUT_IDS_FIELD_NUMBER: _ClassVar[int]
     FINISH_REASON_FIELD_NUMBER: _ClassVar[int]
     PROMPT_TOKENS_FIELD_NUMBER: _ClassVar[int]
@@ -199,14 +186,18 @@ class GenerateComplete(_message.Message):
     CACHED_TOKENS_FIELD_NUMBER: _ClassVar[int]
     ALL_LOGPROBS_FIELD_NUMBER: _ClassVar[int]
     ALL_HIDDEN_STATES_FIELD_NUMBER: _ClassVar[int]
+    MATCHED_TOKEN_ID_FIELD_NUMBER: _ClassVar[int]
+    MATCHED_STOP_STR_FIELD_NUMBER: _ClassVar[int]
     output_ids: _containers.RepeatedScalarFieldContainer[int]
-    finish_reason: GenerateComplete.FinishReason
+    finish_reason: str
     prompt_tokens: int
     completion_tokens: int
     cached_tokens: int
     all_logprobs: _containers.RepeatedCompositeFieldContainer[LogProbs]
     all_hidden_states: _containers.RepeatedCompositeFieldContainer[HiddenStates]
-    def __init__(self, output_ids: _Optional[_Iterable[int]] = ..., finish_reason: _Optional[_Union[GenerateComplete.FinishReason, str]] = ..., prompt_tokens: _Optional[int] = ..., completion_tokens: _Optional[int] = ..., cached_tokens: _Optional[int] = ..., all_logprobs: _Optional[_Iterable[_Union[LogProbs, _Mapping]]] = ..., all_hidden_states: _Optional[_Iterable[_Union[HiddenStates, _Mapping]]] = ...) -> None: ...
+    matched_token_id: int
+    matched_stop_str: str
+    def __init__(self, output_ids: _Optional[_Iterable[int]] = ..., finish_reason: _Optional[str] = ..., prompt_tokens: _Optional[int] = ..., completion_tokens: _Optional[int] = ..., cached_tokens: _Optional[int] = ..., all_logprobs: _Optional[_Iterable[_Union[LogProbs, _Mapping]]] = ..., all_hidden_states: _Optional[_Iterable[_Union[HiddenStates, _Mapping]]] = ..., matched_token_id: _Optional[int] = ..., matched_stop_str: _Optional[str] = ...) -> None: ...

 class GenerateError(_message.Message):
     __slots__ = ("message", "http_status_code", "details")
......
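Per the updated stub, finish_reason is a plain string and the matched-stop fields are ordinary keyword arguments (setting either one selects the oneof). A construction sketch, reusing the assumed module path from above:

complete = sglang_scheduler_pb2.GenerateComplete(
    output_ids=[1, 2, 3],
    finish_reason="stop",
    prompt_tokens=5,
    completion_tokens=3,
    matched_token_id=128009,  # hypothetical stop token id; sets the oneof
)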
@@ -185,20 +185,8 @@ message GenerateComplete {
   // Final output
   repeated uint32 output_ids = 1;

-  // Finish reason
-  enum FinishReason {
-    // The model generated a stop sequence.
-    STOP = 0;
-    // The model reached the maximum generation length.
-    LENGTH = 1;
-    // The model generated an end-of-sequence (EOS) token.
-    EOS_TOKEN = 2;
-    // The model generated a user-provided stop string.
-    STOP_STR = 3;
-    // The request was aborted by the user or system.
-    ABORT = 4;
-  }
-  FinishReason finish_reason = 2;
+  // Finish reason as OpenAI-compatible string ("stop", "length", "abort")
+  string finish_reason = 2;

   // Token usage counts
   int32 prompt_tokens = 3;
@@ -210,6 +198,12 @@ message GenerateComplete {
   // All hidden states if requested
   repeated HiddenStates all_hidden_states = 7;

+  // Matched stop information (for stop sequences)
+  oneof matched_stop {
+    uint32 matched_token_id = 8;
+    string matched_stop_str = 9;
+  }
 }

 message GenerateError {
......
@@ -423,10 +423,25 @@ pub struct ChatCompletionResponse {
     pub system_fingerprint: Option<String>,
 }

+/// Response message structure for ChatCompletionResponse (different from request ChatMessage)
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct ChatCompletionMessage {
+    pub role: String, // Always "assistant" for responses
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub content: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub tool_calls: Option<Vec<ToolCall>>,
+    /// Reasoning content for O1-style models (SGLang extension)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub reasoning_content: Option<String>,
+    // Note: function_call is deprecated and not included
+    // Note: refusal, annotations, audio are not added yet
+}
+
 #[derive(Debug, Clone, Deserialize, Serialize)]
 pub struct ChatChoice {
     pub index: u32,
-    pub message: ChatMessage,
+    pub message: ChatCompletionMessage,
     #[serde(skip_serializing_if = "Option::is_none")]
     pub logprobs: Option<ChatLogProbs>,
     pub finish_reason: Option<String>, // "stop", "length", "tool_calls", "content_filter", "function_call"
......
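With skip_serializing_if = "Option::is_none", unset optional fields are omitted from the JSON entirely rather than emitted as null, matching OpenAI's response shape. Illustrative expected serializations (hand-written, not produced by the crate):

# A plain text answer: tool_calls and reasoning_content are absent, not null.
text_message = {"role": "assistant", "content": "Hello!"}

# A reasoning-model answer: the SGLang extension field appears only when set.
reasoning_message = {
    "role": "assistant",
    "content": "The answer is 4.",
    "reasoning_content": "2 + 2 = 4",
}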
@@ -8,6 +8,7 @@ use axum::{
     extract::Request,
     http::{HeaderMap, StatusCode},
     response::{IntoResponse, Response},
+    Json,
 };
 use tracing::{debug, error, info, warn};
@@ -18,8 +19,9 @@ use crate::metrics::RouterMetrics;
 use crate::policies::PolicyRegistry;
 use crate::protocols::spec::ChatMessage;
 use crate::protocols::spec::{
-    ChatCompletionRequest, CompletionRequest, EmbeddingRequest, GenerateRequest, RerankRequest,
-    ResponsesGetParams, ResponsesRequest, StringOrArray, Tool, ToolChoice,
+    ChatChoice, ChatCompletionMessage, ChatCompletionRequest, ChatCompletionResponse,
+    CompletionRequest, EmbeddingRequest, GenerateRequest, RerankRequest, ResponsesGetParams,
+    ResponsesRequest, StringOrArray, Tool, ToolChoice, Usage,
 };
 use crate::reasoning_parser::ParserFactory;
 use crate::routers::RouterTrait;
@@ -30,6 +32,7 @@ use crate::tokenizer::traits::Tokenizer;
 use crate::tokenizer::HuggingFaceTokenizer;
 use crate::tool_parser::ParserRegistry;
 use serde_json::Value;
+use std::time::{SystemTime, UNIX_EPOCH};
 use tokio_stream::StreamExt;
 use uuid::Uuid;
@@ -648,18 +651,16 @@ impl GrpcRouter {
             Err(e) => return fail_fmt("Failed to start generation: ", &e),
         };

-        // Get the single Complete response
-        let gen_response = match stream.next().await {
-            Some(Ok(r)) => r,
-            Some(Err(e)) => return fail_fmt("Failed to get GenerateResponse: ", &e),
-            None => return fail_str("No response from server"),
-        };
-
-        // Extract the expected variant early
-        let complete = match gen_response.response {
-            Some(proto::generate_response::Response::Complete(c)) => c,
-            Some(proto::generate_response::Response::Error(err)) => {
-                error!("Generation failed: {}", err.message);
-                return (
-                    StatusCode::INTERNAL_SERVER_ERROR,
-                    format!("Generation failed: {}", err.message),
+        // Collect all responses (for n>1 support)
+        let mut all_responses = Vec::new();
+        while let Some(response) = stream.next().await {
+            match response {
+                Ok(gen_response) => match gen_response.response {
+                    Some(proto::generate_response::Response::Complete(complete)) => {
+                        all_responses.push(complete);
+                    }
+                    Some(proto::generate_response::Response::Error(err)) => {
+                        error!("Generation failed for one choice: {}", err.message);
+                        return (
+                            StatusCode::INTERNAL_SERVER_ERROR,
+                            format!("Generation failed: {}", err.message),
@@ -670,14 +671,79 @@ impl GrpcRouter {
                         return fail_str("Unexpected chunk response for non-streaming request")
                     }
                     None => return fail_str("Empty response from server"),
-        };
-
-        // Decode tokens
-        let outputs = match stop_decoder.process_tokens(&complete.output_ids) {
-            Ok(o) => o,
-            Err(e) => return fail_fmt("Failed to process tokens: ", &e),
-        };
+                },
+                Err(e) => return fail_fmt("Failed to get GenerateResponse: ", &e),
+            }
+        }
+
+        if all_responses.is_empty() {
+            return fail_str("No responses from server");
+        }
+
+        // Process each response into a ChatChoice
+        let mut choices = Vec::new();
+        for (index, complete) in all_responses.iter().enumerate() {
+            match self
+                .process_single_choice(complete, index, original_request, &mut stop_decoder)
+                .await
+            {
+                Ok(choice) => choices.push(choice),
+                Err(e) => {
+                    error!("Failed to process choice {}: {}", index, e);
+                    return (
+                        StatusCode::INTERNAL_SERVER_ERROR,
+                        format!("Failed to process choice {}: {}", index, e),
+                    )
+                    .into_response();
+                }
+            }
+        }
+
+        // Aggregate usage information from all responses
+        let total_prompt_tokens: u32 = all_responses.iter().map(|r| r.prompt_tokens as u32).sum();
+        let total_completion_tokens: u32 = all_responses
+            .iter()
+            .map(|r| r.completion_tokens as u32)
+            .sum();
+        let usage = Usage {
+            prompt_tokens: total_prompt_tokens,
+            completion_tokens: total_completion_tokens,
+            total_tokens: total_prompt_tokens + total_completion_tokens,
+            completion_tokens_details: None,
+        };
+
+        // Build final ChatCompletionResponse
+        let response = ChatCompletionResponse {
+            id: format!("chatcmpl-{}", Uuid::new_v4()),
+            object: "chat.completion".to_string(),
+            created: SystemTime::now()
+                .duration_since(UNIX_EPOCH)
+                .unwrap_or_default()
+                .as_secs(),
+            model: original_request.model.clone(),
+            choices,
+            usage: Some(usage),
+            system_fingerprint: None,
+        };
+
+        // Serialize and return JSON response
+        Json(response).into_response()
+    }
+
+    /// Process a single GenerateComplete response into a ChatChoice
+    async fn process_single_choice(
+        &self,
+        complete: &proto::GenerateComplete,
+        index: usize,
+        original_request: &ChatCompletionRequest,
+        stop_decoder: &mut crate::tokenizer::stop::StopSequenceDecoder,
+    ) -> Result<ChatChoice, String> {
+        stop_decoder.reset();
+
+        // Decode tokens
+        let outputs = stop_decoder
+            .process_tokens(&complete.output_ids)
+            .map_err(|e| format!("Failed to process tokens: {}", e))?;

         // Accumulate text with early breaks
         let mut final_text = String::new();
         for output in outputs {
@@ -697,8 +763,119 @@ impl GrpcRouter {
             final_text.push_str(&t);
         }

-        // TODO: Create proper OpenAI-compatible response
-        (StatusCode::OK, format!("Final text: {}", final_text)).into_response()
+        // Step 1: Handle reasoning content parsing
+        let mut reasoning_text: Option<String> = None;
+        let mut processed_text = final_text;
+
+        // Check if reasoning parsing is enabled and separate_reasoning is requested
+        if original_request.separate_reasoning {
+            if let Ok(mut parser) = self
+                .reasoning_parser_factory
+                .create(&original_request.model)
+            {
+                match parser.detect_and_parse_reasoning(&processed_text) {
+                    Ok(result) => {
+                        if !result.reasoning_text.is_empty() {
+                            reasoning_text = Some(result.reasoning_text);
+                        }
+                        processed_text = result.normal_text;
+                    }
+                    Err(e) => {
+                        return Err(format!("Reasoning parsing error: {}", e));
+                    }
+                }
+            }
+        }
+
+        // Step 2: Handle tool call parsing
+        let mut tool_calls: Option<Vec<crate::protocols::spec::ToolCall>> = None;
+
+        // Check if tool calls should be processed
+        let tool_choice_enabled = !matches!(
+            &original_request.tool_choice,
+            Some(ToolChoice::Value(
+                crate::protocols::spec::ToolChoiceValue::None
+            ))
+        );
+
+        if tool_choice_enabled && original_request.tools.is_some() {
+            if let Some(parser) = self
+                .tool_parser_registry
+                .get_parser(&original_request.model)
+            {
+                match parser.parse_complete(&processed_text).await {
+                    Ok(parsed_tool_calls) => {
+                        if !parsed_tool_calls.is_empty() {
+                            let spec_tool_calls = parsed_tool_calls
+                                .into_iter()
+                                .map(|tc| crate::protocols::spec::ToolCall {
+                                    id: tc.id,
+                                    tool_type: "function".to_string(),
+                                    function: crate::protocols::spec::FunctionCallResponse {
+                                        name: tc.function.name,
+                                        arguments: Some(
+                                            serde_json::to_string(&tc.function.arguments)
+                                                .unwrap_or_else(|_| "{}".to_string()),
+                                        ),
+                                    },
+                                })
+                                .collect();
+                            tool_calls = Some(spec_tool_calls);
+                            processed_text = String::new();
+                        }
+                    }
+                    Err(e) => {
+                        error!("Tool call parsing error: {}", e);
+                        // Continue without tool calls rather than failing
+                    }
+                }
+            }
+        }
+
+        // Step 3: Use finish reason directly from proto (already OpenAI-compatible string)
+        let finish_reason_str = &complete.finish_reason;
+
+        // Override finish reason if we have tool calls
+        let final_finish_reason_str = if tool_calls.is_some() {
+            "tool_calls"
+        } else {
+            finish_reason_str
+        };
+
+        // Extract matched_stop information from proto
+        let matched_stop = match &complete.matched_stop {
+            Some(proto::generate_complete::MatchedStop::MatchedTokenId(token_id)) => Some(
+                serde_json::Value::Number(serde_json::Number::from(*token_id)),
+            ),
+            Some(proto::generate_complete::MatchedStop::MatchedStopStr(stop_str)) => {
+                Some(serde_json::Value::String(stop_str.clone()))
+            }
+            None => None,
+        };
+
+        // Step 4: Build ChatCompletionMessage (proper response message type)
+        let chat_message = ChatCompletionMessage {
+            role: "assistant".to_string(),
+            content: if processed_text.is_empty() {
+                None
+            } else {
+                Some(processed_text)
+            },
+            tool_calls,
+            reasoning_content: reasoning_text,
+        };
+
+        // Step 5: Build ChatChoice
+        let choice = ChatChoice {
+            index: index as u32,
+            message: chat_message,
+            logprobs: None,
+            finish_reason: Some(final_finish_reason_str.to_string()),
+            matched_stop,
+            hidden_states: None,
+        };
+
+        Ok(choice)
     }
 }
......
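End to end, a non-streaming chat completion through the gRPC router now yields a full OpenAI-style body: one choice per collected GenerateComplete (n > 1 produces several) and usage summed across them. A hedged client sketch (URL, port, and model name are placeholders):

import requests

resp = requests.post(
    "http://localhost:30000/v1/chat/completions",  # placeholder router address
    json={
        "model": "my-model",  # placeholder
        "messages": [{"role": "user", "content": "Say hi"}],
        "n": 2,  # two sampled choices -> two collected responses
        "stream": False,
    },
)
body = resp.json()
assert body["object"] == "chat.completion"
assert [c["index"] for c in body["choices"]] == [0, 1]
print(body["usage"], body["choices"][0]["finish_reason"])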