// vllm_engine.proto

syntax = "proto3";

package vllm.grpc.engine;

// Service definition for vLLM engine communication.
// This protocol is designed for efficient binary communication between
// the Rust router and the vLLM Python engine (AsyncLLM).
service VllmEngine {
  // Submits a generation request. This is a server-streaming RPC: the reply
  // is a stream of GenerateResponse messages, each holding either an
  // incremental chunk or the completion record.
  rpc Generate(GenerateRequest) returns (stream GenerateResponse);

  // Submits an embedding request. Unary: one EmbedRequest in, one
  // EmbedResponse out.
  rpc Embed(EmbedRequest) returns (EmbedResponse);

  // Health check. Takes an empty request and returns a healthy flag plus a
  // free-form message.
  rpc HealthCheck(HealthCheckRequest) returns (HealthCheckResponse);

  // Aborts running requests. The request carries a list of request IDs; the
  // response currently has no fields.
  rpc Abort(AbortRequest) returns (AbortResponse);

  // Gets model information (model path, context length, vocabulary size and
  // capability flags).
  rpc GetModelInfo(GetModelInfoRequest) returns (GetModelInfoResponse);

  // Gets server information (active request count, paused flag, timing data
  // and server type).
  rpc GetServerInfo(GetServerInfoRequest) returns (GetServerInfoResponse);
}

// =====================
// Common Types
// =====================

// Sampling parameters for text generation.
//
// Presence rules: fields marked `optional` have explicit presence, so the
// receiver can tell "not set" apart from a zero value. Every other scalar
// field uses proto3 implicit presence: 0 / false is not sent on the wire and
// is indistinguishable from "not set".
// NOTE(review): for the implicit-presence fields, confirm the receiver treats
// 0 / false as "use the engine default" wherever 0 / false is not the intended
// default.
message SamplingParams {
  // Explicit presence: an explicit 0 is distinguishable from "not set".
  optional float temperature = 1;
  // Further sampling controls and penalties (all implicit presence).
  float top_p = 2;
  uint32 top_k = 3;
  float min_p = 4;
  float frequency_penalty = 5;
  float presence_penalty = 6;
  float repetition_penalty = 7;

  // Token-count bounds. Only max_tokens has explicit presence.
  optional uint32 max_tokens = 8;
  uint32 min_tokens = 9;

  // Stop conditions, given as strings and/or token IDs. Repeated fields have
  // no presence: an empty list and "not set" are the same.
  repeated string stop = 10;
  repeated uint32 stop_token_ids = 11;

  // Boolean switches (implicit presence, so "not set" reads as false).
  bool skip_special_tokens = 12;
  bool spaces_between_special_tokens = 13;
  bool ignore_eos = 14;

  uint32 n = 15;  // Number of parallel samples

  // Logprobs configuration. Both fields have explicit presence, so 0 and
  // "not set" are distinct.
  optional int32 logprobs = 22;  // Number of log probabilities per output token (-1 for all)
  optional int32 prompt_logprobs = 23;  // Number of log probabilities per prompt token (-1 for all)

  // Additional vLLM fields
  optional int32 seed = 24;  // Random seed for reproducibility
  bool include_stop_str_in_output = 25;  // Whether to include stop strings in output
  // Map fields have no presence (empty == not set) and no defined wire order.
  map<int32, float> logit_bias = 26;  // Token ID to bias mapping (-100 to 100)
  optional int32 truncate_prompt_tokens = 27;  // Prompt truncation (-1 for model max)

  // Structured outputs (one of) - matches vLLM's StructuredOutputsParams.
  // At most one member can be set; setting one clears any other. The members
  // use field numbers 16-21, which sit numerically below the 22-27 fields
  // declared above; declaration order has no effect on the wire format.
  oneof constraint {
    string json_schema = 16;  // JSON schema for structured output
    string regex = 17;  // Regex pattern
    string grammar = 18;  // Grammar/EBNF for structured output
    string structural_tag = 19;  // Structural tag (e.g., Harmony models)
    bool json_object = 20;  // Force JSON object output
    ChoiceConstraint choice = 21;  // List of allowed choices
  }
}

// Choice constraint for structured outputs.
// Wraps the list in a message because a repeated field cannot be a direct
// member of a oneof (it is used by SamplingParams.constraint).
message ChoiceConstraint {
  // The allowed choices.
  repeated string choices = 1;
}

// Pre-tokenized input from the Rust router: the token IDs to process, plus
// the text they came from, which is carried for reference/debugging only.
message TokenizedInput {
  string original_text = 1;  // For reference/debugging
  repeated uint32 input_ids = 2;  // Actual token IDs to process
}

// =====================
// Generate Request
// =====================

// Request message for VllmEngine.Generate.
message GenerateRequest {
  // Identifier for this request.
  // NOTE(review): presumably the same ID that AbortRequest.request_ids refers
  // to — confirm.
  string request_id = 1;

  // Prompt input. The two forms are mutually exclusive: either pre-tokenized
  // input or raw text. Setting one clears the other.
  oneof input {
    TokenizedInput tokenized = 2;
    string text = 3;
  }

  // Generation parameters (includes logprobs config)
  SamplingParams sampling_params = 4;

  // Streaming flag.
  // NOTE(review): the Generate RPC is declared server-streaming regardless of
  // this flag; presumably it selects between incremental chunks and a single
  // complete message — confirm against the server implementation.
  bool stream = 5;
}

// =====================
// Generate Response
// =====================

// One message on the VllmEngine.Generate response stream. Holds either an
// incremental chunk or the completion record, never both.
message GenerateResponse {
  oneof response {
    GenerateStreamChunk chunk = 1;     // For streaming
    GenerateComplete complete = 2;     // For final/non-streaming
  }
}

// Incremental output carried in GenerateResponse.chunk.
message GenerateStreamChunk {
  repeated uint32 token_ids = 1;       // Incremental tokens
  // Token counters.
  // NOTE(review): whether these are cumulative or per-chunk values is not
  // stated in this file — confirm.
  uint32 prompt_tokens = 2;
  uint32 completion_tokens = 3;
  uint32 cached_tokens = 4;

  // Logprobs support (TODO: implement in Phase 4)
  // Field numbers 5 and 6 are earmarked for the fields sketched below.
  // OutputLogProbs output_logprobs = 5;
  // InputLogProbs input_logprobs = 6;  // Only in first chunk
}

// Completion record carried in GenerateResponse.complete (final message, or
// the only message for non-streaming use).
message GenerateComplete {
  repeated uint32 output_ids = 1;      // All output tokens
  // Free-form string rather than an enum; the values listed are the ones
  // documented here.
  string finish_reason = 2;            // "stop", "length", "abort"
  // Token counters for the request.
  uint32 prompt_tokens = 3;
  uint32 completion_tokens = 4;
  uint32 cached_tokens = 5;

  // Logprobs support (TODO: implement in Phase 4)
  // Field numbers 6 and 7 are earmarked for the fields sketched below.
  // OutputLogProbs output_logprobs = 6;
  // InputLogProbs input_logprobs = 7;
}

// =====================
// Embedding Request
// =====================

// Request message for VllmEngine.Embed. Accepts pre-tokenized input only;
// unlike GenerateRequest there is no raw-text alternative.
message EmbedRequest {
  // Identifier for this request.
  string request_id = 1;
  // The token IDs to embed (plus the original text for reference).
  TokenizedInput tokenized = 2;
}

// Response message for VllmEngine.Embed.
message EmbedResponse {
  // The embedding values, as 32-bit floats.
  repeated float embedding = 1;
  // Token counter for the prompt.
  uint32 prompt_tokens = 2;
  // NOTE(review): presumably equal to the number of entries in `embedding` —
  // confirm.
  uint32 embedding_dim = 3;
}

// =====================
// Management Operations
// =====================

// Request message for VllmEngine.HealthCheck. Currently has no fields; a
// dedicated message lets fields be added later without changing the RPC
// signature.
message HealthCheckRequest {}

// Response message for VllmEngine.HealthCheck.
message HealthCheckResponse {
  // Health flag. Implicit presence: an unset field reads as false.
  bool healthy = 1;
  // Free-form text accompanying the health flag.
  string message = 2;
}

// Request message for VllmEngine.Abort. Several requests can be named in a
// single call.
message AbortRequest {
  // IDs of the requests to abort. An empty list and "not set" are the same.
  repeated string request_ids = 1;
}

// Response message for VllmEngine.Abort. Currently has no fields, so it
// carries no per-request outcome.
message AbortResponse {
}

// =====================
// Model and Server Info
// =====================

// Request message for VllmEngine.GetModelInfo. Currently has no fields.
message GetModelInfoRequest {}

// Response message for VllmEngine.GetModelInfo.
message GetModelInfoResponse {
  // Path of the model.
  string model_path = 1;
  // NOTE(review): presumably true for generative models and false for
  // embedding-only models — confirm.
  bool is_generation = 2;
  // Maximum context length.
  // NOTE(review): presumably measured in tokens — confirm.
  uint32 max_context_length = 3;
  // Vocabulary size.
  uint32 vocab_size = 4;
  // Vision-support flag. Implicit presence: an unset field reads as false.
  bool supports_vision = 5;
}

// Request message for VllmEngine.GetServerInfo. Currently has no fields.
message GetServerInfoRequest {}

// Response message for VllmEngine.GetServerInfo.
message GetServerInfoResponse {
  // Count of active requests.
  uint32 active_requests = 1;
  // Paused flag. Implicit presence: an unset field reads as false.
  bool is_paused = 2;
  // Timestamp of the last receive, as a double.
  // NOTE(review): epoch and unit are not stated here; presumably Unix time in
  // seconds — confirm.
  double last_receive_timestamp = 3;
  // Server uptime, in seconds.
  double uptime_seconds = 4;
  string server_type = 5;  // "vllm-grpc"
}