vllm_engine.proto 4.99 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
syntax = "proto3";

package vllm.grpc.engine;

// Service definition for vLLM engine communication.
// This protocol is designed for efficient binary communication between
// the Rust router and the vLLM Python engine (AsyncLLM).
//
// Generate is the only server-streaming RPC; every other method is unary.
service VllmEngine {
  // Submit a generation request. The reply is a server-side stream of
  // GenerateResponse messages, each carrying either an incremental chunk or
  // the final completion.
  rpc Generate(GenerateRequest) returns (stream GenerateResponse);

  // Submit an embedding request for a pre-tokenized input.
  rpc Embed(EmbedRequest) returns (EmbedResponse);

  // Health check; the response carries a boolean status and a message.
  rpc HealthCheck(HealthCheckRequest) returns (HealthCheckResponse);

  // Abort running requests, identified by the request IDs in AbortRequest.
  rpc Abort(AbortRequest) returns (AbortResponse);

  // Get model information (path, context length, vocabulary size, ...).
  rpc GetModelInfo(GetModelInfoRequest) returns (GetModelInfoResponse);

  // Get server information (active requests, pause state, uptime, ...).
  rpc GetServerInfo(GetServerInfoRequest) returns (GetServerInfoResponse);
}

// =====================
// Common Types
// =====================

// Sampling parameters for text generation.
//
// Presence: only fields marked `optional` let the receiver distinguish
// "not set" from the zero value. For every other scalar here, 0 / 0.0 / false
// is what the receiver sees when the sender leaves the field unset.
message SamplingParams {
  // Explicit presence: unset is distinguishable from 0.0.
  optional float temperature = 1;
  float top_p = 2;
  uint32 top_k = 3;
  float min_p = 4;
  // Penalty terms. NOTE(review): valid ranges and "disabled" values are
  // defined by the engine, not by this schema - confirm before relying on 0.
  float frequency_penalty = 5;
  float presence_penalty = 6;
  float repetition_penalty = 7;

  // Token-count bounds (presumably on generated tokens - confirm).
  // max_tokens has explicit presence; min_tokens does not.
  optional uint32 max_tokens = 8;
  uint32 min_tokens = 9;

  // Stop conditions, given as strings and as token IDs.
  repeated string stop = 10;
  repeated uint32 stop_token_ids = 11;

  // Boolean switches. NOTE(review): an unset bool reads as false; verify that
  // senders set these explicitly wherever the engine-side default differs.
  bool skip_special_tokens = 12;
  bool spaces_between_special_tokens = 13;
  bool ignore_eos = 14;

  uint32 n = 15;  // Number of parallel samples

  // Logprobs configuration (explicit presence; -1 requests all)
  optional int32 logprobs = 22;  // Number of log probabilities per output token (-1 for all)
  optional int32 prompt_logprobs = 23;  // Number of log probabilities per prompt token (-1 for all)

  // Additional vLLM fields
  optional int32 seed = 24;  // Random seed for reproducibility
  bool include_stop_str_in_output = 25;  // Whether to include stop strings in output
  map<int32, float> logit_bias = 26;  // Token ID to bias mapping (-100 to 100)
  optional int32 truncate_prompt_tokens = 27;  // Prompt truncation (-1 for model max)

  // Structured outputs (one of) - matches vLLM's StructuredOutputsParams.
  // At most one member can be set; setting one clears the others.
  // Field numbers 16-21 are used here, so they sit between n (15) and
  // logprobs (22) numerically even though they are declared last.
  oneof constraint {
    string json_schema = 16;  // JSON schema for structured output
    string regex = 17;  // Regex pattern
    string grammar = 18;  // Grammar/EBNF for structured output
    string structural_tag = 19;  // Structural tag (e.g., Harmony models)
    bool json_object = 20;  // Force JSON object output
    ChoiceConstraint choice = 21;  // List of allowed choices
  }
}

// Choice constraint for structured outputs.
// Wraps the list in a message because a repeated field cannot be a direct
// member of a oneof (it is used as SamplingParams.constraint.choice).
message ChoiceConstraint {
  // The allowed choices.
  repeated string choices = 1;
}

// Pre-tokenized input from the Rust router.
// Used by GenerateRequest (as one arm of its input oneof) and by EmbedRequest.
message TokenizedInput {
  string original_text = 1;  // For reference/debugging
  repeated uint32 input_ids = 2;  // Actual token IDs to process
}

// =====================
// Generate Request
// =====================

// Request message for VllmEngine.Generate.
message GenerateRequest {
  // Caller-supplied identifier for this request.
  // NOTE(review): presumably the same ID space as AbortRequest.request_ids -
  // confirm.
  string request_id = 1;

  // Prompt input: either pre-tokenized IDs or raw text, never both.
  oneof input {
    TokenizedInput tokenized = 2;
    string text = 3;
  }

  // Generation parameters (includes logprobs config)
  SamplingParams sampling_params = 4;

  // Streaming flag.
  // NOTE(review): the Generate RPC always returns a stream at the gRPC level;
  // presumably this selects incremental chunks vs. a single final message -
  // confirm against the engine implementation.
  bool stream = 5;
}

// =====================
// Generate Response
// =====================

// One message on the Generate response stream. Exactly one arm of `response`
// is set per message (or none, if the sender left it empty).
message GenerateResponse {
  oneof response {
    GenerateStreamChunk chunk = 1;     // For streaming
    GenerateComplete complete = 2;     // For final/non-streaming
  }
}

// Incremental payload carried in GenerateResponse.chunk.
message GenerateStreamChunk {
  repeated uint32 token_ids = 1;       // Incremental tokens
  // Token counters.
  // NOTE(review): whether these are cumulative or per-chunk is not stated in
  // this schema - confirm with the engine implementation.
  uint32 prompt_tokens = 2;
  uint32 completion_tokens = 3;
  uint32 cached_tokens = 4;

  // Logprobs support (TODO: implement in Phase 4)
  // The commented-out declarations below show the field numbers (5, 6)
  // planned for these fields; they are not part of the schema yet.
  // OutputLogProbs output_logprobs = 5;
  // InputLogProbs input_logprobs = 6;  // Only in first chunk
}

// Final payload carried in GenerateResponse.complete.
message GenerateComplete {
  repeated uint32 output_ids = 1;      // All output tokens
  string finish_reason = 2;            // "stop", "length", "abort"
  // Token counters for the request.
  uint32 prompt_tokens = 3;
  uint32 completion_tokens = 4;
  uint32 cached_tokens = 5;

  // Logprobs support (TODO: implement in Phase 4)
  // The commented-out declarations below show the field numbers (6, 7)
  // planned for these fields; they are not part of the schema yet.
  // OutputLogProbs output_logprobs = 6;
  // InputLogProbs input_logprobs = 7;
}

// =====================
// Embedding Request
// =====================

// Request message for VllmEngine.Embed.
// Unlike GenerateRequest, only pre-tokenized input is accepted; there is no
// raw-text alternative.
message EmbedRequest {
  // Caller-supplied identifier for this request.
  string request_id = 1;
  // Token IDs to embed (plus the optional original text).
  TokenizedInput tokenized = 2;
}

// Response message for VllmEngine.Embed.
message EmbedResponse {
  // The embedding vector, as a flat list of floats.
  repeated float embedding = 1;
  // Number of prompt tokens.
  uint32 prompt_tokens = 2;
  // NOTE(review): presumably equals the length of `embedding` - confirm.
  uint32 embedding_dim = 3;
}

// =====================
// Management Operations
// =====================

// Request message for VllmEngine.HealthCheck. Currently has no fields.
message HealthCheckRequest {}

// Response message for VllmEngine.HealthCheck.
message HealthCheckResponse {
  // Health status flag. Implicit presence: an unset value reads as false.
  bool healthy = 1;
  // Free-form text accompanying the status.
  string message = 2;
}

// Request message for VllmEngine.Abort.
message AbortRequest {
  // IDs of the requests to abort; several can be given in one call.
  repeated string request_ids = 1;
}

// Response message for VllmEngine.Abort. Currently has no fields, so it
// carries no per-request outcome.
message AbortResponse {
}

// =====================
// Model and Server Info
// =====================

// Request message for VllmEngine.GetModelInfo. Currently has no fields.
message GetModelInfoRequest {}

// Response message for VllmEngine.GetModelInfo.
message GetModelInfoResponse {
  // Path of the model.
  string model_path = 1;
  // NOTE(review): presumably true for generative models and false for
  // embedding-only models - confirm.
  bool is_generation = 2;
  // Maximum context length. NOTE(review): presumably in tokens - confirm.
  uint32 max_context_length = 3;
  // Vocabulary size.
  uint32 vocab_size = 4;
  // Whether the model supports vision input.
  bool supports_vision = 5;
}

// Request message for VllmEngine.GetServerInfo. Currently has no fields.
message GetServerInfoRequest {}

// Response message for VllmEngine.GetServerInfo.
message GetServerInfoResponse {
  // Number of active requests.
  uint32 active_requests = 1;
  // Whether the server is paused.
  bool is_paused = 2;
  // NOTE(review): epoch and units are not specified in this schema
  // (presumably Unix seconds as a double) - confirm with the engine.
  double last_receive_timestamp = 3;
  // Server uptime, in seconds.
  double uptime_seconds = 4;
  string server_type = 5;  // "vllm-grpc"
}