Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
373e76c1
Unverified
Commit
373e76c1
authored
Feb 02, 2026
by
Biswa Panda
Committed by
GitHub
Feb 03, 2026
Browse files
feat(lora): Add lora_name tracking to scheduling and sequence management (#5875)
parent
18d9d1fa
Changes
11
Show whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
127 additions
and
3 deletions
+127
-3
lib/bindings/c/src/lib.rs
lib/bindings/c/src/lib.rs
+1
-0
lib/bindings/python/rust/llm/kv.rs
lib/bindings/python/rust/llm/kv.rs
+1
-0
lib/kv-router/src/protocols.rs
lib/kv-router/src/protocols.rs
+2
-0
lib/llm/src/kv_router.rs
lib/llm/src/kv_router.rs
+12
-1
lib/llm/src/kv_router/prefill_router.rs
lib/llm/src/kv_router/prefill_router.rs
+3
-1
lib/llm/src/kv_router/scheduler.rs
lib/llm/src/kv_router/scheduler.rs
+14
-0
lib/llm/src/kv_router/sequence.rs
lib/llm/src/kv_router/sequence.rs
+65
-0
lib/llm/src/local_model.rs
lib/llm/src/local_model.rs
+1
-0
lib/llm/src/model_card.rs
lib/llm/src/model_card.rs
+6
-0
lib/llm/src/preprocessor.rs
lib/llm/src/preprocessor.rs
+17
-1
lib/llm/src/protocols/common/preprocessor.rs
lib/llm/src/protocols/common/preprocessor.rs
+5
-0
No files found.
lib/bindings/c/src/lib.rs
View file @
373e76c1
...
...
@@ -950,6 +950,7 @@ pub unsafe extern "C" fn dynamo_router_add_request(
overlap_blocks
,
None
,
worker
,
None
,
// lora_name not exposed in C API yet
)
.await
;
...
...
lib/bindings/python/rust/llm/kv.rs
View file @
373e76c1
...
...
@@ -1249,6 +1249,7 @@ impl KvPushRouter {
&
token_ids
,
router_config_override
.as_ref
(),
update_states
,
None
,
// lora_name not exposed in Python API yet
)
.await
.map_err
(
to_pyerr
)
?
;
...
...
lib/kv-router/src/protocols.rs
View file @
373e76c1
...
...
@@ -222,6 +222,8 @@ pub struct ActiveSequenceEvent {
pub
worker
:
WorkerWithDpRank
,
pub
data
:
ActiveSequenceEventData
,
pub
router_id
:
u64
,
#[serde(default)]
pub
lora_name
:
Option
<
String
>
,
}
#[derive(Serialize,
Deserialize,
Debug,
Clone)]
...
...
lib/llm/src/kv_router.rs
View file @
373e76c1
...
...
@@ -486,12 +486,14 @@ impl KvRouter {
/// Given these tokens, find the worker with the best match in its KV cache.
/// Returns the best worker (with dp_rank) and overlap amount in number of blocks.
/// Now also takes optional context_id for request tracking
#[allow(clippy::too_many_arguments)]
pub
async
fn
find_best_match
(
&
self
,
context_id
:
Option
<&
str
>
,
tokens
:
&
[
u32
],
router_config_override
:
Option
<&
RouterConfigOverride
>
,
update_states
:
bool
,
lora_name
:
Option
<
String
>
,
)
->
anyhow
::
Result
<
(
WorkerWithDpRank
,
u32
)
>
{
// Validate that context_id is provided when update_states is true
if
update_states
&&
context_id
.is_none
()
{
...
...
@@ -517,6 +519,7 @@ impl KvRouter {
overlap_scores
.clone
(),
router_config_override
,
update_states
,
lora_name
,
)
.await
?
;
...
...
@@ -531,6 +534,7 @@ impl KvRouter {
Ok
((
best_worker
,
overlap_amount
))
}
#[allow(clippy::too_many_arguments)]
pub
async
fn
add_request
(
&
self
,
request_id
:
String
,
...
...
@@ -538,6 +542,7 @@ impl KvRouter {
overlap_blocks
:
u32
,
expected_output_tokens
:
Option
<
u32
>
,
worker
:
WorkerWithDpRank
,
lora_name
:
Option
<
String
>
,
)
{
let
isl_tokens
=
tokens
.len
();
...
...
@@ -554,6 +559,7 @@ impl KvRouter {
overlap_blocks
,
expected_output_tokens
,
worker
,
lora_name
,
)
.await
{
...
...
@@ -687,7 +693,7 @@ impl AsyncEngine<SingleIn<RouterRequest>, ManyOut<Annotated<RouterResponse>>, Er
let
response
=
match
request
{
RouterRequest
::
New
{
tokens
}
=>
{
let
(
best_worker
,
overlap_blocks
)
=
self
.find_best_match
(
Some
(
&
context_id
),
&
tokens
,
None
,
true
)
.find_best_match
(
Some
(
&
context_id
),
&
tokens
,
None
,
true
,
None
)
.await
?
;
RouterResponse
::
New
{
...
...
@@ -744,6 +750,9 @@ impl KvPushRouter {
)
->
Result
<
WorkerSelection
,
Error
>
{
let
routing
=
request
.routing
.as_ref
();
// Extract LORA name from routing hints
let
lora_name
=
routing
.and_then
(|
r
|
r
.lora_name
.clone
());
// Get pre-selected worker based on phase, with backend_instance_id as fallback
let
Some
(
id
)
=
(
match
phase
{
RequestPhase
::
Prefill
=>
{
...
...
@@ -763,6 +772,7 @@ impl KvPushRouter {
&
request
.token_ids
,
request
.router_config_override
.as_ref
(),
!
is_query_only
,
lora_name
,
)
.await
?
;
...
...
@@ -804,6 +814,7 @@ impl KvPushRouter {
overlap_blocks
,
expected_output_tokens
,
worker
,
lora_name
,
)
.await
;
}
else
{
...
...
lib/llm/src/kv_router/prefill_router.rs
View file @
373e76c1
...
...
@@ -266,10 +266,12 @@ impl PrefillRouter {
InnerPrefillRouter
::
KvRouter
(
r
)
=>
r
,
_
=>
return
None
,
};
// Extract LORA name from routing hints
let
lora_name
=
req
.routing
.as_ref
()
.and_then
(|
r
|
r
.lora_name
.clone
());
match
async
{
kv_router
.chooser
.find_best_match
(
None
,
&
req
.token_ids
,
None
,
false
)
.find_best_match
(
None
,
&
req
.token_ids
,
None
,
false
,
lora_name
)
.await
}
.instrument
(
tracing
::
info_span!
(
"kv_find_best_match"
))
...
...
lib/llm/src/kv_router/scheduler.rs
View file @
373e76c1
...
...
@@ -68,6 +68,8 @@ pub struct SchedulingRequest {
pub
router_config_override
:
Option
<
RouterConfigOverride
>
,
// Whether to update scheduler states (false for query_instance_id requests)
pub
update_states
:
bool
,
// LORA adapter name extracted from request.model field
pub
lora_name
:
Option
<
String
>
,
// Option to take it out to send the response without moving the struct
resp_tx
:
Option
<
tokio
::
sync
::
oneshot
::
Sender
<
SchedulingResponse
>>
,
}
...
...
@@ -248,6 +250,7 @@ impl KvScheduler {
selection
.overlap_blocks
,
None
,
// expected_output_tokens not available in scheduler loop
selection
.worker
,
request
.lora_name
.clone
(),
)
.await
{
...
...
@@ -272,6 +275,7 @@ impl KvScheduler {
Ok
(
KvScheduler
{
request_tx
,
slots
})
}
#[allow(clippy::too_many_arguments)]
pub
async
fn
schedule
(
&
self
,
maybe_request_id
:
Option
<
String
>
,
...
...
@@ -280,6 +284,7 @@ impl KvScheduler {
overlaps
:
OverlapScores
,
router_config_override
:
Option
<&
RouterConfigOverride
>
,
update_states
:
bool
,
lora_name
:
Option
<
String
>
,
)
->
Result
<
WorkerWithDpRank
,
KvSchedulerError
>
{
let
(
resp_tx
,
resp_rx
)
=
tokio
::
sync
::
oneshot
::
channel
();
let
request
=
SchedulingRequest
{
...
...
@@ -291,6 +296,7 @@ impl KvScheduler {
prefill_tokens
:
HashMap
::
new
(),
router_config_override
:
router_config_override
.cloned
(),
update_states
,
lora_name
,
resp_tx
:
Some
(
resp_tx
),
// Wrap in Some()
};
...
...
@@ -305,6 +311,7 @@ impl KvScheduler {
Ok
(
response
.best_worker
)
}
#[allow(clippy::too_many_arguments)]
pub
async
fn
add_request
(
&
self
,
request_id
:
String
,
...
...
@@ -313,6 +320,7 @@ impl KvScheduler {
overlap
:
u32
,
expected_output_tokens
:
Option
<
u32
>
,
worker
:
WorkerWithDpRank
,
lora_name
:
Option
<
String
>
,
)
->
Result
<
(),
SequenceError
>
{
self
.slots
.add_request
(
...
...
@@ -322,6 +330,7 @@ impl KvScheduler {
overlap
,
expected_output_tokens
,
worker
,
lora_name
,
)
.await
}
...
...
@@ -378,6 +387,11 @@ impl KvScheduler {
loads
}
/// Get active request counts grouped by LORA name.
///
/// Thin delegate: forwards to `self.slots.get_active_lora_counts()` and
/// returns its `HashMap<String, usize>` unchanged.
///
/// NOTE(review): keys are presumably LoRA adapter names and values the number
/// of currently tracked requests per adapter — confirm against the `slots`
/// implementation.
pub
fn
get_active_lora_counts
(
&
self
)
->
HashMap
<
String
,
usize
>
{
self
.slots
.get_active_lora_counts
()
}
}
// Helper function for softmax sampling
...
...
lib/llm/src/kv_router/sequence.rs
View file @
373e76c1
...
...
@@ -405,6 +405,7 @@ enum UpdateSequences {
pub
struct
ActiveSequencesMultiWorker
{
senders
:
Arc
<
DashMap
<
WorkerWithDpRank
,
tokio
::
sync
::
mpsc
::
UnboundedSender
<
UpdateSequences
>>>
,
request_to_worker
:
Arc
<
DashMap
<
RequestId
,
WorkerWithDpRank
>>
,
request_to_lora
:
Arc
<
DashMap
<
RequestId
,
String
>>
,
handles
:
Arc
<
DashMap
<
WorkerWithDpRank
,
std
::
thread
::
JoinHandle
<
()
>>>
,
block_size
:
usize
,
component
:
Component
,
...
...
@@ -429,6 +430,7 @@ impl ActiveSequencesMultiWorker {
let
senders
=
Arc
::
new
(
DashMap
::
new
());
let
handles
=
Arc
::
new
(
DashMap
::
new
());
let
request_to_worker
=
Arc
::
new
(
DashMap
::
new
());
let
request_to_lora
=
Arc
::
new
(
DashMap
::
new
());
// Expand workers by their dp_rank
for
(
worker_id
,
config
)
in
workers_with_configs
{
...
...
@@ -452,6 +454,7 @@ impl ActiveSequencesMultiWorker {
let
multi_worker
=
Self
{
senders
:
senders
.clone
(),
request_to_worker
:
request_to_worker
.clone
(),
request_to_lora
:
request_to_lora
.clone
(),
handles
,
block_size
,
component
:
component
.clone
(),
...
...
@@ -465,6 +468,7 @@ impl ActiveSequencesMultiWorker {
if
replica_sync
{
let
senders_clone
=
senders
.clone
();
let
request_to_worker_clone
=
request_to_worker
.clone
();
let
request_to_lora_clone
=
request_to_lora
.clone
();
let
component_clone
=
component
.clone
();
let
router_id_clone
=
router_id
;
let
cancel_token
=
component
.drt
()
.runtime
()
.child_token
();
...
...
@@ -474,6 +478,7 @@ impl ActiveSequencesMultiWorker {
if
let
Err
(
e
)
=
Self
::
subscribe_to_events
(
senders_clone
,
request_to_worker_clone
,
request_to_lora_clone
,
component_clone
,
router_id_clone
,
cancel_token
,
...
...
@@ -603,6 +608,7 @@ impl ActiveSequencesMultiWorker {
DashMap
<
WorkerWithDpRank
,
tokio
::
sync
::
mpsc
::
UnboundedSender
<
UpdateSequences
>>
,
>
,
request_to_worker
:
Arc
<
DashMap
<
RequestId
,
WorkerWithDpRank
>>
,
request_to_lora
:
Arc
<
DashMap
<
RequestId
,
String
>>
,
component
:
Component
,
router_id
:
u64
,
cancel_token
:
CancellationToken
,
...
...
@@ -642,6 +648,11 @@ impl ActiveSequencesMultiWorker {
}
=>
{
request_to_worker
.insert
(
event
.request_id
.clone
(),
event
.worker
);
// Store lora_name mapping if present
if
let
Some
(
ref
lora_name
)
=
event
.lora_name
{
request_to_lora
.insert
(
event
.request_id
.clone
(),
lora_name
.clone
());
}
if
let
Some
(
sender
)
=
senders
.get
(
&
event
.worker
)
{
// For replicated events, we create a dummy response channel since we don't need to handle expired requests
let
(
resp_tx
,
_
)
=
tokio
::
sync
::
oneshot
::
channel
();
...
...
@@ -668,6 +679,8 @@ impl ActiveSequencesMultiWorker {
request_id
:
event
.request_id
.clone
(),
});
}
// Clean up lora_name mapping
request_to_lora
.remove
(
&
event
.request_id
);
}
ActiveSequenceEventData
::
MarkPrefillCompleted
=>
{
if
let
Some
(
worker
)
=
request_to_worker
.get
(
&
event
.request_id
)
...
...
@@ -724,9 +737,22 @@ impl ActiveSequencesMultiWorker {
}
self
.handles
.remove
(
worker
);
// Collect request_ids to remove from request_to_lora
let
requests_to_remove
:
Vec
<
RequestId
>
=
self
.request_to_worker
.iter
()
.filter
(|
entry
|
entry
.value
()
==
worker
)
.map
(|
entry
|
entry
.key
()
.clone
())
.collect
();
// Clean up request_to_worker mappings for this worker
self
.request_to_worker
.retain
(|
_
request_id
,
mapped_worker
|
mapped_worker
!=
worker
);
// Clean up request_to_lora mappings for removed requests
for
request_id
in
requests_to_remove
{
self
.request_to_lora
.remove
(
&
request_id
);
}
}
// Add new workers
...
...
@@ -742,6 +768,7 @@ impl ActiveSequencesMultiWorker {
}
}
#[allow(clippy::too_many_arguments)]
pub
async
fn
add_request
(
&
self
,
request_id
:
RequestId
,
...
...
@@ -750,6 +777,7 @@ impl ActiveSequencesMultiWorker {
overlap
:
u32
,
expected_output_tokens
:
Option
<
u32
>
,
worker
:
WorkerWithDpRank
,
lora_name
:
Option
<
String
>
,
)
->
Result
<
(),
SequenceError
>
{
// Check for worker existence
if
!
self
.senders
.contains_key
(
&
worker
)
{
...
...
@@ -779,6 +807,7 @@ impl ActiveSequencesMultiWorker {
expected_output_tokens
,
},
router_id
:
self
.router_id
,
lora_name
:
lora_name
.clone
(),
};
self
.event_publisher
.publish
(
&
event
)
.await
?
;
}
...
...
@@ -786,6 +815,11 @@ impl ActiveSequencesMultiWorker {
// Update local state with full WorkerWithDpRank
self
.request_to_worker
.insert
(
request_id
.clone
(),
worker
);
// Store lora_name for later use in Free/MarkPrefillCompleted events
if
let
Some
(
lora
)
=
lora_name
{
self
.request_to_lora
.insert
(
request_id
.clone
(),
lora
);
}
self
.senders
.get
(
&
worker
)
.unwrap
()
...
...
@@ -807,6 +841,7 @@ impl ActiveSequencesMultiWorker {
// Remove expired requests from request_to_worker mapping
for
expired_id
in
&
removed_requests
{
self
.request_to_worker
.remove
(
expired_id
);
self
.request_to_lora
.remove
(
expired_id
);
}
// Publish ActiveLoad metrics for this worker
...
...
@@ -833,11 +868,18 @@ impl ActiveSequencesMultiWorker {
// Publish event only if replica_sync is enabled
if
self
.replica_sync
{
// Look up lora_name from mapping
let
lora_name
=
self
.request_to_lora
.get
(
request_id
)
.map
(|
entry
|
entry
.value
()
.clone
());
let
event
=
ActiveSequenceEvent
{
request_id
:
request_id
.clone
(),
worker
,
data
:
ActiveSequenceEventData
::
Free
,
router_id
:
self
.router_id
,
lora_name
,
};
self
.event_publisher
.publish
(
&
event
)
.await
?
;
}
...
...
@@ -852,6 +894,7 @@ impl ActiveSequencesMultiWorker {
.map_err
(|
_
|
SequenceError
::
WorkerChannelClosed
)
?
;
self
.request_to_worker
.remove
(
request_id
);
self
.request_to_lora
.remove
(
request_id
);
// Publish ActiveLoad metrics for this worker
self
.publish_active_load_for_worker
(
worker
)
.await
;
...
...
@@ -882,11 +925,18 @@ impl ActiveSequencesMultiWorker {
// Publish event only if replica_sync is enabled
if
self
.replica_sync
{
// Look up lora_name from mapping
let
lora_name
=
self
.request_to_lora
.get
(
request_id
)
.map
(|
entry
|
entry
.value
()
.clone
());
let
event
=
ActiveSequenceEvent
{
request_id
:
request_id
.clone
(),
worker
,
data
:
ActiveSequenceEventData
::
MarkPrefillCompleted
,
router_id
:
self
.router_id
,
lora_name
,
};
self
.event_publisher
.publish
(
&
event
)
.await
?
;
}
...
...
@@ -1156,6 +1206,15 @@ impl ActiveSequencesMultiWorker {
self
.query_workers
(
None
,
|
_
,
resp_tx
|
UpdateSequences
::
ActiveTokens
{
resp_tx
})
.await
}
/// Tallies the entries of `request_to_lora` by LoRA adapter name.
///
/// Returns a map from adapter name to the number of request ids currently
/// mapped to that name. Only requests present in `request_to_lora` are
/// counted; an empty mapping yields an empty result.
pub fn get_active_lora_counts(&self) -> HashMap<String, usize> {
    self.request_to_lora
        .iter()
        .fold(HashMap::new(), |mut tally, mapping| {
            // Each entry contributes exactly one to its adapter's bucket.
            *tally.entry(mapping.value().clone()).or_default() += 1;
            tally
        })
}
}
impl
Drop
for
ActiveSequencesMultiWorker
{
...
...
@@ -1264,6 +1323,7 @@ mod tests {
0
,
// no overlap
None
,
// expected_output_tokens
WorkerWithDpRank
::
new
(
0
,
0
),
None
,
// lora_name
)
.await
?
;
...
...
@@ -1276,6 +1336,7 @@ mod tests {
0
,
// no overlap
None
,
// expected_output_tokens
WorkerWithDpRank
::
new
(
0
,
1
),
None
,
// lora_name
)
.await
?
;
...
...
@@ -1288,6 +1349,7 @@ mod tests {
0
,
// no overlap
None
,
// expected_output_tokens
WorkerWithDpRank
::
new
(
1
,
0
),
None
,
// lora_name
)
.await
?
;
...
...
@@ -1423,6 +1485,7 @@ mod tests {
0
,
// no overlap
None
,
// expected_output_tokens
WorkerWithDpRank
::
from_worker_id
(
0
),
None
,
// lora_name
)
.await
?
;
...
...
@@ -1435,6 +1498,7 @@ mod tests {
0
,
// no overlap
None
,
// expected_output_tokens
WorkerWithDpRank
::
from_worker_id
(
1
),
None
,
// lora_name
)
.await
?
;
...
...
@@ -1447,6 +1511,7 @@ mod tests {
0
,
// no overlap
None
,
// expected_output_tokens
WorkerWithDpRank
::
from_worker_id
(
2
),
None
,
// lora_name
)
.await
?
;
...
...
lib/llm/src/local_model.rs
View file @
373e76c1
...
...
@@ -480,6 +480,7 @@ impl LocalModel {
)
->
anyhow
::
Result
<
()
>
{
self
.card.model_type
=
model_type
;
self
.card.model_input
=
model_input
;
self
.card.lora_name
=
lora_name
.map
(|
name
|
name
.to_string
());
// Compute model_suffix from lora_name if present
let
model_suffix
=
lora_name
.map
(|
name
|
Slug
::
slugify
(
name
)
.to_string
());
...
...
lib/llm/src/model_card.rs
View file @
373e76c1
...
...
@@ -230,6 +230,11 @@ pub struct ModelDeploymentCard {
/// `Text` for engines that take care of pre-processing themselves.
pub
model_input
:
ModelInput
,
/// Optional LoRA adapter name for this model card.
/// Present when this card represents a LoRA adapter registered on top of a base model.
#[serde(default,
skip_serializing_if
=
"Option::is_none"
)]
pub
lora_name
:
Option
<
String
>
,
/// User-defined metadata for custom worker behavior
#[serde(default,
skip_serializing_if
=
"Option::is_none"
)]
pub
user_data
:
Option
<
serde_json
::
Value
>
,
...
...
@@ -651,6 +656,7 @@ impl ModelDeploymentCard {
migration_limit
:
0
,
model_type
:
Default
::
default
(),
// set later
model_input
:
Default
::
default
(),
// set later
lora_name
:
None
,
user_data
:
None
,
runtime_config
:
ModelRuntimeConfig
::
default
(),
media_decoder
:
None
,
...
...
lib/llm/src/preprocessor.rs
View file @
373e76c1
...
...
@@ -113,6 +113,7 @@ pub struct OpenAIPreprocessor {
formatter
:
Arc
<
dyn
OAIPromptFormatter
>
,
tokenizer
:
Arc
<
dyn
Tokenizer
>
,
model_info
:
Arc
<
dyn
ModelInfo
>
,
lora_name
:
Option
<
String
>
,
/// Per-model runtime configuration propagated to response generator (e.g., reasoning/tool parser)
runtime_config
:
crate
::
local_model
::
runtime_config
::
ModelRuntimeConfig
,
tool_call_parser
:
Option
<
String
>
,
...
...
@@ -136,7 +137,8 @@ impl OpenAIPreprocessor {
)
->
Result
<
Arc
<
Self
>>
{
let
mdcsum
=
mdc
.mdcsum
()
.to_string
();
let
tokenizer
=
Arc
::
new
(
HuggingFaceTokenizer
::
from_tokenizer
(
hf_tokenizer
));
let
Some
(
model_info
)
=
mdc
.model_info
else
{
let
lora_name
=
mdc
.lora_name
.clone
();
let
Some
(
ref
model_info
)
=
mdc
.model_info
else
{
anyhow
::
bail!
(
"Blank ModelDeploymentCard cannot be used for pre-processing, no model_info"
);
...
...
@@ -144,6 +146,10 @@ impl OpenAIPreprocessor {
let
model_info
=
model_info
.get_model_info
()
?
;
let
tool_call_parser
=
mdc
.runtime_config.tool_call_parser
.clone
();
if
let
Some
(
ref
lora_name
)
=
lora_name
{
tracing
::
info!
(
model
=
%
mdc
.display_name
,
lora_name
,
"LoRA adapter detected in MDC"
);
}
// Initialize runtime config from the ModelDeploymentCard
let
runtime_config
=
mdc
.runtime_config
.clone
();
...
...
@@ -158,6 +164,7 @@ impl OpenAIPreprocessor {
tokenizer
,
model_info
,
mdcsum
,
lora_name
,
runtime_config
,
tool_call_parser
,
#[cfg(feature
=
"media-nixl"
)]
...
...
@@ -237,6 +244,8 @@ impl OpenAIPreprocessor {
builder
.output_options
(
request
.extract_output_options
()
?
);
builder
.annotations
(
request
.annotations
()
.unwrap_or_default
());
builder
.mdc_sum
(
Some
(
self
.mdcsum
.clone
()));
let
lora_name
=
self
.lora_name
.clone
();
// Extract routing hints from nvext if present
if
let
Some
(
nvext
)
=
request
.nvext
()
{
// Build routing hints from nvext fields
...
...
@@ -247,8 +256,15 @@ impl OpenAIPreprocessor {
dp_rank
:
None
,
// dp_rank is set later in the pipeline
enable_local_updates
:
nvext
.enable_local_updates
,
expected_output_tokens
:
nvext
.expected_output_tokens
,
lora_name
,
};
builder
.routing
(
Some
(
routing
));
}
else
if
lora_name
.is_some
()
{
// Ensure LoRA-aware routing still gets hints even when nvext is absent.
builder
.routing
(
Some
(
RoutingHints
{
lora_name
,
..
Default
::
default
()
}));
}
Ok
(
builder
)
...
...
lib/llm/src/protocols/common/preprocessor.rs
View file @
373e76c1
...
...
@@ -47,6 +47,11 @@ pub struct RoutingHints {
/// Used as a hint for routing decisions to estimate resource requirements.
#[serde(default,
skip_serializing_if
=
"Option::is_none"
)]
pub
expected_output_tokens
:
Option
<
u32
>
,
/// LORA adapter name for this request.
/// Used for LORA-aware routing and tracking.
#[serde(default,
skip_serializing_if
=
"Option::is_none"
)]
pub
lora_name
:
Option
<
String
>
,
}
#[derive(Serialize,
Deserialize,
Debug,
Clone,
Default)]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment