Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
bce74588
Unverified
Commit
bce74588
authored
Aug 22, 2025
by
Graham King
Committed by
GitHub
Aug 22, 2025
Browse files
chore: Rust to 1.89 and edition 2024 (#2659)
parent
268d017e
Changes
199
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
99 additions
and
100 deletions
+99
-100
lib/llm/src/block_manager/pool/managed.rs
lib/llm/src/block_manager/pool/managed.rs
+1
-1
lib/llm/src/block_manager/pool/managed/active.rs
lib/llm/src/block_manager/pool/managed/active.rs
+11
-11
lib/llm/src/block_manager/pool/managed/inactive.rs
lib/llm/src/block_manager/pool/managed/inactive.rs
+6
-4
lib/llm/src/block_manager/pool/managed/state.rs
lib/llm/src/block_manager/pool/managed/state.rs
+7
-9
lib/llm/src/block_manager/state.rs
lib/llm/src/block_manager/state.rs
+3
-3
lib/llm/src/block_manager/storage.rs
lib/llm/src/block_manager/storage.rs
+5
-2
lib/llm/src/block_manager/storage/arena.rs
lib/llm/src/block_manager/storage/arena.rs
+1
-1
lib/llm/src/block_manager/storage/cuda.rs
lib/llm/src/block_manager/storage/cuda.rs
+1
-1
lib/llm/src/block_manager/storage/disk.rs
lib/llm/src/block_manager/storage/disk.rs
+1
-1
lib/llm/src/block_manager/storage/nixl.rs
lib/llm/src/block_manager/storage/nixl.rs
+3
-3
lib/llm/src/cuda.rs
lib/llm/src/cuda.rs
+1
-1
lib/llm/src/disagg_router.rs
lib/llm/src/disagg_router.rs
+18
-18
lib/llm/src/discovery/model_manager.rs
lib/llm/src/discovery/model_manager.rs
+1
-1
lib/llm/src/discovery/watcher.rs
lib/llm/src/discovery/watcher.rs
+11
-14
lib/llm/src/engines.rs
lib/llm/src/engines.rs
+1
-1
lib/llm/src/entrypoint/input/batch.rs
lib/llm/src/entrypoint/input/batch.rs
+3
-3
lib/llm/src/entrypoint/input/common.rs
lib/llm/src/entrypoint/input/common.rs
+13
-13
lib/llm/src/entrypoint/input/endpoint.rs
lib/llm/src/entrypoint/input/endpoint.rs
+7
-8
lib/llm/src/entrypoint/input/http.rs
lib/llm/src/entrypoint/input/http.rs
+3
-3
lib/llm/src/entrypoint/input/text.rs
lib/llm/src/entrypoint/input/text.rs
+2
-2
No files found.
lib/llm/src/block_manager/pool/managed.rs
View file @
bce74588
...
...
@@ -589,7 +589,7 @@ impl<S: Storage, L: LocalityProvider + 'static, M: BlockMetadata> ProgressEngine
#[cfg(test)]
mod
tests
{
use
crate
::
block_manager
::
block
::{
BasicMetadata
,
Blocks
};
use
crate
::
block_manager
::
layout
::{
tests
::
setup_layout
,
FullyContiguous
,
LayoutConfig
};
use
crate
::
block_manager
::
layout
::{
FullyContiguous
,
LayoutConfig
,
tests
::
setup_layout
};
use
crate
::
block_manager
::
locality
::
Local
;
use
crate
::
tokens
::{
TokenBlockSequence
,
Tokens
};
...
...
lib/llm/src/block_manager/pool/managed/active.rs
View file @
bce74588
...
...
@@ -51,10 +51,10 @@ impl<S: Storage, L: LocalityProvider, M: BlockMetadata> ActiveBlockPool<S, L, M>
// Set the parent of the block if it has one.
// This is needed to ensure the lifetime of the parent is at least as long as the child.
if
let
Ok
(
Some
(
parent
))
=
block
.parent_sequence_hash
()
{
if
let
Some
(
parent_block
)
=
self
.match_sequence_hash
(
parent
)
{
block
.set_parent
(
parent_block
.mutable_block
()
.clone
());
}
if
let
Ok
(
Some
(
parent
))
=
block
.parent_sequence_hash
()
&&
let
Some
(
parent_block
)
=
self
.match_sequence_hash
(
parent
)
{
block
.set_parent
(
parent_block
.mutable_block
()
.clone
());
}
let
shared
=
Arc
::
new
(
block
);
...
...
@@ -78,14 +78,14 @@ impl<S: Storage, L: LocalityProvider, M: BlockMetadata> ActiveBlockPool<S, L, M>
}
pub
fn
remove
(
&
mut
self
,
block
:
&
mut
Block
<
S
,
L
,
M
>
)
{
if
let
Ok
(
sequence_hash
)
=
block
.sequence_hash
()
{
if
let
Some
(
weak
)
=
self
.map
.get
(
&
sequence_hash
)
{
if
let
Some
(
_
arc
)
=
weak
.upgrade
()
{
block
.reset
();
return
;
}
self
.map
.remove
(
&
sequence_hash
);
if
let
Ok
(
sequence_hash
)
=
block
.sequence_hash
()
&&
let
Some
(
weak
)
=
self
.map
.get
(
&
sequence_hash
)
{
if
let
Some
(
_
arc
)
=
weak
.upgrade
()
{
block
.reset
();
return
;
}
self
.map
.remove
(
&
sequence_hash
);
}
}
...
...
lib/llm/src/block_manager/pool/managed/inactive.rs
View file @
bce74588
...
...
@@ -15,7 +15,7 @@
use
std
::
sync
::
atomic
::
AtomicU64
;
use
crate
::
block_manager
::
block
::{
locality
::
LocalityProvider
,
BlockState
};
use
crate
::
block_manager
::
block
::{
BlockState
,
locality
::
LocalityProvider
};
use
super
::
*
;
use
priority_key
::
PriorityKey
;
...
...
@@ -113,7 +113,9 @@ impl<S: Storage, L: LocalityProvider, M: BlockMetadata> InactiveBlockPool<S, L,
fn
insert_with_sequence_hash
(
&
mut
self
,
block
:
Block
<
S
,
L
,
M
>
,
sequence_hash
:
SequenceHash
)
{
let
priority_key
=
PriorityKey
::
new
(
block
.metadata
()
.clone
(),
sequence_hash
);
if
self
.priority_set
.contains
(
&
priority_key
)
{
tracing
::
trace!
(
"multiple entries with the same sequence hash, resetting block and inserting into uninitialized set"
);
tracing
::
trace!
(
"multiple entries with the same sequence hash, resetting block and inserting into uninitialized set"
);
let
mut
block
=
block
;
block
.reset
();
self
.uninitialized_set
.push_back
(
block
);
...
...
@@ -546,8 +548,8 @@ pub(crate) mod tests {
use
crate
::{
block_manager
::{
block
::{
locality
::
Local
,
registry
::
BlockRegistry
,
state
::
CompleteState
,
Blocks
,
PrivateBlockExt
,
Blocks
,
PrivateBlockExt
,
locality
::
Local
,
registry
::
BlockRegistry
,
state
::
CompleteState
,
},
events
::
NullEventManager
,
layout
::{
BlockLayout
,
FullyContiguous
,
LayoutConfigBuilder
},
...
...
lib/llm/src/block_manager/pool/managed/state.rs
View file @
bce74588
...
...
@@ -14,7 +14,7 @@
// limitations under the License.
use
crate
::
block_manager
::{
block
::{
registry
::
BlockRegistrationError
,
BlockState
,
PrivateBlockExt
},
block
::{
BlockState
,
PrivateBlockExt
,
registry
::
BlockRegistrationError
},
events
::
Publisher
,
};
...
...
@@ -266,18 +266,16 @@ impl<S: Storage, L: LocalityProvider + 'static, M: BlockMetadata> State<S, L, M>
}
}
BlockRegistrationDuplicationSetting
::
Disabled
=>
{
if
let
Some
(
block
)
=
duplicate
{
if
let
Some
(
raw_blocks
)
=
block
.try_take_block
(
private
::
PrivateToken
)
{
self
.inactive
.return_blocks
(
raw_blocks
);
}
if
let
Some
(
block
)
=
duplicate
&&
let
Some
(
raw_blocks
)
=
block
.try_take_block
(
private
::
PrivateToken
)
{
self
.inactive
.return_blocks
(
raw_blocks
);
}
}
}
if
offload
{
if
let
Some
(
priority
)
=
immutable
.metadata
()
.offload_priority
()
{
immutable
.enqueue_offload
(
priority
)
.await
.unwrap
();
}
if
offload
&&
let
Some
(
priority
)
=
immutable
.metadata
()
.offload_priority
()
{
immutable
.enqueue_offload
(
priority
)
.await
.unwrap
();
}
immutable_blocks
.push
(
immutable
);
...
...
lib/llm/src/block_manager/state.rs
View file @
bce74588
...
...
@@ -17,7 +17,7 @@ mod local;
mod
logical
;
mod
resources
;
use
crate
::
block_manager
::
block
::{
factory
::
IntoBlocks
,
MutableBlock
};
use
crate
::
block_manager
::
block
::{
MutableBlock
,
factory
::
IntoBlocks
};
use
crate
::
block_manager
::
locality
::
LogicalResources
;
use
crate
::
block_manager
::
offload
::
request
::
BlockResult
;
...
...
@@ -26,8 +26,8 @@ use super::*;
// use super::offload::OffloadManager;
use
super
::{
block
::{
factory
::
LocalBlockDataFactory
,
locality
::
LocalityProvider
,
Block
,
GlobalRegistry
,
ImmutableBlock
,
Block
,
GlobalRegistry
,
ImmutableBlock
,
factory
::
LocalBlockDataFactory
,
locality
::
LocalityProvider
,
},
config
::
NixlOptions
,
events
::{
EventManager
,
NullEventManager
},
...
...
lib/llm/src/block_manager/storage.rs
View file @
bce74588
...
...
@@ -88,7 +88,7 @@ pub use disk::*;
use
torch
::
*
;
use
std
::{
alloc
::{
alloc_zeroed
,
dealloc
,
Layout
},
alloc
::{
Layout
,
alloc_zeroed
,
dealloc
},
collections
::
HashMap
,
fmt
::
Debug
,
ptr
::
NonNull
,
...
...
@@ -322,7 +322,10 @@ impl std::fmt::Debug for RegistrationHandles {
impl
Drop
for
RegistrationHandles
{
fn
drop
(
&
mut
self
)
{
if
!
self
.handles
.is_empty
()
{
panic!
(
"RegistrationHandles dropped with {} handles remaining; RegistrationHandles::release() needs to be explicitly called"
,
self
.handles
.len
());
panic!
(
"RegistrationHandles dropped with {} handles remaining; RegistrationHandles::release() needs to be explicitly called"
,
self
.handles
.len
()
);
}
}
}
...
...
lib/llm/src/block_manager/storage/arena.rs
View file @
bce74588
...
...
@@ -207,7 +207,7 @@ mod nixl {
S
:
MemoryRegion
,
{
unsafe
fn
as_ptr
(
&
self
)
->
*
const
u8
{
Storage
::
as_ptr
(
self
.storage
.as_ref
())
unsafe
{
Storage
::
as_ptr
(
self
.storage
.as_ref
())
}
}
fn
size
(
&
self
)
->
usize
{
...
...
lib/llm/src/block_manager/storage/cuda.rs
View file @
bce74588
...
...
@@ -86,7 +86,7 @@ use std::{
sync
::{
Arc
,
Mutex
,
OnceLock
},
};
use
cudarc
::
driver
::{
sys
,
CudaContext
};
use
cudarc
::
driver
::{
CudaContext
,
sys
};
/// Trait for [Storage] types that can be accessed by CUDA
pub
trait
CudaAccessible
:
Storage
{}
...
...
lib/llm/src/block_manager/storage/disk.rs
View file @
bce74588
...
...
@@ -16,7 +16,7 @@
use
super
::
*
;
use
core
::
ffi
::
c_char
;
use
nix
::
fcntl
::{
fallocate
,
FallocateFlags
};
use
nix
::
fcntl
::{
FallocateFlags
,
fallocate
};
use
nix
::
unistd
::
unlink
;
use
std
::
ffi
::
CStr
;
use
std
::
ffi
::
CString
;
...
...
lib/llm/src/block_manager/storage/nixl.rs
View file @
bce74588
...
...
@@ -342,7 +342,7 @@ impl NixlRegisterableStorage for PinnedStorage {}
impl
MemoryRegion
for
PinnedStorage
{
unsafe
fn
as_ptr
(
&
self
)
->
*
const
u8
{
Storage
::
as_ptr
(
self
)
unsafe
{
Storage
::
as_ptr
(
self
)
}
}
fn
size
(
&
self
)
->
usize
{
...
...
@@ -367,7 +367,7 @@ impl NixlRegisterableStorage for DeviceStorage {}
impl
MemoryRegion
for
DeviceStorage
{
unsafe
fn
as_ptr
(
&
self
)
->
*
const
u8
{
Storage
::
as_ptr
(
self
)
unsafe
{
Storage
::
as_ptr
(
self
)
}
}
fn
size
(
&
self
)
->
usize
{
...
...
@@ -406,7 +406,7 @@ impl NixlRegisterableStorage for DiskStorage {
impl
MemoryRegion
for
DiskStorage
{
unsafe
fn
as_ptr
(
&
self
)
->
*
const
u8
{
Storage
::
as_ptr
(
self
)
unsafe
{
Storage
::
as_ptr
(
self
)
}
}
fn
size
(
&
self
)
->
usize
{
...
...
lib/llm/src/cuda.rs
View file @
bce74588
...
...
@@ -17,8 +17,8 @@
//! them within Dynamo.
use
cudarc
::
driver
::{
sys
::{
cuCtxPopCurrent_v2
,
cuCtxPushCurrent_v2
,
cudaError_enum
,
CUcontext
,
CUstream
},
CudaContext
,
CudaStream
,
sys
::{
CUcontext
,
CUstream
,
cuCtxPopCurrent_v2
,
cuCtxPushCurrent_v2
,
cudaError_enum
},
};
use
std
::
pin
::
Pin
;
use
std
::{
marker
::
PhantomData
,
sync
::
Arc
};
...
...
lib/llm/src/disagg_router.rs
View file @
bce74588
...
...
@@ -18,8 +18,8 @@ use std::sync::{Arc, Mutex};
use
tokio
::
sync
::
watch
;
use
tracing
;
use
dynamo_runtime
::
transports
::
etcd
::
WatchEvent
;
use
dynamo_runtime
::
DistributedRuntime
;
use
dynamo_runtime
::
transports
::
etcd
::
WatchEvent
;
#[derive(Clone,
Debug,
Serialize,
Deserialize)]
pub
struct
DisaggRouterConf
{
...
...
@@ -218,23 +218,23 @@ impl DisaggregatedRouter {
}
pub
fn
check_for_updates
(
&
self
)
{
if
let
Some
(
watcher
)
=
&
self
.config_watcher
{
if
watcher
.has_changed
()
.unwrap_or
(
false
)
{
let
config
=
watcher
.borrow
()
.clone
();
let
new_value
=
config
.max_local_prefill_length
;
// Update the value using the mutex
let
mut
current_value
=
self
.max_local_prefill_length
.lock
()
.unwrap
();
let
old_value
=
*
current_value
;
if
old_value
!=
new_value
{
*
current_value
=
new_value
;
tracing
::
info!
(
"Applied config update for model {}: max_local_prefill_length changed from {} to {}"
,
self
.model_name
,
old_value
,
new_value
);
}
if
let
Some
(
watcher
)
=
&
self
.config_watcher
&&
watcher
.has_changed
()
.unwrap_or
(
false
)
{
let
config
=
watcher
.borrow
()
.clone
();
let
new_value
=
config
.max_local_prefill_length
;
// Update the value using the mutex
let
mut
current_value
=
self
.max_local_prefill_length
.lock
()
.unwrap
();
let
old_value
=
*
current_value
;
if
old_value
!=
new_value
{
*
current_value
=
new_value
;
tracing
::
info!
(
"Applied config update for model {}: max_local_prefill_length changed from {} to {}"
,
self
.model_name
,
old_value
,
new_value
);
}
}
}
...
...
lib/llm/src/discovery/model_manager.rs
View file @
bce74588
...
...
@@ -7,7 +7,7 @@ use dynamo_runtime::slug::Slug;
use
crate
::
discovery
::
ModelEntry
;
use
crate
::
kv_router
::{
scheduler
::
DefaultWorkerSelector
,
KvRouterConfig
};
use
crate
::
kv_router
::{
KvRouterConfig
,
scheduler
::
DefaultWorkerSelector
};
use
crate
::{
kv_router
::
KvRouter
,
types
::
openai
::{
...
...
lib/llm/src/discovery/watcher.rs
View file @
bce74588
...
...
@@ -5,16 +5,16 @@ use std::sync::Arc;
use
tokio
::
sync
::
mpsc
::
Sender
;
use
anyhow
::
Context
as
_
;
use
tokio
::
sync
::{
mpsc
::
Receiver
,
Notify
};
use
tokio
::
sync
::{
Notify
,
mpsc
::
Receiver
};
use
dynamo_runtime
::{
DistributedRuntime
,
pipeline
::{
network
::
egress
::
push_router
::
PushRouter
,
ManyOut
,
Operator
,
RouterMode
,
SegmentSource
,
ServiceBackend
,
SingleIn
,
Source
,
ManyOut
,
Operator
,
RouterMode
,
SegmentSource
,
ServiceBackend
,
SingleIn
,
Source
,
network
::
egress
::
push_router
::
PushRouter
,
},
protocols
::
annotated
::
Annotated
,
transports
::
etcd
::{
KeyValue
,
WatchEvent
},
DistributedRuntime
,
};
use
crate
::{
...
...
@@ -35,7 +35,7 @@ use crate::{
},
};
use
super
::{
ModelEntry
,
ModelManager
,
MODEL_ROOT_PATH
};
use
super
::{
MODEL_ROOT_PATH
,
ModelEntry
,
ModelManager
};
#[derive(Debug,
Clone,
Copy,
PartialEq)]
pub
enum
ModelUpdate
{
...
...
@@ -213,10 +213,8 @@ impl ModelWatcher {
);
update_tx
=
false
;
}
if
update_tx
{
if
let
Some
(
tx
)
=
&
self
.model_update_tx
{
tx
.send
(
ModelUpdate
::
Removed
(
model_type
))
.await
.ok
();
}
if
update_tx
&&
let
Some
(
tx
)
=
&
self
.model_update_tx
{
tx
.send
(
ModelUpdate
::
Removed
(
model_type
))
.await
.ok
();
}
return
Ok
(
None
);
}
...
...
@@ -251,13 +249,12 @@ impl ModelWatcher {
);
}
else
{
for
model_type
in
ALL_MODEL_TYPES
{
if
(
chat_model_removed
&&
*
model_type
==
ModelType
::
Chat
)
if
(
(
chat_model_removed
&&
*
model_type
==
ModelType
::
Chat
)
||
(
completions_model_removed
&&
*
model_type
==
ModelType
::
Completion
)
||
(
embeddings_model_removed
&&
*
model_type
==
ModelType
::
Embedding
)
||
(
embeddings_model_removed
&&
*
model_type
==
ModelType
::
Embedding
))
&&
let
Some
(
tx
)
=
&
self
.model_update_tx
{
if
let
Some
(
tx
)
=
&
self
.model_update_tx
{
tx
.send
(
ModelUpdate
::
Removed
(
*
model_type
))
.await
.ok
();
}
tx
.send
(
ModelUpdate
::
Removed
(
*
model_type
))
.await
.ok
();
}
}
}
...
...
lib/llm/src/engines.rs
View file @
bce74588
...
...
@@ -18,7 +18,7 @@ use crate::preprocessor::PreprocessedRequest;
use
crate
::
protocols
::
common
::
llm_backend
::
LLMEngineOutput
;
use
crate
::
protocols
::
openai
::{
chat_completions
::{
NvCreateChatCompletionRequest
,
NvCreateChatCompletionStreamResponse
},
completions
::{
prompt_to_string
,
NvCreateCompletionRequest
,
NvCreateCompletionResponse
},
completions
::{
NvCreateCompletionRequest
,
NvCreateCompletionResponse
,
prompt_to_string
},
};
use
crate
::
types
::
openai
::
embeddings
::
NvCreateEmbeddingRequest
;
use
crate
::
types
::
openai
::
embeddings
::
NvCreateEmbeddingResponse
;
...
...
lib/llm/src/entrypoint/input/batch.rs
View file @
bce74588
...
...
@@ -8,18 +8,18 @@ use crate::types::openai::chat_completions::{
};
use
anyhow
::
Context
as
_
;
use
dynamo_async_openai
::
types
::
FinishReason
;
use
dynamo_runtime
::{
pipeline
::
Context
,
runtime
::
CancellationToken
,
Runtime
};
use
dynamo_runtime
::{
Runtime
,
pipeline
::
Context
,
runtime
::
CancellationToken
};
use
futures
::
StreamExt
;
use
serde
::{
Deserialize
,
Serialize
};
use
std
::
cmp
;
use
std
::
path
::{
Path
,
PathBuf
};
use
std
::
sync
::
atomic
::{
AtomicU64
,
Ordering
};
use
std
::
sync
::
Arc
;
use
std
::
sync
::
atomic
::{
AtomicU64
,
Ordering
};
use
std
::
time
::{
Duration
,
Instant
};
use
tokio
::
io
::{
AsyncBufReadExt
,
AsyncWriteExt
};
use
crate
::
entrypoint
::
input
::
common
;
use
crate
::
entrypoint
::
EngineConfig
;
use
crate
::
entrypoint
::
input
::
common
;
/// Max tokens in each response.
/// TODO: For batch mode this should be the full context size of the model
...
...
lib/llm/src/entrypoint/input/common.rs
View file @
bce74588
...
...
@@ -5,7 +5,7 @@ use std::pin::Pin;
use
crate
::{
backend
::{
Backend
,
ExecutionContext
},
discovery
::{
ModelManager
,
ModelWatcher
,
MODEL_ROOT_PATH
},
discovery
::{
MODEL_ROOT_PATH
,
ModelManager
,
ModelWatcher
},
engines
::
StreamingEngineAdapter
,
entrypoint
::{
self
,
EngineConfig
},
kv_router
::{
KvPushRouter
,
KvRouter
},
...
...
@@ -15,15 +15,16 @@ use crate::{
protocols
::
common
::
llm_backend
::{
BackendOutput
,
LLMEngineOutput
,
PreprocessedRequest
},
request_template
::
RequestTemplate
,
types
::{
Annotated
,
openai
::
chat_completions
::{
NvCreateChatCompletionRequest
,
NvCreateChatCompletionStreamResponse
,
OpenAIChatCompletionsStreamingEngine
,
},
Annotated
,
},
};
use
dynamo_runtime
::{
DistributedRuntime
,
Runtime
,
component
::
Client
,
distributed
::
DistributedConfig
,
engine
::{
AsyncEngineStream
,
Data
},
...
...
@@ -31,7 +32,6 @@ use dynamo_runtime::{
Context
,
ManyOut
,
Operator
,
PushRouter
,
RouterMode
,
SegmentSource
,
ServiceBackend
,
ServiceEngine
,
ServiceFrontend
,
SingleIn
,
Source
,
},
DistributedRuntime
,
Runtime
,
};
use
std
::
sync
::
Arc
;
...
...
@@ -191,11 +191,11 @@ where
Req
:
Data
,
Resp
:
Data
,
OpenAIPreprocessor
:
Operator
<
Context
<
Req
>
,
Pin
<
Box
<
dyn
AsyncEngineStream
<
Annotated
<
Resp
>>>>
,
Context
<
PreprocessedRequest
>
,
Pin
<
Box
<
dyn
AsyncEngineStream
<
Annotated
<
BackendOutput
>>>>
,
>
,
Context
<
Req
>
,
Pin
<
Box
<
dyn
AsyncEngineStream
<
Annotated
<
Resp
>>>>
,
Context
<
PreprocessedRequest
>
,
Pin
<
Box
<
dyn
AsyncEngineStream
<
Annotated
<
BackendOutput
>>>>
,
>
,
{
let
frontend
=
ServiceFrontend
::
<
SingleIn
<
Req
>
,
ManyOut
<
Annotated
<
Resp
>>>
::
new
();
let
preprocessor
=
OpenAIPreprocessor
::
new
((
*
card
)
.clone
())
...
...
@@ -224,11 +224,11 @@ where
Req
:
Data
,
Resp
:
Data
,
OpenAIPreprocessor
:
Operator
<
Context
<
Req
>
,
Pin
<
Box
<
dyn
AsyncEngineStream
<
Annotated
<
Resp
>>>>
,
Context
<
PreprocessedRequest
>
,
Pin
<
Box
<
dyn
AsyncEngineStream
<
Annotated
<
BackendOutput
>>>>
,
>
,
Context
<
Req
>
,
Pin
<
Box
<
dyn
AsyncEngineStream
<
Annotated
<
Resp
>>>>
,
Context
<
PreprocessedRequest
>
,
Pin
<
Box
<
dyn
AsyncEngineStream
<
Annotated
<
BackendOutput
>>>>
,
>
,
{
let
frontend
=
SegmentSource
::
<
SingleIn
<
Req
>
,
ManyOut
<
Annotated
<
Resp
>>>
::
new
();
let
preprocessor
=
OpenAIPreprocessor
::
new
(
card
.clone
())
.await
?
.into_operator
();
...
...
lib/llm/src/entrypoint/input/endpoint.rs
View file @
bce74588
...
...
@@ -9,18 +9,18 @@ use crate::{
model_type
::
ModelType
,
preprocessor
::{
BackendOutput
,
PreprocessedRequest
},
types
::{
Annotated
,
openai
::
chat_completions
::{
NvCreateChatCompletionRequest
,
NvCreateChatCompletionStreamResponse
,
},
Annotated
,
},
};
use
dynamo_runtime
::
engine
::
AsyncEngineStream
;
use
dynamo_runtime
::
pipeline
::{
network
::
Ingress
,
Context
,
ManyOut
,
Operator
,
SegmentSource
,
ServiceBackend
,
SingleIn
,
Source
,
Context
,
ManyOut
,
Operator
,
SegmentSource
,
ServiceBackend
,
SingleIn
,
Source
,
network
::
Ingress
,
};
use
dynamo_runtime
::{
protocols
::
EndpointId
,
DistributedRuntime
};
use
dynamo_runtime
::{
DistributedRuntime
,
protocols
::
EndpointId
};
use
crate
::
entrypoint
::
EngineConfig
;
...
...
@@ -125,13 +125,12 @@ pub async fn run(
result
?
;
// Cleanup on shutdown
if
let
Some
(
mut
card
)
=
card
{
if
let
Err
(
err
)
=
card
if
let
Some
(
mut
card
)
=
card
&&
let
Err
(
err
)
=
card
.delete_from_nats
(
distributed_runtime
.nats_client
())
.await
{
tracing
::
error!
(
%
err
,
"delete_from_nats error on shutdown"
);
}
{
tracing
::
error!
(
%
err
,
"delete_from_nats error on shutdown"
);
}
Ok
(())
...
...
lib/llm/src/entrypoint/input/http.rs
View file @
bce74588
...
...
@@ -4,10 +4,10 @@
use
std
::
sync
::
Arc
;
use
crate
::{
discovery
::{
ModelManager
,
ModelUpdate
,
ModelWatcher
,
MODEL_ROOT_PATH
},
discovery
::{
MODEL_ROOT_PATH
,
ModelManager
,
ModelUpdate
,
ModelWatcher
},
endpoint_type
::
EndpointType
,
engines
::
StreamingEngineAdapter
,
entrypoint
::{
self
,
input
::
common
,
EngineConfig
},
entrypoint
::{
self
,
EngineConfig
,
input
::
common
},
http
::
service
::
service_v2
::{
self
,
HttpService
},
kv_router
::
KvRouterConfig
,
model_type
::
ModelType
,
...
...
@@ -17,8 +17,8 @@ use crate::{
},
};
use
dynamo_runtime
::
transports
::
etcd
;
use
dynamo_runtime
::{
distributed
::
DistributedConfig
,
pipeline
::
RouterMode
};
use
dynamo_runtime
::{
DistributedRuntime
,
Runtime
};
use
dynamo_runtime
::{
distributed
::
DistributedConfig
,
pipeline
::
RouterMode
};
/// Build and run an HTTP service
pub
async
fn
run
(
runtime
:
Runtime
,
engine_config
:
EngineConfig
)
->
anyhow
::
Result
<
()
>
{
...
...
lib/llm/src/entrypoint/input/text.rs
View file @
bce74588
...
...
@@ -6,12 +6,12 @@ use crate::request_template::RequestTemplate;
use
crate
::
types
::
openai
::
chat_completions
::{
NvCreateChatCompletionRequest
,
OpenAIChatCompletionsStreamingEngine
,
};
use
dynamo_runtime
::{
pipeline
::
Context
,
runtime
::
CancellationToken
,
Runtime
};
use
dynamo_runtime
::{
Runtime
,
pipeline
::
Context
,
runtime
::
CancellationToken
};
use
futures
::
StreamExt
;
use
std
::
io
::{
ErrorKind
,
Write
};
use
crate
::
entrypoint
::
input
::
common
;
use
crate
::
entrypoint
::
EngineConfig
;
use
crate
::
entrypoint
::
input
::
common
;
/// Max response tokens for each single query. Must be less than model context size.
/// TODO: Cmd line flag to overwrite this
...
...
Prev
1
2
3
4
5
6
7
8
9
10
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment