Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
c8adbe6f
Unverified
Commit
c8adbe6f
authored
Oct 22, 2025
by
atchernych
Committed by
GitHub
Oct 22, 2025
Browse files
fix: Load deployment card from ModelExpress for EPP (#3793)
Signed-off-by:
Anna Tchernych
<
atchernych@nvidia.com
>
parent
8642c4bd
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
92 additions
and
96 deletions
+92
-96
container/Dockerfile.epp
container/Dockerfile.epp
+14
-9
deploy/inference-gateway/build-epp-dynamo.sh
deploy/inference-gateway/build-epp-dynamo.sh
+2
-2
lib/bindings/c/src/lib.rs
lib/bindings/c/src/lib.rs
+76
-57
lib/llm/src/model_card.rs
lib/llm/src/model_card.rs
+0
-28
No files found.
container/Dockerfile.epp
View file @
c8adbe6f
...
@@ -18,11 +18,11 @@ ENV CXX=g++
...
@@ -18,11 +18,11 @@ ENV CXX=g++
# C/C++ toolchain for cgo, and libstdc++ for link-time
# C/C++ toolchain for cgo, and libstdc++ for link-time
RUN apt-get update && apt-get install -y --no-install-recommends \
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
build-essential \
gcc g++ \
gcc g++ \
libc6-dev \
libc6-dev \
ca-certificates \
ca-certificates \
&& rm -rf /var/lib/apt/lists/*
&& rm -rf /var/lib/apt/lists/*
ARG COMMIT_SHA=unknown
ARG COMMIT_SHA=unknown
ARG BUILD_REF
ARG BUILD_REF
...
@@ -56,13 +56,18 @@ FROM ${BASE_IMAGE} AS runtime
...
@@ -56,13 +56,18 @@ FROM ${BASE_IMAGE} AS runtime
# Minimal runtime deps; include libstdc++ runtime for -lstdc++
# Minimal runtime deps; include libstdc++ runtime for -lstdc++
RUN apt-get update && apt-get install -y --no-install-recommends \
RUN apt-get update && apt-get install -y --no-install-recommends \
ca-certificates \
ca-certificates \
libstdc++6 \
libstdc++6 \
&& rm -rf /var/lib/apt/lists/* \
&& rm -rf /var/lib/apt/lists/* \
&& groupadd -r nonroot && useradd -r -g nonroot nonroot
&& groupadd -r nonroot && useradd -r -g nonroot -m -d /home/nonroot nonroot \
&& mkdir -p /home/nonroot/.cache/huggingface/hub \
&& chown -R nonroot:nonroot /home/nonroot
WORKDIR /
WORKDIR /
COPY --from=builder /epp /epp
COPY --from=builder /epp /epp
# Set HOME so ModelExpress can find the cache directory
ENV HOME=/home/nonroot
USER nonroot:nonroot
USER nonroot:nonroot
ENTRYPOINT ["/epp"]
ENTRYPOINT ["/epp"]
deploy/inference-gateway/build-epp-dynamo.sh
View file @
c8adbe6f
...
@@ -69,8 +69,8 @@ if [[ ! -f "${DYNAMO_LIB_DIR}/libdynamo_llm_capi.a" ]]; then
...
@@ -69,8 +69,8 @@ if [[ ! -f "${DYNAMO_LIB_DIR}/libdynamo_llm_capi.a" ]]; then
exit
1
exit
1
fi
fi
if
[[
!
-f
"
${
GAIE_DIR
}
/Dockerfile.
epp
"
]]
;
then
if
[[
!
-f
"
${
GAIE_DIR
}
/Dockerfile.
dynamo
"
]]
;
then
echo
"Docker.
epp
file copy failed!"
echo
"Docker.
dynamo
file copy failed!"
exit
1
exit
1
fi
fi
...
...
lib/bindings/c/src/lib.rs
View file @
c8adbe6f
...
@@ -10,7 +10,7 @@ use std::sync::atomic::{AtomicU32, Ordering};
...
@@ -10,7 +10,7 @@ use std::sync::atomic::{AtomicU32, Ordering};
use
dynamo_llm
::
kv_router
::{
use
dynamo_llm
::
kv_router
::{
indexer
::
compute_block_hash_for_seq
,
protocols
::
*
,
publisher
::
KvEventPublisher
,
indexer
::
compute_block_hash_for_seq
,
protocols
::
*
,
publisher
::
KvEventPublisher
,
};
};
use
dynamo_runtime
::{
DistributedRuntime
,
Worker
,
storage
::
key_value_store
::
Key
};
use
dynamo_runtime
::{
DistributedRuntime
,
Worker
};
static
WK
:
OnceCell
<
Worker
>
=
OnceCell
::
new
();
static
WK
:
OnceCell
<
Worker
>
=
OnceCell
::
new
();
static
DRT
:
AsyncOnceCell
<
DistributedRuntime
>
=
AsyncOnceCell
::
new
();
static
DRT
:
AsyncOnceCell
<
DistributedRuntime
>
=
AsyncOnceCell
::
new
();
// [FIXME] shouldn't the publisher be instance passing between API calls?
// [FIXME] shouldn't the publisher be instance passing between API calls?
...
@@ -329,18 +329,15 @@ pub extern "C" fn dynamo_kv_event_publish_removed(
...
@@ -329,18 +329,15 @@ pub extern "C" fn dynamo_kv_event_publish_removed(
/* ------------------------------------------------------------------------
/* ------------------------------------------------------------------------
* Worker selection pipeline
* Worker selection pipeline
* ------------------------------------------------------------------------ */
* ------------------------------------------------------------------------ */
use
std
::
{
pin
::
Pin
,
sync
::
Arc
}
;
use
std
::
pin
::
Pin
;
const
GENERATE_ENDPOINT
:
&
str
=
"generate"
;
const
GENERATE_ENDPOINT
:
&
str
=
"generate"
;
use
anyhow
::
Context
;
use
anyhow
::
Context
;
use
dynamo_runtime
::{
use
dynamo_runtime
::{
Runtime
,
distributed
::
DistributedConfig
,
traits
::
DistributedRuntimeProvider
};
Runtime
,
distributed
::
DistributedConfig
,
slug
::
Slug
,
traits
::
DistributedRuntimeProvider
,
};
use
dynamo_llm
::
discovery
::
ModelManager
;
use
dynamo_llm
::
discovery
::
ModelManager
;
use
dynamo_llm
::
entrypoint
::
build_routed_pipeline
;
use
dynamo_llm
::
entrypoint
::
build_routed_pipeline
;
use
dynamo_llm
::
kv_router
::
KvRouter
;
use
dynamo_llm
::
kv_router
::
KvRouterConfig
;
use
dynamo_llm
::
kv_router
::
KvRouterConfig
;
use
dynamo_llm
::
model_card
::
ModelDeploymentCard
;
use
dynamo_llm
::
model_card
::
ModelDeploymentCard
;
use
dynamo_llm
::
protocols
::
openai
::
nvext
::
NvExt
;
use
dynamo_llm
::
protocols
::
openai
::
nvext
::
NvExt
;
...
@@ -351,7 +348,6 @@ use dynamo_llm::types::{
...
@@ -351,7 +348,6 @@ use dynamo_llm::types::{
},
},
};
};
use
dynamo_runtime
::{
use
dynamo_runtime
::{
component
::
Client
,
engine
::
AsyncEngineStream
,
engine
::
AsyncEngineStream
,
pipeline
::{
ManyOut
,
RouterMode
,
ServiceEngine
,
SingleIn
},
pipeline
::{
ManyOut
,
RouterMode
,
ServiceEngine
,
SingleIn
},
};
};
...
@@ -902,41 +898,7 @@ pub async fn query_worker_selection_and_annotate(
...
@@ -902,41 +898,7 @@ pub async fn query_worker_selection_and_annotate(
Ok
((
worker_id
,
tokens
,
original_request
))
Ok
((
worker_id
,
tokens
,
original_request
))
}
}
/// Build a worker selection pipeline
/// Create a worker selection pipeline for OpenAI Chat Completion requests
/// The router handles query_instance_id annotations and returns worker_instance_id and token_data annotations.
pub
async
fn
build_worker_selection_pipeline_chat
(
card
:
&
ModelDeploymentCard
,
client
:
&
Client
,
router_mode
:
RouterMode
,
busy_threshold
:
Option
<
f64
>
,
chooser
:
Option
<
Arc
<
KvRouter
>>
,
)
->
anyhow
::
Result
<
ServiceEngine
<
SingleIn
<
NvCreateChatCompletionRequest
>
,
ManyOut
<
Annotated
<
NvCreateChatCompletionStreamResponse
>>
,
>
,
>
{
let
hf_tokenizer
=
card
.tokenizer_hf
()
.with_context
(||
"Failed to load HF tokenizer"
)
?
;
let
engine
=
build_routed_pipeline
::
<
NvCreateChatCompletionRequest
,
NvCreateChatCompletionStreamResponse
,
>
(
card
,
client
,
router_mode
,
busy_threshold
,
chooser
,
hf_tokenizer
,
None
,
)
.await
?
;
Ok
(
engine
)
}
/// Helper function to create worker selection pipeline for OpenAI Chat Completion requests
///
///
/// This is a concrete implementation that works specifically with NvCreateChatCompletionRequest
/// This is a concrete implementation that works specifically with NvCreateChatCompletionRequest
/// and is designed for use with C bindings. Uses the "generate" endpoint by default.
/// and is designed for use with C bindings. Uses the "generate" endpoint by default.
...
@@ -974,22 +936,32 @@ pub async fn create_worker_selection_pipeline_chat(
...
@@ -974,22 +936,32 @@ pub async fn create_worker_selection_pipeline_chat(
.component
(
component_name
)
?
;
.component
(
component_name
)
?
;
let
client
=
component
.endpoint
(
GENERATE_ENDPOINT
)
.client
()
.await
?
;
let
client
=
component
.endpoint
(
GENERATE_ENDPOINT
)
.client
()
.await
?
;
let
model_slug
=
Slug
::
from_string
(
model_name
);
// Discover the model card by searching all instances with this model name
let
card
=
match
ModelDeploymentCard
::
load_from_store
(
tracing
::
debug!
(
"Looking for model: {}"
,
model_name
);
&
Key
::
from_raw
(
model_slug
.to_string
()),
tracing
::
debug!
(
"Namespace: {}"
,
namespace
);
component
.drt
(),
)
use
dynamo_llm
::
discovery
::
ModelWatcher
;
.await
let
model_manager
=
std
::
sync
::
Arc
::
new
(
ModelManager
::
new
());
{
let
watcher
=
ModelWatcher
::
new
(
Ok
(
Some
(
card
))
=>
card
,
component
.drt
()
.clone
(),
Ok
(
None
)
=>
anyhow
::
bail!
(
"ModelDeploymentCard not found for model: {}"
,
model_name
),
model_manager
.clone
(),
Err
(
err
)
=>
anyhow
::
bail!
(
router_mode
,
"Error fetching ModelDeploymentCard from storage under key {model_slug}. {err}"
kv_router_config
,
),
busy_threshold
,
};
);
let
cards
=
watcher
.cards_for_model
(
model_name
,
Some
(
namespace
),
false
)
.await
.with_context
(||
format!
(
"Failed to discover model: {}"
,
model_name
))
?
;
tracing
::
debug!
(
"Found {} cards for model {}"
,
cards
.len
(),
model_name
);
let
card
=
cards
.into_iter
()
.next
()
.ok_or_else
(||
{
tracing
::
error!
(
"No ModelDeploymentCard found for model: {}"
,
model_name
);
anyhow
::
anyhow!
(
"ModelDeploymentCard not found for model: {}"
,
model_name
)
})
?
;
let
chooser
=
if
router_mode
==
RouterMode
::
KV
{
let
chooser
=
if
router_mode
==
RouterMode
::
KV
{
let
model_manager
=
std
::
sync
::
Arc
::
new
(
ModelManager
::
new
());
Some
(
Some
(
model_manager
model_manager
.kv_chooser_for
(
&
component
,
card
.kv_cache_block_size
,
kv_router_config
)
.kv_chooser_for
(
&
component
,
card
.kv_cache_block_size
,
kv_router_config
)
...
@@ -999,5 +971,52 @@ pub async fn create_worker_selection_pipeline_chat(
...
@@ -999,5 +971,52 @@ pub async fn create_worker_selection_pipeline_chat(
None
None
};
};
build_worker_selection_pipeline_chat
(
&
card
,
&
client
,
router_mode
,
busy_threshold
,
chooser
)
.await
// Download model config files from HuggingFace for EPP
// The backend's card has NATS URLs which aren't accessible from EPP
tracing
::
debug!
(
"Downloading model config files for EPP: {}"
,
card
.display_name
);
let
local_path
=
dynamo_llm
::
hub
::
from_hf
(
&
card
.display_name
,
true
)
.await
.with_context
(||
{
format!
(
"Failed to download model config files for: {}"
,
card
.display_name
)
})
?
;
// Load a fresh card from local files, then copy runtime config from original card
tracing
::
debug!
(
"Loading ModelDeploymentCard from local path..."
);
let
mut
card_with_local_files
=
ModelDeploymentCard
::
load_from_disk
(
&
local_path
,
None
)
.with_context
(||
format!
(
"Failed to load card from disk: {:?}"
,
local_path
))
?
;
// Copy runtime settings from the backend's card
tracing
::
debug!
(
"Copying runtime config from backend card..."
);
card_with_local_files
.runtime_config
=
card
.runtime_config
.clone
();
card_with_local_files
.kv_cache_block_size
=
card
.kv_cache_block_size
;
card_with_local_files
.context_length
=
card
.context_length
;
// Load the tokenizer from the downloaded files
tracing
::
debug!
(
"Loading tokenizer from local files..."
);
let
hf_tokenizer
=
card_with_local_files
.tokenizer_hf
()
.with_context
(||
format!
(
"Failed to load tokenizer for: {}"
,
card
.display_name
))
?
;
let
engine
=
build_routed_pipeline
::
<
NvCreateChatCompletionRequest
,
NvCreateChatCompletionStreamResponse
,
>
(
&
card_with_local_files
,
&
client
,
router_mode
,
busy_threshold
,
chooser
,
hf_tokenizer
,
None
,
)
.await
?
;
Ok
(
engine
)
}
}
lib/llm/src/model_card.rs
View file @
c8adbe6f
...
@@ -21,10 +21,6 @@ use crate::local_model::runtime_config::ModelRuntimeConfig;
...
@@ -21,10 +21,6 @@ use crate::local_model::runtime_config::ModelRuntimeConfig;
use
crate
::
model_type
::{
ModelInput
,
ModelType
};
use
crate
::
model_type
::{
ModelInput
,
ModelType
};
use
anyhow
::{
Context
,
Result
};
use
anyhow
::{
Context
,
Result
};
use
derive_builder
::
Builder
;
use
derive_builder
::
Builder
;
use
dynamo_runtime
::
DistributedRuntime
;
use
dynamo_runtime
::
storage
::
key_value_store
::{
EtcdStore
,
Key
,
KeyValueStore
,
KeyValueStoreManager
,
};
use
dynamo_runtime
::{
slug
::
Slug
,
storage
::
key_value_store
::
Versioned
};
use
dynamo_runtime
::{
slug
::
Slug
,
storage
::
key_value_store
::
Versioned
};
use
serde
::{
Deserialize
,
Serialize
};
use
serde
::{
Deserialize
,
Serialize
};
use
tokenizers
::
Tokenizer
as
HfTokenizer
;
use
tokenizers
::
Tokenizer
as
HfTokenizer
;
...
@@ -373,30 +369,6 @@ impl ModelDeploymentCard {
...
@@ -373,30 +369,6 @@ impl ModelDeploymentCard {
matches!
(
self
.model_input
,
ModelInput
::
Tokens
)
matches!
(
self
.model_input
,
ModelInput
::
Tokens
)
}
}
/// Load a ModelDeploymentCard from storage the DistributedRuntime is configured to use.
/// Card should be fully local and ready to use when the call returns.
pub
async
fn
load_from_store
(
mdc_key
:
&
Key
,
drt
:
&
DistributedRuntime
,
)
->
anyhow
::
Result
<
Option
<
Self
>>
{
let
Some
(
etcd_client
)
=
drt
.etcd_client
()
else
{
// Should be impossible because we only get here on an etcd event
anyhow
::
bail!
(
"Missing etcd_client"
);
};
let
store
:
Box
<
dyn
KeyValueStore
>
=
Box
::
new
(
EtcdStore
::
new
(
etcd_client
));
let
card_store
=
Arc
::
new
(
KeyValueStoreManager
::
new
(
store
));
let
Some
(
mut
card
)
=
card_store
.load
::
<
ModelDeploymentCard
>
(
ROOT_PATH
,
mdc_key
)
.await
?
else
{
return
Ok
(
None
);
};
card
.download_config
()
.await
?
;
Ok
(
Some
(
card
))
}
/// Download the files this card needs to work: config.json, tokenizer.json, etc.
/// Download the files this card needs to work: config.json, tokenizer.json, etc.
pub
async
fn
download_config
(
&
mut
self
)
->
anyhow
::
Result
<
()
>
{
pub
async
fn
download_config
(
&
mut
self
)
->
anyhow
::
Result
<
()
>
{
if
self
.has_local_files
()
{
if
self
.has_local_files
()
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment