fix(dynamo-run): For internal comms use a random endpoint instead of hard coded (#1335)

To talk to the vllm/sglang/trtllm engine we previously hardcoded an internal endpoint. The user never sees it, so it doesn't matter which one we use. However, if you try to run _two_ instances of Dynamo on one machine, they will conflict. Use a random UUID as the endpoint's namespace (with the engine name as the component) to resolve that. Part of the solution for: https://github.com/ai-dynamo/dynamo/issues/1073

fix(dynamo-run): For internal comms use a random endpoint instead of hard coded (#1335)
To talk to the vllm/sglang/trtllm engine we previously hardcoded an internal endpoint. The user never sees it, so it doesn't matter which one we use. However, if you try to run _two_ instances of Dynamo on one machine, they will conflict. Use a random UUID as the endpoint's namespace (with the engine name as the component) to resolve that. Part of the solution for: https://github.com/ai-dynamo/dynamo/issues/1073
43991e76 · Graham King · GitHub · aba3ab03 · 43991e76 · 43991e76
Unverified Commit 43991e76 authored Jun 03, 2025 by Graham King Committed by GitHub Jun 03, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 32 additions and 11 deletions

Cargo.lock Cargo.lock +1 -0

launch/dynamo-run/Cargo.toml launch/dynamo-run/Cargo.toml +1 -0

launch/dynamo-run/src/lib.rs launch/dynamo-run/src/lib.rs +30 -11

No files found.
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1682,6 +1682,7 @@ dependencies = [
 "tokio-util",
 "tracing",
 "tracing-subscriber",
+ "uuid 1.16.0",
 ]

 [[package]]

--- a/launch/dynamo-run/Cargo.toml
+++ b/launch/dynamo-run/Cargo.toml
@@ -43,6 +43,7 @@ tokio = { workspace = true }
 tokio-util = { workspace = true }
 tracing = { workspace = true }
 tracing-subscriber = { workspace = true }
+uuid = { workspace = true }

 async-openai = { version = "0.27.2" }
 clap = { version = "4.5", features = ["derive", "env"] }

--- a/launch/dynamo-run/src/lib.rs
+++ b/launch/dynamo-run/src/lib.rs
@@ -6,6 +6,8 @@ use std::{io::Read, sync::Arc, time::Duration};

 use anyhow::Context;
 use dynamo_llm::{backend::ExecutionContext, engines::StreamingEngine, local_model::LocalModel};
+use dynamo_runtime::protocols::Endpoint as EndpointId;
+use dynamo_runtime::slug::Slug;
 use dynamo_runtime::{CancellationToken, DistributedRuntime};

 mod flags;
@@ -18,9 +20,6 @@ mod subprocess;

 const CHILD_STOP_TIMEOUT: Duration = Duration::from_secs(2);

-/// Where we will attach the vllm/sglang subprocess. Invisible to users.
-pub const INTERNAL_ENDPOINT: &str = "dyn://dynamo.internal.worker";
-
 /// Default size of a KV cache block. Override with --kv-cache-block-size
 const DEFAULT_KV_CACHE_BLOCK_SIZE: usize = 16;

@@ -171,7 +170,7 @@ pub async fn run(
            // If not, then the endpoint isn't exposed so we invent an internal one.
            let endpoint = match &in_opt {
                Input::Endpoint(path) => path.parse()?,
-                _ => INTERNAL_ENDPOINT.parse()?,
+                _ => internal_endpoint("sglang"),
            };

            let multi_node_conf = dynamo_llm::engines::MultiNodeConfig {
@@ -214,7 +213,7 @@ pub async fn run(
            // If not, then the endpoint isn't exposed so we invent an internal one.
            let endpoint = match &in_opt {
                Input::Endpoint(path) => path.parse()?,
-                _ => INTERNAL_ENDPOINT.parse()?,
+                _ => internal_endpoint("vllm"),
            };

            let (py_script, child) = match subprocess::start(
@@ -248,7 +247,7 @@ pub async fn run(
            // If not, then the endpoint isn't exposed so we invent an internal one.
            let endpoint = match &in_opt {
                Input::Endpoint(path) => path.parse()?,
-                _ => INTERNAL_ENDPOINT.parse()?,
+                _ => internal_endpoint("trtllm"),
            };

            let (py_script, child) = match subprocess::start(
@@ -403,19 +402,39 @@ fn print_cuda(_output: &Output) {}

 fn gguf_default() -> Output {
    #[cfg(feature = "llamacpp")]
-    return Output::LlamaCpp;
+    {
+        Output::LlamaCpp
+    }

    #[cfg(all(feature = "mistralrs", not(feature = "llamacpp")))]
-    return Output::MistralRs;
+    {
+        Output::MistralRs
+    }

    #[cfg(not(any(feature = "mistralrs", feature = "llamacpp")))]
-    return Output::EchoFull;
+    {
+        Output::EchoFull
+    }
 }

 fn safetensors_default() -> Output {
    #[cfg(feature = "mistralrs")]
-    return Output::MistralRs;
+    {
+        Output::MistralRs
+    }

    #[cfg(not(feature = "mistralrs"))]
-    return Output::EchoFull;
+    {
+        Output::EchoFull
+    }
+}
+
+/// A random endpoint to use for internal communication
+/// We can't hard code because we may be running several on the same machine (GPUs 0-3 and 4-7)
+fn internal_endpoint(engine: &str) -> EndpointId {
+    EndpointId {
+        namespace: Slug::slugify(&uuid::Uuid::new_v4().to_string()).to_string(),
+        component: engine.to_string(),
+        name: "generate".to_string(),
+    }
 }