Commit 602352ce authored by Neelay Shah's avatar Neelay Shah Committed by GitHub
Browse files

chore: rename dynamo (#44)


Co-authored-by: default avatarBiswa Panda <biswa.panda@gmail.com>
parent ecf53ce2
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
import random
import string
import uvloop
from dynemo.runtime import DistributedRuntime, dynemo_worker
# Soak Test
#
# This was a failure case for the distributed runtime. If the Rust Tokio
# runtime is started with a small number of threads, it will starve the
# the GIL + asyncio event loop can starve timeout the ingress handler.
#
# There may still be some blocking operations in the ingress handler that
# could still eventually be a problem.
@dynemo_worker()
async def worker(runtime: DistributedRuntime):
ns = random_string()
task = asyncio.create_task(server_init(runtime, ns))
await client_init(runtime, ns)
runtime.shutdown()
await task
async def client_init(runtime: DistributedRuntime, ns: str):
"""
Instantiate a `backend` client and call the `generate` endpoint
"""
# get endpoint
endpoint = runtime.namespace(ns).component("backend").endpoint("generate")
# create client
client = await endpoint.client()
# wait for an endpoint to be ready
await client.wait_for_endpoints()
# Issue many concurrent requests to put load on the server,
# the task should issue the request and process the response
tasks = []
for i in range(20000):
tasks.append(asyncio.create_task(do_one(client)))
await asyncio.gather(*tasks)
# ensure all tasks are done and without errors
error_count = 0
for task in tasks:
if task.exception():
error_count += 1
assert error_count == 0, f"expected 0 errors, got {error_count}"
async def do_one(client):
stream = await client.generate("hello world")
async for char in stream:
pass
async def server_init(runtime: DistributedRuntime, ns: str):
"""
Instantiate a `backend` component and serve the `generate` endpoint
A `Component` can serve multiple endpoints
"""
component = runtime.namespace(ns).component("backend")
await component.create_service()
endpoint = component.endpoint("generate")
print("Started server instance")
await endpoint.serve_endpoint(RequestHandler().generate)
class RequestHandler:
"""
Request handler for the generate endpoint
"""
async def generate(self, request):
for char in request:
await asyncio.sleep(0.1)
yield char
def random_string(length=10):
chars = string.ascii_letters + string.digits # a-z, A-Z, 0-9
return "".join(random.choices(chars, k=length))
if __name__ == "__main__":
uvloop.install()
asyncio.run(worker())
...@@ -20,7 +20,7 @@ pytestmark = pytest.mark.pre_merge ...@@ -20,7 +20,7 @@ pytestmark = pytest.mark.pre_merge
def test_bindings_install(): def test_bindings_install():
# Verify python bindings to rust can be imported # Verify python bindings to rust can be imported
import dynemo.runtime as tdr import dynamo.runtime as tdr
# Placeholder to avoid unused import errors or removal by linters # Placeholder to avoid unused import errors or removal by linters
assert tdr assert tdr
...@@ -15,7 +15,10 @@ ...@@ -15,7 +15,10 @@
import asyncio import asyncio
from dynemo._core import DistributedRuntime from dynamo._core import DistributedRuntime
# Todo add support for launching etcd
# pytestmark = pytest.mark.pre_merge
async def test_simple_put_get(): async def test_simple_put_get():
......
...@@ -24,8 +24,8 @@ from typing import List ...@@ -24,8 +24,8 @@ from typing import List
import pytest import pytest
from dynemo.llm import KvIndexer, KvMetricsAggregator, KvMetricsPublisher from dynamo.llm import KvIndexer, KvMetricsAggregator, KvMetricsPublisher
from dynemo.runtime import DistributedRuntime from dynamo.runtime import DistributedRuntime
pytestmark = pytest.mark.pre_merge pytestmark = pytest.mark.pre_merge
...@@ -89,7 +89,7 @@ async def test_event_handler(): ...@@ -89,7 +89,7 @@ async def test_event_handler():
# KV events # KV events
class DynemoResult: class DynamoResult:
OK = 0 OK = 0
ERR = 1 ERR = 1
...@@ -101,13 +101,13 @@ class EventPublisher: ...@@ -101,13 +101,13 @@ class EventPublisher:
# load event publisher library # load event publisher library
self.lib = ctypes.CDLL(os.environ["VLLM_KV_CAPI_PATH"]) self.lib = ctypes.CDLL(os.environ["VLLM_KV_CAPI_PATH"])
self.lib.dynemo_llm_init.argtypes = [c_char_p, c_char_p, c_int64] self.lib.dynamo_llm_init.argtypes = [c_char_p, c_char_p, c_int64]
self.lib.dynemo_llm_init.restype = c_uint32 self.lib.dynamo_llm_init.restype = c_uint32
result = self.lib.dynemo_llm_init( result = self.lib.dynamo_llm_init(
namespace.encode(), component.encode(), worker_id namespace.encode(), component.encode(), worker_id
) )
assert result == DynemoResult.OK assert result == DynamoResult.OK
self.lib.dynemo_kv_event_publish_stored.argtypes = [ self.lib.dynamo_kv_event_publish_stored.argtypes = [
ctypes.c_uint64, # event_id ctypes.c_uint64, # event_id
ctypes.POINTER(ctypes.c_uint32), # token_ids ctypes.POINTER(ctypes.c_uint32), # token_ids
ctypes.POINTER(ctypes.c_size_t), # num_block_tokens ctypes.POINTER(ctypes.c_size_t), # num_block_tokens
...@@ -116,18 +116,18 @@ class EventPublisher: ...@@ -116,18 +116,18 @@ class EventPublisher:
ctypes.POINTER(ctypes.c_uint64), # parent_hash ctypes.POINTER(ctypes.c_uint64), # parent_hash
ctypes.c_uint64, # lora_id ctypes.c_uint64, # lora_id
] ]
self.lib.dynemo_kv_event_publish_stored.restype = ( self.lib.dynamo_kv_event_publish_stored.restype = (
ctypes.c_uint32 ctypes.c_uint32
) # dynemo_llm_result_t ) # dynamo_llm_result_t
self.lib.dynemo_kv_event_publish_removed.argtypes = [ self.lib.dynamo_kv_event_publish_removed.argtypes = [
ctypes.c_uint64, # event_id ctypes.c_uint64, # event_id
ctypes.POINTER(ctypes.c_uint64), # block_ids ctypes.POINTER(ctypes.c_uint64), # block_ids
ctypes.c_size_t, # num_blocks ctypes.c_size_t, # num_blocks
] ]
self.lib.dynemo_kv_event_publish_removed.restype = ( self.lib.dynamo_kv_event_publish_removed.restype = (
ctypes.c_uint32 ctypes.c_uint32
) # dynemo_llm_result_t ) # dynamo_llm_result_t
def store_event(self, tokens, lora_id): def store_event(self, tokens, lora_id):
parent_hash = ( parent_hash = (
...@@ -135,7 +135,7 @@ class EventPublisher: ...@@ -135,7 +135,7 @@ class EventPublisher:
if self.event_id_counter > 0 if self.event_id_counter > 0
else None else None
) )
result = self.lib.dynemo_kv_event_publish_stored( result = self.lib.dynamo_kv_event_publish_stored(
self.event_id_counter, # uint64_t event_id self.event_id_counter, # uint64_t event_id
(ctypes.c_uint32 * len(tokens))(*tokens), # const uint32_t *token_ids (ctypes.c_uint32 * len(tokens))(*tokens), # const uint32_t *token_ids
(ctypes.c_size_t * 1)(len(tokens)), # const uintptr_t *num_block_tokens (ctypes.c_size_t * 1)(len(tokens)), # const uintptr_t *num_block_tokens
...@@ -147,17 +147,17 @@ class EventPublisher: ...@@ -147,17 +147,17 @@ class EventPublisher:
self.block_ids.append(self.event_id_counter) self.block_ids.append(self.event_id_counter)
self.event_id_counter += 1 self.event_id_counter += 1
assert result == DynemoResult.OK assert result == DynamoResult.OK
def remove_event(self): def remove_event(self):
result = self.lib.dynemo_kv_event_publish_removed( result = self.lib.dynamo_kv_event_publish_removed(
self.event_id_counter, # uint64_t event_id self.event_id_counter, # uint64_t event_id
(ctypes.c_uint64 * 1)(self.block_ids[-1]), # const uint64_t *block_ids (ctypes.c_uint64 * 1)(self.block_ids[-1]), # const uint64_t *block_ids
1, # uintptr_t num_blocks 1, # uintptr_t num_blocks
) )
self.event_id_counter += 1 self.event_id_counter += 1
assert result == DynemoResult.OK assert result == DynamoResult.OK
async def test_metrics_aggregator(): async def test_metrics_aggregator():
......
...@@ -1409,7 +1409,7 @@ dependencies = [ ...@@ -1409,7 +1409,7 @@ dependencies = [
] ]
[[package]] [[package]]
name = "dynemo-llm" name = "dynamo-llm"
version = "0.2.1" version = "0.2.1"
dependencies = [ dependencies = [
"anyhow", "anyhow",
...@@ -1425,7 +1425,7 @@ dependencies = [ ...@@ -1425,7 +1425,7 @@ dependencies = [
"chrono", "chrono",
"cmake", "cmake",
"derive_builder", "derive_builder",
"dynemo-runtime", "dynamo-runtime",
"either", "either",
"erased-serde", "erased-serde",
"futures", "futures",
...@@ -1470,7 +1470,7 @@ dependencies = [ ...@@ -1470,7 +1470,7 @@ dependencies = [
] ]
[[package]] [[package]]
name = "dynemo-runtime" name = "dynamo-runtime"
version = "0.2.1" version = "0.2.1"
dependencies = [ dependencies = [
"anyhow", "anyhow",
......
...@@ -22,7 +22,7 @@ homepage = "https://github.com/dynemo-ai/dynemo" ...@@ -22,7 +22,7 @@ homepage = "https://github.com/dynemo-ai/dynemo"
repository = "https://github.com/dynemo-ai/dynemo.git" repository = "https://github.com/dynemo-ai/dynemo.git"
[package] [package]
name = "dynemo-llm" name = "dynamo-llm"
version.workspace = true version.workspace = true
edition.workspace = true edition.workspace = true
authors.workspace = true authors.workspace = true
...@@ -44,7 +44,7 @@ vulkan = ["llama-cpp-2/vulkan"] ...@@ -44,7 +44,7 @@ vulkan = ["llama-cpp-2/vulkan"]
[workspace.dependencies] [workspace.dependencies]
# local or crates.io # local or crates.io
dynemo-runtime = { version = "0.2.0", path = "../runtime" } dynamo-runtime = { version = "0.2.0", path = "../runtime" }
# crates.io # crates.io
anyhow = { version = "1" } anyhow = { version = "1" }
...@@ -67,7 +67,7 @@ strum = { version = "0.27", features = ["derive"] } ...@@ -67,7 +67,7 @@ strum = { version = "0.27", features = ["derive"] }
[dependencies] [dependencies]
# repo # repo
dynemo-runtime = { workspace = true } dynamo-runtime = { workspace = true }
# workspace # workspace
anyhow = { workspace = true } anyhow = { workspace = true }
......
...@@ -34,7 +34,7 @@ use futures::stream::{self, StreamExt}; ...@@ -34,7 +34,7 @@ use futures::stream::{self, StreamExt};
use tracing as log; use tracing as log;
use crate::model_card::model::{ModelDeploymentCard, TokenizerKind}; use crate::model_card::model::{ModelDeploymentCard, TokenizerKind};
use dynemo_runtime::{ use dynamo_runtime::{
pipeline::{ pipeline::{
async_trait, AsyncEngineContextProvider, ManyOut, Operator, ResponseStream, async_trait, AsyncEngineContextProvider, ManyOut, Operator, ResponseStream,
ServerStreamingEngine, SingleIn, ServerStreamingEngine, SingleIn,
......
...@@ -22,11 +22,11 @@ use std::{ ...@@ -22,11 +22,11 @@ use std::{
use anyhow::Context; use anyhow::Context;
use async_stream::stream; use async_stream::stream;
use async_trait::async_trait; use async_trait::async_trait;
use dynemo_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream}; use dynamo_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream};
use dynemo_runtime::pipeline::error as pipeline_error; use dynamo_runtime::pipeline::error as pipeline_error;
use dynemo_runtime::pipeline::{Error, ManyOut, SingleIn}; use dynamo_runtime::pipeline::{Error, ManyOut, SingleIn};
use dynemo_runtime::protocols::annotated::Annotated; use dynamo_runtime::protocols::annotated::Annotated;
use dynemo_runtime::CancellationToken; use dynamo_runtime::CancellationToken;
use llama_cpp_2::{ use llama_cpp_2::{
context::{params::LlamaContextParams, LlamaContext}, context::{params::LlamaContextParams, LlamaContext},
llama_backend::LlamaBackend, llama_backend::LlamaBackend,
......
...@@ -28,10 +28,10 @@ use mistralrs::{ ...@@ -28,10 +28,10 @@ use mistralrs::{
}; };
use tokio::sync::mpsc::channel; use tokio::sync::mpsc::channel;
use dynemo_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream}; use dynamo_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream};
use dynemo_runtime::pipeline::error as pipeline_error; use dynamo_runtime::pipeline::error as pipeline_error;
use dynemo_runtime::pipeline::{Error, ManyOut, SingleIn}; use dynamo_runtime::pipeline::{Error, ManyOut, SingleIn};
use dynemo_runtime::protocols::annotated::Annotated; use dynamo_runtime::protocols::annotated::Annotated;
use crate::protocols::openai::chat_completions::{ use crate::protocols::openai::chat_completions::{
NvCreateChatCompletionRequest, NvCreateChatCompletionStreamResponse, NvCreateChatCompletionRequest, NvCreateChatCompletionStreamResponse,
......
...@@ -16,8 +16,8 @@ ...@@ -16,8 +16,8 @@
use std::ffi::CStr; use std::ffi::CStr;
use std::{path::Path, sync::Arc}; use std::{path::Path, sync::Arc};
use dynemo_runtime::pipeline::error as pipeline_error; use dynamo_runtime::pipeline::error as pipeline_error;
pub use dynemo_runtime::{ pub use dynamo_runtime::{
error, error,
pipeline::{ pipeline::{
async_trait, AsyncEngine, AsyncEngineContextProvider, Data, ManyOut, ResponseStream, async_trait, AsyncEngine, AsyncEngineContextProvider, Data, ManyOut, ResponseStream,
......
...@@ -17,8 +17,8 @@ use std::path::Path; ...@@ -17,8 +17,8 @@ use std::path::Path;
use std::sync::Arc; use std::sync::Arc;
use crate::backend::ExecutionContext; use crate::backend::ExecutionContext;
use dynemo_runtime::pipeline::error as pipeline_error; use dynamo_runtime::pipeline::error as pipeline_error;
use dynemo_runtime::CancellationToken; use dynamo_runtime::CancellationToken;
mod worker; mod worker;
......
...@@ -19,10 +19,10 @@ use async_stream::stream; ...@@ -19,10 +19,10 @@ use async_stream::stream;
use async_trait::async_trait; use async_trait::async_trait;
use crate::protocols::common::llm_backend::{BackendInput, LLMEngineOutput}; use crate::protocols::common::llm_backend::{BackendInput, LLMEngineOutput};
use dynemo_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream}; use dynamo_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream};
use dynemo_runtime::pipeline::{Error, ManyOut, SingleIn}; use dynamo_runtime::pipeline::{Error, ManyOut, SingleIn};
use dynemo_runtime::protocols::annotated::Annotated; use dynamo_runtime::protocols::annotated::Annotated;
use dynemo_runtime::runtime::CancellationToken; use dynamo_runtime::runtime::CancellationToken;
use crate::engines::MultiNodeConfig; use crate::engines::MultiNodeConfig;
......
...@@ -37,8 +37,8 @@ use tokio::sync::mpsc::Sender; ...@@ -37,8 +37,8 @@ use tokio::sync::mpsc::Sender;
use tokio::{io::AsyncBufReadExt, sync::mpsc::error::SendError}; use tokio::{io::AsyncBufReadExt, sync::mpsc::error::SendError};
use tokio::{io::AsyncReadExt as _, task::JoinHandle}; use tokio::{io::AsyncReadExt as _, task::JoinHandle};
use dynemo_runtime::protocols::annotated::Annotated; use dynamo_runtime::protocols::annotated::Annotated;
use dynemo_runtime::runtime::CancellationToken; use dynamo_runtime::runtime::CancellationToken;
use crate::engines::sglang::MultiGPUConfig; use crate::engines::sglang::MultiGPUConfig;
use crate::engines::MultiNodeConfig; use crate::engines::MultiNodeConfig;
......
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
use std::sync::Arc; use std::sync::Arc;
use crate::backend::ExecutionContext; use crate::backend::ExecutionContext;
use dynemo_runtime::pipeline::error as pipeline_error; use dynamo_runtime::pipeline::error as pipeline_error;
pub mod executor; pub mod executor;
......
...@@ -15,9 +15,9 @@ ...@@ -15,9 +15,9 @@
use anyhow::{Error, Result}; use anyhow::{Error, Result};
use async_trait::async_trait; use async_trait::async_trait;
use dynemo_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream}; use dynamo_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream};
use dynemo_runtime::pipeline::{ManyOut, SingleIn}; use dynamo_runtime::pipeline::{ManyOut, SingleIn};
use dynemo_runtime::protocols::annotated::Annotated; use dynamo_runtime::protocols::annotated::Annotated;
use futures::stream; use futures::stream;
use tokio::sync::mpsc; use tokio::sync::mpsc;
use tokio_util::sync::CancellationToken; use tokio_util::sync::CancellationToken;
......
...@@ -19,8 +19,8 @@ use std::pin::Pin; ...@@ -19,8 +19,8 @@ use std::pin::Pin;
use std::sync::Arc; use std::sync::Arc;
use std::task::{Context, Poll}; use std::task::{Context, Poll};
use dynemo_runtime::pipeline::error as pipeline_error; use dynamo_runtime::pipeline::error as pipeline_error;
use dynemo_runtime::CancellationToken; use dynamo_runtime::CancellationToken;
use crate::backend::ExecutionContext; use crate::backend::ExecutionContext;
use crate::engines::MultiNodeConfig; use crate::engines::MultiNodeConfig;
......
...@@ -21,10 +21,10 @@ use async_trait::async_trait; ...@@ -21,10 +21,10 @@ use async_trait::async_trait;
use crate::engines::vllm::worker; use crate::engines::vllm::worker;
use crate::engines::MultiNodeConfig; use crate::engines::MultiNodeConfig;
use crate::protocols::common::llm_backend::{BackendInput, LLMEngineOutput}; use crate::protocols::common::llm_backend::{BackendInput, LLMEngineOutput};
use dynemo_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream}; use dynamo_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream};
use dynemo_runtime::pipeline::{Error, ManyOut, SingleIn}; use dynamo_runtime::pipeline::{Error, ManyOut, SingleIn};
use dynemo_runtime::protocols::annotated::Annotated; use dynamo_runtime::protocols::annotated::Annotated;
use dynemo_runtime::runtime::CancellationToken; use dynamo_runtime::runtime::CancellationToken;
pub struct VllmEngine { pub struct VllmEngine {
cancel_token: CancellationToken, cancel_token: CancellationToken,
......
...@@ -24,7 +24,7 @@ use tokio::select; ...@@ -24,7 +24,7 @@ use tokio::select;
use tokio::time; use tokio::time;
use tracing; use tracing;
use dynemo_runtime::CancellationToken; use dynamo_runtime::CancellationToken;
/// Default is 16 seconds, we make it a bit shorter /// Default is 16 seconds, we make it a bit shorter
const RAY_STOP_TIMEOUT_SECS: u32 = 10; const RAY_STOP_TIMEOUT_SECS: u32 = 10;
......
...@@ -19,8 +19,8 @@ use std::{ ...@@ -19,8 +19,8 @@ use std::{
}; };
use async_zmq::{SinkExt, StreamExt}; use async_zmq::{SinkExt, StreamExt};
use dynemo_runtime::protocols::annotated::Annotated; use dynamo_runtime::protocols::annotated::Annotated;
use dynemo_runtime::CancellationToken; use dynamo_runtime::CancellationToken;
use pyo3::{ use pyo3::{
prelude::*, prelude::*,
types::{IntoPyDict, PyBytes, PyString}, types::{IntoPyDict, PyBytes, PyString},
......
...@@ -18,7 +18,7 @@ use std::sync::Arc; ...@@ -18,7 +18,7 @@ use std::sync::Arc;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use tokio::sync::mpsc::Receiver; use tokio::sync::mpsc::Receiver;
use dynemo_runtime::{ use dynamo_runtime::{
protocols::{self, annotated::Annotated}, protocols::{self, annotated::Annotated},
raise, raise,
transports::etcd::{KeyValue, WatchEvent}, transports::etcd::{KeyValue, WatchEvent},
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment