Commit 678cffb4 authored by Neelay Shah, committed by GitHub
Browse files
parent 6ba39b09
...@@ -21,7 +21,7 @@ Run this example using command below ...@@ -21,7 +21,7 @@ Run this example using command below
cd /workspace/examples/python_rs/llm/vllm cd /workspace/examples/python_rs/llm/vllm
dynemo-sdk serve sdk_kv_router.frontend:Frontend dynamo-sdk serve sdk_kv_router.frontend:Frontend
``` ```
...@@ -35,4 +35,4 @@ curl -X 'POST' \ ...@@ -35,4 +35,4 @@ curl -X 'POST' \
-d '{ -d '{
"msg": "In the heart of Eldoria, an ancient land of boundless magic and mysterious creatures, lies the long-forgotten city of Aeloria. Once a beacon of knowledge and power, Aeloria was buried beneath the shifting sands of time, lost to the world for centuries. You are an intrepid explorer, known for your unparalleled curiosity and courage, who has stumbled upon an ancient map hinting at ests that Aeloria holds a secret so profound that it has the potential to reshape the very fabric of reality. Your journey will take you through treacherous deserts, enchanted forests, and across perilous mountain ranges. Your Task: Character Background: Develop a detailed background for your character. Describe their motivations for seeking out Aeloria, their skills and weaknesses, and any personal connections to the ancient city or its legends. Are they driven by a quest for knowledge, a search for lost familt clue is hidden." "msg": "In the heart of Eldoria, an ancient land of boundless magic and mysterious creatures, lies the long-forgotten city of Aeloria. Once a beacon of knowledge and power, Aeloria was buried beneath the shifting sands of time, lost to the world for centuries. You are an intrepid explorer, known for your unparalleled curiosity and courage, who has stumbled upon an ancient map hinting at ests that Aeloria holds a secret so profound that it has the potential to reshape the very fabric of reality. Your journey will take you through treacherous deserts, enchanted forests, and across perilous mountain ranges. Your Task: Character Background: Develop a detailed background for your character. Describe their motivations for seeking out Aeloria, their skills and weaknesses, and any personal connections to the ancient city or its legends. Are they driven by a quest for knowledge, a search for lost familt clue is hidden."
}' }'
```` ```
\ No newline at end of file
...@@ -13,11 +13,12 @@ ...@@ -13,11 +13,12 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from dynemo.sdk import DYNEMO_IMAGE, api, depends, service
from sdk_kv_router.processor import Processor from sdk_kv_router.processor import Processor
from dynamo.sdk import DYNAMO_IMAGE, api, depends, service
@service(traffic={"timeout": 10000}, image=DYNEMO_IMAGE)
@service(traffic={"timeout": 10000}, image=DYNAMO_IMAGE)
class Frontend: class Frontend:
processor = depends(Processor) processor = depends(Processor)
......
...@@ -29,13 +29,13 @@ with bentoml.importing(): ...@@ -29,13 +29,13 @@ with bentoml.importing():
from common.chat_processor import ChatProcessor, ProcessMixIn from common.chat_processor import ChatProcessor, ProcessMixIn
from common.protocol import MyRequestOutput, Tokens, vLLMGenerateRequest from common.protocol import MyRequestOutput, Tokens, vLLMGenerateRequest
from dynemo.sdk import depends, dynemo_context, dynemo_endpoint, service from dynamo.sdk import depends, dynamo_context, dynamo_endpoint, service
@service( @service(
dynemo={ dynamo={
"enabled": True, "enabled": True,
"namespace": "dynemo", "namespace": "dynamo",
}, },
resources={"cpu": "10", "memory": "20Gi"}, resources={"cpu": "10", "memory": "20Gi"},
workers=1, workers=1,
...@@ -92,7 +92,7 @@ class Processor(ProcessMixIn): ...@@ -92,7 +92,7 @@ class Processor(ProcessMixIn):
metrics=output.metrics, metrics=output.metrics,
) )
@dynemo_endpoint() @dynamo_endpoint()
async def generate(self, raw_request: ChatCompletionRequest): async def generate(self, raw_request: ChatCompletionRequest):
request_id = str(uuid.uuid4()) request_id = str(uuid.uuid4())
( (
...@@ -108,8 +108,8 @@ class Processor(ProcessMixIn): ...@@ -108,8 +108,8 @@ class Processor(ProcessMixIn):
): ):
worker_id = worker worker_id = worker
break break
runtime = dynemo_context["runtime"] runtime = dynamo_context["runtime"]
comp_ns, comp_name = VllmEngine.dynemo_address() # type: ignore comp_ns, comp_name = VllmEngine.dynamo_address() # type: ignore
worker_client = ( worker_client = (
await runtime.namespace(comp_ns) await runtime.namespace(comp_ns)
.component(comp_name) .component(comp_name)
......
...@@ -17,10 +17,11 @@ from enum import Enum ...@@ -17,10 +17,11 @@ from enum import Enum
import bentoml import bentoml
from common.protocol import Tokens from common.protocol import Tokens
from dynemo.sdk import async_onstart, dynemo_context, dynemo_endpoint, service
from dynamo.sdk import async_onstart, dynamo_context, dynamo_endpoint, service
with bentoml.importing(): with bentoml.importing():
from dynemo.runtime import KvRouter from dynamo.runtime import KvRouter
WorkerId = str WorkerId = str
...@@ -33,9 +34,9 @@ class RoutingStrategy(Enum): ...@@ -33,9 +34,9 @@ class RoutingStrategy(Enum):
@service( @service(
dynemo={ dynamo={
"enabled": True, "enabled": True,
"namespace": "dynemo", "namespace": "dynamo",
}, },
resources={"cpu": "10", "memory": "20Gi"}, resources={"cpu": "10", "memory": "20Gi"},
workers=1, workers=1,
...@@ -48,13 +49,13 @@ class Router: ...@@ -48,13 +49,13 @@ class Router:
def __init__(self): def __init__(self):
self.model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" self.model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
self.routing_strategy = RoutingStrategy.PREFIX self.routing_strategy = RoutingStrategy.PREFIX
self.runtime = dynemo_context["runtime"] self.runtime = dynamo_context["runtime"]
self.min_workers = 1 self.min_workers = 1
@async_onstart @async_onstart
async def init_engine(self): async def init_engine(self):
workers_client = ( workers_client = (
await self.runtime.namespace("dynemo") await self.runtime.namespace("dynamo")
.component("VllmEngine") .component("VllmEngine")
.endpoint("generate") .endpoint("generate")
.client() .client()
...@@ -74,11 +75,11 @@ class Router: ...@@ -74,11 +75,11 @@ class Router:
) )
await asyncio.sleep(5) await asyncio.sleep(5)
kv_listener = self.runtime.namespace("dynemo").component(self.model_name) kv_listener = self.runtime.namespace("dynamo").component(self.model_name)
await kv_listener.create_service() await kv_listener.create_service()
self.router = KvRouter(self.runtime, kv_listener) self.router = KvRouter(self.runtime, kv_listener)
@dynemo_endpoint() @dynamo_endpoint()
async def generate(self, request: Tokens): async def generate(self, request: Tokens):
lora_id = 0 lora_id = 0
worker_id = "" worker_id = ""
......
...@@ -27,11 +27,11 @@ with bentoml.importing(): ...@@ -27,11 +27,11 @@ with bentoml.importing():
from common.protocol import MyRequestOutput, vLLMGenerateRequest from common.protocol import MyRequestOutput, vLLMGenerateRequest
from vllm.engine.multiprocessing.client import MQLLMEngineClient from vllm.engine.multiprocessing.client import MQLLMEngineClient
from dynemo.llm import KvMetricsPublisher from dynamo.llm import KvMetricsPublisher
from dynemo.sdk import ( from dynamo.sdk import (
async_onstart, async_onstart,
dynemo_context, dynamo_context,
dynemo_endpoint, dynamo_endpoint,
server_context, server_context,
service, service,
) )
...@@ -42,9 +42,9 @@ lease_id = None ...@@ -42,9 +42,9 @@ lease_id = None
@service( @service(
dynemo={ dynamo={
"enabled": True, "enabled": True,
"namespace": "dynemo", "namespace": "dynamo",
}, },
resources={"gpu": 1, "cpu": "10", "memory": "20Gi"}, resources={"gpu": 1, "cpu": "10", "memory": "20Gi"},
workers=1, workers=1,
...@@ -63,9 +63,9 @@ class VllmEngine(BaseVllmEngine): ...@@ -63,9 +63,9 @@ class VllmEngine(BaseVllmEngine):
block_size=64, block_size=64,
max_model_len=16384, max_model_len=16384,
) )
VLLM_WORKER_ID = dynemo_context["endpoints"][0].lease_id() VLLM_WORKER_ID = dynamo_context["endpoints"][0].lease_id()
os.environ["VLLM_WORKER_ID"] = str(VLLM_WORKER_ID) os.environ["VLLM_WORKER_ID"] = str(VLLM_WORKER_ID)
os.environ["VLLM_KV_NAMESPACE"] = "dynemo" os.environ["VLLM_KV_NAMESPACE"] = "dynamo"
os.environ["VLLM_KV_COMPONENT"] = "vllm" os.environ["VLLM_KV_COMPONENT"] = "vllm"
vllm_logger.info(f"Generate endpoint ID: {VLLM_WORKER_ID}") vllm_logger.info(f"Generate endpoint ID: {VLLM_WORKER_ID}")
os.environ["CUDA_VISIBLE_DEVICES"] = f"{server_context.worker_index - 1}" os.environ["CUDA_VISIBLE_DEVICES"] = f"{server_context.worker_index - 1}"
...@@ -74,7 +74,7 @@ class VllmEngine(BaseVllmEngine): ...@@ -74,7 +74,7 @@ class VllmEngine(BaseVllmEngine):
super().__init__(self.engine_args) super().__init__(self.engine_args)
async def create_metrics_publisher_endpoint(self): async def create_metrics_publisher_endpoint(self):
component = dynemo_context["component"] component = dynamo_context["component"]
await self.metrics_publisher.create_endpoint(component) await self.metrics_publisher.create_endpoint(component)
@async_onstart @async_onstart
...@@ -88,7 +88,7 @@ class VllmEngine(BaseVllmEngine): ...@@ -88,7 +88,7 @@ class VllmEngine(BaseVllmEngine):
task = asyncio.create_task(self.create_metrics_publisher_endpoint()) task = asyncio.create_task(self.create_metrics_publisher_endpoint())
task.add_done_callback(lambda _: print("metrics publisher endpoint created")) task.add_done_callback(lambda _: print("metrics publisher endpoint created"))
@dynemo_endpoint() @dynamo_endpoint()
async def generate(self, request: vLLMGenerateRequest): async def generate(self, request: vLLMGenerateRequest):
sampling_params = request.sampling_params sampling_params = request.sampling_params
# rust HTTP requires Delta streaming # rust HTTP requires Delta streaming
......
...@@ -27,8 +27,8 @@ version = "0.2.0" ...@@ -27,8 +27,8 @@ version = "0.2.0"
edition = "2021" edition = "2021"
authors = ["NVIDIA"] authors = ["NVIDIA"]
license = "Apache-2.0" license = "Apache-2.0"
homepage = "https://github.com/dynemo-ai/dynemo" homepage = "https://github.com/ai-dynamo/dynamo"
repository = "https://github.com/dynemo-ai/dynemo.git" repository = "https://github.com/ai-dynamo/dynamo.git"
[workspace.dependencies] [workspace.dependencies]
......
...@@ -27,7 +27,7 @@ Annotated { data: Some("o"), id: None, event: None, comment: None } ...@@ -27,7 +27,7 @@ Annotated { data: Some("o"), id: None, event: None, comment: None }
Annotated { data: Some("r"), id: None, event: None, comment: None } Annotated { data: Some("r"), id: None, event: None, comment: None }
Annotated { data: Some("l"), id: None, event: None, comment: None } Annotated { data: Some("l"), id: None, event: None, comment: None }
Annotated { data: Some("d"), id: None, event: None, comment: None } Annotated { data: Some("d"), id: None, event: None, comment: None }
ServiceSet { services: [ServiceInfo { name: "dynemo_init_backend_720278f8", id: "eOHMc4ndRw8s5flv4WOZx7", version: "0.0.1", started: "2025-02-26T18:54:04.917294605Z", endpoints: [EndpointInfo { name: "dynemo_init_backend_720278f8-generate-694d951a80e06abf", subject: "dynemo_init_backend_720278f8.generate-694d951a80e06abf", data: Some(Metrics(Object {"average_processing_time": Number(53662), "data": Object {"val": Number(10)}, "last_error": String(""), "num_errors": Number(0), "num_requests": Number(2), "processing_time": Number(107325), "queue_group": String("q")})) }] }] } ServiceSet { services: [ServiceInfo { name: "dynamo_init_backend_720278f8", id: "eOHMc4ndRw8s5flv4WOZx7", version: "0.0.1", started: "2025-02-26T18:54:04.917294605Z", endpoints: [EndpointInfo { name: "dynamo_init_backend_720278f8-generate-694d951a80e06abf", subject: "dynamo_init_backend_720278f8.generate-694d951a80e06abf", data: Some(Metrics(Object {"average_processing_time": Number(53662), "data": Object {"val": Number(10)}, "last_error": String(""), "num_errors": Number(0), "num_requests": Number(2), "processing_time": Number(107325), "queue_group": String("q")})) }] }] }
``` ```
Note the following stats in the output demonstrate the custom Note the following stats in the output demonstrate the custom
......
...@@ -18,7 +18,7 @@ name = "dynamo-run" ...@@ -18,7 +18,7 @@ name = "dynamo-run"
version = "0.1.0" version = "0.1.0"
edition = "2021" edition = "2021"
authors = ["NVIDIA"] authors = ["NVIDIA"]
homepage = "https://github.com/dynemo-ai/dynemo" homepage = "https://github.com/ai-dynamo/dynamo"
license = "Apache-2.0" license = "Apache-2.0"
[features] [features]
......
...@@ -19,8 +19,8 @@ version = "0.2.1" ...@@ -19,8 +19,8 @@ version = "0.2.1"
edition = "2021" edition = "2021"
authors = ["NVIDIA"] authors = ["NVIDIA"]
license = "Apache-2.0" license = "Apache-2.0"
homepage = "https://github.com/dynemo-ai/dynemo" homepage = "https://github.com/ai-dynamo/dynamo"
repository = "https://github.com/dynemo-ai/dynemo.git" repository = "https://github.com/ai-dynamo/dynamo.git"
[lib] [lib]
name = "dynamo_llm_capi" name = "dynamo_llm_capi"
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
language = "C++" language = "C++"
cpp_compat = true cpp_compat = true
include_guard = "__NVIDIA_DYNEMO_LLM_API__" include_guard = "__NVIDIA_DYNAMO_LLM_API__"
[enum] [enum]
......
...@@ -19,8 +19,8 @@ version = "0.2.1" ...@@ -19,8 +19,8 @@ version = "0.2.1"
edition = "2021" edition = "2021"
authors = ["NVIDIA"] authors = ["NVIDIA"]
license = "Apache-2.0" license = "Apache-2.0"
homepage = "https://github.com/dynemo-ai/dynemo" homepage = "https://github.com/ai-dynamo/dynamo"
repository = "https://github.com/dynemo-ai/dynemo.git" repository = "https://github.com/ai-dynamo/dynamo.git"
[lib] [lib]
path = "rust/lib.rs" path = "rust/lib.rs"
......
...@@ -18,8 +18,8 @@ version = "0.2.1" ...@@ -18,8 +18,8 @@ version = "0.2.1"
edition = "2021" edition = "2021"
authors = ["NVIDIA"] authors = ["NVIDIA"]
license = "Apache-2.0" license = "Apache-2.0"
homepage = "https://github.com/dynemo-ai/dynemo" homepage = "https://github.com/ai-dynamo/dynamo"
repository = "https://github.com/dynemo-ai/dynemo.git" repository = "https://github.com/ai-dynamo/dynamo.git"
[package] [package]
name = "dynamo-llm" name = "dynamo-llm"
......
...@@ -21,8 +21,8 @@ version = "0.2.1" # TODO: Centralize Version Automation ...@@ -21,8 +21,8 @@ version = "0.2.1" # TODO: Centralize Version Automation
edition = "2021" edition = "2021"
authors = ["NVIDIA"] authors = ["NVIDIA"]
license = "Apache-2.0" license = "Apache-2.0"
homepage = "https://github.com/dynemo-ai/dynemo" homepage = "https://github.com/ai-dynamo/dynamo"
repository = "https://github.com/dynemo-ai/dynemo.git" repository = "https://github.com/ai-dynamo/dynamo.git"
keywords = ["llm", "genai", "inference", "nvidia", "distributed", "dynamo"] keywords = ["llm", "genai", "inference", "nvidia", "distributed", "dynamo"]
[features] [features]
......
...@@ -19,7 +19,7 @@ version = "0.2.1" ...@@ -19,7 +19,7 @@ version = "0.2.1"
description = "Distributed Inference Framework" description = "Distributed Inference Framework"
readme = "README.md" readme = "README.md"
authors = [ authors = [
{ name = "NVIDIA Inc.", email = "sw-dl-dynemo@nvidia.com" }, { name = "NVIDIA Inc.", email = "sw-dl-dynamo@nvidia.com" },
] ]
license = { file = "LICENSE" } license = { file = "LICENSE" }
requires-python = ">=3.10" requires-python = ">=3.10"
...@@ -146,4 +146,4 @@ ignore_missing_imports = true ...@@ -146,4 +146,4 @@ ignore_missing_imports = true
# declare namespace packages # declare namespace packages
[tool.setuptools] [tool.setuptools]
namespace-packages = ["dynemo", "dynemo.sdk"] namespace-packages = ["dynamo", "dynamo.sdk"]
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment