"...git@developer.sourcefind.cn:2222/OpenDAS/vllm_cscc.git" did not exist on "b9cecc26359794af863b3484a3464108b7d5ee5f"
Unverified commit a6e73484, authored by Graham King and committed by GitHub
Browse files

chore(bindings): Remove 'Backend' binding (#5458)


Signed-off-by: Graham King <grahamk@nvidia.com>
parent ddee21cb
...@@ -162,7 +162,6 @@ fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> { ...@@ -162,7 +162,6 @@ fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_class::<llm::preprocessor::OAIChatPreprocessor>()?; m.add_class::<llm::preprocessor::OAIChatPreprocessor>()?;
m.add_class::<llm::preprocessor::MediaDecoder>()?; m.add_class::<llm::preprocessor::MediaDecoder>()?;
m.add_class::<llm::preprocessor::MediaFetcher>()?; m.add_class::<llm::preprocessor::MediaFetcher>()?;
m.add_class::<llm::backend::Backend>()?;
m.add_class::<llm::kv::OverlapScores>()?; m.add_class::<llm::kv::OverlapScores>()?;
m.add_class::<llm::kv::KvIndexer>()?; m.add_class::<llm::kv::KvIndexer>()?;
m.add_class::<llm::kv::ApproxKvIndexer>()?; m.add_class::<llm::kv::ApproxKvIndexer>()?;
......
...@@ -6,9 +6,6 @@ ...@@ -6,9 +6,6 @@
/// ///
/// It is organized into several specialized sub-modules, each responsible for a particular aspect of the system: /// It is organized into several specialized sub-modules, each responsible for a particular aspect of the system:
/// ///
/// - `backend`:
/// Wraps low-level interfaces for LLM inference, manages resource allocation,
/// and integrates with specialized hardware for optimized execution.
/// - `disagg_route`: /// - `disagg_route`:
/// Implements distributed routing of inference requests with dynamic /// Implements distributed routing of inference requests with dynamic
/// load balancing and efficient resource allocation across clusters. /// load balancing and efficient resource allocation across clusters.
...@@ -26,7 +23,6 @@ ...@@ -26,7 +23,6 @@
/// integration between Python tools and the Dynamo runtime. /// integration between Python tools and the Dynamo runtime.
use super::*; use super::*;
pub mod backend;
pub mod entrypoint; pub mod entrypoint;
pub mod kv; pub mod kv;
pub mod local_model; pub mod local_model;
......
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use super::*;
use crate::llm::model_card::ModelDeploymentCard;
use llm_rs::protocols::common::llm_backend::{BackendOutput, PreprocessedRequest};
use llm_rs::types::Annotated;
use dynamo_runtime::pipeline::{Operator, ServiceBackend, ServiceFrontend, Source};
use crate::engine::PythonAsyncEngine;
// Python-facing wrapper pairing an `llm_rs` backend with the endpoint it is
// served on. Plain `//` comments are used throughout this block so that the
// Python-visible docstrings of the class and its methods stay unchanged.
#[pyclass]
pub(crate) struct Backend {
    // Shared handle to the underlying `llm_rs` backend.
    inner: Arc<llm_rs::backend::Backend>,
    // Endpoint that `start` attaches the request pipeline to.
    endpoint: Endpoint,
}

#[pymethods]
impl Backend {
    // Builds the backend from the model deployment card and stores `endpoint`
    // for later use by `start`.
    #[new]
    fn new(mdc: ModelDeploymentCard, endpoint: Endpoint) -> PyResult<Self> {
        let inner = llm_rs::backend::Backend::from_mdc(&mdc.inner);
        Ok(Self { inner, endpoint })
    }

    // Wires the Python `generator` into a request pipeline and starts serving
    // it on the stored endpoint.
    //
    // Pipeline shape:
    //   frontend -> backend forward edge -> Python engine
    //            -> backend backward edge -> frontend
    //
    // Any failure while constructing the engine, linking the pipeline or
    // building the ingress is returned as a Python error. On success the
    // result of `future_into_py` is returned; the wrapped future awaits
    // `start()` on the endpoint builder.
    fn start<'p>(&self, py: Python<'p>, generator: PyObject) -> PyResult<Bound<'p, PyAny>> {
        type Request = SingleIn<PreprocessedRequest>;
        type Response = ManyOut<Annotated<BackendOutput>>;

        let frontend = ServiceFrontend::<Request, Response>::new();
        let operator = self.inner.into_operator();

        // Wrap the Python callable so it can sit in the middle of the pipeline.
        let event_loop = self.endpoint.event_loop.clone();
        let py_engine = PythonAsyncEngine::new(generator, event_loop)?;
        let service = ServiceBackend::from_engine(Arc::new(py_engine));

        // Kept as a single chained expression on purpose: every stage links to
        // the next, and the last one closes the loop back to the frontend.
        let pipeline = frontend
            .link(operator.forward_edge())
            .map_err(to_pyerr)?
            .link(service)
            .map_err(to_pyerr)?
            .link(operator.backward_edge())
            .map_err(to_pyerr)?
            .link(frontend)
            .map_err(to_pyerr)?;

        let handler = Ingress::for_engine(pipeline).map_err(to_pyerr)?;
        let endpoint_builder = self.endpoint.inner.endpoint_builder().handler(handler);

        pyo3_async_runtimes::tokio::future_into_py(py, async move {
            endpoint_builder.start().await.map_err(to_pyerr)?;
            Ok(())
        })
    }
}
...@@ -527,20 +527,6 @@ class OAIChatPreprocessor: ...@@ -527,20 +527,6 @@ class OAIChatPreprocessor:
""" """
... ...
class Backend:
"""
LLM Backend engine that manages resources and concurrency for executing
inference requests in LLM engines (trtllm, vllm, sglang, etc.).
"""
...
async def start(self, handler: RequestHandler) -> None:
"""
Start the backend engine and begin sending requests to the downstream LLM engine.
"""
...
class OverlapScores: class OverlapScores:
""" """
A collection of prefix matching scores of workers for a given token ids. A collection of prefix matching scores of workers for a given token ids.
...@@ -1655,7 +1641,6 @@ class VirtualConnectorClient: ...@@ -1655,7 +1641,6 @@ class VirtualConnectorClient:
... ...
__all__ = [ __all__ = [
"Backend",
"Client", "Client",
"Component", "Component",
"Context", "Context",
......
...@@ -10,7 +10,6 @@ from pydantic import BaseModel, ValidationError ...@@ -10,7 +10,6 @@ from pydantic import BaseModel, ValidationError
# List all the classes in the _core module for re-export # List all the classes in the _core module for re-export
# import * causes "unable to detect undefined names" # import * causes "unable to detect undefined names"
from dynamo._core import Backend as Backend
from dynamo._core import Client as Client from dynamo._core import Client as Client
from dynamo._core import Component as Component from dynamo._core import Component as Component
from dynamo._core import Context as Context from dynamo._core import Context as Context
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment