chore(bindings): Remove 'Backend' binding (#5458)

Signed-off-by: Graham King <grahamk@nvidia.com>

chore(bindings): Remove 'Backend' binding (#5458)
Signed-off-by: Graham King <grahamk@nvidia.com>
a6e73484 · Graham King · GitHub · ddee21cb · a6e73484 · a6e73484
Unverified commit a6e73484, authored Jan 16, 2026 by Graham King; committed by GitHub on Jan 16, 2026.
5 changed files
--- a/lib/bindings/python/rust/lib.rs
+++ b/lib/bindings/python/rust/lib.rs
@@ -162,7 +162,6 @@ fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> {
    m.add_class::<llm::preprocessor::OAIChatPreprocessor>()?;
    m.add_class::<llm::preprocessor::MediaDecoder>()?;
    m.add_class::<llm::preprocessor::MediaFetcher>()?;
-    m.add_class::<llm::backend::Backend>()?;
    m.add_class::<llm::kv::OverlapScores>()?;
    m.add_class::<llm::kv::KvIndexer>()?;
    m.add_class::<llm::kv::ApproxKvIndexer>()?;

--- a/lib/bindings/python/rust/llm.rs
+++ b/lib/bindings/python/rust/llm.rs
@@ -6,9 +6,6 @@
 ///
 /// It is organized into several specialized sub-modules, each responsible for a particular aspect of the system:
 ///
-/// - `backend`:
-///   Wraps low-level interfaces for LLM inference, manages resource allocation,
-///   and integrates with specialized hardware for optimized execution.
 /// - `disagg_route`:
 ///   Implements distributed routing of inference requests with dynamic
 ///   load balancing and efficient resource allocation across clusters.
@@ -26,7 +23,6 @@
 /// integration between Python tools and the Dynamo runtime.
 use super::*;
-pub mod backend;
 pub mod entrypoint;
 pub mod kv;
 pub mod local_model;

--- a/lib/bindings/python/rust/llm/backend.rs
+++ b/lib/bindings/python/rust/llm/backend.rs
-// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-// SPDX-License-Identifier: Apache-2.0
-use super::*;
-use crate::llm::model_card::ModelDeploymentCard;
-use llm_rs::protocols::common::llm_backend::{BackendOutput, PreprocessedRequest};
-use llm_rs::types::Annotated;
-use dynamo_runtime::pipeline::{Operator, ServiceBackend, ServiceFrontend, Source};
-use crate::engine::PythonAsyncEngine;
-#[pyclass]
-pub(crate) struct Backend {
-    inner: Arc<llm_rs::backend::Backend>,
-    endpoint: Endpoint,
-}
-#[pymethods]
-impl Backend {
-    #[new]
-    fn new(mdc: ModelDeploymentCard, endpoint: Endpoint) -> PyResult<Self> {
-        let backend = llm_rs::backend::Backend::from_mdc(&mdc.inner);
-        Ok(Self {
-            inner: backend,
-            endpoint,
-        })
-    }
-    fn start<'p>(&self, py: Python<'p>, generator: PyObject) -> PyResult<Bound<'p, PyAny>> {
-        let frontend = ServiceFrontend::<
-            SingleIn<PreprocessedRequest>,
-            ManyOut<Annotated<BackendOutput>>,
-        >::new();
-        let backend = self.inner.into_operator();
-        let engine = Arc::new(PythonAsyncEngine::new(
-            generator,
-            self.endpoint.event_loop.clone(),
-        )?);
-        let engine = ServiceBackend::from_engine(engine);
-        let pipeline = frontend
-            .link(backend.forward_edge())
-            .map_err(to_pyerr)?
-            .link(engine)
-            .map_err(to_pyerr)?
-            .link(backend.backward_edge())
-            .map_err(to_pyerr)?
-            .link(frontend)
-            .map_err(to_pyerr)?;
-        let ingress = Ingress::for_engine(pipeline).map_err(to_pyerr)?;
-        let builder = self.endpoint.inner.endpoint_builder().handler(ingress);
-        pyo3_async_runtimes::tokio::future_into_py(py, async move {
-            builder.start().await.map_err(to_pyerr)?;
-            Ok(())
-        })
-    }
-}
--- a/lib/bindings/python/src/dynamo/_core.pyi
+++ b/lib/bindings/python/src/dynamo/_core.pyi
@@ -527,20 +527,6 @@ class OAIChatPreprocessor:
        """
        ...
-class Backend:
-    """
-    LLM Backend engine manages resources and concurrency for executing inference
-    requests in LLM engines (trtllm, vllm, sglang etc)
-    """
-    ...
-    async def start(self, handler: RequestHandler) -> None:
-        """
-        Start the backend engine and requests to the downstream LLM engine
-        """
-        ...
 class OverlapScores:
    """
    A collection of prefix matching scores of workers for a given token ids.
@@ -1655,7 +1641,6 @@ class VirtualConnectorClient:
        ...
 __all__ = [
-    "Backend",
    "Client",
    "Component",
    "Context",

--- a/lib/bindings/python/src/dynamo/runtime/__init__.py
+++ b/lib/bindings/python/src/dynamo/runtime/__init__.py
@@ -10,7 +10,6 @@ from pydantic import BaseModel, ValidationError
 # List all the classes in the _core module for re-export
 # import * causes "unable to detect undefined names"
-from dynamo._core import Backend as Backend
 from dynamo._core import Client as Client
 from dynamo._core import Component as Component
 from dynamo._core import Context as Context