chore: remove unused NIM specific code (part 2) (#5893)

Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
Co-authored-by: Keiven Chang <keivenchang@users.noreply.github.com>

chore: remove unused NIM specific code (part 2) (#5893)
Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
Co-authored-by: Keiven Chang <keivenchang@users.noreply.github.com>
cb7ebdd7 · Keiven C · GitHub · 8b651fe9 · cb7ebdd7 · cb7ebdd7
Unverified Commit cb7ebdd7 authored Feb 05, 2026 by Keiven C Committed by GitHub Feb 05, 2026
13 changed files
--- a/components/src/dynamo/frontend/main.py
+++ b/components/src/dynamo/frontend/main.py
@@ -43,10 +43,6 @@ from dynamo.runtime.logging import configure_dynamo_logging
 from . import __version__

 DYN_NAMESPACE_ENV_VAR = "DYN_NAMESPACE"
-CUSTOM_BACKEND_METRICS_POLLING_INTERVAL_ENV_VAR = (
-    "CUSTOM_BACKEND_METRICS_POLLING_INTERVAL"
-)
-CUSTOM_BACKEND_ENDPOINT_ENV_VAR = "CUSTOM_BACKEND_ENDPOINT"

 configure_dynamo_logging()
 logger = logging.getLogger(__name__)
@@ -276,22 +272,6 @@ def parse_args():
        help="HTTP metrics port for gRPC service (u16). Only used with --kserve-grpc-server. Defaults to 8788.",
    )
    add_config_dump_args(parser)
-    parser.add_argument(
-        "--custom-backend-metrics-endpoint",
-        type=str,
-        default=os.environ.get(
-            CUSTOM_BACKEND_ENDPOINT_ENV_VAR, "nim.backend.runtime_stats"
-        ),
-        help=f"Custom backend endpoint to poll for metrics in format 'namespace.component.endpoint' (default: 'nim.backend.runtime_stats'). Required if --custom-backend-metrics-polling-interval is specified. All metrics will be prefixed with 'dynamo_component_' in Prometheus. Can be set via {CUSTOM_BACKEND_ENDPOINT_ENV_VAR} env var.",
-    )
-    parser.add_argument(
-        "--custom-backend-metrics-polling-interval",
-        type=float,
-        default=float(
-            os.environ.get(CUSTOM_BACKEND_METRICS_POLLING_INTERVAL_ENV_VAR, "0")
-        ),
-        help=f"Interval in seconds for polling custom backend metrics. Set to > 0 to enable polling (default: 0=disabled, suggested: 9.2s which is less than typical Prometheus scrape interval). Can be set via {CUSTOM_BACKEND_METRICS_POLLING_INTERVAL_ENV_VAR} env var.",
-    )
    parser.add_argument(
        "--store-kv",
        type=str,
@@ -324,10 +304,6 @@ def parse_args():

    if bool(flags.tls_cert_path) ^ bool(flags.tls_key_path):  # ^ is XOR
        parser.error("--tls-cert-path and --tls-key-path must be provided together")
-    if flags.custom_backend_metrics_polling_interval < 0:
-        parser.error(
-            "--custom-backend-metrics-polling-interval must be >= 0 (0=disabled)"
-        )

    return flags

@@ -431,14 +407,6 @@ async def async_main():
        kwargs["namespace"] = flags.namespace
    if flags.kserve_grpc_server and flags.grpc_metrics_port:
        kwargs["http_metrics_port"] = flags.grpc_metrics_port
-    if flags.custom_backend_metrics_endpoint:
-        kwargs[
-            "custom_backend_metrics_endpoint"
-        ] = flags.custom_backend_metrics_endpoint
-    if flags.custom_backend_metrics_polling_interval:
-        kwargs[
-            "custom_backend_metrics_polling_interval"
-        ] = flags.custom_backend_metrics_polling_interval

    if flags.exp_python_factory:
        kwargs["engine_factory"] = engine_factory

--- a/components/src/dynamo/vllm/handlers.py
+++ b/components/src/dynamo/vllm/handlers.py
@@ -806,7 +806,6 @@ class BaseWorkerHandler(ABC):
        Decode base64-encoded prompt embeddings in PyTorch format.

        Format: PyTorch tensor serialized with torch.save() and base64-encoded.
-        This matches NIM-LLM's implementation for compatibility.

        Args:
            prompt_embeds_base64: Base64-encoded PyTorch tensor

--- a/lib/bindings/python/rust/llm/entrypoint.rs
+++ b/lib/bindings/python/rust/llm/entrypoint.rs
@@ -171,8 +171,6 @@ pub(crate) struct EntrypointArgs {
    tls_key_path: Option<PathBuf>,
    extra_engine_args: Option<PathBuf>,
    namespace: Option<String>,
-    custom_backend_metrics_endpoint: Option<String>,
-    custom_backend_metrics_polling_interval: Option<f64>,
    is_prefill: bool,
    engine_factory: Option<PyEngineFactory>,
 }
@@ -181,7 +179,7 @@ pub(crate) struct EntrypointArgs {
 impl EntrypointArgs {
    #[allow(clippy::too_many_arguments)]
    #[new]
-    #[pyo3(signature = (engine_type, model_path=None, model_name=None, endpoint_id=None, context_length=None, template_file=None, router_config=None, kv_cache_block_size=None, http_host=None, http_port=None, http_metrics_port=None, tls_cert_path=None, tls_key_path=None, extra_engine_args=None, namespace=None, custom_backend_metrics_endpoint=None, custom_backend_metrics_polling_interval=None, is_prefill=false, engine_factory=None))]
+    #[pyo3(signature = (engine_type, model_path=None, model_name=None, endpoint_id=None, context_length=None, template_file=None, router_config=None, kv_cache_block_size=None, http_host=None, http_port=None, http_metrics_port=None, tls_cert_path=None, tls_key_path=None, extra_engine_args=None, namespace=None, is_prefill=false, engine_factory=None))]
    pub fn new(
        py: Python<'_>,
        engine_type: EngineType,
@@ -199,8 +197,6 @@ impl EntrypointArgs {
        tls_key_path: Option<PathBuf>,
        extra_engine_args: Option<PathBuf>,
        namespace: Option<String>,
-        custom_backend_metrics_endpoint: Option<String>,
-        custom_backend_metrics_polling_interval: Option<f64>,
        is_prefill: bool,
        engine_factory: Option<PyObject>,
    ) -> PyResult<Self> {
@@ -245,8 +241,6 @@ impl EntrypointArgs {
            tls_key_path,
            extra_engine_args,
            namespace,
-            custom_backend_metrics_endpoint,
-            custom_backend_metrics_polling_interval,
            is_prefill,
            engine_factory,
        })
@@ -287,9 +281,7 @@ pub fn make_engine<'p>(
        .tls_key_path(args.tls_key_path.clone())
        .is_mocker(matches!(args.engine_type, EngineType::Mocker))
        .extra_engine_args(args.extra_engine_args.clone())
-        .namespace(args.namespace.clone())
-        .custom_backend_metrics_endpoint(args.custom_backend_metrics_endpoint.clone())
-        .custom_backend_metrics_polling_interval(args.custom_backend_metrics_polling_interval);
+        .namespace(args.namespace.clone());
    pyo3_async_runtimes::tokio::future_into_py(py, async move {
        if let Some(model_path) = args.model_path.clone() {
            let local_path = if model_path.exists() {

--- a/lib/llm/src/bin/generate_frontend_openapi.rs
+++ b/lib/llm/src/bin/generate_frontend_openapi.rs
@@ -3,7 +3,7 @@

 //! Helper binary to generate the Dynamo HTTP frontend OpenAPI specification.
 //!
-//! This allows CI, documentation tooling, and NIM to obtain the exact same
+//! This allows CI and documentation tooling to obtain the exact same
 //! OpenAPI document that is served at `/openapi.json` by the frontend
 //! without having to start the HTTP service and scrape the endpoint.
 //!

--- a/lib/llm/src/entrypoint/input/http.rs
+++ b/lib/llm/src/entrypoint/input/http.rs
@@ -51,15 +51,6 @@ pub async fn run(
    http_service_builder =
        http_service_builder.with_request_template(engine_config.local_model().request_template());

-    // DEPRECATED: To be removed after custom backends migrate to Dynamo backend.
-    // Pass the custom backend metrics endpoint as-is (already in namespace.component.endpoint format)
-    http_service_builder = http_service_builder.with_custom_backend_config(
-        local_model
-            .custom_backend_metrics_endpoint()
-            .map(|s| s.to_string()),
-        local_model.custom_backend_metrics_polling_interval(),
-    );
-
    let http_service = match engine_config {
        EngineConfig::Dynamic {
            ref model,
@@ -145,46 +136,10 @@ pub async fn run(
            .collect::<Vec<String>>()
    );

-    // DEPRECATED: To be removed after custom backends migrate to Dynamo backend.
-    // Start custom backend metrics polling if configured
-    let polling_task =
-        if let (Some(namespace_component_endpoint), Some(polling_interval), Some(registry)) = (
-            http_service
-                .custom_backend_namespace_component_endpoint
-                .as_ref(),
-            http_service.custom_backend_metrics_polling_interval,
-            http_service.custom_backend_registry.as_ref(),
-        ) {
-            tracing::info!(
-                namespace_component_endpoint=%namespace_component_endpoint,
-                polling_interval_secs=polling_interval,
-                "Starting custom backend metrics polling task"
-            );
-            // Spawn the polling task and keep the JoinHandle alive so it can be aborted during
-            // shutdown. While graceful shutdown is not strictly necessary for this non-critical
-            // metrics polling, explicitly aborting it prevents the task from running during the
-            // shutdown phase.
-            Some(
-                crate::http::service::custom_backend_metrics::spawn_custom_backend_polling_task(
-                    distributed_runtime.clone(),
-                    namespace_component_endpoint.clone(),
-                    polling_interval,
-                    registry.clone(),
-                ),
-            )
-        } else {
-            None
-        };
-
    http_service
        .run(distributed_runtime.primary_token())
        .await?;

-    // Abort the polling task if it was started
-    if let Some(task) = polling_task {
-        task.abort();
-    }
-
    distributed_runtime.shutdown(); // Cancel primary token
    Ok(())
 }

--- a/lib/llm/src/http/service.rs
+++ b/lib/llm/src/http/service.rs
@@ -21,7 +21,6 @@
 mod openai;

 pub mod busy_threshold;
-pub mod custom_backend_metrics;
 pub mod disconnect;
 pub mod error;
 pub mod health;

--- a/lib/llm/src/http/service/custom_backend_metrics.rs
+++ b/lib/llm/src/http/service/custom_backend_metrics.rs
-// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-// SPDX-License-Identifier: Apache-2.0
-
-// DEPRECATED: To be removed after custom backends migrate to Dynamo backend.
-//
-// Custom backend metrics polling and collection.
-//
-// This module provides a bridge to poll metrics from custom backends (like NIM) that expose
-// their own metrics endpoints, and makes them available through Prometheus.
-
-use std::{
-    collections::HashMap,
-    sync::{Arc, Mutex},
-    time::Duration,
-};
-
-use serde::Deserialize;
-
-/// Maximum number of custom backend gauges that can be registered to prevent unbounded growth.
-pub const MAX_CUSTOM_BACKEND_GAUGES: usize = 100;
-
-/// Registry for custom backend metrics discovered at runtime.
-///
-/// Metrics from custom backends are exposed as Prometheus gauges since we're setting
-/// absolute values received from polling, not incrementing them locally.
-///
-/// All metrics are automatically prefixed when registered. For example, if the prefix is
-/// `dynamo_component` and a backend reports a gauge named `kv_cache_usage_perc`, it will
-/// be exposed as `dynamo_component_kv_cache_usage_perc` in Prometheus metrics.
-pub struct CustomBackendMetricsRegistry {
-    gauges: Mutex<HashMap<String, prometheus::Gauge>>,
-    prefix: String,
-    prometheus_registry: prometheus::Registry,
-}
-
-impl CustomBackendMetricsRegistry {
-    pub fn new(prefix: String, prometheus_registry: prometheus::Registry) -> Self {
-        Self {
-            gauges: Mutex::new(HashMap::new()),
-            prefix,
-            prometheus_registry,
-        }
-    }
-
-    /// Get or create a gauge for the given metric name, registering it with Prometheus if new.
-    /// Returns None if the maximum number of gauges has been reached.
-    fn get_or_create_gauge(&self, name: &str) -> Option<prometheus::Gauge> {
-        let mut gauges = self.gauges.lock().unwrap();
-
-        if let Some(gauge) = gauges.get(name) {
-            return Some(gauge.clone());
-        }
-
-        // Cap the number of gauges to prevent unbounded growth
-        if gauges.len() >= MAX_CUSTOM_BACKEND_GAUGES {
-            tracing::warn!(
-                "Maximum number of custom backend gauges ({}) reached, dropping metric: {}",
-                MAX_CUSTOM_BACKEND_GAUGES,
-                name
-            );
-            return None;
-        }
-
-        let full_name = format!("{}_{}", self.prefix, name);
-        let gauge = prometheus::Gauge::new(full_name.as_str(), name)
-            .unwrap_or_else(|e| panic!("Failed to create gauge {}: {}", full_name, e));
-
-        if let Err(e) = self.prometheus_registry.register(Box::new(gauge.clone())) {
-            tracing::warn!(
-                "Failed to register custom backend gauge {}: {}",
-                full_name,
-                e
-            );
-        }
-
-        gauges.insert(name.to_string(), gauge.clone());
-        Some(gauge)
-    }
-
-    /// Update a gauge metric with a new value.
-    pub fn set_gauge(&self, name: &str, value: f64) {
-        if let Some(gauge) = self.get_or_create_gauge(name) {
-            gauge.set(value);
-        }
-    }
-}
-
-/// Response format from custom backend runtime_stats endpoint
-#[derive(Debug, Deserialize)]
-struct CustomBackendStatsResponse {
-    metrics: CustomBackendMetrics,
-}
-
-#[derive(Debug, Deserialize)]
-struct CustomBackendMetrics {
-    gauges: HashMap<String, f64>,
-}
-
-/// Spawn a background task that polls custom backend metrics periodically.
-///
-/// All metrics collected from the backend will be prefixed according to the registry's prefix
-/// (typically `dynamo_component_`). For example, a backend gauge `kv_cache_usage_perc` will
-/// appear as `dynamo_component_kv_cache_usage_perc` in Prometheus.
-///
-/// This task does not use a CancellationToken for graceful shutdown. When the executable exits,
-/// the task is abruptly terminated by the tokio runtime shutdown. This is acceptable because
-/// metrics polling is non-critical with no risk of data corruption or resource leaks, typical
-/// polling intervals are short, and the Worker already has a graceful shutdown timeout mechanism.
-pub fn spawn_custom_backend_polling_task(
-    drt: dynamo_runtime::DistributedRuntime,
-    namespace_component_endpoint: String,
-    polling_interval_secs: f64,
-    registry: Arc<CustomBackendMetricsRegistry>,
-) -> tokio::task::JoinHandle<()> {
-    tokio::spawn(async move {
-        tracing::info!(
-            namespace_component_endpoint=%namespace_component_endpoint,
-            interval_secs=polling_interval_secs,
-            "Starting custom backend metrics polling"
-        );
-
-        // Parse namespace.component.endpoint format
-        let parts: Vec<&str> = namespace_component_endpoint.split('.').collect();
-        if parts.len() != 3 {
-            tracing::error!(
-                namespace_component_endpoint=%namespace_component_endpoint,
-                "Invalid endpoint format, expected 'namespace.component.endpoint'"
-            );
-            return;
-        }
-        let (namespace, component_name, endpoint_name) = (parts[0], parts[1], parts[2]);
-
-        // Get namespace, component, and endpoint from DRT
-        let Ok(ns) = drt.namespace(namespace.to_string()) else {
-            tracing::error!("Namespace not available: {}", namespace);
-            return;
-        };
-        let Ok(component) = ns.component(component_name) else {
-            tracing::error!("Component not available: {}", component_name);
-            return;
-        };
-        let endpoint = component.endpoint(endpoint_name);
-
-        // Wait for client to be ready (backend might not be available yet)
-        let client = loop {
-            match endpoint.client().await {
-                Ok(client) => break client,
-                Err(e) => {
-                    tracing::warn!(
-                        error=%e,
-                        namespace=%namespace,
-                        component=%component_name,
-                        endpoint=%endpoint_name,
-                        "Failed to create client for custom backend endpoint, retrying in 5s"
-                    );
-                    tokio::time::sleep(Duration::from_secs(5)).await;
-                }
-            }
-        };
-
-        // Create router for sending requests to the backend
-        use dynamo_runtime::pipeline::{PushRouter, RouterMode};
-        use dynamo_runtime::protocols::annotated::Annotated;
-        let Ok(router) =
-            PushRouter::<String, Annotated<String>>::from_client(client, RouterMode::Random).await
-        else {
-            tracing::error!(
-                namespace=%namespace,
-                component=%component_name,
-                endpoint=%endpoint_name,
-                "Failed to create router for custom backend endpoint"
-            );
-            return;
-        };
-
-        tracing::info!(
-            namespace=%namespace,
-            component=%component_name,
-            endpoint=%endpoint_name,
-            "Custom backend metrics polling started"
-        );
-
-        // Poll backend at regular intervals
-        let interval = Duration::from_secs_f64(polling_interval_secs);
-        loop {
-            tokio::time::sleep(interval).await;
-
-            match poll_backend_once(&router, &registry).await {
-                Ok(num_metrics) => {
-                    tracing::debug!(
-                        num_metrics=%num_metrics,
-                        "Successfully polled custom backend metrics"
-                    );
-                }
-                Err(e) => {
-                    tracing::warn!(
-                        error=%e,
-                        "Failed to poll custom backend metrics"
-                    );
-                }
-            }
-        }
-    })
-}
-
-/// Poll the backend once and update the registry.
-async fn poll_backend_once(
-    router: &dynamo_runtime::pipeline::PushRouter<
-        String,
-        dynamo_runtime::protocols::annotated::Annotated<String>,
-    >,
-    registry: &Arc<CustomBackendMetricsRegistry>,
-) -> anyhow::Result<usize> {
-    use dynamo_runtime::pipeline::Context;
-
-    let response_stream = router.random(Context::new("".to_string())).await?;
-
-    // Collect responses from the stream
-    let mut responses = Vec::new();
-    {
-        use futures::StreamExt;
-        let mut stream = response_stream;
-        while let Some(response) = stream.next().await {
-            if let Some(data) = response.data {
-                responses.push(data);
-            }
-        }
-    }
-
-    if responses.is_empty() {
-        anyhow::bail!("No responses received from custom backend");
-    }
-
-    // Parse the first response as JSON
-    // Expected format from backend (as JSON string):
-    // {
-    //   "schema_version": 1,
-    //   "worker_id": "mock-worker-1",
-    //   "backend": "vllm",
-    //   "ts": 1759967807,
-    //   "metrics": {
-    //     "gauges": {
-    //       "kv_cache_usage_perc": 0.3,
-    //       "gpu_utilization_perc": 75.5,
-    //       "active_requests": 5
-    //     }
-    //   }
-    // }
-    let stats: CustomBackendStatsResponse = serde_json::from_str(&responses[0])
-        .map_err(|e| anyhow::anyhow!("Failed to parse backend stats JSON: {}", e))?;
-
-    // Update gauges in the registry
-    for (name, value) in &stats.metrics.gauges {
-        registry.set_gauge(name, *value);
-    }
-
-    Ok(stats.metrics.gauges.len())
-}
--- a/lib/llm/src/http/service/metrics.rs
+++ b/lib/llm/src/http/service/metrics.rs
@@ -1270,9 +1270,6 @@ pub fn router(registry: Registry, path: Option<String>) -> (Vec<RouteDoc>, Route

 /// Unified metrics handler
 async fn handler_metrics(State(state): State<Arc<MetricsHandlerState>>) -> impl IntoResponse {
-    // Gather and encode metrics
-    // Note: If nim_on_demand is enabled, the NimMetricsCollector registered with the registry
-    // will automatically call poll_nim_backend_stats when gather() is invoked
    let encoder = prometheus::TextEncoder::new();
    let metric_families = state.registry.gather();
    let mut buffer = vec![];

--- a/lib/llm/src/http/service/openapi_docs.rs
+++ b/lib/llm/src/http/service/openapi_docs.rs
@@ -69,7 +69,7 @@ struct ApiDoc;
 /// Generate OpenAPI specification from route documentation
 ///
 /// This is the core helper used both by the embedded Swagger UI and by
-/// external tools (for example CI or NIM) which need to materialize the
+/// external tools (for example CI) which need to materialize the
 /// same frontend OpenAPI specification without running the HTTP service.
 pub fn generate_openapi_spec(route_docs: &[RouteDoc]) -> utoipa::openapi::OpenApi {
    let mut openapi = ApiDoc::openapi();

--- a/lib/llm/src/http/service/service_v2.rs
+++ b/lib/llm/src/http/service/service_v2.rs
@@ -25,7 +25,6 @@ use derive_builder::Builder;
 use dynamo_runtime::config::environment_names::llm as env_llm;
 use dynamo_runtime::discovery::{Discovery, KVStoreDiscovery};
 use dynamo_runtime::logging::make_request_span;
-use dynamo_runtime::metrics::prometheus_names::name_prefix;
 use dynamo_runtime::storage::kv;
 use std::net::SocketAddr;
 use tokio::task::JoinHandle;
@@ -162,12 +161,6 @@ pub struct HttpService {
    tls_cert_path: Option<PathBuf>,
    tls_key_path: Option<PathBuf>,
    route_docs: Vec<RouteDoc>,
-
-    // DEPRECATED: To be removed after custom backends migrate to Dynamo backend.
-    pub(crate) custom_backend_namespace_component_endpoint: Option<String>,
-    pub(crate) custom_backend_metrics_polling_interval: Option<f64>,
-    pub(crate) custom_backend_registry:
-        Option<Arc<super::custom_backend_metrics::CustomBackendMetricsRegistry>>,
 }

 #[derive(Clone, Builder)]
@@ -207,13 +200,6 @@ pub struct HttpServiceConfig {

    #[builder(default)]
    store: kv::Manager,
-
-    // DEPRECATED: To be removed after custom backends migrate to Dynamo backend.
-    #[builder(default = "None")]
-    custom_backend_namespace_component_endpoint: Option<String>,
-
-    #[builder(default = "None")]
-    custom_backend_metrics_polling_interval: Option<f64>,
 }

 impl HttpService {
@@ -405,22 +391,6 @@ impl HttpServiceConfigBuilder {
            tracing::warn!("Failed to register worker timing metrics: {}", e);
        }

-        // DEPRECATED: To be removed after custom backends migrate to Dynamo backend.
-        // Setup custom backend metrics if configured
-        let custom_backend_registry =
-            if config.custom_backend_namespace_component_endpoint.is_some()
-                && config.custom_backend_metrics_polling_interval.is_some()
-            {
-                Some(Arc::new(
-                    super::custom_backend_metrics::CustomBackendMetricsRegistry::new(
-                        name_prefix::COMPONENT.to_string(),
-                        registry.clone(),
-                    ),
-                ))
-            } else {
-                None
-            };
-
        let mut router = axum::Router::new();

        let mut all_docs = Vec::new();
@@ -490,10 +460,6 @@ impl HttpServiceConfigBuilder {
            tls_cert_path: config.tls_cert_path,
            tls_key_path: config.tls_key_path,
            route_docs: all_docs,
-            custom_backend_namespace_component_endpoint: config
-                .custom_backend_namespace_component_endpoint,
-            custom_backend_metrics_polling_interval: config.custom_backend_metrics_polling_interval,
-            custom_backend_registry,
        })
    }

@@ -502,17 +468,6 @@ impl HttpServiceConfigBuilder {
        self
    }

-    // DEPRECATED: To be removed after custom backends migrate to Dynamo backend.
-    pub fn with_custom_backend_config(
-        mut self,
-        namespace_component_endpoint: Option<String>,
-        polling_interval: Option<f64>,
-    ) -> Self {
-        self.custom_backend_namespace_component_endpoint = Some(namespace_component_endpoint);
-        self.custom_backend_metrics_polling_interval = Some(polling_interval);
-        self
-    }
-
    fn get_endpoints_router(
        state: Arc<State>,
        request_template: &Option<RequestTemplate>,

--- a/lib/llm/src/local_model.rs
+++ b/lib/llm/src/local_model.rs
@@ -56,8 +56,6 @@ pub struct LocalModelBuilder {
    user_data: Option<serde_json::Value>,
    custom_template_path: Option<PathBuf>,
    namespace: Option<String>,
-    custom_backend_metrics_endpoint: Option<String>,
-    custom_backend_metrics_polling_interval: Option<f64>,
    media_decoder: Option<MediaDecoder>,
    media_fetcher: Option<MediaFetcher>,
 }
@@ -85,8 +83,6 @@ impl Default for LocalModelBuilder {
            user_data: Default::default(),
            custom_template_path: Default::default(),
            namespace: Default::default(),
-            custom_backend_metrics_endpoint: Default::default(),
-            custom_backend_metrics_polling_interval: Default::default(),
            media_decoder: Default::default(),
            media_fetcher: Default::default(),
        }
@@ -199,16 +195,6 @@ impl LocalModelBuilder {
        self
    }

-    pub fn custom_backend_metrics_endpoint(&mut self, endpoint: Option<String>) -> &mut Self {
-        self.custom_backend_metrics_endpoint = endpoint;
-        self
-    }
-
-    pub fn custom_backend_metrics_polling_interval(&mut self, interval: Option<f64>) -> &mut Self {
-        self.custom_backend_metrics_polling_interval = interval;
-        self
-    }
-
    pub fn media_decoder(&mut self, media_decoder: Option<MediaDecoder>) -> &mut Self {
        self.media_decoder = media_decoder;
        self
@@ -304,9 +290,6 @@ impl LocalModelBuilder {
                router_config: self.router_config.take().unwrap_or_default(),
                runtime_config: self.runtime_config.clone(),
                namespace: self.namespace.clone(),
-                custom_backend_metrics_endpoint: self.custom_backend_metrics_endpoint.clone(),
-                custom_backend_metrics_polling_interval: self
-                    .custom_backend_metrics_polling_interval,
            });
        }

@@ -358,8 +341,6 @@ impl LocalModelBuilder {
            router_config: self.router_config.take().unwrap_or_default(),
            runtime_config: self.runtime_config.clone(),
            namespace: self.namespace.clone(),
-            custom_backend_metrics_endpoint: self.custom_backend_metrics_endpoint.clone(),
-            custom_backend_metrics_polling_interval: self.custom_backend_metrics_polling_interval,
        })
    }
 }
@@ -378,8 +359,6 @@ pub struct LocalModel {
    router_config: RouterConfig,
    runtime_config: ModelRuntimeConfig,
    namespace: Option<String>,
-    custom_backend_metrics_endpoint: Option<String>,
-    custom_backend_metrics_polling_interval: Option<f64>,
 }

 impl LocalModel {
@@ -447,14 +426,6 @@ impl LocalModel {
        self.namespace.as_deref()
    }

-    pub fn custom_backend_metrics_endpoint(&self) -> Option<&str> {
-        self.custom_backend_metrics_endpoint.as_deref()
-    }
-
-    pub fn custom_backend_metrics_polling_interval(&self) -> Option<f64> {
-        self.custom_backend_metrics_polling_interval
-    }
-
    /// An endpoint to identify this model by.
    pub fn endpoint_id(&self) -> &EndpointId {
        &self.endpoint_id

--- a/lib/llm/src/protocols/openai/completions/aggregator.rs
+++ b/lib/llm/src/protocols/openai/completions/aggregator.rs
@@ -71,8 +71,6 @@ impl DeltaAggregator {
                };

                if aggregator.error.is_none() && delta.data.is_some() {
-                    // note: we could extract annotations here and add them to the aggregator
-                    // to be return as part of the NIM Response Extension
                    // TODO(#14) - Aggregate Annotation

                    // these are cheap to move so we do it every time since we are consuming the delta

--- a/lib/llm/tests/preprocessor.rs
+++ b/lib/llm/tests/preprocessor.rs
@@ -15,15 +15,12 @@ use rstest::rstest;

 use std::path::PathBuf;

-/// ----------------- NOTE ---------------
-/// Currently ModelDeploymentCard does support downloading models using nim-hub.
-/// As a temporary workaround, we will download the models from Hugging Face to a local cache
-/// directory in `tests/data/sample-models`. These tests require a Hugging Face token to be
-/// set in the environment variable `HF_TOKEN`.
-/// The model is downloaded and cached in `tests/data/sample-models` directory.
-/// make sure the token has access to `meta-llama/Llama-3.1-70B-Instruct` model
 /// Gets the HF_TOKEN environment variable if it exists and is not empty.
 ///
+/// These tests require a Hugging Face token to be set in the environment variable `HF_TOKEN`.
+/// The model is downloaded and cached in `tests/data/sample-models` directory.
+/// Make sure the token has access to `meta-llama/Llama-3.1-70B-Instruct` model.
+///
 /// This function checks for the presence of the `HF_TOKEN` environment variable
 /// and validates that it's not empty or whitespace-only. The token is used for
 /// downloading models from Hugging Face to a local cache directory in
@@ -57,7 +54,6 @@ async fn make_mdc_from_repo(
    hf_revision: &str,
    mixins: Option<Vec<PromptContextMixin>>,
 ) -> ModelDeploymentCard {
-    //TODO: remove this once we have nim-hub support. See the NOTE above.
    let downloaded_path = maybe_download_model(local_path, hf_repo, hf_revision).await;
    let display_name = format!("{}--{}", hf_repo, hf_revision);
    let mut mdc = ModelDeploymentCard::load_from_disk(downloaded_path, None).unwrap();
@@ -110,30 +106,6 @@ async fn make_mdcs() -> Vec<ModelDeploymentCard> {
    ]
 }

-// fn load_nim_mdcs() -> Vec<ModelDeploymentCard> {
-//     // get all .json files from test/data/model_deployment_cards/nim
-//     std::fs::read_dir("tests/data/model_deployment_cards/nim")
-//         .unwrap()
-//         .map(|res| res.map(|e| e.path()).unwrap().clone())
-//         .filter(|path| path.extension().unwrap() == "json")
-//         .map(|path| ModelDeploymentCard::load_from_json_file(path).unwrap())
-//         .collect::<Vec<_>>()
-// }
-
-// #[ignore]
-// #[tokio::test]
-// async fn create_mdc_from_repo() {
-//     for repo in NGC_MODEL_REPOS.iter() {
-//         println!("Creating MDC for {}", repo);
-//         let mdc = make_mdc_from_repo(repo).await;
-//         mdc.save_to_json_file(&format!(
-//             "tests/data/model_deployment_cards/nim/{}.json",
-//             Slug::slugify(repo)
-//         ))
-//         .unwrap();
-//     }
-// }
-
 const SINGLE_CHAT_MESSAGE: &str = r#"
 [
    {