feat: KServe gRPC support (#2638)

91a459c0 · GuanLuo · GitHub · ef535edb · 91a459c0 · 91a459c0
Unverified Commit 91a459c0 authored Aug 26, 2025 by GuanLuo Committed by GitHub Aug 26, 2025
19 changed files
--- a/.github/workflows/pre-merge-rust.yml
+++ b/.github/workflows/pre-merge-rust.yml
@@ -100,11 +100,11 @@ jobs:
    # Have an explicit step to build tests first to separate time spent on build vs execution.
    - name: Compile Tests
      working-directory: ${{ matrix.dir }}
-      run: cargo test --locked --no-run
+      run: cargo test --locked --no-run --target-dir ${HOME}/tmp || df -h
    - name: Run Doc Tests
      working-directory: ${{ matrix.dir }}
      run: cargo doc --no-deps && cargo test --locked --doc
    - name: Run Unit Tests
      working-directory: ${{ matrix.dir }}
      # NOTE: --all-targets doesn't run doc tests
-      run: cargo test --locked --all-targets
+      run: cargo test --locked --all-targets --target-dir ${HOME}/tmp
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/components/frontend/src/dynamo/frontend/main.py
+++ b/components/frontend/src/dynamo/frontend/main.py
@@ -176,6 +176,12 @@ def parse_args():
        default=None,
        help="Prefix for Dynamo frontend metrics. If unset, uses DYN_METRICS_PREFIX env var or 'dynamo_frontend'.",
    )
+    parser.add_argument(
+        "--kserve-grpc-server",
+        action="store_true",
+        default=False,
+        help="Start KServe gRPC server.",
+    )
    flags = parser.parse_args()
@@ -246,6 +252,8 @@ async def async_main():
    try:
        if flags.interactive:
            await run_input(runtime, "text", engine)
+        elif flags.kserve_grpc_server:
+            await run_input(runtime, "grpc", engine)
        else:
            await run_input(runtime, "http", engine)
    except asyncio.exceptions.CancelledError:

--- a/launch/dynamo-run/src/lib.rs
+++ b/launch/dynamo-run/src/lib.rs
@@ -83,7 +83,6 @@ pub async fn run(
        rt.clone(),
    )
    .await?;
    //
    // Run in from an input
    //

--- a/launch/dynamo-run/src/main.rs
+++ b/launch/dynamo-run/src/main.rs
@@ -26,7 +26,7 @@ Example:
 See `docs/guides/dynamo_run.md` in the repo for full details.
 "#;
-const USAGE: &str = "USAGE: dynamo-run in=[http|text|dyn://<path>|batch:<folder>] out=ENGINE_LIST|auto|dyn://<path> [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--context-length=N] [--kv-cache-block-size=16] [--extra-engine-args=args.json] [--static-worker] [--router-mode random|round-robin|kv] [--kv-overlap-score-weight=2.0] [--router-temperature=0.0] [--use-kv-events] [--max-num-batched-tokens=1.0] [--migration-limit=0] [--verbosity (-v|-vv)]";
+const USAGE: &str = "USAGE: dynamo-run in=[http|grpc|text|dyn://<path>|batch:<folder>] out=ENGINE_LIST|auto|dyn://<path> [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--context-length=N] [--kv-cache-block-size=16] [--extra-engine-args=args.json] [--static-worker] [--router-mode random|round-robin|kv] [--kv-overlap-score-weight=2.0] [--router-temperature=0.0] [--use-kv-events] [--max-num-batched-tokens=1.0] [--migration-limit=0] [--verbosity (-v|-vv)]";
 fn main() -> anyhow::Result<()> {
    // Set log level based on verbosity flag

--- a/lib/bindings/python/Cargo.lock
+++ b/lib/bindings/python/Cargo.lock
--- a/lib/llm/Cargo.toml
+++ b/lib/llm/Cargo.toml
@@ -110,6 +110,13 @@ tower-http = {workspace = true}
 rustls = { version = "0.23" }
+# grpc-service
+# ping version to 0.13.1 so it depends on prost 0.13.5
+# which is used across other libraries
+tonic = { version = "0.13.1" }
+# Request prost specifically so tonic-build properly compiles protobuf message
+prost = { version = "0.13.5" }
 # tokenizers
 tokenizers = { version = "0.21.4", default-features = false, features = [
  "onig",
@@ -157,3 +164,6 @@ insta = { version = "1.41", features = [
 ] }
 aligned-vec = "0.6.4"
 lazy_static = "1.4"
+[build-dependencies]
+tonic-build = { version = "0.13.1"}
\ No newline at end of file
--- a/lib/llm/build.rs
+++ b/lib/llm/build.rs
@@ -13,8 +13,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-fn main() {
+fn main() -> Result<(), Box<dyn std::error::Error>> {
    println!("cargo:warning=Building with CUDA KV off");
+    build_protos()
+}
+fn build_protos() -> Result<(), Box<dyn std::error::Error>> {
+    tonic_build::compile_protos("src/grpc/protos/kserve.proto")?;
+    Ok(())
 }
 // NOTE: Preserving this build.rs for reference. We may want to re-enable

--- a/lib/llm/src/entrypoint/input.rs
+++ b/lib/llm/src/entrypoint/input.rs
@@ -18,6 +18,7 @@ pub mod batch;
 mod common;
 pub use common::build_routed_pipeline;
 pub mod endpoint;
+pub mod grpc;
 pub mod http;
 pub mod text;
@@ -43,6 +44,9 @@ pub enum Input {
    /// Batch mode. Run all the prompts, write the outputs, exit.
    Batch(PathBuf),
+    // Run an KServe compatible gRPC server
+    Grpc,
 }
 impl FromStr for Input {
@@ -59,6 +63,7 @@ impl TryFrom<&str> for Input {
    fn try_from(s: &str) -> anyhow::Result<Self> {
        match s {
            "http" => Ok(Input::Http),
+            "grpc" => Ok(Input::Grpc),
            "text" => Ok(Input::Text),
            "stdin" => Ok(Input::Stdin),
            endpoint_path if endpoint_path.starts_with(ENDPOINT_SCHEME) => {
@@ -77,6 +82,7 @@ impl fmt::Display for Input {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let s = match self {
            Input::Http => "http",
+            Input::Grpc => "grpc",
            Input::Text => "text",
            Input::Stdin => "stdin",
            Input::Endpoint(path) => path,
@@ -113,6 +119,9 @@ pub async fn run_input(
        Input::Http => {
            http::run(runtime, engine_config).await?;
        }
+        Input::Grpc => {
+            grpc::run(runtime, engine_config).await?;
+        }
        Input::Text => {
            text::run(runtime, None, engine_config).await?;
        }

--- a/lib/llm/src/entrypoint/input/grpc.rs
+++ b/lib/llm/src/entrypoint/input/grpc.rs
+// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+use std::sync::Arc;
+use crate::{
+    discovery::{MODEL_ROOT_PATH, ModelManager, ModelWatcher},
+    engines::StreamingEngineAdapter,
+    entrypoint::{self, EngineConfig, input::common},
+    grpc::service::kserve,
+    kv_router::KvRouterConfig,
+    types::openai::{
+        chat_completions::{NvCreateChatCompletionRequest, NvCreateChatCompletionStreamResponse},
+        completions::{NvCreateCompletionRequest, NvCreateCompletionResponse},
+    },
+};
+use dynamo_runtime::transports::etcd;
+use dynamo_runtime::{DistributedRuntime, Runtime};
+use dynamo_runtime::{distributed::DistributedConfig, pipeline::RouterMode};
+/// Build and run an KServe gRPC service
+pub async fn run(runtime: Runtime, engine_config: EngineConfig) -> anyhow::Result<()> {
+    let mut grpc_service_builder = kserve::KserveService::builder()
+        .port(engine_config.local_model().http_port()) // [WIP] generalize port..
+        .with_request_template(engine_config.local_model().request_template());
+    let grpc_service = match engine_config {
+        EngineConfig::Dynamic(_) => {
+            let distributed_runtime = DistributedRuntime::from_settings(runtime.clone()).await?;
+            let etcd_client = distributed_runtime.etcd_client();
+            // This allows the /health endpoint to query etcd for active instances
+            grpc_service_builder = grpc_service_builder.with_etcd_client(etcd_client.clone());
+            let grpc_service = grpc_service_builder.build()?;
+            match etcd_client {
+                Some(ref etcd_client) => {
+                    let router_config = engine_config.local_model().router_config();
+                    // Listen for models registering themselves in etcd, add them to gRPC service
+                    run_watcher(
+                        distributed_runtime,
+                        grpc_service.state().manager_clone(),
+                        etcd_client.clone(),
+                        MODEL_ROOT_PATH,
+                        router_config.router_mode,
+                        Some(router_config.kv_router_config),
+                        router_config.busy_threshold,
+                    )
+                    .await?;
+                }
+                None => {
+                    // Static endpoints don't need discovery
+                }
+            }
+            grpc_service
+        }
+        EngineConfig::StaticRemote(local_model) => {
+            let card = local_model.card();
+            let router_mode = local_model.router_config().router_mode;
+            let dst_config = DistributedConfig::from_settings(true); // true means static
+            let distributed_runtime = DistributedRuntime::new(runtime.clone(), dst_config).await?;
+            let grpc_service = grpc_service_builder.build()?;
+            let manager = grpc_service.model_manager();
+            let endpoint_id = local_model.endpoint_id();
+            let component = distributed_runtime
+                .namespace(&endpoint_id.namespace)?
+                .component(&endpoint_id.component)?;
+            let client = component.endpoint(&endpoint_id.name).client().await?;
+            let kv_chooser = if router_mode == RouterMode::KV {
+                Some(
+                    manager
+                        .kv_chooser_for(
+                            local_model.display_name(),
+                            &component,
+                            card.kv_cache_block_size,
+                            Some(local_model.router_config().kv_router_config),
+                        )
+                        .await?,
+                )
+            } else {
+                None
+            };
+            let chat_engine = entrypoint::build_routed_pipeline::<
+                NvCreateChatCompletionRequest,
+                NvCreateChatCompletionStreamResponse,
+            >(card, &client, router_mode, None, kv_chooser.clone())
+            .await?;
+            manager.add_chat_completions_model(local_model.display_name(), chat_engine)?;
+            let completions_engine = entrypoint::build_routed_pipeline::<
+                NvCreateCompletionRequest,
+                NvCreateCompletionResponse,
+            >(card, &client, router_mode, None, kv_chooser)
+            .await?;
+            manager.add_completions_model(local_model.display_name(), completions_engine)?;
+            grpc_service
+        }
+        EngineConfig::StaticFull { engine, model, .. } => {
+            let grpc_service = grpc_service_builder.build()?;
+            let engine = Arc::new(StreamingEngineAdapter::new(engine));
+            let manager = grpc_service.model_manager();
+            manager.add_completions_model(model.service_name(), engine.clone())?;
+            manager.add_chat_completions_model(model.service_name(), engine)?;
+            grpc_service
+        }
+        EngineConfig::StaticCore {
+            engine: inner_engine,
+            model,
+            ..
+        } => {
+            let grpc_service = grpc_service_builder.build()?;
+            let manager = grpc_service.model_manager();
+            let chat_pipeline = common::build_pipeline::<
+                NvCreateChatCompletionRequest,
+                NvCreateChatCompletionStreamResponse,
+            >(model.card(), inner_engine.clone())
+            .await?;
+            manager.add_chat_completions_model(model.service_name(), chat_pipeline)?;
+            let cmpl_pipeline = common::build_pipeline::<
+                NvCreateCompletionRequest,
+                NvCreateCompletionResponse,
+            >(model.card(), inner_engine)
+            .await?;
+            manager.add_completions_model(model.service_name(), cmpl_pipeline)?;
+            grpc_service
+        }
+    };
+    grpc_service.run(runtime.primary_token()).await?;
+    runtime.shutdown(); // Cancel primary token
+    Ok(())
+}
+/// Spawns a task that watches for new models in etcd at network_prefix,
+/// and registers them with the ModelManager so that the HTTP service can use them.
+async fn run_watcher(
+    runtime: DistributedRuntime,
+    model_manager: Arc<ModelManager>,
+    etcd_client: etcd::Client,
+    network_prefix: &str,
+    router_mode: RouterMode,
+    kv_router_config: Option<KvRouterConfig>,
+    busy_threshold: Option<f64>,
+) -> anyhow::Result<()> {
+    let watch_obj = ModelWatcher::new(
+        runtime,
+        model_manager,
+        router_mode,
+        kv_router_config,
+        busy_threshold,
+    );
+    tracing::info!("Watching for remote model at {network_prefix}");
+    let models_watcher = etcd_client.kv_get_and_watch_prefix(network_prefix).await?;
+    let (_prefix, _watcher, receiver) = models_watcher.dissolve();
+    // [gluo NOTE] This is different from http::run_watcher where it alters the HTTP service
+    // endpoint being exposed, gRPC doesn't have the same concept as the KServe service
+    // only has one kind of inference endpoint.
+    // Pass the sender to the watcher
+    let _watcher_task = tokio::spawn(async move {
+        watch_obj.watch(receiver).await;
+    });
+    Ok(())
+}
--- a/lib/llm/src/grpc.rs
+++ b/lib/llm/src/grpc.rs
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+pub mod service;
--- a/lib/llm/src/grpc/protos/kserve.proto
+++ b/lib/llm/src/grpc/protos/kserve.proto
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+syntax = "proto3";
+package inference;
+//@@.. cpp:namespace:: inference
+import "model_config.proto";
+//@@
+//@@.. cpp:var:: service InferenceService
+//@@
+//@@   Inference Server GRPC endpoints.
+//@@
+service GRPCInferenceService
+{
+  //@@  .. cpp:var:: rpc ModelMetadata(ModelMetadataRequest) returns
+  //@@       (ModelMetadataResponse)
+  //@@
+  //@@     Get model metadata.
+  //@@
+  rpc ModelMetadata(ModelMetadataRequest) returns (ModelMetadataResponse) {}
+  //@@  .. cpp:var:: rpc ModelInfer(ModelInferRequest) returns
+  //@@       (ModelInferResponse)
+  //@@
+  //@@     Perform inference using a specific model.
+  //@@
+  rpc ModelInfer(ModelInferRequest) returns (ModelInferResponse) {}
+  //@@  .. cpp:var:: rpc ModelStreamInfer(stream ModelInferRequest) returns
+  //@@       (stream ModelStreamInferResponse)
+  //@@
+  //@@     Perform streaming inference.
+  //@@
+  rpc ModelStreamInfer(stream ModelInferRequest) returns (stream ModelStreamInferResponse) {}
+  //@@  .. cpp:var:: rpc ModelConfig(ModelConfigRequest) returns
+  //@@       (ModelConfigResponse)
+  //@@
+  //@@     Get model configuration.
+  //@@
+  rpc ModelConfig(ModelConfigRequest) returns (ModelConfigResponse) {}
+}
+//@@
+//@@.. cpp:var:: message ModelMetadataRequest
+//@@
+//@@   Request message for ModelMetadata.
+//@@
+message ModelMetadataRequest
+{
+  //@@
+  //@@  .. cpp:var:: string name
+  //@@
+  //@@     The name of the model.
+  //@@
+  string name = 1;
+  //@@  .. cpp:var:: string version
+  //@@
+  //@@     The version of the model to check for readiness. If not
+  //@@     given the server will choose a version based on the
+  //@@     model and internal policy.
+  //@@
+  string version = 2;
+}
+//@@
+//@@.. cpp:var:: message ModelMetadataResponse
+//@@
+//@@   Response message for ModelMetadata.
+//@@
+message ModelMetadataResponse
+{
+  //@@
+  //@@  .. cpp:var:: message TensorMetadata
+  //@@
+  //@@     Metadata for a tensor.
+  //@@
+  message TensorMetadata
+  {
+    //@@
+    //@@    .. cpp:var:: string name
+    //@@
+    //@@       The tensor name.
+    //@@
+    string name = 1;
+    //@@
+    //@@    .. cpp:var:: string datatype
+    //@@
+    //@@       The tensor data type.
+    //@@
+    string datatype = 2;
+    //@@
+    //@@    .. cpp:var:: int64 shape (repeated)
+    //@@
+    //@@       The tensor shape. A variable-size dimension is represented
+    //@@       by a -1 value.
+    //@@
+    repeated int64 shape = 3;
+  }
+  //@@
+  //@@  .. cpp:var:: string name
+  //@@
+  //@@     The model name.
+  //@@
+  string name = 1;
+  //@@
+  //@@  .. cpp:var:: string versions (repeated)
+  //@@
+  //@@     The versions of the model.
+  //@@
+  repeated string versions = 2;
+  //@@
+  //@@  .. cpp:var:: string platform
+  //@@
+  //@@     The model's platform.
+  //@@
+  string platform = 3;
+  //@@
+  //@@  .. cpp:var:: TensorMetadata inputs (repeated)
+  //@@
+  //@@     The model's inputs.
+  //@@
+  repeated TensorMetadata inputs = 4;
+  //@@
+  //@@  .. cpp:var:: TensorMetadata outputs (repeated)
+  //@@
+  //@@     The model's outputs.
+  //@@
+  repeated TensorMetadata outputs = 5;
+}
+//@@
+//@@.. cpp:var:: message InferParameter
+//@@
+//@@   An inference parameter value.
+//@@
+message InferParameter
+{
+  //@@  .. cpp:var:: oneof parameter_choice
+  //@@
+  //@@     The parameter value can be a string, an int64,
+  //@@     an uint64, a double, or a boolean
+  //@@
+  //@@     Note: double and uint64 are currently
+  //@@           placeholders for future use and
+  //@@           are not supported for custom parameters
+  //@@
+  oneof parameter_choice
+  {
+    //@@    .. cpp:var:: bool bool_param
+    //@@
+    //@@       A boolean parameter value.
+    //@@
+    bool bool_param = 1;
+    //@@    .. cpp:var:: int64 int64_param
+    //@@
+    //@@       An int64 parameter value.
+    //@@
+    int64 int64_param = 2;
+    //@@    .. cpp:var:: string string_param
+    //@@
+    //@@       A string parameter value.
+    //@@
+    string string_param = 3;
+    //@@    .. cpp:var:: double double_param
+    //@@
+    //@@       A double parameter value.
+    //@@
+    double double_param = 4;
+    //@@    .. cpp:var:: uint64 uint64_param
+    //@@
+    //@@       A uint64 parameter value.
+    //@@
+    //@@       Not supported for custom parameters
+    //@@
+    uint64 uint64_param = 5;
+  }
+}
+//@@
+//@@.. cpp:var:: message InferTensorContents
+//@@
+//@@   The data contained in a tensor represented by the repeated type
+//@@   that matches the tensor's data type. Protobuf oneof is not used
+//@@   because oneofs cannot contain repeated fields.
+//@@
+message InferTensorContents
+{
+  //@@
+  //@@  .. cpp:var:: bool bool_contents (repeated)
+  //@@
+  //@@     Representation for BOOL data type. The size must match what is
+  //@@     expected by the tensor's shape. The contents must be the flattened,
+  //@@     one-dimensional, row-major order of the tensor elements.
+  //@@
+  repeated bool bool_contents = 1;
+  //@@
+  //@@  .. cpp:var:: int32 int_contents (repeated)
+  //@@
+  //@@     Representation for INT8, INT16, and INT32 data types. The size
+  //@@     must match what is expected by the tensor's shape. The contents
+  //@@     must be the flattened, one-dimensional, row-major order of the
+  //@@     tensor elements.
+  //@@
+  repeated int32 int_contents = 2;
+  //@@
+  //@@  .. cpp:var:: int64 int64_contents (repeated)
+  //@@
+  //@@     Representation for INT64 data types. The size must match what
+  //@@     is expected by the tensor's shape. The contents must be the
+  //@@     flattened, one-dimensional, row-major order of the tensor elements.
+  //@@
+  repeated int64 int64_contents = 3;
+  //@@
+  //@@  .. cpp:var:: uint32 uint_contents (repeated)
+  //@@
+  //@@     Representation for UINT8, UINT16, and UINT32 data types. The size
+  //@@     must match what is expected by the tensor's shape. The contents
+  //@@     must be the flattened, one-dimensional, row-major order of the
+  //@@     tensor elements.
+  //@@
+  repeated uint32 uint_contents = 4;
+  //@@
+  //@@  .. cpp:var:: uint64 uint64_contents (repeated)
+  //@@
+  //@@     Representation for UINT64 data types. The size must match what
+  //@@     is expected by the tensor's shape. The contents must be the
+  //@@     flattened, one-dimensional, row-major order of the tensor elements.
+  //@@
+  repeated uint64 uint64_contents = 5;
+  //@@
+  //@@  .. cpp:var:: float fp32_contents (repeated)
+  //@@
+  //@@     Representation for FP32 data type. The size must match what is
+  //@@     expected by the tensor's shape. The contents must be the flattened,
+  //@@     one-dimensional, row-major order of the tensor elements.
+  //@@
+  repeated float fp32_contents = 6;
+  //@@
+  //@@  .. cpp:var:: double fp64_contents (repeated)
+  //@@
+  //@@     Representation for FP64 data type. The size must match what is
+  //@@     expected by the tensor's shape. The contents must be the flattened,
+  //@@     one-dimensional, row-major order of the tensor elements.
+  //@@
+  repeated double fp64_contents = 7;
+  //@@
+  //@@  .. cpp:var:: bytes bytes_contents (repeated)
+  //@@
+  //@@     Representation for BYTES data type. The size must match what is
+  //@@     expected by the tensor's shape. The contents must be the flattened,
+  //@@     one-dimensional, row-major order of the tensor elements.
+  //@@
+  repeated bytes bytes_contents = 8;
+}
+//@@
+//@@.. cpp:var:: message ModelInferRequest
+//@@
+//@@   Request message for ModelInfer.
+//@@
+message ModelInferRequest
+{
+  //@@
+  //@@  .. cpp:var:: message InferInputTensor
+  //@@
+  //@@     An input tensor for an inference request.
+  //@@
+  message InferInputTensor
+  {
+    //@@
+    //@@    .. cpp:var:: string name
+    //@@
+    //@@       The tensor name.
+    //@@
+    string name = 1;
+    //@@
+    //@@    .. cpp:var:: string datatype
+    //@@
+    //@@       The tensor data type.
+    //@@
+    string datatype = 2;
+    //@@
+    //@@    .. cpp:var:: int64 shape (repeated)
+    //@@
+    //@@       The tensor shape.
+    //@@
+    repeated int64 shape = 3;
+    //@@    .. cpp:var:: map<string,InferParameter> parameters
+    //@@
+    //@@       Optional inference input tensor parameters.
+    //@@
+    map<string, InferParameter> parameters = 4;
+    //@@    .. cpp:var:: InferTensorContents contents
+    //@@
+    //@@       The tensor contents using a data-type format. This field
+    //@@       must not be specified if tensor contents are being specified
+    //@@       in ModelInferRequest.raw_input_contents.
+    //@@
+    InferTensorContents contents = 5;
+  }
+  //@@
+  //@@  .. cpp:var:: message InferRequestedOutputTensor
+  //@@
+  //@@     An output tensor requested for an inference request.
+  //@@
+  message InferRequestedOutputTensor
+  {
+    //@@
+    //@@    .. cpp:var:: string name
+    //@@
+    //@@       The tensor name.
+    //@@
+    string name = 1;
+    //@@    .. cpp:var:: map<string,InferParameter> parameters
+    //@@
+    //@@       Optional requested output tensor parameters.
+    //@@
+    map<string, InferParameter> parameters = 2;
+  }
+  //@@  .. cpp:var:: string model_name
+  //@@
+  //@@     The name of the model to use for inferencing.
+  //@@
+  string model_name = 1;
+  //@@  .. cpp:var:: string model_version
+  //@@
+  //@@     The version of the model to use for inference. If not
+  //@@     given the latest/most-recent version of the model is used.
+  //@@
+  string model_version = 2;
+  //@@  .. cpp:var:: string id
+  //@@
+  //@@     Optional identifier for the request. If specified will be
+  //@@     returned in the response.
+  //@@
+  string id = 3;
+  //@@  .. cpp:var:: map<string,InferParameter> parameters
+  //@@
+  //@@     Optional inference parameters.
+  //@@
+  map<string, InferParameter> parameters = 4;
+  //@@
+  //@@  .. cpp:var:: InferInputTensor inputs (repeated)
+  //@@
+  //@@     The input tensors for the inference.
+  //@@
+  repeated InferInputTensor inputs = 5;
+  //@@
+  //@@  .. cpp:var:: InferRequestedOutputTensor outputs (repeated)
+  //@@
+  //@@     The requested output tensors for the inference. Optional, if not
+  //@@     specified all outputs specified in the model config will be
+  //@@     returned.
+  //@@
+  repeated InferRequestedOutputTensor outputs = 6;
+  //@@
+  //@@  .. cpp:var:: bytes raw_input_contents
+  //@@
+  //@@     The data contained in an input tensor can be represented in
+  //@@     "raw" bytes form or in the repeated type that matches the
+  //@@     tensor's data type. Using the "raw" bytes form will
+  //@@     typically allow higher performance due to the way protobuf
+  //@@     allocation and reuse interacts with GRPC. For example, see
+  //@@     https://github.com/grpc/grpc/issues/23231.
+  //@@
+  //@@     To use the raw representation 'raw_input_contents' must be
+  //@@     initialized with data for each tensor in the same order as
+  //@@     'inputs'. For each tensor, the size of this content must
+  //@@     match what is expected by the tensor's shape and data
+  //@@     type. The raw data must be the flattened, one-dimensional,
+  //@@     row-major order of the tensor elements without any stride
+  //@@     or padding between the elements. Note that the FP16 and BF16 data
+  //@@     types must be represented as raw content as there is no
+  //@@     specific data type for a 16-bit float type.
+  //@@
+  //@@     If this field is specified then InferInputTensor::contents
+  //@@     must not be specified for any input tensor.
+  //@@
+  repeated bytes raw_input_contents = 7;
+}
+//@@
+//@@.. cpp:var:: message ModelInferResponse
+//@@
+//@@   Response message for ModelInfer.
+//@@
+message ModelInferResponse
+{
+  //@@
+  //@@  .. cpp:var:: message InferOutputTensor
+  //@@
+  //@@     An output tensor returned for an inference request.
+  //@@
+  message InferOutputTensor
+  {
+    //@@
+    //@@    .. cpp:var:: string name
+    //@@
+    //@@       The tensor name.
+    //@@
+    string name = 1;
+    //@@
+    //@@    .. cpp:var:: string datatype
+    //@@
+    //@@       The tensor data type.
+    //@@
+    string datatype = 2;
+    //@@
+    //@@    .. cpp:var:: int64 shape (repeated)
+    //@@
+    //@@       The tensor shape.
+    //@@
+    repeated int64 shape = 3;
+    //@@    .. cpp:var:: map<string,InferParameter> parameters
+    //@@
+    //@@       Optional output tensor parameters.
+    //@@
+    map<string, InferParameter> parameters = 4;
+    //@@    .. cpp:var:: InferTensorContents contents
+    //@@
+    //@@       The tensor contents using a data-type format. This field
+    //@@       must not be specified if tensor contents are being specified
+    //@@       in ModelInferResponse.raw_output_contents.
+    //@@
+    InferTensorContents contents = 5;
+  }
+  //@@  .. cpp:var:: string model_name
+  //@@
+  //@@     The name of the model used for inference.
+  //@@
+  string model_name = 1;
+  //@@  .. cpp:var:: string model_version
+  //@@
+  //@@     The version of the model used for inference.
+  //@@
+  string model_version = 2;
+  //@@  .. cpp:var:: string id
+  //@@
+  //@@     The id of the inference request if one was specified.
+  //@@
+  string id = 3;
+  //@@  .. cpp:var:: map<string,InferParameter> parameters
+  //@@
+  //@@     Optional inference response parameters.
+  //@@
+  map<string, InferParameter> parameters = 4;
+  //@@
+  //@@  .. cpp:var:: InferOutputTensor outputs (repeated)
+  //@@
+  //@@     The output tensors holding inference results.
+  //@@
+  repeated InferOutputTensor outputs = 5;
+  //@@
+  //@@  .. cpp:var:: bytes raw_output_contents
+  //@@
+  //@@     The data contained in an output tensor can be represented in
+  //@@     "raw" bytes form or in the repeated type that matches the
+  //@@     tensor's data type. Using the "raw" bytes form will
+  //@@     typically allow higher performance due to the way protobuf
+  //@@     allocation and reuse interacts with GRPC. For example, see
+  //@@     https://github.com/grpc/grpc/issues/23231.
+  //@@
+  //@@     To use the raw representation 'raw_output_contents' must be
+  //@@     initialized with data for each tensor in the same order as
+  //@@     'outputs'. For each tensor, the size of this content must
+  //@@     match what is expected by the tensor's shape and data
+  //@@     type. The raw data must be the flattened, one-dimensional,
+  //@@     row-major order of the tensor elements without any stride
+  //@@     or padding between the elements. Note that the FP16 and BF16 data
+  //@@     types must be represented as raw content as there is no
+  //@@     specific data type for a 16-bit float type.
+  //@@
+  //@@     If this field is specified then InferOutputTensor::contents
+  //@@     must not be specified for any output tensor.
+  //@@
+  repeated bytes raw_output_contents = 6;
+}
+//@@
+//@@.. cpp:var:: message ModelStreamInferResponse
+//@@
+//@@   Response message for ModelStreamInfer.
+//@@
+message ModelStreamInferResponse
+{
+  //@@
+  //@@  .. cpp:var:: string error_message
+  //@@
+  //@@     The message describing the error. The empty message
+  //@@     indicates the inference was successful without errors.
+  //@@
+  string error_message = 1;
+  //@@
+  //@@  .. cpp:var:: ModelInferResponse infer_response
+  //@@
+  //@@     Holds the results of the request.
+  //@@
+  ModelInferResponse infer_response = 2;
+}
+//@@
+//@@.. cpp:var:: message ModelConfigRequest
+//@@
+//@@   Request message for ModelConfig.
+//@@
+message ModelConfigRequest
+{
+  //@@
+  //@@  .. cpp:var:: string name
+  //@@
+  //@@     The name of the model.
+  //@@
+  string name = 1;
+  //@@  .. cpp:var:: string version
+  //@@
+  //@@     The version of the model. If not given the model version
+  //@@     is selected automatically based on the version policy.
+  //@@
+  string version = 2;
+}
+//@@
+//@@.. cpp:var:: message ModelConfigResponse
+//@@
+//@@   Response message for ModelConfig.
+//@@
+message ModelConfigResponse
+{
+  //@@
+  //@@  .. cpp:var:: ModelConfig config
+  //@@
+  //@@     The model configuration.
+  //@@
+  ModelConfig config = 1;
+}
+//@@
+//@@.. cpp:var:: message ModelRepositoryParameter
+//@@
+//@@   An model repository parameter value.
+//@@
+message ModelRepositoryParameter
+{
+  //@@  .. cpp:var:: oneof parameter_choice
+  //@@
+  //@@     The parameter value can be a string, an int64 or
+  //@@     a boolean
+  //@@
+  oneof parameter_choice
+  {
+    //@@    .. cpp:var:: bool bool_param
+    //@@
+    //@@       A boolean parameter value.
+    //@@
+    bool bool_param = 1;
+    //@@    .. cpp:var:: int64 int64_param
+    //@@
+    //@@       An int64 parameter value.
+    //@@
+    int64 int64_param = 2;
+    //@@    .. cpp:var:: string string_param
+    //@@
+    //@@       A string parameter value.
+    //@@
+    string string_param = 3;
+    //@@    .. cpp:var:: bytes bytes_param
+    //@@
+    //@@       A bytes parameter value.
+    //@@
+    bytes bytes_param = 4;
+  }
+}
--- a/lib/llm/src/grpc/protos/model_config.proto
+++ b/lib/llm/src/grpc/protos/model_config.proto
--- a/lib/llm/src/grpc/service.rs
+++ b/lib/llm/src/grpc/service.rs
+// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+pub mod kserve;
+pub mod openai;
--- a/lib/llm/src/grpc/service/kserve.rs
+++ b/lib/llm/src/grpc/service/kserve.rs
--- a/lib/llm/src/grpc/service/openai.rs
+++ b/lib/llm/src/grpc/service/openai.rs
--- a/lib/llm/src/http/service/openai.rs
+++ b/lib/llm/src/http/service/openai.rs
@@ -31,7 +31,6 @@ use super::{
    metrics::{Endpoint, ResponseMetricCollector},
    service_v2,
 };
-use crate::preprocessor::LLMMetricAnnotation;
 use crate::protocols::openai::chat_completions::aggregator::ChatCompletionAggregator;
 use crate::protocols::openai::{
    ParsingOptions,
@@ -42,6 +41,7 @@ use crate::protocols::openai::{
 };
 use crate::request_template::RequestTemplate;
 use crate::types::Annotated;
+use crate::{discovery::ModelManager, preprocessor::LLMMetricAnnotation};
 use dynamo_runtime::logging::get_distributed_tracing_context;
 use tracing::Instrument;
@@ -195,8 +195,8 @@ fn get_or_create_request_id(primary: Option<&str>, headers: &HeaderMap) -> Strin
    uuid.to_string()
 }
-fn get_parsing_options(state: &Arc<service_v2::State>, model: &str) -> ParsingOptions {
+fn get_parsing_options(manager: &ModelManager, model: &str) -> ParsingOptions {
-    let tool_call_parser = state.manager().get_model_tool_call_parser(model);
+    let tool_call_parser = manager.get_model_tool_call_parser(model);
    let reasoning_parser = None; // TODO: Implement reasoning parser
    ParsingOptions::new(tool_call_parser, reasoning_parser)
@@ -274,7 +274,7 @@ async fn completions(
        .get_completions_engine(model)
        .map_err(|_| ErrorMessage::model_not_found())?;
-    let parsing_options = get_parsing_options(&state, model);
+    let parsing_options = get_parsing_options(state.manager(), model);
    let mut inflight_guard =
        state
@@ -501,7 +501,7 @@ async fn chat_completions(
        .get_chat_completions_engine(model)
        .map_err(|_| ErrorMessage::model_not_found())?;
-    let parsing_options = get_parsing_options(&state, model);
+    let parsing_options = get_parsing_options(state.manager(), model);
    let mut inflight_guard =
        state
@@ -736,7 +736,7 @@ async fn responses(
        .get_chat_completions_engine(model)
        .map_err(|_| ErrorMessage::model_not_found())?;
-    let parsing_options = get_parsing_options(&state, model);
+    let parsing_options = get_parsing_options(state.manager(), model);
    let mut inflight_guard =
        state

--- a/lib/llm/src/lib.rs
+++ b/lib/llm/src/lib.rs
@@ -18,6 +18,7 @@ pub mod endpoint_type;
 pub mod engines;
 pub mod entrypoint;
 pub mod gguf;
+pub mod grpc;
 pub mod http;
 pub mod hub;
 // pub mod key_value_store;

--- a/lib/llm/tests/kserve_service.rs
+++ b/lib/llm/tests/kserve_service.rs