Unverified Commit 91a459c0 authored by GuanLuo's avatar GuanLuo Committed by GitHub
Browse files

feat: KServe gRPC support (#2638)

parent ef535edb
...@@ -100,11 +100,11 @@ jobs: ...@@ -100,11 +100,11 @@ jobs:
# Have an explicit step to build tests first to separate time spent on build vs execution. # Have an explicit step to build tests first to separate time spent on build vs execution.
- name: Compile Tests - name: Compile Tests
working-directory: ${{ matrix.dir }} working-directory: ${{ matrix.dir }}
run: cargo test --locked --no-run run: cargo test --locked --no-run --target-dir ${HOME}/tmp || df -h
- name: Run Doc Tests - name: Run Doc Tests
working-directory: ${{ matrix.dir }} working-directory: ${{ matrix.dir }}
run: cargo doc --no-deps && cargo test --locked --doc run: cargo doc --no-deps && cargo test --locked --doc
- name: Run Unit Tests - name: Run Unit Tests
working-directory: ${{ matrix.dir }} working-directory: ${{ matrix.dir }}
# NOTE: --all-targets doesn't run doc tests # NOTE: --all-targets doesn't run doc tests
run: cargo test --locked --all-targets run: cargo test --locked --all-targets --target-dir ${HOME}/tmp
This diff is collapsed.
...@@ -176,6 +176,12 @@ def parse_args(): ...@@ -176,6 +176,12 @@ def parse_args():
default=None, default=None,
help="Prefix for Dynamo frontend metrics. If unset, uses DYN_METRICS_PREFIX env var or 'dynamo_frontend'.", help="Prefix for Dynamo frontend metrics. If unset, uses DYN_METRICS_PREFIX env var or 'dynamo_frontend'.",
) )
parser.add_argument(
"--kserve-grpc-server",
action="store_true",
default=False,
help="Start KServe gRPC server.",
)
flags = parser.parse_args() flags = parser.parse_args()
...@@ -246,6 +252,8 @@ async def async_main(): ...@@ -246,6 +252,8 @@ async def async_main():
try: try:
if flags.interactive: if flags.interactive:
await run_input(runtime, "text", engine) await run_input(runtime, "text", engine)
elif flags.kserve_grpc_server:
await run_input(runtime, "grpc", engine)
else: else:
await run_input(runtime, "http", engine) await run_input(runtime, "http", engine)
except asyncio.exceptions.CancelledError: except asyncio.exceptions.CancelledError:
......
...@@ -83,7 +83,6 @@ pub async fn run( ...@@ -83,7 +83,6 @@ pub async fn run(
rt.clone(), rt.clone(),
) )
.await?; .await?;
// //
// Run in from an input // Run in from an input
// //
......
...@@ -26,7 +26,7 @@ Example: ...@@ -26,7 +26,7 @@ Example:
See `docs/guides/dynamo_run.md` in the repo for full details. See `docs/guides/dynamo_run.md` in the repo for full details.
"#; "#;
const USAGE: &str = "USAGE: dynamo-run in=[http|text|dyn://<path>|batch:<folder>] out=ENGINE_LIST|auto|dyn://<path> [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--context-length=N] [--kv-cache-block-size=16] [--extra-engine-args=args.json] [--static-worker] [--router-mode random|round-robin|kv] [--kv-overlap-score-weight=2.0] [--router-temperature=0.0] [--use-kv-events] [--max-num-batched-tokens=1.0] [--migration-limit=0] [--verbosity (-v|-vv)]"; const USAGE: &str = "USAGE: dynamo-run in=[http|grpc|text|dyn://<path>|batch:<folder>] out=ENGINE_LIST|auto|dyn://<path> [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--context-length=N] [--kv-cache-block-size=16] [--extra-engine-args=args.json] [--static-worker] [--router-mode random|round-robin|kv] [--kv-overlap-score-weight=2.0] [--router-temperature=0.0] [--use-kv-events] [--max-num-batched-tokens=1.0] [--migration-limit=0] [--verbosity (-v|-vv)]";
fn main() -> anyhow::Result<()> { fn main() -> anyhow::Result<()> {
// Set log level based on verbosity flag // Set log level based on verbosity flag
......
This diff is collapsed.
...@@ -110,6 +110,13 @@ tower-http = {workspace = true} ...@@ -110,6 +110,13 @@ tower-http = {workspace = true}
rustls = { version = "0.23" } rustls = { version = "0.23" }
# grpc-service
# ping version to 0.13.1 so it depends on prost 0.13.5
# which is used across other libraries
tonic = { version = "0.13.1" }
# Request prost specifically so tonic-build properly compiles protobuf message
prost = { version = "0.13.5" }
# tokenizers # tokenizers
tokenizers = { version = "0.21.4", default-features = false, features = [ tokenizers = { version = "0.21.4", default-features = false, features = [
"onig", "onig",
...@@ -157,3 +164,6 @@ insta = { version = "1.41", features = [ ...@@ -157,3 +164,6 @@ insta = { version = "1.41", features = [
] } ] }
aligned-vec = "0.6.4" aligned-vec = "0.6.4"
lazy_static = "1.4" lazy_static = "1.4"
[build-dependencies]
tonic-build = { version = "0.13.1"}
\ No newline at end of file
...@@ -13,8 +13,14 @@ ...@@ -13,8 +13,14 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
fn main() { fn main() -> Result<(), Box<dyn std::error::Error>> {
println!("cargo:warning=Building with CUDA KV off"); println!("cargo:warning=Building with CUDA KV off");
build_protos()
}
fn build_protos() -> Result<(), Box<dyn std::error::Error>> {
tonic_build::compile_protos("src/grpc/protos/kserve.proto")?;
Ok(())
} }
// NOTE: Preserving this build.rs for reference. We may want to re-enable // NOTE: Preserving this build.rs for reference. We may want to re-enable
......
...@@ -18,6 +18,7 @@ pub mod batch; ...@@ -18,6 +18,7 @@ pub mod batch;
mod common; mod common;
pub use common::build_routed_pipeline; pub use common::build_routed_pipeline;
pub mod endpoint; pub mod endpoint;
pub mod grpc;
pub mod http; pub mod http;
pub mod text; pub mod text;
...@@ -43,6 +44,9 @@ pub enum Input { ...@@ -43,6 +44,9 @@ pub enum Input {
/// Batch mode. Run all the prompts, write the outputs, exit. /// Batch mode. Run all the prompts, write the outputs, exit.
Batch(PathBuf), Batch(PathBuf),
// Run an KServe compatible gRPC server
Grpc,
} }
impl FromStr for Input { impl FromStr for Input {
...@@ -59,6 +63,7 @@ impl TryFrom<&str> for Input { ...@@ -59,6 +63,7 @@ impl TryFrom<&str> for Input {
fn try_from(s: &str) -> anyhow::Result<Self> { fn try_from(s: &str) -> anyhow::Result<Self> {
match s { match s {
"http" => Ok(Input::Http), "http" => Ok(Input::Http),
"grpc" => Ok(Input::Grpc),
"text" => Ok(Input::Text), "text" => Ok(Input::Text),
"stdin" => Ok(Input::Stdin), "stdin" => Ok(Input::Stdin),
endpoint_path if endpoint_path.starts_with(ENDPOINT_SCHEME) => { endpoint_path if endpoint_path.starts_with(ENDPOINT_SCHEME) => {
...@@ -77,6 +82,7 @@ impl fmt::Display for Input { ...@@ -77,6 +82,7 @@ impl fmt::Display for Input {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let s = match self { let s = match self {
Input::Http => "http", Input::Http => "http",
Input::Grpc => "grpc",
Input::Text => "text", Input::Text => "text",
Input::Stdin => "stdin", Input::Stdin => "stdin",
Input::Endpoint(path) => path, Input::Endpoint(path) => path,
...@@ -113,6 +119,9 @@ pub async fn run_input( ...@@ -113,6 +119,9 @@ pub async fn run_input(
Input::Http => { Input::Http => {
http::run(runtime, engine_config).await?; http::run(runtime, engine_config).await?;
} }
Input::Grpc => {
grpc::run(runtime, engine_config).await?;
}
Input::Text => { Input::Text => {
text::run(runtime, None, engine_config).await?; text::run(runtime, None, engine_config).await?;
} }
......
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use std::sync::Arc;
use crate::{
discovery::{MODEL_ROOT_PATH, ModelManager, ModelWatcher},
engines::StreamingEngineAdapter,
entrypoint::{self, EngineConfig, input::common},
grpc::service::kserve,
kv_router::KvRouterConfig,
types::openai::{
chat_completions::{NvCreateChatCompletionRequest, NvCreateChatCompletionStreamResponse},
completions::{NvCreateCompletionRequest, NvCreateCompletionResponse},
},
};
use dynamo_runtime::transports::etcd;
use dynamo_runtime::{DistributedRuntime, Runtime};
use dynamo_runtime::{distributed::DistributedConfig, pipeline::RouterMode};
/// Build and run an KServe gRPC service
pub async fn run(runtime: Runtime, engine_config: EngineConfig) -> anyhow::Result<()> {
let mut grpc_service_builder = kserve::KserveService::builder()
.port(engine_config.local_model().http_port()) // [WIP] generalize port..
.with_request_template(engine_config.local_model().request_template());
let grpc_service = match engine_config {
EngineConfig::Dynamic(_) => {
let distributed_runtime = DistributedRuntime::from_settings(runtime.clone()).await?;
let etcd_client = distributed_runtime.etcd_client();
// This allows the /health endpoint to query etcd for active instances
grpc_service_builder = grpc_service_builder.with_etcd_client(etcd_client.clone());
let grpc_service = grpc_service_builder.build()?;
match etcd_client {
Some(ref etcd_client) => {
let router_config = engine_config.local_model().router_config();
// Listen for models registering themselves in etcd, add them to gRPC service
run_watcher(
distributed_runtime,
grpc_service.state().manager_clone(),
etcd_client.clone(),
MODEL_ROOT_PATH,
router_config.router_mode,
Some(router_config.kv_router_config),
router_config.busy_threshold,
)
.await?;
}
None => {
// Static endpoints don't need discovery
}
}
grpc_service
}
EngineConfig::StaticRemote(local_model) => {
let card = local_model.card();
let router_mode = local_model.router_config().router_mode;
let dst_config = DistributedConfig::from_settings(true); // true means static
let distributed_runtime = DistributedRuntime::new(runtime.clone(), dst_config).await?;
let grpc_service = grpc_service_builder.build()?;
let manager = grpc_service.model_manager();
let endpoint_id = local_model.endpoint_id();
let component = distributed_runtime
.namespace(&endpoint_id.namespace)?
.component(&endpoint_id.component)?;
let client = component.endpoint(&endpoint_id.name).client().await?;
let kv_chooser = if router_mode == RouterMode::KV {
Some(
manager
.kv_chooser_for(
local_model.display_name(),
&component,
card.kv_cache_block_size,
Some(local_model.router_config().kv_router_config),
)
.await?,
)
} else {
None
};
let chat_engine = entrypoint::build_routed_pipeline::<
NvCreateChatCompletionRequest,
NvCreateChatCompletionStreamResponse,
>(card, &client, router_mode, None, kv_chooser.clone())
.await?;
manager.add_chat_completions_model(local_model.display_name(), chat_engine)?;
let completions_engine = entrypoint::build_routed_pipeline::<
NvCreateCompletionRequest,
NvCreateCompletionResponse,
>(card, &client, router_mode, None, kv_chooser)
.await?;
manager.add_completions_model(local_model.display_name(), completions_engine)?;
grpc_service
}
EngineConfig::StaticFull { engine, model, .. } => {
let grpc_service = grpc_service_builder.build()?;
let engine = Arc::new(StreamingEngineAdapter::new(engine));
let manager = grpc_service.model_manager();
manager.add_completions_model(model.service_name(), engine.clone())?;
manager.add_chat_completions_model(model.service_name(), engine)?;
grpc_service
}
EngineConfig::StaticCore {
engine: inner_engine,
model,
..
} => {
let grpc_service = grpc_service_builder.build()?;
let manager = grpc_service.model_manager();
let chat_pipeline = common::build_pipeline::<
NvCreateChatCompletionRequest,
NvCreateChatCompletionStreamResponse,
>(model.card(), inner_engine.clone())
.await?;
manager.add_chat_completions_model(model.service_name(), chat_pipeline)?;
let cmpl_pipeline = common::build_pipeline::<
NvCreateCompletionRequest,
NvCreateCompletionResponse,
>(model.card(), inner_engine)
.await?;
manager.add_completions_model(model.service_name(), cmpl_pipeline)?;
grpc_service
}
};
grpc_service.run(runtime.primary_token()).await?;
runtime.shutdown(); // Cancel primary token
Ok(())
}
/// Spawns a task that watches for new models in etcd at network_prefix,
/// and registers them with the ModelManager so that the HTTP service can use them.
async fn run_watcher(
runtime: DistributedRuntime,
model_manager: Arc<ModelManager>,
etcd_client: etcd::Client,
network_prefix: &str,
router_mode: RouterMode,
kv_router_config: Option<KvRouterConfig>,
busy_threshold: Option<f64>,
) -> anyhow::Result<()> {
let watch_obj = ModelWatcher::new(
runtime,
model_manager,
router_mode,
kv_router_config,
busy_threshold,
);
tracing::info!("Watching for remote model at {network_prefix}");
let models_watcher = etcd_client.kv_get_and_watch_prefix(network_prefix).await?;
let (_prefix, _watcher, receiver) = models_watcher.dissolve();
// [gluo NOTE] This is different from http::run_watcher where it alters the HTTP service
// endpoint being exposed, gRPC doesn't have the same concept as the KServe service
// only has one kind of inference endpoint.
// Pass the sender to the watcher
let _watcher_task = tokio::spawn(async move {
watch_obj.watch(receiver).await;
});
Ok(())
}
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
pub mod service;
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
syntax = "proto3";
package inference;
//@@.. cpp:namespace:: inference
import "model_config.proto";
//@@
//@@.. cpp:var:: service InferenceService
//@@
//@@ Inference Server GRPC endpoints.
//@@
service GRPCInferenceService
{
//@@ .. cpp:var:: rpc ModelMetadata(ModelMetadataRequest) returns
//@@ (ModelMetadataResponse)
//@@
//@@ Get model metadata.
//@@
rpc ModelMetadata(ModelMetadataRequest) returns (ModelMetadataResponse) {}
//@@ .. cpp:var:: rpc ModelInfer(ModelInferRequest) returns
//@@ (ModelInferResponse)
//@@
//@@ Perform inference using a specific model.
//@@
rpc ModelInfer(ModelInferRequest) returns (ModelInferResponse) {}
//@@ .. cpp:var:: rpc ModelStreamInfer(stream ModelInferRequest) returns
//@@ (stream ModelStreamInferResponse)
//@@
//@@ Perform streaming inference.
//@@
rpc ModelStreamInfer(stream ModelInferRequest) returns (stream ModelStreamInferResponse) {}
//@@ .. cpp:var:: rpc ModelConfig(ModelConfigRequest) returns
//@@ (ModelConfigResponse)
//@@
//@@ Get model configuration.
//@@
rpc ModelConfig(ModelConfigRequest) returns (ModelConfigResponse) {}
}
//@@
//@@.. cpp:var:: message ModelMetadataRequest
//@@
//@@ Request message for ModelMetadata.
//@@
message ModelMetadataRequest
{
//@@
//@@ .. cpp:var:: string name
//@@
//@@ The name of the model.
//@@
string name = 1;
//@@ .. cpp:var:: string version
//@@
//@@ The version of the model to check for readiness. If not
//@@ given the server will choose a version based on the
//@@ model and internal policy.
//@@
string version = 2;
}
//@@
//@@.. cpp:var:: message ModelMetadataResponse
//@@
//@@ Response message for ModelMetadata.
//@@
message ModelMetadataResponse
{
//@@
//@@ .. cpp:var:: message TensorMetadata
//@@
//@@ Metadata for a tensor.
//@@
message TensorMetadata
{
//@@
//@@ .. cpp:var:: string name
//@@
//@@ The tensor name.
//@@
string name = 1;
//@@
//@@ .. cpp:var:: string datatype
//@@
//@@ The tensor data type.
//@@
string datatype = 2;
//@@
//@@ .. cpp:var:: int64 shape (repeated)
//@@
//@@ The tensor shape. A variable-size dimension is represented
//@@ by a -1 value.
//@@
repeated int64 shape = 3;
}
//@@
//@@ .. cpp:var:: string name
//@@
//@@ The model name.
//@@
string name = 1;
//@@
//@@ .. cpp:var:: string versions (repeated)
//@@
//@@ The versions of the model.
//@@
repeated string versions = 2;
//@@
//@@ .. cpp:var:: string platform
//@@
//@@ The model's platform.
//@@
string platform = 3;
//@@
//@@ .. cpp:var:: TensorMetadata inputs (repeated)
//@@
//@@ The model's inputs.
//@@
repeated TensorMetadata inputs = 4;
//@@
//@@ .. cpp:var:: TensorMetadata outputs (repeated)
//@@
//@@ The model's outputs.
//@@
repeated TensorMetadata outputs = 5;
}
//@@
//@@.. cpp:var:: message InferParameter
//@@
//@@ An inference parameter value.
//@@
message InferParameter
{
//@@ .. cpp:var:: oneof parameter_choice
//@@
//@@ The parameter value can be a string, an int64,
//@@ an uint64, a double, or a boolean
//@@
//@@ Note: double and uint64 are currently
//@@ placeholders for future use and
//@@ are not supported for custom parameters
//@@
oneof parameter_choice
{
//@@ .. cpp:var:: bool bool_param
//@@
//@@ A boolean parameter value.
//@@
bool bool_param = 1;
//@@ .. cpp:var:: int64 int64_param
//@@
//@@ An int64 parameter value.
//@@
int64 int64_param = 2;
//@@ .. cpp:var:: string string_param
//@@
//@@ A string parameter value.
//@@
string string_param = 3;
//@@ .. cpp:var:: double double_param
//@@
//@@ A double parameter value.
//@@
double double_param = 4;
//@@ .. cpp:var:: uint64 uint64_param
//@@
//@@ A uint64 parameter value.
//@@
//@@ Not supported for custom parameters
//@@
uint64 uint64_param = 5;
}
}
//@@
//@@.. cpp:var:: message InferTensorContents
//@@
//@@ The data contained in a tensor represented by the repeated type
//@@ that matches the tensor's data type. Protobuf oneof is not used
//@@ because oneofs cannot contain repeated fields.
//@@
message InferTensorContents
{
//@@
//@@ .. cpp:var:: bool bool_contents (repeated)
//@@
//@@ Representation for BOOL data type. The size must match what is
//@@ expected by the tensor's shape. The contents must be the flattened,
//@@ one-dimensional, row-major order of the tensor elements.
//@@
repeated bool bool_contents = 1;
//@@
//@@ .. cpp:var:: int32 int_contents (repeated)
//@@
//@@ Representation for INT8, INT16, and INT32 data types. The size
//@@ must match what is expected by the tensor's shape. The contents
//@@ must be the flattened, one-dimensional, row-major order of the
//@@ tensor elements.
//@@
repeated int32 int_contents = 2;
//@@
//@@ .. cpp:var:: int64 int64_contents (repeated)
//@@
//@@ Representation for INT64 data types. The size must match what
//@@ is expected by the tensor's shape. The contents must be the
//@@ flattened, one-dimensional, row-major order of the tensor elements.
//@@
repeated int64 int64_contents = 3;
//@@
//@@ .. cpp:var:: uint32 uint_contents (repeated)
//@@
//@@ Representation for UINT8, UINT16, and UINT32 data types. The size
//@@ must match what is expected by the tensor's shape. The contents
//@@ must be the flattened, one-dimensional, row-major order of the
//@@ tensor elements.
//@@
repeated uint32 uint_contents = 4;
//@@
//@@ .. cpp:var:: uint64 uint64_contents (repeated)
//@@
//@@ Representation for UINT64 data types. The size must match what
//@@ is expected by the tensor's shape. The contents must be the
//@@ flattened, one-dimensional, row-major order of the tensor elements.
//@@
repeated uint64 uint64_contents = 5;
//@@
//@@ .. cpp:var:: float fp32_contents (repeated)
//@@
//@@ Representation for FP32 data type. The size must match what is
//@@ expected by the tensor's shape. The contents must be the flattened,
//@@ one-dimensional, row-major order of the tensor elements.
//@@
repeated float fp32_contents = 6;
//@@
//@@ .. cpp:var:: double fp64_contents (repeated)
//@@
//@@ Representation for FP64 data type. The size must match what is
//@@ expected by the tensor's shape. The contents must be the flattened,
//@@ one-dimensional, row-major order of the tensor elements.
//@@
repeated double fp64_contents = 7;
//@@
//@@ .. cpp:var:: bytes bytes_contents (repeated)
//@@
//@@ Representation for BYTES data type. The size must match what is
//@@ expected by the tensor's shape. The contents must be the flattened,
//@@ one-dimensional, row-major order of the tensor elements.
//@@
repeated bytes bytes_contents = 8;
}
//@@
//@@.. cpp:var:: message ModelInferRequest
//@@
//@@ Request message for ModelInfer.
//@@
message ModelInferRequest
{
//@@
//@@ .. cpp:var:: message InferInputTensor
//@@
//@@ An input tensor for an inference request.
//@@
message InferInputTensor
{
//@@
//@@ .. cpp:var:: string name
//@@
//@@ The tensor name.
//@@
string name = 1;
//@@
//@@ .. cpp:var:: string datatype
//@@
//@@ The tensor data type.
//@@
string datatype = 2;
//@@
//@@ .. cpp:var:: int64 shape (repeated)
//@@
//@@ The tensor shape.
//@@
repeated int64 shape = 3;
//@@ .. cpp:var:: map<string,InferParameter> parameters
//@@
//@@ Optional inference input tensor parameters.
//@@
map<string, InferParameter> parameters = 4;
//@@ .. cpp:var:: InferTensorContents contents
//@@
//@@ The tensor contents using a data-type format. This field
//@@ must not be specified if tensor contents are being specified
//@@ in ModelInferRequest.raw_input_contents.
//@@
InferTensorContents contents = 5;
}
//@@
//@@ .. cpp:var:: message InferRequestedOutputTensor
//@@
//@@ An output tensor requested for an inference request.
//@@
message InferRequestedOutputTensor
{
//@@
//@@ .. cpp:var:: string name
//@@
//@@ The tensor name.
//@@
string name = 1;
//@@ .. cpp:var:: map<string,InferParameter> parameters
//@@
//@@ Optional requested output tensor parameters.
//@@
map<string, InferParameter> parameters = 2;
}
//@@ .. cpp:var:: string model_name
//@@
//@@ The name of the model to use for inferencing.
//@@
string model_name = 1;
//@@ .. cpp:var:: string model_version
//@@
//@@ The version of the model to use for inference. If not
//@@ given the latest/most-recent version of the model is used.
//@@
string model_version = 2;
//@@ .. cpp:var:: string id
//@@
//@@ Optional identifier for the request. If specified will be
//@@ returned in the response.
//@@
string id = 3;
//@@ .. cpp:var:: map<string,InferParameter> parameters
//@@
//@@ Optional inference parameters.
//@@
map<string, InferParameter> parameters = 4;
//@@
//@@ .. cpp:var:: InferInputTensor inputs (repeated)
//@@
//@@ The input tensors for the inference.
//@@
repeated InferInputTensor inputs = 5;
//@@
//@@ .. cpp:var:: InferRequestedOutputTensor outputs (repeated)
//@@
//@@ The requested output tensors for the inference. Optional, if not
//@@ specified all outputs specified in the model config will be
//@@ returned.
//@@
repeated InferRequestedOutputTensor outputs = 6;
//@@
//@@ .. cpp:var:: bytes raw_input_contents
//@@
//@@ The data contained in an input tensor can be represented in
//@@ "raw" bytes form or in the repeated type that matches the
//@@ tensor's data type. Using the "raw" bytes form will
//@@ typically allow higher performance due to the way protobuf
//@@ allocation and reuse interacts with GRPC. For example, see
//@@ https://github.com/grpc/grpc/issues/23231.
//@@
//@@ To use the raw representation 'raw_input_contents' must be
//@@ initialized with data for each tensor in the same order as
//@@ 'inputs'. For each tensor, the size of this content must
//@@ match what is expected by the tensor's shape and data
//@@ type. The raw data must be the flattened, one-dimensional,
//@@ row-major order of the tensor elements without any stride
//@@ or padding between the elements. Note that the FP16 and BF16 data
//@@ types must be represented as raw content as there is no
//@@ specific data type for a 16-bit float type.
//@@
//@@ If this field is specified then InferInputTensor::contents
//@@ must not be specified for any input tensor.
//@@
repeated bytes raw_input_contents = 7;
}
//@@
//@@.. cpp:var:: message ModelInferResponse
//@@
//@@ Response message for ModelInfer.
//@@
message ModelInferResponse
{
//@@
//@@ .. cpp:var:: message InferOutputTensor
//@@
//@@ An output tensor returned for an inference request.
//@@
message InferOutputTensor
{
//@@
//@@ .. cpp:var:: string name
//@@
//@@ The tensor name.
//@@
string name = 1;
//@@
//@@ .. cpp:var:: string datatype
//@@
//@@ The tensor data type.
//@@
string datatype = 2;
//@@
//@@ .. cpp:var:: int64 shape (repeated)
//@@
//@@ The tensor shape.
//@@
repeated int64 shape = 3;
//@@ .. cpp:var:: map<string,InferParameter> parameters
//@@
//@@ Optional output tensor parameters.
//@@
map<string, InferParameter> parameters = 4;
//@@ .. cpp:var:: InferTensorContents contents
//@@
//@@ The tensor contents using a data-type format. This field
//@@ must not be specified if tensor contents are being specified
//@@ in ModelInferResponse.raw_output_contents.
//@@
InferTensorContents contents = 5;
}
//@@ .. cpp:var:: string model_name
//@@
//@@ The name of the model used for inference.
//@@
string model_name = 1;
//@@ .. cpp:var:: string model_version
//@@
//@@ The version of the model used for inference.
//@@
string model_version = 2;
//@@ .. cpp:var:: string id
//@@
//@@ The id of the inference request if one was specified.
//@@
string id = 3;
//@@ .. cpp:var:: map<string,InferParameter> parameters
//@@
//@@ Optional inference response parameters.
//@@
map<string, InferParameter> parameters = 4;
//@@
//@@ .. cpp:var:: InferOutputTensor outputs (repeated)
//@@
//@@ The output tensors holding inference results.
//@@
repeated InferOutputTensor outputs = 5;
//@@
//@@ .. cpp:var:: bytes raw_output_contents
//@@
//@@ The data contained in an output tensor can be represented in
//@@ "raw" bytes form or in the repeated type that matches the
//@@ tensor's data type. Using the "raw" bytes form will
//@@ typically allow higher performance due to the way protobuf
//@@ allocation and reuse interacts with GRPC. For example, see
//@@ https://github.com/grpc/grpc/issues/23231.
//@@
//@@ To use the raw representation 'raw_output_contents' must be
//@@ initialized with data for each tensor in the same order as
//@@ 'outputs'. For each tensor, the size of this content must
//@@ match what is expected by the tensor's shape and data
//@@ type. The raw data must be the flattened, one-dimensional,
//@@ row-major order of the tensor elements without any stride
//@@ or padding between the elements. Note that the FP16 and BF16 data
//@@ types must be represented as raw content as there is no
//@@ specific data type for a 16-bit float type.
//@@
//@@ If this field is specified then InferOutputTensor::contents
//@@ must not be specified for any output tensor.
//@@
repeated bytes raw_output_contents = 6;
}
//@@
//@@.. cpp:var:: message ModelStreamInferResponse
//@@
//@@ Response message for ModelStreamInfer.
//@@
message ModelStreamInferResponse
{
//@@
//@@ .. cpp:var:: string error_message
//@@
//@@ The message describing the error. The empty message
//@@ indicates the inference was successful without errors.
//@@
string error_message = 1;
//@@
//@@ .. cpp:var:: ModelInferResponse infer_response
//@@
//@@ Holds the results of the request.
//@@
ModelInferResponse infer_response = 2;
}
//@@
//@@.. cpp:var:: message ModelConfigRequest
//@@
//@@ Request message for ModelConfig.
//@@
message ModelConfigRequest
{
//@@
//@@ .. cpp:var:: string name
//@@
//@@ The name of the model.
//@@
string name = 1;
//@@ .. cpp:var:: string version
//@@
//@@ The version of the model. If not given the model version
//@@ is selected automatically based on the version policy.
//@@
string version = 2;
}
//@@
//@@.. cpp:var:: message ModelConfigResponse
//@@
//@@ Response message for ModelConfig.
//@@
message ModelConfigResponse
{
//@@
//@@ .. cpp:var:: ModelConfig config
//@@
//@@ The model configuration.
//@@
ModelConfig config = 1;
}
//@@
//@@.. cpp:var:: message ModelRepositoryParameter
//@@
//@@ An model repository parameter value.
//@@
message ModelRepositoryParameter
{
//@@ .. cpp:var:: oneof parameter_choice
//@@
//@@ The parameter value can be a string, an int64 or
//@@ a boolean
//@@
oneof parameter_choice
{
//@@ .. cpp:var:: bool bool_param
//@@
//@@ A boolean parameter value.
//@@
bool bool_param = 1;
//@@ .. cpp:var:: int64 int64_param
//@@
//@@ An int64 parameter value.
//@@
int64 int64_param = 2;
//@@ .. cpp:var:: string string_param
//@@
//@@ A string parameter value.
//@@
string string_param = 3;
//@@ .. cpp:var:: bytes bytes_param
//@@
//@@ A bytes parameter value.
//@@
bytes bytes_param = 4;
}
}
This diff is collapsed.
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
pub mod kserve;
pub mod openai;
This diff is collapsed.
This diff is collapsed.
...@@ -31,7 +31,6 @@ use super::{ ...@@ -31,7 +31,6 @@ use super::{
metrics::{Endpoint, ResponseMetricCollector}, metrics::{Endpoint, ResponseMetricCollector},
service_v2, service_v2,
}; };
use crate::preprocessor::LLMMetricAnnotation;
use crate::protocols::openai::chat_completions::aggregator::ChatCompletionAggregator; use crate::protocols::openai::chat_completions::aggregator::ChatCompletionAggregator;
use crate::protocols::openai::{ use crate::protocols::openai::{
ParsingOptions, ParsingOptions,
...@@ -42,6 +41,7 @@ use crate::protocols::openai::{ ...@@ -42,6 +41,7 @@ use crate::protocols::openai::{
}; };
use crate::request_template::RequestTemplate; use crate::request_template::RequestTemplate;
use crate::types::Annotated; use crate::types::Annotated;
use crate::{discovery::ModelManager, preprocessor::LLMMetricAnnotation};
use dynamo_runtime::logging::get_distributed_tracing_context; use dynamo_runtime::logging::get_distributed_tracing_context;
use tracing::Instrument; use tracing::Instrument;
...@@ -195,8 +195,8 @@ fn get_or_create_request_id(primary: Option<&str>, headers: &HeaderMap) -> Strin ...@@ -195,8 +195,8 @@ fn get_or_create_request_id(primary: Option<&str>, headers: &HeaderMap) -> Strin
uuid.to_string() uuid.to_string()
} }
fn get_parsing_options(state: &Arc<service_v2::State>, model: &str) -> ParsingOptions { fn get_parsing_options(manager: &ModelManager, model: &str) -> ParsingOptions {
let tool_call_parser = state.manager().get_model_tool_call_parser(model); let tool_call_parser = manager.get_model_tool_call_parser(model);
let reasoning_parser = None; // TODO: Implement reasoning parser let reasoning_parser = None; // TODO: Implement reasoning parser
ParsingOptions::new(tool_call_parser, reasoning_parser) ParsingOptions::new(tool_call_parser, reasoning_parser)
...@@ -274,7 +274,7 @@ async fn completions( ...@@ -274,7 +274,7 @@ async fn completions(
.get_completions_engine(model) .get_completions_engine(model)
.map_err(|_| ErrorMessage::model_not_found())?; .map_err(|_| ErrorMessage::model_not_found())?;
let parsing_options = get_parsing_options(&state, model); let parsing_options = get_parsing_options(state.manager(), model);
let mut inflight_guard = let mut inflight_guard =
state state
...@@ -501,7 +501,7 @@ async fn chat_completions( ...@@ -501,7 +501,7 @@ async fn chat_completions(
.get_chat_completions_engine(model) .get_chat_completions_engine(model)
.map_err(|_| ErrorMessage::model_not_found())?; .map_err(|_| ErrorMessage::model_not_found())?;
let parsing_options = get_parsing_options(&state, model); let parsing_options = get_parsing_options(state.manager(), model);
let mut inflight_guard = let mut inflight_guard =
state state
...@@ -736,7 +736,7 @@ async fn responses( ...@@ -736,7 +736,7 @@ async fn responses(
.get_chat_completions_engine(model) .get_chat_completions_engine(model)
.map_err(|_| ErrorMessage::model_not_found())?; .map_err(|_| ErrorMessage::model_not_found())?;
let parsing_options = get_parsing_options(&state, model); let parsing_options = get_parsing_options(state.manager(), model);
let mut inflight_guard = let mut inflight_guard =
state state
......
...@@ -18,6 +18,7 @@ pub mod endpoint_type; ...@@ -18,6 +18,7 @@ pub mod endpoint_type;
pub mod engines; pub mod engines;
pub mod entrypoint; pub mod entrypoint;
pub mod gguf; pub mod gguf;
pub mod grpc;
pub mod http; pub mod http;
pub mod hub; pub mod hub;
// pub mod key_value_store; // pub mod key_value_store;
......
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment