Commit d90dc39d (unverified), authored by GuanLuo, committed by GitHub
Browse files

feat: add Kserve readiness endpoint support: ServerLive / ServerReady / ModelReady (#4708)


Signed-off-by: Guan Luo <gluo@nvidia.com>
Signed-off-by: GuanLuo <41310872+GuanLuo@users.noreply.github.com>
Co-authored-by: Ryan McCormick <rmccormick@nvidia.com>
parent 66f319ab
......@@ -16,6 +16,27 @@ import "model_config.proto";
//@@
service GRPCInferenceService
{
//@@ .. cpp:var:: rpc ServerLive(ServerLiveRequest) returns
//@@ (ServerLiveResponse)
//@@
//@@ Check liveness of the inference server.
//@@
rpc ServerLive(ServerLiveRequest) returns (ServerLiveResponse) {}
//@@ .. cpp:var:: rpc ServerReady(ServerReadyRequest) returns
//@@ (ServerReadyResponse)
//@@
//@@ Check readiness of the inference server.
//@@
rpc ServerReady(ServerReadyRequest) returns (ServerReadyResponse) {}
//@@ .. cpp:var:: rpc ModelReady(ModelReadyRequest) returns
//@@ (ModelReadyResponse)
//@@
//@@ Check readiness of a model in the inference server.
//@@
rpc ModelReady(ModelReadyRequest) returns (ModelReadyResponse) {}
//@@ .. cpp:var:: rpc ModelMetadata(ModelMetadataRequest) returns
//@@ (ModelMetadataResponse)
//@@
......@@ -45,6 +66,89 @@ service GRPCInferenceService
rpc ModelConfig(ModelConfigRequest) returns (ModelConfigResponse) {}
}
//@@
//@@.. cpp:var:: message ServerLiveRequest
//@@
//@@ Request message for ServerLive.
//@@ The request carries no fields.
//@@
message ServerLiveRequest {}
//@@
//@@.. cpp:var:: message ServerLiveResponse
//@@
//@@ Response message for ServerLive.
//@@
// NOTE(review): the Rust ServerLive handler shown alongside this definition
// always answers with live = true; being able to respond is the signal.
message ServerLiveResponse
{
//@@
//@@ .. cpp:var:: bool live
//@@
//@@ True if the inference server is live, false if not live.
//@@
bool live = 1;
}
//@@
//@@.. cpp:var:: message ServerReadyRequest
//@@
//@@ Request message for ServerReady.
//@@ The request carries no fields.
//@@
message ServerReadyRequest {}
//@@
//@@.. cpp:var:: message ServerReadyResponse
//@@
//@@ Response message for ServerReady.
//@@
message ServerReadyResponse
{
//@@
//@@ .. cpp:var:: bool ready
//@@
//@@ True if the inference server is ready, false if not ready. The server
//@@ is considered ready if it has any registered models, since models
//@@ can freely be registered and unregistered at runtime.
//@@ Conversely, a server with no registered models reports not ready.
//@@
bool ready = 1;
}
//@@
//@@.. cpp:var:: message ModelReadyRequest
//@@
//@@ Request message for ModelReady.
//@@
// NOTE(review): the Rust ModelReady handler shown alongside this definition
// reads only `name`; `version` appears to be ignored there -- confirm whether
// the version text below describes implemented behavior.
message ModelReadyRequest
{
//@@
//@@ .. cpp:var:: string name
//@@
//@@ The name of the model to check for readiness.
//@@
string name = 1;
//@@
//@@ .. cpp:var:: string version
//@@
//@@ The version of the model to check for readiness. If not given the
//@@ server will choose a version based on the model and internal policy.
//@@
string version = 2;
}
//@@
//@@.. cpp:var:: message ModelReadyResponse
//@@
//@@ Response message for ModelReady.
//@@
// NOTE(review): the Rust ModelReady handler shown alongside this definition
// sets `ready` when some registered model card's display name equals the
// requested name -- confirm that this is the intended notion of readiness.
message ModelReadyResponse
{
//@@
//@@ .. cpp:var:: bool ready
//@@
//@@ True if the model is ready, false if not ready.
//@@
bool ready = 1;
}
//@@
//@@.. cpp:var:: message ModelMetadataRequest
//@@
......
......@@ -675,4 +675,38 @@ impl GrpcInferenceService for KserveService {
request_model_name
)))
}
/// Liveness probe for the gRPC service.
///
/// The request payload is ignored. The reply always carries `live: true`:
/// producing a reply at all is what demonstrates liveness.
async fn server_live(
    &self,
    _request: Request<inference::ServerLiveRequest>,
) -> Result<Response<inference::ServerLiveResponse>, Status> {
    // No state is consulted here; reaching this point is the liveness signal.
    let reply = inference::ServerLiveResponse { live: true };
    Ok(Response::new(reply))
}
/// Readiness probe for the gRPC service.
///
/// The request payload is ignored. The server reports `ready: true` exactly
/// when the model manager currently returns at least one model card.
async fn server_ready(
    &self,
    _request: Request<inference::ServerReadyRequest>,
) -> Result<Response<inference::ServerReadyResponse>, Status> {
    let registered_cards = self.state.manager().get_model_cards();
    let reply = inference::ServerReadyResponse {
        ready: !registered_cards.is_empty(),
    };
    Ok(Response::new(reply))
}
/// Per-model readiness probe for the gRPC service.
///
/// Reports `ready: true` when any model card returned by the model manager
/// has a `display_name` equal to the requested `name`. The request's
/// `version` field is not consulted.
async fn model_ready(
    &self,
    request: Request<inference::ModelReadyRequest>,
) -> Result<Response<inference::ModelReadyResponse>, Status> {
    let wanted = request.into_inner().name;
    let cards = self.state.manager().get_model_cards();
    // Exact, case-sensitive match on the card's display name.
    let found = cards.into_iter().any(|card| card.display_name == wanted);
    let reply = inference::ModelReadyResponse { ready: found };
    Ok(Response::new(reply))
}
}
......@@ -5,6 +5,22 @@
mod ports;
pub mod kserve_test {
// [gluo NOTE] Tests may run in parallel, so every test case is given its own
// fixed port through this enum to keep them from colliding.
// Each variant's discriminant is the port number itself; a test obtains it
// with a cast such as `TestPort::LiveReady as u16`.
// When adding a test case, add a variant with a value not used by any other.
enum TestPort {
InferFailure = 8988,
InferSuccess = 8989,
StreamInferFailure = 8990,
StreamInferSuccess = 8991,
InferCancellation = 8992,
StreamInferCancellation = 8993,
ModelInfo = 8994,
TensorModel = 8995,
TensorModelTypes = 8996,
TritonModelConfig = 8997,
// Port for the ServerLive / ServerReady / ModelReady test.
LiveReady = 8998,
}
// For using gRPC client for test
pub mod inference {
tonic::include_proto!("inference");
......@@ -16,6 +32,7 @@ pub mod kserve_test {
use inference::grpc_inference_service_client::GrpcInferenceServiceClient;
use inference::{
DataType, ModelConfigRequest, ModelInferRequest, ModelInferResponse, ModelMetadataRequest,
ModelReadyRequest, ServerLiveRequest, ServerReadyRequest,
};
use anyhow::Error;
......@@ -354,21 +371,6 @@ pub mod kserve_test {
}
}
// Tests may run in parallel, so every test case is given its own fixed port
// through this enum to keep them from colliding. Each variant's discriminant
// is the port number itself.
// NOTE(review): an enum with the same name is also declared near the top of
// this module in this listing; this copy looks like the pre-move version --
// confirm that only one declaration survives.
enum TestPort {
InferFailure = 8988,
InferSuccess = 8989,
StreamInferFailure = 8990,
StreamInferSuccess = 8991,
InferCancellation = 8992,
StreamInferCancellation = 8993,
ModelInfo = 8994,
TensorModel = 8995,
TensorModelTypes = 8996,
TritonModelConfig = 8997,
}
#[rstest]
#[tokio::test]
async fn test_infer_failure(
......@@ -1971,4 +1973,86 @@ pub mod kserve_test {
cancel_token.cancel();
let _ = tokio::join!(grpc_task, http_task);
}
// End-to-end check of the three readiness RPCs (ServerLive, ServerReady,
// ModelReady) over a real gRPC connection. The probes are issued twice:
// first with no model registered, then again after a tensor model named
// "tensor" has been added to the service's model manager.
#[rstest]
#[tokio::test]
async fn test_live_ready() {
// Dedicated port so this test can run in parallel with the others.
let grpc_port = TestPort::LiveReady as u16;
let service = KserveService::builder().port(grpc_port).build().unwrap();
// start server
// NOTE(review): `_running` is kept bound for the whole test; presumably the
// server is torn down when it is dropped -- confirm against RunningService.
let _running = RunningService::spawn(service.clone());
// NOTE(review): the second argument is presumably a retry/timeout budget
// used while waiting for the server to accept connections -- confirm.
let mut client = get_ready_client(grpc_port, 5).await;
// Check server liveness
let server_live_request = tonic::Request::new(ServerLiveRequest {});
let server_live_response = client.server_live(server_live_request).await.unwrap();
let server_live = server_live_response.get_ref().live;
assert!(server_live, "Server should be live");
// Check server readiness: expected to be false while no model is registered
let server_ready_request = tonic::Request::new(ServerReadyRequest {});
let server_ready_response = client.server_ready(server_ready_request).await.unwrap();
let server_ready = server_ready_response.get_ref().ready;
assert!(
!server_ready,
"Server should not be ready without model registered"
);
// Check model readiness for unregistered model
let model_ready_request = tonic::Request::new(ModelReadyRequest {
name: "tensor".into(),
version: "".into(),
});
let model_ready_response = client.model_ready(model_ready_request).await.unwrap();
let model_ready = model_ready_response.get_ref().ready;
assert!(!model_ready, "Unregistered model should not be ready");
// Register a tensor model
// The card is named "tensor", the same name the ModelReady requests use.
let mut card = ModelDeploymentCard::with_name_only("tensor");
card.model_type = ModelType::TensorBased;
card.model_input = ModelInput::Tensor;
card.runtime_config = ModelRuntimeConfig {
tensor_model_config: Some(tensor::TensorModelConfig {
name: "tensor".to_string(),
inputs: vec![tensor::TensorMetadata {
name: "input".to_string(),
data_type: tensor::DataType::Int32,
shape: vec![1],
parameters: Default::default(),
}],
outputs: vec![tensor::TensorMetadata {
name: "output".to_string(),
data_type: tensor::DataType::Bool,
shape: vec![-1],
parameters: Default::default(),
}],
triton_model_config: None,
}),
..Default::default()
};
let tensor = Arc::new(TensorEngine {});
service
.model_manager()
.add_tensor_model("tensor", card.mdcsum(), tensor.clone())
.unwrap();
// The return value of save_model_card is deliberately discarded here; if the
// card was not saved, the readiness assertions below would fail instead.
let _ = service.model_manager().save_model_card("key", card);
// Re-check readiness
// Check server readiness: expected to be true now that a model is registered
let server_ready_request = tonic::Request::new(ServerReadyRequest {});
let server_ready_response = client.server_ready(server_ready_request).await.unwrap();
let server_ready = server_ready_response.get_ref().ready;
assert!(server_ready, "Server should be ready with model registered");
// Check model readiness for the model that has just been registered
let model_ready_request = tonic::Request::new(ModelReadyRequest {
name: "tensor".into(),
version: "".into(),
});
let model_ready_response = client.model_ready(model_ready_request).await.unwrap();
let model_ready = model_ready_response.get_ref().ready;
assert!(model_ready, "Registered model should be ready");
}
}
......@@ -125,5 +125,6 @@ def start_services(request, runtime_services):
@pytest.mark.integration
@pytest.mark.model(TEST_MODEL)
def test_echo() -> None:
triton_echo_client.check_health()
triton_echo_client.run_infer()
triton_echo_client.get_config()
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import sys
import numpy as np
import tritonclient.grpc as grpcclient
SERVER_URL = "localhost:8000"
def check_health():
    """Assert that the server is live and ready and that the "echo" model is ready.

    Opens a gRPC client against SERVER_URL and runs the three health probes
    in order; the first probe that returns a falsy value raises AssertionError.
    """
    client = grpcclient.InferenceServerClient(url=SERVER_URL)
    # Server-level probes first, then the model-level probe.
    assert client.is_server_live()
    assert client.is_server_ready()
    assert client.is_model_ready("echo")
def run_infer():
server_url = "localhost:8000"
try:
triton_client = grpcclient.InferenceServerClient(url=server_url)
except Exception as e:
print("channel creation failed: " + str(e))
sys.exit()
triton_client = grpcclient.InferenceServerClient(url=SERVER_URL)
model_name = "echo"
......@@ -46,12 +49,7 @@ def run_infer():
def get_config():
server_url = "localhost:8000"
try:
triton_client = grpcclient.InferenceServerClient(url=server_url)
except Exception as e:
print("channel creation failed: " + str(e))
sys.exit()
triton_client = grpcclient.InferenceServerClient(url=SERVER_URL)
model_name = "echo"
response = triton_client.get_model_config(model_name=model_name)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment