// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

syntax = "proto3";

package inference;

//@@.. cpp:namespace:: inference

import "model_config.proto";

//@@
//@@.. cpp:var:: service GRPCInferenceService
//@@
//@@   Inference Server GRPC endpoints.
//@@
service GRPCInferenceService {
  //@@  .. cpp:var:: rpc ServerLive(ServerLiveRequest) returns
  //@@       (ServerLiveResponse)
  //@@
  //@@     Check liveness of the inference server.
  //@@
  rpc ServerLive(ServerLiveRequest) returns (ServerLiveResponse) {}

  //@@  .. cpp:var:: rpc ServerReady(ServerReadyRequest) returns
  //@@       (ServerReadyResponse)
  //@@
  //@@     Check readiness of the inference server.
  //@@
  rpc ServerReady(ServerReadyRequest) returns (ServerReadyResponse) {}

  //@@  .. cpp:var:: rpc ModelReady(ModelReadyRequest) returns
  //@@       (ModelReadyResponse)
  //@@
  //@@     Check readiness of a model in the inference server.
  //@@
  rpc ModelReady(ModelReadyRequest) returns (ModelReadyResponse) {}

  //@@  .. cpp:var:: rpc ModelMetadata(ModelMetadataRequest) returns
  //@@       (ModelMetadataResponse)
  //@@
  //@@     Get model metadata.
  //@@
  rpc ModelMetadata(ModelMetadataRequest) returns (ModelMetadataResponse) {}

  //@@  .. cpp:var:: rpc ModelInfer(ModelInferRequest) returns
  //@@       (ModelInferResponse)
  //@@
  //@@     Perform inference using a specific model.
  //@@
  rpc ModelInfer(ModelInferRequest) returns (ModelInferResponse) {}

  //@@  .. cpp:var:: rpc ModelStreamInfer(stream ModelInferRequest) returns
  //@@       (stream ModelStreamInferResponse)
  //@@
  //@@     Perform streaming inference.
  //@@
  rpc ModelStreamInfer(stream ModelInferRequest)
      returns (stream ModelStreamInferResponse) {}

  //@@  .. cpp:var:: rpc ModelConfig(ModelConfigRequest) returns
  //@@       (ModelConfigResponse)
  //@@
  //@@     Get model configuration.
  //@@
  rpc ModelConfig(ModelConfigRequest) returns (ModelConfigResponse) {}
}

//@@
//@@.. cpp:var:: message ServerLiveRequest
//@@
//@@   Request message for ServerLive.
//@@
message ServerLiveRequest {}

//@@
//@@.. cpp:var:: message ServerLiveResponse
//@@
//@@   Response message for ServerLive.
//@@
message ServerLiveResponse {
  //@@
  //@@  .. cpp:var:: bool live
  //@@
  //@@     True if the inference server is live, false if not live.
  //@@
  bool live = 1;
}

//@@
//@@.. cpp:var:: message ServerReadyRequest
//@@
//@@   Request message for ServerReady.
//@@
message ServerReadyRequest {}

//@@
//@@.. cpp:var:: message ServerReadyResponse
//@@
//@@   Response message for ServerReady.
//@@
message ServerReadyResponse {
  //@@
  //@@  .. cpp:var:: bool ready
  //@@
  //@@     True if the inference server is ready, false if not ready. The
  //@@     server is considered ready if it has any registered models, since
  //@@     models can freely be registered and unregistered at runtime.
  //@@
  bool ready = 1;
}

//@@
//@@.. cpp:var:: message ModelReadyRequest
//@@
//@@   Request message for ModelReady.
//@@
message ModelReadyRequest {
  //@@
  //@@  .. cpp:var:: string name
  //@@
  //@@     The name of the model to check for readiness.
  //@@
  string name = 1;

  //@@  .. cpp:var:: string version
  //@@
  //@@     The version of the model to check for readiness. If not given the
  //@@     server will choose a version based on the model and internal
  //@@     policy.
  //@@
  string version = 2;
}

//@@
//@@.. cpp:var:: message ModelReadyResponse
//@@
//@@   Response message for ModelReady.
//@@
message ModelReadyResponse {
  //@@
  //@@  .. cpp:var:: bool ready
  //@@
  //@@     True if the model is ready, false if not ready.
  //@@
  bool ready = 1;
}

//@@
//@@.. cpp:var:: message ModelMetadataRequest
//@@
//@@   Request message for ModelMetadata.
//@@
message ModelMetadataRequest {
  //@@
  //@@  .. cpp:var:: string name
  //@@
  //@@     The name of the model.
  //@@
  string name = 1;

  //@@  .. cpp:var:: string version
  //@@
  //@@     The version of the model whose metadata is requested. If not
  //@@     given the server will choose a version based on the
  //@@     model and internal policy.
  //@@
  string version = 2;
}

//@@
//@@.. cpp:var:: message ModelMetadataResponse
//@@
//@@   Response message for ModelMetadata.
//@@
message ModelMetadataResponse {
  //@@
  //@@  .. cpp:var:: message TensorMetadata
  //@@
  //@@     Metadata for a tensor.
  //@@
  message TensorMetadata {
    //@@
    //@@    .. cpp:var:: string name
    //@@
    //@@       The tensor name.
    //@@
    string name = 1;

    //@@
    //@@    .. cpp:var:: string datatype
    //@@
    //@@       The tensor data type.
    //@@
    string datatype = 2;

    //@@
    //@@    .. cpp:var:: int64 shape (repeated)
    //@@
    //@@       The tensor shape. A variable-size dimension is represented
    //@@       by a -1 value.
    //@@
    repeated int64 shape = 3;
  }

  //@@
  //@@  .. cpp:var:: string name
  //@@
  //@@     The model name.
  //@@
  string name = 1;

  //@@
  //@@  .. cpp:var:: string versions (repeated)
  //@@
  //@@     The versions of the model.
  //@@
  repeated string versions = 2;

  //@@
  //@@  .. cpp:var:: string platform
  //@@
  //@@     The model's platform.
  //@@
  string platform = 3;

  //@@
  //@@  .. cpp:var:: TensorMetadata inputs (repeated)
  //@@
  //@@     The model's inputs.
  //@@
  repeated TensorMetadata inputs = 4;

  //@@
  //@@  .. cpp:var:: TensorMetadata outputs (repeated)
  //@@
  //@@     The model's outputs.
  //@@
  repeated TensorMetadata outputs = 5;
}

//@@
//@@.. cpp:var:: message InferParameter
//@@
//@@   An inference parameter value.
//@@
message InferParameter {
  //@@  .. cpp:var:: oneof parameter_choice
  //@@
  //@@     The parameter value can be a string, an int64,
  //@@     an uint64, a double, or a boolean
  //@@
  //@@     Note: double and uint64 are currently
  //@@           placeholders for future use and
  //@@           are not supported for custom parameters
  //@@
  oneof parameter_choice {
    //@@    .. cpp:var:: bool bool_param
    //@@
    //@@       A boolean parameter value.
    //@@
    bool bool_param = 1;

    //@@    .. cpp:var:: int64 int64_param
    //@@
    //@@       An int64 parameter value.
    //@@
    int64 int64_param = 2;

    //@@    .. cpp:var:: string string_param
    //@@
    //@@       A string parameter value.
    //@@
    string string_param = 3;

    //@@    .. cpp:var:: double double_param
    //@@
    //@@       A double parameter value.
    //@@
    double double_param = 4;

    //@@    .. cpp:var:: uint64 uint64_param
    //@@
    //@@       A uint64 parameter value.
    //@@
    //@@       Not supported for custom parameters
    //@@
    uint64 uint64_param = 5;
  }
}

//@@
//@@.. cpp:var:: message InferTensorContents
//@@
//@@   The data contained in a tensor represented by the repeated type
//@@   that matches the tensor's data type. Protobuf oneof is not used
//@@   because oneofs cannot contain repeated fields.
//@@
message InferTensorContents {
  //@@
  //@@  .. cpp:var:: bool bool_contents (repeated)
  //@@
  //@@     Representation for BOOL data type. The size must match what is
  //@@     expected by the tensor's shape. The contents must be the flattened,
  //@@     one-dimensional, row-major order of the tensor elements.
  //@@
  repeated bool bool_contents = 1;

  //@@
  //@@  .. cpp:var:: int32 int_contents (repeated)
  //@@
  //@@     Representation for INT8, INT16, and INT32 data types. The size
  //@@     must match what is expected by the tensor's shape. The contents
  //@@     must be the flattened, one-dimensional, row-major order of the
  //@@     tensor elements.
  //@@
  repeated int32 int_contents = 2;

  //@@
  //@@  .. cpp:var:: int64 int64_contents (repeated)
  //@@
  //@@     Representation for INT64 data types. The size must match what
  //@@     is expected by the tensor's shape. The contents must be the
  //@@     flattened, one-dimensional, row-major order of the tensor elements.
  //@@
  repeated int64 int64_contents = 3;

  //@@
  //@@  .. cpp:var:: uint32 uint_contents (repeated)
  //@@
  //@@     Representation for UINT8, UINT16, and UINT32 data types. The size
  //@@     must match what is expected by the tensor's shape. The contents
  //@@     must be the flattened, one-dimensional, row-major order of the
  //@@     tensor elements.
  //@@
  repeated uint32 uint_contents = 4;

  //@@
  //@@  .. cpp:var:: uint64 uint64_contents (repeated)
  //@@
  //@@     Representation for UINT64 data types. The size must match what
  //@@     is expected by the tensor's shape. The contents must be the
  //@@     flattened, one-dimensional, row-major order of the tensor elements.
  //@@
  repeated uint64 uint64_contents = 5;

  //@@
  //@@  .. cpp:var:: float fp32_contents (repeated)
  //@@
  //@@     Representation for FP32 data type. The size must match what is
  //@@     expected by the tensor's shape. The contents must be the flattened,
  //@@     one-dimensional, row-major order of the tensor elements.
  //@@
  repeated float fp32_contents = 6;

  //@@
  //@@  .. cpp:var:: double fp64_contents (repeated)
  //@@
  //@@     Representation for FP64 data type. The size must match what is
  //@@     expected by the tensor's shape. The contents must be the flattened,
  //@@     one-dimensional, row-major order of the tensor elements.
  //@@
  repeated double fp64_contents = 7;

  //@@
  //@@  .. cpp:var:: bytes bytes_contents (repeated)
  //@@
  //@@     Representation for BYTES data type. The size must match what is
  //@@     expected by the tensor's shape. The contents must be the flattened,
  //@@     one-dimensional, row-major order of the tensor elements.
  //@@
  repeated bytes bytes_contents = 8;
}

//@@
//@@.. cpp:var:: message ModelInferRequest
//@@
//@@   Request message for ModelInfer.
//@@
message ModelInferRequest {
  //@@
  //@@  .. cpp:var:: message InferInputTensor
  //@@
  //@@     An input tensor for an inference request.
  //@@
  message InferInputTensor {
    //@@
    //@@    .. cpp:var:: string name
    //@@
    //@@       The tensor name.
    //@@
    string name = 1;

    //@@
    //@@    .. cpp:var:: string datatype
    //@@
    //@@       The tensor data type.
    //@@
    string datatype = 2;

    //@@
    //@@    .. cpp:var:: int64 shape (repeated)
    //@@
    //@@       The tensor shape.
    //@@
    repeated int64 shape = 3;

    //@@    .. cpp:var:: map<string,InferParameter> parameters
    //@@
    //@@       Optional inference input tensor parameters.
    //@@
    map<string, InferParameter> parameters = 4;

    //@@    .. cpp:var:: InferTensorContents contents
    //@@
    //@@       The tensor contents using a data-type format. This field
    //@@       must not be specified if tensor contents are being specified
    //@@       in ModelInferRequest.raw_input_contents.
    //@@
    InferTensorContents contents = 5;
  }

  //@@
  //@@  .. cpp:var:: message InferRequestedOutputTensor
  //@@
  //@@     An output tensor requested for an inference request.
  //@@
  message InferRequestedOutputTensor {
    //@@
    //@@    .. cpp:var:: string name
    //@@
    //@@       The tensor name.
    //@@
    string name = 1;

    //@@    .. cpp:var:: map<string,InferParameter> parameters
    //@@
    //@@       Optional requested output tensor parameters.
    //@@
    map<string, InferParameter> parameters = 2;
  }

  //@@  .. cpp:var:: string model_name
  //@@
  //@@     The name of the model to use for inferencing.
  //@@
  string model_name = 1;

  //@@  .. cpp:var:: string model_version
  //@@
  //@@     The version of the model to use for inference. If not
  //@@     given the latest/most-recent version of the model is used.
  //@@
  string model_version = 2;

  //@@  .. cpp:var:: string id
  //@@
  //@@     Optional identifier for the request. If specified will be
  //@@     returned in the response.
  //@@
  string id = 3;

  //@@  .. cpp:var:: map<string,InferParameter> parameters
  //@@
  //@@     Optional inference parameters.
  //@@
  map<string, InferParameter> parameters = 4;

  //@@
  //@@  .. cpp:var:: InferInputTensor inputs (repeated)
  //@@
  //@@     The input tensors for the inference.
  //@@
  repeated InferInputTensor inputs = 5;

  //@@
  //@@  .. cpp:var:: InferRequestedOutputTensor outputs (repeated)
  //@@
  //@@     The requested output tensors for the inference. Optional, if not
  //@@     specified all outputs specified in the model config will be
  //@@     returned.
  //@@
  repeated InferRequestedOutputTensor outputs = 6;

  //@@
  //@@  .. cpp:var:: bytes raw_input_contents (repeated)
  //@@
  //@@     The data contained in an input tensor can be represented in
  //@@     "raw" bytes form or in the repeated type that matches the
  //@@     tensor's data type. Using the "raw" bytes form will
  //@@     typically allow higher performance due to the way protobuf
  //@@     allocation and reuse interacts with GRPC. For example, see
  //@@     https://github.com/grpc/grpc/issues/23231.
  //@@
  //@@     To use the raw representation 'raw_input_contents' must be
  //@@     initialized with data for each tensor in the same order as
  //@@     'inputs'. For each tensor, the size of this content must
  //@@     match what is expected by the tensor's shape and data
  //@@     type. The raw data must be the flattened, one-dimensional,
  //@@     row-major order of the tensor elements without any stride
  //@@     or padding between the elements. Note that the FP16 and BF16 data
  //@@     types must be represented as raw content as there is no
  //@@     specific data type for a 16-bit float type.
  //@@
  //@@     If this field is specified then InferInputTensor::contents
  //@@     must not be specified for any input tensor.
  //@@
  repeated bytes raw_input_contents = 7;
}

//@@
//@@.. cpp:var:: message ModelInferResponse
//@@
//@@   Response message for ModelInfer.
//@@
message ModelInferResponse {
  //@@
  //@@  .. cpp:var:: message InferOutputTensor
  //@@
  //@@     An output tensor returned for an inference request.
  //@@
  message InferOutputTensor {
    //@@
    //@@    .. cpp:var:: string name
    //@@
    //@@       The tensor name.
    //@@
    string name = 1;

    //@@
    //@@    .. cpp:var:: string datatype
    //@@
    //@@       The tensor data type.
    //@@
    string datatype = 2;

    //@@
    //@@    .. cpp:var:: int64 shape (repeated)
    //@@
    //@@       The tensor shape.
    //@@
    repeated int64 shape = 3;

    //@@    .. cpp:var:: map<string,InferParameter> parameters
    //@@
    //@@       Optional output tensor parameters.
    //@@
    map<string, InferParameter> parameters = 4;

    //@@    .. cpp:var:: InferTensorContents contents
    //@@
    //@@       The tensor contents using a data-type format. This field
    //@@       must not be specified if tensor contents are being specified
    //@@       in ModelInferResponse.raw_output_contents.
    //@@
    InferTensorContents contents = 5;
  }

  //@@  .. cpp:var:: string model_name
  //@@
  //@@     The name of the model used for inference.
  //@@
  string model_name = 1;

  //@@  .. cpp:var:: string model_version
  //@@
  //@@     The version of the model used for inference.
  //@@
  string model_version = 2;

  //@@  .. cpp:var:: string id
  //@@
  //@@     The id of the inference request if one was specified.
  //@@
  string id = 3;

  //@@  .. cpp:var:: map<string,InferParameter> parameters
  //@@
  //@@     Optional inference response parameters.
  //@@
  map<string, InferParameter> parameters = 4;

  //@@
  //@@  .. cpp:var:: InferOutputTensor outputs (repeated)
  //@@
  //@@     The output tensors holding inference results.
  //@@
  repeated InferOutputTensor outputs = 5;

  //@@
  //@@  .. cpp:var:: bytes raw_output_contents (repeated)
  //@@
  //@@     The data contained in an output tensor can be represented in
  //@@     "raw" bytes form or in the repeated type that matches the
  //@@     tensor's data type. Using the "raw" bytes form will
  //@@     typically allow higher performance due to the way protobuf
  //@@     allocation and reuse interacts with GRPC. For example, see
  //@@     https://github.com/grpc/grpc/issues/23231.
  //@@
  //@@     To use the raw representation 'raw_output_contents' must be
  //@@     initialized with data for each tensor in the same order as
  //@@     'outputs'. For each tensor, the size of this content must
  //@@     match what is expected by the tensor's shape and data
  //@@     type. The raw data must be the flattened, one-dimensional,
  //@@     row-major order of the tensor elements without any stride
  //@@     or padding between the elements. Note that the FP16 and BF16 data
  //@@     types must be represented as raw content as there is no
  //@@     specific data type for a 16-bit float type.
  //@@
  //@@     If this field is specified then InferOutputTensor::contents
  //@@     must not be specified for any output tensor.
  //@@
  repeated bytes raw_output_contents = 6;
}

//@@
//@@.. cpp:var:: message ModelStreamInferResponse
//@@
//@@   Response message for ModelStreamInfer.
//@@
message ModelStreamInferResponse {
  //@@
  //@@  .. cpp:var:: string error_message
  //@@
  //@@     The message describing the error. The empty message
  //@@     indicates the inference was successful without errors.
  //@@
  string error_message = 1;

  //@@
  //@@  .. cpp:var:: ModelInferResponse infer_response
  //@@
  //@@     Holds the results of the request.
  //@@
  ModelInferResponse infer_response = 2;
}

//@@
//@@.. cpp:var:: message ModelConfigRequest
//@@
//@@   Request message for ModelConfig.
//@@
message ModelConfigRequest {
  //@@
  //@@  .. cpp:var:: string name
  //@@
  //@@     The name of the model.
  //@@
  string name = 1;

  //@@  .. cpp:var:: string version
  //@@
  //@@     The version of the model. If not given the model version
  //@@     is selected automatically based on the version policy.
  //@@
  string version = 2;
}

//@@
//@@.. cpp:var:: message ModelConfigResponse
//@@
//@@   Response message for ModelConfig.
//@@
message ModelConfigResponse {
  //@@
  //@@  .. cpp:var:: ModelConfig config
  //@@
  //@@     The model configuration.
  //@@
  ModelConfig config = 1;
}

//@@
//@@.. cpp:var:: message ModelRepositoryParameter
//@@
//@@   A model repository parameter value.
//@@
message ModelRepositoryParameter {
  //@@  .. cpp:var:: oneof parameter_choice
  //@@
  //@@     The parameter value can be a string, an int64,
  //@@     a boolean or bytes
  //@@
  oneof parameter_choice {
    //@@    .. cpp:var:: bool bool_param
    //@@
    //@@       A boolean parameter value.
    //@@
    bool bool_param = 1;

    //@@    .. cpp:var:: int64 int64_param
    //@@
    //@@       An int64 parameter value.
    //@@
    int64 int64_param = 2;

    //@@    .. cpp:var:: string string_param
    //@@
    //@@       A string parameter value.
    //@@
    string string_param = 3;

    //@@    .. cpp:var:: bytes bytes_param
    //@@
    //@@       A bytes parameter value.
    //@@
    bytes bytes_param = 4;
  }
}