// Copyright 2018-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//  * Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//  * Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//  * Neither the name of NVIDIA CORPORATION nor the names of its
//    contributors may be used to endorse or promote products derived
//    from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2018, TensorFlow Authors. All rights reserved.

syntax = "proto3";

package inference;

//@@.. cpp:namespace:: inference

//@@
//@@.. cpp:enum:: DataType
//@@
//@@   Data types supported for input and output tensors.
//@@
enum DataType {
  //@@  .. cpp:enumerator:: DataType::INVALID = 0
  TYPE_INVALID = 0;

  //@@  .. cpp:enumerator:: DataType::BOOL = 1
  TYPE_BOOL = 1;

  //@@  .. cpp:enumerator:: DataType::UINT8 = 2
  TYPE_UINT8 = 2;
  //@@  .. cpp:enumerator:: DataType::UINT16 = 3
  TYPE_UINT16 = 3;
  //@@  .. cpp:enumerator:: DataType::UINT32 = 4
  TYPE_UINT32 = 4;
  //@@  .. cpp:enumerator:: DataType::UINT64 = 5
  TYPE_UINT64 = 5;

  //@@  .. cpp:enumerator:: DataType::INT8 = 6
  TYPE_INT8 = 6;
  //@@  .. cpp:enumerator:: DataType::INT16 = 7
  TYPE_INT16 = 7;
  //@@  .. cpp:enumerator:: DataType::INT32 = 8
  TYPE_INT32 = 8;
  //@@  .. cpp:enumerator:: DataType::INT64 = 9
  TYPE_INT64 = 9;

  //@@  .. cpp:enumerator:: DataType::FP16 = 10
  TYPE_FP16 = 10;
  //@@  .. cpp:enumerator:: DataType::FP32 = 11
  TYPE_FP32 = 11;
  //@@  .. cpp:enumerator:: DataType::FP64 = 12
  TYPE_FP64 = 12;

  //@@  .. cpp:enumerator:: DataType::STRING = 13
  TYPE_STRING = 13;

  //@@  .. cpp:enumerator:: DataType::BF16 = 14
  TYPE_BF16 = 14;
}

//@@
//@@  .. cpp:var:: message ModelRateLimiter
//@@
//@@     The specifications required by the rate limiter to properly
//@@     schedule the inference requests across the different models
//@@     and their instances.
//@@
message ModelRateLimiter
{
  //@@  .. cpp:var:: message Resource
  //@@
  //@@     The resource property.
  //@@
  message Resource
  {
    //@@  .. cpp:var:: string name
    //@@
    //@@     The name associated with the resource.
    //@@
    string name = 1;

    //@@  .. cpp:var:: bool global
    //@@
    //@@     Whether or not the resource is global. If true then the resource
    //@@     is assumed to be shared among the devices otherwise specified
    //@@     count of the resource is assumed for each device associated
    //@@     with the instance.
    //@@
    bool global = 2;

    //@@  .. cpp:var:: uint32 count
    //@@
    //@@     The number of resources required for the execution of the model
    //@@     instance.
    //@@
    uint32 count = 3;
  }

  //@@  .. cpp:var:: Resource resources (repeated)
  //@@
  //@@     The resources required to execute the request on a model instance.
  //@@     Resources are just names with a corresponding count. The execution
  //@@     of the instance will be blocked until the specified resources are
  //@@     available. By default an instance uses no rate-limiter resources.
  //@@
  repeated Resource resources = 1;

  //@@  .. cpp:var:: uint32 priority
  //@@
  //@@     The optional weighting value to be used for prioritizing across
  //@@     instances. An instance with priority 2 will be given 1/2 the
  //@@     number of scheduling chances as an instance with priority
  //@@     1. The default priority is 1. A priority value of 0 will be
  //@@     treated as priority 1.
  //@@
  uint32 priority = 2;
}
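
// For illustration only (not part of the schema): assuming hypothetical
// resource names "R1" and "R2", a config.pbtxt instance group (see
// ModelInstanceGroup below) could attach rate-limiter settings like this:
//
//   instance_group [
//     {
//       count: 1
//       kind: KIND_GPU
//       rate_limiter {
//         resources [
//           { name: "R1", count: 4 },
//           { name: "R2", global: true, count: 2 }
//         ]
//         priority: 2
//       }
//     }
//   ]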

//@@
//@@.. cpp:var:: message ModelInstanceGroup
//@@
//@@   A group of one or more instances of a model and resources made
//@@   available for those instances.
//@@
message ModelInstanceGroup
{
  //@@
  //@@  .. cpp:enum:: Kind
  //@@
  //@@     Kind of this instance group.
  //@@
  enum Kind {
    //@@    .. cpp:enumerator:: Kind::KIND_AUTO = 0
    //@@
    //@@       This instance group represents instances that can run on either
    //@@       CPU or GPU. If all GPUs listed in 'gpus' are available then
    //@@       instances will be created on GPU(s), otherwise instances will
    //@@       be created on CPU.
    //@@
    KIND_AUTO = 0;

    //@@    .. cpp:enumerator:: Kind::KIND_GPU = 1
    //@@
    //@@       This instance group represents instances that must run on the
    //@@       GPU.
    //@@
    KIND_GPU = 1;

    //@@    .. cpp:enumerator:: Kind::KIND_CPU = 2
    //@@
    //@@       This instance group represents instances that must run on the
    //@@       CPU.
    //@@
    KIND_CPU = 2;

    //@@    .. cpp:enumerator:: Kind::KIND_MODEL = 3
    //@@
    //@@       This instance group represents instances that should run on the
    //@@       CPU and/or GPU(s) as specified by the model or backend itself.
    //@@       The inference server will not override the model/backend
    //@@       settings.
    //@@
    KIND_MODEL = 3;
  }

  //@@
  //@@  .. cpp:var:: message SecondaryDevice
  //@@
  //@@     A secondary device required for a model instance.
  //@@
  message SecondaryDevice
  {
    //@@
    //@@  .. cpp:enum:: SecondaryDeviceKind
    //@@
    //@@     The kind of the secondary device.
    //@@
    enum SecondaryDeviceKind {
      //@@    .. cpp:enumerator:: SecondaryDeviceKind::KIND_NVDLA = 0
      //@@
      //@@       An NVDLA core. http://nvdla.org
      //@@       Currently KIND_NVDLA is only supported by the TensorRT backend.
      //@@
      KIND_NVDLA = 0;
    }

    //@@  .. cpp:var:: SecondaryDeviceKind kind
    //@@
    //@@     The secondary device kind.
    //@@
    SecondaryDeviceKind kind = 1;

    //@@  .. cpp:var:: int64 device_id
    //@@
    //@@     Identifier for the secondary device.
    //@@
    int64 device_id = 2;
  }

  //@@  .. cpp:var:: string name
  //@@
  //@@     Optional name of this group of instances. If not specified the
  //@@     name will be formed as <model name>_<group number>. The name of
  //@@     individual instances will be further formed by a unique instance
  //@@     number and GPU index.
  //@@
  string name = 1;

  //@@  .. cpp:var:: Kind kind
  //@@
  //@@     The kind of this instance group. Default is KIND_AUTO. If
  //@@     KIND_AUTO or KIND_GPU then both 'count' and 'gpus' are valid and
  //@@     may be specified. If KIND_CPU or KIND_MODEL only 'count' is valid
  //@@     and 'gpus' cannot be specified.
  //@@
  Kind kind = 4;

  //@@  .. cpp:var:: int32 count
  //@@
  //@@     For a group assigned to GPU, the number of instances created for
  //@@     each GPU listed in 'gpus'. For a group assigned to CPU the number
  //@@     of instances created. Default is 1.
  int32 count = 2;

  //@@  .. cpp:var:: ModelRateLimiter rate_limiter
  //@@
  //@@     The rate limiter specific settings to be associated with this
  //@@     instance group. Optional, if not specified no rate limiting
  //@@     will be applied to this instance group.
  //@@
  ModelRateLimiter rate_limiter = 6;

  //@@  .. cpp:var:: int32 gpus (repeated)
  //@@
  //@@     GPU(s) where instances should be available. For each GPU listed,
  //@@     'count' instances of the model will be available. Setting 'gpus'
  //@@     to empty (or not specifying it at all) is equivalent to listing all
  //@@     available GPUs.
  //@@
  repeated int32 gpus = 3;

  //@@  .. cpp:var:: SecondaryDevice secondary_devices (repeated)
  //@@
  //@@     Secondary devices that are required by instances specified by this
  //@@     instance group. Optional.
  //@@
  repeated SecondaryDevice secondary_devices = 8;

  //@@  .. cpp:var:: string profile (repeated)
  //@@
  //@@     For TensorRT models containing multiple optimization profiles, this
  //@@     parameter specifies a set of optimization profiles available to this
  //@@     instance group. The inference server will choose the optimal profile
  //@@     based on the shapes of the input tensors. This field should lie
  //@@     between 0 and <TotalNumberOfOptimizationProfilesInPlanModel> - 1
  //@@     and be specified only for TensorRT backend, otherwise an error will
  //@@     be generated. If not specified, the server will select the first
  //@@     optimization profile by default.
  //@@
  repeated string profile = 5;

  //@@  .. cpp:var:: bool passive
  //@@
  //@@     Whether the instances within this instance group will be accepting
  //@@     inference requests from the scheduler. If true, the instances will
  //@@     not be added to the scheduler. Default value is false.
  //@@
  bool passive = 7;

  //@@  .. cpp:var:: string host_policy
  //@@
  //@@     The host policy name that the instance is to be associated with.
  //@@     The default value is set to reflect the device kind of the instance,
  //@@     for instance, KIND_CPU is "cpu", KIND_MODEL is "model" and
  //@@     KIND_GPU is "gpu_<gpu_id>".
  //@@
  string host_policy = 9;
}
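
// For illustration only (not part of the schema): a minimal config.pbtxt
// snippet placing two instances of the model on each of GPUs 0 and 1:
//
//   instance_group [
//     {
//       kind: KIND_GPU
//       count: 2
//       gpus: [ 0, 1 ]
//     }
//   ]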

//@@
//@@.. cpp:var:: message ModelTensorReshape
//@@
//@@   Reshape specification for input and output tensors.
//@@
message ModelTensorReshape
{
  //@@  .. cpp:var:: int64 shape (repeated)
  //@@
  //@@     The shape to use for reshaping.
  //@@
  repeated int64 shape = 1;
}
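
// For illustration only (not part of the schema): a reshape is referenced
// from a model input or output (see ModelInput/ModelOutput below). For a
// hypothetical input exposed with dims [ 1 ] but consumed by the backend
// as a scalar:
//
//   input [
//     {
//       name: "IN"
//       data_type: TYPE_FP32
//       dims: [ 1 ]
//       reshape: { shape: [ ] }
//     }
//   ]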

//@@
//@@.. cpp:var:: message ModelInput
//@@
//@@   An input required by the model.
//@@
message ModelInput
{
  //@@
  //@@  .. cpp:enum:: Format
  //@@
  //@@     The format for the input.
  //@@
  enum Format {
    //@@    .. cpp:enumerator:: Format::FORMAT_NONE = 0
    //@@
    //@@       The input has no specific format. This is the default.
    //@@
    FORMAT_NONE = 0;

    //@@    .. cpp:enumerator:: Format::FORMAT_NHWC = 1
    //@@
    //@@       HWC image format. Tensors with this format require 3 dimensions
    //@@       if the model does not support batching (max_batch_size = 0) or 4
    //@@       dimensions if the model does support batching (max_batch_size
    //@@       >= 1). In either case the 'dims' below should only specify the
    //@@       3 non-batch dimensions (i.e. HWC or CHW).
    //@@
    FORMAT_NHWC = 1;

    //@@    .. cpp:enumerator:: Format::FORMAT_NCHW = 2
    //@@
    //@@       CHW image format. Tensors with this format require 3 dimensions
    //@@       if the model does not support batching (max_batch_size = 0) or 4
    //@@       dimensions if the model does support batching (max_batch_size
    //@@       >= 1). In either case the 'dims' below should only specify the
    //@@       3 non-batch dimensions (i.e. HWC or CHW).
    //@@
    FORMAT_NCHW = 2;
  }

  //@@  .. cpp:var:: string name
  //@@
  //@@     The name of the input.
  //@@
  string name = 1;

  //@@  .. cpp:var:: DataType data_type
  //@@
  //@@     The data-type of the input.
  //@@
  DataType data_type = 2;

  //@@  .. cpp:var:: Format format
  //@@
  //@@     The format of the input. Optional.
  //@@
  Format format = 3;

  //@@  .. cpp:var:: int64 dims (repeated)
  //@@
  //@@     The dimensions/shape of the input tensor that must be provided
  //@@     when invoking the inference API for this model.
  //@@
  repeated int64 dims = 4;

  //@@  .. cpp:var:: ModelTensorReshape reshape
  //@@
  //@@     The shape expected for this input by the backend. The input will
  //@@     be reshaped to this before being presented to the backend. The
  //@@     reshape must have the same number of elements as the input shape
  //@@     specified by 'dims'. Optional.
  //@@
  ModelTensorReshape reshape = 5;

  //@@  .. cpp:var:: bool is_shape_tensor
  //@@
  //@@     Whether or not the input is a shape tensor to the model. This field
  //@@     is currently supported only for the TensorRT model. An error will be
  //@@     generated if this specification does not comply with underlying
  //@@     model.
  //@@
  bool is_shape_tensor = 6;

  //@@  .. cpp:var:: bool allow_ragged_batch
  //@@
  //@@     Whether or not the input is allowed to be "ragged" in a dynamically
  //@@     created batch. Default is false indicating that two requests will
  //@@     only be batched if this tensor has the same shape in both requests.
  //@@     True indicates that two requests can be batched even if this tensor
  //@@     has a different shape in each request.
  //@@
  bool allow_ragged_batch = 7;

  //@@  .. cpp:var:: bool optional
  //@@
  //@@     Whether or not the input is optional for the model execution.
  //@@     If true, the input is not required in the inference request.
  //@@     Default value is false.
  //@@
  bool optional = 8;
}
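
// For illustration only (not part of the schema): a hypothetical image
// input declared in NHWC format. Note that 'dims' lists only the three
// non-batch dimensions when the model supports batching:
//
//   input [
//     {
//       name: "input_tensor"
//       data_type: TYPE_UINT8
//       format: FORMAT_NHWC
//       dims: [ 224, 224, 3 ]
//     }
//   ]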

//@@
//@@.. cpp:var:: message ModelOutput
//@@
//@@   An output produced by the model.
//@@
message ModelOutput
{
  //@@  .. cpp:var:: string name
  //@@
  //@@     The name of the output.
  //@@
  string name = 1;

  //@@  .. cpp:var:: DataType data_type
  //@@
  //@@     The data-type of the output.
  //@@
  DataType data_type = 2;

  //@@  .. cpp:var:: int64 dims (repeated)
  //@@
  //@@     The dimensions/shape of the output tensor.
  //@@
  repeated int64 dims = 3;

  //@@  .. cpp:var:: ModelTensorReshape reshape
  //@@
  //@@     The shape produced for this output by the backend. The output will
  //@@     be reshaped from this to the shape specified in 'dims' before being
  //@@     returned in the inference response. The reshape must have the same
  //@@     number of elements as the output shape specified by 'dims'. Optional.
  //@@
  ModelTensorReshape reshape = 5;

  //@@  .. cpp:var:: string label_filename
  //@@
  //@@     The label file associated with this output. Should be specified only
  //@@     for outputs that represent classifications. Optional.
  //@@
  string label_filename = 4;


  //@@  .. cpp:var:: bool is_shape_tensor
  //@@
  //@@     Whether or not the output is a shape tensor to the model. This field
  //@@     is currently supported only for the TensorRT model. An error will be
  //@@     generated if this specification does not comply with underlying
  //@@     model.
  //@@
  bool is_shape_tensor = 6;
}
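
// For illustration only (not part of the schema): a hypothetical
// classification output with an associated label file:
//
//   output [
//     {
//       name: "probabilities"
//       data_type: TYPE_FP32
//       dims: [ 1000 ]
//       label_filename: "labels.txt"
//     }
//   ]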

//@@  .. cpp:var:: message BatchInput
//@@
//@@     A batch input is an additional input that must be added by
//@@     the backend based on all the requests in a batch.
//@@
message BatchInput
{
  //@@
  //@@    .. cpp:enum:: Kind
  //@@
  //@@       The kind of the batch input.
  //@@
  enum Kind {
    //@@      .. cpp:enumerator:: Kind::BATCH_ELEMENT_COUNT = 0
    //@@
    //@@         The element count of the 'source_input' will be added as
    //@@         input with shape [1].
    //@@
    BATCH_ELEMENT_COUNT = 0;

    //@@      .. cpp:enumerator:: Kind::BATCH_ACCUMULATED_ELEMENT_COUNT = 1
    //@@
    //@@         The accumulated element count of the 'source_input' will be
    //@@         added as input with shape [1]. For example, if there is a
    //@@         batch of two requests, each with 2 elements, an input of value
    //@@         2 will be added to the first request, and an input of value
    //@@         4 will be added to the second request.
    //@@
    BATCH_ACCUMULATED_ELEMENT_COUNT = 1;

    //@@      .. cpp:enumerator::
    //@@         Kind::BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO = 2
    //@@
    //@@         The accumulated element count of the 'source_input' will be
    //@@         added as input with shape [1], except for the first request
    //@@         in the batch. For the first request in the batch, the input
    //@@         will have shape [2] where the first element is value 0.
    //@@
    BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO = 2;

    //@@      .. cpp:enumerator:: Kind::BATCH_MAX_ELEMENT_COUNT_AS_SHAPE = 3
    //@@
    //@@         Among the requests in the batch, the max element count of the
    //@@         'source_input' will be added as input with shape
    //@@         [max_element_count] for the first request in the batch.
    //@@         For other requests, this input will have shape [0].
    //@@         The data of the tensor will be uninitialized.
    //@@
    BATCH_MAX_ELEMENT_COUNT_AS_SHAPE = 3;

    //@@      .. cpp:enumerator:: Kind::BATCH_ITEM_SHAPE = 4
    //@@
    //@@         Among the requests in the batch, the shape of the
    //@@         'source_input' will be added as input with shape
    //@@         [batch_size, len(input_dim)]. For example, if one
    //@@         batch-2 input with shape [3, 1] and batch-1 input
    //@@         with shape [2, 2] are batched, the batch input will
    //@@         have shape [3, 2] and value [ [3, 1], [3, 1], [2, 2]].
    //@@
    BATCH_ITEM_SHAPE = 4;

    //@@      .. cpp:enumerator:: Kind::BATCH_ITEM_SHAPE_FLATTEN = 5
    //@@
    //@@         Among the requests in the batch, the shape of the
    //@@         'source_input' will be added as input with single dimensional
    //@@         shape [batch_size * len(input_dim)]. For example, if one
    //@@         batch-2 input with shape [3, 1] and batch-1 input
    //@@         with shape [2, 2] are batched, the batch input will
    //@@         have shape [6] and value [3, 1, 3, 1, 2, 2].
    //@@
    BATCH_ITEM_SHAPE_FLATTEN = 5;
  }

  //@@    .. cpp:var:: Kind kind
  //@@
  //@@       The kind of this batch input.
  //@@
  Kind kind = 1;

  //@@    .. cpp:var:: string target_name (repeated)
  //@@
  //@@       The name of the model inputs that the backend will create
  //@@       for this batch input.
  //@@
  repeated string target_name = 2;

  //@@    .. cpp:var:: DataType data_type
  //@@
  //@@       The input's datatype. The data type can be TYPE_INT32 or
  //@@       TYPE_FP32.
  //@@
  DataType data_type = 3;

  //@@    .. cpp:var:: string source_input (repeated)
  //@@
  //@@       The backend derives the value for each batch input from one or
  //@@       more other inputs. 'source_input' gives the names of those
  //@@       inputs.
  //@@
  repeated string source_input = 4;
}
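
// For illustration only (not part of the schema): assuming a hypothetical
// ragged input "RAGGED_INPUT", a backend could be given the accumulated
// element counts of the batched requests via:
//
//   batch_input [
//     {
//       kind: BATCH_ACCUMULATED_ELEMENT_COUNT
//       target_name: "INDEX"
//       data_type: TYPE_FP32
//       source_input: "RAGGED_INPUT"
//     }
//   ]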

//@@.. cpp:var:: message BatchOutput
//@@
//@@   A batch output is an output produced by the model that must be handled
//@@   differently by the backend based on all the requests in a batch.
//@@
message BatchOutput
{
  //@@
  //@@  .. cpp:enum:: Kind
  //@@
  //@@     The kind of the batch output.
  //@@
  enum Kind {
    //@@    .. cpp:enumerator:: Kind::BATCH_SCATTER_WITH_INPUT_SHAPE = 0
    //@@
    //@@       The output should be scattered according to the shape of
    //@@       'source_input'. The dynamic dimension of the output will
    //@@       be set to the value of the same dimension in the input.
    //@@
    BATCH_SCATTER_WITH_INPUT_SHAPE = 0;
  }

  //@@  .. cpp:var:: string target_name (repeated)
  //@@
  //@@     The name of the outputs to be produced by this batch output
  //@@     specification.
  //@@
  repeated string target_name = 1;

  //@@  .. cpp:var:: Kind kind
  //@@
  //@@     The kind of this batch output.
  //@@
  Kind kind = 2;

  //@@  .. cpp:var:: string source_input (repeated)
  //@@
  //@@     The backend derives each batch output from one or more inputs.
  //@@     'source_input' gives the names of those inputs.
  //@@
  repeated string source_input = 3;
}
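
// For illustration only (not part of the schema): continuing the ragged
// input sketch above, a corresponding output could be scattered back to
// per-request shapes with:
//
//   batch_output [
//     {
//       target_name: "OUT"
//       kind: BATCH_SCATTER_WITH_INPUT_SHAPE
//       source_input: "RAGGED_INPUT"
//     }
//   ]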

//@@
//@@.. cpp:var:: message ModelVersionPolicy
//@@
//@@   Policy indicating which versions of a model should be made
//@@   available by the inference server.
//@@
message ModelVersionPolicy
{
  //@@  .. cpp:var:: message Latest
  //@@
  //@@     Serve only the latest version(s) of a model. This is
  //@@     the default policy.
  //@@
  message Latest
  {
    //@@    .. cpp:var:: uint32 num_versions
    //@@
    //@@       Serve only the 'num_versions' highest-numbered versions.
    //@@       The default value of 'num_versions' is 1, indicating that by
    //@@       default only the single highest-numbered version of a
    //@@       model will be served.
    //@@
    uint32 num_versions = 1;
  }

  //@@  .. cpp:var:: message All
  //@@
  //@@     Serve all versions of the model.
  //@@
  message All {}

  //@@  .. cpp:var:: message Specific
  //@@
  //@@     Serve only specific versions of the model.
  //@@
  message Specific
  {
    //@@    .. cpp:var:: int64 versions (repeated)
    //@@
    //@@       The specific versions of the model that will be served.
    //@@
    repeated int64 versions = 1;
  }

  //@@  .. cpp:var:: oneof policy_choice
  //@@
  //@@     Each model must implement only a single version policy. The
  //@@     default policy is 'Latest'.
  //@@
  oneof policy_choice
  {
    //@@    .. cpp:var:: Latest latest
    //@@
    //@@       Serve only latest version(s) of the model.
    //@@
    Latest latest = 1;

    //@@    .. cpp:var:: All all
    //@@
    //@@       Serve all versions of the model.
    //@@
    All all = 2;

    //@@    .. cpp:var:: Specific specific
    //@@
    //@@       Serve only specific version(s) of the model.
    //@@
    Specific specific = 3;
  }
}
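
// For illustration only (not part of the schema): config.pbtxt version
// policies selecting, respectively, the two newest versions or an explicit
// set of versions:
//
//   version_policy: { latest { num_versions: 2 } }
//
//   version_policy: { specific { versions: [ 1, 3 ] } }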

//@@
//@@.. cpp:var:: message ModelOptimizationPolicy
//@@
//@@   Optimization settings for a model. These settings control if/how a
//@@   model is optimized and prioritized by the backend framework when
//@@   it is loaded.
//@@
message ModelOptimizationPolicy
{
  //@@
  //@@  .. cpp:var:: message Graph
  //@@
  //@@     Enable generic graph optimization of the model. If not specified
  //@@     the framework's default level of optimization is used. Supports
  //@@     TensorFlow graphdef and savedmodel and ONNX models. For TensorFlow
  //@@     this causes XLA to be enabled/disabled for the model. For ONNX the
  //@@     default enables all optimizations, -1 enables only basic
  //@@     optimizations, +1 enables only basic and extended optimizations.
  //@@
  message Graph
  {
    //@@    .. cpp:var:: int32 level
    //@@
    //@@       The optimization level. Defaults to 0 (zero) if not specified.
    //@@
    //@@         - -1: Disabled
    //@@         -  0: Framework default
    //@@         -  1+: Enable optimization level (greater values indicate
    //@@            higher optimization levels)
    //@@
    int32 level = 1;
  }

  //@@
  //@@  .. cpp:enum:: ModelPriority
  //@@
  //@@     Model priorities. A model will be given scheduling and execution
  //@@     preference over models at lower priorities. Current model
  //@@     priorities only work for TensorRT models.
  //@@
  enum ModelPriority {
    //@@    .. cpp:enumerator:: ModelPriority::PRIORITY_DEFAULT = 0
    //@@
    //@@       The default model priority.
    //@@
    PRIORITY_DEFAULT = 0;

    //@@    .. cpp:enumerator:: ModelPriority::PRIORITY_MAX = 1
    //@@
    //@@       The maximum model priority.
    //@@
    PRIORITY_MAX = 1;

    //@@    .. cpp:enumerator:: ModelPriority::PRIORITY_MIN = 2
    //@@
    //@@       The minimum model priority.
    //@@
    PRIORITY_MIN = 2;
  }

  //@@
  //@@  .. cpp:var:: message Cuda
  //@@
  //@@     CUDA-specific optimization settings.
  //@@
  message Cuda
  {
    //@@    .. cpp:var:: message GraphSpec
    //@@
    //@@       Specification of the CUDA graph to be captured.
    //@@
    message GraphSpec
    {
      //@@      .. cpp:var:: message Shape
      //@@
      //@@         Specification of tensor dimensions.
      //@@
      message Shape
      {
        //@@        .. cpp:var:: int64 dim (repeated)
        //@@
        //@@           The dimension.
        //@@
        repeated int64 dim = 1;
      }

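      //@@      .. cpp:var:: message LowerBound
      //@@
      //@@         The lower bound of a CUDA graph specification; see
      //@@         'graph_lower_bound' below.
      //@@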
      message LowerBound
      {
        //@@      .. cpp:var:: int32 batch_size
        //@@
        //@@         The batch size of the CUDA graph. If 'max_batch_size' is 0,
        //@@         'batch_size' must be set to 0. Otherwise, 'batch_size' must
        //@@         be set to a value between 1 and 'max_batch_size'.
        //@@
        int32 batch_size = 1;

        //@@      .. cpp:var:: map<string, Shape> input
        //@@
        //@@         The specification of the inputs. 'Shape' is the shape of
        //@@         the input without batching dimension.
        //@@
        map<string, Shape> input = 2;
      }

      //@@      .. cpp:var:: int32 batch_size
      //@@
      //@@         The batch size of the CUDA graph. If 'max_batch_size' is 0,
      //@@         'batch_size' must be set to 0. Otherwise, 'batch_size' must
      //@@         be set to a value between 1 and 'max_batch_size'.
      //@@
      int32 batch_size = 1;

      //@@      .. cpp:var:: map<string, Shape> input
      //@@
      //@@         The specification of the inputs. 'Shape' is the shape of the
      //@@         input without batching dimension.
      //@@
      map<string, Shape> input = 2;

      //@@      .. cpp:var:: LowerBound graph_lower_bound
      //@@
      //@@         Specify the lower bound of the CUDA graph. Optional.
      //@@         If specified, the graph can be used for input shapes and
      //@@         batch sizes that are in the closed interval between the lower
      //@@         bound specification and graph specification. For dynamic
      //@@         shape models, this allows CUDA graphs to be launched
      //@@         frequently without capturing all possible shape combinations.
      //@@         However, using a graph for shape combinations different from
      //@@         the one used for capturing introduces uninitialized data for
      //@@         execution and it may distort the inference result if
      //@@         the model is sensitive to uninitialized data.
      //@@
      LowerBound graph_lower_bound = 3;
    }

    //@@    .. cpp:var:: bool graphs
    //@@
    //@@       Use CUDA graphs API to capture model operations and execute
    //@@       them more efficiently. Default value is false.
    //@@       Currently only recognized by TensorRT backend.
    //@@
    bool graphs = 1;

    //@@    .. cpp:var:: bool busy_wait_events
    //@@
    //@@       Use busy-waiting to synchronize CUDA events to achieve minimum
    //@@       latency from event complete to host thread to be notified, with
    //@@       the cost of high CPU load. Default value is false.
    //@@       Currently only recognized by TensorRT backend.
    //@@
    bool busy_wait_events = 2;

    //@@    .. cpp:var:: GraphSpec graph_spec (repeated)
    //@@
    //@@       Specification of the CUDA graph to be captured. If not specified
    //@@       and 'graphs' is true, the default CUDA graphs will be captured
    //@@       based on model settings.
    //@@       Currently only recognized by TensorRT backend.
    //@@
    repeated GraphSpec graph_spec = 3;

    //@@    .. cpp:var:: bool output_copy_stream
    //@@
    //@@       Uses a CUDA stream separate from the inference stream to copy the
    //@@       output to host. However, be aware that setting this option to
    //@@       true will lead to an increase in the memory consumption of the
    //@@       model as Triton will allocate twice as much GPU memory for its
    //@@       I/O tensor buffers. Default value is false.
    //@@       Currently only recognized by TensorRT backend.
    //@@
    bool output_copy_stream = 4;
  }

  //@@
  //@@  .. cpp:var:: message ExecutionAccelerators
  //@@
  //@@     Specify the preferred execution accelerators to be used to execute
  //@@     the model. Currently only recognized by ONNX Runtime backend and
  //@@     TensorFlow backend.
  //@@
  //@@     For the ONNX Runtime backend, the model is deployed with the
  //@@     execution accelerators in priority order. The priority is
  //@@     determined by the order in which they are set, i.e. the provider
  //@@     listed first has the highest priority. Overall, the priority will
  //@@     be in the following order:
  //@@         <gpu_execution_accelerator> (if instance is on GPU)
  //@@         CUDA Execution Provider     (if instance is on GPU)
  //@@         <cpu_execution_accelerator>
  //@@         Default CPU Execution Provider
  //@@
  message ExecutionAccelerators
  {
    //@@
    //@@  .. cpp:var:: message Accelerator
    //@@
    //@@     Specify the accelerator to be used to execute the model.
    //@@     Accelerator with the same name may accept different parameters
    //@@     depending on the backends.
    //@@
    message Accelerator
    {
      //@@    .. cpp:var:: string name
      //@@
      //@@       The name of the execution accelerator.
      //@@
      string name = 1;

      //@@    .. cpp:var:: map<string, string> parameters
      //@@
      //@@       Additional parameters used to configure the accelerator.
      //@@
      map<string, string> parameters = 2;
    }

    //@@    .. cpp:var:: Accelerator gpu_execution_accelerator (repeated)
    //@@
    //@@       The preferred execution provider to be used if the model instance
    //@@       is deployed on GPU.
    //@@
    //@@       For the ONNX Runtime backend, the possible value for the name
    //@@       is "tensorrt", and no parameters are required.
    //@@
    //@@       For TensorFlow backend, possible values are "tensorrt",
    //@@       "auto_mixed_precision", "gpu_io".
    //@@
    //@@       For "tensorrt", the following parameters can be specified:
    //@@         "precision_mode": The precision used for optimization.
    //@@         Allowed values are "FP32" and "FP16". Default value is "FP32".
    //@@
    //@@         "max_cached_engines": The maximum number of cached TensorRT
    //@@         engines in dynamic TensorRT ops. Default value is 100.
    //@@
    //@@         "minimum_segment_size": The smallest model subgraph that will
    //@@         be considered for optimization by TensorRT. Default value is 3.
    //@@
    //@@         "max_workspace_size_bytes": The maximum GPU memory the model
    //@@         can use temporarily during execution. Default value is 1GB.
    //@@
    //@@       For "auto_mixed_precision", no parameters are required. If set,
    //@@       the model will try to use FP16 for better performance.
    //@@       This optimization cannot be set together with "tensorrt".
    //@@
    //@@       For "gpu_io", no parameters are required. If set, the model will
    //@@       be executed using TensorFlow Callable API to set input and output
    //@@       tensors in GPU memory if possible, which can reduce data transfer
    //@@       overhead if the model is used in ensemble. However, the Callable
    //@@       object will be created on model creation and it will request all
    //@@       outputs for every model execution, which may impact the
    //@@       performance if a request does not require all outputs. This
    //@@       optimization will only take effect if the model instance is
    //@@       created with KIND_GPU.
    //@@
    repeated Accelerator gpu_execution_accelerator = 1;

    //@@    .. cpp:var:: Accelerator cpu_execution_accelerator (repeated)
    //@@
    //@@       The preferred execution provider to be used if the model instance
    //@@       is deployed on CPU.
    //@@
    //@@       For the ONNX Runtime backend, the possible value for the name
    //@@       is "openvino", and no parameters are required.
    //@@
    repeated Accelerator cpu_execution_accelerator = 2;
  }

  //@@
  //@@  .. cpp:var:: message PinnedMemoryBuffer
  //@@
  //@@     Specify whether to use a pinned memory buffer when transferring data
  //@@     between non-pinned system memory and GPU memory. Using a pinned
  //@@     memory buffer for system from/to GPU transfers will typically provide
  //@@     increased performance. For example, in the common use case where the
  //@@     request provides inputs and delivers outputs via non-pinned system
  //@@     memory, if the model instance accepts GPU IOs, the inputs will be
  //@@     processed by two copies: from non-pinned system memory to pinned
  //@@     memory, and from pinned memory to GPU memory. Similarly, pinned
  //@@     memory will be used for delivering the outputs.
  //@@
  message PinnedMemoryBuffer
  {
    //@@    .. cpp:var:: bool enable
    //@@
    //@@       Use pinned memory buffer. Default is true.
    //@@
    bool enable = 1;
  }

  //@@  .. cpp:var:: Graph graph
  //@@
  //@@     The graph optimization setting for the model. Optional.
  //@@
  Graph graph = 1;

  //@@  .. cpp:var:: ModelPriority priority
  //@@
  //@@     The priority setting for the model. Optional.
  //@@
  ModelPriority priority = 2;

  //@@  .. cpp:var:: Cuda cuda
  //@@
  //@@     CUDA-specific optimization settings. Optional.
  //@@
  Cuda cuda = 3;

  //@@  .. cpp:var:: ExecutionAccelerators execution_accelerators
  //@@
  //@@     The accelerators used for the model. Optional.
  //@@
  ExecutionAccelerators execution_accelerators = 4;

  //@@  .. cpp:var:: PinnedMemoryBuffer input_pinned_memory
  //@@
  //@@     Use pinned memory buffer when the data transfer for inputs
  //@@     is between GPU memory and non-pinned system memory.
  //@@     Default is true.
  //@@
  PinnedMemoryBuffer input_pinned_memory = 5;

  //@@  .. cpp:var:: PinnedMemoryBuffer output_pinned_memory
  //@@
  //@@     Use pinned memory buffer when the data transfer for outputs
  //@@     is between GPU memory and non-pinned system memory.
  //@@     Default is true.
  //@@
  PinnedMemoryBuffer output_pinned_memory = 6;

  //@@  .. cpp:var:: uint32 gather_kernel_buffer_threshold
  //@@
  //@@     The backend may use a gather kernel to gather input data if the
  //@@     device has direct access to the source buffer and the destination
  //@@     buffer. In such case, the gather kernel will be used only if the
  //@@     number of buffers to be gathered is greater than or equal to
  //@@     the specified value. If 0, the gather kernel will be disabled.
  //@@     Default value is 0.
  //@@     Currently only recognized by TensorRT backend.
  //@@
  uint32 gather_kernel_buffer_threshold = 7;

  //@@  .. cpp:var:: bool eager_batching
  //@@
  //@@     Start preparing the next batch before the model instance is ready
  //@@     for the next inference. This option can be used to overlap the
  //@@     batch preparation with model execution, with the trade-off that
  //@@     the next batch might be smaller than what it could have been.
  //@@     Default value is false.
  //@@     Currently only recognized by TensorRT backend.
  //@@
  bool eager_batching = 8;
}
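
// For illustration only (not part of the schema): a config.pbtxt snippet
// enabling the TensorRT execution accelerator on GPU for an ONNX Runtime
// model, with FP16 precision (parameter values are examples only):
//
//   optimization {
//     execution_accelerators {
//       gpu_execution_accelerator: [
//         {
//           name: "tensorrt"
//           parameters { key: "precision_mode" value: "FP16" }
//         }
//       ]
//     }
//   }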

//@@
//@@.. cpp:var:: message ModelQueuePolicy
//@@
//@@   Queue policy for inference requests.
//@@
message ModelQueuePolicy
{
  //@@
  //@@  .. cpp:enum:: TimeoutAction
  //@@
  //@@     The action applied to timed-out requests.
  //@@
  enum TimeoutAction {
    //@@    .. cpp:enumerator:: TimeoutAction::REJECT = 0
    //@@
    //@@       Reject the request and return error message accordingly.
    //@@
    REJECT = 0;

    //@@    .. cpp:enumerator:: TimeoutAction::DELAY = 1
    //@@
    //@@       Delay the request until all other requests at the same
    //@@       (or higher) priority levels that have not reached their timeouts
    //@@       are processed. A delayed request will eventually be processed,
    //@@       but may be delayed indefinitely due to newly arriving requests.
    //@@
    DELAY = 1;
  }

  //@@
  //@@  .. cpp:var:: TimeoutAction timeout_action
  //@@
  //@@     The action applied to timed-out request.
  //@@     The default action is REJECT.
  //@@
  TimeoutAction timeout_action = 1;

  //@@
  //@@  .. cpp:var:: uint64 default_timeout_microseconds
  //@@
  //@@     The default timeout for every request, in microseconds.
  //@@     The default value is 0 which indicates that no timeout is set.
  //@@
  uint64 default_timeout_microseconds = 2;

  //@@
  //@@  .. cpp:var:: bool allow_timeout_override
  //@@
  //@@     Whether an individual request can override the default timeout value.
  //@@     When true, individual requests can set a timeout that is less than
  //@@     the default timeout value but may not increase the timeout.
  //@@     The default value is false.
  //@@
  bool allow_timeout_override = 3;

  //@@
  //@@  .. cpp:var:: uint32 max_queue_size
  //@@
  //@@     The maximum queue size for holding requests. A request will be
  //@@     rejected immediately if it can't be enqueued because the queue is
  //@@     full. The default value is 0 which indicates that no maximum
  //@@     queue size is enforced.
  //@@
  uint32 max_queue_size = 4;
}
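
// For illustration only (not part of the schema): a queue policy is set
// through the dynamic batcher (see ModelDynamicBatching below), e.g. to
// reject requests older than 100ms and cap the queue at 16 requests:
//
//   dynamic_batching {
//     default_queue_policy {
//       timeout_action: REJECT
//       default_timeout_microseconds: 100000
//       allow_timeout_override: true
//       max_queue_size: 16
//     }
//   }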

//@@
//@@.. cpp:var:: message ModelDynamicBatching
//@@
//@@   Dynamic batching configuration. These settings control how dynamic
//@@   batching operates for the model.
//@@
message ModelDynamicBatching
{
  //@@  .. cpp:var:: int32 preferred_batch_size (repeated)
  //@@
  //@@     Preferred batch sizes for dynamic batching. If a batch of one of
  //@@     these sizes can be formed it will be executed immediately.  If
  //@@     not specified a preferred batch size will be chosen automatically
  //@@     based on model and GPU characteristics.
  //@@
  repeated int32 preferred_batch_size = 1;

  //@@  .. cpp:var:: uint64 max_queue_delay_microseconds
  //@@
  //@@     The maximum time, in microseconds, a request will be delayed in
  //@@     the scheduling queue to wait for additional requests for
  //@@     batching. Default is 0.
  //@@
  uint64 max_queue_delay_microseconds = 2;

  //@@  .. cpp:var:: bool preserve_ordering
  //@@
  //@@     Should the dynamic batcher preserve the ordering of responses to
  //@@     match the order of requests received by the scheduler. Default is
  //@@     false. If true, the responses will be returned in the same order as
  //@@     the order of requests sent to the scheduler. If false, the responses
  //@@     may be returned in arbitrary order. This option is specifically
  //@@     needed when a sequence of related inference requests (i.e. inference
  //@@     requests with the same correlation ID) is sent to the dynamic
  //@@     batcher to ensure that the sequence responses are in the correct
  //@@     order.
  //@@
  bool preserve_ordering = 3;

  //@@  .. cpp:var:: uint32 priority_levels
  //@@
  //@@     The number of priority levels to be enabled for the model.
  //@@     Priority levels start from 1, with 1 being the highest priority.
  //@@     Requests are handled in priority order with all priority 1 requests
  //@@     processed before priority 2, all priority 2 requests processed before
  //@@     priority 3, etc. Requests with the same priority level will be
  //@@     handled in the order that they are received.
  //@@
  uint32 priority_levels = 4;

  //@@  .. cpp:var:: uint32 default_priority_level
  //@@
  //@@     The priority level used for requests that don't specify their
  //@@     priority. The value must be in the range [ 1, 'priority_levels' ].
  //@@
  uint32 default_priority_level = 5;

  //@@  .. cpp:var:: ModelQueuePolicy default_queue_policy
  //@@
  //@@     The default queue policy used for requests that don't require
  //@@     priority handling and requests that specify priority levels where
  //@@     there is no specific policy given. If not specified, a policy with
  //@@     default field values will be used.
  //@@
  ModelQueuePolicy default_queue_policy = 6;

  //@@  .. cpp:var:: map<uint32, ModelQueuePolicy> priority_queue_policy
  //@@
  //@@     Specify the queue policy for the priority level. The default queue
  //@@     policy will be used if a priority level doesn't specify a queue
  //@@     policy.
  //@@
  map<uint32, ModelQueuePolicy> priority_queue_policy = 7;
}
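
// For illustration only (not part of the schema): a config.pbtxt snippet
// enabling dynamic batching with preferred sizes, a small batching window,
// and two priority levels:
//
//   dynamic_batching {
//     preferred_batch_size: [ 4, 8 ]
//     max_queue_delay_microseconds: 100
//     priority_levels: 2
//     default_priority_level: 2
//   }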

//@@
//@@.. cpp:var:: message ModelSequenceBatching
//@@
//@@   Sequence batching configuration. These settings control how sequence
//@@   batching operates for the model.
//@@
message ModelSequenceBatching
{
  //@@  .. cpp:var:: message Control
  //@@
  //@@     A control is a signal that the sequence batcher uses to
  //@@     communicate with a backend.
  //@@
  message Control
  {
    //@@
    //@@    .. cpp:enum:: Kind
    //@@
    //@@       The kind of the control.
    //@@
    enum Kind {
      //@@      .. cpp:enumerator:: Kind::CONTROL_SEQUENCE_START = 0
      //@@
      //@@         A new sequence is/is-not starting. If true a sequence is
      //@@         starting, if false a sequence is continuing. Must
      //@@         specify either int32_false_true, fp32_false_true or
      //@@         bool_false_true for this control. This control is optional.
      //@@
      CONTROL_SEQUENCE_START = 0;

      //@@      .. cpp:enumerator:: Kind::CONTROL_SEQUENCE_READY = 1
      //@@
      //@@         A sequence is/is-not ready for inference. If true the
      //@@         input tensor data is valid and should be used. If false
      //@@         the input tensor data is invalid and inferencing should
      //@@         be "skipped". Must specify either int32_false_true,
      //@@         fp32_false_true or bool_false_true for this control. This
      //@@         control is optional.
      //@@
      CONTROL_SEQUENCE_READY = 1;

      //@@      .. cpp:enumerator:: Kind::CONTROL_SEQUENCE_END = 2
      //@@
      //@@         A sequence is/is-not ending. If true a sequence is
      //@@         ending, if false a sequence is continuing. Must specify
      //@@         either int32_false_true, fp32_false_true or bool_false_true
      //@@         for this control. This control is optional.
      //@@
      CONTROL_SEQUENCE_END = 2;

      //@@      .. cpp:enumerator:: Kind::CONTROL_SEQUENCE_CORRID = 3
      //@@
      //@@         The correlation ID of the sequence. The correlation ID
      //@@         is a uint64_t value that is communicated in whole or
      //@@         in part by the tensor. The tensor's datatype must be
      //@@         specified by data_type and must be TYPE_UINT64, TYPE_INT64,
      //@@         TYPE_UINT32 or TYPE_INT32. If a 32-bit datatype is specified
      //@@         the correlation ID will be truncated to the low-order 32
      //@@         bits. This control is optional.
      //@@
      CONTROL_SEQUENCE_CORRID = 3;
    }

    //@@    .. cpp:var:: Kind kind
    //@@
    //@@       The kind of this control.
    //@@
    Kind kind = 1;

    //@@    .. cpp:var:: int32 int32_false_true (repeated)
    //@@
    //@@       The control's true and false setting is indicated by setting
    //@@       a value in an int32 tensor. The tensor must be a
    //@@       1-dimensional tensor with size equal to the batch size of
    //@@       the request. 'int32_false_true' must have two entries: the
    //@@       first the false value and the second the true value.
    //@@
    repeated int32 int32_false_true = 2;

    //@@    .. cpp:var:: float fp32_false_true (repeated)
    //@@
    //@@       The control's true and false setting is indicated by setting
    //@@       a value in a fp32 tensor. The tensor must be a
    //@@       1-dimensional tensor with size equal to the batch size of
    //@@       the request. 'fp32_false_true' must have two entries: the
    //@@       first the false value and the second the true value.
    //@@
    repeated float fp32_false_true = 3;

    //@@    .. cpp:var:: bool bool_false_true (repeated)
    //@@
    //@@       The control's true and false setting is indicated by setting
    //@@       a value in a bool tensor. The tensor must be a
    //@@       1-dimensional tensor with size equal to the batch size of
    //@@       the request. 'bool_false_true' must have two entries: the
    //@@       first the false value and the second the true value.
    //@@
    repeated bool bool_false_true = 5;

    //@@    .. cpp:var:: DataType data_type
    //@@
    //@@       The control's datatype.
    //@@
    DataType data_type = 4;
  }

  //@@  .. cpp:var:: message ControlInput
  //@@
  //@@     The sequence control values to communicate by a model input.
  //@@
  message ControlInput
  {
    //@@    .. cpp:var:: string name
    //@@
    //@@       The name of the model input.
    //@@
    string name = 1;

    //@@    .. cpp:var:: Control control (repeated)
    //@@
    //@@       The control value(s) that should be communicated to the
    //@@       model using this model input.
    //@@
    repeated Control control = 2;
  }
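
  // For illustration only (not part of the schema): assuming hypothetical
  // model inputs "START" and "READY", the sequence start/ready controls can
  // be mapped onto them in config.pbtxt as:
  //
  //   sequence_batching {
  //     control_input [
  //       {
  //         name: "START"
  //         control [
  //           { kind: CONTROL_SEQUENCE_START, fp32_false_true: [ 0, 1 ] }
  //         ]
  //       },
  //       {
  //         name: "READY"
  //         control [
  //           { kind: CONTROL_SEQUENCE_READY, fp32_false_true: [ 0, 1 ] }
  //         ]
  //       }
  //     ]
  //   }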

  //@@
  //@@  .. cpp:var:: message InitialState
  //@@
  //@@     Settings used to initialize data for implicit state.
  //@@
  message InitialState
  {
    //@@      .. cpp:var:: DataType data_type
    //@@
    //@@         The data-type of the state.
    //@@
    DataType data_type = 1;

    //@@      .. cpp:var:: int64 dims (repeated)
    //@@
    //@@         The shape of the state tensor, not including the batch dimension.
    //@@
    repeated int64 dims = 2;

    //@@      .. cpp:var:: oneof state_data
    //@@
    //@@         Specify how the initial state data is generated.
    //@@
    oneof state_data
    {
      //@@
      //@@      .. cpp:var:: bool zero_data
      //@@
      //@@         The identifier for using zeros as initial state data.
      //@@         Note that the value of 'zero_data' will not be checked,
      //@@         instead, zero data will be used as long as the field is set.
      //@@
      bool zero_data = 3;

      //@@      .. cpp:var:: string data_file
      //@@
      //@@         The file whose content will be used as the initial data for
      //@@         the state in row-major order. The file must be provided in
      //@@         sub-directory 'initial_state' under the model directory.
      //@@
      string data_file = 4;
    }

    //@@  .. cpp:var:: string name
    //@@
    //@@     The name of the state initialization.
    //@@
    string name = 5;
  }

  //@@  .. cpp:var:: message State
  //@@
  //@@     An input / output pair of tensors that carry state for the sequence.
  //@@
  message State
  {
    //@@    .. cpp:var:: string input_name
    //@@
    //@@       The name of the model state input.
    //@@
    string input_name = 1;

    //@@    .. cpp:var:: string output_name
    //@@
    //@@       The name of the model state output.
    //@@
    string output_name = 2;

    //@@    .. cpp:var:: DataType data_type
    //@@
    //@@       The data-type of the state.
    //@@
    DataType data_type = 3;

    //@@    .. cpp:var:: int64 dims (repeated)
    //@@
    //@@       The dimensions/shape of the state tensor.
    //@@
    repeated int64 dims = 4;

    //@@  .. cpp:var:: InitialState initial_state (repeated)
    //@@
    //@@     The optional field to specify the initial state for the model.
    //@@
    repeated InitialState initial_state = 5;
  }
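
  // For illustration only (not part of the schema): a hypothetical implicit
  // state carried between the requests of a sequence, zero-initialized at
  // the start of each sequence:
  //
  //   sequence_batching {
  //     state [
  //       {
  //         input_name: "INPUT_STATE"
  //         output_name: "OUTPUT_STATE"
  //         data_type: TYPE_INT32
  //         dims: [ -1 ]
  //         initial_state: {
  //           data_type: TYPE_INT32
  //           dims: [ 1 ]
  //           zero_data: true
  //           name: "initial state"
  //         }
  //       }
  //     ]
  //   }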

  //@@  .. cpp:var:: message StrategyDirect
  //@@
  //@@     The sequence batcher uses a specific, unique batch
  //@@     slot for each sequence. All inference requests in a
  //@@     sequence are directed to the same batch slot in the same
  //@@     model instance over the lifetime of the sequence. This
  //@@     is the default strategy.
  //@@
  message StrategyDirect
  {
    //@@    .. cpp:var:: uint64 max_queue_delay_microseconds
    //@@
    //@@       The maximum time, in microseconds, a candidate request
    //@@       will be delayed in the sequence batch scheduling queue to
    //@@       wait for additional requests for batching. Default is 0.
    //@@
    uint64 max_queue_delay_microseconds = 1;

    //@@    .. cpp:var:: float minimum_slot_utilization
    //@@
    //@@       The minimum slot utilization that must be satisfied to
    //@@       execute the batch before 'max_queue_delay_microseconds' expires.
    //@@       For example, a value of 0.5 indicates that the batch should be
    //@@       executed as soon as 50% or more of the slots are ready even if
    //@@       the 'max_queue_delay_microseconds' timeout has not expired.
    //@@       The default is 0.0, indicating that a batch will be executed
    //@@       before the 'max_queue_delay_microseconds' timeout expires if at
    //@@       least one batch slot is ready. 'max_queue_delay_microseconds'
    //@@       will be ignored unless 'minimum_slot_utilization' is set to a
    //@@       non-zero
    //@@       value.
    //@@
    float minimum_slot_utilization = 2;
  }

  //@@  .. cpp:var:: message StrategyOldest
  //@@
  //@@     The sequence batcher maintains up to 'max_candidate_sequences'
  //@@     candidate sequences. 'max_candidate_sequences' can be greater
  //@@     than the model's 'max_batch_size'. For inferencing the batcher
  //@@     chooses from the candidate sequences up to 'max_batch_size'
  //@@     inference requests. Requests are chosen in an oldest-first
  //@@     manner across all candidate sequences. A given sequence is
  //@@     not guaranteed to be assigned to the same batch slot for
  //@@     all inference requests of that sequence.
  //@@
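  //@@     A sketch of an 'oldest' configuration in config.pbtxt (all
  //@@     values are illustrative)::
  //@@
  //@@       sequence_batching {
  //@@         oldest {
  //@@           max_candidate_sequences: 4
  //@@           preferred_batch_size: [ 2, 4 ]
  //@@           max_queue_delay_microseconds: 100
  //@@         }
  //@@       }
  //@@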
  message StrategyOldest
  {
    //@@    .. cpp:var:: int32 max_candidate_sequences
    //@@
    //@@       Maximum number of candidate sequences that the batcher
    //@@       maintains. Excess sequences are kept in an ordered backlog
    //@@       and become candidates when existing candidate sequences
    //@@       complete.
    //@@
    int32 max_candidate_sequences = 1;

    //@@    .. cpp:var:: int32 preferred_batch_size (repeated)
    //@@
    //@@       Preferred batch sizes for dynamic batching of candidate
    //@@       sequences. If a batch of one of these sizes can be formed
    //@@       it will be executed immediately. If not specified a
    //@@       preferred batch size will be chosen automatically
    //@@       based on model and GPU characteristics.
    //@@
    repeated int32 preferred_batch_size = 2;

    //@@    .. cpp:var:: uint64 max_queue_delay_microseconds
    //@@
    //@@       The maximum time, in microseconds, a candidate request
    //@@       will be delayed in the dynamic batch scheduling queue to
    //@@       wait for additional requests for batching. Default is 0.
    //@@
    uint64 max_queue_delay_microseconds = 3;
  }

  //@@  .. cpp:var:: oneof strategy_choice
  //@@
  //@@     The strategy used by the sequence batcher. Default strategy
  //@@     is 'direct'.
  //@@
  oneof strategy_choice
  {
    //@@    .. cpp:var:: StrategyDirect direct
    //@@
    //@@       StrategyDirect scheduling strategy.
    //@@
    StrategyDirect direct = 3;

    //@@    .. cpp:var:: StrategyOldest oldest
    //@@
    //@@       StrategyOldest scheduling strategy.
    //@@
    StrategyOldest oldest = 4;
  }

  //@@  .. cpp:var:: uint64 max_sequence_idle_microseconds
  //@@
  //@@     The maximum time, in microseconds, that a sequence is allowed to
  //@@     be idle before it is aborted. The inference server considers a
  //@@     sequence idle when it does not have any inference request queued
  //@@     for the sequence. If this limit is exceeded, the inference server
  //@@     will free the sequence slot allocated to the sequence and make it
  //@@     available for another sequence. If not specified (or specified as
  //@@     zero) a default value of 1000000 (1 second) is used.
  //@@
  uint64 max_sequence_idle_microseconds = 1;

  //@@  .. cpp:var:: ControlInput control_input (repeated)
  //@@
  //@@     The model input(s) that the server should use to communicate
  //@@     sequence start, stop, ready and similar control values to the
  //@@     model.
  //@@
  repeated ControlInput control_input = 2;

  //@@  .. cpp:var:: State state (repeated)
  //@@
  //@@     The optional state that can be stored in Triton for performing
  //@@     inference requests on a sequence. Each sequence holds an implicit
  //@@     state local to itself. The output state tensor provided by the
  //@@     model in 'output_name' field of the current inference request will
  //@@     be transferred as an input tensor named 'input_name' in the next
  //@@     request of the same sequence. The input state of the first request
  //@@     in the sequence contains garbage data.
  //@@
  repeated State state = 5;
}

//@@
//@@.. cpp:var:: message ModelEnsembling
//@@
//@@   Model ensembling configuration. These settings specify the models that
//@@   compose the ensemble and how data flows between the models.
//@@
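//@@   For illustration, a two-step ensemble that feeds a preprocessing
//@@   model into a classifier could be declared as follows (all model and
//@@   tensor names are hypothetical)::
//@@
//@@     ensemble_scheduling {
//@@       step [
//@@         {
//@@           model_name: "preprocess"
//@@           model_version: -1
//@@           # model input "RAW" is fed by ensemble input "IMAGE"
//@@           input_map { key: "RAW" value: "IMAGE" }
//@@           # model output "PROCESSED" becomes internal tensor
//@@           # "preprocessed"
//@@           output_map { key: "PROCESSED" value: "preprocessed" }
//@@         },
//@@         {
//@@           model_name: "classifier"
//@@           model_version: -1
//@@           input_map { key: "INPUT" value: "preprocessed" }
//@@           output_map { key: "SCORES" value: "CLASS_SCORES" }
//@@         }
//@@       ]
//@@     }
//@@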
message ModelEnsembling
{
  //@@  .. cpp:var:: message Step
  //@@
  //@@     Each step specifies a model included in the ensemble,
  //@@     maps ensemble tensor names to the model input tensors,
  //@@     and maps model output tensors to ensemble tensor names.
  //@@
  message Step
  {
    //@@  .. cpp:var:: string model_name
    //@@
    //@@     The name of the model to execute for this step of the ensemble.
    //@@
    string model_name = 1;

    //@@  .. cpp:var:: int64 model_version
    //@@
    //@@     The version of the model to use for inference. If -1
    //@@     the latest/most-recent version of the model is used.
    //@@
    int64 model_version = 2;

    //@@  .. cpp:var:: map<string,string> input_map
    //@@
    //@@     Map from name of an input tensor on this step's model to ensemble
    //@@     tensor name. The ensemble tensor must have the same data type and
    //@@     shape as the model input. Each model input must be assigned to
    //@@     one ensemble tensor, but the same ensemble tensor can be assigned
    //@@     to multiple model inputs.
    //@@
    map<string, string> input_map = 3;

    //@@  .. cpp:var:: map<string,string> output_map
    //@@
    //@@     Map from name of an output tensor on this step's model to ensemble
    //@@     tensor name. The data type and shape of the ensemble tensor will
    //@@     be inferred from the model output. Not all model outputs need to
    //@@     be assigned to an ensemble tensor. One ensemble tensor name
    //@@     can appear in an output map only once.
    //@@
    map<string, string> output_map = 4;
  }

  //@@  .. cpp:var:: Step step (repeated)
  //@@
  //@@     The models and the input / output mappings used within the ensemble.
  //@@
  repeated Step step = 1;
}

//@@
//@@.. cpp:var:: message ModelParameter
//@@
//@@   A model parameter.
//@@
message ModelParameter
{
  //@@  .. cpp:var:: string string_value
  //@@
  //@@     The string value of the parameter.
  //@@
  string string_value = 1;
}

//@@
//@@.. cpp:var:: message ModelWarmup
//@@
//@@   Settings used to construct the request sample for model warmup.
//@@
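//@@   A sketch of a warmup sample in config.pbtxt (the sample name, input
//@@   name, shape and iteration count are illustrative)::
//@@
//@@     model_warmup [
//@@       {
//@@         name: "zero data sample"
//@@         batch_size: 1
//@@         inputs {
//@@           key: "INPUT0"
//@@           value: {
//@@             data_type: TYPE_FP32
//@@             dims: [ 16 ]
//@@             zero_data: true
//@@           }
//@@         }
//@@         count: 2
//@@       }
//@@     ]
//@@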
message ModelWarmup
{
  //@@
  //@@  .. cpp:var:: message Input
  //@@
  //@@     Metadata associated with an input.
  //@@
  message Input
  {
    //@@    .. cpp:var:: DataType data_type
    //@@
    //@@       The data-type of the input.
    //@@
    DataType data_type = 1;

    //@@    .. cpp:var:: int64 dims (repeated)
    //@@
    //@@       The shape of the input tensor, not including the batch dimension.
    //@@
    repeated int64 dims = 2;

    //@@    .. cpp:var:: oneof input_data_type
    //@@
    //@@       Specify how the input data is generated. If the input has STRING
    //@@       data type and 'random_data' is set, the data generation will fall
    //@@       back to 'zero_data'.
    //@@
    oneof input_data_type
    {
      //@@
      //@@    .. cpp:var:: bool zero_data
      //@@
      //@@       The identifier for using zeros as input data. Note that the
      //@@       value of 'zero_data' will not be checked, instead, zero data
      //@@       will be used as long as the field is set.
      //@@
      bool zero_data = 3;

      //@@
      //@@    .. cpp:var:: bool random_data
      //@@
      //@@       The identifier for using random data as input data. Note that
      //@@       the value of 'random_data' will not be checked, instead,
      //@@       random data will be used as long as the field is set.
      //@@
      bool random_data = 4;

      //@@    .. cpp:var:: string input_data_file
      //@@
      //@@       The file whose content will be used as raw input data in
      //@@       row-major order. The file must be provided in a sub-directory
      //@@       'warmup' under the model directory. The file contents should be
      //@@       in binary format. For TYPE_STRING data-type, an element is
      //@@       represented by a 4-byte unsigned integer giving the length 
      //@@       followed by the actual bytes.
      //@@
      string input_data_file = 5;
    }
  }

  //@@  .. cpp:var:: string name
  //@@
  //@@     The name of the request sample.
  //@@
  string name = 1;

  //@@  .. cpp:var:: uint32 batch_size
  //@@
  //@@     The batch size of the inference request. This must be >= 1. For
  //@@     models that don't support batching, batch_size must be 1. If
  //@@     batch_size > 1, the 'inputs' specified below will be duplicated to
  //@@     match the batch size requested.
  //@@
  uint32 batch_size = 2;

  //@@  .. cpp:var:: map<string, Input> inputs
  //@@
  //@@     The warmup metadata associated with every model input, including
  //@@     control tensors.
  //@@
  map<string, Input> inputs = 3;

  //@@  .. cpp:var:: uint32 count
  //@@
  //@@     The number of iterations that this warmup sample will be executed.
  //@@     For example, if this field is set to 2, 2 model executions using this
  //@@     sample will be scheduled for warmup. Default value is 0 which
  //@@     indicates that this sample will be used only once.
  //@@     Note that for sequence models, 'count' may not work well
  //@@     because the model often expects a valid sequence of requests, which
  //@@     should be represented by a series of warmup samples. 'count > 1'
  //@@     essentially "resends" one of the samples, which may invalidate the
  //@@     sequence and result in unexpected warmup failures.
  //@@
  uint32 count = 4;
}

//@@
//@@ .. cpp:var:: message ModelOperations
//@@
//@@    The metadata of libraries providing custom operations for this model.
//@@
message ModelOperations
{
  //@@  .. cpp:var:: string op_library_filename (repeated)
  //@@
  //@@     Optional paths of the libraries providing custom operations for
  //@@     this model. Valid only for ONNX models.
  //@@
  repeated string op_library_filename = 1;
}

//@@
//@@ .. cpp:var:: message ModelTransactionPolicy
//@@
//@@    The specification that describes the nature of transactions
//@@    to be expected from the model.
//@@
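//@@    For example, a decoupled model would declare::
//@@
//@@      model_transaction_policy { decoupled: true }
//@@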
message ModelTransactionPolicy
{
  //@@  .. cpp:var:: bool decoupled
  //@@
  //@@     Indicates whether responses generated by the model are decoupled
  //@@     from the requests issued to it, which means the number of responses
  //@@     generated by the model may differ from the number of requests
  //@@     issued, and that the responses may be out of order relative to the
  //@@     order of requests. The default is false, which means the model will
  //@@     generate exactly one response for each request.
  //@@
  bool decoupled = 1;
}

//@@
//@@.. cpp:var:: message ModelRepositoryAgents
//@@
//@@   The repository agents for the model.
//@@
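//@@   As an illustration, a single agent with one parameter could be
//@@   configured as follows (the agent name and parameter are
//@@   hypothetical)::
//@@
//@@     model_repository_agents {
//@@       agents [
//@@         {
//@@           name: "example_agent"
//@@           parameters { key: "config_key" value: "config_value" }
//@@         }
//@@       ]
//@@     }
//@@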
message ModelRepositoryAgents
{
  //@@
  //@@  .. cpp:var:: message Agent
  //@@
  //@@     A repository agent that should be invoked for the specified
  //@@     repository actions for this model.
  //@@
  message Agent
  {
    //@@    .. cpp:var:: string name
    //@@
    //@@       The name of the agent.
    //@@
    string name = 1;

    //@@    .. cpp:var:: map<string, string> parameters
    //@@
    //@@       The parameters for the agent.
    //@@
    map<string, string> parameters = 2;
  }

  //@@
  //@@  .. cpp:var:: Agent agents (repeated)
  //@@
  //@@     The ordered list of agents for the model. These agents will be
  //@@     invoked in order to respond to repository actions occurring for the
  //@@     model.
  //@@
  repeated Agent agents = 1;
}

//@@
//@@.. cpp:var:: message ModelResponseCache
//@@
//@@   The response cache setting for the model.
//@@
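//@@   For example, caching can be enabled for a model with::
//@@
//@@     response_cache { enable: true }
//@@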
message ModelResponseCache
{
  //@@
  //@@  .. cpp:var:: bool enable
  //@@
  //@@     Whether or not to use the response cache for the model. If true,
  //@@     responses from the model are cached, and when an identical request
  //@@     is encountered the cached response is returned instead of running
  //@@     the model execution. By default, the response cache is disabled
  //@@     for all models.
  //@@
  bool enable = 1;
}

//@@
//@@.. cpp:var:: message ModelConfig
//@@
//@@   A model configuration.
//@@
message ModelConfig
{
  //@@  .. cpp:var:: string name
  //@@
  //@@     The name of the model.
  //@@
  string name = 1;

  //@@  .. cpp:var:: string platform
  //@@
  //@@     The framework for the model. Possible values are
  //@@     "tensorrt_plan", "tensorflow_graphdef",
  //@@     "tensorflow_savedmodel", "onnxruntime_onnx",
  //@@     "pytorch_libtorch".
  //@@
  string platform = 2;

  //@@  .. cpp:var:: string backend
  //@@
  //@@     The backend used by the model.
  //@@
  string backend = 17;

  //@@  .. cpp:var:: ModelVersionPolicy version_policy
  //@@
  //@@     Policy indicating which version(s) of the model will be served.
  //@@
  ModelVersionPolicy version_policy = 3;

  //@@  .. cpp:var:: int32 max_batch_size
  //@@
  //@@     Maximum batch size allowed for inference. This can only decrease
  //@@     what is allowed by the model itself. A max_batch_size value of 0
  //@@     indicates that batching is not allowed for the model and the
  //@@     dimension/shape of the input and output tensors must exactly
  //@@     match what is specified in the input and output configuration. A
  //@@     max_batch_size value > 0 indicates that batching is allowed and
  //@@     so the model expects the input tensors to have an additional
  //@@     initial dimension for the batching that is not specified in the
  //@@     input (for example, if the model supports batched inputs of
  //@@     2-dimensional tensors then the model configuration will specify
  //@@     the input shape as [ X, Y ] but the model will expect the actual
  //@@     input tensors to have shape [ N, X, Y ]). For max_batch_size > 0
  //@@     returned outputs will also have an additional initial dimension
  //@@     for the batch.
  //@@
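  //@@     As a concrete sketch, the following (hypothetical) configuration
  //@@     declares a batchable input of shape [ 16 ]; at runtime the model
  //@@     receives input tensors of shape [ N, 16 ] with N <= 8::
  //@@
  //@@       max_batch_size: 8
  //@@       input [
  //@@         {
  //@@           name: "INPUT0"
  //@@           data_type: TYPE_FP32
  //@@           dims: [ 16 ]
  //@@         }
  //@@       ]
  //@@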
  int32 max_batch_size = 4;

  //@@  .. cpp:var:: ModelInput input (repeated)
  //@@
  //@@     The inputs requested by the model.
  //@@
  repeated ModelInput input = 5;

  //@@  .. cpp:var:: ModelOutput output (repeated)
  //@@
  //@@     The outputs produced by the model.
  //@@
  repeated ModelOutput output = 6;

  //@@  .. cpp:var:: BatchInput batch_input (repeated)
  //@@
  //@@     The model input(s) that the server should use to communicate
  //@@     batch related values to the model.
  //@@
  repeated BatchInput batch_input = 20;

  //@@  .. cpp:var:: BatchOutput batch_output (repeated)
  //@@
  //@@     The outputs produced by the model that require special handling
  //@@     by the model backend.
  //@@
  repeated BatchOutput batch_output = 21;

  //@@  .. cpp:var:: ModelOptimizationPolicy optimization
  //@@
  //@@     Optimization configuration for the model. If not specified
  //@@     then default optimization policy is used.
  //@@
  ModelOptimizationPolicy optimization = 12;

  //@@  .. cpp:var:: oneof scheduling_choice
  //@@
  //@@     The scheduling policy for the model. If not specified the
  //@@     default scheduling policy is used for the model. The default
  //@@     policy is to execute each inference request independently.
  //@@
  oneof scheduling_choice
  {
    //@@    .. cpp:var:: ModelDynamicBatching dynamic_batching
    //@@
    //@@       If specified, enables the dynamic-batching scheduling
    //@@       policy. With dynamic-batching the scheduler may group
    //@@       together independent requests into a single batch to
    //@@       improve inference throughput.
    //@@
    ModelDynamicBatching dynamic_batching = 11;

    //@@    .. cpp:var:: ModelSequenceBatching sequence_batching
    //@@
    //@@       If specified, enables the sequence-batching scheduling
    //@@       policy. With sequence-batching, inference requests
    //@@       with the same correlation ID are routed to the same
    //@@       model instance. Multiple sequences of inference requests
    //@@       may be batched together into a single batch to
    //@@       improve inference throughput.
    //@@
    ModelSequenceBatching sequence_batching = 13;

    //@@    .. cpp:var:: ModelEnsembling ensemble_scheduling
    //@@
    //@@       If specified, enables the model-ensembling scheduling
    //@@       policy. With model-ensembling, inference requests
    //@@       will be processed according to the specification, such as an
    //@@       execution sequence of models. The input specified in this model
    //@@       config will be the input for the ensemble, and the output
    //@@       specified will be the output of the ensemble.
    //@@
    ModelEnsembling ensemble_scheduling = 15;
  }

  //@@  .. cpp:var:: ModelInstanceGroup instance_group (repeated)
  //@@
  //@@     Instances of this model. If not specified, one instance
  //@@     of the model will be instantiated on each available GPU.
  //@@
  repeated ModelInstanceGroup instance_group = 7;

  //@@  .. cpp:var:: string default_model_filename
  //@@
  //@@     Optional filename of the model file to use if a
  //@@     compute-capability specific model is not specified in
  //@@     :cpp:var:`cc_model_filenames`. If not specified the default name
  //@@     is 'model.graphdef', 'model.savedmodel', 'model.plan' or
  //@@     'model.pt' depending on the model type.
  //@@
  string default_model_filename = 8;

  //@@  .. cpp:var:: map<string,string> cc_model_filenames
  //@@
  //@@     Optional map from CUDA compute capability to the filename of
  //@@     the model that supports that compute capability. The filename
  //@@     refers to a file within the model version directory.
  //@@
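  //@@     For example (the compute capability and filename are
  //@@     illustrative)::
  //@@
  //@@       cc_model_filenames { key: "7.5" value: "model_sm75.plan" }
  //@@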
  map<string, string> cc_model_filenames = 9;

  //@@  .. cpp:var:: map<string,string> metric_tags
  //@@
  //@@     Optional metric tags. User-specific key-value pairs for metrics
  //@@     reported for this model. These tags are applied to the metrics
  //@@     reported on the HTTP metrics port.
  //@@
  map<string, string> metric_tags = 10;

  //@@  .. cpp:var:: map<string,ModelParameter> parameters
  //@@
  //@@     Optional model parameters. User-specified parameter values.
  //@@
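  //@@     For example (the key and value are illustrative)::
  //@@
  //@@       parameters {
  //@@         key: "custom_key"
  //@@         value: { string_value: "custom_value" }
  //@@       }
  //@@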
  map<string, ModelParameter> parameters = 14;

  //@@  .. cpp:var:: ModelWarmup model_warmup (repeated)
  //@@
  //@@     Warmup setting of this model. If specified, all instances
  //@@     will be run with the request samples in sequence before
  //@@     serving the model.
  //@@     This field can only be specified if the model is not an ensemble
  //@@     model.
  //@@
  repeated ModelWarmup model_warmup = 16;

  //@@  .. cpp:var:: ModelOperations model_operations
  //@@
  //@@     Optional metadata of the libraries providing custom operations for
  //@@     this model.
  //@@
  ModelOperations model_operations = 18;

  //@@  .. cpp:var:: ModelTransactionPolicy model_transaction_policy
  //@@
  //@@     Optional specification that describes the nature of transactions
  //@@     to be expected from the model.
  //@@
  ModelTransactionPolicy model_transaction_policy = 19;

  //@@  .. cpp:var:: ModelRepositoryAgents model_repository_agents
  //@@
  //@@     Optional specification of the agent(s) that should be invoked
  //@@     when repository actions are performed for this model.
  //@@
  ModelRepositoryAgents model_repository_agents = 23;

  //@@  .. cpp:var:: ModelResponseCache response_cache
  //@@
  //@@     Optional setting for utilizing the response cache for this
  //@@     model.
  //@@
  ModelResponseCache response_cache = 24;
}