"--log-level",type=int,default=1,help="log level applied to all workers"
)
# --- Transport / observability -------------------------------------------
parser.add_argument(
    "--request-plane-uri",
    type=str,
    default="nats://localhost:4223",
    help="URI of request plane",
)
parser.add_argument(
    "--starting-metrics-port",
    type=int,
    default=50000,
    help="Metrics port for first worker. Each worker will expose metrics on subsequent ports, ex. worker 1: 50000, worker 2: 50001, worker 3: 50002",
)

# --- Model / operator repositories ---------------------------------------
parser.add_argument(
    "--operator-repository",
    type=str,
    default=str(default_operator_repository),
    help="operator repository",
)
parser.add_argument(
    "--triton-core-models",
    type=str,
    default=str(default_triton_core_models),
    help="model repository for triton core models.",
)

# --- Encoding stage knobs -------------------------------------------------
parser.add_argument(
    "--encoder-delay-per-token",
    type=float,
    # NOTE: argparse only applies `type` to *string* defaults, so the default
    # must be a float literal itself to match the declared type.
    default=0.0,
    help="Delay per input token. In this toy example can be used to vary the simulated compute load for encoding stage.",
)
parser.add_argument(
    "--encoder-input-copies",
    type=int,
    default=1,
    help="Number of copies of input to create during encoding. In this toy example can be used to vary the memory transferred between encoding and decoding stages.",
)

# --- Worker deployment shapes --------------------------------------------
# Each of these consumes a fixed number of positional values (nargs) and is
# parsed downstream; values are kept as strings here.
parser.add_argument(
    "--encoders",
    type=str,
    nargs=4,
    default=["1", "1", "1", "CPU"],
    help="Number of encoding workers to deploy. Specified as #Workers, #MaxInflightRequests, #ModelInstancesPerWorker, CPU || GPU",
)
parser.add_argument(
    "--decoders",
    type=str,
    nargs=4,
    default=["1", "1", "1", "CPU"],
    help="Number of decoding workers to deploy. Specified as #Workers, #MaxInflightRequests,#ModelInstancesPerWorker, CPU || GPU",
)

# --- Decoding stage knobs -------------------------------------------------
parser.add_argument(
    "--decoder-delay-per-token",
    type=float,
    # Float literal for the same reason as --encoder-delay-per-token.
    default=0.0,
    help="Delay per input token. In this toy example can be used to vary the simulated compute load for decoding stage.",
)
parser.add_argument(
"--encoder-decoders",
type=str,
nargs=2,
default=["1","1"],
help="Number of encode-decode workers to deploy. Specified as #Worker, #MaxInflightRequests",