# The server shuts down gracefully (i.e., lets already-open TCP streams finish)
# after the lease is revoked.
handler=RequestHandler(engine_client)
ifengine_args.is_embedding:
awaitendpoint.serve_endpoint(handler.encode)
else:
awaitendpoint.serve_endpoint(handler.generate)
defcmd_line_args():
parser=argparse.ArgumentParser(
description="SGLang server integrated with Dynamo LLM."
)
parser.add_argument(
"--endpoint",
type=str,
default=DEFAULT_ENDPOINT,
help=f"Dynamo endpoint string in 'dyn://namespace.component.endpoint' format. Default: {DEFAULT_ENDPOINT}",
)
parser.add_argument(
"--model-path",
type=str,
default=DEFAULT_MODEL,
help=f"Path to disk model or HuggingFace model identifier to load. Default: {DEFAULT_MODEL}",
)
parser.add_argument(
"--model-name",
type=str,
default="",
help="Name to serve the model under. Defaults to deriving it from model path.",
)
parser.add_argument(
"--base-gpu-id",
type=int,
default=0,
help="The base GPU ID to start allocating GPUs from. Useful when running multiple instances on the same machine.",
)
parser.add_argument(
"--tensor-parallel-size",type=int,default=1,help="Number of GPUs to use."
)
parser.add_argument(
"--kv-block-size",type=int,default=16,help="Size of a KV cache block."
)
parser.add_argument(
"--context-length",
type=int,
default=None,
help="Max model context length. Defaults to models max, usually model_max_length from tokenizer_config.json. Reducing this reduces VRAM requirements.",
)
parser.add_argument(
"--nnodes",type=int,default=1,help="The number of machines SGLang will use"
)
parser.add_argument(
"--node-rank",
type=int,
default=0,
help="Unique number for each node. 0 for the leader.",
)
parser.add_argument(
"--dist-init-addr",
type=str,
default="",
help="Host address (e.g., `192.168.0.2:25000`) of the node with rank 0",
)
parser.add_argument(
"--migration-limit",
type=int,
default=0,
help="Maximum number of times a request may be migrated to a different engine worker. The number may be overridden by the engine.",
)
parser.add_argument(
"--extra-engine-args",
type=str,
default="",
help="Path to a JSON file containing additional keyword arguments to pass to the SGLang Engine.",
)
args=parser.parse_args()
config=Config()
config.model_path=args.model_path
ifargs.model_name:
config.model_name=args.model_name
else:
# This becomes an `Option` on the Rust side
config.model_name=None
endpoint_str=args.endpoint.replace("dyn://","",1)
endpoint_parts=endpoint_str.split(".")
iflen(endpoint_parts)!=3:
logging.error(
f"Invalid endpoint format: '{args.endpoint}'. Expected 'dyn://namespace.component.endpoint' or 'namespace.component.endpoint'."
description="TensorRT-LLM server integrated with Dynamo LLM."
)
parser.add_argument(
"--endpoint",
type=str,
default=DEFAULT_ENDPOINT,
help=f"Dynamo endpoint string in 'dyn://namespace.component.endpoint' format. Default: {DEFAULT_ENDPOINT}",
)
parser.add_argument(
"--model-path",
type=str,
default=DEFAULT_MODEL,
help=f"Path to disk model or HuggingFace model identifier to load. Default: {DEFAULT_MODEL}",
)
parser.add_argument(
"--model-name",
type=str,
default="",
help="Name to serve the model under. Defaults to deriving it from model path.",
)
parser.add_argument(
"--tensor-parallel-size",type=int,default=1,help="Number of GPUs to use."
)
# IMPORTANT: We should ideally not expose this to users. We should be able to
# query the block size from the TRTLLM engine.
parser.add_argument(
"--kv-block-size",type=int,default=32,help="Size of a KV cache block."
)
parser.add_argument(
"--context-length",
type=int,
default=None,
help="This argument is not used by TRTLLM. Please provide max_input_len, max_seq_len and max_output_len in yaml file and point --extra-engine-args to the yaml file.",
)
parser.add_argument(
"--migration-limit",
type=int,
default=0,
help="Maximum number of times a request may be migrated to a different engine worker. The number may be overridden by the engine.",
)
parser.add_argument(
"--extra-engine-args",
type=str,
default="",
help="Path to a YAML file containing additional keyword arguments to pass to the TRTLLM engine.",
)
parser.add_argument(
"--publish-events-and-metrics",
action="store_true",
help="Publish events and metrics to the dynamo components. Note: This is not supported when running in prefill disaggregation mode.",
help="Specifies the task for the engine. Can be specified multiple time for different tasks. Will raise an error if conflicting tasks are specified.",
)
parser.add_argument(
"--remote-prefill-endpoint",
type=str,
default=DEFAULT_PREFILL_ENDPOINT,
help=f"Endpoint(in 'dyn://namespace.component.endpoint' format) to send prefill requests to when running in decode disaggregation mode. Default: {DEFAULT_PREFILL_ENDPOINT}",
)
args=parser.parse_args()
# Validate arguments
ifargs.context_lengthisnotNone:
warnings.warn(
"--context-length is accepted for compatibility but will be ignored for TensorRT-LLM. Please provide max_input_len, max_seq_len and max_output_len in yaml file and point --extra-engine-args to the yaml file.",
# The server shuts down gracefully (i.e., lets already-open TCP streams finish)
# after the lease is revoked.
awaitendpoint.serve_endpoint(handler.generate)
defcmd_line_args():
parser=argparse.ArgumentParser(
description="vLLM server integrated with Dynamo LLM."
)
parser.add_argument(
"--endpoint",
type=str,
default=DEFAULT_ENDPOINT,
help=f"Dynamo endpoint string in 'dyn://namespace.component.endpoint' format. Default: {DEFAULT_ENDPOINT}",
)
parser.add_argument(
"--model-path",
type=str,
default=DEFAULT_MODEL,
help=f"Path to disk model or HuggingFace model identifier to load. Default: {DEFAULT_MODEL}",
)
parser.add_argument(
"--model-name",
type=str,
default="",
help="Name to serve the model under. Defaults to deriving it from model path.",
)
parser.add_argument(
"--tensor-parallel-size",type=int,default=1,help="Number of GPUs to use."
)
parser.add_argument(
"--kv-block-size",type=int,default=16,help="Size of a KV cache block."
)
parser.add_argument(
"--context-length",
type=int,
default=None,
help="Max model context length. Defaults to models max, usually model_max_length from tokenizer_config.json. Reducing this reduces VRAM requirements.",
)
parser.add_argument(
"--migration-limit",
type=int,
default=0,
help="Maximum number of times a request may be migrated to a different engine worker. The number may be overridden by the engine.",
)
parser.add_argument(
"--extra-engine-args",
type=str,
default="",
help="Path to a JSON file containing additional keyword arguments to pass to the vLLM AsyncLLMEngine.",
)
args=parser.parse_args()
config=Config()
config.model_path=args.model_path
ifargs.model_name:
config.model_name=args.model_name
else:
# This becomes an `Option` on the Rust side
config.model_name=None
endpoint_str=args.endpoint.replace("dyn://","",1)
endpoint_parts=endpoint_str.split(".")
iflen(endpoint_parts)!=3:
logging.error(
f"Invalid endpoint format: '{args.endpoint}'. Expected 'dyn://namespace.component.endpoint' or 'namespace.component.endpoint'."
description="vLLM server integrated with Dynamo LLM."
)
parser.add_argument(
"--endpoint",
type=str,
default=DEFAULT_ENDPOINT,
help=f"Dynamo endpoint string in 'dyn://namespace.component.endpoint' format. Default: {DEFAULT_ENDPOINT}",
)
parser.add_argument(
"--model-path",
type=str,
default=DEFAULT_MODEL,
help=f"Path to disk model or HuggingFace model identifier to load. Default: {DEFAULT_MODEL}",
)
parser.add_argument(
"--model-name",
type=str,
default="",
help="Name to serve the model under. Defaults to deriving it from model path.",
)
parser.add_argument(
"--tensor-parallel-size",type=int,default=1,help="Number of GPUs to use."
)
parser.add_argument(
"--kv-block-size",type=int,default=16,help="Size of a KV cache block."
)
parser.add_argument(
"--context-length",
type=int,
default=None,
help="Max model context length. Defaults to models max, usually model_max_length from tokenizer_config.json. Reducing this reduces VRAM requirements.",
)
parser.add_argument(
"--migration-limit",
type=int,
default=0,
help="Maximum number of times a request may be migrated to a different engine worker. The number may be overridden by the engine.",
)
parser.add_argument(
"--extra-engine-args",
type=str,
default="",
help="Path to a JSON file containing additional keyword arguments to pass to the vLLM AsyncLLMEngine.",
)
args=parser.parse_args()
config=Config()
config.model_path=args.model_path
ifargs.model_name:
config.model_name=args.model_name
else:
# This becomes an `Option` on the Rust side
config.model_name=None
endpoint_str=args.endpoint.replace("dyn://","",1)
endpoint_parts=endpoint_str.split(".")
iflen(endpoint_parts)!=3:
logging.error(
f"Invalid endpoint format: '{args.endpoint}'. Expected 'dyn://namespace.component.endpoint' or 'namespace.component.endpoint'."
# The server shuts down gracefully (i.e., lets already-open TCP streams finish)
# after the lease is revoked.
handler=RequestHandler(engine_client)
ifengine_args.is_embedding:
awaitendpoint.serve_endpoint(handler.encode)
else:
awaitendpoint.serve_endpoint(handler.generate)
defcmd_line_args():
parser=argparse.ArgumentParser(
description="SGLang server integrated with Dynamo LLM."
)
parser.add_argument(
"--endpoint",
type=str,
default=DEFAULT_ENDPOINT,
help=f"Dynamo endpoint string in 'dyn://namespace.component.endpoint' format. Default: {DEFAULT_ENDPOINT}",
)
parser.add_argument(
"--model-path",
type=str,
default=DEFAULT_MODEL,
help=f"Path to disk model or HuggingFace model identifier to load. Default: {DEFAULT_MODEL}",
)
parser.add_argument(
"--model-name",
type=str,
default="",
help="Name to serve the model under. Defaults to deriving it from model path.",
)
parser.add_argument(
"--base-gpu-id",
type=int,
default=0,
help="The base GPU ID to start allocating GPUs from. Useful when running multiple instances on the same machine.",
)
parser.add_argument(
"--tensor-parallel-size",type=int,default=1,help="Number of GPUs to use."
)
parser.add_argument(
"--kv-block-size",type=int,default=16,help="Size of a KV cache block."
)
parser.add_argument(
"--context-length",
type=int,
default=None,
help="Max model context length. Defaults to models max, usually model_max_length from tokenizer_config.json. Reducing this reduces VRAM requirements.",
)
parser.add_argument(
"--nnodes",type=int,default=1,help="The number of machines SGLang will use"
)
parser.add_argument(
"--node-rank",
type=int,
default=0,
help="Unique number for each node. 0 for the leader.",
)
parser.add_argument(
"--dist-init-addr",
type=str,
default="",
help="Host address (e.g., `192.168.0.2:25000`) of the node with rank 0",
)
parser.add_argument(
"--extra-engine-args",
type=str,
default="",
help="Path to a JSON file containing additional keyword arguments to pass to the SGLang Engine.",
)
args=parser.parse_args()
config=Config()
config.model_path=args.model_path
ifargs.model_name:
config.model_name=args.model_name
else:
# This becomes an `Option` on the Rust side
config.model_name=None
endpoint_str=args.endpoint.replace("dyn://","",1)
endpoint_parts=endpoint_str.split(".")
iflen(endpoint_parts)!=3:
logging.error(
f"Invalid endpoint format: '{args.endpoint}'. Expected 'dyn://namespace.component.endpoint' or 'namespace.component.endpoint'."
description="TensorRT-LLM server integrated with Dynamo LLM."
)
parser.add_argument(
"--endpoint",
type=str,
default=DEFAULT_ENDPOINT,
help=f"Dynamo endpoint string in 'dyn://namespace.component.endpoint' format. Default: {DEFAULT_ENDPOINT}",
)
parser.add_argument(
"--model-path",
type=str,
default=DEFAULT_MODEL,
help=f"Path to disk model or HuggingFace model identifier to load. Default: {DEFAULT_MODEL}",
)
parser.add_argument(
"--model-name",
type=str,
default="",
help="Name to serve the model under. Defaults to deriving it from model path.",
)
parser.add_argument(
"--tensor-parallel-size",type=int,default=1,help="Number of GPUs to use."
)
# IMPORTANT: We should ideally not expose this to users. We should be able to
# query the block size from the TRTLLM engine.
parser.add_argument(
"--kv-block-size",type=int,default=32,help="Size of a KV cache block."
)
parser.add_argument(
"--context-length",
type=int,
default=None,
help="This argument is not used by TRTLLM. Please provide max_input_len, max_seq_len and max_output_len in yaml file and point --extra-engine-args to the yaml file.",
)
parser.add_argument(
"--extra-engine-args",
type=str,
default="",
help="Path to a YAML file containing additional keyword arguments to pass to the TRTLLM engine.",
)
parser.add_argument(
"--publish-events-and-metrics",
action="store_true",
help="Publish events and metrics to the dynamo components. Note: This is not supported when running in prefill disaggregation mode.",
help="Specifies the task for the engine. Can be specified multiple time for different tasks. Will raise an error if conflicting tasks are specified.",
)
parser.add_argument(
"--remote-prefill-endpoint",
type=str,
default=DEFAULT_PREFILL_ENDPOINT,
help=f"Endpoint(in 'dyn://namespace.component.endpoint' format) to send prefill requests to when running in decode disaggregation mode. Default: {DEFAULT_PREFILL_ENDPOINT}",
)
args=parser.parse_args()
# Validate arguments
ifargs.context_lengthisnotNone:
warnings.warn(
"--context-length is accepted for compatibility but will be ignored for TensorRT-LLM. Please provide max_input_len, max_seq_len and max_output_len in yaml file and point --extra-engine-args to the yaml file.",
# The server shuts down gracefully (i.e., lets already-open TCP streams finish)
# after the lease is revoked.
awaitendpoint.serve_endpoint(handler.generate)
defcmd_line_args():
parser=argparse.ArgumentParser(
description="vLLM server integrated with Dynamo LLM."
)
parser.add_argument(
"--endpoint",
type=str,
default=DEFAULT_ENDPOINT,
help=f"Dynamo endpoint string in 'dyn://namespace.component.endpoint' format. Default: {DEFAULT_ENDPOINT}",
)
parser.add_argument(
"--model-path",
type=str,
default=DEFAULT_MODEL,
help=f"Path to disk model or HuggingFace model identifier to load. Default: {DEFAULT_MODEL}",
)
parser.add_argument(
"--model-name",
type=str,
default="",
help="Name to serve the model under. Defaults to deriving it from model path.",
)
parser.add_argument(
"--tensor-parallel-size",type=int,default=1,help="Number of GPUs to use."
)
parser.add_argument(
"--kv-block-size",type=int,default=16,help="Size of a KV cache block."
)
parser.add_argument(
"--context-length",
type=int,
default=None,
help="Max model context length. Defaults to models max, usually model_max_length from tokenizer_config.json. Reducing this reduces VRAM requirements.",
)
parser.add_argument(
"--extra-engine-args",
type=str,
default="",
help="Path to a JSON file containing additional keyword arguments to pass to the vLLM AsyncLLMEngine.",
)
args=parser.parse_args()
config=Config()
config.model_path=args.model_path
ifargs.model_name:
config.model_name=args.model_name
else:
# This becomes an `Option` on the Rust side
config.model_name=None
endpoint_str=args.endpoint.replace("dyn://","",1)
endpoint_parts=endpoint_str.split(".")
iflen(endpoint_parts)!=3:
logging.error(
f"Invalid endpoint format: '{args.endpoint}'. Expected 'dyn://namespace.component.endpoint' or 'namespace.component.endpoint'."