# the server will shut down gracefully (i.e., let already-open TCP streams finish)
# after the lease is revoked
awaitendpoint.serve_endpoint(handler.generate)
defcmd_line_args():
parser=argparse.ArgumentParser(
description="vLLM server integrated with Dynamo LLM."
)
parser.add_argument(
"--endpoint",
type=str,
default=DEFAULT_ENDPOINT,
help=f"Dynamo endpoint string in 'dyn://namespace.component.endpoint' format. Default: {DEFAULT_ENDPOINT}",
)
parser.add_argument(
"--model-path",
type=str,
default=DEFAULT_MODEL,
help=f"Path to disk model or HuggingFace model identifier to load. Default: {DEFAULT_MODEL}",
)
parser.add_argument(
"--model-name",
type=str,
default="",
help="Name to serve the model under. Defaults to deriving it from model path.",
)
parser.add_argument(
"--tensor-parallel-size",type=int,default=1,help="Number of GPUs to use."
)
parser.add_argument(
"--kv-block-size",type=int,default=16,help="Size of a KV cache block."
)
parser.add_argument(
"--context-length",
type=int,
default=None,
help="Max model context length. Defaults to models max, usually model_max_length from tokenizer_config.json. Reducing this reduces VRAM requirements.",
)
parser.add_argument(
"--extra-engine-args",
type=str,
default="",
help="Path to a JSON file containing additional keyword arguments to pass to the vLLM AsyncLLMEngine.",
)
args=parser.parse_args()
config=Config()
config.model_path=args.model_path
ifargs.model_name:
config.model_name=args.model_name
else:
# This becomes an `Option` on the Rust side
config.model_name=None
endpoint_str=args.endpoint.replace("dyn://","",1)
endpoint_parts=endpoint_str.split(".")
iflen(endpoint_parts)!=3:
logging.error(
f"Invalid endpoint format: '{args.endpoint}'. Expected 'dyn://namespace.component.endpoint' or 'namespace.component.endpoint'."