"help":"Use SGLang's tokenizer. This will skip tokenization of the input and output and only v1/chat/completions will be available when using the dynamo frontend. Cannot be used with --custom-jinja-template.",
"help":"Use SGLang's tokenizer for pre and post processing. This bypasses Dynamo's preprocessor and only v1/chat/completions will be available through the Dynamo frontend. Cannot be used with --custom-jinja-template.",
help="Determines how requests are distributed from routers to workers. 'tcp' is fastest [nats|http|tcp]",
)
# Boolean opt-in flag: absent on the command line it stays False, present it
# becomes True. Per its help text, it selects vLLM's tokenizer for pre and
# post processing instead of Dynamo's preprocessor.
parser.add_argument(
    "--use-vllm-tokenizer",
    help=(
        "Use vLLM's tokenizer for pre and post processing. "
        "This bypasses Dynamo's preprocessor and only v1/chat/completions "
        "will be available through the Dynamo frontend."
    ),
    default=False,
    action="store_true",
)
# Presumably registers the config-dump options on this parser; the helper is
# defined outside this excerpt — TODO confirm.
add_config_dump_args(parser)
# Rebind `parser` to whatever AsyncEngineArgs.add_cli_args returns. Presumably
# that is the same parser extended with the engine's own CLI arguments —
# verify against the AsyncEngineArgs implementation.
parser=AsyncEngineArgs.add_cli_args(parser)
...
...
@@ -303,6 +312,7 @@ def parse_args() -> Config:
# Copy parsed command-line values onto the config object, one attribute per
# argument, keeping the same attribute name on both sides.
config.mm_prompt_template=args.mm_prompt_template
config.store_kv=args.store_kv
config.request_plane=args.request_plane
config.use_vllm_tokenizer=args.use_vllm_tokenizer
# Validate custom Jinja template file exists if provided