"description": "The real model name to pass to the provider, default is None. If backend is None, use name as the real model name."
},
{
"name": "provider",
"type": "string",
"required": false,
"description": "The provider of the model. If model is deployed in local, this is the inference type. If model is deployed in third-party service, this is platform name('proxy/<platform>')",
"defaultValue": "proxy/claude"
},
{
"name": "verbose",
"type": "boolean",
"required": false,
"description": "Show verbose output.",
"defaultValue": "False"
},
{
"name": "concurrency",
"type": "integer",
"required": false,
"description": "Model concurrency limit",
"defaultValue": "100"
},
{
"name": "prompt_template",
"type": "string",
"required": false,
"description": "Prompt template. If None, the prompt template is automatically determined from model. Just for local deployment."
},
{
"name": "context_length",
"type": "integer",
"required": false,
"description": "The context length of the OpenAI API. If None, it is determined by the model."
},
{
"name": "reasoning_model",
"type": "boolean",
"required": false,
"description": "Whether the model is a reasoning model. If None, it is automatically determined from model."
"description": "The real model name to pass to the provider, default is None. If backend is None, use name as the real model name."
},
{
"name": "provider",
"type": "string",
"required": false,
"description": "The provider of the model. If model is deployed in local, this is the inference type. If model is deployed in third-party service, this is platform name('proxy/<platform>')",
"defaultValue": "proxy/deepseek"
},
{
"name": "verbose",
"type": "boolean",
"required": false,
"description": "Show verbose output.",
"defaultValue": "False"
},
{
"name": "concurrency",
"type": "integer",
"required": false,
"description": "Model concurrency limit",
"defaultValue": "100"
},
{
"name": "prompt_template",
"type": "string",
"required": false,
"description": "Prompt template. If None, the prompt template is automatically determined from model. Just for local deployment."
},
{
"name": "context_length",
"type": "integer",
"required": false,
"description": "The context length of the OpenAI API. If None, it is determined by the model."
},
{
"name": "reasoning_model",
"type": "boolean",
"required": false,
"description": "Whether the model is a reasoning model. If None, it is automatically determined from model."
},
{
"name": "api_base",
"type": "string",
"required": false,
"description": "The base url of the DeepSeek API.",
"description": "The real model name to pass to the provider, default is None. If backend is None, use name as the real model name."
},
{
"name": "provider",
"type": "string",
"required": false,
"description": "The provider of the model. If model is deployed in local, this is the inference type. If model is deployed in third-party service, this is platform name('proxy/<platform>')",
"defaultValue": "proxy/gemini"
},
{
"name": "verbose",
"type": "boolean",
"required": false,
"description": "Show verbose output.",
"defaultValue": "False"
},
{
"name": "concurrency",
"type": "integer",
"required": false,
"description": "Model concurrency limit",
"defaultValue": "100"
},
{
"name": "prompt_template",
"type": "string",
"required": false,
"description": "Prompt template. If None, the prompt template is automatically determined from model. Just for local deployment."
},
{
"name": "context_length",
"type": "integer",
"required": false,
"description": "The context length of the OpenAI API. If None, it is determined by the model."
},
{
"name": "reasoning_model",
"type": "boolean",
"required": false,
"description": "Whether the model is a reasoning model. If None, it is automatically determined from model."
},
{
"name": "api_base",
"type": "string",
"required": false,
"description": "The base url of the gemini API.",
"defaultValue": "${env:GEMINI_PROXY_API_BASE}"
},
{
"name": "api_key",
"type": "string",
"required": false,
"description": "The API key of the gemini API.",
"defaultValue": "${env:GEMINI_PROXY_API_KEY}"
},
{
"name": "api_type",
"type": "string",
"required": false,
"description": "The type of the OpenAI API, if you use Azure, it can be: azure"
},
{
"name": "api_version",
"type": "string",
"required": false,
"description": "The version of the OpenAI API."
},
{
"name": "http_proxy",
"type": "string",
"required": false,
"description": "The http or https proxy to use openai"
"description": "The real model name to pass to the provider, default is None. If backend is None, use name as the real model name."
},
{
"name": "provider",
"type": "string",
"required": false,
"description": "The provider of the model. If model is deployed in local, this is the inference type. If model is deployed in third-party service, this is platform name('proxy/<platform>')",
"defaultValue": "proxy/gitee"
},
{
"name": "verbose",
"type": "boolean",
"required": false,
"description": "Show verbose output.",
"defaultValue": "False"
},
{
"name": "concurrency",
"type": "integer",
"required": false,
"description": "Model concurrency limit",
"defaultValue": "100"
},
{
"name": "prompt_template",
"type": "string",
"required": false,
"description": "Prompt template. If None, the prompt template is automatically determined from model. Just for local deployment."
},
{
"name": "context_length",
"type": "integer",
"required": false,
"description": "The context length of the OpenAI API. If None, it is determined by the model."
},
{
"name": "reasoning_model",
"type": "boolean",
"required": false,
"description": "Whether the model is a reasoning model. If None, it is automatically determined from model."
import { ConfigDetail } from "@site/src/components/mdx/ConfigDetail";
<ConfigDetail config={{
"name": "HFLLMDeployModelParameters",
"description": "Local deploy model parameters.",
"documentationUrl": "",
"parameters": [
{
"name": "name",
"type": "string",
"required": true,
"description": "The name of the model."
},
{
"name": "path",
"type": "string",
"required": false,
"description": "The path of the model, if you want to deploy a local model."
},
{
"name": "backend",
"type": "string",
"required": false,
"description": "The real model name to pass to the provider, default is None. If backend is None, use name as the real model name."
},
{
"name": "device",
"type": "string",
"required": false,
"description": "Device to run model. If None, the device is automatically determined"
},
{
"name": "provider",
"type": "string",
"required": false,
"description": "The provider of the model. If model is deployed in local, this is the inference type. If model is deployed in third-party service, this is platform name('proxy/<platform>')",
"defaultValue": "hf"
},
{
"name": "verbose",
"type": "boolean",
"required": false,
"description": "Show verbose output.",
"defaultValue": "False"
},
{
"name": "concurrency",
"type": "integer",
"required": false,
"description": "Model concurrency limit",
"defaultValue": "5"
},
{
"name": "prompt_template",
"type": "string",
"required": false,
"description": "Prompt template. If None, the prompt template is automatically determined from model. Just for local deployment."
},
{
"name": "context_length",
"type": "integer",
"required": false,
"description": "The context length of the model. If None, it is automatically determined from model."
},
{
"name": "reasoning_model",
"type": "boolean",
"required": false,
"description": "Whether the model is a reasoning model. If None, it is automatically determined from model."
"description": "Whether to use low CPU memory usage mode. It can reduce the memory when loading the model, if you load your model with quantization, it will be True by default. You must install `accelerate` to make it work."
},
{
"name": "num_gpus",
"type": "integer",
"required": false,
"description": "The number of gpus you expect to use, if it is empty, use all of them as much as possible"
},
{
"name": "max_gpu_memory",
"type": "string",
"required": false,
"description": "The maximum memory limit of each GPU, only valid in multi-GPU configuration, eg: 10GiB, 24GiB"
},
{
"name": "torch_dtype",
"type": "string",
"required": false,
"description": "The dtype of the model, default is None.",
"validValues": [
"auto",
"float16",
"bfloat16",
"float",
"float32"
]
},
{
"name": "attn_implementation",
"type": "string",
"required": false,
"description": "The attention implementation, only valid in multi-GPU configuration",
"description": "The real model name to pass to the provider, default is None. If backend is None, use name as the real model name."
},
{
"name": "device",
"type": "string",
"required": false,
"description": "Device to run model. If None, the device is automatically determined"
},
{
"name": "provider",
"type": "string",
"required": false,
"description": "The provider of the model. If model is deployed in local, this is the inference type. If model is deployed in third-party service, this is platform name('proxy/<platform>')",
"defaultValue": "llama.cpp.server"
},
{
"name": "verbose",
"type": "boolean",
"required": false,
"description": "Show verbose output.",
"defaultValue": "False"
},
{
"name": "concurrency",
"type": "integer",
"required": false,
"description": "Model concurrency limit",
"defaultValue": "20"
},
{
"name": "prompt_template",
"type": "string",
"required": false,
"description": "Prompt template. If None, the prompt template is automatically determined from model. Just for local deployment."
},
{
"name": "context_length",
"type": "integer",
"required": false,
"description": "The context length of the model. If None, it is automatically determined from model."
},
{
"name": "reasoning_model",
"type": "boolean",
"required": false,
"description": "Whether the model is a reasoning model. If None, it is automatically determined from model."
},
{
"name": "model_hf_repo",
"type": "string",
"required": false,
"description": "Hugging Face repository for model download"
},
{
"name": "model_hf_file",
"type": "string",
"required": false,
"description": "Model file name in the Hugging Face repository"
},
{
"name": "server_bin_path",
"type": "string",
"required": false,
"description": "Path to the server binary executable"
},
{
"name": "server_host",
"type": "string",
"required": false,
"description": "Host address to bind the server",
"defaultValue": "127.0.0.1"
},
{
"name": "server_port",
"type": "integer",
"required": false,
"description": "Port to bind the server. 0 for random available port",
"defaultValue": "0"
},
{
"name": "temperature",
"type": "number",
"required": false,
"description": "Sampling temperature for text generation",
"description": "The path of the model, if you want to deploy a local model."
},
{
"name": "backend",
"type": "string",
"required": false,
"description": "The real model name to pass to the provider, default is None. If backend is None, use name as the real model name."
},
{
"name": "device",
"type": "string",
"required": false,
"description": "Device to run model. If None, the device is automatically determined"
},
{
"name": "provider",
"type": "string",
"required": false,
"description": "The provider of the model. If model is deployed in local, this is the inference type. If model is deployed in third-party service, this is platform name('proxy/<platform>')",
"defaultValue": "llama.cpp"
},
{
"name": "verbose",
"type": "boolean",
"required": false,
"description": "Show verbose output.",
"defaultValue": "False"
},
{
"name": "concurrency",
"type": "integer",
"required": false,
"description": "Model concurrency limit",
"defaultValue": "5"
},
{
"name": "prompt_template",
"type": "string",
"required": false,
"description": "Prompt template. If None, the prompt template is automatically determined from model. Just for local deployment."
},
{
"name": "context_length",
"type": "integer",
"required": false,
"description": "The context length of the model. If None, it is automatically determined from model."
},
{
"name": "reasoning_model",
"type": "boolean",
"required": false,
"description": "Whether the model is a reasoning model. If None, it is automatically determined from model."
},
{
"name": "seed",
"type": "integer",
"required": false,
"description": "Random seed for llama-cpp models. -1 for random",
"defaultValue": "-1"
},
{
"name": "n_threads",
"type": "integer",
"required": false,
"description": "Number of threads to use. If None, the number of threads is automatically determined"
},
{
"name": "n_batch",
"type": "integer",
"required": false,
"description": "Maximum number of prompt tokens to batch together when calling llama_eval",
"defaultValue": "512"
},
{
"name": "n_gpu_layers",
"type": "integer",
"required": false,
"description": "Number of layers to offload to the GPU, Set this to 1000000000 to offload all layers to the GPU.",
"defaultValue": "1000000000"
},
{
"name": "n_gqa",
"type": "integer",
"required": false,
"description": "Grouped-query attention. Must be 8 for llama-2 70b."
},
{
"name": "rms_norm_eps",
"type": "number",
"required": false,
"description": "5e-6 is a good value for llama-2 models.",
"defaultValue": "5e-06"
},
{
"name": "cache_capacity",
"type": "string",
"required": false,
"description": "Maximum cache capacity. Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed. "
},
{
"name": "prefer_cpu",
"type": "boolean",
"required": false,
"description": "If a GPU is available, it will be preferred by default, unless prefer_cpu=False is configured.",
"description": "The real model name to pass to the provider, default is None. If backend is None, use name as the real model name."
},
{
"name": "provider",
"type": "string",
"required": false,
"description": "The provider of the model. If model is deployed in local, this is the inference type. If model is deployed in third-party service, this is platform name('proxy/<platform>')",
"defaultValue": "proxy/moonshot"
},
{
"name": "verbose",
"type": "boolean",
"required": false,
"description": "Show verbose output.",
"defaultValue": "False"
},
{
"name": "concurrency",
"type": "integer",
"required": false,
"description": "Model concurrency limit",
"defaultValue": "100"
},
{
"name": "prompt_template",
"type": "string",
"required": false,
"description": "Prompt template. If None, the prompt template is automatically determined from model. Just for local deployment."
},
{
"name": "context_length",
"type": "integer",
"required": false,
"description": "The context length of the OpenAI API. If None, it is determined by the model."
},
{
"name": "reasoning_model",
"type": "boolean",
"required": false,
"description": "Whether the model is a reasoning model. If None, it is automatically determined from model."
},
{
"name": "api_base",
"type": "string",
"required": false,
"description": "The base url of the Moonshot API.",
import { ConfigDetail } from "@site/src/components/mdx/ConfigDetail";
<ConfigDetail config={{
"name": "OllamaDeployModelParameters",
"description": "Ollama proxy LLM configuration.",
"documentationUrl": "https://ollama.com/library",
"parameters": [
{
"name": "name",
"type": "string",
"required": true,
"description": "The name of the model."
},
{
"name": "backend",
"type": "string",
"required": false,
"description": "The real model name to pass to the provider, default is None. If backend is None, use name as the real model name."
},
{
"name": "provider",
"type": "string",
"required": false,
"description": "The provider of the model. If model is deployed in local, this is the inference type. If model is deployed in third-party service, this is platform name('proxy/<platform>')",
"defaultValue": "proxy/ollama"
},
{
"name": "verbose",
"type": "boolean",
"required": false,
"description": "Show verbose output.",
"defaultValue": "False"
},
{
"name": "concurrency",
"type": "integer",
"required": false,
"description": "Model concurrency limit",
"defaultValue": "5"
},
{
"name": "prompt_template",
"type": "string",
"required": false,
"description": "Prompt template. If None, the prompt template is automatically determined from model. Just for local deployment."
},
{
"name": "context_length",
"type": "integer",
"required": false,
"description": "The context length of the model. If None, it is automatically determined from model."
},
{
"name": "reasoning_model",
"type": "boolean",
"required": false,
"description": "Whether the model is a reasoning model. If None, it is automatically determined from model."
description: "Bits and bytes quantization 4 bits parameters."
---
import { ConfigDetail } from "@site/src/components/mdx/ConfigDetail";
<ConfigDetail config={{
"name": "BitsandbytesQuantization4bits",
"description": "Bits and bytes quantization 4 bits parameters.",
"documentationUrl": "",
"parameters": [
{
"name": "load_in_8bits",
"type": "boolean",
"required": false,
"description": "Whether to load the model in 8 bits(LLM.int8() algorithm), default is False.",
"defaultValue": "False"
},
{
"name": "load_in_4bits",
"type": "boolean",
"required": false,
"description": "Whether to load the model in 4 bits.",
"defaultValue": "True"
},
{
"name": "bnb_4bit_compute_dtype",
"type": "string",
"required": false,
"description": "To speedup computation, you can change the data type from float32 (the default value) to bfloat16",
"validValues": [
"bfloat16",
"float16",
"float32"
]
},
{
"name": "bnb_4bit_quant_type",
"type": "string",
"required": false,
"description": "Quantization datatypes, `fp4` (four bit float) and `nf4` (normal four bit float), only valid when load_4bit=True",
"defaultValue": "nf4",
"validValues": [
"nf4",
"fp4"
]
},
{
"name": "bnb_4bit_use_double_quant",
"type": "boolean",
"required": false,
"description": "Nested quantization is a technique that can save additional memory at no additional performance cost. This feature performs a second quantization of the already quantized weights to save an additional 0.4 bits/parameter. ",
description: "Bits and bytes quantization 8 bits parameters."
---
import { ConfigDetail } from "@site/src/components/mdx/ConfigDetail";
<ConfigDetail config={{
"name": "BitsandbytesQuantization8bits",
"description": "Bits and bytes quantization 8 bits parameters.",
"documentationUrl": "",
"parameters": [
{
"name": "load_in_8bits",
"type": "boolean",
"required": false,
"description": "Whether to load the model in 8 bits(LLM.int8() algorithm).",
"defaultValue": "True"
},
{
"name": "load_in_4bits",
"type": "boolean",
"required": false,
"description": "Whether to load the model in 4 bits, default is False.",
"defaultValue": "False"
},
{
"name": "llm_int8_enable_fp32_cpu_offload",
"type": "boolean",
"required": false,
"description": "8-bit models can offload weights between the CPU and GPU to support fitting very large models into memory. The weights dispatched to the CPU are actually stored in float32, and aren’t converted to 8-bit. ",
"defaultValue": "False"
},
{
"name": "llm_int8_threshold",
"type": "number",
"required": false,
"description": "An “outlier” is a hidden state value greater than a certain threshold, and these values are computed in fp16. While the values are usually normally distributed ([-3.5, 3.5]), this distribution can be very different for large models ([-60, 6] or [6, 60]). 8-bit quantization works well for values ~5, but beyond that, there is a significant performance penalty. A good default threshold value is 6, but a lower threshold may be needed for more unstable models (small models or finetuning).",
"defaultValue": "6.0"
},
{
"name": "llm_int8_skip_modules",
"type": "string",
"required": false,
"description": "An explicit list of the modules that we do not want to convert in 8-bit. This is useful for models such as Jukebox that has several heads in different places and not necessarily at the last position. For example for `CausalLM` models, the last `lm_head` is kept in its original `dtype`",
"description": "The real model name to pass to the provider, default is None. If backend is None, use name as the real model name."
},
{
"name": "provider",
"type": "string",
"required": false,
"description": "The provider of the model. If model is deployed in local, this is the inference type. If model is deployed in third-party service, this is platform name('proxy/<platform>')",
"defaultValue": "proxy/siliconflow"
},
{
"name": "verbose",
"type": "boolean",
"required": false,
"description": "Show verbose output.",
"defaultValue": "False"
},
{
"name": "concurrency",
"type": "integer",
"required": false,
"description": "Model concurrency limit",
"defaultValue": "100"
},
{
"name": "prompt_template",
"type": "string",
"required": false,
"description": "Prompt template. If None, the prompt template is automatically determined from model. Just for local deployment."
},
{
"name": "context_length",
"type": "integer",
"required": false,
"description": "The context length of the OpenAI API. If None, it is determined by the model."
},
{
"name": "reasoning_model",
"type": "boolean",
"required": false,
"description": "Whether the model is a reasoning model. If None, it is automatically determined from model."
},
{
"name": "api_base",
"type": "string",
"required": false,
"description": "The base url of the SiliconFlow API.",
"description": "The real model name to pass to the provider, default is None. If backend is None, use name as the real model name."
},
{
"name": "provider",
"type": "string",
"required": false,
"description": "The provider of the model. If model is deployed in local, this is the inference type. If model is deployed in third-party service, this is platform name('proxy/<platform>')",
"defaultValue": "proxy/spark"
},
{
"name": "verbose",
"type": "boolean",
"required": false,
"description": "Show verbose output.",
"defaultValue": "False"
},
{
"name": "concurrency",
"type": "integer",
"required": false,
"description": "Model concurrency limit",
"defaultValue": "100"
},
{
"name": "prompt_template",
"type": "string",
"required": false,
"description": "Prompt template. If None, the prompt template is automatically determined from model. Just for local deployment."
},
{
"name": "context_length",
"type": "integer",
"required": false,
"description": "The context length of the OpenAI API. If None, it is determined by the model."
},
{
"name": "reasoning_model",
"type": "boolean",
"required": false,
"description": "Whether the model is a reasoning model. If None, it is automatically determined from model."
"description": "The real model name to pass to the provider, default is None. If backend is None, use name as the real model name."
},
{
"name": "provider",
"type": "string",
"required": false,
"description": "The provider of the model. If model is deployed in local, this is the inference type. If model is deployed in third-party service, this is platform name('proxy/<platform>')",
"defaultValue": "proxy/tongyi"
},
{
"name": "verbose",
"type": "boolean",
"required": false,
"description": "Show verbose output.",
"defaultValue": "False"
},
{
"name": "concurrency",
"type": "integer",
"required": false,
"description": "Model concurrency limit",
"defaultValue": "100"
},
{
"name": "prompt_template",
"type": "string",
"required": false,
"description": "Prompt template. If None, the prompt template is automatically determined from model. Just for local deployment."
},
{
"name": "context_length",
"type": "integer",
"required": false,
"description": "The context length of the OpenAI API. If None, it is determined by the model."
},
{
"name": "reasoning_model",
"type": "boolean",
"required": false,
"description": "Whether the model is a reasoning model. If None, it is automatically determined from model."
import { ConfigDetail } from "@site/src/components/mdx/ConfigDetail";
<ConfigDetail config={{
"name": "VLLMDeployModelParameters",
"description": "Local deploy model parameters.",
"documentationUrl": "",
"parameters": [
{
"name": "name",
"type": "string",
"required": true,
"description": "The name of the model."
},
{
"name": "path",
"type": "string",
"required": false,
"description": "The path of the model, if you want to deploy a local model."
},
{
"name": "backend",
"type": "string",
"required": false,
"description": "The real model name to pass to the provider, default is None. If backend is None, use name as the real model name."
},
{
"name": "device",
"type": "string",
"required": false,
"description": "Device to run model. If None, the device is automatically determined",
"defaultValue": "auto"
},
{
"name": "provider",
"type": "string",
"required": false,
"description": "The provider of the model. If model is deployed in local, this is the inference type. If model is deployed in third-party service, this is platform name('proxy/<platform>')",
"defaultValue": "vllm"
},
{
"name": "verbose",
"type": "boolean",
"required": false,
"description": "Show verbose output.",
"defaultValue": "False"
},
{
"name": "concurrency",
"type": "integer",
"required": false,
"description": "Model concurrency limit",
"defaultValue": "100"
},
{
"name": "prompt_template",
"type": "string",
"required": false,
"description": "Prompt template. If None, the prompt template is automatically determined from model. Just for local deployment."
},
{
"name": "context_length",
"type": "integer",
"required": false,
"description": "The context length of the model. If None, it is automatically determined from model."
},
{
"name": "reasoning_model",
"type": "boolean",
"required": false,
"description": "Whether the model is a reasoning model. If None, it is automatically determined from model."
},
{
"name": "trust_remote_code",
"type": "boolean",
"required": false,
"description": "Trust remote code or not.",
"defaultValue": "True"
},
{
"name": "download_dir",
"type": "string",
"required": false,
"description": "Directory to download and load the weights, default to the default cache dir of huggingface."
},
{
"name": "load_format",
"type": "string",
"required": false,
"description": "The format of the model weights to load.\n\n* \"auto\" will try to load the weights in the safetensors format and fall back to the pytorch bin format if safetensors format is not available.\n* \"pt\" will load the weights in the pytorch bin format.\n* \"safetensors\" will load the weights in the safetensors format.\n* \"npcache\" will load the weights in pytorch format and store a numpy cache to speed up the loading.\n* \"dummy\" will initialize the weights with random values, which is mainly for profiling.\n* \"tensorizer\" will load the weights using tensorizer from CoreWeave. See the Tensorize vLLM Model script in the Examples section for more information.\n* \"runai_streamer\" will load the Safetensors weights using Run:aiModel Streamer \n* \"bitsandbytes\" will load the weights using bitsandbytes quantization.\n",
"defaultValue": "auto",
"validValues": [
"auto",
"pt",
"safetensors",
"npcache",
"dummy",
"tensorizer",
"runai_streamer",
"bitsandbytes",
"sharded_state",
"gguf",
"mistral"
]
},
{
"name": "config_format",
"type": "string",
"required": false,
"description": "The format of the model config to load.\n\n* \"auto\" will try to load the config in hf format if available else it will try to load in mistral format ",
"defaultValue": "auto",
"validValues": [
"auto",
"hf",
"mistral"
]
},
{
"name": "dtype",
"type": "string",
"required": false,
"description": "Data type for model weights and activations.\n\n* \"auto\" will use FP16 precision for FP32 and FP16 models, and BF16 precision for BF16 models.\n* \"half\" for FP16. Recommended for AWQ quantization.\n* \"float16\" is the same as \"half\".\n* \"bfloat16\" for a balance between precision and range.\n* \"float\" is shorthand for FP32 precision.\n* \"float32\" for FP32 precision.",
"defaultValue": "auto",
"validValues": [
"auto",
"half",
"float16",
"bfloat16",
"float",
"float32"
]
},
{
"name": "kv_cache_dtype",
"type": "string",
"required": false,
"description": "Data type for kv cache storage. If \"auto\", will use model data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ROCm (AMD GPU) supports fp8 (=fp8_e4m3)",
"defaultValue": "auto",
"validValues": [
"auto",
"fp8",
"fp8_e5m2",
"fp8_e4m3"
]
},
{
"name": "seed",
"type": "integer",
"required": false,
"description": "Random seed for operations.",
"defaultValue": "0"
},
{
"name": "max_model_len",
"type": "integer",
"required": false,
"description": "Model context length. If unspecified, will be automatically derived from the model config."
},
{
"name": "distributed_executor_backend",
"type": "string",
"required": false,
"description": "Backend to use for distributed model workers, either \"ray\" or \"mp\" (multiprocessing). If the product of pipeline_parallel_size and tensor_parallel_size is less than or equal to the number of GPUs available, \"mp\" will be used to keep processing on a single host. Otherwise, this will default to \"ray\" if Ray is installed and fail otherwise. Note that tpu only supports Ray for distributed inference.",
"validValues": [
"ray",
"mp",
"uni",
"external_launcher"
]
},
{
"name": "pipeline_parallel_size",
"type": "integer",
"required": false,
"description": "Number of pipeline stages.",
"defaultValue": "1"
},
{
"name": "tensor_parallel_size",
"type": "integer",
"required": false,
"description": "Number of tensor parallel replicas.",
"defaultValue": "1"
},
{
"name": "max_parallel_loading_workers",
"type": "integer",
"required": false,
"description": "Load model sequentially in multiple batches, to avoid RAM OOM when using tensor parallel and large models."
},
{
"name": "block_size",
"type": "integer",
"required": false,
"description": "Token block size for contiguous chunks of tokens. This is ignored on neuron devices and set to ``--max-model-len``. On CUDA devices, only block sizes up to 32 are supported. On HPU devices, block size defaults to 128.",
"description": "CPU swap space size (GiB) per GPU.",
"defaultValue": "4"
},
{
"name": "cpu_offload_gb",
"type": "number",
"required": false,
"description": "The space in GiB to offload to CPU, per GPU. Default is 0, which means no offloading. Intuitively, this argument can be seen as a virtual way to increase the GPU memory size. For example, if you have one 24 GB GPU and set this to 10, virtually you can think of it as a 34 GB GPU. Then you can load a 13B model with BF16 weight, which requires at least 26GB GPU memory. Note that this requires fast CPU-GPU interconnect, as part of the model is loaded from CPU memory to GPU memory on the fly in each model forward pass.",
"defaultValue": "0"
},
{
"name": "gpu_memory_utilization",
"type": "number",
"required": false,
"description": "The fraction of GPU memory to be used for the model executor, which can range from 0 to 1. For example, a value of 0.5 would imply 50%% GPU memory utilization. If unspecified, will use the default value of 0.9. This is a per-instance limit, and only applies to the current vLLM instance.It does not matter if you have another vLLM instance running on the same GPU. For example, if you have two vLLM instances running on the same GPU, you can set the GPU memory utilization to 0.5 for each instance.",
"defaultValue": "0.9"
},
{
"name": "max_num_batched_tokens",
"type": "integer",
"required": false,
"description": "Maximum number of batched tokens per iteration."
},
{
"name": "max_num_seqs",
"type": "integer",
"required": false,
"description": "Maximum number of sequences per iteration."
},
{
"name": "max_logprobs",
"type": "integer",
"required": false,
"description": "Max number of log probs to return logprobs is specified in SamplingParams.",
"defaultValue": "20"
},
{
"name": "revision",
"type": "string",
"required": false,
"description": "The specific model version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version."
},
{
"name": "code_revision",
"type": "string",
"required": false,
"description": "The specific revision to use for the model code on Hugging Face Hub. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version."
},
{
"name": "tokenizer_revision",
"type": "string",
"required": false,
"description": "Revision of the huggingface tokenizer to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version."
},
{
"name": "tokenizer_mode",
"type": "string",
"required": false,
"description": "The tokenizer mode.\n\n* \"auto\" will use the fast tokenizer if available.\n* \"slow\" will always use the slow tokenizer. \n* \"mistral\" will always use the `mistral_common` tokenizer.",
"defaultValue": "auto",
"validValues": [
"auto",
"slow",
"mistral"
]
},
{
"name": "quantization",
"type": "string",
"required": false,
"description": "Method used to quantize the weights. If None, we first check the `quantization_config` attribute in the model config file. If that is None, we assume the model weights are not quantized and use `dtype` to determine the data type of the weights.",
"validValues": [
"aqlm",
"awq",
"deepspeedfp",
"tpu_int8",
"fp8",
"ptpc_fp8",
"fbgemm_fp8",
"modelopt",
"marlin",
"gguf",
"gptq_marlin_24",
"gptq_marlin",
"awq_marlin",
"gptq",
"compressed-tensors",
"bitsandbytes",
"qqq",
"hqq",
"experts_int8",
"neuron_quant",
"ipex",
"quark",
"moe_wna16"
]
},
{
"name": "max_seq_len_to_capture",
"type": "integer",
"required": false,
"description": "Maximum sequence length covered by CUDA graphs. When a sequence has context length larger than this, we fall back to eager mode. Additionally for encoder-decoder models, if the sequence length of the encoder input is larger than this, we fall back to the eager mode.",
"defaultValue": "8192"
},
{
"name": "worker_cls",
"type": "string",
"required": false,
"description": "The worker class to use for distributed execution.",
"defaultValue": "auto"
},
{
"name": "extras",
"type": "object",
"required": false,
"description": "Extra parameters, it will be passed to the vllm engine."
"description": "The real model name to pass to the provider, default is None. If backend is None, use name as the real model name."
},
{
"name": "provider",
"type": "string",
"required": false,
"description": "The provider of the model. If model is deployed in local, this is the inference type. If model is deployed in third-party service, this is platform name('proxy/<platform>')",
"defaultValue": "proxy/volcengine"
},
{
"name": "verbose",
"type": "boolean",
"required": false,
"description": "Show verbose output.",
"defaultValue": "False"
},
{
"name": "concurrency",
"type": "integer",
"required": false,
"description": "Model concurrency limit",
"defaultValue": "100"
},
{
"name": "prompt_template",
"type": "string",
"required": false,
"description": "Prompt template. If None, the prompt template is automatically determined from model. Just for local deployment."
},
{
"name": "context_length",
"type": "integer",
"required": false,
"description": "The context length of the OpenAI API. If None, it is determined by the model."
},
{
"name": "reasoning_model",
"type": "boolean",
"required": false,
"description": "Whether the model is a reasoning model. If None, it is automatically determined from model."
},
{
"name": "api_base",
"type": "string",
"required": false,
"description": "The base url of the Volcengine API.",
"description": "The real model name to pass to the provider, default is None. If backend is None, use name as the real model name."
},
{
"name": "provider",
"type": "string",
"required": false,
"description": "The provider of the model. If model is deployed in local, this is the inference type. If model is deployed in third-party service, this is platform name('proxy/<platform>')",
"defaultValue": "proxy/wenxin"
},
{
"name": "verbose",
"type": "boolean",
"required": false,
"description": "Show verbose output.",
"defaultValue": "False"
},
{
"name": "concurrency",
"type": "integer",
"required": false,
"description": "Model concurrency limit",
"defaultValue": "100"
},
{
"name": "prompt_template",
"type": "string",
"required": false,
"description": "Prompt template. If None, the prompt template is automatically determined from model. Just for local deployment."
},
{
"name": "context_length",
"type": "integer",
"required": false,
"description": "The context length of the OpenAI API. If None, it is determined by the model."
},
{
"name": "reasoning_model",
"type": "boolean",
"required": false,
"description": "Whether the model is a reasoning model. If None, it is automatically determined from model."
},
{
"name": "api_key",
"type": "string",
"required": false,
"description": "The API key of the Wenxin API.",
"defaultValue": "${env:WEN_XIN_API_KEY}"
},
{
"name": "api_secret",
"type": "string",
"required": false,
"description": "The API secret key of the Wenxin API.",
"description": "The real model name to pass to the provider, default is None. If backend is None, use name as the real model name."
},
{
"name": "provider",
"type": "string",
"required": false,
"description": "The provider of the model. If model is deployed in local, this is the inference type. If model is deployed in third-party service, this is platform name('proxy/<platform>')",
"defaultValue": "proxy/yi"
},
{
"name": "verbose",
"type": "boolean",
"required": false,
"description": "Show verbose output.",
"defaultValue": "False"
},
{
"name": "concurrency",
"type": "integer",
"required": false,
"description": "Model concurrency limit",
"defaultValue": "100"
},
{
"name": "prompt_template",
"type": "string",
"required": false,
"description": "Prompt template. If None, the prompt template is automatically determined from model. Just for local deployment."
},
{
"name": "context_length",
"type": "integer",
"required": false,
"description": "The context length of the OpenAI API. If None, it is determined by the model."
},
{
"name": "reasoning_model",
"type": "boolean",
"required": false,
"description": "Whether the model is a reasoning model. If None, it is automatically determined from model."