"description": "The real model name to pass to the provider, default is None. If backend is None, use name as the real model name."
},
{
"name": "provider",
"type": "string",
"required": false,
"description": "The provider of the model. If model is deployed in local, this is the inference type. If model is deployed in third-party service, this is platform name('proxy/<platform>')",
"defaultValue": "proxy/claude"
},
{
"name": "verbose",
"type": "boolean",
"required": false,
"description": "Show verbose output.",
"defaultValue": "False"
},
{
"name": "concurrency",
"type": "integer",
"required": false,
"description": "Model concurrency limit",
"defaultValue": "100"
},
{
"name": "prompt_template",
"type": "string",
"required": false,
"description": "Prompt template. If None, the prompt template is automatically determined from model. Just for local deployment."
},
{
"name": "context_length",
"type": "integer",
"required": false,
"description": "The context length of the OpenAI API. If None, it is determined by the model."
},
{
"name": "reasoning_model",
"type": "boolean",
"required": false,
"description": "Whether the model is a reasoning model. If None, it is automatically determined from model."
"description": "The real model name to pass to the provider, default is None. If backend is None, use name as the real model name."
},
{
"name": "provider",
"type": "string",
"required": false,
"description": "The provider of the model. If model is deployed in local, this is the inference type. If model is deployed in third-party service, this is platform name('proxy/<platform>')",
"defaultValue": "proxy/deepseek"
},
{
"name": "verbose",
"type": "boolean",
"required": false,
"description": "Show verbose output.",
"defaultValue": "False"
},
{
"name": "concurrency",
"type": "integer",
"required": false,
"description": "Model concurrency limit",
"defaultValue": "100"
},
{
"name": "prompt_template",
"type": "string",
"required": false,
"description": "Prompt template. If None, the prompt template is automatically determined from model. Just for local deployment."
},
{
"name": "context_length",
"type": "integer",
"required": false,
"description": "The context length of the OpenAI API. If None, it is determined by the model."
},
{
"name": "reasoning_model",
"type": "boolean",
"required": false,
"description": "Whether the model is a reasoning model. If None, it is automatically determined from model."
},
{
"name": "api_base",
"type": "string",
"required": false,
"description": "The base url of the DeepSeek API.",
"description": "The real model name to pass to the provider, default is None. If backend is None, use name as the real model name."
},
{
"name": "provider",
"type": "string",
"required": false,
"description": "The provider of the model. If model is deployed in local, this is the inference type. If model is deployed in third-party service, this is platform name('proxy/<platform>')",
"defaultValue": "proxy/gemini"
},
{
"name": "verbose",
"type": "boolean",
"required": false,
"description": "Show verbose output.",
"defaultValue": "False"
},
{
"name": "concurrency",
"type": "integer",
"required": false,
"description": "Model concurrency limit",
"defaultValue": "100"
},
{
"name": "prompt_template",
"type": "string",
"required": false,
"description": "Prompt template. If None, the prompt template is automatically determined from model. Just for local deployment."
},
{
"name": "context_length",
"type": "integer",
"required": false,
"description": "The context length of the OpenAI API. If None, it is determined by the model."
},
{
"name": "reasoning_model",
"type": "boolean",
"required": false,
"description": "Whether the model is a reasoning model. If None, it is automatically determined from model."
},
{
"name": "api_base",
"type": "string",
"required": false,
"description": "The base url of the gemini API.",
"defaultValue": "${env:GEMINI_PROXY_API_BASE}"
},
{
"name": "api_key",
"type": "string",
"required": false,
"description": "The API key of the gemini API.",
"defaultValue": "${env:GEMINI_PROXY_API_KEY}"
},
{
"name": "api_type",
"type": "string",
"required": false,
"description": "The type of the OpenAI API, if you use Azure, it can be: azure"
},
{
"name": "api_version",
"type": "string",
"required": false,
"description": "The version of the OpenAI API."
},
{
"name": "http_proxy",
"type": "string",
"required": false,
"description": "The http or https proxy to use openai"
"description": "The real model name to pass to the provider, default is None. If backend is None, use name as the real model name."
},
{
"name": "provider",
"type": "string",
"required": false,
"description": "The provider of the model. If model is deployed in local, this is the inference type. If model is deployed in third-party service, this is platform name('proxy/<platform>')",
"defaultValue": "proxy/gitee"
},
{
"name": "verbose",
"type": "boolean",
"required": false,
"description": "Show verbose output.",
"defaultValue": "False"
},
{
"name": "concurrency",
"type": "integer",
"required": false,
"description": "Model concurrency limit",
"defaultValue": "100"
},
{
"name": "prompt_template",
"type": "string",
"required": false,
"description": "Prompt template. If None, the prompt template is automatically determined from model. Just for local deployment."
},
{
"name": "context_length",
"type": "integer",
"required": false,
"description": "The context length of the OpenAI API. If None, it is determined by the model."
},
{
"name": "reasoning_model",
"type": "boolean",
"required": false,
"description": "Whether the model is a reasoning model. If None, it is automatically determined from model."
import { ConfigDetail } from "@site/src/components/mdx/ConfigDetail";
<ConfigDetail config={{
"name": "HFLLMDeployModelParameters",
"description": "Local deploy model parameters.",
"documentationUrl": "",
"parameters": [
{
"name": "name",
"type": "string",
"required": true,
"description": "The name of the model."
},
{
"name": "path",
"type": "string",
"required": false,
"description": "The path of the model, if you want to deploy a local model."
},
{
"name": "backend",
"type": "string",
"required": false,
"description": "The real model name to pass to the provider, default is None. If backend is None, use name as the real model name."
},
{
"name": "device",
"type": "string",
"required": false,
"description": "Device to run model. If None, the device is automatically determined"
},
{
"name": "provider",
"type": "string",
"required": false,
"description": "The provider of the model. If model is deployed in local, this is the inference type. If model is deployed in third-party service, this is platform name('proxy/<platform>')",
"defaultValue": "hf"
},
{
"name": "verbose",
"type": "boolean",
"required": false,
"description": "Show verbose output.",
"defaultValue": "False"
},
{
"name": "concurrency",
"type": "integer",
"required": false,
"description": "Model concurrency limit",
"defaultValue": "5"
},
{
"name": "prompt_template",
"type": "string",
"required": false,
"description": "Prompt template. If None, the prompt template is automatically determined from model. Just for local deployment."
},
{
"name": "context_length",
"type": "integer",
"required": false,
"description": "The context length of the model. If None, it is automatically determined from model."
},
{
"name": "reasoning_model",
"type": "boolean",
"required": false,
"description": "Whether the model is a reasoning model. If None, it is automatically determined from model."
"description": "Whether to use low CPU memory usage mode. It can reduce the memory when loading the model, if you load your model with quantization, it will be True by default. You must install `accelerate` to make it work."
},
{
"name": "num_gpus",
"type": "integer",
"required": false,
"description": "The number of gpus you expect to use, if it is empty, use all of them as much as possible"
},
{
"name": "max_gpu_memory",
"type": "string",
"required": false,
"description": "The maximum memory limit of each GPU, only valid in multi-GPU configuration, eg: 10GiB, 24GiB"
},
{
"name": "torch_dtype",
"type": "string",
"required": false,
"description": "The dtype of the model, default is None.",
"validValues": [
"auto",
"float16",
"bfloat16",
"float",
"float32"
]
},
{
"name": "attn_implementation",
"type": "string",
"required": false,
"description": "The attention implementation, only valid in multi-GPU configuration",
"description": "The real model name to pass to the provider, default is None. If backend is None, use name as the real model name."
},
{
"name": "device",
"type": "string",
"required": false,
"description": "Device to run model. If None, the device is automatically determined"
},
{
"name": "provider",
"type": "string",
"required": false,
"description": "The provider of the model. If model is deployed in local, this is the inference type. If model is deployed in third-party service, this is platform name('proxy/<platform>')",
"defaultValue": "llama.cpp.server"
},
{
"name": "verbose",
"type": "boolean",
"required": false,
"description": "Show verbose output.",
"defaultValue": "False"
},
{
"name": "concurrency",
"type": "integer",
"required": false,
"description": "Model concurrency limit",
"defaultValue": "20"
},
{
"name": "prompt_template",
"type": "string",
"required": false,
"description": "Prompt template. If None, the prompt template is automatically determined from model. Just for local deployment."
},
{
"name": "context_length",
"type": "integer",
"required": false,
"description": "The context length of the model. If None, it is automatically determined from model."
},
{
"name": "reasoning_model",
"type": "boolean",
"required": false,
"description": "Whether the model is a reasoning model. If None, it is automatically determined from model."
},
{
"name": "model_hf_repo",
"type": "string",
"required": false,
"description": "Hugging Face repository for model download"
},
{
"name": "model_hf_file",
"type": "string",
"required": false,
"description": "Model file name in the Hugging Face repository"
},
{
"name": "server_bin_path",
"type": "string",
"required": false,
"description": "Path to the server binary executable"
},
{
"name": "server_host",
"type": "string",
"required": false,
"description": "Host address to bind the server",
"defaultValue": "127.0.0.1"
},
{
"name": "server_port",
"type": "integer",
"required": false,
"description": "Port to bind the server. 0 for random available port",
"defaultValue": "0"
},
{
"name": "temperature",
"type": "number",
"required": false,
"description": "Sampling temperature for text generation",
"description": "The path of the model, if you want to deploy a local model."
},
{
"name": "backend",
"type": "string",
"required": false,
"description": "The real model name to pass to the provider, default is None. If backend is None, use name as the real model name."
},
{
"name": "device",
"type": "string",
"required": false,
"description": "Device to run model. If None, the device is automatically determined"
},
{
"name": "provider",
"type": "string",
"required": false,
"description": "The provider of the model. If model is deployed in local, this is the inference type. If model is deployed in third-party service, this is platform name('proxy/<platform>')",
"defaultValue": "llama.cpp"
},
{
"name": "verbose",
"type": "boolean",
"required": false,
"description": "Show verbose output.",
"defaultValue": "False"
},
{
"name": "concurrency",
"type": "integer",
"required": false,
"description": "Model concurrency limit",
"defaultValue": "5"
},
{
"name": "prompt_template",
"type": "string",
"required": false,
"description": "Prompt template. If None, the prompt template is automatically determined from model. Just for local deployment."
},
{
"name": "context_length",
"type": "integer",
"required": false,
"description": "The context length of the model. If None, it is automatically determined from model."
},
{
"name": "reasoning_model",
"type": "boolean",
"required": false,
"description": "Whether the model is a reasoning model. If None, it is automatically determined from model."
},
{
"name": "seed",
"type": "integer",
"required": false,
"description": "Random seed for llama-cpp models. -1 for random",
"defaultValue": "-1"
},
{
"name": "n_threads",
"type": "integer",
"required": false,
"description": "Number of threads to use. If None, the number of threads is automatically determined"
},
{
"name": "n_batch",
"type": "integer",
"required": false,
"description": "Maximum number of prompt tokens to batch together when calling llama_eval",
"defaultValue": "512"
},
{
"name": "n_gpu_layers",
"type": "integer",
"required": false,
"description": "Number of layers to offload to the GPU, Set this to 1000000000 to offload all layers to the GPU.",
"defaultValue": "1000000000"
},
{
"name": "n_gqa",
"type": "integer",
"required": false,
"description": "Grouped-query attention. Must be 8 for llama-2 70b."
},
{
"name": "rms_norm_eps",
"type": "number",
"required": false,
"description": "5e-6 is a good value for llama-2 models.",
"defaultValue": "5e-06"
},
{
"name": "cache_capacity",
"type": "string",
"required": false,
"description": "Maximum cache capacity. Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed. "
},
{
"name": "prefer_cpu",
"type": "boolean",
"required": false,
"description": "If a GPU is available, it will be preferred by default, unless prefer_cpu=False is configured.",
"description": "The real model name to pass to the provider, default is None. If backend is None, use name as the real model name."
},
{
"name": "provider",
"type": "string",
"required": false,
"description": "The provider of the model. If model is deployed in local, this is the inference type. If model is deployed in third-party service, this is platform name('proxy/<platform>')",
"defaultValue": "proxy/moonshot"
},
{
"name": "verbose",
"type": "boolean",
"required": false,
"description": "Show verbose output.",
"defaultValue": "False"
},
{
"name": "concurrency",
"type": "integer",
"required": false,
"description": "Model concurrency limit",
"defaultValue": "100"
},
{
"name": "prompt_template",
"type": "string",
"required": false,
"description": "Prompt template. If None, the prompt template is automatically determined from model. Just for local deployment."
},
{
"name": "context_length",
"type": "integer",
"required": false,
"description": "The context length of the OpenAI API. If None, it is determined by the model."
},
{
"name": "reasoning_model",
"type": "boolean",
"required": false,
"description": "Whether the model is a reasoning model. If None, it is automatically determined from model."
},
{
"name": "api_base",
"type": "string",
"required": false,
"description": "The base url of the Moonshot API.",
import { ConfigDetail } from "@site/src/components/mdx/ConfigDetail";
<ConfigDetail config={{
"name": "OllamaDeployModelParameters",
"description": "Ollama proxy LLM configuration.",
"documentationUrl": "https://ollama.com/library",
"parameters": [
{
"name": "name",
"type": "string",
"required": true,
"description": "The name of the model."
},
{
"name": "backend",
"type": "string",
"required": false,
"description": "The real model name to pass to the provider, default is None. If backend is None, use name as the real model name."
},
{
"name": "provider",
"type": "string",
"required": false,
"description": "The provider of the model. If model is deployed in local, this is the inference type. If model is deployed in third-party service, this is platform name('proxy/<platform>')",
"defaultValue": "proxy/ollama"
},
{
"name": "verbose",
"type": "boolean",
"required": false,
"description": "Show verbose output.",
"defaultValue": "False"
},
{
"name": "concurrency",
"type": "integer",
"required": false,
"description": "Model concurrency limit",
"defaultValue": "5"
},
{
"name": "prompt_template",
"type": "string",
"required": false,
"description": "Prompt template. If None, the prompt template is automatically determined from model. Just for local deployment."
},
{
"name": "context_length",
"type": "integer",
"required": false,
"description": "The context length of the model. If None, it is automatically determined from model."
},
{
"name": "reasoning_model",
"type": "boolean",
"required": false,
"description": "Whether the model is a reasoning model. If None, it is automatically determined from model."
description: "Bits and bytes quantization 4 bits parameters."
---
import { ConfigDetail } from "@site/src/components/mdx/ConfigDetail";
<ConfigDetail config={{
"name": "BitsandbytesQuantization4bits",
"description": "Bits and bytes quantization 4 bits parameters.",
"documentationUrl": "",
"parameters": [
{
"name": "load_in_8bits",
"type": "boolean",
"required": false,
"description": "Whether to load the model in 8 bits(LLM.int8() algorithm), default is False.",
"defaultValue": "False"
},
{
"name": "load_in_4bits",
"type": "boolean",
"required": false,
"description": "Whether to load the model in 4 bits.",
"defaultValue": "True"
},
{
"name": "bnb_4bit_compute_dtype",
"type": "string",
"required": false,
"description": "To speedup computation, you can change the data type from float32 (the default value) to bfloat16",
"validValues": [
"bfloat16",
"float16",
"float32"
]
},
{
"name": "bnb_4bit_quant_type",
"type": "string",
"required": false,
"description": "Quantization datatypes, `fp4` (four bit float) and `nf4` (normal four bit float), only valid when load_4bit=True",
"defaultValue": "nf4",
"validValues": [
"nf4",
"fp4"
]
},
{
"name": "bnb_4bit_use_double_quant",
"type": "boolean",
"required": false,
"description": "Nested quantization is a technique that can save additional memory at no additional performance cost. This feature performs a second quantization of the already quantized weights to save an additional 0.4 bits/parameter. ",
description: "Bits and bytes quantization 8 bits parameters."
---
import { ConfigDetail } from "@site/src/components/mdx/ConfigDetail";
<ConfigDetail config={{
"name": "BitsandbytesQuantization8bits",
"description": "Bits and bytes quantization 8 bits parameters.",
"documentationUrl": "",
"parameters": [
{
"name": "load_in_8bits",
"type": "boolean",
"required": false,
"description": "Whether to load the model in 8 bits(LLM.int8() algorithm).",
"defaultValue": "True"
},
{
"name": "load_in_4bits",
"type": "boolean",
"required": false,
"description": "Whether to load the model in 4 bits, default is False.",
"defaultValue": "False"
},
{
"name": "llm_int8_enable_fp32_cpu_offload",
"type": "boolean",
"required": false,
"description": "8-bit models can offload weights between the CPU and GPU to support fitting very large models into memory. The weights dispatched to the CPU are actually stored in float32, and aren’t converted to 8-bit. ",
"defaultValue": "False"
},
{
"name": "llm_int8_threshold",
"type": "number",
"required": false,
"description": "An “outlier” is a hidden state value greater than a certain threshold, and these values are computed in fp16. While the values are usually normally distributed ([-3.5, 3.5]), this distribution can be very different for large models ([-60, 6] or [6, 60]). 8-bit quantization works well for values ~5, but beyond that, there is a significant performance penalty. A good default threshold value is 6, but a lower threshold may be needed for more unstable models (small models or finetuning).",
"defaultValue": "6.0"
},
{
"name": "llm_int8_skip_modules",
"type": "string",
"required": false,
"description": "An explicit list of the modules that we do not want to convert in 8-bit. This is useful for models such as Jukebox that has several heads in different places and not necessarily at the last position. For example for `CausalLM` models, the last `lm_head` is kept in its original `dtype`",
"description": "The real model name to pass to the provider, default is None. If backend is None, use name as the real model name."
},
{
"name": "provider",
"type": "string",
"required": false,
"description": "The provider of the model. If model is deployed in local, this is the inference type. If model is deployed in third-party service, this is platform name('proxy/<platform>')",
"defaultValue": "proxy/siliconflow"
},
{
"name": "verbose",
"type": "boolean",
"required": false,
"description": "Show verbose output.",
"defaultValue": "False"
},
{
"name": "concurrency",
"type": "integer",
"required": false,
"description": "Model concurrency limit",
"defaultValue": "100"
},
{
"name": "prompt_template",
"type": "string",
"required": false,
"description": "Prompt template. If None, the prompt template is automatically determined from model. Just for local deployment."
},
{
"name": "context_length",
"type": "integer",
"required": false,
"description": "The context length of the OpenAI API. If None, it is determined by the model."
},
{
"name": "reasoning_model",
"type": "boolean",
"required": false,
"description": "Whether the model is a reasoning model. If None, it is automatically determined from model."
},
{
"name": "api_base",
"type": "string",
"required": false,
"description": "The base url of the SiliconFlow API.",
"description": "The real model name to pass to the provider, default is None. If backend is None, use name as the real model name."
},
{
"name": "provider",
"type": "string",
"required": false,
"description": "The provider of the model. If model is deployed in local, this is the inference type. If model is deployed in third-party service, this is platform name('proxy/<platform>')",
"defaultValue": "proxy/spark"
},
{
"name": "verbose",
"type": "boolean",
"required": false,
"description": "Show verbose output.",
"defaultValue": "False"
},
{
"name": "concurrency",
"type": "integer",
"required": false,
"description": "Model concurrency limit",
"defaultValue": "100"
},
{
"name": "prompt_template",
"type": "string",
"required": false,
"description": "Prompt template. If None, the prompt template is automatically determined from model. Just for local deployment."
},
{
"name": "context_length",
"type": "integer",
"required": false,
"description": "The context length of the OpenAI API. If None, it is determined by the model."
},
{
"name": "reasoning_model",
"type": "boolean",
"required": false,
"description": "Whether the model is a reasoning model. If None, it is automatically determined from model."
"description": "The real model name to pass to the provider, default is None. If backend is None, use name as the real model name."
},
{
"name": "provider",
"type": "string",
"required": false,
"description": "The provider of the model. If model is deployed in local, this is the inference type. If model is deployed in third-party service, this is platform name('proxy/<platform>')",
"defaultValue": "proxy/tongyi"
},
{
"name": "verbose",
"type": "boolean",
"required": false,
"description": "Show verbose output.",
"defaultValue": "False"
},
{
"name": "concurrency",
"type": "integer",
"required": false,
"description": "Model concurrency limit",
"defaultValue": "100"
},
{
"name": "prompt_template",
"type": "string",
"required": false,
"description": "Prompt template. If None, the prompt template is automatically determined from model. Just for local deployment."
},
{
"name": "context_length",
"type": "integer",
"required": false,
"description": "The context length of the OpenAI API. If None, it is determined by the model."
},
{
"name": "reasoning_model",
"type": "boolean",
"required": false,
"description": "Whether the model is a reasoning model. If None, it is automatically determined from model."
import { ConfigDetail } from "@site/src/components/mdx/ConfigDetail";
<ConfigDetail config={{
"name": "VLLMDeployModelParameters",
"description": "Local deploy model parameters.",
"documentationUrl": "",
"parameters": [
{
"name": "name",
"type": "string",
"required": true,
"description": "The name of the model."
},
{
"name": "path",
"type": "string",
"required": false,
"description": "The path of the model, if you want to deploy a local model."
},
{
"name": "backend",
"type": "string",
"required": false,
"description": "The real model name to pass to the provider, default is None. If backend is None, use name as the real model name."
},
{
"name": "device",
"type": "string",
"required": false,
"description": "Device to run model. If None, the device is automatically determined",
"defaultValue": "auto"
},
{
"name": "provider",
"type": "string",
"required": false,
"description": "The provider of the model. If model is deployed in local, this is the inference type. If model is deployed in third-party service, this is platform name('proxy/<platform>')",
"defaultValue": "vllm"
},
{
"name": "verbose",
"type": "boolean",
"required": false,
"description": "Show verbose output.",
"defaultValue": "False"
},
{
"name": "concurrency",
"type": "integer",
"required": false,
"description": "Model concurrency limit",
"defaultValue": "100"
},
{
"name": "prompt_template",
"type": "string",
"required": false,
"description": "Prompt template. If None, the prompt template is automatically determined from model. Just for local deployment."
},
{
"name": "context_length",
"type": "integer",
"required": false,
"description": "The context length of the model. If None, it is automatically determined from model."
},
{
"name": "reasoning_model",
"type": "boolean",
"required": false,
"description": "Whether the model is a reasoning model. If None, it is automatically determined from model."
},
{
"name": "trust_remote_code",
"type": "boolean",
"required": false,
"description": "Trust remote code or not.",
"defaultValue": "True"
},
{
"name": "download_dir",
"type": "string",
"required": false,
"description": "Directory to download and load the weights, default to the default cache dir of huggingface."
},
{
"name": "load_format",
"type": "string",
"required": false,
"description": "The format of the model weights to load.\n\n* \"auto\" will try to load the weights in the safetensors format and fall back to the pytorch bin format if safetensors format is not available.\n* \"pt\" will load the weights in the pytorch bin format.\n* \"safetensors\" will load the weights in the safetensors format.\n* \"npcache\" will load the weights in pytorch format and store a numpy cache to speed up the loading.\n* \"dummy\" will initialize the weights with random values, which is mainly for profiling.\n* \"tensorizer\" will load the weights using tensorizer from CoreWeave. See the Tensorize vLLM Model script in the Examples section for more information.\n* \"runai_streamer\" will load the Safetensors weights using Run:aiModel Streamer \n* \"bitsandbytes\" will load the weights using bitsandbytes quantization.\n",
"defaultValue": "auto",
"validValues": [
"auto",
"pt",
"safetensors",
"npcache",
"dummy",
"tensorizer",
"runai_streamer",
"bitsandbytes",
"sharded_state",
"gguf",
"mistral"
]
},
{
"name": "config_format",
"type": "string",
"required": false,
"description": "The format of the model config to load.\n\n* \"auto\" will try to load the config in hf format if available else it will try to load in mistral format ",
"defaultValue": "auto",
"validValues": [
"auto",
"hf",
"mistral"
]
},
{
"name": "dtype",
"type": "string",
"required": false,
"description": "Data type for model weights and activations.\n\n* \"auto\" will use FP16 precision for FP32 and FP16 models, and BF16 precision for BF16 models.\n* \"half\" for FP16. Recommended for AWQ quantization.\n* \"float16\" is the same as \"half\".\n* \"bfloat16\" for a balance between precision and range.\n* \"float\" is shorthand for FP32 precision.\n* \"float32\" for FP32 precision.",
"defaultValue": "auto",
"validValues": [
"auto",
"half",
"float16",
"bfloat16",
"float",
"float32"
]
},
{
"name": "kv_cache_dtype",
"type": "string",
"required": false,
"description": "Data type for kv cache storage. If \"auto\", will use model data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ROCm (AMD GPU) supports fp8 (=fp8_e4m3)",
"defaultValue": "auto",
"validValues": [
"auto",
"fp8",
"fp8_e5m2",
"fp8_e4m3"
]
},
{
"name": "seed",
"type": "integer",
"required": false,
"description": "Random seed for operations.",
"defaultValue": "0"
},
{
"name": "max_model_len",
"type": "integer",
"required": false,
"description": "Model context length. If unspecified, will be automatically derived from the model config."
},
{
"name": "distributed_executor_backend",
"type": "string",
"required": false,
"description": "Backend to use for distributed model workers, either \"ray\" or \"mp\" (multiprocessing). If the product of pipeline_parallel_size and tensor_parallel_size is less than or equal to the number of GPUs available, \"mp\" will be used to keep processing on a single host. Otherwise, this will default to \"ray\" if Ray is installed and fail otherwise. Note that tpu only supports Ray for distributed inference.",
"validValues": [
"ray",
"mp",
"uni",
"external_launcher"
]
},
{
"name": "pipeline_parallel_size",
"type": "integer",
"required": false,
"description": "Number of pipeline stages.",
"defaultValue": "1"
},
{
"name": "tensor_parallel_size",
"type": "integer",
"required": false,
"description": "Number of tensor parallel replicas.",
"defaultValue": "1"
},
{
"name": "max_parallel_loading_workers",
"type": "integer",
"required": false,
"description": "Load model sequentially in multiple batches, to avoid RAM OOM when using tensor parallel and large models."
},
{
"name": "block_size",
"type": "integer",
"required": false,
"description": "Token block size for contiguous chunks of tokens. This is ignored on neuron devices and set to ``--max-model-len``. On CUDA devices, only block sizes up to 32 are supported. On HPU devices, block size defaults to 128.",
"description": "CPU swap space size (GiB) per GPU.",
"defaultValue": "4"
},
{
"name": "cpu_offload_gb",
"type": "number",
"required": false,
"description": "The space in GiB to offload to CPU, per GPU. Default is 0, which means no offloading. Intuitively, this argument can be seen as a virtual way to increase the GPU memory size. For example, if you have one 24 GB GPU and set this to 10, virtually you can think of it as a 34 GB GPU. Then you can load a 13B model with BF16 weight, which requires at least 26GB GPU memory. Note that this requires fast CPU-GPU interconnect, as part of the model is loaded from CPU memory to GPU memory on the fly in each model forward pass.",
"defaultValue": "0"
},
{
"name": "gpu_memory_utilization",
"type": "number",
"required": false,
"description": "The fraction of GPU memory to be used for the model executor, which can range from 0 to 1. For example, a value of 0.5 would imply 50%% GPU memory utilization. If unspecified, will use the default value of 0.9. This is a per-instance limit, and only applies to the current vLLM instance.It does not matter if you have another vLLM instance running on the same GPU. For example, if you have two vLLM instances running on the same GPU, you can set the GPU memory utilization to 0.5 for each instance.",
"defaultValue": "0.9"
},
{
"name": "max_num_batched_tokens",
"type": "integer",
"required": false,
"description": "Maximum number of batched tokens per iteration."
},
{
"name": "max_num_seqs",
"type": "integer",
"required": false,
"description": "Maximum number of sequences per iteration."
},
{
"name": "max_logprobs",
"type": "integer",
"required": false,
"description": "Max number of log probs to return logprobs is specified in SamplingParams.",
"defaultValue": "20"
},
{
"name": "revision",
"type": "string",
"required": false,
"description": "The specific model version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version."
},
{
"name": "code_revision",
"type": "string",
"required": false,
"description": "The specific revision to use for the model code on Hugging Face Hub. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version."
},
{
"name": "tokenizer_revision",
"type": "string",
"required": false,
"description": "Revision of the huggingface tokenizer to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version."
},
{
"name": "tokenizer_mode",
"type": "string",
"required": false,
"description": "The tokenizer mode.\n\n* \"auto\" will use the fast tokenizer if available.\n* \"slow\" will always use the slow tokenizer. \n* \"mistral\" will always use the `mistral_common` tokenizer.",
"defaultValue": "auto",
"validValues": [
"auto",
"slow",
"mistral"
]
},
{
"name": "quantization",
"type": "string",
"required": false,
"description": "Method used to quantize the weights. If None, we first check the `quantization_config` attribute in the model config file. If that is None, we assume the model weights are not quantized and use `dtype` to determine the data type of the weights.",
"validValues": [
"aqlm",
"awq",
"deepspeedfp",
"tpu_int8",
"fp8",
"ptpc_fp8",
"fbgemm_fp8",
"modelopt",
"marlin",
"gguf",
"gptq_marlin_24",
"gptq_marlin",
"awq_marlin",
"gptq",
"compressed-tensors",
"bitsandbytes",
"qqq",
"hqq",
"experts_int8",
"neuron_quant",
"ipex",
"quark",
"moe_wna16"
]
},
{
"name": "max_seq_len_to_capture",
"type": "integer",
"required": false,
"description": "Maximum sequence length covered by CUDA graphs. When a sequence has context length larger than this, we fall back to eager mode. Additionally for encoder-decoder models, if the sequence length of the encoder input is larger than this, we fall back to the eager mode.",
"defaultValue": "8192"
},
{
"name": "worker_cls",
"type": "string",
"required": false,
"description": "The worker class to use for distributed execution.",
"defaultValue": "auto"
},
{
"name": "extras",
"type": "object",
"required": false,
"description": "Extra parameters, it will be passed to the vllm engine."
"description": "The real model name to pass to the provider, default is None. If backend is None, use name as the real model name."
},
{
"name": "provider",
"type": "string",
"required": false,
"description": "The provider of the model. If model is deployed in local, this is the inference type. If model is deployed in third-party service, this is platform name('proxy/<platform>')",
"defaultValue": "proxy/volcengine"
},
{
"name": "verbose",
"type": "boolean",
"required": false,
"description": "Show verbose output.",
"defaultValue": "False"
},
{
"name": "concurrency",
"type": "integer",
"required": false,
"description": "Model concurrency limit",
"defaultValue": "100"
},
{
"name": "prompt_template",
"type": "string",
"required": false,
"description": "Prompt template. If None, the prompt template is automatically determined from model. Just for local deployment."
},
{
"name": "context_length",
"type": "integer",
"required": false,
"description": "The context length of the OpenAI API. If None, it is determined by the model."
},
{
"name": "reasoning_model",
"type": "boolean",
"required": false,
"description": "Whether the model is a reasoning model. If None, it is automatically determined from model."
},
{
"name": "api_base",
"type": "string",
"required": false,
"description": "The base url of the Volcengine API.",
"description": "The real model name to pass to the provider, default is None. If backend is None, use name as the real model name."
},
{
"name": "provider",
"type": "string",
"required": false,
"description": "The provider of the model. If model is deployed in local, this is the inference type. If model is deployed in third-party service, this is platform name('proxy/<platform>')",
"defaultValue": "proxy/wenxin"
},
{
"name": "verbose",
"type": "boolean",
"required": false,
"description": "Show verbose output.",
"defaultValue": "False"
},
{
"name": "concurrency",
"type": "integer",
"required": false,
"description": "Model concurrency limit",
"defaultValue": "100"
},
{
"name": "prompt_template",
"type": "string",
"required": false,
"description": "Prompt template. If None, the prompt template is automatically determined from model. Just for local deployment."
},
{
"name": "context_length",
"type": "integer",
"required": false,
"description": "The context length of the OpenAI API. If None, it is determined by the model."
},
{
"name": "reasoning_model",
"type": "boolean",
"required": false,
"description": "Whether the model is a reasoning model. If None, it is automatically determined from model."
},
{
"name": "api_key",
"type": "string",
"required": false,
"description": "The API key of the Wenxin API.",
"defaultValue": "${env:WEN_XIN_API_KEY}"
},
{
"name": "api_secret",
"type": "string",
"required": false,
"description": "The API secret key of the Wenxin API.",
"description": "The real model name to pass to the provider, default is None. If backend is None, use name as the real model name."
},
{
"name": "provider",
"type": "string",
"required": false,
"description": "The provider of the model. If model is deployed in local, this is the inference type. If model is deployed in third-party service, this is platform name('proxy/<platform>')",
"defaultValue": "proxy/yi"
},
{
"name": "verbose",
"type": "boolean",
"required": false,
"description": "Show verbose output.",
"defaultValue": "False"
},
{
"name": "concurrency",
"type": "integer",
"required": false,
"description": "Model concurrency limit",
"defaultValue": "100"
},
{
"name": "prompt_template",
"type": "string",
"required": false,
"description": "Prompt template. If None, the prompt template is automatically determined from model. Just for local deployment."
},
{
"name": "context_length",
"type": "integer",
"required": false,
"description": "The context length of the OpenAI API. If None, it is determined by the model."
},
{
"name": "reasoning_model",
"type": "boolean",
"required": false,
"description": "Whether the model is a reasoning model. If None, it is automatically determined from model."