[fix] 去掉 merge-lora 参数 (remove the `--merge-lora` CLI argument)

9f48b238 · 王敏 · e14b43ff · 19470842 · 9f48b238
Commit 9f48b238 authored May 26, 2025 by 王敏
Show whitespace changes
Inline Side-by-side

Showing with 6 additions and 4 deletions

vllm/engine/arg_utils.py vllm/engine/arg_utils.py +6 -4

No files found.
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -420,7 +420,7 @@ class EngineArgs:
            '--tokenizer-mode',
            type=str,
            default=EngineArgs.tokenizer_mode,
-            choices=['auto', 'slow', 'mistral', 'custom'],
+            choices=['auto', 'cpm', 'slow', 'mistral', 'custom'],
            help='The tokenizer mode.\n\n* "auto" will use the '
            'fast tokenizer if available.\n* "slow" will '
            'always use the slow tokenizer. \n* '
@@ -704,9 +704,6 @@ class EngineArgs:
        lora_group.add_argument('--max-loras', **lora_kwargs["max_loras"])
        lora_group.add_argument('--max-lora-rank',
                                **lora_kwargs["max_lora_rank"])
-        lora_group.add_argument('--merge-lora',
-                            action=argparse.BooleanOptionalAction,
-                            help='If set to True, the weights of the base layer will be merged with the weights of Lora.')
        lora_group.add_argument('--lora-target-modules',
                            **lora_kwargs["lora_target_modules"])
        lora_group.add_argument('--lora-extra-vocab-size',
@@ -1381,6 +1378,11 @@ class EngineArgs:
                from vllm.attention.utils.fa_utils import (
                    flash_attn_supports_fp8)
                supported = flash_attn_supports_fp8()
+            int8_attention = self.kv_cache_dtype.startswith("int8")
+            if int8_attention:
+                supported = True
            if not supported:
                _raise_or_fallback(feature_name="--kv-cache-dtype",
                                   recommend_to_remove=False)