"vscode:/vscode.git/clone" did not exist on "5ff0d32580eb4daadadece5ee39d33f43f230108"
Commit e14b43ff authored by 王敏
Browse files

[fix]1.去掉lora kernel中的maxnreg参数,暂不支持;2.去掉merge-lora参数

parent 7f022e4d
......@@ -92,7 +92,6 @@ def run_vllm(
spec_decoding_acceptance_method: str = None,
enable_lora: bool = False,
max_lora_rank: int = 32,
merge_lora: bool = False,
lora_extra_vocab_size: int = 0,
lora_target_modules: List[str] = None,
num_speculative_heads: int = 5,
......@@ -131,7 +130,6 @@ def run_vllm(
spec_decoding_acceptance_method=spec_decoding_acceptance_method,
enable_lora=enable_lora,
max_lora_rank=max_lora_rank,
merge_lora=merge_lora,
lora_extra_vocab_size=lora_extra_vocab_size,
lora_target_modules=lora_target_modules,
num_speculative_heads=num_speculative_heads,
......@@ -226,7 +224,6 @@ async def run_vllm_async(
spec_decoding_acceptance_method: str = None,
enable_lora: bool = False,
max_lora_rank: int = 32,
merge_lora: bool = False,
lora_extra_vocab_size: int = 0,
lora_target_modules: List[str] = None,
num_speculative_heads: int = 5,
......@@ -267,7 +264,6 @@ async def run_vllm_async(
spec_decoding_acceptance_method=spec_decoding_acceptance_method,
enable_lora=enable_lora,
max_lora_rank=max_lora_rank,
merge_lora=merge_lora,
lora_extra_vocab_size=lora_extra_vocab_size,
lora_target_modules=lora_target_modules,
num_speculative_heads=num_speculative_heads,
......@@ -342,7 +338,7 @@ def main(args: argparse.Namespace):
args.disable_async_output_proc, False, args.max_num_seqs,
args.speculative_model, args.speculative_draft_tensor_parallel_size,
args.speculative_disable_by_batch_size, args.spec_decoding_acceptance_method,
args.enable_lora, args.max_lora_rank, args.merge_lora, args.lora_extra_vocab_size,
args.enable_lora, args.max_lora_rank, args.lora_extra_vocab_size,
args.lora_target_modules, args.num_speculative_heads,
args.num_speculative_tokens
]
......@@ -360,7 +356,7 @@ def main(args: argparse.Namespace):
args.disable_async_output_proc, args.max_num_seqs,
args.speculative_model, args.speculative_draft_tensor_parallel_size,
args.speculative_disable_by_batch_size, args.spec_decoding_acceptance_method,
args.enable_lora, args.max_lora_rank, args.merge_lora, args.lora_extra_vocab_size,
args.enable_lora, args.max_lora_rank, args.lora_extra_vocab_size,
args.lora_target_modules, args.num_speculative_heads,
args.num_speculative_tokens
]
......
......@@ -2635,9 +2635,6 @@ class LoRAConfig:
max_cpu_loras: Optional[int] = None
"""Maximum number of LoRAs to store in CPU memory. Must be >= than
`max_loras`."""
merge_lora: bool = False
"""Enable merge lora
"""
lora_target_modules: Optional[List[str]] = None
"""List of lora module name, If not specified,
modules will be chosen according to the model architecture.
......@@ -2678,7 +2675,6 @@ class LoRAConfig:
factors.append(self.lora_extra_vocab_size)
factors.append(self.long_lora_scaling_factors)
factors.append(self.bias_enabled)
factors.append(self.merge_lora)
hash_str = hashlib.md5(str(factors).encode(),
usedforsecurity=False).hexdigest()
return hash_str
......@@ -2704,10 +2700,6 @@ class LoRAConfig:
raise ValueError(
f"max_cpu_loras ({self.max_cpu_loras}) must be >= "
f"max_loras ({self.max_loras})")
if self.merge_lora and self.max_loras > 1:
raise ValueError(
f"merge_lora ({self.merge_lora}) can only be used when "
f"max_loras ({self.max_loras}) is 1")
def verify_with_cache_config(self, cache_config: CacheConfig):
if cache_config.cpu_offload_gb > 0 and not envs.VLLM_USE_V1:
......
......@@ -280,7 +280,6 @@ class EngineArgs:
max_lora_rank: int = LoRAConfig.max_lora_rank
fully_sharded_loras: bool = LoRAConfig.fully_sharded_loras
max_cpu_loras: Optional[int] = LoRAConfig.max_cpu_loras
merge_lora: bool = LoRAConfig.merge_lora
lora_target_modules: Optional[List[str]] = LoRAConfig.lora_target_modules
lora_dtype: Optional[Union[str, torch.dtype]] = LoRAConfig.lora_dtype
lora_extra_vocab_size: int = LoRAConfig.lora_extra_vocab_size
......@@ -421,7 +420,7 @@ class EngineArgs:
'--tokenizer-mode',
type=str,
default=EngineArgs.tokenizer_mode,
choices=['auto', 'cpm', 'slow', 'mistral', 'custom'],
choices=['auto', 'slow', 'mistral', 'custom'],
help='The tokenizer mode.\n\n* "auto" will use the '
'fast tokenizer if available.\n* "slow" will '
'always use the slow tokenizer. \n* '
......@@ -706,9 +705,8 @@ class EngineArgs:
lora_group.add_argument('--max-lora-rank',
**lora_kwargs["max_lora_rank"])
lora_group.add_argument('--merge-lora',
**lora_kwargs["merge-lora"])
# action=argparse.BooleanOptionalAction,
# help='If set to True, the weights of the base layer will be merged with the weights of Lora.')
action=argparse.BooleanOptionalAction,
help='If set to True, the weights of the base layer will be merged with the weights of Lora.')
lora_group.add_argument('--lora-target-modules',
**lora_kwargs["lora_target_modules"])
lora_group.add_argument('--lora-extra-vocab-size',
......@@ -1240,7 +1238,6 @@ class EngineArgs:
lora_dtype=self.lora_dtype,
max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras
and self.max_cpu_loras > 0 else None,
merge_lora=self.merge_lora,
lora_target_modules=self.lora_target_modules) if self.enable_lora else None
if self.qlora_adapter_name_or_path is not None and \
......@@ -1384,11 +1381,6 @@ class EngineArgs:
from vllm.attention.utils.fa_utils import (
flash_attn_supports_fp8)
supported = flash_attn_supports_fp8()
int8_attention = self.kv_cache_dtype.startswith("int8")
if int8_attention:
supported = True
if not supported:
_raise_or_fallback(feature_name="--kv-cache-dtype",
recommend_to_remove=False)
......
......@@ -345,6 +345,9 @@ class LoRAModelManager(AdapterModelManager):
self.supported_lora_modules = get_supported_lora_modules(self.model)
assert self.supported_lora_modules, "No supported LoRA modules found in"
f"{self.model.__class__.__name__}."
if lora_config.lora_target_modules is not None:
self.supported_lora_modules = lora_config.lora_target_modules
if lora_config.long_lora_scaling_factors:
# We need to replace rotary emb layer to do batch computation
# for long lora.
......
......@@ -258,7 +258,8 @@ def _lora_expand(
num_warps=NUM_WARPS,
num_ctas=NUM_CTAS,
num_stages=NUM_STAGES,
maxnreg=MAX_NREG,
# maxnreg is currently not supported by Triton
#maxnreg=MAX_NREG,
)
return
......
......@@ -213,7 +213,8 @@ def _lora_shrink(
num_warps=NUM_WARPS,
num_ctas=NUM_CTAS,
num_stages=NUM_STAGES,
maxnreg=MAX_NREG,
# maxnreg is currently not supported by Triton
#maxnreg=MAX_NREG,
)
return
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment