Commit 2b8700e0 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge branch 'v0.8.5.post1-dev-wm' into 'v0.8.5.post1-dev'

V0.8.5.post1 dev wm

See merge request dcutoolkit/deeplearing/vllm!122
parents 19470842 9f48b238
...@@ -92,7 +92,6 @@ def run_vllm( ...@@ -92,7 +92,6 @@ def run_vllm(
spec_decoding_acceptance_method: str = None, spec_decoding_acceptance_method: str = None,
enable_lora: bool = False, enable_lora: bool = False,
max_lora_rank: int = 32, max_lora_rank: int = 32,
merge_lora: bool = False,
lora_extra_vocab_size: int = 0, lora_extra_vocab_size: int = 0,
lora_target_modules: List[str] = None, lora_target_modules: List[str] = None,
num_speculative_heads: int = 5, num_speculative_heads: int = 5,
...@@ -131,7 +130,6 @@ def run_vllm( ...@@ -131,7 +130,6 @@ def run_vllm(
spec_decoding_acceptance_method=spec_decoding_acceptance_method, spec_decoding_acceptance_method=spec_decoding_acceptance_method,
enable_lora=enable_lora, enable_lora=enable_lora,
max_lora_rank=max_lora_rank, max_lora_rank=max_lora_rank,
merge_lora=merge_lora,
lora_extra_vocab_size=lora_extra_vocab_size, lora_extra_vocab_size=lora_extra_vocab_size,
lora_target_modules=lora_target_modules, lora_target_modules=lora_target_modules,
num_speculative_heads=num_speculative_heads, num_speculative_heads=num_speculative_heads,
...@@ -226,7 +224,6 @@ async def run_vllm_async( ...@@ -226,7 +224,6 @@ async def run_vllm_async(
spec_decoding_acceptance_method: str = None, spec_decoding_acceptance_method: str = None,
enable_lora: bool = False, enable_lora: bool = False,
max_lora_rank: int = 32, max_lora_rank: int = 32,
merge_lora: bool = False,
lora_extra_vocab_size: int = 0, lora_extra_vocab_size: int = 0,
lora_target_modules: List[str] = None, lora_target_modules: List[str] = None,
num_speculative_heads: int = 5, num_speculative_heads: int = 5,
...@@ -267,7 +264,6 @@ async def run_vllm_async( ...@@ -267,7 +264,6 @@ async def run_vllm_async(
spec_decoding_acceptance_method=spec_decoding_acceptance_method, spec_decoding_acceptance_method=spec_decoding_acceptance_method,
enable_lora=enable_lora, enable_lora=enable_lora,
max_lora_rank=max_lora_rank, max_lora_rank=max_lora_rank,
merge_lora=merge_lora,
lora_extra_vocab_size=lora_extra_vocab_size, lora_extra_vocab_size=lora_extra_vocab_size,
lora_target_modules=lora_target_modules, lora_target_modules=lora_target_modules,
num_speculative_heads=num_speculative_heads, num_speculative_heads=num_speculative_heads,
...@@ -342,7 +338,7 @@ def main(args: argparse.Namespace): ...@@ -342,7 +338,7 @@ def main(args: argparse.Namespace):
args.disable_async_output_proc, False, args.max_num_seqs, args.disable_async_output_proc, False, args.max_num_seqs,
args.speculative_model, args.speculative_draft_tensor_parallel_size, args.speculative_model, args.speculative_draft_tensor_parallel_size,
args.speculative_disable_by_batch_size, args.spec_decoding_acceptance_method, args.speculative_disable_by_batch_size, args.spec_decoding_acceptance_method,
args.enable_lora, args.max_lora_rank, args.merge_lora, args.lora_extra_vocab_size, args.enable_lora, args.max_lora_rank, args.lora_extra_vocab_size,
args.lora_target_modules, args.num_speculative_heads, args.lora_target_modules, args.num_speculative_heads,
args.num_speculative_tokens args.num_speculative_tokens
] ]
...@@ -360,7 +356,7 @@ def main(args: argparse.Namespace): ...@@ -360,7 +356,7 @@ def main(args: argparse.Namespace):
args.disable_async_output_proc, args.max_num_seqs, args.disable_async_output_proc, args.max_num_seqs,
args.speculative_model, args.speculative_draft_tensor_parallel_size, args.speculative_model, args.speculative_draft_tensor_parallel_size,
args.speculative_disable_by_batch_size, args.spec_decoding_acceptance_method, args.speculative_disable_by_batch_size, args.spec_decoding_acceptance_method,
args.enable_lora, args.max_lora_rank, args.merge_lora, args.lora_extra_vocab_size, args.enable_lora, args.max_lora_rank, args.lora_extra_vocab_size,
args.lora_target_modules, args.num_speculative_heads, args.lora_target_modules, args.num_speculative_heads,
args.num_speculative_tokens args.num_speculative_tokens
] ]
......
...@@ -2635,9 +2635,6 @@ class LoRAConfig: ...@@ -2635,9 +2635,6 @@ class LoRAConfig:
max_cpu_loras: Optional[int] = None max_cpu_loras: Optional[int] = None
"""Maximum number of LoRAs to store in CPU memory. Must be >= than """Maximum number of LoRAs to store in CPU memory. Must be >= than
`max_loras`.""" `max_loras`."""
merge_lora: bool = False
"""Enable merge lora
"""
lora_target_modules: Optional[List[str]] = None lora_target_modules: Optional[List[str]] = None
"""List of lora module name, If not specified, """List of lora module name, If not specified,
modules will be chosen according to the model architecture. modules will be chosen according to the model architecture.
...@@ -2678,7 +2675,6 @@ class LoRAConfig: ...@@ -2678,7 +2675,6 @@ class LoRAConfig:
factors.append(self.lora_extra_vocab_size) factors.append(self.lora_extra_vocab_size)
factors.append(self.long_lora_scaling_factors) factors.append(self.long_lora_scaling_factors)
factors.append(self.bias_enabled) factors.append(self.bias_enabled)
factors.append(self.merge_lora)
hash_str = hashlib.md5(str(factors).encode(), hash_str = hashlib.md5(str(factors).encode(),
usedforsecurity=False).hexdigest() usedforsecurity=False).hexdigest()
return hash_str return hash_str
...@@ -2704,10 +2700,6 @@ class LoRAConfig: ...@@ -2704,10 +2700,6 @@ class LoRAConfig:
raise ValueError( raise ValueError(
f"max_cpu_loras ({self.max_cpu_loras}) must be >= " f"max_cpu_loras ({self.max_cpu_loras}) must be >= "
f"max_loras ({self.max_loras})") f"max_loras ({self.max_loras})")
if self.merge_lora and self.max_loras > 1:
raise ValueError(
f"merge_lora ({self.merge_lora}) can only be used when "
f"max_loras ({self.max_loras}) is 1")
def verify_with_cache_config(self, cache_config: CacheConfig): def verify_with_cache_config(self, cache_config: CacheConfig):
if cache_config.cpu_offload_gb > 0 and not envs.VLLM_USE_V1: if cache_config.cpu_offload_gb > 0 and not envs.VLLM_USE_V1:
......
...@@ -280,7 +280,6 @@ class EngineArgs: ...@@ -280,7 +280,6 @@ class EngineArgs:
max_lora_rank: int = LoRAConfig.max_lora_rank max_lora_rank: int = LoRAConfig.max_lora_rank
fully_sharded_loras: bool = LoRAConfig.fully_sharded_loras fully_sharded_loras: bool = LoRAConfig.fully_sharded_loras
max_cpu_loras: Optional[int] = LoRAConfig.max_cpu_loras max_cpu_loras: Optional[int] = LoRAConfig.max_cpu_loras
merge_lora: bool = LoRAConfig.merge_lora
lora_target_modules: Optional[List[str]] = LoRAConfig.lora_target_modules lora_target_modules: Optional[List[str]] = LoRAConfig.lora_target_modules
lora_dtype: Optional[Union[str, torch.dtype]] = LoRAConfig.lora_dtype lora_dtype: Optional[Union[str, torch.dtype]] = LoRAConfig.lora_dtype
lora_extra_vocab_size: int = LoRAConfig.lora_extra_vocab_size lora_extra_vocab_size: int = LoRAConfig.lora_extra_vocab_size
...@@ -705,8 +704,6 @@ class EngineArgs: ...@@ -705,8 +704,6 @@ class EngineArgs:
lora_group.add_argument('--max-loras', **lora_kwargs["max_loras"]) lora_group.add_argument('--max-loras', **lora_kwargs["max_loras"])
lora_group.add_argument('--max-lora-rank', lora_group.add_argument('--max-lora-rank',
**lora_kwargs["max_lora_rank"]) **lora_kwargs["max_lora_rank"])
lora_group.add_argument('--merge-lora',
**lora_kwargs["merge_lora"])
lora_group.add_argument('--lora-target-modules', lora_group.add_argument('--lora-target-modules',
**lora_kwargs["lora_target_modules"]) **lora_kwargs["lora_target_modules"])
lora_group.add_argument('--lora-extra-vocab-size', lora_group.add_argument('--lora-extra-vocab-size',
...@@ -1238,7 +1235,6 @@ class EngineArgs: ...@@ -1238,7 +1235,6 @@ class EngineArgs:
lora_dtype=self.lora_dtype, lora_dtype=self.lora_dtype,
max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras
and self.max_cpu_loras > 0 else None, and self.max_cpu_loras > 0 else None,
merge_lora=self.merge_lora,
lora_target_modules=self.lora_target_modules) if self.enable_lora else None lora_target_modules=self.lora_target_modules) if self.enable_lora else None
if self.qlora_adapter_name_or_path is not None and \ if self.qlora_adapter_name_or_path is not None and \
......
...@@ -345,6 +345,9 @@ class LoRAModelManager(AdapterModelManager): ...@@ -345,6 +345,9 @@ class LoRAModelManager(AdapterModelManager):
self.supported_lora_modules = get_supported_lora_modules(self.model) self.supported_lora_modules = get_supported_lora_modules(self.model)
assert self.supported_lora_modules, "No supported LoRA modules found in" assert self.supported_lora_modules, "No supported LoRA modules found in"
f"{self.model.__class__.__name__}." f"{self.model.__class__.__name__}."
if lora_config.lora_target_modules is not None:
self.supported_lora_modules = lora_config.lora_target_modules
if lora_config.long_lora_scaling_factors: if lora_config.long_lora_scaling_factors:
# We need to replace rotary emb layer to do batch computation # We need to replace rotary emb layer to do batch computation
# for long lora. # for long lora.
......
...@@ -258,7 +258,8 @@ def _lora_expand( ...@@ -258,7 +258,8 @@ def _lora_expand(
num_warps=NUM_WARPS, num_warps=NUM_WARPS,
num_ctas=NUM_CTAS, num_ctas=NUM_CTAS,
num_stages=NUM_STAGES, num_stages=NUM_STAGES,
maxnreg=MAX_NREG, # Triton does not currently support maxnreg
#maxnreg=MAX_NREG,
) )
return return
......
...@@ -213,7 +213,8 @@ def _lora_shrink( ...@@ -213,7 +213,8 @@ def _lora_shrink(
num_warps=NUM_WARPS, num_warps=NUM_WARPS,
num_ctas=NUM_CTAS, num_ctas=NUM_CTAS,
num_stages=NUM_STAGES, num_stages=NUM_STAGES,
maxnreg=MAX_NREG, # Triton does not currently support maxnreg
#maxnreg=MAX_NREG,
) )
return return
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment