"vscode:/vscode.git/clone" did not exist on "8001970ce77dffa9ee73abae520b91d479f7cd17"
Commit 2b8700e0 authored by zhuwenwen
Browse files

Merge branch 'v0.8.5.post1-dev-wm' into 'v0.8.5.post1-dev'

V0.8.5.post1 dev wm

See merge request dcutoolkit/deeplearing/vllm!122
parents 19470842 9f48b238
......@@ -92,7 +92,6 @@ def run_vllm(
spec_decoding_acceptance_method: str = None,
enable_lora: bool = False,
max_lora_rank: int = 32,
merge_lora: bool = False,
lora_extra_vocab_size: int = 0,
lora_target_modules: List[str] = None,
num_speculative_heads: int = 5,
......@@ -131,7 +130,6 @@ def run_vllm(
spec_decoding_acceptance_method=spec_decoding_acceptance_method,
enable_lora=enable_lora,
max_lora_rank=max_lora_rank,
merge_lora=merge_lora,
lora_extra_vocab_size=lora_extra_vocab_size,
lora_target_modules=lora_target_modules,
num_speculative_heads=num_speculative_heads,
......@@ -226,7 +224,6 @@ async def run_vllm_async(
spec_decoding_acceptance_method: str = None,
enable_lora: bool = False,
max_lora_rank: int = 32,
merge_lora: bool = False,
lora_extra_vocab_size: int = 0,
lora_target_modules: List[str] = None,
num_speculative_heads: int = 5,
......@@ -267,7 +264,6 @@ async def run_vllm_async(
spec_decoding_acceptance_method=spec_decoding_acceptance_method,
enable_lora=enable_lora,
max_lora_rank=max_lora_rank,
merge_lora=merge_lora,
lora_extra_vocab_size=lora_extra_vocab_size,
lora_target_modules=lora_target_modules,
num_speculative_heads=num_speculative_heads,
......@@ -342,7 +338,7 @@ def main(args: argparse.Namespace):
args.disable_async_output_proc, False, args.max_num_seqs,
args.speculative_model, args.speculative_draft_tensor_parallel_size,
args.speculative_disable_by_batch_size, args.spec_decoding_acceptance_method,
args.enable_lora, args.max_lora_rank, args.merge_lora, args.lora_extra_vocab_size,
args.enable_lora, args.max_lora_rank, args.lora_extra_vocab_size,
args.lora_target_modules, args.num_speculative_heads,
args.num_speculative_tokens
]
......@@ -360,7 +356,7 @@ def main(args: argparse.Namespace):
args.disable_async_output_proc, args.max_num_seqs,
args.speculative_model, args.speculative_draft_tensor_parallel_size,
args.speculative_disable_by_batch_size, args.spec_decoding_acceptance_method,
args.enable_lora, args.max_lora_rank, args.merge_lora, args.lora_extra_vocab_size,
args.enable_lora, args.max_lora_rank, args.lora_extra_vocab_size,
args.lora_target_modules, args.num_speculative_heads,
args.num_speculative_tokens
]
......
......@@ -2635,9 +2635,6 @@ class LoRAConfig:
max_cpu_loras: Optional[int] = None
"""Maximum number of LoRAs to store in CPU memory. Must be >= than
`max_loras`."""
merge_lora: bool = False
"""Enable merge lora
"""
lora_target_modules: Optional[List[str]] = None
"""List of lora module name, If not specified,
modules will be chosen according to the model architecture.
......@@ -2678,7 +2675,6 @@ class LoRAConfig:
factors.append(self.lora_extra_vocab_size)
factors.append(self.long_lora_scaling_factors)
factors.append(self.bias_enabled)
factors.append(self.merge_lora)
hash_str = hashlib.md5(str(factors).encode(),
usedforsecurity=False).hexdigest()
return hash_str
......@@ -2704,10 +2700,6 @@ class LoRAConfig:
raise ValueError(
f"max_cpu_loras ({self.max_cpu_loras}) must be >= "
f"max_loras ({self.max_loras})")
if self.merge_lora and self.max_loras > 1:
raise ValueError(
f"merge_lora ({self.merge_lora}) can only be used when "
f"max_loras ({self.max_loras}) is 1")
def verify_with_cache_config(self, cache_config: CacheConfig):
if cache_config.cpu_offload_gb > 0 and not envs.VLLM_USE_V1:
......
......@@ -280,7 +280,6 @@ class EngineArgs:
max_lora_rank: int = LoRAConfig.max_lora_rank
fully_sharded_loras: bool = LoRAConfig.fully_sharded_loras
max_cpu_loras: Optional[int] = LoRAConfig.max_cpu_loras
merge_lora: bool = LoRAConfig.merge_lora
lora_target_modules: Optional[List[str]] = LoRAConfig.lora_target_modules
lora_dtype: Optional[Union[str, torch.dtype]] = LoRAConfig.lora_dtype
lora_extra_vocab_size: int = LoRAConfig.lora_extra_vocab_size
......@@ -705,8 +704,6 @@ class EngineArgs:
lora_group.add_argument('--max-loras', **lora_kwargs["max_loras"])
lora_group.add_argument('--max-lora-rank',
**lora_kwargs["max_lora_rank"])
lora_group.add_argument('--merge-lora',
**lora_kwargs["merge_lora"])
lora_group.add_argument('--lora-target-modules',
**lora_kwargs["lora_target_modules"])
lora_group.add_argument('--lora-extra-vocab-size',
......@@ -1238,7 +1235,6 @@ class EngineArgs:
lora_dtype=self.lora_dtype,
max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras
and self.max_cpu_loras > 0 else None,
merge_lora=self.merge_lora,
lora_target_modules=self.lora_target_modules) if self.enable_lora else None
if self.qlora_adapter_name_or_path is not None and \
......
......@@ -345,6 +345,9 @@ class LoRAModelManager(AdapterModelManager):
self.supported_lora_modules = get_supported_lora_modules(self.model)
assert self.supported_lora_modules, "No supported LoRA modules found in"
f"{self.model.__class__.__name__}."
if lora_config.lora_target_modules is not None:
self.supported_lora_modules = lora_config.lora_target_modules
if lora_config.long_lora_scaling_factors:
# We need to replace rotary emb layer to do batch computation
# for long lora.
......
......@@ -258,7 +258,8 @@ def _lora_expand(
num_warps=NUM_WARPS,
num_ctas=NUM_CTAS,
num_stages=NUM_STAGES,
maxnreg=MAX_NREG,
# maxnreg is not currently supported by Triton
#maxnreg=MAX_NREG,
)
return
......
......@@ -213,7 +213,8 @@ def _lora_shrink(
num_warps=NUM_WARPS,
num_ctas=NUM_CTAS,
num_stages=NUM_STAGES,
maxnreg=MAX_NREG,
# maxnreg is not currently supported by Triton
#maxnreg=MAX_NREG,
)
return
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment