Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
e14b43ff
Commit
e14b43ff
authored
May 26, 2025
by
王敏
Browse files
[fix]1.去掉lora kernel中的maxnreg参数,暂不支持;2.去掉merge-lora参数
parent
7f022e4d
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
12 additions
and
27 deletions
+12
-27
examples/medusa/medusa_benchmark_throughput.py
examples/medusa/medusa_benchmark_throughput.py
+2
-6
vllm/config.py
vllm/config.py
+0
-8
vllm/engine/arg_utils.py
vllm/engine/arg_utils.py
+3
-11
vllm/lora/models.py
vllm/lora/models.py
+3
-0
vllm/lora/ops/triton_ops/lora_expand.py
vllm/lora/ops/triton_ops/lora_expand.py
+2
-1
vllm/lora/ops/triton_ops/lora_shrink.py
vllm/lora/ops/triton_ops/lora_shrink.py
+2
-1
No files found.
examples/medusa/medusa_benchmark_throughput.py
View file @
e14b43ff
...
...
@@ -92,7 +92,6 @@ def run_vllm(
spec_decoding_acceptance_method
:
str
=
None
,
enable_lora
:
bool
=
False
,
max_lora_rank
:
int
=
32
,
merge_lora
:
bool
=
False
,
lora_extra_vocab_size
:
int
=
0
,
lora_target_modules
:
List
[
str
]
=
None
,
num_speculative_heads
:
int
=
5
,
...
...
@@ -131,7 +130,6 @@ def run_vllm(
spec_decoding_acceptance_method
=
spec_decoding_acceptance_method
,
enable_lora
=
enable_lora
,
max_lora_rank
=
max_lora_rank
,
merge_lora
=
merge_lora
,
lora_extra_vocab_size
=
lora_extra_vocab_size
,
lora_target_modules
=
lora_target_modules
,
num_speculative_heads
=
num_speculative_heads
,
...
...
@@ -226,7 +224,6 @@ async def run_vllm_async(
spec_decoding_acceptance_method
:
str
=
None
,
enable_lora
:
bool
=
False
,
max_lora_rank
:
int
=
32
,
merge_lora
:
bool
=
False
,
lora_extra_vocab_size
:
int
=
0
,
lora_target_modules
:
List
[
str
]
=
None
,
num_speculative_heads
:
int
=
5
,
...
...
@@ -267,7 +264,6 @@ async def run_vllm_async(
spec_decoding_acceptance_method
=
spec_decoding_acceptance_method
,
enable_lora
=
enable_lora
,
max_lora_rank
=
max_lora_rank
,
merge_lora
=
merge_lora
,
lora_extra_vocab_size
=
lora_extra_vocab_size
,
lora_target_modules
=
lora_target_modules
,
num_speculative_heads
=
num_speculative_heads
,
...
...
@@ -342,7 +338,7 @@ def main(args: argparse.Namespace):
args
.
disable_async_output_proc
,
False
,
args
.
max_num_seqs
,
args
.
speculative_model
,
args
.
speculative_draft_tensor_parallel_size
,
args
.
speculative_disable_by_batch_size
,
args
.
spec_decoding_acceptance_method
,
args
.
enable_lora
,
args
.
max_lora_rank
,
args
.
merge_lora
,
args
.
lora_extra_vocab_size
,
args
.
enable_lora
,
args
.
max_lora_rank
,
args
.
lora_extra_vocab_size
,
args
.
lora_target_modules
,
args
.
num_speculative_heads
,
args
.
num_speculative_tokens
]
...
...
@@ -360,7 +356,7 @@ def main(args: argparse.Namespace):
args
.
disable_async_output_proc
,
args
.
max_num_seqs
,
args
.
speculative_model
,
args
.
speculative_draft_tensor_parallel_size
,
args
.
speculative_disable_by_batch_size
,
args
.
spec_decoding_acceptance_method
,
args
.
enable_lora
,
args
.
max_lora_rank
,
args
.
merge_lora
,
args
.
lora_extra_vocab_size
,
args
.
enable_lora
,
args
.
max_lora_rank
,
args
.
lora_extra_vocab_size
,
args
.
lora_target_modules
,
args
.
num_speculative_heads
,
args
.
num_speculative_tokens
]
...
...
vllm/config.py
View file @
e14b43ff
...
...
@@ -2635,9 +2635,6 @@ class LoRAConfig:
max_cpu_loras
:
Optional
[
int
]
=
None
"""Maximum number of LoRAs to store in CPU memory. Must be >= than
`max_loras`."""
merge_lora
:
bool
=
False
"""Enable merge lora
"""
lora_target_modules
:
Optional
[
List
[
str
]]
=
None
"""List of lora module name, If not specified,
modules will be chosen according to the model architecture.
...
...
@@ -2678,7 +2675,6 @@ class LoRAConfig:
factors
.
append
(
self
.
lora_extra_vocab_size
)
factors
.
append
(
self
.
long_lora_scaling_factors
)
factors
.
append
(
self
.
bias_enabled
)
factors
.
append
(
self
.
merge_lora
)
hash_str
=
hashlib
.
md5
(
str
(
factors
).
encode
(),
usedforsecurity
=
False
).
hexdigest
()
return
hash_str
...
...
@@ -2704,10 +2700,6 @@ class LoRAConfig:
raise
ValueError
(
f
"max_cpu_loras (
{
self
.
max_cpu_loras
}
) must be >= "
f
"max_loras (
{
self
.
max_loras
}
)"
)
if
self
.
merge_lora
and
self
.
max_loras
>
1
:
raise
ValueError
(
f
"merge_lora (
{
self
.
merge_lora
}
) can only be used when "
f
"max_loras (
{
self
.
max_loras
}
) is 1"
)
def
verify_with_cache_config
(
self
,
cache_config
:
CacheConfig
):
if
cache_config
.
cpu_offload_gb
>
0
and
not
envs
.
VLLM_USE_V1
:
...
...
vllm/engine/arg_utils.py
View file @
e14b43ff
...
...
@@ -280,7 +280,6 @@ class EngineArgs:
max_lora_rank
:
int
=
LoRAConfig
.
max_lora_rank
fully_sharded_loras
:
bool
=
LoRAConfig
.
fully_sharded_loras
max_cpu_loras
:
Optional
[
int
]
=
LoRAConfig
.
max_cpu_loras
merge_lora
:
bool
=
LoRAConfig
.
merge_lora
lora_target_modules
:
Optional
[
List
[
str
]]
=
LoRAConfig
.
lora_target_modules
lora_dtype
:
Optional
[
Union
[
str
,
torch
.
dtype
]]
=
LoRAConfig
.
lora_dtype
lora_extra_vocab_size
:
int
=
LoRAConfig
.
lora_extra_vocab_size
...
...
@@ -421,7 +420,7 @@ class EngineArgs:
'--tokenizer-mode'
,
type
=
str
,
default
=
EngineArgs
.
tokenizer_mode
,
choices
=
[
'auto'
,
'cpm'
,
'slow'
,
'mistral'
,
'custom'
],
choices
=
[
'auto'
,
'slow'
,
'mistral'
,
'custom'
],
help
=
'The tokenizer mode.
\n\n
* "auto" will use the '
'fast tokenizer if available.
\n
* "slow" will '
'always use the slow tokenizer.
\n
* '
...
...
@@ -706,9 +705,8 @@ class EngineArgs:
lora_group
.
add_argument
(
'--max-lora-rank'
,
**
lora_kwargs
[
"max_lora_rank"
])
lora_group
.
add_argument
(
'--merge-lora'
,
**
lora_kwargs
[
"merge-lora"
])
# action=argparse.BooleanOptionalAction,
# help='If set to True, the weights of the base layer will be merged with the weights of Lora.')
action
=
argparse
.
BooleanOptionalAction
,
help
=
'If set to True, the weights of the base layer will be merged with the weights of Lora.'
)
lora_group
.
add_argument
(
'--lora-target-modules'
,
**
lora_kwargs
[
"lora_target_modules"
])
lora_group
.
add_argument
(
'--lora-extra-vocab-size'
,
...
...
@@ -1240,7 +1238,6 @@ class EngineArgs:
lora_dtype
=
self
.
lora_dtype
,
max_cpu_loras
=
self
.
max_cpu_loras
if
self
.
max_cpu_loras
and
self
.
max_cpu_loras
>
0
else
None
,
merge_lora
=
self
.
merge_lora
,
lora_target_modules
=
self
.
lora_target_modules
)
if
self
.
enable_lora
else
None
if
self
.
qlora_adapter_name_or_path
is
not
None
and
\
...
...
@@ -1384,11 +1381,6 @@ class EngineArgs:
from
vllm.attention.utils.fa_utils
import
(
flash_attn_supports_fp8
)
supported
=
flash_attn_supports_fp8
()
int8_attention
=
self
.
kv_cache_dtype
.
startswith
(
"int8"
)
if
int8_attention
:
supported
=
True
if
not
supported
:
_raise_or_fallback
(
feature_name
=
"--kv-cache-dtype"
,
recommend_to_remove
=
False
)
...
...
vllm/lora/models.py
View file @
e14b43ff
...
...
@@ -345,6 +345,9 @@ class LoRAModelManager(AdapterModelManager):
self
.
supported_lora_modules
=
get_supported_lora_modules
(
self
.
model
)
assert
self
.
supported_lora_modules
,
"No supported LoRA modules found in"
f
"
{
self
.
model
.
__class__
.
__name__
}
."
if
lora_config
.
lora_target_modules
is
not
None
:
self
.
supported_lora_modules
=
lora_config
.
lora_target_modules
if
lora_config
.
long_lora_scaling_factors
:
# We need to replace rotary emb layer to do batch computation
# for long lora.
...
...
vllm/lora/ops/triton_ops/lora_expand.py
View file @
e14b43ff
...
...
@@ -258,7 +258,8 @@ def _lora_expand(
num_warps
=
NUM_WARPS
,
num_ctas
=
NUM_CTAS
,
num_stages
=
NUM_STAGES
,
maxnreg
=
MAX_NREG
,
# Triton is currently not supported maxnreg
#maxnreg=MAX_NREG,
)
return
...
...
vllm/lora/ops/triton_ops/lora_shrink.py
View file @
e14b43ff
...
...
@@ -213,7 +213,8 @@ def _lora_shrink(
num_warps
=
NUM_WARPS
,
num_ctas
=
NUM_CTAS
,
num_stages
=
NUM_STAGES
,
maxnreg
=
MAX_NREG
,
# Triton is currently not supported maxnreg
#maxnreg=MAX_NREG,
)
return
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment