Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
e14b43ff
Commit
e14b43ff
authored
May 26, 2025
by
王敏
Browse files
[fix]1.去掉lora kernel中的maxnreg参数,暂不支持;2.去掉merge-lora参数
parent
7f022e4d
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
12 additions
and
27 deletions
+12
-27
examples/medusa/medusa_benchmark_throughput.py
examples/medusa/medusa_benchmark_throughput.py
+2
-6
vllm/config.py
vllm/config.py
+0
-8
vllm/engine/arg_utils.py
vllm/engine/arg_utils.py
+3
-11
vllm/lora/models.py
vllm/lora/models.py
+3
-0
vllm/lora/ops/triton_ops/lora_expand.py
vllm/lora/ops/triton_ops/lora_expand.py
+2
-1
vllm/lora/ops/triton_ops/lora_shrink.py
vllm/lora/ops/triton_ops/lora_shrink.py
+2
-1
No files found.
examples/medusa/medusa_benchmark_throughput.py
View file @
e14b43ff
...
@@ -92,7 +92,6 @@ def run_vllm(
...
@@ -92,7 +92,6 @@ def run_vllm(
spec_decoding_acceptance_method
:
str
=
None
,
spec_decoding_acceptance_method
:
str
=
None
,
enable_lora
:
bool
=
False
,
enable_lora
:
bool
=
False
,
max_lora_rank
:
int
=
32
,
max_lora_rank
:
int
=
32
,
merge_lora
:
bool
=
False
,
lora_extra_vocab_size
:
int
=
0
,
lora_extra_vocab_size
:
int
=
0
,
lora_target_modules
:
List
[
str
]
=
None
,
lora_target_modules
:
List
[
str
]
=
None
,
num_speculative_heads
:
int
=
5
,
num_speculative_heads
:
int
=
5
,
...
@@ -131,7 +130,6 @@ def run_vllm(
...
@@ -131,7 +130,6 @@ def run_vllm(
spec_decoding_acceptance_method
=
spec_decoding_acceptance_method
,
spec_decoding_acceptance_method
=
spec_decoding_acceptance_method
,
enable_lora
=
enable_lora
,
enable_lora
=
enable_lora
,
max_lora_rank
=
max_lora_rank
,
max_lora_rank
=
max_lora_rank
,
merge_lora
=
merge_lora
,
lora_extra_vocab_size
=
lora_extra_vocab_size
,
lora_extra_vocab_size
=
lora_extra_vocab_size
,
lora_target_modules
=
lora_target_modules
,
lora_target_modules
=
lora_target_modules
,
num_speculative_heads
=
num_speculative_heads
,
num_speculative_heads
=
num_speculative_heads
,
...
@@ -226,7 +224,6 @@ async def run_vllm_async(
...
@@ -226,7 +224,6 @@ async def run_vllm_async(
spec_decoding_acceptance_method
:
str
=
None
,
spec_decoding_acceptance_method
:
str
=
None
,
enable_lora
:
bool
=
False
,
enable_lora
:
bool
=
False
,
max_lora_rank
:
int
=
32
,
max_lora_rank
:
int
=
32
,
merge_lora
:
bool
=
False
,
lora_extra_vocab_size
:
int
=
0
,
lora_extra_vocab_size
:
int
=
0
,
lora_target_modules
:
List
[
str
]
=
None
,
lora_target_modules
:
List
[
str
]
=
None
,
num_speculative_heads
:
int
=
5
,
num_speculative_heads
:
int
=
5
,
...
@@ -267,7 +264,6 @@ async def run_vllm_async(
...
@@ -267,7 +264,6 @@ async def run_vllm_async(
spec_decoding_acceptance_method
=
spec_decoding_acceptance_method
,
spec_decoding_acceptance_method
=
spec_decoding_acceptance_method
,
enable_lora
=
enable_lora
,
enable_lora
=
enable_lora
,
max_lora_rank
=
max_lora_rank
,
max_lora_rank
=
max_lora_rank
,
merge_lora
=
merge_lora
,
lora_extra_vocab_size
=
lora_extra_vocab_size
,
lora_extra_vocab_size
=
lora_extra_vocab_size
,
lora_target_modules
=
lora_target_modules
,
lora_target_modules
=
lora_target_modules
,
num_speculative_heads
=
num_speculative_heads
,
num_speculative_heads
=
num_speculative_heads
,
...
@@ -342,7 +338,7 @@ def main(args: argparse.Namespace):
...
@@ -342,7 +338,7 @@ def main(args: argparse.Namespace):
args
.
disable_async_output_proc
,
False
,
args
.
max_num_seqs
,
args
.
disable_async_output_proc
,
False
,
args
.
max_num_seqs
,
args
.
speculative_model
,
args
.
speculative_draft_tensor_parallel_size
,
args
.
speculative_model
,
args
.
speculative_draft_tensor_parallel_size
,
args
.
speculative_disable_by_batch_size
,
args
.
spec_decoding_acceptance_method
,
args
.
speculative_disable_by_batch_size
,
args
.
spec_decoding_acceptance_method
,
args
.
enable_lora
,
args
.
max_lora_rank
,
args
.
merge_lora
,
args
.
lora_extra_vocab_size
,
args
.
enable_lora
,
args
.
max_lora_rank
,
args
.
lora_extra_vocab_size
,
args
.
lora_target_modules
,
args
.
num_speculative_heads
,
args
.
lora_target_modules
,
args
.
num_speculative_heads
,
args
.
num_speculative_tokens
args
.
num_speculative_tokens
]
]
...
@@ -360,7 +356,7 @@ def main(args: argparse.Namespace):
...
@@ -360,7 +356,7 @@ def main(args: argparse.Namespace):
args
.
disable_async_output_proc
,
args
.
max_num_seqs
,
args
.
disable_async_output_proc
,
args
.
max_num_seqs
,
args
.
speculative_model
,
args
.
speculative_draft_tensor_parallel_size
,
args
.
speculative_model
,
args
.
speculative_draft_tensor_parallel_size
,
args
.
speculative_disable_by_batch_size
,
args
.
spec_decoding_acceptance_method
,
args
.
speculative_disable_by_batch_size
,
args
.
spec_decoding_acceptance_method
,
args
.
enable_lora
,
args
.
max_lora_rank
,
args
.
merge_lora
,
args
.
lora_extra_vocab_size
,
args
.
enable_lora
,
args
.
max_lora_rank
,
args
.
lora_extra_vocab_size
,
args
.
lora_target_modules
,
args
.
num_speculative_heads
,
args
.
lora_target_modules
,
args
.
num_speculative_heads
,
args
.
num_speculative_tokens
args
.
num_speculative_tokens
]
]
...
...
vllm/config.py
View file @
e14b43ff
...
@@ -2635,9 +2635,6 @@ class LoRAConfig:
...
@@ -2635,9 +2635,6 @@ class LoRAConfig:
max_cpu_loras
:
Optional
[
int
]
=
None
max_cpu_loras
:
Optional
[
int
]
=
None
"""Maximum number of LoRAs to store in CPU memory. Must be >= than
"""Maximum number of LoRAs to store in CPU memory. Must be >= than
`max_loras`."""
`max_loras`."""
merge_lora
:
bool
=
False
"""Enable merge lora
"""
lora_target_modules
:
Optional
[
List
[
str
]]
=
None
lora_target_modules
:
Optional
[
List
[
str
]]
=
None
"""List of lora module name, If not specified,
"""List of lora module name, If not specified,
modules will be chosen according to the model architecture.
modules will be chosen according to the model architecture.
...
@@ -2678,7 +2675,6 @@ class LoRAConfig:
...
@@ -2678,7 +2675,6 @@ class LoRAConfig:
factors
.
append
(
self
.
lora_extra_vocab_size
)
factors
.
append
(
self
.
lora_extra_vocab_size
)
factors
.
append
(
self
.
long_lora_scaling_factors
)
factors
.
append
(
self
.
long_lora_scaling_factors
)
factors
.
append
(
self
.
bias_enabled
)
factors
.
append
(
self
.
bias_enabled
)
factors
.
append
(
self
.
merge_lora
)
hash_str
=
hashlib
.
md5
(
str
(
factors
).
encode
(),
hash_str
=
hashlib
.
md5
(
str
(
factors
).
encode
(),
usedforsecurity
=
False
).
hexdigest
()
usedforsecurity
=
False
).
hexdigest
()
return
hash_str
return
hash_str
...
@@ -2704,10 +2700,6 @@ class LoRAConfig:
...
@@ -2704,10 +2700,6 @@ class LoRAConfig:
raise
ValueError
(
raise
ValueError
(
f
"max_cpu_loras (
{
self
.
max_cpu_loras
}
) must be >= "
f
"max_cpu_loras (
{
self
.
max_cpu_loras
}
) must be >= "
f
"max_loras (
{
self
.
max_loras
}
)"
)
f
"max_loras (
{
self
.
max_loras
}
)"
)
if
self
.
merge_lora
and
self
.
max_loras
>
1
:
raise
ValueError
(
f
"merge_lora (
{
self
.
merge_lora
}
) can only be used when "
f
"max_loras (
{
self
.
max_loras
}
) is 1"
)
def
verify_with_cache_config
(
self
,
cache_config
:
CacheConfig
):
def
verify_with_cache_config
(
self
,
cache_config
:
CacheConfig
):
if
cache_config
.
cpu_offload_gb
>
0
and
not
envs
.
VLLM_USE_V1
:
if
cache_config
.
cpu_offload_gb
>
0
and
not
envs
.
VLLM_USE_V1
:
...
...
vllm/engine/arg_utils.py
View file @
e14b43ff
...
@@ -280,7 +280,6 @@ class EngineArgs:
...
@@ -280,7 +280,6 @@ class EngineArgs:
max_lora_rank
:
int
=
LoRAConfig
.
max_lora_rank
max_lora_rank
:
int
=
LoRAConfig
.
max_lora_rank
fully_sharded_loras
:
bool
=
LoRAConfig
.
fully_sharded_loras
fully_sharded_loras
:
bool
=
LoRAConfig
.
fully_sharded_loras
max_cpu_loras
:
Optional
[
int
]
=
LoRAConfig
.
max_cpu_loras
max_cpu_loras
:
Optional
[
int
]
=
LoRAConfig
.
max_cpu_loras
merge_lora
:
bool
=
LoRAConfig
.
merge_lora
lora_target_modules
:
Optional
[
List
[
str
]]
=
LoRAConfig
.
lora_target_modules
lora_target_modules
:
Optional
[
List
[
str
]]
=
LoRAConfig
.
lora_target_modules
lora_dtype
:
Optional
[
Union
[
str
,
torch
.
dtype
]]
=
LoRAConfig
.
lora_dtype
lora_dtype
:
Optional
[
Union
[
str
,
torch
.
dtype
]]
=
LoRAConfig
.
lora_dtype
lora_extra_vocab_size
:
int
=
LoRAConfig
.
lora_extra_vocab_size
lora_extra_vocab_size
:
int
=
LoRAConfig
.
lora_extra_vocab_size
...
@@ -421,7 +420,7 @@ class EngineArgs:
...
@@ -421,7 +420,7 @@ class EngineArgs:
'--tokenizer-mode'
,
'--tokenizer-mode'
,
type
=
str
,
type
=
str
,
default
=
EngineArgs
.
tokenizer_mode
,
default
=
EngineArgs
.
tokenizer_mode
,
choices
=
[
'auto'
,
'cpm'
,
'slow'
,
'mistral'
,
'custom'
],
choices
=
[
'auto'
,
'slow'
,
'mistral'
,
'custom'
],
help
=
'The tokenizer mode.
\n\n
* "auto" will use the '
help
=
'The tokenizer mode.
\n\n
* "auto" will use the '
'fast tokenizer if available.
\n
* "slow" will '
'fast tokenizer if available.
\n
* "slow" will '
'always use the slow tokenizer.
\n
* '
'always use the slow tokenizer.
\n
* '
...
@@ -706,9 +705,8 @@ class EngineArgs:
...
@@ -706,9 +705,8 @@ class EngineArgs:
lora_group
.
add_argument
(
'--max-lora-rank'
,
lora_group
.
add_argument
(
'--max-lora-rank'
,
**
lora_kwargs
[
"max_lora_rank"
])
**
lora_kwargs
[
"max_lora_rank"
])
lora_group
.
add_argument
(
'--merge-lora'
,
lora_group
.
add_argument
(
'--merge-lora'
,
**
lora_kwargs
[
"merge-lora"
])
action
=
argparse
.
BooleanOptionalAction
,
# action=argparse.BooleanOptionalAction,
help
=
'If set to True, the weights of the base layer will be merged with the weights of Lora.'
)
# help='If set to True, the weights of the base layer will be merged with the weights of Lora.')
lora_group
.
add_argument
(
'--lora-target-modules'
,
lora_group
.
add_argument
(
'--lora-target-modules'
,
**
lora_kwargs
[
"lora_target_modules"
])
**
lora_kwargs
[
"lora_target_modules"
])
lora_group
.
add_argument
(
'--lora-extra-vocab-size'
,
lora_group
.
add_argument
(
'--lora-extra-vocab-size'
,
...
@@ -1240,7 +1238,6 @@ class EngineArgs:
...
@@ -1240,7 +1238,6 @@ class EngineArgs:
lora_dtype
=
self
.
lora_dtype
,
lora_dtype
=
self
.
lora_dtype
,
max_cpu_loras
=
self
.
max_cpu_loras
if
self
.
max_cpu_loras
max_cpu_loras
=
self
.
max_cpu_loras
if
self
.
max_cpu_loras
and
self
.
max_cpu_loras
>
0
else
None
,
and
self
.
max_cpu_loras
>
0
else
None
,
merge_lora
=
self
.
merge_lora
,
lora_target_modules
=
self
.
lora_target_modules
)
if
self
.
enable_lora
else
None
lora_target_modules
=
self
.
lora_target_modules
)
if
self
.
enable_lora
else
None
if
self
.
qlora_adapter_name_or_path
is
not
None
and
\
if
self
.
qlora_adapter_name_or_path
is
not
None
and
\
...
@@ -1384,11 +1381,6 @@ class EngineArgs:
...
@@ -1384,11 +1381,6 @@ class EngineArgs:
from
vllm.attention.utils.fa_utils
import
(
from
vllm.attention.utils.fa_utils
import
(
flash_attn_supports_fp8
)
flash_attn_supports_fp8
)
supported
=
flash_attn_supports_fp8
()
supported
=
flash_attn_supports_fp8
()
int8_attention
=
self
.
kv_cache_dtype
.
startswith
(
"int8"
)
if
int8_attention
:
supported
=
True
if
not
supported
:
if
not
supported
:
_raise_or_fallback
(
feature_name
=
"--kv-cache-dtype"
,
_raise_or_fallback
(
feature_name
=
"--kv-cache-dtype"
,
recommend_to_remove
=
False
)
recommend_to_remove
=
False
)
...
...
vllm/lora/models.py
View file @
e14b43ff
...
@@ -345,6 +345,9 @@ class LoRAModelManager(AdapterModelManager):
...
@@ -345,6 +345,9 @@ class LoRAModelManager(AdapterModelManager):
self
.
supported_lora_modules
=
get_supported_lora_modules
(
self
.
model
)
self
.
supported_lora_modules
=
get_supported_lora_modules
(
self
.
model
)
assert
self
.
supported_lora_modules
,
"No supported LoRA modules found in"
assert
self
.
supported_lora_modules
,
"No supported LoRA modules found in"
f
"
{
self
.
model
.
__class__
.
__name__
}
."
f
"
{
self
.
model
.
__class__
.
__name__
}
."
if
lora_config
.
lora_target_modules
is
not
None
:
self
.
supported_lora_modules
=
lora_config
.
lora_target_modules
if
lora_config
.
long_lora_scaling_factors
:
if
lora_config
.
long_lora_scaling_factors
:
# We need to replace rotary emb layer to do batch computation
# We need to replace rotary emb layer to do batch computation
# for long lora.
# for long lora.
...
...
vllm/lora/ops/triton_ops/lora_expand.py
View file @
e14b43ff
...
@@ -258,7 +258,8 @@ def _lora_expand(
...
@@ -258,7 +258,8 @@ def _lora_expand(
num_warps
=
NUM_WARPS
,
num_warps
=
NUM_WARPS
,
num_ctas
=
NUM_CTAS
,
num_ctas
=
NUM_CTAS
,
num_stages
=
NUM_STAGES
,
num_stages
=
NUM_STAGES
,
maxnreg
=
MAX_NREG
,
# Triton is currently not supported maxnreg
#maxnreg=MAX_NREG,
)
)
return
return
...
...
vllm/lora/ops/triton_ops/lora_shrink.py
View file @
e14b43ff
...
@@ -213,7 +213,8 @@ def _lora_shrink(
...
@@ -213,7 +213,8 @@ def _lora_shrink(
num_warps
=
NUM_WARPS
,
num_warps
=
NUM_WARPS
,
num_ctas
=
NUM_CTAS
,
num_ctas
=
NUM_CTAS
,
num_stages
=
NUM_STAGES
,
num_stages
=
NUM_STAGES
,
maxnreg
=
MAX_NREG
,
# Triton is currently not supported maxnreg
#maxnreg=MAX_NREG,
)
)
return
return
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment