Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
ec0136e7
Commit
ec0136e7
authored
Dec 24, 2024
by
zhuwenwen
Browse files
Merge branch 'v0.6.2-dev_wm' into 'v0.6.2-dev'
解决qwen lora模型推理结果异常问题 See merge request dcutoolkit/deeplearing/vllm!56
parents
7fd1d015
da39222e
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
12 additions
and
19 deletions
+12
-19
benchmarks/kernels/benchmark_moe.py
benchmarks/kernels/benchmark_moe.py
+6
-1
csrc/attention/attention_with_mask_kernels.cu
csrc/attention/attention_with_mask_kernels.cu
+1
-1
csrc/attention/attention_with_mask_kernels_opt.cu
csrc/attention/attention_with_mask_kernels_opt.cu
+1
-1
csrc/attention/attention_with_mask_kernels_opt_tc.cu
csrc/attention/attention_with_mask_kernels_opt_tc.cu
+1
-1
examples/medusa/medusa_benchmark_throughput.py
examples/medusa/medusa_benchmark_throughput.py
+0
-3
vllm/model_executor/models/qwen2.py
vllm/model_executor/models/qwen2.py
+3
-12
No files found.
benchmarks/kernels/benchmark_moe.py
View file @
ec0136e7
...
@@ -284,6 +284,11 @@ def main(args: argparse.Namespace):
...
@@ -284,6 +284,11 @@ def main(args: argparse.Namespace):
topk
=
config
.
num_experts_per_tok
topk
=
config
.
num_experts_per_tok
intermediate_size
=
config
.
intermediate_size
intermediate_size
=
config
.
intermediate_size
shard_intermediate_size
=
2
*
intermediate_size
//
args
.
tp_size
shard_intermediate_size
=
2
*
intermediate_size
//
args
.
tp_size
elif
config
.
architectures
[
0
]
==
"DeepseekV2ForCausalLM"
:
E
=
config
.
n_routed_experts
topk
=
config
.
num_experts_per_tok
intermediate_size
=
config
.
moe_intermediate_size
shard_intermediate_size
=
2
*
intermediate_size
//
args
.
tp_size
else
:
else
:
# Default: Mixtral.
# Default: Mixtral.
E
=
config
.
num_local_experts
E
=
config
.
num_local_experts
...
@@ -306,7 +311,7 @@ def main(args: argparse.Namespace):
...
@@ -306,7 +311,7 @@ def main(args: argparse.Namespace):
ray
.
init
(
address
=
None
,
ray
.
init
(
address
=
None
,
ignore_reinit_error
=
True
,
ignore_reinit_error
=
True
,
num_gpus
=
args
.
tp_size
)
num_gpus
=
1
)
num_gpus
=
int
(
ray
.
available_resources
()[
"GPU"
])
num_gpus
=
int
(
ray
.
available_resources
()[
"GPU"
])
workers
=
[
BenchmarkWorker
.
remote
(
args
.
seed
)
for
_
in
range
(
num_gpus
)]
workers
=
[
BenchmarkWorker
.
remote
(
args
.
seed
)
for
_
in
range
(
num_gpus
)]
...
...
csrc/attention/attention_with_mask_kernels.cu
View file @
ec0136e7
...
@@ -298,7 +298,7 @@ __device__ void paged_attention_with_mask_kernel(
...
@@ -298,7 +298,7 @@ __device__ void paged_attention_with_mask_kernel(
qk
+=
(
alibi_slope
!=
0
)
?
alibi_slope
*
(
token_idx
-
seq_len
+
1
)
:
0
;
qk
+=
(
alibi_slope
!=
0
)
?
alibi_slope
*
(
token_idx
-
seq_len
+
1
)
:
0
;
// used for tree-style attention
// used for tree-style attention
if
(
attn_masks
!=
nullptr
)
{
if
(
attn_masks
!=
nullptr
&&
token_idx
<
seq_len
)
{
const
int
*
attn_masks_ptr
=
attn_masks
+
seq_idx
*
attn_masks_stride
;
const
int
*
attn_masks_ptr
=
attn_masks
+
seq_idx
*
attn_masks_stride
;
if
(
attn_masks_ptr
[
token_idx
]
==
0
)
{
if
(
attn_masks_ptr
[
token_idx
]
==
0
)
{
qk
=
-
FLT_MAX
;
qk
=
-
FLT_MAX
;
...
...
csrc/attention/attention_with_mask_kernels_opt.cu
View file @
ec0136e7
...
@@ -329,7 +329,7 @@ __device__ void paged_attention_with_mask_kernel_opt(
...
@@ -329,7 +329,7 @@ __device__ void paged_attention_with_mask_kernel_opt(
qk
+=
(
alibi_slope
!=
0
)
?
alibi_slope
*
(
token_idx
-
seq_len
+
1
)
:
0
;
qk
+=
(
alibi_slope
!=
0
)
?
alibi_slope
*
(
token_idx
-
seq_len
+
1
)
:
0
;
// used for tree-style attention
// used for tree-style attention
if
(
attn_masks
!=
nullptr
)
{
if
(
attn_masks
!=
nullptr
&&
token_idx
<
seq_len
)
{
const
int
*
attn_masks_ptr
=
attn_masks
+
seq_idx
*
attn_masks_stride
;
const
int
*
attn_masks_ptr
=
attn_masks
+
seq_idx
*
attn_masks_stride
;
if
(
attn_masks_ptr
[
token_idx
]
==
0
)
{
if
(
attn_masks_ptr
[
token_idx
]
==
0
)
{
qk
=
-
FLT_MAX
;
qk
=
-
FLT_MAX
;
...
...
csrc/attention/attention_with_mask_kernels_opt_tc.cu
View file @
ec0136e7
...
@@ -294,7 +294,7 @@ __device__ void paged_attention_with_mask_kernel_TC(
...
@@ -294,7 +294,7 @@ __device__ void paged_attention_with_mask_kernel_TC(
}
}
// used for tree-style attention
// used for tree-style attention
if
(
attn_masks
!=
nullptr
)
{
if
(
attn_masks
!=
nullptr
&&
token_idx
<
seq_len
)
{
const
int
*
attn_masks_ptr
=
attn_masks
+
seq_idx
*
attn_masks_stride
;
const
int
*
attn_masks_ptr
=
attn_masks
+
seq_idx
*
attn_masks_stride
;
if
(
attn_masks_ptr
[
token_idx
]
==
0
)
{
if
(
attn_masks_ptr
[
token_idx
]
==
0
)
{
qk_vec
[
i
]
=
-
FLT_MAX
;
qk_vec
[
i
]
=
-
FLT_MAX
;
...
...
examples/medusa/medusa_benchmark_throughput.py
View file @
ec0136e7
...
@@ -42,9 +42,6 @@ def sample_requests(
...
@@ -42,9 +42,6 @@ def sample_requests(
# Only keep the first two turns of each conversation.
# Only keep the first two turns of each conversation.
dataset
=
[
data
[
"prompt"
]
for
data
in
dataset
]
dataset
=
[
data
[
"prompt"
]
for
data
in
dataset
]
# Shuffle the dataset.
random
.
shuffle
(
dataset
)
# Filter out sequences that are too long or too short
# Filter out sequences that are too long or too short
filtered_dataset
:
List
[
Tuple
[
str
,
int
,
int
]]
=
[]
filtered_dataset
:
List
[
Tuple
[
str
,
int
,
int
]]
=
[]
for
i
in
range
(
len
(
dataset
)):
for
i
in
range
(
len
(
dataset
)):
...
...
vllm/model_executor/models/qwen2.py
View file @
ec0136e7
...
@@ -363,18 +363,9 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA):
...
@@ -363,18 +363,9 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA):
if
config
.
tie_word_embeddings
:
if
config
.
tie_word_embeddings
:
self
.
lm_head
=
self
.
model
.
embed_tokens
self
.
lm_head
=
self
.
model
.
embed_tokens
else
:
else
:
# self.lm_head = ParallelLMHead(config.vocab_size,
self
.
lm_head
=
ParallelLMHead
(
config
.
vocab_size
,
# config.hidden_size,
config
.
hidden_size
,
# quant_config=quant_config)
quant_config
=
quant_config
)
self
.
unpadded_vocab_size
=
config
.
vocab_size
if
lora_config
:
self
.
unpadded_vocab_size
+=
lora_config
.
lora_extra_vocab_size
self
.
lm_head
=
ParallelLMHead
(
self
.
unpadded_vocab_size
,
config
.
hidden_size
,
org_num_embeddings
=
config
.
vocab_size
,
quant_config
=
quant_config
,
)
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
self
.
sampler
=
Sampler
()
self
.
sampler
=
Sampler
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment