Commit ec0136e7 authored by zhuwenwen
Browse files

Merge branch 'v0.6.2-dev_wm' into 'v0.6.2-dev'

解决qwen lora模型推理结果异常问题 (Fix abnormal inference results for Qwen LoRA models)

See merge request dcutoolkit/deeplearing/vllm!56
parents 7fd1d015 da39222e
...@@ -284,6 +284,11 @@ def main(args: argparse.Namespace): ...@@ -284,6 +284,11 @@ def main(args: argparse.Namespace):
topk = config.num_experts_per_tok topk = config.num_experts_per_tok
intermediate_size = config.intermediate_size intermediate_size = config.intermediate_size
shard_intermediate_size = 2 * intermediate_size // args.tp_size shard_intermediate_size = 2 * intermediate_size // args.tp_size
elif config.architectures[0] == "DeepseekV2ForCausalLM":
E = config.n_routed_experts
topk = config.num_experts_per_tok
intermediate_size = config.moe_intermediate_size
shard_intermediate_size = 2 * intermediate_size // args.tp_size
else: else:
# Default: Mixtral. # Default: Mixtral.
E = config.num_local_experts E = config.num_local_experts
...@@ -306,7 +311,7 @@ def main(args: argparse.Namespace): ...@@ -306,7 +311,7 @@ def main(args: argparse.Namespace):
ray.init(address=None, ray.init(address=None,
ignore_reinit_error=True, ignore_reinit_error=True,
num_gpus=args.tp_size) num_gpus=1)
num_gpus = int(ray.available_resources()["GPU"]) num_gpus = int(ray.available_resources()["GPU"])
workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)] workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)]
......
...@@ -298,7 +298,7 @@ __device__ void paged_attention_with_mask_kernel( ...@@ -298,7 +298,7 @@ __device__ void paged_attention_with_mask_kernel(
qk += (alibi_slope != 0) ? alibi_slope * (token_idx - seq_len + 1) : 0; qk += (alibi_slope != 0) ? alibi_slope * (token_idx - seq_len + 1) : 0;
// used for tree-style attention // used for tree-style attention
if (attn_masks != nullptr) { if (attn_masks != nullptr && token_idx < seq_len) {
const int* attn_masks_ptr = attn_masks + seq_idx * attn_masks_stride; const int* attn_masks_ptr = attn_masks + seq_idx * attn_masks_stride;
if (attn_masks_ptr[token_idx] == 0) { if (attn_masks_ptr[token_idx] == 0) {
qk = -FLT_MAX; qk = -FLT_MAX;
......
...@@ -329,7 +329,7 @@ __device__ void paged_attention_with_mask_kernel_opt( ...@@ -329,7 +329,7 @@ __device__ void paged_attention_with_mask_kernel_opt(
qk += (alibi_slope != 0) ? alibi_slope * (token_idx - seq_len + 1) : 0; qk += (alibi_slope != 0) ? alibi_slope * (token_idx - seq_len + 1) : 0;
// used for tree-style attention // used for tree-style attention
if (attn_masks != nullptr) { if (attn_masks != nullptr && token_idx < seq_len) {
const int* attn_masks_ptr = attn_masks + seq_idx * attn_masks_stride; const int* attn_masks_ptr = attn_masks + seq_idx * attn_masks_stride;
if (attn_masks_ptr[token_idx] == 0) { if (attn_masks_ptr[token_idx] == 0) {
qk = -FLT_MAX; qk = -FLT_MAX;
......
...@@ -294,7 +294,7 @@ __device__ void paged_attention_with_mask_kernel_TC( ...@@ -294,7 +294,7 @@ __device__ void paged_attention_with_mask_kernel_TC(
} }
// used for tree-style attention // used for tree-style attention
if (attn_masks != nullptr) { if (attn_masks != nullptr && token_idx < seq_len) {
const int* attn_masks_ptr = attn_masks + seq_idx * attn_masks_stride; const int* attn_masks_ptr = attn_masks + seq_idx * attn_masks_stride;
if (attn_masks_ptr[token_idx] == 0) { if (attn_masks_ptr[token_idx] == 0) {
qk_vec[i] = -FLT_MAX; qk_vec[i] = -FLT_MAX;
......
...@@ -42,9 +42,6 @@ def sample_requests( ...@@ -42,9 +42,6 @@ def sample_requests(
# Only keep the first two turns of each conversation. # Only keep the first two turns of each conversation.
dataset = [data["prompt"] for data in dataset] dataset = [data["prompt"] for data in dataset]
# Shuffle the dataset.
random.shuffle(dataset)
# Filter out sequences that are too long or too short # Filter out sequences that are too long or too short
filtered_dataset: List[Tuple[str, int, int]] = [] filtered_dataset: List[Tuple[str, int, int]] = []
for i in range(len(dataset)): for i in range(len(dataset)):
......
...@@ -363,18 +363,9 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA): ...@@ -363,18 +363,9 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA):
if config.tie_word_embeddings: if config.tie_word_embeddings:
self.lm_head = self.model.embed_tokens self.lm_head = self.model.embed_tokens
else: else:
# self.lm_head = ParallelLMHead(config.vocab_size, self.lm_head = ParallelLMHead(config.vocab_size,
# config.hidden_size,
# quant_config=quant_config)
self.unpadded_vocab_size = config.vocab_size
if lora_config:
self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
self.lm_head = ParallelLMHead(
self.unpadded_vocab_size,
config.hidden_size, config.hidden_size,
org_num_embeddings=config.vocab_size, quant_config=quant_config)
quant_config=quant_config,
)
self.logits_processor = LogitsProcessor(config.vocab_size) self.logits_processor = LogitsProcessor(config.vocab_size)
self.sampler = Sampler() self.sampler = Sampler()
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment