Merge branch 'v0.6.2-dev_wm' into 'v0.6.2-dev'

解决 Qwen LoRA 模型推理结果异常问题。

See merge request dcutoolkit/deeplearing/vllm!56

Merge branch 'v0.6.2-dev_wm' into 'v0.6.2-dev'
解决 Qwen LoRA 模型推理结果异常问题。 See merge request dcutoolkit/deeplearing/vllm!56
ec0136e7 · zhuwenwen · 7fd1d015 · da39222e · ec0136e7 · ec0136e7
Commit ec0136e7 authored Dec 24, 2024 by zhuwenwen
6 changed files
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -284,6 +284,11 @@ def main(args: argparse.Namespace):
        topk = config.num_experts_per_tok
        intermediate_size = config.intermediate_size
        shard_intermediate_size = 2 * intermediate_size // args.tp_size
+    elif config.architectures[0] == "DeepseekV2ForCausalLM":
+        E = config.n_routed_experts
+        topk = config.num_experts_per_tok
+        intermediate_size = config.moe_intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // args.tp_size
    else:
        # Default: Mixtral.
        E = config.num_local_experts
@@ -306,7 +311,7 @@ def main(args: argparse.Namespace):
    ray.init(address=None,
                 ignore_reinit_error=True,
-                 num_gpus=args.tp_size)
+                 num_gpus=1)
    num_gpus = int(ray.available_resources()["GPU"])
    workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)]

--- a/csrc/attention/attention_with_mask_kernels.cu
+++ b/csrc/attention/attention_with_mask_kernels.cu
@@ -298,7 +298,7 @@ __device__ void paged_attention_with_mask_kernel(
      qk += (alibi_slope != 0) ? alibi_slope * (token_idx - seq_len + 1) : 0;
      // used for tree-style attention
-      if (attn_masks != nullptr) {
+      if (attn_masks != nullptr && token_idx < seq_len) {
        const int* attn_masks_ptr = attn_masks + seq_idx * attn_masks_stride;
        if (attn_masks_ptr[token_idx] == 0) {
          qk = -FLT_MAX;

--- a/csrc/attention/attention_with_mask_kernels_opt.cu
+++ b/csrc/attention/attention_with_mask_kernels_opt.cu
@@ -329,7 +329,7 @@ __device__ void paged_attention_with_mask_kernel_opt(
      qk += (alibi_slope != 0) ? alibi_slope * (token_idx - seq_len + 1) : 0;
      // used for tree-style attention
-      if (attn_masks != nullptr) {
+      if (attn_masks != nullptr && token_idx < seq_len) {
        const int* attn_masks_ptr = attn_masks + seq_idx * attn_masks_stride;
        if (attn_masks_ptr[token_idx] == 0) {
          qk = -FLT_MAX;

--- a/csrc/attention/attention_with_mask_kernels_opt_tc.cu
+++ b/csrc/attention/attention_with_mask_kernels_opt_tc.cu
@@ -294,7 +294,7 @@ __device__ void paged_attention_with_mask_kernel_TC(
        }
        // used for tree-style attention
-        if (attn_masks != nullptr) {
+        if (attn_masks != nullptr && token_idx < seq_len) {
          const int* attn_masks_ptr = attn_masks + seq_idx * attn_masks_stride;
          if (attn_masks_ptr[token_idx] == 0) {
            qk_vec[i] = -FLT_MAX;

--- a/examples/medusa/medusa_benchmark_throughput.py
+++ b/examples/medusa/medusa_benchmark_throughput.py
@@ -42,9 +42,6 @@ def sample_requests(
    # Only keep the first two turns of each conversation.
    dataset = [data["prompt"] for data in dataset]
-    # Shuffle the dataset.
-    random.shuffle(dataset)
    # Filter out sequences that are too long or too short
    filtered_dataset: List[Tuple[str, int, int]] = []
    for i in range(len(dataset)):

--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -363,18 +363,9 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA):
        if config.tie_word_embeddings:
            self.lm_head = self.model.embed_tokens
        else:
-            # self.lm_head = ParallelLMHead(config.vocab_size,
+            self.lm_head = ParallelLMHead(config.vocab_size,
-            #                               config.hidden_size,
-            #                               quant_config=quant_config)
-            self.unpadded_vocab_size = config.vocab_size
-            if lora_config:
-                self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
-            self.lm_head = ParallelLMHead(
-                self.unpadded_vocab_size,
                                          config.hidden_size,
-                org_num_embeddings=config.vocab_size,
+                                          quant_config=quant_config)
-                quant_config=quant_config,
-            )
        self.logits_processor = LogitsProcessor(config.vocab_size)
        self.sampler = Sampler()