Merge branch 'v0.6.2-dev_wm' into 'v0.6.2-dev'

修复单测中 test_long_context、test_attention 的部分错误。See merge request dcutoolkit/deeplearing/vllm!33

Merge branch 'v0.6.2-dev_wm' into 'v0.6.2-dev'
修复单测中 test_long_context、test_attention 的部分错误。See merge request dcutoolkit/deeplearing/vllm!33
aad58f06 · zhuwenwen · 70c661da · 137e8a16 · aad58f06 · aad58f06
Commit aad58f06 authored Nov 15, 2024 by zhuwenwen
4 changed files
--- a/tests/kernels/test_attention.py
+++ b/tests/kernels/test_attention.py
@@ -206,7 +206,7 @@ def test_paged_attention(
        opcheck(torch.ops._C.paged_attention_v1,
                (output, query, key_cache, value_cache, num_kv_heads, scale,
                 block_tables, seq_lens, block_size, max_seq_len, alibi_slopes,
-                 kv_cache_dtype, k_scale, v_scale, 0, 0, 0, 64, 0),
+                 kv_cache_dtype, k_scale, v_scale, 0, 0, 0, 64, 0, None, 0),
                cond=(head_size == HEAD_SIZES[0]
                      and block_size == BLOCK_SIZES[0]))
@@ -248,7 +248,7 @@ def test_paged_attention(
                    (output, exp_sums, max_logits, tmp_output, query,
                     key_cache, value_cache, num_kv_heads, scale, block_tables,
                     seq_lens, block_size, max_seq_len, alibi_slopes,
-                     kv_cache_dtype, k_scale, v_scale, 0, 0, 0, 64, 0),
+                     kv_cache_dtype, k_scale, v_scale, 0, 0, 0, 64, 0, None, 0),
                    cond=(head_size == HEAD_SIZES[0]
                          and block_size == BLOCK_SIZES[0]))

--- a/tests/lora/test_long_context.py
+++ b/tests/lora/test_long_context.py
@@ -111,7 +111,7 @@ def lora_llm(long_context_infos):
    llm = vllm.LLM("meta-llama/Llama-2-13b-chat-hf",
                   enable_lora=True,
                   max_num_seqs=16,
-                   max_loras=2,
+                   max_loras=8,
                   long_lora_scaling_factors=tuple(scaling_factors),
                   max_num_batched_tokens=4096 * 8,
                   tensor_parallel_size=4,

--- a/tests/lora/test_punica_variation.py
+++ b/tests/lora/test_punica_variation.py
@@ -20,7 +20,7 @@ from vllm.utils import seed_everything
 from .utils import (generate_data, generate_data_for_expand_nslices,
                    ref_torch_groupgemm)
-HIDDEN_SIZES = [4097]
+HIDDEN_SIZES = [1024]
 BATCHES = [1, 4, 16, 32]
 NUM_LORA = [1, 8, 32, 128]

--- a/vllm/lora/request.py
+++ b/vllm/lora/request.py
@@ -83,7 +83,9 @@ class LoRARequest(
        and comparison lora adapter across engines.
        """
        return isinstance(value,
-                          self.__class__) and self.lora_name == value.lora_name
+                          self.__class__) and self.lora_name == value.lora_name and \
+                            self.lora_int_id == value.lora_int_id and \
+                            self.lora_path == value.lora_path
    def __hash__(self) -> int:
        """