Merge tag 'v0.7.2' into v0.7.2-dev

66b809cc · zhuwenwen · 37b63c24 · 0408efc6 · 66b809cc · 66b809cc
Commit 66b809cc authored Feb 08, 2025 by zhuwenwen
20 changed files
--- a/tests/lora/test_punica_ops_sizes.py
+++ b/tests/lora/test_punica_ops_sizes.py
+# SPDX-License-Identifier: Apache-2.0
 """
 This script is mainly used to test various hidden_sizes. We have collected the
 hidden_sizes included in the LoRA models currently supported by vLLM. It tests

--- a/tests/lora/test_punica_ops_variation.py
+++ b/tests/lora/test_punica_ops_variation.py
+# SPDX-License-Identifier: Apache-2.0
 """
 This script is mainly used to test whether Triton kernels can run normally
 under different conditions, including various batches, numbers of LoRA, and

--- a/tests/lora/test_quant_model.py
+++ b/tests/lora/test_quant_model.py
+# SPDX-License-Identifier: Apache-2.0
 # Adapted from
 # https://github.com/fmmoret/vllm/blob/fm-support-lora-on-quantized-models/tests/lora/test_llama.py
 from dataclasses import dataclass

--- a/tests/lora/test_qwen2vl.py
+++ b/tests/lora/test_qwen2vl.py
+# SPDX-License-Identifier: Apache-2.0
 from typing import List
 import pytest

--- a/tests/lora/test_tokenizer_group.py
+++ b/tests/lora/test_tokenizer_group.py
+# SPDX-License-Identifier: Apache-2.0
 import pytest
 from transformers import AutoTokenizer, PreTrainedTokenizerBase

--- a/tests/lora/test_ultravox.py
+++ b/tests/lora/test_ultravox.py
+# SPDX-License-Identifier: Apache-2.0
+import shutil
+from os import path
+from tempfile import TemporaryDirectory
+from typing import List, Tuple
+import torch
+from huggingface_hub import snapshot_download
+from safetensors.torch import load_file, save_file
+from transformers import AutoTokenizer
+from vllm.lora.request import LoRARequest
+from ..models.utils import check_outputs_equal
+# Model name passed to vllm_runner and AutoTokenizer for the Ultravox run.
+ULTRAVOX_MODEL_NAME = "fixie-ai/ultravox-v0_3"
+# Model name passed to vllm_runner and AutoTokenizer for the Llama run.
+# NOTE(review): "LLMA" looks like a misspelling of "LLAMA"; the name is left
+# unchanged here because test_ultravox_lora references it.
+LLMA_MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
+# Placeholder text that _get_prompt repeats once per audio item.
+VLLM_PLACEHOLDER = "<|reserved_special_token_0|>"
+# Question sent to both models so their outputs can be compared.
+PROMPT = "Tell me about a Fool's mate move in 20 words. Provide the moves!"
+def llama3_1_8b_chess_lora_path():
+    """Download the Llama 3.1 8B chess LoRA adapter and return its location.
+
+    Returns whatever ``snapshot_download`` returns for the adapter repo; the
+    caller joins file names onto it, i.e. treats it as a local directory.
+    """
+    return snapshot_download(
+        repo_id="mkopecki/chess-lora-adapter-llama-3.1-8b")
+# A Llama LoRA adapter cannot be used with Ultravox as-is: Ultravox nests the
+# language model, so the adapter's module names must be transformed first.
+def transform_module_names_for_ultravox(state_dict):
+    """Return a copy of ``state_dict`` with keys re-rooted for Ultravox.
+
+    Every occurrence of ``base_model.model`` in a key is replaced with
+    ``base_model.model.language_model``; values are carried over unchanged
+    and the input mapping is not modified.
+    """
+    transformed_state_dict = {}
+    for key, value in state_dict.items():
+        new_key = key.replace("base_model.model",
+                              "base_model.model.language_model")
+        transformed_state_dict[new_key] = value
+    return transformed_state_dict
+def mk_llama3_1_8b_ultravox_chess_lora(source_repo, target_path):
+    """Write an Ultravox-compatible copy of a Llama LoRA adapter.
+
+    Args:
+        source_repo: Directory containing ``adapter_model.safetensors`` and
+            ``adapter_config.json``.
+        target_path: Existing directory that receives the converted adapter.
+
+    Returns:
+        ``target_path``, unchanged.
+    """
+    tensor_file = "adapter_model.safetensors"
+    # Load the adapter weights, rename their keys, and save them under the
+    # same file name in the target directory.
+    state_dict = load_file(path.join(source_repo, tensor_file))
+    transformed_state_dict = transform_module_names_for_ultravox(state_dict)
+    save_file(transformed_state_dict, path.join(target_path, tensor_file))
+    # The adapter config is copied verbatim; only the tensor keys change.
+    config_file = "adapter_config.json"
+    shutil.copyfile(path.join(source_repo, config_file),
+                    path.join(target_path, config_file))
+    return target_path
+def _get_prompt(audio_count, question, placeholder, model_name) -> str:
+    """Build a single-turn user prompt with the model's chat template.
+
+    Args:
+        audio_count: Number of times the placeholder (each followed by a
+            newline) is prepended to the question; 0 prepends nothing.
+        question: Text of the user message.
+        placeholder: Placeholder text repeated ``audio_count`` times.
+        model_name: Name passed to ``AutoTokenizer.from_pretrained``.
+
+    Returns:
+        The templated prompt as text (``tokenize=False``) with the generation
+        prompt appended (``add_generation_prompt=True``).
+    """
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    # Rebinds the parameter to the repeated prefix; empty when audio_count
+    # is 0.
+    placeholder = f"{placeholder}\n" * audio_count
+    return tokenizer.apply_chat_template([{
+        'role': 'user',
+        'content': f"{placeholder}{question}"
+    }],
+                                         tokenize=False,
+                                         add_generation_prompt=True)
+def test_ultravox_lora(vllm_runner):
+    """
+    Check that Ultravox with a key-transformed Llama LoRA adapter produces
+    the same greedy outputs as Llama with the original adapter.
+
+    Both runs use a text-only prompt (audio_count=0) and the same runner
+    settings; the results are compared with check_outputs_equal.
+
+    TODO: Train an Ultravox LoRA instead of using a Llama LoRA.
+    """
+    # Workaround to prevent device mismatch in Whisper.
+    # Can be removed when it is fixed upstream in transformers
+    # https://github.com/huggingface/transformers/pull/35866
+    # NOTE(review): this default is not restored anywhere in this function.
+    torch.set_default_device("cpu")
+    llama3_1_8b_chess_lora = llama3_1_8b_chess_lora_path()
+    # The converted adapter only needs to exist while the Ultravox model is
+    # generating, so it is written to a temporary directory.
+    with TemporaryDirectory() as temp_ultravox_lora_dir:
+        llama3_1_8b_ultravox_chess_lora = mk_llama3_1_8b_ultravox_chess_lora(
+            llama3_1_8b_chess_lora, temp_ultravox_lora_dir)
+        with vllm_runner(
+                ULTRAVOX_MODEL_NAME,
+                enforce_eager=True,
+                max_num_seqs=2,
+                enable_lora=True,
+                max_loras=1,
+                max_lora_rank=128,
+                dtype="bfloat16",
+                max_model_len=1024,
+        ) as vllm_model:
+            # 256 is passed positionally after the prompt list; presumably
+            # the maximum number of generated tokens -- TODO confirm.
+            ultravox_outputs: List[Tuple[
+                List[int], str]] = vllm_model.generate_greedy(
+                    [
+                        _get_prompt(0, PROMPT, VLLM_PLACEHOLDER,
+                                    ULTRAVOX_MODEL_NAME)
+                    ],
+                    256,
+                    lora_request=LoRARequest(str(1), 1,
+                                             llama3_1_8b_ultravox_chess_lora),
+                )
+    # Run Llama with the original (untransformed) LoRA adapter, using the same
+    # runner settings, to compare its outputs with the Ultravox outputs above.
+    with vllm_runner(
+            LLMA_MODEL_NAME,
+            enforce_eager=True,
+            max_num_seqs=2,
+            enable_lora=True,
+            max_loras=1,
+            max_lora_rank=128,
+            dtype="bfloat16",
+            max_model_len=1024,
+    ) as vllm_model:
+        llama_outputs: List[Tuple[List[int], str]] = (
+            vllm_model.generate_greedy(
+                [_get_prompt(0, PROMPT, VLLM_PLACEHOLDER, LLMA_MODEL_NAME)],
+                256,
+                lora_request=LoRARequest(str(1), 1, llama3_1_8b_chess_lora),
+            ))
+    check_outputs_equal(
+        outputs_0_lst=ultravox_outputs,
+        outputs_1_lst=llama_outputs,
+        name_0="ultravox",
+        name_1="llama",
+    )
--- a/tests/lora/test_utils.py
+++ b/tests/lora/test_utils.py
+# SPDX-License-Identifier: Apache-2.0
 from collections import OrderedDict
 from unittest.mock import patch

--- a/tests/lora/test_worker.py
+++ b/tests/lora/test_worker.py
+# SPDX-License-Identifier: Apache-2.0
 import os
 import random
 import tempfile

--- a/tests/lora/utils.py
+++ b/tests/lora/utils.py
+# SPDX-License-Identifier: Apache-2.0
 from typing import Dict, List, Optional
 import torch

--- a/tests/metrics/test_metrics.py
+++ b/tests/metrics/test_metrics.py
+# SPDX-License-Identifier: Apache-2.0
 import os
 import time
 from typing import List

--- a/tests/model_executor/conftest.py
+++ b/tests/model_executor/conftest.py
+# SPDX-License-Identifier: Apache-2.0
 import pytest

--- a/tests/model_executor/test_enabled_custom_ops.py
+++ b/tests/model_executor/test_enabled_custom_ops.py
+# SPDX-License-Identifier: Apache-2.0
 from typing import List
 import pytest

--- a/tests/model_executor/test_guided_processors.py
+++ b/tests/model_executor/test_guided_processors.py
+# SPDX-License-Identifier: Apache-2.0
 import pickle
 import pytest

--- a/tests/model_executor/test_model_load_with_params.py
+++ b/tests/model_executor/test_model_load_with_params.py
+# SPDX-License-Identifier: Apache-2.0
 import os
 import pytest

--- a/tests/model_executor/weight_utils.py
+++ b/tests/model_executor/weight_utils.py
+# SPDX-License-Identifier: Apache-2.0
 import os
 import tempfile

--- a/tests/models/decoder_only/audio_language/test_ultravox.py
+++ b/tests/models/decoder_only/audio_language/test_ultravox.py
+# SPDX-License-Identifier: Apache-2.0
 from typing import List, Optional, Tuple, Type
 import numpy as np

--- a/tests/models/decoder_only/language/test_aqlm.py
+++ b/tests/models/decoder_only/language/test_aqlm.py
+# SPDX-License-Identifier: Apache-2.0
 """Compare the outputs of a AQLM model between vLLM and HF Transformers
 Run `pytest tests/models/test_aqlm.py`.

--- a/tests/models/decoder_only/language/test_fp8.py
+++ b/tests/models/decoder_only/language/test_fp8.py
+# SPDX-License-Identifier: Apache-2.0
 # flake8: noqa
 """Tests fp8 models against ground truth generation
 Note: these tests will only pass on L4 GPU.

--- a/tests/models/decoder_only/language/test_gguf.py
+++ b/tests/models/decoder_only/language/test_gguf.py
+# SPDX-License-Identifier: Apache-2.0
 """
 Tests gguf models against unquantized models generations
 Note: To pass the test, quantization higher than Q4 should be used

--- a/tests/models/decoder_only/language/test_gptq_marlin.py
+++ b/tests/models/decoder_only/language/test_gptq_marlin.py
+# SPDX-License-Identifier: Apache-2.0
 """Compares the outputs of gptq vs gptq_marlin 
 Note: GPTQ and Marlin do not have bitwise correctness.
 As a result, in this test, we just confirm that the top selected tokens of the