Commit 8e340b4f authored by yangql
Browse files

Merge remote-tracking branch 'origin/v0.8.5.post1-dev' into v0.8.5.post1-dev

parents 1cb37dab a68aef25
# SPDX-License-Identifier: Apache-2.0
import os
import glob
import tempfile
......@@ -9,6 +10,7 @@ import torch
from vllm.model_executor.model_loader.weight_utils import (
download_weights_from_hf, runai_safetensors_weights_iterator,
safetensors_weights_iterator)
from ..utils import models_path_prefix
def test_runai_model_loader():
......@@ -23,10 +25,10 @@ def test_runai_model_loader():
runai_model_streamer_tensors = {}
hf_safetensors_tensors = {}
for name, tensor in runai_safetensors_weights_iterator(safetensors):
for name, tensor in runai_safetensors_weights_iterator(safetensors, False):
runai_model_streamer_tensors[name] = tensor
for name, tensor in safetensors_weights_iterator(safetensors):
for name, tensor in safetensors_weights_iterator(safetensors, False):
hf_safetensors_tensors[name] = tensor
assert len(runai_model_streamer_tensors) == len(hf_safetensors_tensors)
......@@ -38,4 +40,4 @@ def test_runai_model_loader():
if __name__ == "__main__":
test_runai_model_loader()
test_runai_model_loader()
\ No newline at end of file
......@@ -43,7 +43,8 @@ def _generate(
class TestOneTokenBadWord:
MODEL = os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-fp16")
# MODEL = os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-fp16")
MODEL = "TheBloke/Llama-2-7B-fp16"
PROMPT = "Hi! How are"
TARGET_TOKEN = "you"
......@@ -191,4 +192,4 @@ class TestTwoTokenBadWord:
prompt: str,
add_special_tokens: bool = True) -> list[int]:
return self.tokenizer(prompt,
add_special_tokens=add_special_tokens).input_ids
add_special_tokens=add_special_tokens).input_ids
\ No newline at end of file
......@@ -7,16 +7,15 @@ import pathlib
import subprocess
from functools import partial
from unittest.mock import MagicMock, patch
from typing import List, Tuple, Optional
import openai
import pytest
import torch
from huggingface_hub import snapshot_download
from typing import List, Tuple, Optional
from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.lora.request import LoRARequest
# yapf conflicts with isort for this docstring
# yapf: disable
from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig,
......@@ -26,6 +25,8 @@ from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig,
open_stream,
serialize_vllm_model,
tensorize_vllm_model)
from vllm.lora.request import LoRARequest
# yapf: enable
from vllm.utils import PlaceholderModule, import_from_path
......@@ -245,7 +246,7 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
EXAMPLES_PATH / "offline_inference/multilora_inference.py",
)
model_ref = os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf")
model_ref = os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf")
# lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
lora_path = os.path.join(models_path_prefix, "yard1/llama-2-7b-sql-lora-test")
test_prompts = multilora_inference.create_test_prompts(lora_path)
......
......@@ -89,7 +89,7 @@ def tokenizer(tokenizer_name):
AutoTokenizer.from_pretrained(tokenizer_name))
@pytest.mark.parametrize("tokenizer_name", ["mistralai/Pixtral-12B-2409"])
@pytest.mark.parametrize("tokenizer_name", [os.path.join(models_path_prefix, "mistralai/Pixtral-12B-2409")])
@pytest.mark.parametrize(
"truth",
[
......@@ -403,4 +403,4 @@ def test_decode_prompt_logprobs_chunked_prefill(
generated_string += prompt_logprobs[prompt_token].decoded_token
assert generated_string == example_prompts[idx], (
"Detokenized prompt logprobs do not match original prompt")
"Detokenized prompt logprobs do not match original prompt")
\ No newline at end of file
......@@ -8,11 +8,13 @@ from ..utils import models_path_prefix
from vllm.transformers_utils.tokenizer_group import TokenizerGroup
# export HF_ENDPOINT=https://hf-mirror.com
@pytest.mark.asyncio
async def test_tokenizer_group():
reference_tokenizer = AutoTokenizer.from_pretrained(os.path.join(models_path_prefix, "gpt2"))
# reference_tokenizer = AutoTokenizer.from_pretrained(os.path.join(models_path_prefix, "gpt2"))
reference_tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer_group = TokenizerGroup(
tokenizer_id=os.path.join(models_path_prefix, "gpt2"),
# tokenizer_id=os.path.join(models_path_prefix, "gpt2"),
enable_lora=False,
max_num_seqs=1,
max_input_length=None,
......
This diff is collapsed.
# SPDX-License-Identifier: Apache-2.0
from dataclasses import dataclass
import os
import pytest
from vllm import LLM, SamplingParams
from ...core.block.e2e.test_correctness_sliding_window import (check_answers,
prep_prompts)
from ...utils import models_path_prefix
@dataclass
......@@ -16,16 +18,16 @@ class TestConfig:
model_config = {
"bigcode/starcoder2-3b": TestConfig(4096, (800, 1100)),
"google/gemma-2-2b-it": TestConfig(4096, (400, 800)),
os.path.join(models_path_prefix, "bigcode/starcoder2-3b"): TestConfig(4096, (800, 1100)),
os.path.join(models_path_prefix, "google/gemma-2-2b-it"): TestConfig(4096, (400, 800)),
}
@pytest.mark.parametrize(
"model",
[
"bigcode/starcoder2-3b", # sliding window only
"google/gemma-2-2b-it", # sliding window + full attention
os.path.join(models_path_prefix, "bigcode/starcoder2-3b"), # sliding window only
os.path.join(models_path_prefix, "google/gemma-2-2b-it"), # sliding window + full attention
])
@pytest.mark.parametrize("batch_size", [5])
@pytest.mark.parametrize("seed", [1])
......
......@@ -4,9 +4,11 @@ from __future__ import annotations
import random
from typing import Any
import os
import pytest
from vllm import LLM, SamplingParams
from ...utils import models_path_prefix
@pytest.fixture
......@@ -49,14 +51,17 @@ def sampling_config():
# Pytest fixture that returns a fixed model identifier string.
# The commented-out line inside is a disabled alternative that would join the
# same name onto models_path_prefix instead of returning the bare identifier.
@pytest.fixture
def model_name():
# return os.path.join(models_path_prefix, "meta-llama/Llama-3.1-8B-Instruct")
return "meta-llama/Llama-3.1-8B-Instruct"
# Returns the fixed identifier string for the EAGLE model.
# No decorator is visible on this function in this view, so it appears to be a
# plain helper rather than a fixture (NOTE(review): confirm against the file).
# The commented-out line is a disabled alternative that would join the same
# name onto models_path_prefix.
def eagle_model_name():
# return os.path.join(models_path_prefix, "yuhuili/EAGLE-LLaMA3.1-Instruct-8B")
return "yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
# Returns the fixed identifier string for the EAGLE3 model.
# No decorator is visible on this function in this view, so it appears to be a
# plain helper rather than a fixture (NOTE(review): confirm against the file).
# The commented-out line is a disabled alternative that would join the same
# name onto models_path_prefix.
def eagle3_model_name():
# return os.path.join(models_path_prefix, "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B")
return "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
......
# SPDX-License-Identifier: Apache-2.0
import os
import pytest
from vllm import LLM, SamplingParams
from ...utils import fork_new_process_for_each_test
from ...utils import fork_new_process_for_each_test, models_path_prefix
@fork_new_process_for_each_test
@pytest.mark.parametrize("attn_backend",
["FLASH_ATTN_VLLM_V1", "FLASHINFER_VLLM_V1"])
["FLASH_ATTN_VLLM_V1"]) # "FLASHINFER_VLLM_V1"
def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
prompt = "\n<User>: Implement fibonacci sequence in Python.\n<Claude>:"
......@@ -17,7 +18,7 @@ def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
m.setenv("VLLM_USE_V1", "1")
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
llm = LLM(model="Qwen/Qwen2-1.5B-Instruct")
llm = LLM(model=os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct"))
sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
# No cascade attention.
......@@ -29,4 +30,4 @@ def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
prompts = [example_system_message + prompt] * 64
responses = llm.generate(prompts, sampling_params)
for response in responses:
assert response.outputs[0].text == ref_output
assert response.outputs[0].text == ref_output
\ No newline at end of file
......@@ -3,11 +3,13 @@
import random
from typing import Optional
import os
import pytest
from vllm import LLM, SamplingParams
from ...utils import models_path_prefix
MODEL = "facebook/opt-125m"
MODEL = os.path.join(models_path_prefix, "facebook/opt-125m")
DTYPE = "half"
......@@ -96,4 +98,4 @@ def test_parallel_sampling(vllm_model, example_prompts) -> None:
}
raise AssertionError(
f"{len(completion_counts)} unique completions; expected"
f" {n}. Repeats: {repeats}")
f" {n}. Repeats: {repeats}")
\ No newline at end of file
......@@ -20,6 +20,7 @@ from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.output_processor import (OutputProcessor,
RequestOutputCollector)
from vllm.v1.metrics.stats import IterationStats
from ...utils import models_path_prefix
def _ref_convert_id_to_token(
......@@ -520,7 +521,7 @@ def test_stop_token(include_stop_str_in_output: bool,
dummy_test_vectors: dummy engine core outputs and other data structures
"""
model_id = dummy_test_vectors.tokenizer.name_or_path
if model_id != 'meta-llama/Llama-3.2-1B':
if model_id != os.path.join(models_path_prefix, 'meta-llama/Llama-3.2-1B'):
raise AssertionError("Test requires meta-llama/Llama-3.2-1B but "
f"{model_id} is in use.")
do_logprobs = num_sample_logprobs is not None
......@@ -992,4 +993,4 @@ async def test_cumulative_output_collector_n():
# Third is the one where index is 2
third = [k for k in result.outputs if k.index == 2]
assert len(third) == 1
assert third[0].text == "c"
assert third[0].text == "c"
\ No newline at end of file
......@@ -7,6 +7,7 @@ import re
from enum import Enum
from typing import Any
import os
import jsonschema
import pytest
from pydantic import BaseModel
......@@ -15,22 +16,23 @@ from vllm.entrypoints.llm import LLM
from vllm.outputs import RequestOutput
from vllm.platforms import current_platform
from vllm.sampling_params import GuidedDecodingParams, SamplingParams
from ....utils import models_path_prefix
PARAMS_MODELS_BACKENDS_TOKENIZER_MODE = [
("mistralai/Ministral-8B-Instruct-2410", "xgrammar:disable-any-whitespace",
(os.path.join(models_path_prefix, "mistralai/Ministral-8B-Instruct-2410"), "xgrammar:disable-any-whitespace",
"auto"),
("mistralai/Ministral-8B-Instruct-2410", "guidance:disable-any-whitespace",
(os.path.join(models_path_prefix, "mistralai/Ministral-8B-Instruct-2410"), "guidance:disable-any-whitespace",
"auto"),
("mistralai/Ministral-8B-Instruct-2410", "xgrammar:disable-any-whitespace",
(os.path.join(models_path_prefix, "mistralai/Ministral-8B-Instruct-2410"), "xgrammar:disable-any-whitespace",
"mistral"),
("Qwen/Qwen2.5-1.5B-Instruct", "xgrammar:disable-any-whitespace", "auto"),
(os.path.join(models_path_prefix, "Qwen/Qwen2.5-1.5B-Instruct"), "xgrammar:disable-any-whitespace", "auto"),
#FIXME: This test is flaky on CI thus disabled
#("Qwen/Qwen2.5-1.5B-Instruct", "guidance:disable-any-whitespace", "auto"),
]
PARAMS_MODELS_TOKENIZER_MODE = [
("mistralai/Ministral-8B-Instruct-2410", "auto"),
("Qwen/Qwen2.5-1.5B-Instruct", "auto"),
(os.path.join(models_path_prefix, "mistralai/Ministral-8B-Instruct-2410"), "auto"),
(os.path.join(models_path_prefix, "Qwen/Qwen2.5-1.5B-Instruct"), "auto"),
]
......@@ -572,4 +574,4 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch):
assert "a3" in generated
assert "a4" not in generated
assert "a5" not in generated
assert "a6" not in generated
assert "a6" not in generated
\ No newline at end of file
......@@ -3,6 +3,7 @@
import itertools
from collections.abc import Generator
import os
import pytest
import torch
......@@ -13,8 +14,9 @@ from tests.v1.sample.utils import (
from vllm import SamplingParams
from ...conftest import HfRunner, VllmRunner
from ...utils import models_path_prefix
MODEL = "meta-llama/Llama-3.2-1B-Instruct"
MODEL = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")
DTYPE = "half"
NONE = BatchLogprobsComposition.NONE
......
# SPDX-License-Identifier: Apache-2.0
import os
import lm_eval
from ...utils import RemoteOpenAIServer
from ...utils import RemoteOpenAIServer, models_path_prefix
# arc-easy uses prompt_logprobs=1, logprobs=1
TASK = "arc_easy"
......@@ -11,7 +12,7 @@ RTOL = 0.03
EXPECTED_VALUE = 0.62
# FIXME(rob): enable prefix caching once supported.
MODEL = "meta-llama/Llama-3.2-1B-Instruct"
MODEL = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")
MODEL_ARGS = f"pretrained={MODEL},enforce_eager=True,enable_prefix_caching=False" # noqa: E501
SERVER_ARGS = [
"--enforce_eager", "--no_enable_prefix_caching", "--disable-log-requests"
......@@ -49,4 +50,4 @@ def test_promt_logprobs_e2e_server():
measured_value = results["results"][TASK][FILTER]
assert (measured_value - RTOL < EXPECTED_VALUE
and measured_value + RTOL > EXPECTED_VALUE
), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"
), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"
\ No newline at end of file
......@@ -4,11 +4,12 @@ import os
import pytest
from vllm import LLM, SamplingParams
from ...utils import models_path_prefix
if os.getenv("VLLM_USE_V1", "0") != "1":
pytest.skip("Test package requires V1", allow_module_level=True)
MODEL = "meta-llama/Llama-3.2-1B"
MODEL = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")
PROMPT = "Hello my name is Robert and I"
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment