Commit a68aef25 authored by zhuwenwen
Browse files

[tests] fix v1, tokenization and runai_model_streamer_test

parent d36deb1a
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os
import glob import glob
import tempfile import tempfile
...@@ -9,6 +10,7 @@ import torch ...@@ -9,6 +10,7 @@ import torch
from vllm.model_executor.model_loader.weight_utils import ( from vllm.model_executor.model_loader.weight_utils import (
download_weights_from_hf, runai_safetensors_weights_iterator, download_weights_from_hf, runai_safetensors_weights_iterator,
safetensors_weights_iterator) safetensors_weights_iterator)
from ..utils import models_path_prefix
def test_runai_model_loader(): def test_runai_model_loader():
...@@ -23,10 +25,10 @@ def test_runai_model_loader(): ...@@ -23,10 +25,10 @@ def test_runai_model_loader():
runai_model_streamer_tensors = {} runai_model_streamer_tensors = {}
hf_safetensors_tensors = {} hf_safetensors_tensors = {}
for name, tensor in runai_safetensors_weights_iterator(safetensors): for name, tensor in runai_safetensors_weights_iterator(safetensors, False):
runai_model_streamer_tensors[name] = tensor runai_model_streamer_tensors[name] = tensor
for name, tensor in safetensors_weights_iterator(safetensors): for name, tensor in safetensors_weights_iterator(safetensors, False):
hf_safetensors_tensors[name] = tensor hf_safetensors_tensors[name] = tensor
assert len(runai_model_streamer_tensors) == len(hf_safetensors_tensors) assert len(runai_model_streamer_tensors) == len(hf_safetensors_tensors)
...@@ -38,4 +40,4 @@ def test_runai_model_loader(): ...@@ -38,4 +40,4 @@ def test_runai_model_loader():
if __name__ == "__main__": if __name__ == "__main__":
test_runai_model_loader() test_runai_model_loader()
\ No newline at end of file
...@@ -43,7 +43,8 @@ def _generate( ...@@ -43,7 +43,8 @@ def _generate(
class TestOneTokenBadWord: class TestOneTokenBadWord:
MODEL = os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-fp16") # MODEL = os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-fp16")
MODEL = "TheBloke/Llama-2-7B-fp16"
PROMPT = "Hi! How are" PROMPT = "Hi! How are"
TARGET_TOKEN = "you" TARGET_TOKEN = "you"
...@@ -191,4 +192,4 @@ class TestTwoTokenBadWord: ...@@ -191,4 +192,4 @@ class TestTwoTokenBadWord:
prompt: str, prompt: str,
add_special_tokens: bool = True) -> list[int]: add_special_tokens: bool = True) -> list[int]:
return self.tokenizer(prompt, return self.tokenizer(prompt,
add_special_tokens=add_special_tokens).input_ids add_special_tokens=add_special_tokens).input_ids
\ No newline at end of file
...@@ -7,16 +7,15 @@ import pathlib ...@@ -7,16 +7,15 @@ import pathlib
import subprocess import subprocess
from functools import partial from functools import partial
from unittest.mock import MagicMock, patch from unittest.mock import MagicMock, patch
from typing import List, Tuple, Optional
import openai import openai
import pytest import pytest
import torch import torch
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
from typing import List, Tuple, Optional
from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
from vllm.engine.arg_utils import EngineArgs from vllm.engine.arg_utils import EngineArgs
from vllm.lora.request import LoRARequest
# yapf conflicts with isort for this docstring # yapf conflicts with isort for this docstring
# yapf: disable # yapf: disable
from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig, from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig,
...@@ -26,6 +25,8 @@ from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig, ...@@ -26,6 +25,8 @@ from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig,
open_stream, open_stream,
serialize_vllm_model, serialize_vllm_model,
tensorize_vllm_model) tensorize_vllm_model)
from vllm.lora.request import LoRARequest
# yapf: enable # yapf: enable
from vllm.utils import PlaceholderModule, import_from_path from vllm.utils import PlaceholderModule, import_from_path
...@@ -245,7 +246,7 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path): ...@@ -245,7 +246,7 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
EXAMPLES_PATH / "offline_inference/multilora_inference.py", EXAMPLES_PATH / "offline_inference/multilora_inference.py",
) )
model_ref = os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf") model_ref = os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf")
# lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test") # lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
lora_path = os.path.join(models_path_prefix, "yard1/llama-2-7b-sql-lora-test") lora_path = os.path.join(models_path_prefix, "yard1/llama-2-7b-sql-lora-test")
test_prompts = multilora_inference.create_test_prompts(lora_path) test_prompts = multilora_inference.create_test_prompts(lora_path)
......
...@@ -89,7 +89,7 @@ def tokenizer(tokenizer_name): ...@@ -89,7 +89,7 @@ def tokenizer(tokenizer_name):
AutoTokenizer.from_pretrained(tokenizer_name)) AutoTokenizer.from_pretrained(tokenizer_name))
@pytest.mark.parametrize("tokenizer_name", ["mistralai/Pixtral-12B-2409"]) @pytest.mark.parametrize("tokenizer_name", [os.path.join(models_path_prefix, "mistralai/Pixtral-12B-2409")])
@pytest.mark.parametrize( @pytest.mark.parametrize(
"truth", "truth",
[ [
...@@ -403,4 +403,4 @@ def test_decode_prompt_logprobs_chunked_prefill( ...@@ -403,4 +403,4 @@ def test_decode_prompt_logprobs_chunked_prefill(
generated_string += prompt_logprobs[prompt_token].decoded_token generated_string += prompt_logprobs[prompt_token].decoded_token
assert generated_string == example_prompts[idx], ( assert generated_string == example_prompts[idx], (
"Detokenized prompt logprobs do not match original prompt") "Detokenized prompt logprobs do not match original prompt")
\ No newline at end of file
...@@ -8,11 +8,13 @@ from ..utils import models_path_prefix ...@@ -8,11 +8,13 @@ from ..utils import models_path_prefix
from vllm.transformers_utils.tokenizer_group import TokenizerGroup from vllm.transformers_utils.tokenizer_group import TokenizerGroup
# export HF_ENDPOINT=https://hf-mirror.com
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_tokenizer_group(): async def test_tokenizer_group():
reference_tokenizer = AutoTokenizer.from_pretrained(os.path.join(models_path_prefix, "gpt2")) # reference_tokenizer = AutoTokenizer.from_pretrained(os.path.join(models_path_prefix, "gpt2"))
reference_tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer_group = TokenizerGroup( tokenizer_group = TokenizerGroup(
tokenizer_id=os.path.join(models_path_prefix, "gpt2"), # tokenizer_id=os.path.join(models_path_prefix, "gpt2"),
enable_lora=False, enable_lora=False,
max_num_seqs=1, max_num_seqs=1,
max_input_length=None, max_input_length=None,
......
This diff is collapsed.
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from dataclasses import dataclass from dataclasses import dataclass
import os
import pytest import pytest
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from ...core.block.e2e.test_correctness_sliding_window import (check_answers, from ...core.block.e2e.test_correctness_sliding_window import (check_answers,
prep_prompts) prep_prompts)
from ...utils import models_path_prefix
@dataclass @dataclass
...@@ -16,16 +18,16 @@ class TestConfig: ...@@ -16,16 +18,16 @@ class TestConfig:
model_config = { model_config = {
"bigcode/starcoder2-3b": TestConfig(4096, (800, 1100)), os.path.join(models_path_prefix, "bigcode/starcoder2-3b"): TestConfig(4096, (800, 1100)),
"google/gemma-2-2b-it": TestConfig(4096, (400, 800)), os.path.join(models_path_prefix, "google/gemma-2-2b-it"): TestConfig(4096, (400, 800)),
} }
@pytest.mark.parametrize( @pytest.mark.parametrize(
"model", "model",
[ [
"bigcode/starcoder2-3b", # sliding window only os.path.join(models_path_prefix, "bigcode/starcoder2-3b"), # sliding window only
"google/gemma-2-2b-it", # sliding window + full attention os.path.join(models_path_prefix, "google/gemma-2-2b-it"), # sliding window + full attention
]) ])
@pytest.mark.parametrize("batch_size", [5]) @pytest.mark.parametrize("batch_size", [5])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
......
...@@ -4,9 +4,11 @@ from __future__ import annotations ...@@ -4,9 +4,11 @@ from __future__ import annotations
import random import random
from typing import Any from typing import Any
import os
import pytest import pytest
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from ...utils import models_path_prefix
@pytest.fixture @pytest.fixture
...@@ -49,14 +51,17 @@ def sampling_config(): ...@@ -49,14 +51,17 @@ def sampling_config():
@pytest.fixture @pytest.fixture
def model_name(): def model_name():
# return os.path.join(models_path_prefix, "meta-llama/Llama-3.1-8B-Instruct")
return "meta-llama/Llama-3.1-8B-Instruct" return "meta-llama/Llama-3.1-8B-Instruct"
def eagle_model_name(): def eagle_model_name():
# return os.path.join(models_path_prefix, "yuhuili/EAGLE-LLaMA3.1-Instruct-8B")
return "yuhuili/EAGLE-LLaMA3.1-Instruct-8B" return "yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
def eagle3_model_name(): def eagle3_model_name():
# return os.path.join(models_path_prefix, "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B")
return "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B" return "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os
import pytest import pytest
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from ...utils import fork_new_process_for_each_test from ...utils import fork_new_process_for_each_test, models_path_prefix
@fork_new_process_for_each_test @fork_new_process_for_each_test
@pytest.mark.parametrize("attn_backend", @pytest.mark.parametrize("attn_backend",
["FLASH_ATTN_VLLM_V1", "FLASHINFER_VLLM_V1"]) ["FLASH_ATTN_VLLM_V1"]) # "FLASHINFER_VLLM_V1"
def test_cascade_attention(example_system_message, monkeypatch, attn_backend): def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
prompt = "\n<User>: Implement fibonacci sequence in Python.\n<Claude>:" prompt = "\n<User>: Implement fibonacci sequence in Python.\n<Claude>:"
...@@ -17,7 +18,7 @@ def test_cascade_attention(example_system_message, monkeypatch, attn_backend): ...@@ -17,7 +18,7 @@ def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
m.setenv("VLLM_USE_V1", "1") m.setenv("VLLM_USE_V1", "1")
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend) m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
llm = LLM(model="Qwen/Qwen2-1.5B-Instruct") llm = LLM(model=os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct"))
sampling_params = SamplingParams(temperature=0.0, max_tokens=100) sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
# No cascade attention. # No cascade attention.
...@@ -29,4 +30,4 @@ def test_cascade_attention(example_system_message, monkeypatch, attn_backend): ...@@ -29,4 +30,4 @@ def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
prompts = [example_system_message + prompt] * 64 prompts = [example_system_message + prompt] * 64
responses = llm.generate(prompts, sampling_params) responses = llm.generate(prompts, sampling_params)
for response in responses: for response in responses:
assert response.outputs[0].text == ref_output assert response.outputs[0].text == ref_output
\ No newline at end of file
...@@ -3,11 +3,13 @@ ...@@ -3,11 +3,13 @@
import random import random
from typing import Optional from typing import Optional
import os
import pytest import pytest
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from ...utils import models_path_prefix
MODEL = "facebook/opt-125m" MODEL = os.path.join(models_path_prefix, "facebook/opt-125m")
DTYPE = "half" DTYPE = "half"
...@@ -96,4 +98,4 @@ def test_parallel_sampling(vllm_model, example_prompts) -> None: ...@@ -96,4 +98,4 @@ def test_parallel_sampling(vllm_model, example_prompts) -> None:
} }
raise AssertionError( raise AssertionError(
f"{len(completion_counts)} unique completions; expected" f"{len(completion_counts)} unique completions; expected"
f" {n}. Repeats: {repeats}") f" {n}. Repeats: {repeats}")
\ No newline at end of file
...@@ -20,6 +20,7 @@ from vllm.v1.engine import EngineCoreRequest ...@@ -20,6 +20,7 @@ from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.output_processor import (OutputProcessor, from vllm.v1.engine.output_processor import (OutputProcessor,
RequestOutputCollector) RequestOutputCollector)
from vllm.v1.metrics.stats import IterationStats from vllm.v1.metrics.stats import IterationStats
from ...utils import models_path_prefix
def _ref_convert_id_to_token( def _ref_convert_id_to_token(
...@@ -520,7 +521,7 @@ def test_stop_token(include_stop_str_in_output: bool, ...@@ -520,7 +521,7 @@ def test_stop_token(include_stop_str_in_output: bool,
dummy_test_vectors: dummy engine core outputs and other data structures dummy_test_vectors: dummy engine core outputs and other data structures
""" """
model_id = dummy_test_vectors.tokenizer.name_or_path model_id = dummy_test_vectors.tokenizer.name_or_path
if model_id != 'meta-llama/Llama-3.2-1B': if model_id != os.path.join(models_path_prefix, 'meta-llama/Llama-3.2-1B'):
raise AssertionError("Test requires meta-llama/Llama-3.2-1B but " raise AssertionError("Test requires meta-llama/Llama-3.2-1B but "
f"{model_id} is in use.") f"{model_id} is in use.")
do_logprobs = num_sample_logprobs is not None do_logprobs = num_sample_logprobs is not None
...@@ -992,4 +993,4 @@ async def test_cumulative_output_collector_n(): ...@@ -992,4 +993,4 @@ async def test_cumulative_output_collector_n():
# Third is the one where index is 2 # Third is the one where index is 2
third = [k for k in result.outputs if k.index == 2] third = [k for k in result.outputs if k.index == 2]
assert len(third) == 1 assert len(third) == 1
assert third[0].text == "c" assert third[0].text == "c"
\ No newline at end of file
...@@ -7,6 +7,7 @@ import re ...@@ -7,6 +7,7 @@ import re
from enum import Enum from enum import Enum
from typing import Any from typing import Any
import os
import jsonschema import jsonschema
import pytest import pytest
from pydantic import BaseModel from pydantic import BaseModel
...@@ -15,22 +16,23 @@ from vllm.entrypoints.llm import LLM ...@@ -15,22 +16,23 @@ from vllm.entrypoints.llm import LLM
from vllm.outputs import RequestOutput from vllm.outputs import RequestOutput
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.sampling_params import GuidedDecodingParams, SamplingParams from vllm.sampling_params import GuidedDecodingParams, SamplingParams
from ....utils import models_path_prefix
PARAMS_MODELS_BACKENDS_TOKENIZER_MODE = [ PARAMS_MODELS_BACKENDS_TOKENIZER_MODE = [
("mistralai/Ministral-8B-Instruct-2410", "xgrammar:disable-any-whitespace", (os.path.join(models_path_prefix, "mistralai/Ministral-8B-Instruct-2410"), "xgrammar:disable-any-whitespace",
"auto"), "auto"),
("mistralai/Ministral-8B-Instruct-2410", "guidance:disable-any-whitespace", (os.path.join(models_path_prefix, "mistralai/Ministral-8B-Instruct-2410"), "guidance:disable-any-whitespace",
"auto"), "auto"),
("mistralai/Ministral-8B-Instruct-2410", "xgrammar:disable-any-whitespace", (os.path.join(models_path_prefix, "mistralai/Ministral-8B-Instruct-2410"), "xgrammar:disable-any-whitespace",
"mistral"), "mistral"),
("Qwen/Qwen2.5-1.5B-Instruct", "xgrammar:disable-any-whitespace", "auto"), (os.path.join(models_path_prefix, "Qwen/Qwen2.5-1.5B-Instruct"), "xgrammar:disable-any-whitespace", "auto"),
#FIXME: This test is flaky on CI thus disabled #FIXME: This test is flaky on CI thus disabled
#("Qwen/Qwen2.5-1.5B-Instruct", "guidance:disable-any-whitespace", "auto"), #("Qwen/Qwen2.5-1.5B-Instruct", "guidance:disable-any-whitespace", "auto"),
] ]
PARAMS_MODELS_TOKENIZER_MODE = [ PARAMS_MODELS_TOKENIZER_MODE = [
("mistralai/Ministral-8B-Instruct-2410", "auto"), (os.path.join(models_path_prefix, "mistralai/Ministral-8B-Instruct-2410"), "auto"),
("Qwen/Qwen2.5-1.5B-Instruct", "auto"), (os.path.join(models_path_prefix, "Qwen/Qwen2.5-1.5B-Instruct"), "auto"),
] ]
...@@ -572,4 +574,4 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch): ...@@ -572,4 +574,4 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch):
assert "a3" in generated assert "a3" in generated
assert "a4" not in generated assert "a4" not in generated
assert "a5" not in generated assert "a5" not in generated
assert "a6" not in generated assert "a6" not in generated
\ No newline at end of file
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
import itertools import itertools
from collections.abc import Generator from collections.abc import Generator
import os
import pytest import pytest
import torch import torch
...@@ -13,8 +14,9 @@ from tests.v1.sample.utils import ( ...@@ -13,8 +14,9 @@ from tests.v1.sample.utils import (
from vllm import SamplingParams from vllm import SamplingParams
from ...conftest import HfRunner, VllmRunner from ...conftest import HfRunner, VllmRunner
from ...utils import models_path_prefix
MODEL = "meta-llama/Llama-3.2-1B-Instruct" MODEL = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")
DTYPE = "half" DTYPE = "half"
NONE = BatchLogprobsComposition.NONE NONE = BatchLogprobsComposition.NONE
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os
import lm_eval import lm_eval
from ...utils import RemoteOpenAIServer from ...utils import RemoteOpenAIServer, models_path_prefix
# arc-easy uses prompt_logprobs=1, logprobs=1 # arc-easy uses prompt_logprobs=1, logprobs=1
TASK = "arc_easy" TASK = "arc_easy"
...@@ -11,7 +12,7 @@ RTOL = 0.03 ...@@ -11,7 +12,7 @@ RTOL = 0.03
EXPECTED_VALUE = 0.62 EXPECTED_VALUE = 0.62
# FIXME(rob): enable prefix caching once supported. # FIXME(rob): enable prefix caching once supported.
MODEL = "meta-llama/Llama-3.2-1B-Instruct" MODEL = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")
MODEL_ARGS = f"pretrained={MODEL},enforce_eager=True,enable_prefix_caching=False" # noqa: E501 MODEL_ARGS = f"pretrained={MODEL},enforce_eager=True,enable_prefix_caching=False" # noqa: E501
SERVER_ARGS = [ SERVER_ARGS = [
"--enforce_eager", "--no_enable_prefix_caching", "--disable-log-requests" "--enforce_eager", "--no_enable_prefix_caching", "--disable-log-requests"
...@@ -49,4 +50,4 @@ def test_promt_logprobs_e2e_server(): ...@@ -49,4 +50,4 @@ def test_promt_logprobs_e2e_server():
measured_value = results["results"][TASK][FILTER] measured_value = results["results"][TASK][FILTER]
assert (measured_value - RTOL < EXPECTED_VALUE assert (measured_value - RTOL < EXPECTED_VALUE
and measured_value + RTOL > EXPECTED_VALUE and measured_value + RTOL > EXPECTED_VALUE
), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}" ), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"
\ No newline at end of file
...@@ -4,11 +4,12 @@ import os ...@@ -4,11 +4,12 @@ import os
import pytest import pytest
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from ...utils import models_path_prefix
if os.getenv("VLLM_USE_V1", "0") != "1": if os.getenv("VLLM_USE_V1", "0") != "1":
pytest.skip("Test package requires V1", allow_module_level=True) pytest.skip("Test package requires V1", allow_module_level=True)
MODEL = "meta-llama/Llama-3.2-1B" MODEL = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")
PROMPT = "Hello my name is Robert and I" PROMPT = "Hello my name is Robert and I"
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment