Commit 3c9817d2 authored by zhuwenwen
Browse files

add VLLM_OPTEST_MODELS_PATH/OPTEST_MODELS_PATH to load models from local path...

add VLLM_OPTEST_MODELS_PATH/OPTEST_MODELS_PATH to load models from local path instead of Hugging Face Hub
parent 49204f68
import pytest import pytest
import os
from vllm import SamplingParams from vllm import SamplingParams
from .conftest import get_output_from_llm_generator from .conftest import get_output_from_llm_generator
from ...utils import models_path_prefix
@pytest.mark.parametrize( @pytest.mark.parametrize(
"common_llm_kwargs", "common_llm_kwargs",
[{ [{
"model": "JackFram/llama-68m", "model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"speculative_model": "JackFram/llama-68m", "speculative_model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
# Required for spec decode. # Required for spec decode.
...@@ -47,8 +49,8 @@ def test_spec_decode_xfail_chunked_prefill(test_llm_generator): ...@@ -47,8 +49,8 @@ def test_spec_decode_xfail_chunked_prefill(test_llm_generator):
@pytest.mark.parametrize( @pytest.mark.parametrize(
"common_llm_kwargs", "common_llm_kwargs",
[{ [{
"model": "meta-llama/Llama-2-7b-chat-hf", "model": os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-chat-hf"),
"speculative_model": "JackFram/llama-68m", "speculative_model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
# Required for spec decode. # Required for spec decode.
...@@ -97,8 +99,8 @@ def test_spec_decode_xfail_spec_max_model_len(test_llm_generator): ...@@ -97,8 +99,8 @@ def test_spec_decode_xfail_spec_max_model_len(test_llm_generator):
@pytest.mark.parametrize("common_llm_kwargs", [{ @pytest.mark.parametrize("common_llm_kwargs", [{
"model": "JackFram/llama-68m", "model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"speculative_model": "JackFram/llama-68m", "speculative_model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
......
...@@ -20,14 +20,16 @@ correctness for the target model outputs. ...@@ -20,14 +20,16 @@ correctness for the target model outputs.
""" """
import pytest import pytest
import os
from .conftest import run_equality_correctness_test from .conftest import run_equality_correctness_test
from ...utils import models_path_prefix
# main model # main model
MAIN_MODEL = "JackFram/llama-68m" MAIN_MODEL = os.path.join(models_path_prefix, "JackFram/llama-68m")
# speculative model # speculative model
SPEC_MODEL = "abhigoyal/vllm-eagle-llama-68m-random" SPEC_MODEL = os.path.join(models_path_prefix, "abhigoyal/vllm-eagle-llama-68m-random")
# max. number of speculative tokens: this corresponds to # max. number of speculative tokens: this corresponds to
# num_heads in the config.json of the speculator model. # num_heads in the config.json of the speculator model.
......
...@@ -3,10 +3,12 @@ other features, e.g. cuda graphs. ...@@ -3,10 +3,12 @@ other features, e.g. cuda graphs.
""" """
import pytest import pytest
import os
from .conftest import run_equality_correctness_test from .conftest import run_equality_correctness_test
from ...utils import models_path_prefix
MAIN_MODEL = "JackFram/llama-68m" MAIN_MODEL = os.path.join(models_path_prefix, "JackFram/llama-68m")
@pytest.mark.parametrize( @pytest.mark.parametrize(
...@@ -17,14 +19,14 @@ MAIN_MODEL = "JackFram/llama-68m" ...@@ -17,14 +19,14 @@ MAIN_MODEL = "JackFram/llama-68m"
# Verify equality when cuda graphs allowed. # Verify equality when cuda graphs allowed.
"enforce_eager": False, "enforce_eager": False,
"model_name": "JackFram/llama-68m", "model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
}]) }])
@pytest.mark.parametrize( @pytest.mark.parametrize(
"per_test_common_llm_kwargs", "per_test_common_llm_kwargs",
[ [
{ {
# Identical models. # Identical models.
"speculative_model": "JackFram/llama-68m", "speculative_model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
}, },
]) ])
...@@ -53,7 +55,7 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs, ...@@ -53,7 +55,7 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize( @pytest.mark.parametrize(
"common_llm_kwargs", "common_llm_kwargs",
[{ [{
"model_name": "JackFram/llama-160m", "model_name": os.path.join(models_path_prefix, "JackFram/llama-160m"),
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
...@@ -63,7 +65,7 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs, ...@@ -63,7 +65,7 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [ @pytest.mark.parametrize("per_test_common_llm_kwargs", [
{ {
"speculative_model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "speculative_model": os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"),
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
}, },
]) ])
......
...@@ -4,10 +4,12 @@ tensor parallelism. ...@@ -4,10 +4,12 @@ tensor parallelism.
import pytest import pytest
import torch import torch
import os
from vllm.utils import is_hip from vllm.utils import is_hip
from .conftest import run_equality_correctness_test_tp from .conftest import run_equality_correctness_test_tp
from ...utils import models_path_prefix
@pytest.mark.skipif(torch.cuda.device_count() < 2, @pytest.mark.skipif(torch.cuda.device_count() < 2,
...@@ -28,7 +30,7 @@ from .conftest import run_equality_correctness_test_tp ...@@ -28,7 +30,7 @@ from .conftest import run_equality_correctness_test_tp
@pytest.mark.parametrize("test_llm_kwargs", [ @pytest.mark.parametrize("test_llm_kwargs", [
[ [
"--speculative-model", "--speculative-model",
"JackFram/llama-68m", os.path.join(models_path_prefix, "JackFram/llama-68m"),
"--num-speculative-tokens", "--num-speculative-tokens",
"3", "3",
], ],
...@@ -56,7 +58,7 @@ def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs, ...@@ -56,7 +58,7 @@ def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs,
""" """
if is_hip(): if is_hip():
pytest.skip("hip is not well-supported yet") pytest.skip("hip is not well-supported yet")
run_equality_correctness_test_tp("JackFram/llama-68m", run_equality_correctness_test_tp(os.path.join(models_path_prefix, "JackFram/llama-68m"),
common_llm_kwargs, common_llm_kwargs,
per_test_common_llm_kwargs, per_test_common_llm_kwargs,
baseline_llm_kwargs, baseline_llm_kwargs,
...@@ -89,15 +91,15 @@ def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs, ...@@ -89,15 +91,15 @@ def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs,
@pytest.mark.parametrize("model, test_llm_kwargs", @pytest.mark.parametrize("model, test_llm_kwargs",
[("JackFram/llama-68m", [ [(os.path.join(models_path_prefix, "JackFram/llama-68m"), [
"--speculative-model", "--speculative-model",
"JackFram/llama-68m", os.path.join(models_path_prefix, "JackFram/llama-68m"),
"--num_speculative-tokens", "--num_speculative-tokens",
"5", "5",
"--speculative-draft-tensor-parallel-size", "--speculative-draft-tensor-parallel-size",
"1", "1",
]), ]),
("ibm-granite/granite-3b-code-instruct", [ (os.path.join(models_path_prefix, "ibm-granite/granite-3b-code-instruct"), [
"--speculative-model", "--speculative-model",
"ibm-granite/granite-3b-code-instruct", os.path.join(models_path_prefix, "ibm-granite/granite-3b-code-instruct"),
"--num_speculative-tokens", "--num_speculative-tokens",
"5", "5",
"--speculative-draft-tensor-parallel-size", "--speculative-draft-tensor-parallel-size",
......
...@@ -5,11 +5,13 @@ tensor parallelism. ...@@ -5,11 +5,13 @@ tensor parallelism.
import openai import openai
import pytest import pytest
import torch import torch
import os
from .conftest import run_equality_correctness_test_tp from .conftest import run_equality_correctness_test_tp
from ...utils import models_path_prefix
MAIN_MODEL = "JackFram/llama-68m" MAIN_MODEL = os.path.join(models_path_prefix, "JackFram/llama-68m")
SPEC_MODEL = "JackFram/llama-68m" SPEC_MODEL = os.path.join(models_path_prefix, "JackFram/llama-68m")
@pytest.mark.skipif(torch.cuda.device_count() < 4, @pytest.mark.skipif(torch.cuda.device_count() < 4,
......
from itertools import cycle from itertools import cycle
import pytest import pytest
import os
from vllm import SamplingParams from vllm import SamplingParams
from .conftest import run_equality_correctness_test from .conftest import run_equality_correctness_test
from ...utils import models_path_prefix
@pytest.mark.parametrize( @pytest.mark.parametrize(
"common_llm_kwargs", "common_llm_kwargs",
[{ [{
"model_name": "JackFram/llama-68m", "model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
...@@ -22,11 +24,11 @@ from .conftest import run_equality_correctness_test ...@@ -22,11 +24,11 @@ from .conftest import run_equality_correctness_test
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", @pytest.mark.parametrize("test_llm_kwargs",
[{ [{
"speculative_model": "JackFram/llama-160m", "speculative_model": os.path.join(models_path_prefix, "JackFram/llama-160m"),
"num_speculative_tokens": 3, "num_speculative_tokens": 3,
"disable_logprobs_during_spec_decoding": False, "disable_logprobs_during_spec_decoding": False,
}, { }, {
"speculative_model": "JackFram/llama-160m", "speculative_model": os.path.join(models_path_prefix, "JackFram/llama-160m"),
"num_speculative_tokens": 3, "num_speculative_tokens": 3,
"disable_logprobs_during_spec_decoding": True, "disable_logprobs_during_spec_decoding": True,
}]) }])
...@@ -63,7 +65,7 @@ def test_logprobs_equality(vllm_runner, common_llm_kwargs, ...@@ -63,7 +65,7 @@ def test_logprobs_equality(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize( @pytest.mark.parametrize(
"common_llm_kwargs", "common_llm_kwargs",
[{ [{
"model_name": "JackFram/llama-68m", "model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
...@@ -75,11 +77,11 @@ def test_logprobs_equality(vllm_runner, common_llm_kwargs, ...@@ -75,11 +77,11 @@ def test_logprobs_equality(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", @pytest.mark.parametrize("test_llm_kwargs",
[{ [{
"speculative_model": "JackFram/llama-160m", "speculative_model": os.path.join(models_path_prefix, "JackFram/llama-160m"),
"num_speculative_tokens": 3, "num_speculative_tokens": 3,
"disable_logprobs_during_spec_decoding": False, "disable_logprobs_during_spec_decoding": False,
}, { }, {
"speculative_model": "JackFram/llama-160m", "speculative_model": os.path.join(models_path_prefix, "JackFram/llama-160m"),
"num_speculative_tokens": 6, "num_speculative_tokens": 6,
"disable_logprobs_during_spec_decoding": False, "disable_logprobs_during_spec_decoding": False,
}]) }])
...@@ -115,7 +117,7 @@ def test_logprobs_different_k(vllm_runner, common_llm_kwargs, ...@@ -115,7 +117,7 @@ def test_logprobs_different_k(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize( @pytest.mark.parametrize(
"common_llm_kwargs", "common_llm_kwargs",
[{ [{
"model_name": "JackFram/llama-68m", "model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
...@@ -128,7 +130,7 @@ def test_logprobs_different_k(vllm_runner, common_llm_kwargs, ...@@ -128,7 +130,7 @@ def test_logprobs_different_k(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize( @pytest.mark.parametrize(
"test_llm_kwargs", "test_llm_kwargs",
[{ [{
"speculative_model": "JackFram/llama-160m", "speculative_model": os.path.join(models_path_prefix, "JackFram/llama-160m"),
"num_speculative_tokens": 3, "num_speculative_tokens": 3,
"disable_logprobs_during_spec_decoding": False, "disable_logprobs_during_spec_decoding": False,
...@@ -169,7 +171,7 @@ def test_logprobs_when_skip_speculation(vllm_runner, common_llm_kwargs, ...@@ -169,7 +171,7 @@ def test_logprobs_when_skip_speculation(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize( @pytest.mark.parametrize(
"common_llm_kwargs", "common_llm_kwargs",
[{ [{
"model_name": "JackFram/llama-68m", "model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
...@@ -181,7 +183,7 @@ def test_logprobs_when_skip_speculation(vllm_runner, common_llm_kwargs, ...@@ -181,7 +183,7 @@ def test_logprobs_when_skip_speculation(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", @pytest.mark.parametrize("test_llm_kwargs",
[{ [{
"speculative_model": "JackFram/llama-160m", "speculative_model": os.path.join(models_path_prefix, "JackFram/llama-160m"),
"num_speculative_tokens": 3, "num_speculative_tokens": 3,
"disable_logprobs_during_spec_decoding": False, "disable_logprobs_during_spec_decoding": False,
}]) }])
...@@ -248,7 +250,7 @@ def test_logprobs_temp_1(vllm_runner, common_llm_kwargs, ...@@ -248,7 +250,7 @@ def test_logprobs_temp_1(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize( @pytest.mark.parametrize(
"common_llm_kwargs", "common_llm_kwargs",
[{ [{
"model_name": "JackFram/llama-160m", "model_name": os.path.join(models_path_prefix, "JackFram/llama-160m"),
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
# Required for spec decode. # Required for spec decode.
...@@ -258,7 +260,7 @@ def test_logprobs_temp_1(vllm_runner, common_llm_kwargs, ...@@ -258,7 +260,7 @@ def test_logprobs_temp_1(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", @pytest.mark.parametrize("test_llm_kwargs",
[{ [{
"speculative_model": "JackFram/llama-68m", "speculative_model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 3, "num_speculative_tokens": 3,
"disable_logprobs_during_spec_decoding": True, "disable_logprobs_during_spec_decoding": True,
}]) }])
......
...@@ -20,16 +20,18 @@ correctness for the target model outputs. ...@@ -20,16 +20,18 @@ correctness for the target model outputs.
""" """
import pytest import pytest
import os
from .conftest import run_equality_correctness_test from .conftest import run_equality_correctness_test
from ...utils import models_path_prefix
# main model # main model
# lmsys/vicuna-7b-v1.3 was to be used but it's causing # lmsys/vicuna-7b-v1.3 was to be used but it's causing
# OOM in CI pipeline, so using a smaller model. # OOM in CI pipeline, so using a smaller model.
MAIN_MODEL = "JackFram/llama-68m" MAIN_MODEL = os.path.join(models_path_prefix, "JackFram/llama-68m")
# speculative model # speculative model
SPEC_MODEL = "abhigoyal/vllm-medusa-llama-68m-random" SPEC_MODEL = os.path.join(models_path_prefix, "abhigoyal/vllm-medusa-llama-68m-random")
# max number of speculative tokens: this corresponds to # max number of speculative tokens: this corresponds to
# num_heads in the config.json of the speculator model. # num_heads in the config.json of the speculator model.
......
...@@ -22,16 +22,18 @@ correctness for the target model outputs. ...@@ -22,16 +22,18 @@ correctness for the target model outputs.
from unittest.mock import patch from unittest.mock import patch
import pytest import pytest
import os
from vllm.model_executor.layers.vocab_parallel_embedding import pad_vocab_size from vllm.model_executor.layers.vocab_parallel_embedding import pad_vocab_size
from .conftest import run_equality_correctness_test from .conftest import run_equality_correctness_test
from ...utils import models_path_prefix
# main model # main model
MAIN_MODEL = "JackFram/llama-160m" MAIN_MODEL = os.path.join(models_path_prefix, "JackFram/llama-160m")
# speculative model # speculative model
SPEC_MODEL = "ibm-fms/llama-160m-accelerator" SPEC_MODEL = os.path.join(models_path_prefix, "ibm-fms/llama-160m-accelerator")
# max. number of speculative tokens: this corresponds to # max. number of speculative tokens: this corresponds to
# n_predict in the config.json of the speculator model. # n_predict in the config.json of the speculator model.
......
...@@ -37,6 +37,7 @@ greedy-equality tests for those batch sizes/prompts. ...@@ -37,6 +37,7 @@ greedy-equality tests for those batch sizes/prompts.
from itertools import cycle from itertools import cycle
import pytest import pytest
import os
from transformers import AutoTokenizer from transformers import AutoTokenizer
from vllm import SamplingParams from vllm import SamplingParams
...@@ -44,6 +45,7 @@ from vllm import SamplingParams ...@@ -44,6 +45,7 @@ from vllm import SamplingParams
from ...utils import fork_new_process_for_each_test from ...utils import fork_new_process_for_each_test
from .conftest import (get_output_from_llm_generator, from .conftest import (get_output_from_llm_generator,
run_equality_correctness_test) run_equality_correctness_test)
from ...utils import models_path_prefix
@pytest.mark.parametrize( @pytest.mark.parametrize(
...@@ -51,7 +53,7 @@ from .conftest import (get_output_from_llm_generator, ...@@ -51,7 +53,7 @@ from .conftest import (get_output_from_llm_generator,
[{ [{
# Use a small model for a fast test. # Use a small model for a fast test.
# Note this is repeated in the test body; to initialize a tokenizer. # Note this is repeated in the test body; to initialize a tokenizer.
"model": "JackFram/llama-68m", "model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
...@@ -63,7 +65,7 @@ from .conftest import (get_output_from_llm_generator, ...@@ -63,7 +65,7 @@ from .conftest import (get_output_from_llm_generator,
"per_test_common_llm_kwargs", "per_test_common_llm_kwargs",
[ [
{ {
"speculative_model": "JackFram/llama-68m", "speculative_model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
}, },
{ {
...@@ -111,7 +113,7 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator, ...@@ -111,7 +113,7 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator,
for token_ids in batch_token_ids] == ([output_len] * batch_size) for token_ids in batch_token_ids] == ([output_len] * batch_size)
# Expect detokenized string to match. # Expect detokenized string to match.
tok = AutoTokenizer.from_pretrained("JackFram/llama-68m") tok = AutoTokenizer.from_pretrained(os.path.join(models_path_prefix, "JackFram/llama-68m"))
for actual_tokens, actual_token_ids in zip(batch_tokens, batch_token_ids): for actual_tokens, actual_token_ids in zip(batch_tokens, batch_token_ids):
expected_tokens = tok.decode(actual_token_ids) expected_tokens = tok.decode(actual_token_ids)
print(f"{actual_token_ids=}") print(f"{actual_token_ids=}")
...@@ -136,16 +138,16 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator, ...@@ -136,16 +138,16 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator,
# Try two different tiny base models. # Try two different tiny base models.
# Note that one is equal to the draft model, another isn't. # Note that one is equal to the draft model, another isn't.
{ {
"model_name": "JackFram/llama-68m", "model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
}, },
{ {
"model_name": "JackFram/llama-160m", "model_name": os.path.join(models_path_prefix, "JackFram/llama-160m"),
}, },
]) ])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [ @pytest.mark.parametrize("test_llm_kwargs", [
{ {
"speculative_model": "JackFram/llama-68m", "speculative_model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
}, },
]) ])
...@@ -202,16 +204,16 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1( ...@@ -202,16 +204,16 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
# Try two different tiny base models. # Try two different tiny base models.
# Note that one is equal to the draft model, another isn't. # Note that one is equal to the draft model, another isn't.
{ {
"model_name": "JackFram/llama-68m", "model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
}, },
{ {
"model_name": "JackFram/llama-160m", "model_name": os.path.join(models_path_prefix, "JackFram/llama-160m"),
}, },
]) ])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [ @pytest.mark.parametrize("test_llm_kwargs", [
{ {
"speculative_model": "JackFram/llama-68m", "speculative_model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
}, },
]) ])
...@@ -256,16 +258,16 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs( ...@@ -256,16 +258,16 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(
# Try two different tiny base models. # Try two different tiny base models.
# Note that one is equal to the draft model, another isn't. # Note that one is equal to the draft model, another isn't.
{ {
"model_name": "JackFram/llama-68m", "model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
}, },
{ {
"model_name": "JackFram/llama-160m", "model_name": os.path.join(models_path_prefix, "JackFram/llama-160m"),
}, },
]) ])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [ @pytest.mark.parametrize("test_llm_kwargs", [
{ {
"speculative_model": "JackFram/llama-68m", "speculative_model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
}, },
]) ])
...@@ -298,7 +300,7 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len( ...@@ -298,7 +300,7 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len(
"common_llm_kwargs", "common_llm_kwargs",
[{ [{
# A "real" model (not tiny). # A "real" model (not tiny).
"model_name": "meta-llama/Llama-2-7b-chat-hf", "model_name": os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-chat-hf"),
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
...@@ -313,7 +315,7 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len( ...@@ -313,7 +315,7 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len(
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [ @pytest.mark.parametrize("test_llm_kwargs", [
{ {
"speculative_model": "JackFram/llama-68m", "speculative_model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
}, },
]) ])
...@@ -348,7 +350,7 @@ def test_spec_decode_e2e_greedy_correctness_real_model_bs1( ...@@ -348,7 +350,7 @@ def test_spec_decode_e2e_greedy_correctness_real_model_bs1(
"common_llm_kwargs", "common_llm_kwargs",
[{ [{
# A "real" model (not tiny). # A "real" model (not tiny).
"model_name": "meta-llama/Llama-2-7b-chat-hf", "model_name": os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-chat-hf"),
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
...@@ -363,7 +365,7 @@ def test_spec_decode_e2e_greedy_correctness_real_model_bs1( ...@@ -363,7 +365,7 @@ def test_spec_decode_e2e_greedy_correctness_real_model_bs1(
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [ @pytest.mark.parametrize("test_llm_kwargs", [
{ {
"speculative_model": "JackFram/llama-68m", "speculative_model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
}, },
]) ])
...@@ -410,13 +412,13 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs( ...@@ -410,13 +412,13 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [ @pytest.mark.parametrize("per_test_common_llm_kwargs", [
{ {
"model_name": "JackFram/llama-160m", "model_name": os.path.join(models_path_prefix, "JackFram/llama-160m"),
}, },
]) ])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [ @pytest.mark.parametrize("test_llm_kwargs", [
{ {
"speculative_model": "JackFram/llama-68m", "speculative_model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
}, },
]) ])
...@@ -450,7 +452,7 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption( ...@@ -450,7 +452,7 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption(
@pytest.mark.parametrize( @pytest.mark.parametrize(
"common_llm_kwargs", "common_llm_kwargs",
[{ [{
"model_name": "JackFram/llama-160m", "model_name": os.path.join(models_path_prefix, "JackFram/llama-160m"),
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
...@@ -476,7 +478,7 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption( ...@@ -476,7 +478,7 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption(
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [ @pytest.mark.parametrize("test_llm_kwargs", [
{ {
"speculative_model": "JackFram/llama-68m", "speculative_model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
}, },
]) ])
...@@ -510,7 +512,7 @@ def test_spec_decode_different_block_size(vllm_runner, common_llm_kwargs, ...@@ -510,7 +512,7 @@ def test_spec_decode_different_block_size(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize( @pytest.mark.parametrize(
"common_llm_kwargs", "common_llm_kwargs",
[{ [{
"model_name": "JackFram/llama-160m", "model_name": os.path.join(models_path_prefix, "JackFram/llama-160m"),
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
...@@ -524,7 +526,7 @@ def test_spec_decode_different_block_size(vllm_runner, common_llm_kwargs, ...@@ -524,7 +526,7 @@ def test_spec_decode_different_block_size(vllm_runner, common_llm_kwargs,
"test_llm_kwargs", "test_llm_kwargs",
[ [
{ {
"speculative_model": "JackFram/llama-68m", "speculative_model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
# Artificially limit the draft model max model len; this forces vLLM # Artificially limit the draft model max model len; this forces vLLM
...@@ -566,7 +568,7 @@ def test_skip_speculation(vllm_runner, common_llm_kwargs, ...@@ -566,7 +568,7 @@ def test_skip_speculation(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize( @pytest.mark.parametrize(
"common_llm_kwargs", "common_llm_kwargs",
[{ [{
"model_name": "JackFram/llama-160m", "model_name": os.path.join(models_path_prefix, "JackFram/llama-160m"),
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
...@@ -578,7 +580,7 @@ def test_skip_speculation(vllm_runner, common_llm_kwargs, ...@@ -578,7 +580,7 @@ def test_skip_speculation(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [ @pytest.mark.parametrize("test_llm_kwargs", [
{ {
"speculative_model": "JackFram/llama-68m", "speculative_model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
"speculative_disable_by_batch_size": 2, "speculative_disable_by_batch_size": 2,
}, },
...@@ -607,7 +609,7 @@ def test_disable_speculation(vllm_runner, common_llm_kwargs, ...@@ -607,7 +609,7 @@ def test_disable_speculation(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize( @pytest.mark.parametrize(
"common_llm_kwargs", "common_llm_kwargs",
[{ [{
"model_name": "JackFram/llama-68m", "model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
...@@ -621,7 +623,7 @@ def test_disable_speculation(vllm_runner, common_llm_kwargs, ...@@ -621,7 +623,7 @@ def test_disable_speculation(vllm_runner, common_llm_kwargs,
"test_llm_kwargs", "test_llm_kwargs",
[ [
{ {
"speculative_model": "JackFram/llama-68m", "speculative_model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": k, "num_speculative_tokens": k,
} }
# Try a range of common k, as well as large speculation. # Try a range of common k, as well as large speculation.
...@@ -656,7 +658,7 @@ def test_many_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, ...@@ -656,7 +658,7 @@ def test_many_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
@pytest.mark.parametrize( @pytest.mark.parametrize(
"common_llm_kwargs", "common_llm_kwargs",
[{ [{
"model_name": "JackFram/llama-160m", "model_name": os.path.join(models_path_prefix, "JackFram/llama-160m"),
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
...@@ -670,7 +672,7 @@ def test_many_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, ...@@ -670,7 +672,7 @@ def test_many_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
"test_llm_kwargs", "test_llm_kwargs",
[ [
{ {
"speculative_model": "JackFram/llama-68m", "speculative_model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": k, "num_speculative_tokens": k,
"spec_decoding_acceptance_method": "typical_acceptance_sampler" "spec_decoding_acceptance_method": "typical_acceptance_sampler"
} }
......
...@@ -25,8 +25,10 @@ for the target model outputs. ...@@ -25,8 +25,10 @@ for the target model outputs.
""" """
import pytest import pytest
import os
from .conftest import run_equality_correctness_test from .conftest import run_equality_correctness_test
from ...utils import models_path_prefix
@pytest.mark.parametrize( @pytest.mark.parametrize(
...@@ -43,7 +45,7 @@ from .conftest import run_equality_correctness_test ...@@ -43,7 +45,7 @@ from .conftest import run_equality_correctness_test
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [ @pytest.mark.parametrize("per_test_common_llm_kwargs", [
{ {
"model_name": "JackFram/llama-68m", "model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
}, },
]) ])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
...@@ -90,7 +92,7 @@ def test_ngram_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, ...@@ -90,7 +92,7 @@ def test_ngram_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [ @pytest.mark.parametrize("per_test_common_llm_kwargs", [
{ {
"model_name": "JackFram/llama-68m", "model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
}, },
]) ])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
...@@ -151,7 +153,7 @@ def test_ngram_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, ...@@ -151,7 +153,7 @@ def test_ngram_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [ @pytest.mark.parametrize("per_test_common_llm_kwargs", [
{ {
"model_name": "JackFram/llama-160m", "model_name": os.path.join(models_path_prefix, "JackFram/llama-160m"),
}, },
]) ])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
...@@ -191,7 +193,7 @@ def test_ngram_e2e_greedy_correctness_with_preemption( ...@@ -191,7 +193,7 @@ def test_ngram_e2e_greedy_correctness_with_preemption(
@pytest.mark.parametrize( @pytest.mark.parametrize(
"common_llm_kwargs", "common_llm_kwargs",
[{ [{
"model_name": "JackFram/llama-68m", "model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
...@@ -250,7 +252,7 @@ def test_ngram_different_k(vllm_runner, common_llm_kwargs, ...@@ -250,7 +252,7 @@ def test_ngram_different_k(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize( @pytest.mark.parametrize(
"common_llm_kwargs", "common_llm_kwargs",
[{ [{
"model_name": "JackFram/llama-68m", "model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
......
import pytest import pytest
import os
from .conftest import run_equality_correctness_test from .conftest import run_equality_correctness_test
from ...utils import models_path_prefix
# main model # main model
MAIN_MODEL = "JackFram/llama-68m" MAIN_MODEL = os.path.join(models_path_prefix, "JackFram/llama-68m")
# speculative model # speculative model
SPEC_MODEL = "JackFram/llama-160m" SPEC_MODEL = os.path.join(models_path_prefix, "JackFram/llama-160m")
@pytest.mark.parametrize( @pytest.mark.parametrize(
"common_llm_kwargs", "common_llm_kwargs",
[{ [{
"model_name": "JackFram/llama-68m", "model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
...@@ -21,7 +23,7 @@ SPEC_MODEL = "JackFram/llama-160m" ...@@ -21,7 +23,7 @@ SPEC_MODEL = "JackFram/llama-160m"
"use_v2_block_manager": True, "use_v2_block_manager": True,
# speculative model # speculative model
"speculative_model": "JackFram/llama-160m", "speculative_model": os.path.join(models_path_prefix, "JackFram/llama-160m"),
# num speculative tokens # num speculative tokens
"num_speculative_tokens": 3, "num_speculative_tokens": 3,
......
...@@ -4,6 +4,7 @@ from unittest.mock import MagicMock ...@@ -4,6 +4,7 @@ from unittest.mock import MagicMock
import pytest import pytest
import torch import torch
import os
from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.model_executor.utils import set_random_seed from vllm.model_executor.utils import set_random_seed
...@@ -17,6 +18,7 @@ from vllm.worker.worker import Worker ...@@ -17,6 +18,7 @@ from vllm.worker.worker import Worker
from .utils import (assert_logprobs_dict_allclose, create_batch, from .utils import (assert_logprobs_dict_allclose, create_batch,
create_seq_group_metadata_from_prompts, create_worker, create_seq_group_metadata_from_prompts, create_worker,
patch_execute_model_with_seeds, zero_kv_cache) patch_execute_model_with_seeds, zero_kv_cache)
from ..utils import models_path_prefix
@pytest.mark.parametrize('num_steps', list(range(1, 17))) @pytest.mark.parametrize('num_steps', list(range(1, 17)))
...@@ -78,7 +80,7 @@ def test_same_output_for_single_step(): ...@@ -78,7 +80,7 @@ def test_same_output_for_single_step():
worker for num_steps=1. worker for num_steps=1.
""" """
seed = 100 seed = 100
model_name = 'JackFram/llama-68m' model_name = os.path.join(models_path_prefix, 'JackFram/llama-68m')
block_size = 32 block_size = 32
num_gpu_blocks = 2048 // block_size num_gpu_blocks = 2048 // block_size
...@@ -163,7 +165,7 @@ def test_same_output_for_multi_step(): ...@@ -163,7 +165,7 @@ def test_same_output_for_multi_step():
then runs the worker num_steps times, and compares the output. then runs the worker num_steps times, and compares the output.
""" """
seed = 100 seed = 100
model_name = 'JackFram/llama-68m' model_name = os.path.join(models_path_prefix, 'JackFram/llama-68m')
block_size = 16 block_size = 16
num_gpu_blocks = 2048 // block_size num_gpu_blocks = 2048 // block_size
...@@ -291,7 +293,7 @@ def test_multi_step_with_batch_expansion_correct_output(): ...@@ -291,7 +293,7 @@ def test_multi_step_with_batch_expansion_correct_output():
expanded batch is then used for predicting the next tokens. expanded batch is then used for predicting the next tokens.
""" """
seed = 100 seed = 100
model_name = 'JackFram/llama-68m' model_name = os.path.join(models_path_prefix, 'JackFram/llama-68m')
block_size = 16 block_size = 16
num_gpu_blocks = 2048 // block_size num_gpu_blocks = 2048 // block_size
...@@ -385,7 +387,7 @@ def test_multi_step_with_batch_expansion_incorrect_output(): ...@@ -385,7 +387,7 @@ def test_multi_step_with_batch_expansion_incorrect_output():
the sequence ID is specified incorrectly. the sequence ID is specified incorrectly.
""" """
seed = 100 seed = 100
model_name = 'JackFram/llama-68m' model_name = os.path.join(models_path_prefix, 'JackFram/llama-68m')
block_size = 16 block_size = 16
num_gpu_blocks = 2048 // block_size num_gpu_blocks = 2048 // block_size
...@@ -652,7 +654,7 @@ def test_use_draft_model_runner_advance_step(): ...@@ -652,7 +654,7 @@ def test_use_draft_model_runner_advance_step():
when applicable. when applicable.
""" """
seed = 100 seed = 100
model_name = 'JackFram/llama-68m' model_name = os.path.join(models_path_prefix, 'JackFram/llama-68m')
k = 5 k = 5
batch_size = 32 batch_size = 32
......
import torch import torch
import os
from vllm.sequence import ExecuteModelRequest from vllm.sequence import ExecuteModelRequest
from vllm.spec_decode.ngram_worker import NGramWorker from vllm.spec_decode.ngram_worker import NGramWorker
from vllm.spec_decode.top1_proposer import Top1Proposer from vllm.spec_decode.top1_proposer import Top1Proposer
from .utils import create_seq_group_metadata_from_prompts, create_worker from .utils import create_seq_group_metadata_from_prompts, create_worker
from ..utils import models_path_prefix
def test_ngram_algo_correctness_for_single_no_match(): def test_ngram_algo_correctness_for_single_no_match():
...@@ -15,7 +17,7 @@ def test_ngram_algo_correctness_for_single_no_match(): ...@@ -15,7 +17,7 @@ def test_ngram_algo_correctness_for_single_no_match():
block_size = 32 block_size = 32
num_gpu_blocks = 2048 // block_size num_gpu_blocks = 2048 // block_size
seed = 100 seed = 100
model_name = 'JackFram/llama-68m' model_name = os.path.join(models_path_prefix, 'JackFram/llama-68m')
vocab_size = 32_000 vocab_size = 32_000
device = 'cuda:0' device = 'cuda:0'
...@@ -73,7 +75,7 @@ def test_ngram_algo_correctness_for_batches_not_match_all(): ...@@ -73,7 +75,7 @@ def test_ngram_algo_correctness_for_batches_not_match_all():
block_size = 32 block_size = 32
num_gpu_blocks = 2048 // block_size num_gpu_blocks = 2048 // block_size
seed = 100 seed = 100
model_name = 'JackFram/llama-68m' model_name = os.path.join(models_path_prefix, 'JackFram/llama-68m')
vocab_size = 32_000 vocab_size = 32_000
device = 'cuda:0' device = 'cuda:0'
...@@ -153,7 +155,7 @@ def test_ngram_algo_correctness_for_batches_match_all(): ...@@ -153,7 +155,7 @@ def test_ngram_algo_correctness_for_batches_match_all():
block_size = 32 block_size = 32
num_gpu_blocks = 2048 // block_size num_gpu_blocks = 2048 // block_size
seed = 100 seed = 100
model_name = 'JackFram/llama-68m' model_name = os.path.join(models_path_prefix, 'JackFram/llama-68m')
vocab_size = 32_000 vocab_size = 32_000
device = 'cuda:0' device = 'cuda:0'
......
...@@ -22,7 +22,7 @@ from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig, ...@@ -22,7 +22,7 @@ from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig,
tensorize_vllm_model) tensorize_vllm_model)
from ..conftest import VllmRunner from ..conftest import VllmRunner
from ..utils import RemoteOpenAIServer from ..utils import RemoteOpenAIServer, models_path_prefix
from .conftest import retry_until_skip from .conftest import retry_until_skip
# yapf conflicts with isort for this docstring # yapf conflicts with isort for this docstring
...@@ -37,7 +37,7 @@ prompts = [ ...@@ -37,7 +37,7 @@ prompts = [
# Create a sampling params object. # Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, seed=0) sampling_params = SamplingParams(temperature=0.8, top_p=0.95, seed=0)
model_ref = "facebook/opt-125m" model_ref = os.path.join(models_path_prefix, "facebook/opt-125m")
tensorize_model_for_testing_script = os.path.join( tensorize_model_for_testing_script = os.path.join(
os.path.dirname(__file__), "tensorize_vllm_model_for_testing.py") os.path.dirname(__file__), "tensorize_vllm_model_for_testing.py")
...@@ -84,7 +84,7 @@ def test_load_with_tensorizer(mock_agent, tensorizer_config): ...@@ -84,7 +84,7 @@ def test_load_with_tensorizer(mock_agent, tensorizer_config):
@pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed") @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
def test_can_deserialize_s3(vllm_runner): def test_can_deserialize_s3(vllm_runner):
model_ref = "EleutherAI/pythia-1.4b" model_ref = os.path.join(models_path_prefix, "EleutherAI/pythia-1.4b")
tensorized_path = f"s3://tensorized/{model_ref}/fp16/model.tensors" tensorized_path = f"s3://tensorized/{model_ref}/fp16/model.tensors"
with vllm_runner(model_ref, with vllm_runner(model_ref,
...@@ -156,13 +156,14 @@ def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner, ...@@ -156,13 +156,14 @@ def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner,
def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path): def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
from huggingface_hub import snapshot_download # from huggingface_hub import snapshot_download
from examples.multilora_inference import (create_test_prompts, from examples.multilora_inference import (create_test_prompts,
process_requests) process_requests)
model_ref = "meta-llama/Llama-2-7b-hf" model_ref = os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf")
lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test") # lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
lora_path = os.path.join(models_path_prefix, "yard1/llama-2-7b-sql-lora-test")
test_prompts = create_test_prompts(lora_path) test_prompts = create_test_prompts(lora_path)
# Serialize model before deserializing and binding LoRA adapters # Serialize model before deserializing and binding LoRA adapters
...@@ -255,7 +256,7 @@ def test_raise_value_error_on_invalid_load_format(vllm_runner): ...@@ -255,7 +256,7 @@ def test_raise_value_error_on_invalid_load_format(vllm_runner):
reason="Requires 2 GPUs") reason="Requires 2 GPUs")
def test_tensorizer_with_tp_path_without_template(vllm_runner): def test_tensorizer_with_tp_path_without_template(vllm_runner):
with pytest.raises(ValueError): with pytest.raises(ValueError):
model_ref = "EleutherAI/pythia-1.4b" model_ref = os.path.join(models_path_prefix, "EleutherAI/pythia-1.4b")
tensorized_path = f"s3://tensorized/{model_ref}/fp16/model.tensors" tensorized_path = f"s3://tensorized/{model_ref}/fp16/model.tensors"
vllm_runner( vllm_runner(
...@@ -275,7 +276,7 @@ def test_tensorizer_with_tp_path_without_template(vllm_runner): ...@@ -275,7 +276,7 @@ def test_tensorizer_with_tp_path_without_template(vllm_runner):
reason="Requires 2 GPUs") reason="Requires 2 GPUs")
def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(vllm_runner, def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(vllm_runner,
tmp_path): tmp_path):
model_ref = "EleutherAI/pythia-1.4b" model_ref = os.path.join(models_path_prefix, "EleutherAI/pythia-1.4b")
# record outputs from un-sharded un-tensorized model # record outputs from un-sharded un-tensorized model
with vllm_runner( with vllm_runner(
model_ref, model_ref,
...@@ -324,7 +325,7 @@ def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(vllm_runner, ...@@ -324,7 +325,7 @@ def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(vllm_runner,
def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path): def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path):
gc.collect() gc.collect()
torch.cuda.empty_cache() torch.cuda.empty_cache()
model_ref = "facebook/opt-125m" model_ref = os.path.join(models_path_prefix, "facebook/opt-125m")
model_path = tmp_path / (model_ref + ".tensors") model_path = tmp_path / (model_ref + ".tensors")
config = TensorizerConfig(tensorizer_uri=str(model_path)) config = TensorizerConfig(tensorizer_uri=str(model_path))
......
...@@ -9,6 +9,8 @@ import pytest ...@@ -9,6 +9,8 @@ import pytest
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.sequence import Sequence from vllm.sequence import Sequence
from vllm.transformers_utils.tokenizer_group import TokenizerGroup from vllm.transformers_utils.tokenizer_group import TokenizerGroup
from utils import models_path_prefix
import os
# Make two prefixes with different first blocks. # Make two prefixes with different first blocks.
prefix_start = [("You are an expert"), ("You are a")] prefix_start = [("You are an expert"), ("You are a")]
...@@ -36,7 +38,7 @@ def flatten_2d(li): ...@@ -36,7 +38,7 @@ def flatten_2d(li):
return [lss for ls in li for lss in ls] return [lss for ls in li for lss in ls]
@pytest.mark.parametrize("model", ["facebook/opt-125m"]) @pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")])
@pytest.mark.parametrize("block_size", [16]) @pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("max_num_seqs", [256]) @pytest.mark.parametrize("max_num_seqs", [256])
@pytest.mark.parametrize("concurrent_lora_int_ids", @pytest.mark.parametrize("concurrent_lora_int_ids",
...@@ -45,7 +47,7 @@ def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int, ...@@ -45,7 +47,7 @@ def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int,
concurrent_lora_int_ids: List[Optional[int]]): concurrent_lora_int_ids: List[Optional[int]]):
tokenizer = TokenizerGroup( tokenizer = TokenizerGroup(
tokenizer_id="facebook/opt-125m", tokenizer_id=os.path.join(models_path_prefix, "facebook/opt-125m"),
enable_lora=False, enable_lora=False,
max_num_seqs=max_num_seqs, max_num_seqs=max_num_seqs,
max_input_length=None, max_input_length=None,
......
import pytest import pytest
import os
from vllm.config import ModelConfig from vllm.config import ModelConfig
from utils import models_path_prefix
MODEL_IDS_EXPECTED = [ MODEL_IDS_EXPECTED = [
("Qwen/Qwen1.5-7B", 32768), (os.path.join(models_path_prefix, "Qwen/Qwen1.5-7B"), 32768),
("mistralai/Mistral-7B-v0.1", 4096), (os.path.join(models_path_prefix, "mistralai/Mistral-7B-v0.1"), 4096),
("mistralai/Mistral-7B-Instruct-v0.2", 32768), (os.path.join(models_path_prefix, "mistralai/Mistral-7B-Instruct-v0.2"), 32768),
] ]
...@@ -31,8 +33,8 @@ def test_get_sliding_window(): ...@@ -31,8 +33,8 @@ def test_get_sliding_window():
# For Qwen1.5/Qwen2, get_sliding_window() should be None # For Qwen1.5/Qwen2, get_sliding_window() should be None
# when use_sliding_window is False. # when use_sliding_window is False.
qwen2_model_config = ModelConfig( qwen2_model_config = ModelConfig(
"Qwen/Qwen1.5-7B", os.path.join(models_path_prefix, "Qwen/Qwen1.5-7B"),
"Qwen/Qwen1.5-7B", os.path.join(models_path_prefix, "Qwen/Qwen1.5-7B"),
tokenizer_mode="auto", tokenizer_mode="auto",
trust_remote_code=False, trust_remote_code=False,
seed=0, seed=0,
...@@ -48,8 +50,8 @@ def test_get_sliding_window(): ...@@ -48,8 +50,8 @@ def test_get_sliding_window():
assert qwen2_model_config.get_sliding_window() == TEST_SLIDING_WINDOW assert qwen2_model_config.get_sliding_window() == TEST_SLIDING_WINDOW
mistral_model_config = ModelConfig( mistral_model_config = ModelConfig(
"mistralai/Mistral-7B-v0.1", os.path.join(models_path_prefix, "mistralai/Mistral-7B-v0.1"),
"mistralai/Mistral-7B-v0.1", os.path.join(models_path_prefix, "mistralai/Mistral-7B-v0.1"),
tokenizer_mode="auto", tokenizer_mode="auto",
trust_remote_code=False, trust_remote_code=False,
seed=0, seed=0,
...@@ -69,8 +71,8 @@ def test_rope_customization(): ...@@ -69,8 +71,8 @@ def test_rope_customization():
LONGCHAT_ROPE_SCALING = {"type": "linear", "factor": 8.0} LONGCHAT_ROPE_SCALING = {"type": "linear", "factor": 8.0}
llama_model_config = ModelConfig( llama_model_config = ModelConfig(
"meta-llama/Meta-Llama-3-8B-Instruct", os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B-Instruct"),
"meta-llama/Meta-Llama-3-8B-Instruct", os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B-Instruct"),
tokenizer_mode="auto", tokenizer_mode="auto",
trust_remote_code=False, trust_remote_code=False,
dtype="float16", dtype="float16",
...@@ -81,8 +83,8 @@ def test_rope_customization(): ...@@ -81,8 +83,8 @@ def test_rope_customization():
assert llama_model_config.max_model_len == 8192 assert llama_model_config.max_model_len == 8192
llama_model_config = ModelConfig( llama_model_config = ModelConfig(
"meta-llama/Meta-Llama-3-8B-Instruct", os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B-Instruct"),
"meta-llama/Meta-Llama-3-8B-Instruct", os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B-Instruct"),
tokenizer_mode="auto", tokenizer_mode="auto",
trust_remote_code=False, trust_remote_code=False,
dtype="float16", dtype="float16",
...@@ -97,8 +99,8 @@ def test_rope_customization(): ...@@ -97,8 +99,8 @@ def test_rope_customization():
assert llama_model_config.max_model_len == 16384 assert llama_model_config.max_model_len == 16384
longchat_model_config = ModelConfig( longchat_model_config = ModelConfig(
"lmsys/longchat-13b-16k", os.path.join(models_path_prefix, "lmsys/longchat-13b-16k"),
"lmsys/longchat-13b-16k", os.path.join(models_path_prefix, "lmsys/longchat-13b-16k"),
tokenizer_mode="auto", tokenizer_mode="auto",
trust_remote_code=False, trust_remote_code=False,
dtype="float16", dtype="float16",
...@@ -111,8 +113,8 @@ def test_rope_customization(): ...@@ -111,8 +113,8 @@ def test_rope_customization():
assert longchat_model_config.max_model_len == 16384 assert longchat_model_config.max_model_len == 16384
longchat_model_config = ModelConfig( longchat_model_config = ModelConfig(
"lmsys/longchat-13b-16k", os.path.join(models_path_prefix, "lmsys/longchat-13b-16k"),
"lmsys/longchat-13b-16k", os.path.join(models_path_prefix, "lmsys/longchat-13b-16k"),
tokenizer_mode="auto", tokenizer_mode="auto",
trust_remote_code=False, trust_remote_code=False,
dtype="float16", dtype="float16",
......
...@@ -9,6 +9,8 @@ import gc ...@@ -9,6 +9,8 @@ import gc
import torch import torch
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from utils import models_path_prefix
import os
def test_duplicated_ignored_sequence_group(): def test_duplicated_ignored_sequence_group():
...@@ -17,7 +19,7 @@ def test_duplicated_ignored_sequence_group(): ...@@ -17,7 +19,7 @@ def test_duplicated_ignored_sequence_group():
sampling_params = SamplingParams(temperature=0.01, sampling_params = SamplingParams(temperature=0.01,
top_p=0.1, top_p=0.1,
max_tokens=256) max_tokens=256)
llm = LLM(model="facebook/opt-125m", llm = LLM(model=os.path.join(models_path_prefix, "facebook/opt-125m"),
max_num_batched_tokens=4096, max_num_batched_tokens=4096,
tensor_parallel_size=1) tensor_parallel_size=1)
prompts = ["This is a short prompt", "This is a very long prompt " * 1000] prompts = ["This is a short prompt", "This is a very long prompt " * 1000]
...@@ -30,7 +32,7 @@ def test_max_tokens_none(): ...@@ -30,7 +32,7 @@ def test_max_tokens_none():
sampling_params = SamplingParams(temperature=0.01, sampling_params = SamplingParams(temperature=0.01,
top_p=0.1, top_p=0.1,
max_tokens=None) max_tokens=None)
llm = LLM(model="facebook/opt-125m", llm = LLM(model=os.path.join(models_path_prefix, "facebook/opt-125m"),
max_num_batched_tokens=4096, max_num_batched_tokens=4096,
tensor_parallel_size=1) tensor_parallel_size=1)
prompts = ["Just say hello!"] prompts = ["Just say hello!"]
...@@ -40,7 +42,7 @@ def test_max_tokens_none(): ...@@ -40,7 +42,7 @@ def test_max_tokens_none():
def test_gc(): def test_gc():
llm = LLM("facebook/opt-125m", enforce_eager=True) llm = LLM(os.path.join(models_path_prefix, "facebook/opt-125m"), enforce_eager=True)
del llm del llm
gc.collect() gc.collect()
...@@ -55,7 +57,7 @@ def test_gc(): ...@@ -55,7 +57,7 @@ def test_gc():
def test_model_from_modelscope(monkeypatch): def test_model_from_modelscope(monkeypatch):
# model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary # model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary
MODELSCOPE_MODEL_NAME = "qwen/Qwen1.5-0.5B-Chat" MODELSCOPE_MODEL_NAME = os.path.join(models_path_prefix, "qwen/Qwen1.5-0.5B-Chat")
monkeypatch.setenv("VLLM_USE_MODELSCOPE", "True") monkeypatch.setenv("VLLM_USE_MODELSCOPE", "True")
try: try:
llm = LLM(model=MODELSCOPE_MODEL_NAME) llm = LLM(model=MODELSCOPE_MODEL_NAME)
......
...@@ -9,6 +9,7 @@ from huggingface_hub import snapshot_download ...@@ -9,6 +9,7 @@ from huggingface_hub import snapshot_download
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.model_executor.model_loader.loader import ShardedStateLoader from vllm.model_executor.model_loader.loader import ShardedStateLoader
from utils import models_path_prefix
prompts = [ prompts = [
"Hello, my name is", "Hello, my name is",
...@@ -46,9 +47,10 @@ def test_filter_subtensors(): ...@@ -46,9 +47,10 @@ def test_filter_subtensors():
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def llama_2_7b_files(): def llama_2_7b_files():
with TemporaryDirectory() as cache_dir: with TemporaryDirectory() as cache_dir:
input_dir = snapshot_download("meta-llama/Llama-2-7b-hf", # input_dir = snapshot_download("meta-llama/Llama-2-7b-hf",
cache_dir=cache_dir, # cache_dir=cache_dir,
ignore_patterns="*.bin*") # ignore_patterns="*.bin*")
input_dir = os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf")
yield input_dir yield input_dir
......
...@@ -10,6 +10,7 @@ from vllm.utils import (FlexibleArgumentParser, deprecate_kwargs, ...@@ -10,6 +10,7 @@ from vllm.utils import (FlexibleArgumentParser, deprecate_kwargs,
get_open_port, merge_async_iterators) get_open_port, merge_async_iterators)
from .utils import error_on_warning from .utils import error_on_warning
from .utils import models_path_prefix
@pytest.mark.asyncio @pytest.mark.asyncio
...@@ -150,17 +151,18 @@ def test_underscore_to_dash(parser): ...@@ -150,17 +151,18 @@ def test_underscore_to_dash(parser):
def test_mixed_usage(parser): def test_mixed_usage(parser):
args = parser.parse_args([ args = parser.parse_args([
'--image_input_type', 'image_features', '--model-name', '--image_input_type', 'image_features', '--model-name',
'facebook/opt-125m' os.path.join(models_path_prefix, 'facebook/opt-125m')
]) ])
assert args.image_input_type == 'image_features' assert args.image_input_type == 'image_features'
assert args.model_name == 'facebook/opt-125m' assert args.model_name == os.path.join(models_path_prefix, 'facebook/opt-125m')
def test_with_equals_sign(parser): def test_with_equals_sign(parser):
model_name_with_path = os.path.join(models_path_prefix, 'facebook/opt-125m')
args = parser.parse_args( args = parser.parse_args(
['--image_input_type=pixel_values', '--model-name=facebook/opt-125m']) ['--image_input_type=pixel_values', f'--model-name={model_name_with_path}'])
assert args.image_input_type == 'pixel_values' assert args.image_input_type == 'pixel_values'
assert args.model_name == 'facebook/opt-125m' assert args.model_name == os.path.join(models_path_prefix, 'facebook/opt-125m')
def test_with_int_value(parser): def test_with_int_value(parser):
......
from copy import deepcopy from copy import deepcopy
import os
from transformers import AutoTokenizer from transformers import AutoTokenizer
from vllm.transformers_utils.tokenizer import get_cached_tokenizer from vllm.transformers_utils.tokenizer import get_cached_tokenizer
from ..utils import models_path_prefix
def test_cached_tokenizer(): def test_cached_tokenizer():
reference_tokenizer = AutoTokenizer.from_pretrained("gpt2") reference_tokenizer = AutoTokenizer.from_pretrained(os.path.join(models_path_prefix, "gpt2"))
reference_tokenizer.add_special_tokens({"cls_token": "<CLS>"}) reference_tokenizer.add_special_tokens({"cls_token": "<CLS>"})
reference_tokenizer.add_special_tokens( reference_tokenizer.add_special_tokens(
{"additional_special_tokens": ["<SEP>"]}) {"additional_special_tokens": ["<SEP>"]})
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment