Commit 3c9817d2 authored by zhuwenwen's avatar zhuwenwen
Browse files

add VLLM_OPTEST_MODELS_PATH/OPTEST_MODELS_PATH to load models from local path...

add VLLM_OPTEST_MODELS_PATH/OPTEST_MODELS_PATH to load models from local path instead of Hugging Face Hub.
parent 49204f68
import pytest
import os
from vllm import SamplingParams
from .conftest import get_output_from_llm_generator
from ...utils import models_path_prefix
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
"model": "JackFram/llama-68m",
"speculative_model": "JackFram/llama-68m",
"model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"speculative_model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 5,
# Required for spec decode.
......@@ -47,8 +49,8 @@ def test_spec_decode_xfail_chunked_prefill(test_llm_generator):
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
"model": "meta-llama/Llama-2-7b-chat-hf",
"speculative_model": "JackFram/llama-68m",
"model": os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-chat-hf"),
"speculative_model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 5,
# Required for spec decode.
......@@ -97,8 +99,8 @@ def test_spec_decode_xfail_spec_max_model_len(test_llm_generator):
@pytest.mark.parametrize("common_llm_kwargs", [{
"model": "JackFram/llama-68m",
"speculative_model": "JackFram/llama-68m",
"model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"speculative_model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 5,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
......
......@@ -20,14 +20,16 @@ correctess for the target model outputs.
"""
import pytest
import os
from .conftest import run_equality_correctness_test
from ...utils import models_path_prefix
# main model
MAIN_MODEL = "JackFram/llama-68m"
MAIN_MODEL = os.path.join(models_path_prefix, "JackFram/llama-68m")
# speculative model
SPEC_MODEL = "abhigoyal/vllm-eagle-llama-68m-random"
SPEC_MODEL = os.path.join(models_path_prefix, "abhigoyal/vllm-eagle-llama-68m-random")
# max. number of speculative tokens: this corresponds to
# num_heads in the config.json of the speculator model.
......
......@@ -3,10 +3,12 @@ other features, e.g. cuda graphs.
"""
import pytest
import os
from .conftest import run_equality_correctness_test
from ...utils import models_path_prefix
MAIN_MODEL = "JackFram/llama-68m"
MAIN_MODEL = os.path.join(models_path_prefix, "JackFram/llama-68m")
@pytest.mark.parametrize(
......@@ -17,14 +19,14 @@ MAIN_MODEL = "JackFram/llama-68m"
# Verify equality when cuda graphs allowed.
"enforce_eager": False,
"model_name": "JackFram/llama-68m",
"model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
}])
@pytest.mark.parametrize(
"per_test_common_llm_kwargs",
[
{
# Identical models.
"speculative_model": "JackFram/llama-68m",
"speculative_model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 5,
},
])
......@@ -53,7 +55,7 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
"model_name": "JackFram/llama-160m",
"model_name": os.path.join(models_path_prefix, "JackFram/llama-160m"),
# Skip cuda graph recording for fast test.
"enforce_eager": True,
......@@ -63,7 +65,7 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [
{
"speculative_model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit",
"speculative_model": os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"),
"num_speculative_tokens": 5,
},
])
......
......@@ -4,10 +4,12 @@ tensor parallelism.
import pytest
import torch
import os
from vllm.utils import is_hip
from .conftest import run_equality_correctness_test_tp
from ...utils import models_path_prefix
@pytest.mark.skipif(torch.cuda.device_count() < 2,
......@@ -28,7 +30,7 @@ from .conftest import run_equality_correctness_test_tp
@pytest.mark.parametrize("test_llm_kwargs", [
[
"--speculative-model",
"JackFram/llama-68m",
os.path.join(models_path_prefix, "JackFram/llama-68m"),
"--num-speculative-tokens",
"3",
],
......@@ -56,7 +58,7 @@ def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs,
"""
if is_hip():
pytest.skip("hip is not well-supported yet")
run_equality_correctness_test_tp("JackFram/llama-68m",
run_equality_correctness_test_tp(os.path.join(models_path_prefix, "JackFram/llama-68m"),
common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
......@@ -89,15 +91,15 @@ def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs,
@pytest.mark.parametrize("model, test_llm_kwargs",
[("JackFram/llama-68m", [
"--speculative-model",
"JackFram/llama-68m",
os.path.join(models_path_prefix, "JackFram/llama-68m"),
"--num_speculative-tokens",
"5",
"--speculative-draft-tensor-parallel-size",
"1",
]),
("ibm-granite/granite-3b-code-instruct", [
(os.path.join(models_path_prefix, "ibm-granite/granite-3b-code-instruct"), [
"--speculative-model",
"ibm-granite/granite-3b-code-instruct",
os.path.join(models_path_prefix, "ibm-granite/granite-3b-code-instruct"),
"--num_speculative-tokens",
"5",
"--speculative-draft-tensor-parallel-size",
......
......@@ -5,11 +5,13 @@ tensor parallelism.
import openai
import pytest
import torch
import os
from .conftest import run_equality_correctness_test_tp
from ...utils import models_path_prefix
MAIN_MODEL = "JackFram/llama-68m"
SPEC_MODEL = "JackFram/llama-68m"
MAIN_MODEL = os.path.join(models_path_prefix, "JackFram/llama-68m")
SPEC_MODEL = os.path.join(models_path_prefix, "JackFram/llama-68m")
@pytest.mark.skipif(torch.cuda.device_count() < 4,
......
from itertools import cycle
import pytest
import os
from vllm import SamplingParams
from .conftest import run_equality_correctness_test
from ...utils import models_path_prefix
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
"model_name": "JackFram/llama-68m",
"model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
# Skip cuda graph recording for fast test.
"enforce_eager": True,
......@@ -22,11 +24,11 @@ from .conftest import run_equality_correctness_test
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs",
[{
"speculative_model": "JackFram/llama-160m",
"speculative_model": os.path.join(models_path_prefix, "JackFram/llama-160m"),
"num_speculative_tokens": 3,
"disable_logprobs_during_spec_decoding": False,
}, {
"speculative_model": "JackFram/llama-160m",
"speculative_model": os.path.join(models_path_prefix, "JackFram/llama-160m"),
"num_speculative_tokens": 3,
"disable_logprobs_during_spec_decoding": True,
}])
......@@ -63,7 +65,7 @@ def test_logprobs_equality(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
"model_name": "JackFram/llama-68m",
"model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
# Skip cuda graph recording for fast test.
"enforce_eager": True,
......@@ -75,11 +77,11 @@ def test_logprobs_equality(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs",
[{
"speculative_model": "JackFram/llama-160m",
"speculative_model": os.path.join(models_path_prefix, "JackFram/llama-160m"),
"num_speculative_tokens": 3,
"disable_logprobs_during_spec_decoding": False,
}, {
"speculative_model": "JackFram/llama-160m",
"speculative_model": os.path.join(models_path_prefix, "JackFram/llama-160m"),
"num_speculative_tokens": 6,
"disable_logprobs_during_spec_decoding": False,
}])
......@@ -115,7 +117,7 @@ def test_logprobs_different_k(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
"model_name": "JackFram/llama-68m",
"model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
# Skip cuda graph recording for fast test.
"enforce_eager": True,
......@@ -128,7 +130,7 @@ def test_logprobs_different_k(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize(
"test_llm_kwargs",
[{
"speculative_model": "JackFram/llama-160m",
"speculative_model": os.path.join(models_path_prefix, "JackFram/llama-160m"),
"num_speculative_tokens": 3,
"disable_logprobs_during_spec_decoding": False,
......@@ -169,7 +171,7 @@ def test_logprobs_when_skip_speculation(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
"model_name": "JackFram/llama-68m",
"model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
# Skip cuda graph recording for fast test.
"enforce_eager": True,
......@@ -181,7 +183,7 @@ def test_logprobs_when_skip_speculation(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs",
[{
"speculative_model": "JackFram/llama-160m",
"speculative_model": os.path.join(models_path_prefix, "JackFram/llama-160m"),
"num_speculative_tokens": 3,
"disable_logprobs_during_spec_decoding": False,
}])
......@@ -248,7 +250,7 @@ def test_logprobs_temp_1(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
"model_name": "JackFram/llama-160m",
"model_name": os.path.join(models_path_prefix, "JackFram/llama-160m"),
# Skip cuda graph recording for fast test.
"enforce_eager": True,
# Required for spec decode.
......@@ -258,7 +260,7 @@ def test_logprobs_temp_1(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs",
[{
"speculative_model": "JackFram/llama-68m",
"speculative_model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 3,
"disable_logprobs_during_spec_decoding": True,
}])
......
......@@ -20,16 +20,18 @@ correctess for the target model outputs.
"""
import pytest
import os
from .conftest import run_equality_correctness_test
from ...utils import models_path_prefix
# main model
# lmsys/vicuna-7b-v1.3 was to be used but it's causing
# OOM in CI pipeline, so using a smaller model.
MAIN_MODEL = "JackFram/llama-68m"
MAIN_MODEL = os.path.join(models_path_prefix, "JackFram/llama-68m")
# speculative model
SPEC_MODEL = "abhigoyal/vllm-medusa-llama-68m-random"
SPEC_MODEL = os.path.join(models_path_prefix, "abhigoyal/vllm-medusa-llama-68m-random")
# max number of speculative tokens: this corresponds to
# num_heads in the config.json of the speculator model.
......
......@@ -22,16 +22,18 @@ correctness for the target model outputs.
from unittest.mock import patch
import pytest
import os
from vllm.model_executor.layers.vocab_parallel_embedding import pad_vocab_size
from .conftest import run_equality_correctness_test
from ...utils import models_path_prefix
# main model
MAIN_MODEL = "JackFram/llama-160m"
MAIN_MODEL = os.path.join(models_path_prefix, "JackFram/llama-160m")
# speculative model
SPEC_MODEL = "ibm-fms/llama-160m-accelerator"
SPEC_MODEL = os.path.join(models_path_prefix, "ibm-fms/llama-160m-accelerator")
# max. number of speculative tokens: this corresponds to
# n_predict in the config.json of the speculator model.
......
......@@ -37,6 +37,7 @@ greedy-equality tests for those batch sizes/prompts.
from itertools import cycle
import pytest
import os
from transformers import AutoTokenizer
from vllm import SamplingParams
......@@ -44,6 +45,7 @@ from vllm import SamplingParams
from ...utils import fork_new_process_for_each_test
from .conftest import (get_output_from_llm_generator,
run_equality_correctness_test)
from ...utils import models_path_prefix
@pytest.mark.parametrize(
......@@ -51,7 +53,7 @@ from .conftest import (get_output_from_llm_generator,
[{
# Use a small model for a fast test.
# Note this is repeated in the test body; to initialize a tokenizer.
"model": "JackFram/llama-68m",
"model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
# Skip cuda graph recording for fast test.
"enforce_eager": True,
......@@ -63,7 +65,7 @@ from .conftest import (get_output_from_llm_generator,
"per_test_common_llm_kwargs",
[
{
"speculative_model": "JackFram/llama-68m",
"speculative_model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 5,
},
{
......@@ -111,7 +113,7 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator,
for token_ids in batch_token_ids] == ([output_len] * batch_size)
# Expect detokenized string to match.
tok = AutoTokenizer.from_pretrained("JackFram/llama-68m")
tok = AutoTokenizer.from_pretrained(os.path.join(models_path_prefix, "JackFram/llama-68m"))
for actual_tokens, actual_token_ids in zip(batch_tokens, batch_token_ids):
expected_tokens = tok.decode(actual_token_ids)
print(f"{actual_token_ids=}")
......@@ -136,16 +138,16 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator,
# Try two different tiny base models.
# Note that one is equal to the draft model, another isn't.
{
"model_name": "JackFram/llama-68m",
"model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
},
{
"model_name": "JackFram/llama-160m",
"model_name": os.path.join(models_path_prefix, "JackFram/llama-160m"),
},
])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": "JackFram/llama-68m",
"speculative_model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 5,
},
])
......@@ -202,16 +204,16 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
# Try two different tiny base models.
# Note that one is equal to the draft model, another isn't.
{
"model_name": "JackFram/llama-68m",
"model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
},
{
"model_name": "JackFram/llama-160m",
"model_name": os.path.join(models_path_prefix, "JackFram/llama-160m"),
},
])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": "JackFram/llama-68m",
"speculative_model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 5,
},
])
......@@ -256,16 +258,16 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(
# Try two different tiny base models.
# Note that one is equal to the draft model, another isn't.
{
"model_name": "JackFram/llama-68m",
"model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
},
{
"model_name": "JackFram/llama-160m",
"model_name": os.path.join(models_path_prefix, "JackFram/llama-160m"),
},
])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": "JackFram/llama-68m",
"speculative_model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 5,
},
])
......@@ -298,7 +300,7 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len(
"common_llm_kwargs",
[{
# A "real" model (not tiny).
"model_name": "meta-llama/Llama-2-7b-chat-hf",
"model_name": os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-chat-hf"),
# Skip cuda graph recording for fast test.
"enforce_eager": True,
......@@ -313,7 +315,7 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len(
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": "JackFram/llama-68m",
"speculative_model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 5,
},
])
......@@ -348,7 +350,7 @@ def test_spec_decode_e2e_greedy_correctness_real_model_bs1(
"common_llm_kwargs",
[{
# A "real" model (not tiny).
"model_name": "meta-llama/Llama-2-7b-chat-hf",
"model_name": os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-chat-hf"),
# Skip cuda graph recording for fast test.
"enforce_eager": True,
......@@ -363,7 +365,7 @@ def test_spec_decode_e2e_greedy_correctness_real_model_bs1(
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": "JackFram/llama-68m",
"speculative_model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 5,
},
])
......@@ -410,13 +412,13 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [
{
"model_name": "JackFram/llama-160m",
"model_name": os.path.join(models_path_prefix, "JackFram/llama-160m"),
},
])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": "JackFram/llama-68m",
"speculative_model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 5,
},
])
......@@ -450,7 +452,7 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption(
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
"model_name": "JackFram/llama-160m",
"model_name": os.path.join(models_path_prefix, "JackFram/llama-160m"),
# Skip cuda graph recording for fast test.
"enforce_eager": True,
......@@ -476,7 +478,7 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption(
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": "JackFram/llama-68m",
"speculative_model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 5,
},
])
......@@ -510,7 +512,7 @@ def test_spec_decode_different_block_size(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
"model_name": "JackFram/llama-160m",
"model_name": os.path.join(models_path_prefix, "JackFram/llama-160m"),
# Skip cuda graph recording for fast test.
"enforce_eager": True,
......@@ -524,7 +526,7 @@ def test_spec_decode_different_block_size(vllm_runner, common_llm_kwargs,
"test_llm_kwargs",
[
{
"speculative_model": "JackFram/llama-68m",
"speculative_model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 5,
# Artificially limit the draft model max model len; this forces vLLM
......@@ -566,7 +568,7 @@ def test_skip_speculation(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
"model_name": "JackFram/llama-160m",
"model_name": os.path.join(models_path_prefix, "JackFram/llama-160m"),
# Skip cuda graph recording for fast test.
"enforce_eager": True,
......@@ -578,7 +580,7 @@ def test_skip_speculation(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": "JackFram/llama-68m",
"speculative_model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 5,
"speculative_disable_by_batch_size": 2,
},
......@@ -607,7 +609,7 @@ def test_disable_speculation(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
"model_name": "JackFram/llama-68m",
"model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
# Skip cuda graph recording for fast test.
"enforce_eager": True,
......@@ -621,7 +623,7 @@ def test_disable_speculation(vllm_runner, common_llm_kwargs,
"test_llm_kwargs",
[
{
"speculative_model": "JackFram/llama-68m",
"speculative_model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": k,
}
# Try a range of common k, as well as large speculation.
......@@ -656,7 +658,7 @@ def test_many_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
"model_name": "JackFram/llama-160m",
"model_name": os.path.join(models_path_prefix, "JackFram/llama-160m"),
# Skip cuda graph recording for fast test.
"enforce_eager": True,
......@@ -670,7 +672,7 @@ def test_many_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
"test_llm_kwargs",
[
{
"speculative_model": "JackFram/llama-68m",
"speculative_model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": k,
"spec_decoding_acceptance_method": "typical_acceptance_sampler"
}
......
......@@ -25,8 +25,10 @@ for the target model outputs.
"""
import pytest
import os
from .conftest import run_equality_correctness_test
from ...utils import models_path_prefix
@pytest.mark.parametrize(
......@@ -43,7 +45,7 @@ from .conftest import run_equality_correctness_test
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [
{
"model_name": "JackFram/llama-68m",
"model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
},
])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
......@@ -90,7 +92,7 @@ def test_ngram_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [
{
"model_name": "JackFram/llama-68m",
"model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
},
])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
......@@ -151,7 +153,7 @@ def test_ngram_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [
{
"model_name": "JackFram/llama-160m",
"model_name": os.path.join(models_path_prefix, "JackFram/llama-160m"),
},
])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
......@@ -191,7 +193,7 @@ def test_ngram_e2e_greedy_correctness_with_preemption(
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
"model_name": "JackFram/llama-68m",
"model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
# Skip cuda graph recording for fast test.
"enforce_eager": True,
......@@ -250,7 +252,7 @@ def test_ngram_different_k(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
"model_name": "JackFram/llama-68m",
"model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
# Skip cuda graph recording for fast test.
"enforce_eager": True,
......
import pytest
import os
from .conftest import run_equality_correctness_test
from ...utils import models_path_prefix
# main model
MAIN_MODEL = "JackFram/llama-68m"
MAIN_MODEL = os.path.join(models_path_prefix, "JackFram/llama-68m")
# speculative model
SPEC_MODEL = "JackFram/llama-160m"
SPEC_MODEL = os.path.join(models_path_prefix, "JackFram/llama-160m")
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
"model_name": "JackFram/llama-68m",
"model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
# Skip cuda graph recording for fast test.
"enforce_eager": True,
......@@ -21,7 +23,7 @@ SPEC_MODEL = "JackFram/llama-160m"
"use_v2_block_manager": True,
# speculative model
"speculative_model": "JackFram/llama-160m",
"speculative_model": os.path.join(models_path_prefix, "JackFram/llama-160m"),
# num speculative tokens
"num_speculative_tokens": 3,
......
......@@ -4,6 +4,7 @@ from unittest.mock import MagicMock
import pytest
import torch
import os
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.model_executor.utils import set_random_seed
......@@ -17,6 +18,7 @@ from vllm.worker.worker import Worker
from .utils import (assert_logprobs_dict_allclose, create_batch,
create_seq_group_metadata_from_prompts, create_worker,
patch_execute_model_with_seeds, zero_kv_cache)
from ..utils import models_path_prefix
@pytest.mark.parametrize('num_steps', list(range(1, 17)))
......@@ -78,7 +80,7 @@ def test_same_output_for_single_step():
worker for num_steps=1.
"""
seed = 100
model_name = 'JackFram/llama-68m'
model_name = os.path.join(models_path_prefix, 'JackFram/llama-68m')
block_size = 32
num_gpu_blocks = 2048 // block_size
......@@ -163,7 +165,7 @@ def test_same_output_for_multi_step():
then runs the worker num_steps times, and compares the output.
"""
seed = 100
model_name = 'JackFram/llama-68m'
model_name = os.path.join(models_path_prefix, 'JackFram/llama-68m')
block_size = 16
num_gpu_blocks = 2048 // block_size
......@@ -291,7 +293,7 @@ def test_multi_step_with_batch_expansion_correct_output():
expanded batch is then used for predicting the next tokens.
"""
seed = 100
model_name = 'JackFram/llama-68m'
model_name = os.path.join(models_path_prefix, 'JackFram/llama-68m')
block_size = 16
num_gpu_blocks = 2048 // block_size
......@@ -385,7 +387,7 @@ def test_multi_step_with_batch_expansion_incorrect_output():
the sequence ID is specified incorrectly.
"""
seed = 100
model_name = 'JackFram/llama-68m'
model_name = os.path.join(models_path_prefix, 'JackFram/llama-68m')
block_size = 16
num_gpu_blocks = 2048 // block_size
......@@ -652,7 +654,7 @@ def test_use_draft_model_runner_advance_step():
when applicable.
"""
seed = 100
model_name = 'JackFram/llama-68m'
model_name = os.path.join(models_path_prefix, 'JackFram/llama-68m')
k = 5
batch_size = 32
......
import torch
import os
from vllm.sequence import ExecuteModelRequest
from vllm.spec_decode.ngram_worker import NGramWorker
from vllm.spec_decode.top1_proposer import Top1Proposer
from .utils import create_seq_group_metadata_from_prompts, create_worker
from ..utils import models_path_prefix
def test_ngram_algo_correctness_for_single_no_match():
......@@ -15,7 +17,7 @@ def test_ngram_algo_correctness_for_single_no_match():
block_size = 32
num_gpu_blocks = 2048 // block_size
seed = 100
model_name = 'JackFram/llama-68m'
model_name = os.path.join(models_path_prefix, 'JackFram/llama-68m')
vocab_size = 32_000
device = 'cuda:0'
......@@ -73,7 +75,7 @@ def test_ngram_algo_correctness_for_batches_not_match_all():
block_size = 32
num_gpu_blocks = 2048 // block_size
seed = 100
model_name = 'JackFram/llama-68m'
model_name = os.path.join(models_path_prefix, 'JackFram/llama-68m')
vocab_size = 32_000
device = 'cuda:0'
......@@ -153,7 +155,7 @@ def test_ngram_algo_correctness_for_batches_match_all():
block_size = 32
num_gpu_blocks = 2048 // block_size
seed = 100
model_name = 'JackFram/llama-68m'
model_name = os.path.join(models_path_prefix, 'JackFram/llama-68m')
vocab_size = 32_000
device = 'cuda:0'
......
......@@ -22,7 +22,7 @@ from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig,
tensorize_vllm_model)
from ..conftest import VllmRunner
from ..utils import RemoteOpenAIServer
from ..utils import RemoteOpenAIServer, models_path_prefix
from .conftest import retry_until_skip
# yapf conflicts with isort for this docstring
......@@ -37,7 +37,7 @@ prompts = [
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, seed=0)
model_ref = "facebook/opt-125m"
model_ref = os.path.join(models_path_prefix, "facebook/opt-125m")
tensorize_model_for_testing_script = os.path.join(
os.path.dirname(__file__), "tensorize_vllm_model_for_testing.py")
......@@ -84,7 +84,7 @@ def test_load_with_tensorizer(mock_agent, tensorizer_config):
@pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
def test_can_deserialize_s3(vllm_runner):
model_ref = "EleutherAI/pythia-1.4b"
model_ref = os.path.join(models_path_prefix, "EleutherAI/pythia-1.4b")
tensorized_path = f"s3://tensorized/{model_ref}/fp16/model.tensors"
with vllm_runner(model_ref,
......@@ -156,13 +156,14 @@ def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner,
def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
from huggingface_hub import snapshot_download
# from huggingface_hub import snapshot_download
from examples.multilora_inference import (create_test_prompts,
process_requests)
model_ref = "meta-llama/Llama-2-7b-hf"
lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
model_ref = os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf")
# lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
lora_path = os.path.join(models_path_prefix, "yard1/llama-2-7b-sql-lora-test")
test_prompts = create_test_prompts(lora_path)
# Serialize model before deserializing and binding LoRA adapters
......@@ -255,7 +256,7 @@ def test_raise_value_error_on_invalid_load_format(vllm_runner):
reason="Requires 2 GPUs")
def test_tensorizer_with_tp_path_without_template(vllm_runner):
with pytest.raises(ValueError):
model_ref = "EleutherAI/pythia-1.4b"
model_ref = os.path.join(models_path_prefix, "EleutherAI/pythia-1.4b")
tensorized_path = f"s3://tensorized/{model_ref}/fp16/model.tensors"
vllm_runner(
......@@ -275,7 +276,7 @@ def test_tensorizer_with_tp_path_without_template(vllm_runner):
reason="Requires 2 GPUs")
def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(vllm_runner,
tmp_path):
model_ref = "EleutherAI/pythia-1.4b"
model_ref = os.path.join(models_path_prefix, "EleutherAI/pythia-1.4b")
# record outputs from un-sharded un-tensorized model
with vllm_runner(
model_ref,
......@@ -324,7 +325,7 @@ def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(vllm_runner,
def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path):
gc.collect()
torch.cuda.empty_cache()
model_ref = "facebook/opt-125m"
model_ref = os.path.join(models_path_prefix, "facebook/opt-125m")
model_path = tmp_path / (model_ref + ".tensors")
config = TensorizerConfig(tensorizer_uri=str(model_path))
......
......@@ -9,6 +9,8 @@ import pytest
from vllm.lora.request import LoRARequest
from vllm.sequence import Sequence
from vllm.transformers_utils.tokenizer_group import TokenizerGroup
from utils import models_path_prefix
import os
# Make two prefixes with different first blocks.
prefix_start = [("You are an expert"), ("You are a")]
......@@ -36,7 +38,7 @@ def flatten_2d(li):
return [lss for ls in li for lss in ls]
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("max_num_seqs", [256])
@pytest.mark.parametrize("concurrent_lora_int_ids",
......@@ -45,7 +47,7 @@ def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int,
concurrent_lora_int_ids: List[Optional[int]]):
tokenizer = TokenizerGroup(
tokenizer_id="facebook/opt-125m",
tokenizer_id=os.path.join(models_path_prefix, "facebook/opt-125m"),
enable_lora=False,
max_num_seqs=max_num_seqs,
max_input_length=None,
......
import pytest
import os
from vllm.config import ModelConfig
from utils import models_path_prefix
MODEL_IDS_EXPECTED = [
("Qwen/Qwen1.5-7B", 32768),
("mistralai/Mistral-7B-v0.1", 4096),
("mistralai/Mistral-7B-Instruct-v0.2", 32768),
(os.path.join(models_path_prefix, "Qwen/Qwen1.5-7B"), 32768),
(os.path.join(models_path_prefix, "mistralai/Mistral-7B-v0.1"), 4096),
(os.path.join(models_path_prefix, "mistralai/Mistral-7B-Instruct-v0.2"), 32768),
]
......@@ -31,8 +33,8 @@ def test_get_sliding_window():
# For Qwen1.5/Qwen2, get_sliding_window() should be None
# when use_sliding_window is False.
qwen2_model_config = ModelConfig(
"Qwen/Qwen1.5-7B",
"Qwen/Qwen1.5-7B",
os.path.join(models_path_prefix, "Qwen/Qwen1.5-7B"),
os.path.join(models_path_prefix, "Qwen/Qwen1.5-7B"),
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
......@@ -48,8 +50,8 @@ def test_get_sliding_window():
assert qwen2_model_config.get_sliding_window() == TEST_SLIDING_WINDOW
mistral_model_config = ModelConfig(
"mistralai/Mistral-7B-v0.1",
"mistralai/Mistral-7B-v0.1",
os.path.join(models_path_prefix, "mistralai/Mistral-7B-v0.1"),
os.path.join(models_path_prefix, "mistralai/Mistral-7B-v0.1"),
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
......@@ -69,8 +71,8 @@ def test_rope_customization():
LONGCHAT_ROPE_SCALING = {"type": "linear", "factor": 8.0}
llama_model_config = ModelConfig(
"meta-llama/Meta-Llama-3-8B-Instruct",
"meta-llama/Meta-Llama-3-8B-Instruct",
os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B-Instruct"),
os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B-Instruct"),
tokenizer_mode="auto",
trust_remote_code=False,
dtype="float16",
......@@ -81,8 +83,8 @@ def test_rope_customization():
assert llama_model_config.max_model_len == 8192
llama_model_config = ModelConfig(
"meta-llama/Meta-Llama-3-8B-Instruct",
"meta-llama/Meta-Llama-3-8B-Instruct",
os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B-Instruct"),
os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B-Instruct"),
tokenizer_mode="auto",
trust_remote_code=False,
dtype="float16",
......@@ -97,8 +99,8 @@ def test_rope_customization():
assert llama_model_config.max_model_len == 16384
longchat_model_config = ModelConfig(
"lmsys/longchat-13b-16k",
"lmsys/longchat-13b-16k",
os.path.join(models_path_prefix, "lmsys/longchat-13b-16k"),
os.path.join(models_path_prefix, "lmsys/longchat-13b-16k"),
tokenizer_mode="auto",
trust_remote_code=False,
dtype="float16",
......@@ -111,8 +113,8 @@ def test_rope_customization():
assert longchat_model_config.max_model_len == 16384
longchat_model_config = ModelConfig(
"lmsys/longchat-13b-16k",
"lmsys/longchat-13b-16k",
os.path.join(models_path_prefix, "lmsys/longchat-13b-16k"),
os.path.join(models_path_prefix, "lmsys/longchat-13b-16k"),
tokenizer_mode="auto",
trust_remote_code=False,
dtype="float16",
......
......@@ -9,6 +9,8 @@ import gc
import torch
from vllm import LLM, SamplingParams
from utils import models_path_prefix
import os
def test_duplicated_ignored_sequence_group():
......@@ -17,7 +19,7 @@ def test_duplicated_ignored_sequence_group():
sampling_params = SamplingParams(temperature=0.01,
top_p=0.1,
max_tokens=256)
llm = LLM(model="facebook/opt-125m",
llm = LLM(model=os.path.join(models_path_prefix, "facebook/opt-125m"),
max_num_batched_tokens=4096,
tensor_parallel_size=1)
prompts = ["This is a short prompt", "This is a very long prompt " * 1000]
......@@ -30,7 +32,7 @@ def test_max_tokens_none():
sampling_params = SamplingParams(temperature=0.01,
top_p=0.1,
max_tokens=None)
llm = LLM(model="facebook/opt-125m",
llm = LLM(model=os.path.join(models_path_prefix, "facebook/opt-125m"),
max_num_batched_tokens=4096,
tensor_parallel_size=1)
prompts = ["Just say hello!"]
......@@ -40,7 +42,7 @@ def test_max_tokens_none():
def test_gc():
llm = LLM("facebook/opt-125m", enforce_eager=True)
llm = LLM(os.path.join(models_path_prefix, "facebook/opt-125m"), enforce_eager=True)
del llm
gc.collect()
......@@ -55,7 +57,7 @@ def test_gc():
def test_model_from_modelscope(monkeypatch):
# model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary
MODELSCOPE_MODEL_NAME = "qwen/Qwen1.5-0.5B-Chat"
MODELSCOPE_MODEL_NAME = os.path.join(models_path_prefix, "qwen/Qwen1.5-0.5B-Chat")
monkeypatch.setenv("VLLM_USE_MODELSCOPE", "True")
try:
llm = LLM(model=MODELSCOPE_MODEL_NAME)
......
......@@ -9,6 +9,7 @@ from huggingface_hub import snapshot_download
from vllm import LLM, SamplingParams
from vllm.model_executor.model_loader.loader import ShardedStateLoader
from utils import models_path_prefix
prompts = [
"Hello, my name is",
......@@ -46,9 +47,10 @@ def test_filter_subtensors():
@pytest.fixture(scope="module")
def llama_2_7b_files():
    """Return the directory holding the Llama-2-7b-hf checkpoint files.

    The checkpoint is resolved under ``models_path_prefix`` rather than
    downloaded with ``snapshot_download``, so no temporary cache directory
    is needed and there is nothing to clean up after the module finishes.
    """
    return os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf")
......
......@@ -10,6 +10,7 @@ from vllm.utils import (FlexibleArgumentParser, deprecate_kwargs,
get_open_port, merge_async_iterators)
from .utils import error_on_warning
from .utils import models_path_prefix
@pytest.mark.asyncio
......@@ -150,17 +151,18 @@ def test_underscore_to_dash(parser):
def test_mixed_usage(parser):
    """Check that underscore and dash spelled flags can be mixed in one call.

    ``--image_input_type`` is passed with underscores and ``--model-name``
    with a dash; both must be readable as attributes of the parsed
    namespace. The model name is built from ``models_path_prefix``.
    """
    # Build the expected value once so the argument and the assertion
    # cannot drift apart.
    model_name = os.path.join(models_path_prefix, 'facebook/opt-125m')
    args = parser.parse_args([
        '--image_input_type', 'image_features', '--model-name', model_name
    ])
    assert args.image_input_type == 'image_features'
    assert args.model_name == model_name
def test_with_equals_sign(parser):
    """Check that ``--flag=value`` style arguments are parsed correctly.

    Both flags are given in ``key=value`` form; the model name is built
    from ``models_path_prefix``.
    """
    model_name_with_path = os.path.join(models_path_prefix,
                                        'facebook/opt-125m')
    args = parser.parse_args([
        '--image_input_type=pixel_values',
        f'--model-name={model_name_with_path}',
    ])
    assert args.image_input_type == 'pixel_values'
    # Compare against the same value that was passed in.
    assert args.model_name == model_name_with_path
def test_with_int_value(parser):
......
from copy import deepcopy
import os
from transformers import AutoTokenizer
from vllm.transformers_utils.tokenizer import get_cached_tokenizer
from ..utils import models_path_prefix
def test_cached_tokenizer():
reference_tokenizer = AutoTokenizer.from_pretrained("gpt2")
reference_tokenizer = AutoTokenizer.from_pretrained(os.path.join(models_path_prefix, "gpt2"))
reference_tokenizer.add_special_tokens({"cls_token": "<CLS>"})
reference_tokenizer.add_special_tokens(
{"additional_special_tokens": ["<SEP>"]})
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment