Commit ced28510 authored by zhuwenwen
Browse files

[tests] fix tests of core, engine and detokenizer

parent 734a433d
...@@ -29,18 +29,18 @@ class TestSetting: ...@@ -29,18 +29,18 @@ class TestSetting:
"test_setting", "test_setting",
[ [
# basic llama model # basic llama model
TestSetting( # TestSetting(
model="meta-llama/Llama-3.2-1B-Instruct", # model=os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
model_args=[], # model_args=[],
pp_size=2, # pp_size=2,
tp_size=2, # tp_size=2,
attn_backend="FLASHINFER", # attn_backend="FLASHINFER",
method="generate", # method="generate",
fullgraph=True, # fullgraph=True,
), # ),
# llama model with quantization # llama model with quantization
TestSetting( TestSetting(
model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", model=os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ"),
model_args=["--quantization", "gptq"], model_args=["--quantization", "gptq"],
pp_size=1, pp_size=1,
tp_size=1, tp_size=1,
...@@ -50,7 +50,7 @@ class TestSetting: ...@@ -50,7 +50,7 @@ class TestSetting:
), ),
# MoE model # MoE model
TestSetting( TestSetting(
model="ibm/PowerMoE-3b", model=os.path.join(models_path_prefix, "ibm/PowerMoE-3b"),
model_args=[], model_args=[],
pp_size=1, pp_size=1,
tp_size=2, tp_size=2,
...@@ -60,7 +60,7 @@ class TestSetting: ...@@ -60,7 +60,7 @@ class TestSetting:
), ),
# embedding model # embedding model
TestSetting( TestSetting(
model="BAAI/bge-multilingual-gemma2", model=os.path.join(models_path_prefix, "BAAI/bge-multilingual-gemma2"),
model_args=["--task", "embed", "--dtype", "bfloat16"], model_args=["--task", "embed", "--dtype", "bfloat16"],
pp_size=1, pp_size=1,
tp_size=1, tp_size=1,
...@@ -69,18 +69,18 @@ class TestSetting: ...@@ -69,18 +69,18 @@ class TestSetting:
fullgraph=True, fullgraph=True,
), ),
# encoder-based embedding model (BERT) # encoder-based embedding model (BERT)
TestSetting( # TestSetting(
model="BAAI/bge-base-en-v1.5", # model=os.path.join(models_path_prefix, "BAAI/bge-base-en-v1.5"),
model_args=["--task", "embed"], # model_args=["--task", "embed"],
pp_size=1, # pp_size=1,
tp_size=1, # tp_size=1,
attn_backend="XFORMERS", # attn_backend="XFORMERS",
method="encode", # method="encode",
fullgraph=True, # fullgraph=True,
), # ),
# vision language model # vision language model
TestSetting( TestSetting(
model="microsoft/Phi-3.5-vision-instruct", model=os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct"),
model_args=["--trust-remote-code", "--max-model-len", "2048"], model_args=["--trust-remote-code", "--max-model-len", "2048"],
pp_size=2, pp_size=2,
tp_size=1, tp_size=1,
...@@ -146,4 +146,4 @@ def test_compile_correctness( ...@@ -146,4 +146,4 @@ def test_compile_correctness(
all_envs[-1][ all_envs[-1][
"VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" # type: ignore "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" # type: ignore
compare_all_settings(model, all_args * 3, all_envs, method=method) compare_all_settings(model, all_args * 3, all_envs, method=method)
\ No newline at end of file
...@@ -9,6 +9,8 @@ from vllm import SamplingParams ...@@ -9,6 +9,8 @@ from vllm import SamplingParams
from .conftest import get_token_ids_from_llm_generator from .conftest import get_token_ids_from_llm_generator
import os import os
from ....utils import models_path_prefix from ....utils import models_path_prefix
import vllm.envs as envs
from vllm.utils import SUPPORT_TC, gpuname
@pytest.mark.parametrize( @pytest.mark.parametrize(
...@@ -21,7 +23,7 @@ from ....utils import models_path_prefix ...@@ -21,7 +23,7 @@ from ....utils import models_path_prefix
"enforce_eager": True, "enforce_eager": True,
# Allow only 5 sequences of ~1024 tokens in worst case. # Allow only 5 sequences of ~1024 tokens in worst case.
"block_size": 16, "block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
"num_gpu_blocks_override": 5 * (64 + 1), "num_gpu_blocks_override": 5 * (64 + 1),
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
...@@ -104,19 +106,19 @@ def test_block_manager_with_preemption(baseline_llm_generator, ...@@ -104,19 +106,19 @@ def test_block_manager_with_preemption(baseline_llm_generator,
"per_test_common_llm_kwargs", "per_test_common_llm_kwargs",
[ [
{ {
"block_size": 16, "block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
# Allow only 2 sequences of ~128 tokens in worst case. # Allow only 2 sequences of ~128 tokens in worst case.
# Note 8 = 128/block_size # Note 8 = 128/block_size
"num_gpu_blocks_override": 2 * (8 + 1), "num_gpu_blocks_override": 2 * (8 + 1),
}, },
{ # {
"block_size": 8, # "block_size": 8,
# Allow only 2 sequences of ~128 tokens in worst case. # # Allow only 2 sequences of ~128 tokens in worst case.
# Note 16 = 128/block_size # # Note 16 = 128/block_size
"num_gpu_blocks_override": 2 * (16 + 2), # "num_gpu_blocks_override": 2 * (16 + 2),
} # }
]) ])
@pytest.mark.parametrize("baseline_llm_kwargs", [{ @pytest.mark.parametrize("baseline_llm_kwargs", [{
"num_lookahead_slots": 0, "num_lookahead_slots": 0,
...@@ -197,15 +199,15 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator, ...@@ -197,15 +199,15 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
]) ])
@pytest.mark.parametrize("per_test_common_llm_kwargs", @pytest.mark.parametrize("per_test_common_llm_kwargs",
[{ [{
"block_size": 16, "block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
"max_num_batched_tokens": 2, "max_num_batched_tokens": 2,
"max_num_seqs": 2, "max_num_seqs": 2,
}, { }, {
"block_size": 16, "block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
"max_num_batched_tokens": 3, "max_num_batched_tokens": 3,
"max_num_seqs": 2, "max_num_seqs": 2,
}, { }, {
"block_size": 16, "block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
"max_num_batched_tokens": 256, "max_num_batched_tokens": 256,
"max_num_seqs": 10, "max_num_seqs": 10,
}]) }])
...@@ -271,7 +273,7 @@ def test_chunked_prefill_block_manager(baseline_llm_generator, ...@@ -271,7 +273,7 @@ def test_chunked_prefill_block_manager(baseline_llm_generator,
"enforce_eager": True, "enforce_eager": True,
# Allow only 5 sequences of ~1024 tokens in worst case. # Allow only 5 sequences of ~1024 tokens in worst case.
"block_size": 16, "block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
"num_gpu_blocks_override": 5 * (64 + 1), "num_gpu_blocks_override": 5 * (64 + 1),
# Enable prefill cache # Enable prefill cache
...@@ -352,7 +354,7 @@ def test_block_manager_prefix_caching_enabled_with_preemption( ...@@ -352,7 +354,7 @@ def test_block_manager_prefix_caching_enabled_with_preemption(
"enforce_eager": True, "enforce_eager": True,
# Allow only 5 sequences of ~1024 tokens in worst case. # Allow only 5 sequences of ~1024 tokens in worst case.
"block_size": 16, "block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
"num_gpu_blocks_override": 5 * (64 + 1), "num_gpu_blocks_override": 5 * (64 + 1),
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
...@@ -427,7 +429,7 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator, ...@@ -427,7 +429,7 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
# we keep the blocks small, so that hit eviction quickly # we keep the blocks small, so that hit eviction quickly
"max_model_len": 48, "max_model_len": 48,
"block_size": 16, "block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
"num_gpu_blocks_override": 3, "num_gpu_blocks_override": 3,
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
...@@ -477,4 +479,4 @@ def test_auto_prefix_caching_after_evition_start(baseline_llm_generator, ...@@ -477,4 +479,4 @@ def test_auto_prefix_caching_after_evition_start(baseline_llm_generator,
test_token_ids): test_token_ids):
assert expected_token_ids == actual_token_ids assert expected_token_ids == actual_token_ids
assert baseline_token_ids == test_token_ids assert baseline_token_ids == test_token_ids
\ No newline at end of file
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
from unittest.mock import MagicMock from unittest.mock import MagicMock
import os
import pytest # noqa import pytest # noqa
from vllm.config import CacheConfig, SchedulerConfig from vllm.config import CacheConfig, SchedulerConfig
...@@ -12,6 +13,9 @@ from vllm.sampling_params import SamplingParams ...@@ -12,6 +13,9 @@ from vllm.sampling_params import SamplingParams
from vllm.sequence import Logprob, SequenceGroup from vllm.sequence import Logprob, SequenceGroup
from .utils import create_dummy_prompt from .utils import create_dummy_prompt
from ..utils import models_path_prefix
from vllm.utils import SUPPORT_TC, gpuname
import vllm.envs as envs
def get_sequence_groups(scheduler_output): def get_sequence_groups(scheduler_output):
...@@ -830,7 +834,7 @@ def test_prefix_caching_with_concurrent_partial_prefills(): ...@@ -830,7 +834,7 @@ def test_prefix_caching_with_concurrent_partial_prefills():
assert out.num_batched_tokens == 44 assert out.num_batched_tokens == 44
@pytest.mark.parametrize("model", ["facebook/opt-125m"]) @pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")])
@pytest.mark.parametrize("max_num_partial_prefills", [2, 4, 8]) @pytest.mark.parametrize("max_num_partial_prefills", [2, 4, 8])
def test_chunked_prefill_with_actual_engine(model: str, def test_chunked_prefill_with_actual_engine(model: str,
max_num_partial_prefills: int): max_num_partial_prefills: int):
...@@ -847,6 +851,7 @@ def test_chunked_prefill_with_actual_engine(model: str, ...@@ -847,6 +851,7 @@ def test_chunked_prefill_with_actual_engine(model: str,
max_num_seqs=8, max_num_seqs=8,
enable_chunked_prefill=True, enable_chunked_prefill=True,
gpu_memory_utilization=0.8, gpu_memory_utilization=0.8,
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
) )
engine = LLMEngine.from_engine_args(engine_args) engine = LLMEngine.from_engine_args(engine_args)
...@@ -858,4 +863,4 @@ def test_chunked_prefill_with_actual_engine(model: str, ...@@ -858,4 +863,4 @@ def test_chunked_prefill_with_actual_engine(model: str,
request_outputs = engine.step() request_outputs = engine.step()
# means all are prefilling # means all are prefilling
assert len(request_outputs) == 0 assert len(request_outputs) == 0
assert len(engine.scheduler[0].running) == max_num_partial_prefills assert len(engine.scheduler[0].running) == max_num_partial_prefills
\ No newline at end of file
...@@ -9,6 +9,8 @@ from vllm.engine.llm_engine import LLMEngine ...@@ -9,6 +9,8 @@ from vllm.engine.llm_engine import LLMEngine
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.sequence import SequenceGroup from vllm.sequence import SequenceGroup
from ..utils import models_path_prefix from ..utils import models_path_prefix
from vllm.utils import SUPPORT_TC, gpuname
import vllm.envs as envs
MODEL = os.path.join(models_path_prefix, "JackFram/llama-160m") MODEL = os.path.join(models_path_prefix, "JackFram/llama-160m")
...@@ -37,7 +39,8 @@ def test_num_computed_tokens_update(num_scheduler_steps: int, ...@@ -37,7 +39,8 @@ def test_num_computed_tokens_update(num_scheduler_steps: int,
gpu_memory_utilization=0.7, gpu_memory_utilization=0.7,
num_scheduler_steps=num_scheduler_steps, num_scheduler_steps=num_scheduler_steps,
enable_chunked_prefill=enable_chunked_prefill, enable_chunked_prefill=enable_chunked_prefill,
enforce_eager=enforce_eager) enforce_eager=enforce_eager,
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16)
engine: LLMEngine = runner.model.llm_engine engine: LLMEngine = runner.model.llm_engine
# In multi-step + chunked-prefill there is no separate single prompt step. # In multi-step + chunked-prefill there is no separate single prompt step.
...@@ -81,4 +84,4 @@ def test_num_computed_tokens_update(num_scheduler_steps: int, ...@@ -81,4 +84,4 @@ def test_num_computed_tokens_update(num_scheduler_steps: int,
# Test correctness of num_computed_tokens after the sequence finish. # Test correctness of num_computed_tokens after the sequence finish.
assert seq.data.get_num_computed_tokens( assert seq.data.get_num_computed_tokens(
) == prompt_len + num_output_tokens - 1 ) == prompt_len + num_output_tokens - 1
\ No newline at end of file
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os
import pytest import pytest
from vllm.entrypoints.llm import LLM from vllm.entrypoints.llm import LLM
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from ..utils import models_path_prefix
import vllm.envs as envs
from vllm.utils import SUPPORT_TC, gpuname
@pytest.mark.skip_v1 @pytest.mark.skip_v1
@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) @pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "distilbert/distilgpt2")])
def test_computed_prefix_blocks(model: str): def test_computed_prefix_blocks(model: str):
# This test checks if the engine generates completions both with and # This test checks if the engine generates completions both with and
# without optional detokenization, that detokenization includes text # without optional detokenization, that detokenization includes text
...@@ -18,7 +22,7 @@ def test_computed_prefix_blocks(model: str): ...@@ -18,7 +22,7 @@ def test_computed_prefix_blocks(model: str):
"paper clips? Is there an easy to follow video tutorial available " "paper clips? Is there an easy to follow video tutorial available "
"online for free?") "online for free?")
llm = LLM(model=model) llm = LLM(model=model, block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16)
sampling_params = SamplingParams(max_tokens=10, sampling_params = SamplingParams(max_tokens=10,
temperature=0.0, temperature=0.0,
detokenize=False) detokenize=False)
...@@ -32,4 +36,4 @@ def test_computed_prefix_blocks(model: str): ...@@ -32,4 +36,4 @@ def test_computed_prefix_blocks(model: str):
assert outputs_no_detokenization.text == '' assert outputs_no_detokenization.text == ''
assert outputs_with_detokenization.text != '' assert outputs_with_detokenization.text != ''
assert outputs_no_detokenization.token_ids == \ assert outputs_no_detokenization.token_ids == \
outputs_with_detokenization.token_ids outputs_with_detokenization.token_ids
\ No newline at end of file
...@@ -2,11 +2,13 @@ ...@@ -2,11 +2,13 @@
from typing import Any, Optional from typing import Any, Optional
import os
import pytest import pytest
from vllm import LLM, SamplingParams, envs from vllm import LLM, SamplingParams, envs
from ..utils import models_path_prefix
MODEL = "meta-llama/llama-2-7b-hf" MODEL = os.path.join(models_path_prefix, "meta-llama/llama-2-7b-hf")
MAX_TOKENS = 200 MAX_TOKENS = 200
...@@ -138,4 +140,4 @@ def test_stop_strings(): ...@@ -138,4 +140,4 @@ def test_stop_strings():
_stop_token_id(vllm_model) _stop_token_id(vllm_model)
_set_async_mode(vllm_model, False) _set_async_mode(vllm_model, False)
_stop_token_id(vllm_model) _stop_token_id(vllm_model)
\ No newline at end of file
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os
import pytest import pytest
from vllm.engine.arg_utils import EngineArgs from vllm.engine.arg_utils import EngineArgs
from vllm.engine.llm_engine import LLMEngine from vllm.engine.llm_engine import LLMEngine
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from ..utils import models_path_prefix
from vllm.utils import SUPPORT_TC, gpuname
import vllm.envs as envs
@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) @pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "distilbert/distilgpt2")])
@pytest.mark.parametrize("block_size", [16]) @pytest.mark.parametrize("block_size", [64] if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else [16])
def test_computed_prefix_blocks(model: str, block_size: int): def test_computed_prefix_blocks(model: str, block_size: int):
# This test checks if we are able to run the engine to completion # This test checks if we are able to run the engine to completion
# without triggering asserts. # without triggering asserts.
...@@ -33,4 +37,4 @@ def test_computed_prefix_blocks(model: str, block_size: int): ...@@ -33,4 +37,4 @@ def test_computed_prefix_blocks(model: str, block_size: int):
engine.add_request("0", prompt + prompt2, sampling_params) engine.add_request("0", prompt + prompt2, sampling_params)
engine.step() engine.step()
engine.add_request("1", prompt, sampling_params) engine.add_request("1", prompt, sampling_params)
engine.step() engine.step()
\ No newline at end of file
...@@ -13,6 +13,8 @@ from vllm.executor.uniproc_executor import UniProcExecutor ...@@ -13,6 +13,8 @@ from vllm.executor.uniproc_executor import UniProcExecutor
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
import os import os
from ..utils import models_path_prefix from ..utils import models_path_prefix
from vllm.utils import SUPPORT_TC, gpuname
import vllm.envs as envs
class Mock: class Mock:
...@@ -57,6 +59,7 @@ def test_custom_executor(model, tmp_path): ...@@ -57,6 +59,7 @@ def test_custom_executor(model, tmp_path):
model=model, model=model,
distributed_executor_backend=CustomUniExecutor, distributed_executor_backend=CustomUniExecutor,
enforce_eager=True, # reduce test time enforce_eager=True, # reduce test time
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
) )
engine = LLMEngine.from_engine_args(engine_args) engine = LLMEngine.from_engine_args(engine_args)
sampling_params = SamplingParams(max_tokens=1) sampling_params = SamplingParams(max_tokens=1)
...@@ -69,7 +72,7 @@ def test_custom_executor(model, tmp_path): ...@@ -69,7 +72,7 @@ def test_custom_executor(model, tmp_path):
os.chdir(cwd) os.chdir(cwd)
@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) @pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "distilbert/distilgpt2")])
def test_custom_executor_async(model, tmp_path): def test_custom_executor_async(model, tmp_path):
cwd = os.path.abspath(".") cwd = os.path.abspath(".")
os.chdir(tmp_path) os.chdir(tmp_path)
...@@ -80,6 +83,7 @@ def test_custom_executor_async(model, tmp_path): ...@@ -80,6 +83,7 @@ def test_custom_executor_async(model, tmp_path):
model=model, model=model,
distributed_executor_backend=CustomUniExecutorAsync, distributed_executor_backend=CustomUniExecutorAsync,
enforce_eager=True, # reduce test time enforce_eager=True, # reduce test time
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
) )
engine = AsyncLLMEngine.from_engine_args(engine_args) engine = AsyncLLMEngine.from_engine_args(engine_args)
sampling_params = SamplingParams(max_tokens=1) sampling_params = SamplingParams(max_tokens=1)
...@@ -96,7 +100,7 @@ def test_custom_executor_async(model, tmp_path): ...@@ -96,7 +100,7 @@ def test_custom_executor_async(model, tmp_path):
os.chdir(cwd) os.chdir(cwd)
@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) @pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "distilbert/distilgpt2")])
def test_respect_ray(model): def test_respect_ray(model):
# even for TP=1 and PP=1, # even for TP=1 and PP=1,
# if users specify ray, we should use ray. # if users specify ray, we should use ray.
...@@ -106,6 +110,7 @@ def test_respect_ray(model): ...@@ -106,6 +110,7 @@ def test_respect_ray(model):
model=model, model=model,
distributed_executor_backend="ray", distributed_executor_backend="ray",
enforce_eager=True, # reduce test time enforce_eager=True, # reduce test time
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
) )
engine = LLMEngine.from_engine_args(engine_args) engine = LLMEngine.from_engine_args(engine_args)
assert engine.model_executor.uses_ray assert engine.model_executor.uses_ray
\ No newline at end of file
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os
import pytest import pytest
from ..conftest import IMAGE_ASSETS from ..conftest import IMAGE_ASSETS
...@@ -30,4 +31,4 @@ def test_context_length_too_short(vllm_runner, image_assets, model): ...@@ -30,4 +31,4 @@ def test_context_length_too_short(vllm_runner, image_assets, model):
with vllm_model: with vllm_model:
vllm_model.generate_greedy([HF_IMAGE_PROMPTS[0]], vllm_model.generate_greedy([HF_IMAGE_PROMPTS[0]],
max_tokens=1, max_tokens=1,
images=[images[0]]) images=[images[0]])
\ No newline at end of file
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os
import pytest import pytest
from vllm.entrypoints.llm import LLM from vllm.entrypoints.llm import LLM
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from ..utils import models_path_prefix
from vllm.utils import SUPPORT_TC, gpuname
import vllm.envs as envs
@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) @pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "distilbert/distilgpt2")])
def test_skip_tokenizer_initialization(model: str): def test_skip_tokenizer_initialization(model: str):
# This test checks if the flag skip_tokenizer_init skips the initialization # This test checks if the flag skip_tokenizer_init skips the initialization
# of tokenizer and detokenizer. The generated output is expected to contain # of tokenizer and detokenizer. The generated output is expected to contain
...@@ -14,6 +18,7 @@ def test_skip_tokenizer_initialization(model: str): ...@@ -14,6 +18,7 @@ def test_skip_tokenizer_initialization(model: str):
llm = LLM( llm = LLM(
model=model, model=model,
skip_tokenizer_init=True, skip_tokenizer_init=True,
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
) )
sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True) sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True)
...@@ -26,4 +31,4 @@ def test_skip_tokenizer_initialization(model: str): ...@@ -26,4 +31,4 @@ def test_skip_tokenizer_initialization(model: str):
completions = outputs[0].outputs completions = outputs[0].outputs
assert len(completions) > 0 assert len(completions) > 0
assert completions[0].text == "" assert completions[0].text == ""
assert completions[0].token_ids assert completions[0].token_ids
\ No newline at end of file
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os
from vllm import SamplingParams from vllm import SamplingParams
from vllm.config import LoadFormat from vllm.config import LoadFormat
from ..utils import models_path_prefix
test_model = "openai-community/gpt2" test_model = os.path.join(models_path_prefix, "openai-community/gpt2")
prompts = [ prompts = [
"Hello, my name is", "Hello, my name is",
...@@ -19,4 +21,4 @@ def test_model_loader_download_files(vllm_runner): ...@@ -19,4 +21,4 @@ def test_model_loader_download_files(vllm_runner):
with vllm_runner(test_model, with vllm_runner(test_model,
load_format=LoadFormat.FASTSAFETENSORS) as llm: load_format=LoadFormat.FASTSAFETENSORS) as llm:
deserialized_outputs = llm.generate(prompts, sampling_params) deserialized_outputs = llm.generate(prompts, sampling_params)
assert deserialized_outputs assert deserialized_outputs
\ No newline at end of file
...@@ -2,9 +2,11 @@ ...@@ -2,9 +2,11 @@
import glob import glob
import tempfile import tempfile
import os
import huggingface_hub.constants import huggingface_hub.constants
import torch import torch
from ..utils import models_path_prefix
from vllm.model_executor.model_loader.weight_utils import ( from vllm.model_executor.model_loader.weight_utils import (
download_weights_from_hf, fastsafetensors_weights_iterator, download_weights_from_hf, fastsafetensors_weights_iterator,
...@@ -14,7 +16,7 @@ from vllm.model_executor.model_loader.weight_utils import ( ...@@ -14,7 +16,7 @@ from vllm.model_executor.model_loader.weight_utils import (
def test_fastsafetensors_model_loader(): def test_fastsafetensors_model_loader():
with tempfile.TemporaryDirectory() as tmpdir: with tempfile.TemporaryDirectory() as tmpdir:
huggingface_hub.constants.HF_HUB_OFFLINE = False huggingface_hub.constants.HF_HUB_OFFLINE = False
download_weights_from_hf("openai-community/gpt2", download_weights_from_hf(os.path.join(models_path_prefix, "openai-community/gpt2"),
allow_patterns=["*.safetensors"], allow_patterns=["*.safetensors"],
cache_dir=tmpdir) cache_dir=tmpdir)
safetensors = glob.glob(f"{tmpdir}/**/*.safetensors", recursive=True) safetensors = glob.glob(f"{tmpdir}/**/*.safetensors", recursive=True)
...@@ -43,4 +45,4 @@ def test_fastsafetensors_model_loader(): ...@@ -43,4 +45,4 @@ def test_fastsafetensors_model_loader():
if __name__ == "__main__": if __name__ == "__main__":
test_fastsafetensors_model_loader() test_fastsafetensors_model_loader()
\ No newline at end of file
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment