Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
ad58e9b3
Commit
ad58e9b3
authored
Sep 18, 2024
by
zhuwenwen
Browse files
Merge tag 'v0.6.1.post2' into v0.6.1.post2-dev
parents
408f663a
9ba0817f
Changes
118
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
178 additions
and
299 deletions
+178
-299
tests/basic_correctness/test_chunked_prefill.py
tests/basic_correctness/test_chunked_prefill.py
+55
-0
tests/basic_correctness/test_preemption.py
tests/basic_correctness/test_preemption.py
+8
-4
tests/compile/test_full_graph.py
tests/compile/test_full_graph.py
+3
-1
tests/conftest.py
tests/conftest.py
+13
-18
tests/distributed/test_basic_distributed_correctness.py
tests/distributed/test_basic_distributed_correctness.py
+0
-80
tests/distributed/test_basic_distributed_correctness_enc_dec.py
...distributed/test_basic_distributed_correctness_enc_dec.py
+0
-102
tests/distributed/test_chunked_prefill_distributed.py
tests/distributed/test_chunked_prefill_distributed.py
+0
-75
tests/distributed/test_pipeline_parallel.py
tests/distributed/test_pipeline_parallel.py
+5
-3
tests/distributed/test_same_node.py
tests/distributed/test_same_node.py
+7
-7
tests/engine/test_skip_tokenizer_init.py
tests/engine/test_skip_tokenizer_init.py
+3
-2
tests/entrypoints/offline_mode/__init__.py
tests/entrypoints/offline_mode/__init__.py
+0
-0
tests/entrypoints/offline_mode/test_offline_mode.py
tests/entrypoints/offline_mode/test_offline_mode.py
+77
-0
tests/kernels/utils.py
tests/kernels/utils.py
+3
-1
tests/models/decoder_only/__init__.py
tests/models/decoder_only/__init__.py
+0
-0
tests/models/decoder_only/audio_language/__init__.py
tests/models/decoder_only/audio_language/__init__.py
+0
-0
tests/models/decoder_only/audio_language/test_ultravox.py
tests/models/decoder_only/audio_language/test_ultravox.py
+2
-4
tests/models/decoder_only/language/__init__.py
tests/models/decoder_only/language/__init__.py
+0
-0
tests/models/decoder_only/language/test_aqlm.py
tests/models/decoder_only/language/test_aqlm.py
+0
-0
tests/models/decoder_only/language/test_big_models.py
tests/models/decoder_only/language/test_big_models.py
+1
-1
tests/models/decoder_only/language/test_danube3_4b.py
tests/models/decoder_only/language/test_danube3_4b.py
+1
-1
No files found.
tests/basic_correctness/test_chunked_prefill.py
View file @
ad58e9b3
...
...
@@ -6,11 +6,13 @@ prefill requests are chunked.
Run `pytest tests/basic_correctness/test_chunked_prefill.py`.
"""
import
os
from
contextlib
import
nullcontext
import
pytest
from
..models.utils
import
check_logprobs_close
,
check_outputs_equal
from
..utils
import
multi_gpu_test
MODELS
=
[
"facebook/opt-125m"
,
...
...
@@ -66,6 +68,59 @@ def test_models(
)
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@pytest.mark.parametrize("model", MODELS)
def test_models_distributed(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    distributed_executor_backend: str,
) -> None:
    """Compare greedy outputs of HF and vLLM (chunked prefill, TP=2).

    vLLM is run with ``tensor_parallel_size=2`` and chunked prefill enabled
    (16 batched tokens, at most 16 sequences); its greedy generations must be
    equal to the ones produced by the HF runner for the same prompts.
    """
    # test ray adag
    wants_ray_adag = (distributed_executor_backend == "ray"
                      and model == "meta-llama/Llama-2-7b-hf")
    if wants_ray_adag:
        for flag in ('VLLM_USE_RAY_SPMD_WORKER', 'VLLM_USE_RAY_COMPILED_DAG'):
            os.environ[flag] = "1"

    max_tokens = 5
    dtype = "half"

    # Add a chunked prefill config.
    chunked_prefill_token_size = 16
    assert chunked_prefill_token_size != -1
    engine_kwargs = dict(
        dtype=dtype,
        tensor_parallel_size=2,
        max_num_seqs=min(chunked_prefill_token_size, 256),
        enable_chunked_prefill=True,
        max_num_batched_tokens=chunked_prefill_token_size,
        distributed_executor_backend=distributed_executor_backend,
    )

    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).
    with vllm_runner(model, **engine_kwargs) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)

    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

    check_outputs_equal(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
@
pytest
.
mark
.
parametrize
(
"kv_cache_dtype,model"
,
[(
"fp8_e4m3"
,
...
...
tests/basic_correctness/test_preemption.py
View file @
ad58e9b3
...
...
@@ -19,10 +19,13 @@ MODELS = [
"facebook/opt-125m"
,
]
assert
ENABLE_ARTIFICIAL_PREEMPT
is
True
,
(
"Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1. "
"`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest "
"tests/basic_correctness/test_preemption.py`"
)
@pytest.fixture(scope="module", autouse=True)
def check_settings():
    """Fail the module's tests unless artificial preemption is enabled."""
    usage_hint = ("Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1. "
                  "`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest "
                  "tests/basic_correctness/test_preemption.py`")
    assert ENABLE_ARTIFICIAL_PREEMPT is True, usage_hint
@
pytest
.
fixture
...
...
@@ -64,6 +67,7 @@ def test_chunked_prefill_recompute(
enable_chunked_prefill
=
enable_chunked_prefill
,
max_num_seqs
=
max_num_seqs
,
worker_use_ray
=
worker_use_ray
,
disable_log_stats
=
False
,
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
assert
(
vllm_model
.
model
.
llm_engine
.
scheduler
[
0
].
artificial_preempt_cnt
...
...
tests/compile/test_full_graph.py
View file @
ad58e9b3
...
...
@@ -16,5 +16,7 @@ def test_full_graph(model):
"The future of AI is"
,
]
sampling_params
=
SamplingParams
(
temperature
=
0
)
llm
=
LLM
(
model
=
"meta-llama/Meta-Llama-3-8B"
)
llm
=
LLM
(
model
=
"meta-llama/Meta-Llama-3-8B"
,
enforce_eager
=
True
,
load_format
=
"dummy"
)
llm
.
generate
(
prompts
,
sampling_params
)
tests/conftest.py
View file @
ad58e9b3
...
...
@@ -6,8 +6,8 @@ import sys
import
tempfile
from
collections
import
UserList
from
enum
import
Enum
from
typing
import
(
Any
,
Callable
,
Dict
,
List
,
Optional
,
Tuple
,
Type
dDict
,
TypeVar
,
Union
)
from
typing
import
(
Any
,
Callable
,
Dict
,
List
,
Optional
,
Tuple
,
Type
,
TypedDict
,
TypeVar
,
Union
)
import
numpy
as
np
import
pytest
...
...
@@ -18,6 +18,7 @@ from huggingface_hub import snapshot_download
from
PIL
import
Image
from
transformers
import
(
AutoModelForCausalLM
,
AutoTokenizer
,
BatchEncoding
,
BatchFeature
)
from
transformers.models.auto.auto_factory
import
_BaseAutoModelClass
from
vllm
import
LLM
,
SamplingParams
from
vllm.assets.image
import
ImageAsset
...
...
@@ -260,7 +261,7 @@ class HfRunner:
*
,
model_kwargs
:
Optional
[
Dict
[
str
,
Any
]]
=
None
,
is_embedding_model
:
bool
=
False
,
auto_cls
=
AutoModelForCausalLM
,
auto_cls
:
Type
[
_BaseAutoModelClass
]
=
AutoModelForCausalLM
,
postprocess_inputs
:
Callable
[[
BatchEncoding
],
BatchEncoding
]
=
identity
,
)
->
None
:
...
...
@@ -292,20 +293,14 @@ class HfRunner:
trust_remote_code
=
True
,
)
try
:
# don't put this import at the top level
# it will call torch.cuda.device_count()
from
transformers
import
AutoProcessor
# noqa: F401
self
.
processor
=
AutoProcessor
.
from_pretrained
(
model_name
,
torch_dtype
=
torch_dtype
,
trust_remote_code
=
True
,
)
except
Exception
as
exc
:
logger
.
warning
(
"Unable to auto-load HuggingFace processor for model (%s). "
"Using tokenizer instead. Reason: %s"
,
model_name
,
exc
)
self
.
processor
=
self
.
tokenizer
# don't put this import at the top level
# it will call torch.cuda.device_count()
from
transformers
import
AutoProcessor
# noqa: F401
self
.
processor
=
AutoProcessor
.
from_pretrained
(
model_name
,
torch_dtype
=
torch_dtype
,
trust_remote_code
=
True
,
)
self
.
postprocess_inputs
=
postprocess_inputs
...
...
@@ -658,8 +653,8 @@ class VllmRunner:
outputs
.
append
((
req_sample_output_ids
,
req_sample_output_strs
))
return
outputs
@
staticmethod
def
_final_steps_generate_w_logprobs
(
self
,
req_outputs
:
List
[
RequestOutput
],
)
->
List
[
Tuple
[
List
[
int
],
str
,
Optional
[
SampleLogprobs
]]]:
outputs
:
List
[
Tuple
[
List
[
int
],
str
,
Optional
[
SampleLogprobs
]]]
=
[]
...
...
tests/distributed/test_basic_distributed_correctness.py
deleted
100644 → 0
View file @
408f663a
"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
Run:
```sh
cd $VLLM_PATH/tests
pytest distributed/test_basic_distributed_correctness.py
```
"""
import
os
import
pytest
from
vllm.utils
import
cuda_device_count_stateless
from
..models.utils
import
check_outputs_equal
from
..utils
import
fork_new_process_for_each_test
TARGET_TEST_SUITE
=
os
.
environ
.
get
(
"TARGET_TEST_SUITE"
,
"L4"
)
@pytest.mark.skipif(cuda_device_count_stateless() < 2,
                    reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize(
    "model, distributed_executor_backend, attention_backend, "
    "test_suite",
    [
        ("facebook/opt-125m", "ray", "", "L4"),
        ("facebook/opt-125m", "mp", "", "L4"),
        ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
        ("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
        ("facebook/opt-125m", "ray", "", "A100"),
        ("facebook/opt-125m", "mp", "", "A100"),
        ("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
        ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
    ],
)
@fork_new_process_for_each_test
def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    distributed_executor_backend: str,
    attention_backend: str,
    test_suite: str,
) -> None:
    """Compare greedy outputs of HF and vLLM run with tensor parallelism 2.

    Cases whose ``test_suite`` differs from ``TARGET_TEST_SUITE`` are skipped.
    A non-empty ``attention_backend`` is exported through the
    ``VLLM_ATTENTION_BACKEND`` environment variable before vLLM is started.
    """
    if test_suite != TARGET_TEST_SUITE:
        pytest.skip(f"Skip test for {test_suite}")

    # test ray adag
    is_ray_adag_case = (model == "meta-llama/Llama-2-7b-hf"
                        and distributed_executor_backend == "ray"
                        and attention_backend == ""
                        and test_suite == "L4")
    if is_ray_adag_case:
        for flag in ('VLLM_USE_RAY_SPMD_WORKER', 'VLLM_USE_RAY_COMPILED_DAG'):
            os.environ[flag] = "1"

    if attention_backend:
        os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend

    max_tokens = 5
    dtype = "half"

    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).
    with vllm_runner(
            model,
            dtype=dtype,
            tensor_parallel_size=2,
            distributed_executor_backend=distributed_executor_backend,
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)

    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

    check_outputs_equal(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
tests/distributed/test_basic_distributed_correctness_enc_dec.py
deleted
100644 → 0
View file @
408f663a
"""For encoder/decoder models only:
Compare the outputs of HF and distributed vLLM when using greedy sampling.
Run:
```sh
cd $VLLM_PATH/tests
pytest distributed/test_basic_distributed_correctness_enc_dec.py
```
"""
import
pytest
from
transformers
import
AutoModelForSeq2SeqLM
from
vllm.utils
import
cuda_device_count_stateless
from
..conftest
import
DecoderPromptType
from
..models.utils
import
check_logprobs_close
from
..utils
import
fork_new_process_for_each_test
@pytest.mark.skipif(cuda_device_count_stateless() < 2,
                    reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize(
    "model, distributed_executor_backend",
    [
        ("facebook/bart-large-cnn", "ray"),
        ("facebook/bart-large-cnn", "mp"),
    ],
)
@fork_new_process_for_each_test
def test_models(
    model: str,
    distributed_executor_backend: str,
    hf_runner,
    vllm_runner,
    example_encoder_decoder_prompts,
) -> None:
    '''
    Run BART through vLLM with tensor parallelism 2 and check that its
    greedy logprobs are close to the HF baseline.

    Arguments:

    * model: the HF ID of the specific BART variant under test
    * distributed_executor_backend: backend name forwarded to vllm_runner
    * hf_runner: HuggingFace (HF) test model runner
    * vllm_runner: vLLM test model runner
    * example_encoder_decoder_prompts: fixture indexed by DecoderPromptType
      to obtain the prompts used here
    '''
    num_logprobs = 5
    max_tokens = 64
    dtype = "float"

    # Configuration settings for HF baseline
    hf_kwargs = {
        "top_k": None,
        "num_beams": 1,
        "repetition_penalty": 1.0,
        "top_p": 1.0,
        "length_penalty": 1.0,
        "early_stopping": False,
        "no_repeat_ngram_size": None,
        "min_length": 0
    }

    # Example inputs with non-trivial (i.e. not None/empty) encoder &
    # decoder prompts.
    test_prompts = example_encoder_decoder_prompts[DecoderPromptType.CUSTOM]

    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).
    with vllm_runner(
            model,
            dtype=dtype,
            tensor_parallel_size=2,
            distributed_executor_backend=distributed_executor_backend,
            enforce_eager=True,
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
            test_prompts, max_tokens, num_logprobs)

    with hf_runner(model, dtype=dtype,
                   auto_cls=AutoModelForSeq2SeqLM) as hf_model:
        hf_outputs = hf_model.generate_encoder_decoder_greedy_logprobs_limit(
            test_prompts,
            max_tokens,
            num_logprobs,
            **hf_kwargs,
        )

    check_logprobs_close(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
tests/distributed/test_chunked_prefill_distributed.py
deleted
100644 → 0
View file @
408f663a
"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
Run:
```sh
pytest test_chunked_prefill_distributed.py
```
"""
import
os
import
pytest
from
vllm.utils
import
cuda_device_count_stateless
from
..models.utils
import
check_outputs_equal
from
..utils
import
fork_new_process_for_each_test
@pytest.mark.skipif(cuda_device_count_stateless() < 2,
                    reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize(
    "model, distributed_executor_backend",
    [
        ("facebook/opt-125m", "ray"),
        ("meta-llama/Llama-2-7b-hf", "ray"),
        ("facebook/opt-125m", "mp"),
        ("meta-llama/Llama-2-7b-hf", "mp"),
    ],
)
@fork_new_process_for_each_test
def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    distributed_executor_backend: str,
) -> None:
    """Compare greedy outputs of HF and vLLM (chunked prefill, TP=2).

    vLLM runs with ``tensor_parallel_size=2`` and chunked prefill enabled
    (16 batched tokens, at most 16 sequences); the generations must equal
    those of the HF runner on the same prompts.
    """
    is_ray_adag_case = (distributed_executor_backend == "ray"
                        and model == "meta-llama/Llama-2-7b-hf")
    if is_ray_adag_case:
        assert distributed_executor_backend == "ray"
        # test ray adag
        for flag in ('VLLM_USE_RAY_SPMD_WORKER', 'VLLM_USE_RAY_COMPILED_DAG'):
            os.environ[flag] = "1"

    max_tokens = 5
    dtype = "half"

    # Add a chunked prefill config.
    chunked_prefill_token_size = 16
    assert chunked_prefill_token_size != -1
    engine_kwargs = dict(
        dtype=dtype,
        tensor_parallel_size=2,
        max_num_seqs=min(chunked_prefill_token_size, 256),
        enable_chunked_prefill=True,
        max_num_batched_tokens=chunked_prefill_token_size,
        distributed_executor_backend=distributed_executor_backend,
    )

    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).
    with vllm_runner(model, **engine_kwargs) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)

    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

    check_outputs_equal(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
tests/distributed/test_pipeline_parallel.py
View file @
ad58e9b3
...
...
@@ -32,9 +32,11 @@ VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
(
1
,
4
,
1
,
0
,
0
,
"meta-llama/Meta-Llama-3-8B"
,
"ray"
),
(
2
,
2
,
1
,
0
,
0
,
"meta-llama/Meta-Llama-3-8B"
,
"ray"
),
(
2
,
2
,
0
,
1
,
0
,
"meta-llama/Meta-Llama-3-8B"
,
"ray"
),
(
1
,
2
,
1
,
1
,
1
,
"OpenGVLab/InternVL2-1B"
,
"ray"
),
(
1
,
2
,
1
,
1
,
1
,
"OpenGVLab/InternVL2-2B"
,
"ray"
),
(
1
,
2
,
1
,
0
,
1
,
"OpenGVLab/InternVL2-4B"
,
"ray"
),
# NOTE: InternVL2 multi-node tests are flaky,
# use mp backend to skip the multi-node tests
(
1
,
2
,
1
,
1
,
1
,
"OpenGVLab/InternVL2-1B"
,
"mp"
),
(
1
,
2
,
1
,
1
,
1
,
"OpenGVLab/InternVL2-2B"
,
"mp"
),
(
1
,
2
,
1
,
0
,
1
,
"OpenGVLab/InternVL2-4B"
,
"mp"
),
],
)
@
fork_new_process_for_each_test
...
...
tests/distributed/test_same_node.py
View file @
ad58e9b3
import
os
import
torch
import
torch
.distributed
as
dist
from
vllm.distributed.parallel_state
import
in_the_same_node_as
torch
.
distributed
.
init_process_group
(
backend
=
"gloo"
)
test_result
=
all
(
in_the_same_node_as
(
torch
.
distributed
.
group
.
WORLD
,
source_rank
=
0
))
if
__name__
==
"__main__"
:
dist
.
init_process_group
(
backend
=
"gloo"
)
test_result
=
all
(
in_the_same_node_as
(
dist
.
group
.
WORLD
,
source_rank
=
0
))
expected
=
os
.
environ
.
get
(
"VLLM_TEST_SAME_HOST"
,
"1"
)
==
"1"
assert
test_result
==
expected
,
f
"Expected
{
expected
}
, got
{
test_result
}
"
print
(
"Same node test passed!"
)
expected
=
os
.
environ
.
get
(
"VLLM_TEST_SAME_HOST"
,
"1"
)
==
"1"
assert
test_result
==
expected
,
f
"Expected
{
expected
}
, got
{
test_result
}
"
print
(
"Same node test passed!"
)
tests/engine/test_skip_tokenizer_init.py
View file @
ad58e9b3
...
...
@@ -11,9 +11,10 @@ def test_skip_tokenizer_initialization(model: str):
# token ids.
llm
=
LLM
(
model
=
model
,
skip_tokenizer_init
=
True
)
sampling_params
=
SamplingParams
(
prompt_logprobs
=
True
,
detokenize
=
True
)
with
pytest
.
raises
(
ValueError
)
as
err
:
with
pytest
.
raises
(
ValueError
,
match
=
"cannot pass text prompts when"
):
llm
.
generate
(
"abc"
,
sampling_params
)
assert
"prompts must be None if"
in
str
(
err
.
value
)
outputs
=
llm
.
generate
({
"prompt_token_ids"
:
[
1
,
2
,
3
]},
sampling_params
=
sampling_params
)
assert
len
(
outputs
)
>
0
...
...
tests/entrypoints/offline_mode/__init__.py
0 → 100644
View file @
ad58e9b3
tests/entrypoints/offline_mode/test_offline_mode.py
0 → 100644
View file @
ad58e9b3
"""Tests for HF_HUB_OFFLINE mode"""
import
importlib
import
sys
import
weakref
import
pytest
from
vllm
import
LLM
from
...conftest
import
cleanup
MODEL_NAME
=
"facebook/opt-125m"
@pytest.fixture(scope="module")
def llm():
    """Module-scoped fixture yielding a weak proxy to an LLM for MODEL_NAME.

    After the tests are done, the only strong reference held here is dropped
    and ``cleanup()`` is called.
    """
    engine_kwargs = dict(
        model=MODEL_NAME,
        max_num_batched_tokens=4096,
        tensor_parallel_size=1,
        gpu_memory_utilization=0.10,
        enforce_eager=True,
    )
    engine = LLM(**engine_kwargs)

    with engine.deprecate_legacy_api():
        # pytest caches the fixture so we use weakref.proxy to
        # enable garbage collection
        yield weakref.proxy(engine)

        del engine

    cleanup()
@pytest.mark.skip_global_cleanup
def test_offline_mode(llm: LLM, monkeypatch):
    """Check that an LLM can still be constructed with HF_HUB_OFFLINE=1."""
    # we use the llm fixture to ensure the model files are in-cache
    del llm

    offline_engine_kwargs = dict(
        model=MODEL_NAME,
        max_num_batched_tokens=4096,
        tensor_parallel_size=1,
        gpu_memory_utilization=0.10,
        enforce_eager=True,
    )

    # Set HF to offline mode and ensure we can still construct an LLM
    try:
        monkeypatch.setenv("HF_HUB_OFFLINE", "1")
        # Need to re-import huggingface_hub and friends to setup offline mode
        _re_import_modules()
        # Cached model files should be used in offline mode
        LLM(**offline_engine_kwargs)
    finally:
        # Reset the environment after the test
        # NB: Assuming tests are run in online mode
        monkeypatch.delenv("HF_HUB_OFFLINE")
        _re_import_modules()
def _re_import_modules():
    """Reload every loaded ``huggingface_hub`` and ``transformers`` module.

    Modules whose name starts with ``transformers_modules`` are left alone.
    Reloading continues past failures so that as much state as possible is
    refreshed; if any reload failed, the last exception seen is re-raised
    once all modules have been attempted.

    Raises:
        Exception: the last exception raised by ``importlib.reload``.
    """
    # Snapshot the names up front: iterating ``sys.modules`` directly can
    # raise if its size changes while we iterate.
    loaded_names = tuple(sys.modules)
    hf_hub_module_names = [
        k for k in loaded_names if k.startswith("huggingface_hub")
    ]
    transformers_module_names = [
        k for k in loaded_names if k.startswith("transformers")
        and not k.startswith("transformers_modules")
    ]

    reload_exception = None
    for module_name in hf_hub_module_names + transformers_module_names:
        module = sys.modules.get(module_name)
        if module is None:
            # The entry was removed (or is a ``None`` placeholder) by the
            # time we got to it; there is nothing to reload, and this must
            # not be reported as a reload failure.
            continue
        try:
            importlib.reload(module)
        except Exception as e:
            reload_exception = e
            # Try to continue clean up so that other tests are less likely to
            # be affected

    # Error this test if reloading a module failed
    if reload_exception is not None:
        raise reload_exception
tests/kernels/utils.py
View file @
ad58e9b3
...
...
@@ -10,7 +10,6 @@ import pytest
import
torch
from
vllm.attention
import
AttentionBackend
,
AttentionMetadata
,
AttentionType
from
vllm.attention.backends.xformers
import
XFormersBackend
from
vllm.utils
import
(
STR_BACKEND_ENV_VAR
,
STR_XFORMERS_ATTN_VAL
,
make_tensor_with_pad
)
...
...
@@ -521,6 +520,9 @@ def make_backend(backend_name: str) -> AttentionBackend:
* Backend instance
'''
if
backend_name
==
STR_XFORMERS_ATTN_VAL
:
# NOTE: xFormers backend cannot be imported for CPU and AMD GPUs.
from
vllm.attention.backends.xformers
import
XFormersBackend
return
XFormersBackend
()
raise
AssertionError
(
f
"Unrecognized backend_name
{
backend_name
}
for unit test"
)
...
...
tests/models/decoder_only/__init__.py
0 → 100644
View file @
ad58e9b3
tests/models/decoder_only/audio_language/__init__.py
0 → 100644
View file @
ad58e9b3
tests/models/test_ultravox.py
→
tests/models/
decoder_only/audio_language/
test_ultravox.py
View file @
ad58e9b3
...
...
@@ -7,10 +7,8 @@ from transformers import AutoModel, AutoTokenizer, BatchEncoding
from
vllm.sequence
import
SampleLogprobs
from
vllm.utils
import
STR_DTYPE_TO_TORCH_DTYPE
from
..conftest
import
HfRunner
,
VllmRunner
from
.utils
import
check_logprobs_close
pytestmark
=
pytest
.
mark
.
vlm
from
....conftest
import
HfRunner
,
VllmRunner
from
...utils
import
check_logprobs_close
MODEL_NAME
=
"fixie-ai/ultravox-v0_3"
...
...
tests/models/decoder_only/language/__init__.py
0 → 100644
View file @
ad58e9b3
tests/models/test_aqlm.py
→
tests/models/
decoder_only/language/
test_aqlm.py
View file @
ad58e9b3
File moved
tests/models/test_big_models.py
→
tests/models/
decoder_only/language/
test_big_models.py
View file @
ad58e9b3
...
...
@@ -7,7 +7,7 @@ Run `pytest tests/models/test_big_models.py`.
import
pytest
import
torch
from
.utils
import
check_outputs_equal
from
..
.utils
import
check_outputs_equal
MODELS
=
[
"meta-llama/Llama-2-7b-hf"
,
...
...
tests/models/test_danube3_4b.py
→
tests/models/
decoder_only/language/
test_danube3_4b.py
View file @
ad58e9b3
...
...
@@ -6,7 +6,7 @@ Run `pytest tests/models/test_danube3_4b.py`.
"""
import
pytest
from
.utils
import
check_outputs_equal
from
..
.utils
import
check_outputs_equal
MODELS
=
[
"h2oai/h2o-danube3-4b-base"
]
...
...
Prev
1
2
3
4
5
6
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment