Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
99b471c2
Commit
99b471c2
authored
May 21, 2024
by
zhuwenwen
Browse files
merge v0.4.1
parents
1925d2e9
468d761b
Changes
336
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
873 additions
and
95 deletions
+873
-95
tests/kernels/test_prefix_prefill.py
tests/kernels/test_prefix_prefill.py
+1
-1
tests/lora/conftest.py
tests/lora/conftest.py
+17
-7
tests/lora/test_baichuan.py
tests/lora/test_baichuan.py
+1
-1
tests/lora/test_layers.py
tests/lora/test_layers.py
+72
-45
tests/lora/test_lora_checkpoints.py
tests/lora/test_lora_checkpoints.py
+58
-0
tests/lora/test_punica.py
tests/lora/test_punica.py
+47
-4
tests/lora/test_quant_model.py
tests/lora/test_quant_model.py
+179
-0
tests/lora/test_worker.py
tests/lora/test_worker.py
+10
-4
tests/model_executor/weight_utils.py
tests/model_executor/weight_utils.py
+26
-0
tests/models/test_aqlm.py
tests/models/test_aqlm.py
+95
-0
tests/models/test_marlin.py
tests/models/test_marlin.py
+4
-5
tests/models/test_models.py
tests/models/test_models.py
+1
-1
tests/models/test_oot_registration.py
tests/models/test_oot_registration.py
+32
-0
tests/quantization/test_autogptq_marlin_configs.py
tests/quantization/test_autogptq_marlin_configs.py
+64
-0
tests/quantization/test_fp8.py
tests/quantization/test_fp8.py
+24
-0
tests/samplers/test_logits_processor.py
tests/samplers/test_logits_processor.py
+62
-0
tests/samplers/test_rejection_sampler.py
tests/samplers/test_rejection_sampler.py
+6
-2
tests/samplers/test_sampler.py
tests/samplers/test_sampler.py
+108
-25
tests/spec_decode/e2e/__init__.py
tests/spec_decode/e2e/__init__.py
+0
-0
tests/spec_decode/e2e/conftest.py
tests/spec_decode/e2e/conftest.py
+66
-0
No files found.
tests/kernels/test_prefix_prefill.py
View file @
99b471c2
...
@@ -10,7 +10,7 @@ from vllm.attention.ops.prefix_prefill import context_attention_fwd
...
@@ -10,7 +10,7 @@ from vllm.attention.ops.prefix_prefill import context_attention_fwd
NUM_HEADS
=
[
64
]
NUM_HEADS
=
[
64
]
NUM_QUERIES_PER_KV
=
[
1
,
8
,
64
]
NUM_QUERIES_PER_KV
=
[
1
,
8
,
64
]
HEAD_SIZES
=
[
128
]
HEAD_SIZES
=
[
128
,
96
]
DTYPES
=
[
torch
.
float16
]
DTYPES
=
[
torch
.
float16
]
CUDA_DEVICES
=
[
CUDA_DEVICES
=
[
f
"cuda:
{
i
}
"
for
i
in
range
(
1
if
torch
.
cuda
.
device_count
()
==
1
else
2
)
f
"cuda:
{
i
}
"
for
i
in
range
(
1
if
torch
.
cuda
.
device_count
()
==
1
else
2
)
...
...
tests/lora/conftest.py
View file @
99b471c2
...
@@ -12,6 +12,7 @@ from huggingface_hub import snapshot_download
...
@@ -12,6 +12,7 @@ from huggingface_hub import snapshot_download
import
vllm
import
vllm
from
vllm.config
import
LoRAConfig
from
vllm.config
import
LoRAConfig
from
vllm.distributed
import
destroy_model_parallel
,
initialize_model_parallel
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
MergedColumnParallelLinear
,
MergedColumnParallelLinear
,
RowParallelLinear
)
RowParallelLinear
)
...
@@ -19,8 +20,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
...
@@ -19,8 +20,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
from
vllm.model_executor.layers.sampler
import
Sampler
from
vllm.model_executor.layers.sampler
import
Sampler
from
vllm.model_executor.layers.vocab_parallel_embedding
import
ParallelLMHead
from
vllm.model_executor.layers.vocab_parallel_embedding
import
ParallelLMHead
from
vllm.model_executor.model_loader
import
get_model
from
vllm.model_executor.model_loader
import
get_model
from
vllm.model_executor.parallel_utils.parallel_state
import
(
destroy_model_parallel
,
initialize_model_parallel
)
def
cleanup
():
def
cleanup
():
...
@@ -144,16 +143,27 @@ def baichuan_lora_files():
...
@@ -144,16 +143,27 @@ def baichuan_lora_files():
return
snapshot_download
(
repo_id
=
"jeeejeee/baichuan7b-text2sql-spider"
)
return
snapshot_download
(
repo_id
=
"jeeejeee/baichuan7b-text2sql-spider"
)
@
pytest
.
fixture
(
scope
=
"session"
)
def
baichuan_zero_lora_files
():
# all the lora_B weights are initialized to zero.
return
snapshot_download
(
repo_id
=
"jeeejeee/baichuan7b-zero-init"
)
@
pytest
.
fixture
(
scope
=
"session"
)
def
tinyllama_lora_files
():
return
snapshot_download
(
repo_id
=
"jashing/tinyllama-colorist-lora"
)
@
pytest
.
fixture
@
pytest
.
fixture
def
llama_2_7b_engine_extra_embeddings
()
->
nn
.
Module
:
def
llama_2_7b_engine_extra_embeddings
()
->
nn
.
Module
:
cleanup
()
cleanup
()
get_model_old
=
get_model
get_model_old
=
get_model
def
get_model_patched
(
model_config
,
device_config
,
**
kwargs
):
def
get_model_patched
(
*
,
model_config
,
device_config
,
**
kwargs
):
return
get_model_old
(
model_config
,
kwargs
[
"lora_config"
]
=
LoRAConfig
(
max_loras
=
4
,
max_lora_rank
=
8
)
device
_config
,
return
get_model_old
(
model_config
=
model
_config
,
lora
_config
=
LoRAConfig
(
max_loras
=
4
,
device
_config
=
device_config
,
max_lora_rank
=
8
)
)
**
kwargs
)
with
patch
(
"vllm.worker.model_runner.get_model"
,
get_model_patched
):
with
patch
(
"vllm.worker.model_runner.get_model"
,
get_model_patched
):
engine
=
vllm
.
LLM
(
"meta-llama/Llama-2-7b-hf"
,
enable_lora
=
False
)
engine
=
vllm
.
LLM
(
"meta-llama/Llama-2-7b-hf"
,
enable_lora
=
False
)
...
...
tests/lora/test_baichuan.py
View file @
99b471c2
...
@@ -62,7 +62,7 @@ def test_baichuan_lora(baichuan_lora_files):
...
@@ -62,7 +62,7 @@ def test_baichuan_lora(baichuan_lora_files):
@
pytest
.
mark
.
skip
(
"Requires multiple GPUs"
)
@
pytest
.
mark
.
skip
(
"Requires multiple GPUs"
)
def
test_
llama
_tensor_parallel_equality
(
baichuan_lora_files
):
def
test_
baichuan
_tensor_parallel_equality
(
baichuan_lora_files
):
# Cannot use as it will initialize torch.cuda too early...
# Cannot use as it will initialize torch.cuda too early...
# if torch.cuda.device_count() < 4:
# if torch.cuda.device_count() < 4:
# pytest.skip(f"Not enough GPUs for tensor parallelism {4}")
# pytest.skip(f"Not enough GPUs for tensor parallelism {4}")
...
...
tests/lora/test_layers.py
View file @
99b471c2
...
@@ -170,7 +170,8 @@ def create_random_inputs(
...
@@ -170,7 +170,8 @@ def create_random_inputs(
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
@
pytest
.
mark
.
parametrize
(
"num_loras"
,
[
1
,
2
,
4
,
8
])
@
pytest
.
mark
.
parametrize
(
"num_loras"
,
[
1
,
2
,
4
,
8
])
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
def
test_embeddings
(
dist_init
,
num_loras
,
device
)
->
None
:
@
pytest
.
mark
.
parametrize
(
"vocab_size"
,
[
512
,
32000
,
64000
,
128000
])
def
test_embeddings
(
dist_init
,
num_loras
,
device
,
vocab_size
)
->
None
:
torch
.
set_default_device
(
device
)
torch
.
set_default_device
(
device
)
max_loras
=
8
max_loras
=
8
...
@@ -179,9 +180,9 @@ def test_embeddings(dist_init, num_loras, device) -> None:
...
@@ -179,9 +180,9 @@ def test_embeddings(dist_init, num_loras, device) -> None:
lora_dtype
=
torch
.
float16
)
lora_dtype
=
torch
.
float16
)
def
create_random_embedding_layer
():
def
create_random_embedding_layer
():
embedding
=
VocabParallelEmbedding
(
512
,
256
)
embedding
=
VocabParallelEmbedding
(
vocab_size
,
256
)
embedding
.
weight
.
data
=
torch
.
rand_like
(
embedding
.
weight
.
data
)
embedding
.
weight
.
data
=
torch
.
rand_like
(
embedding
.
weight
.
data
)
embedding
.
weight
.
data
[
512
:,
:]
=
0
embedding
.
weight
.
data
[
vocab_size
:,
:]
=
0
lora_embedding
=
VocabParallelEmbeddingWithLoRA
(
embedding
)
lora_embedding
=
VocabParallelEmbeddingWithLoRA
(
embedding
)
lora_embedding
.
create_lora_weights
(
max_loras
,
lora_config
)
lora_embedding
.
create_lora_weights
(
max_loras
,
lora_config
)
...
@@ -203,12 +204,13 @@ def test_embeddings(dist_init, num_loras, device) -> None:
...
@@ -203,12 +204,13 @@ def test_embeddings(dist_init, num_loras, device) -> None:
active_lora_ids
=
list
(
lora_dict
.
keys
()),
active_lora_ids
=
list
(
lora_dict
.
keys
()),
num_inputs
=
num_loras
*
3
,
num_inputs
=
num_loras
*
3
,
input_size
=
(
200
,
),
input_size
=
(
200
,
),
input_range
=
(
1
,
512
),
input_range
=
(
1
,
vocab_size
),
)
)
lora_mapping
=
LoRAMapping
(
index_mapping
,
prompt_mapping
)
lora_mapping
=
LoRAMapping
(
index_mapping
,
prompt_mapping
)
mapping_info
=
convert_mapping
(
lora_mapping
,
id_to_index
,
max_loras
,
mapping_info
=
convert_mapping
(
lora_mapping
,
id_to_index
,
max_loras
,
512
,
lora_config
.
lora_extra_vocab_size
)
vocab_size
,
lora_config
.
lora_extra_vocab_size
)
lora_embedding
.
set_mapping
(
*
mapping_info
)
lora_embedding
.
set_mapping
(
*
mapping_info
)
lora_result
=
lora_embedding
(
torch
.
cat
(
inputs
))
lora_result
=
lora_embedding
(
torch
.
cat
(
inputs
))
...
@@ -240,12 +242,13 @@ def test_embeddings(dist_init, num_loras, device) -> None:
...
@@ -240,12 +242,13 @@ def test_embeddings(dist_init, num_loras, device) -> None:
active_lora_ids
=
[
0
],
active_lora_ids
=
[
0
],
num_inputs
=
num_loras
*
3
,
num_inputs
=
num_loras
*
3
,
input_size
=
(
200
,
),
input_size
=
(
200
,
),
input_range
=
(
1
,
512
),
input_range
=
(
1
,
vocab_size
),
)
)
lora_mapping
=
LoRAMapping
(
index_mapping
,
prompt_mapping
)
lora_mapping
=
LoRAMapping
(
index_mapping
,
prompt_mapping
)
mapping_info
=
convert_mapping
(
lora_mapping
,
id_to_index
,
max_loras
,
mapping_info
=
convert_mapping
(
lora_mapping
,
id_to_index
,
max_loras
,
512
,
lora_config
.
lora_extra_vocab_size
)
vocab_size
,
lora_config
.
lora_extra_vocab_size
)
lora_embedding
.
set_mapping
(
*
mapping_info
,
)
lora_embedding
.
set_mapping
(
*
mapping_info
,
)
lora_result
=
lora_embedding
(
torch
.
cat
(
inputs
))
lora_result
=
lora_embedding
(
torch
.
cat
(
inputs
))
...
@@ -263,7 +266,9 @@ def test_embeddings(dist_init, num_loras, device) -> None:
...
@@ -263,7 +266,9 @@ def test_embeddings(dist_init, num_loras, device) -> None:
# reason="Fails when loras are in any slot other than the first.")
# reason="Fails when loras are in any slot other than the first.")
@
pytest
.
mark
.
parametrize
(
"num_loras"
,
[
1
,
2
,
4
,
8
])
@
pytest
.
mark
.
parametrize
(
"num_loras"
,
[
1
,
2
,
4
,
8
])
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
def
test_embeddings_with_new_embeddings
(
dist_init
,
num_loras
,
device
)
->
None
:
@
pytest
.
mark
.
parametrize
(
"vocab_size"
,
[
512
,
32000
,
64000
,
128000
])
def
test_embeddings_with_new_embeddings
(
dist_init
,
num_loras
,
device
,
vocab_size
)
->
None
:
torch
.
set_default_device
(
device
)
torch
.
set_default_device
(
device
)
max_loras
=
8
max_loras
=
8
...
@@ -272,15 +277,15 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device) -> None:
...
@@ -272,15 +277,15 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device) -> None:
lora_dtype
=
torch
.
float16
)
lora_dtype
=
torch
.
float16
)
def
create_random_embedding_layer
():
def
create_random_embedding_layer
():
embedding
=
VocabParallelEmbedding
(
512
,
256
)
embedding
=
VocabParallelEmbedding
(
vocab_size
,
256
)
embedding_data
=
torch
.
rand_like
(
embedding
.
weight
.
data
)
embedding_data
=
torch
.
rand_like
(
embedding
.
weight
.
data
)
embedding
.
weight
.
data
=
embedding_data
embedding
.
weight
.
data
=
embedding_data
embedding
.
weight
.
data
[
512
:,
:]
=
0
embedding
.
weight
.
data
[
vocab_size
:,
:]
=
0
expanded_embedding
=
VocabParallelEmbedding
(
expanded_embedding
=
VocabParallelEmbedding
(
512
+
lora_config
.
lora_extra_vocab_size
*
max_loras
,
vocab_size
+
lora_config
.
lora_extra_vocab_size
*
max_loras
,
256
,
256
,
org_num_embeddings
=
512
)
org_num_embeddings
=
vocab_size
)
expanded_embedding
.
weight
.
data
[:
512
,
:]
=
embedding_data
expanded_embedding
.
weight
.
data
[:
vocab_size
,
:]
=
embedding_data
# We need to deepcopy the embedding as it will be modified
# We need to deepcopy the embedding as it will be modified
# in place
# in place
lora_embedding
=
VocabParallelEmbeddingWithLoRA
(
lora_embedding
=
VocabParallelEmbeddingWithLoRA
(
...
@@ -298,7 +303,7 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device) -> None:
...
@@ -298,7 +303,7 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device) -> None:
id_to_index
,
id_to_index
,
layer
=
lora_embedding
,
layer
=
lora_embedding
,
layer_weights
=
torch
.
zeros
(
layer_weights
=
torch
.
zeros
(
(
256
,
512
+
lora_config
.
lora_extra_vocab_size
)),
(
256
,
vocab_size
+
lora_config
.
lora_extra_vocab_size
)),
generate_embeddings_tensor
=
256
,
generate_embeddings_tensor
=
256
,
)
)
...
@@ -316,7 +321,7 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device) -> None:
...
@@ -316,7 +321,7 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device) -> None:
active_lora_ids
=
list
(
lora_dict
.
keys
()),
active_lora_ids
=
list
(
lora_dict
.
keys
()),
num_inputs
=
num_loras
*
3
,
num_inputs
=
num_loras
*
3
,
input_size
=
(
200
,
),
input_size
=
(
200
,
),
input_range
=
(
1
,
512
),
input_range
=
(
1
,
vocab_size
),
)
)
lora_mapping
=
LoRAMapping
(
index_mapping
,
prompt_mapping
)
lora_mapping
=
LoRAMapping
(
index_mapping
,
prompt_mapping
)
...
@@ -327,16 +332,18 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device) -> None:
...
@@ -327,16 +332,18 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device) -> None:
for
input_
,
original_input_
,
lora_id
in
zip
(
inputs
,
original_inputs
,
for
input_
,
original_input_
,
lora_id
in
zip
(
inputs
,
original_inputs
,
prompt_mapping
):
prompt_mapping
):
embedding_id
=
lora_id
-
1
embedding_id
=
lora_id
-
1
input_
[
-
1
]
=
512
+
(
embedding_id
*
embeddings_tensor_len
)
input_
[
-
1
]
=
vocab_size
+
(
embedding_id
*
embeddings_tensor_len
)
original_input_
[
-
1
]
=
512
original_input_
[
-
1
]
=
vocab_size
input_
[
-
2
]
=
512
+
((
embedding_id
+
1
)
*
embeddings_tensor_len
-
1
)
input_
[
-
2
]
=
vocab_size
+
(
original_input_
[
-
2
]
=
512
+
embeddings_tensor_len
-
1
(
embedding_id
+
1
)
*
embeddings_tensor_len
-
1
)
original_input_
[
-
2
]
=
vocab_size
+
embeddings_tensor_len
-
1
mapping_info
=
convert_mapping
(
lora_mapping
,
id_to_index
,
max_loras
,
mapping_info
=
convert_mapping
(
lora_mapping
,
id_to_index
,
max_loras
,
512
,
lora_config
.
lora_extra_vocab_size
)
vocab_size
,
lora_config
.
lora_extra_vocab_size
)
lora_embedding
.
set_mapping
(
*
mapping_info
,
)
lora_embedding
.
set_mapping
(
*
mapping_info
,
)
expanded_embedding
.
weight
[
512
:
512
+
expanded_embedding
.
weight
[
vocab_size
:
vocab_size
+
(
embeddings_tensor_len
*
(
embeddings_tensor_len
*
max_loras
)]
=
torch
.
cat
(
embeddings_tensors
)
max_loras
)]
=
torch
.
cat
(
embeddings_tensors
)
...
@@ -370,14 +377,15 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device) -> None:
...
@@ -370,14 +377,15 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device) -> None:
active_lora_ids
=
[
0
],
active_lora_ids
=
[
0
],
num_inputs
=
num_loras
*
3
,
num_inputs
=
num_loras
*
3
,
input_size
=
(
200
,
),
input_size
=
(
200
,
),
input_range
=
(
1
,
512
),
input_range
=
(
1
,
vocab_size
),
)
)
lora_mapping
=
LoRAMapping
(
index_mapping
,
prompt_mapping
)
lora_mapping
=
LoRAMapping
(
index_mapping
,
prompt_mapping
)
original_inputs
=
deepcopy
(
inputs
)
original_inputs
=
deepcopy
(
inputs
)
mapping_info
=
convert_mapping
(
lora_mapping
,
id_to_index
,
max_loras
,
mapping_info
=
convert_mapping
(
lora_mapping
,
id_to_index
,
max_loras
,
512
,
lora_config
.
lora_extra_vocab_size
)
vocab_size
,
lora_config
.
lora_extra_vocab_size
)
lora_embedding
.
set_mapping
(
*
mapping_info
,
)
lora_embedding
.
set_mapping
(
*
mapping_info
,
)
lora_result
=
lora_embedding
(
torch
.
cat
(
original_inputs
))
lora_result
=
lora_embedding
(
torch
.
cat
(
original_inputs
))
...
@@ -393,7 +401,9 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device) -> None:
...
@@ -393,7 +401,9 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device) -> None:
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
@
pytest
.
mark
.
parametrize
(
"num_loras"
,
[
1
,
2
,
4
,
8
])
@
pytest
.
mark
.
parametrize
(
"num_loras"
,
[
1
,
2
,
4
,
8
])
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
def
test_lm_head_logits_processor
(
dist_init
,
num_loras
,
device
)
->
None
:
@
pytest
.
mark
.
parametrize
(
"vocab_size"
,
[
512
,
32000
,
64000
,
128000
])
def
test_lm_head_logits_processor
(
dist_init
,
num_loras
,
device
,
vocab_size
)
->
None
:
torch
.
set_default_device
(
device
)
torch
.
set_default_device
(
device
)
max_loras
=
8
max_loras
=
8
...
@@ -402,12 +412,14 @@ def test_lm_head_logits_processor(dist_init, num_loras, device) -> None:
...
@@ -402,12 +412,14 @@ def test_lm_head_logits_processor(dist_init, num_loras, device) -> None:
lora_dtype
=
torch
.
float16
)
lora_dtype
=
torch
.
float16
)
def
_pretest
():
def
_pretest
():
linear
=
ParallelLMHead
(
32000
+
lora_config
.
lora_extra_vocab_size
,
linear
=
ParallelLMHead
(
vocab_size
+
lora_config
.
lora_extra_vocab_size
,
1024
,
32000
)
1024
,
vocab_size
,
params_dtype
=
torch
.
float16
)
linear
.
weight
.
data
=
torch
.
rand_like
(
linear
.
weight
.
data
)
linear
.
weight
.
data
=
torch
.
rand_like
(
linear
.
weight
.
data
)
linear
.
weight
.
data
[:,
32000
:]
=
0
linear
.
weight
.
data
[:,
vocab_size
:]
=
0
logits_processor
=
LogitsProcessor
(
logits_processor
=
LogitsProcessor
(
32000
+
lora_config
.
lora_extra_vocab_size
,
32000
)
vocab_size
+
lora_config
.
lora_extra_vocab_size
,
vocab_size
)
lora_logits_processor
=
LogitsProcessorWithLoRA
(
lora_logits_processor
=
LogitsProcessorWithLoRA
(
logits_processor
,
1024
,
linear
.
weight
.
dtype
,
linear
.
weight
.
device
)
logits_processor
,
1024
,
linear
.
weight
.
dtype
,
linear
.
weight
.
device
)
lora_logits_processor
.
create_lora_weights
(
max_loras
,
lora_config
)
lora_logits_processor
.
create_lora_weights
(
max_loras
,
lora_config
)
...
@@ -435,7 +447,7 @@ def test_lm_head_logits_processor(dist_init, num_loras, device) -> None:
...
@@ -435,7 +447,7 @@ def test_lm_head_logits_processor(dist_init, num_loras, device) -> None:
num_inputs
=
8
*
num_loras
,
# * 3,
num_inputs
=
8
*
num_loras
,
# * 3,
input_size
=
(
1
,
1024
),
input_size
=
(
1
,
1024
),
input_range
=
(
0
,
1
),
input_range
=
(
0
,
1
),
input_type
=
torch
.
float
32
,
input_type
=
torch
.
float
16
,
)
)
lora_mapping
=
LoRAMapping
(
index_mapping
,
prompt_mapping
)
lora_mapping
=
LoRAMapping
(
index_mapping
,
prompt_mapping
)
...
@@ -444,7 +456,7 @@ def test_lm_head_logits_processor(dist_init, num_loras, device) -> None:
...
@@ -444,7 +456,7 @@ def test_lm_head_logits_processor(dist_init, num_loras, device) -> None:
lora_mapping
,
lora_mapping
,
id_to_index
,
id_to_index
,
max_loras
,
max_loras
,
32000
,
vocab_size
,
lora_config
.
lora_extra_vocab_size
,
lora_config
.
lora_extra_vocab_size
,
)
)
lora_logits_processor
.
set_mapping
(
*
mapping_info
,
)
lora_logits_processor
.
set_mapping
(
*
mapping_info
,
)
...
@@ -460,7 +472,7 @@ def test_lm_head_logits_processor(dist_init, num_loras, device) -> None:
...
@@ -460,7 +472,7 @@ def test_lm_head_logits_processor(dist_init, num_loras, device) -> None:
org_vocab_size
:
logits_processor
.
org_vocab_size
+
org_vocab_size
:
logits_processor
.
org_vocab_size
+
embeddings_tensor_len
]
=
embeddings_tensor
embeddings_tensor_len
]
=
embeddings_tensor
logits_processor
.
org_vocab_size
=
(
32000
+
logits_processor
.
org_vocab_size
=
(
vocab_size
+
lora_config
.
lora_extra_vocab_size
)
lora_config
.
lora_extra_vocab_size
)
expected_results
=
[]
expected_results
=
[]
for
input_
,
lora_id
in
zip
(
inputs
,
prompt_mapping
):
for
input_
,
lora_id
in
zip
(
inputs
,
prompt_mapping
):
...
@@ -468,11 +480,11 @@ def test_lm_head_logits_processor(dist_init, num_loras, device) -> None:
...
@@ -468,11 +480,11 @@ def test_lm_head_logits_processor(dist_init, num_loras, device) -> None:
result
=
logits_processor
.
_get_logits
(
hidden_states
=
input_
,
result
=
logits_processor
.
_get_logits
(
hidden_states
=
input_
,
embedding
=
linear
.
weight
,
embedding
=
linear
.
weight
,
embedding_bias
=
None
)
embedding_bias
=
None
)
result
[:,
32000
+
embeddings_tensor_len
:]
=
float
(
"-inf"
)
result
[:,
vocab_size
+
embeddings_tensor_len
:]
=
float
(
"-inf"
)
result
+=
input_
@
lora
.
lora_a
@
lora
.
lora_b
*
lora
.
scaling
result
+=
input_
@
lora
.
lora_a
@
lora
.
lora_b
*
lora
.
scaling
expected_results
.
append
(
result
)
expected_results
.
append
(
result
)
expected_result
=
torch
.
cat
(
expected_results
)
expected_result
=
torch
.
cat
(
expected_results
)
logits_processor
.
org_vocab_size
=
32000
logits_processor
.
org_vocab_size
=
vocab_size
# Check that resetting the lora weights succeeds
# Check that resetting the lora weights succeeds
...
@@ -484,19 +496,19 @@ def test_lm_head_logits_processor(dist_init, num_loras, device) -> None:
...
@@ -484,19 +496,19 @@ def test_lm_head_logits_processor(dist_init, num_loras, device) -> None:
num_inputs
=
8
*
num_loras
*
3
,
num_inputs
=
8
*
num_loras
*
3
,
input_size
=
(
1
,
1024
),
input_size
=
(
1
,
1024
),
input_range
=
(
0
,
1
),
input_range
=
(
0
,
1
),
input_type
=
torch
.
float
32
,
input_type
=
torch
.
float
16
,
)
)
lora_mapping
=
LoRAMapping
(
index_mapping
,
prompt_mapping
)
lora_mapping
=
LoRAMapping
(
index_mapping
,
prompt_mapping
)
mapping_info
=
convert_mapping
(
lora_mapping
,
id_to_index
,
max_loras
,
mapping_info
=
convert_mapping
(
lora_mapping
,
id_to_index
,
max_loras
,
32000
,
vocab_size
,
lora_config
.
lora_extra_vocab_size
)
lora_config
.
lora_extra_vocab_size
)
lora_logits_processor
.
set_mapping
(
*
mapping_info
,
)
lora_logits_processor
.
set_mapping
(
*
mapping_info
,
)
lora_result
=
lora_logits_processor
.
_get_logits
(
lora_result
=
lora_logits_processor
.
_get_logits
(
hidden_states
=
torch
.
cat
(
inputs
),
hidden_states
=
torch
.
cat
(
inputs
),
embedding
=
original_weight
,
embedding
=
original_weight
,
embedding_bias
=
None
)[:,
:
32000
]
embedding_bias
=
None
)[:,
:
vocab_size
]
expected_result
=
logits_processor
.
_get_logits
(
expected_result
=
logits_processor
.
_get_logits
(
hidden_states
=
torch
.
cat
(
inputs
),
hidden_states
=
torch
.
cat
(
inputs
),
embedding
=
original_weight
,
embedding
=
original_weight
,
...
@@ -523,11 +535,17 @@ def test_linear_parallel(dist_init, num_loras, orientation, device) -> None:
...
@@ -523,11 +535,17 @@ def test_linear_parallel(dist_init, num_loras, orientation, device) -> None:
def
create_random_linear_parallel_layer
():
def
create_random_linear_parallel_layer
():
if
orientation
==
"row"
:
if
orientation
==
"row"
:
linear
=
RowParallelLinear
(
4096
,
4096
,
bias
=
False
)
linear
=
RowParallelLinear
(
4096
,
4096
,
bias
=
False
,
params_dtype
=
torch
.
float16
)
linear
.
weight
.
data
=
torch
.
rand_like
(
linear
.
weight
.
data
)
linear
.
weight
.
data
=
torch
.
rand_like
(
linear
.
weight
.
data
)
lora_linear
=
RowParallelLinearWithLoRA
(
linear
)
lora_linear
=
RowParallelLinearWithLoRA
(
linear
)
else
:
else
:
linear
=
ColumnParallelLinear
(
4096
,
4096
,
bias
=
False
)
linear
=
ColumnParallelLinear
(
4096
,
4096
,
bias
=
False
,
params_dtype
=
torch
.
float16
)
linear
.
weight
.
data
=
torch
.
rand_like
(
linear
.
weight
.
data
)
linear
.
weight
.
data
=
torch
.
rand_like
(
linear
.
weight
.
data
)
lora_linear
=
ColumnParallelLinearWithLoRA
(
linear
)
lora_linear
=
ColumnParallelLinearWithLoRA
(
linear
)
lora_linear
.
create_lora_weights
(
max_loras
,
lora_config
)
lora_linear
.
create_lora_weights
(
max_loras
,
lora_config
)
...
@@ -551,7 +569,7 @@ def test_linear_parallel(dist_init, num_loras, orientation, device) -> None:
...
@@ -551,7 +569,7 @@ def test_linear_parallel(dist_init, num_loras, orientation, device) -> None:
num_inputs
=
32
*
num_loras
,
num_inputs
=
32
*
num_loras
,
input_size
=
(
1
,
4096
),
input_size
=
(
1
,
4096
),
input_range
=
(
0
,
1
),
input_range
=
(
0
,
1
),
input_type
=
torch
.
float
32
,
input_type
=
torch
.
float
16
,
)
)
lora_mapping
=
LoRAMapping
(
index_mapping
,
prompt_mapping
)
lora_mapping
=
LoRAMapping
(
index_mapping
,
prompt_mapping
)
...
@@ -590,7 +608,7 @@ def test_linear_parallel(dist_init, num_loras, orientation, device) -> None:
...
@@ -590,7 +608,7 @@ def test_linear_parallel(dist_init, num_loras, orientation, device) -> None:
num_inputs
=
32
*
num_loras
,
num_inputs
=
32
*
num_loras
,
input_size
=
(
1
,
4096
),
input_size
=
(
1
,
4096
),
input_range
=
(
0
,
1
),
input_range
=
(
0
,
1
),
input_type
=
torch
.
float
32
,
input_type
=
torch
.
float
16
,
)
)
lora_mapping
=
LoRAMapping
(
index_mapping
,
prompt_mapping
)
lora_mapping
=
LoRAMapping
(
index_mapping
,
prompt_mapping
)
...
@@ -623,15 +641,24 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, device) -> None:
...
@@ -623,15 +641,24 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, device) -> None:
def
create_column_parallel_packed_layer
():
def
create_column_parallel_packed_layer
():
if
repeats
==
2
:
if
repeats
==
2
:
linear
=
MergedColumnParallelLinear
(
4096
,
[
4096
]
*
repeats
,
linear
=
MergedColumnParallelLinear
(
4096
,
[
4096
]
*
repeats
,
bias
=
False
)
bias
=
False
,
params_dtype
=
torch
.
float16
)
linear
.
weight
.
data
=
torch
.
rand_like
(
linear
.
weight
.
data
)
linear
.
weight
.
data
=
torch
.
rand_like
(
linear
.
weight
.
data
)
lora_linear
=
MergedColumnParallelLinearWithLoRA
(
linear
)
lora_linear
=
MergedColumnParallelLinearWithLoRA
(
linear
)
elif
repeats
==
3
:
elif
repeats
==
3
:
linear
=
QKVParallelLinear
(
4096
,
64
,
32
,
bias
=
False
)
linear
=
QKVParallelLinear
(
4096
,
64
,
32
,
bias
=
False
,
params_dtype
=
torch
.
float16
)
linear
.
weight
.
data
=
torch
.
rand_like
(
linear
.
weight
.
data
)
linear
.
weight
.
data
=
torch
.
rand_like
(
linear
.
weight
.
data
)
lora_linear
=
MergedQKVParallelLinearWithLora
(
linear
)
lora_linear
=
MergedQKVParallelLinearWithLora
(
linear
)
else
:
else
:
linear
=
QKVParallelLinear
(
4096
,
64
,
32
,
bias
=
False
)
linear
=
QKVParallelLinear
(
4096
,
64
,
32
,
bias
=
False
,
params_dtype
=
torch
.
float16
)
linear
.
weight
.
data
=
torch
.
rand_like
(
linear
.
weight
.
data
)
linear
.
weight
.
data
=
torch
.
rand_like
(
linear
.
weight
.
data
)
lora_linear
=
QKVParallelLinearWithLora
(
linear
)
lora_linear
=
QKVParallelLinearWithLora
(
linear
)
...
@@ -666,7 +693,7 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, device) -> None:
...
@@ -666,7 +693,7 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, device) -> None:
num_inputs
=
32
*
num_loras
,
num_inputs
=
32
*
num_loras
,
input_size
=
(
1
,
4096
),
input_size
=
(
1
,
4096
),
input_range
=
(
0
,
1
),
input_range
=
(
0
,
1
),
input_type
=
torch
.
float
32
,
input_type
=
torch
.
float
16
,
)
)
lora_mapping
=
LoRAMapping
(
index_mapping
,
prompt_mapping
)
lora_mapping
=
LoRAMapping
(
index_mapping
,
prompt_mapping
)
...
@@ -706,7 +733,7 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, device) -> None:
...
@@ -706,7 +733,7 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, device) -> None:
num_inputs
=
32
*
num_loras
,
num_inputs
=
32
*
num_loras
,
input_size
=
(
1
,
4096
),
input_size
=
(
1
,
4096
),
input_range
=
(
0
,
1
),
input_range
=
(
0
,
1
),
input_type
=
torch
.
float
32
,
input_type
=
torch
.
float
16
,
)
)
lora_mapping
=
LoRAMapping
(
index_mapping
,
prompt_mapping
)
lora_mapping
=
LoRAMapping
(
index_mapping
,
prompt_mapping
)
...
...
tests/lora/test_lora_checkpoints.py
0 → 100644
View file @
99b471c2
import
pytest
from
vllm.lora.models
import
LoRAModel
from
vllm.model_executor.models.baichuan
import
BaiChuanBaseForCausalLM
# Names of the LoRA checkpoints exercised by test_load_checkpoints.
lora_lst = ["baichuan7B", "baichuan7B-zero", "chatglm3-6b"]


@pytest.mark.parametrize("lora_name", lora_lst)
def test_load_checkpoints(
    lora_name,
    baichuan_lora_files,
    baichuan_zero_lora_files,
    chatglm3_lora_files,
):
    """Load LoRA checkpoints against the Baichuan module list.

    The two Baichuan checkpoints are expected to load without error. The
    chatglm3 checkpoint is expected to be rejected with a ``ValueError``.

    Args:
        lora_name: Which checkpoint to load; one of ``lora_lst``.
        baichuan_lora_files: Checkpoint passed for ``"baichuan7B"``.
        baichuan_zero_lora_files: Checkpoint passed for ``"baichuan7B-zero"``.
        chatglm3_lora_files: Checkpoint passed for any other name.
    """
    supported_lora_modules = BaiChuanBaseForCausalLM.supported_lora_modules
    packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping
    embedding_modules = BaiChuanBaseForCausalLM.embedding_modules
    embed_padding_modules = BaiChuanBaseForCausalLM.embedding_padding_modules

    # A packed module is replaced by the entries it maps to; every other
    # supported module is expected under its own name.
    expected_lora_modules = []
    for module in supported_lora_modules:
        if module in packed_modules_mapping:
            expected_lora_modules.extend(packed_modules_mapping[module])
        else:
            expected_lora_modules.append(module)

    def load_checkpoint(lora_files):
        # Every case uses the same arguments; only the checkpoint differs.
        return LoRAModel.from_local_checkpoint(
            lora_files,
            expected_lora_modules,
            lora_model_id=1,
            device="cpu",
            embedding_modules=embedding_modules,
            embedding_padding_modules=embed_padding_modules)

    if lora_name == "baichuan7B":
        # For the baichuan7B model, load its LoRA,
        # and the test should pass.
        load_checkpoint(baichuan_lora_files)
    elif lora_name == "baichuan7B-zero":
        # Test that the target_modules contain a prefix
        # such as "model.layers.0.self_atten.W_pack", and
        # the test should pass.
        load_checkpoint(baichuan_zero_lora_files)
    else:
        # For the baichuan7B model, load chatglm3-6b's LoRA,
        # and the test should raise the following error.
        expected_error = "Please verify that the loaded LoRA module is correct"  # noqa: E501
        with pytest.raises(ValueError, match=expected_error):
            load_checkpoint(chatglm3_lora_files)
tests/lora/test_punica.py
View file @
99b471c2
...
@@ -43,10 +43,53 @@ def _lora_ref_impl(
...
@@ -43,10 +43,53 @@ def _lora_ref_impl(
H1
=
H2
=
[
H1
=
H2
=
[
128
,
256
,
512
,
1024
,
1152
,
1280
,
1536
,
2048
,
2304
,
2560
,
2752
,
3072
,
3456
,
128
,
3584
,
4096
,
4608
,
5120
,
5504
,
5632
,
6144
,
6848
,
6912
,
7168
,
8192
,
9216
,
256
,
10240
,
11008
,
13824
,
14336
,
22016
,
24576
,
27392
,
32000
,
32256
,
32512
,
512
,
32768
,
33024
1024
,
1152
,
1280
,
1536
,
2048
,
2304
,
2560
,
2752
,
3072
,
3456
,
3584
,
4096
,
4608
,
5120
,
5504
,
5632
,
6144
,
6848
,
6912
,
7168
,
8192
,
9216
,
10240
,
11008
,
13824
,
14336
,
15360
,
22016
,
24576
,
27392
,
32000
,
32256
,
32512
,
32768
,
33024
,
36864
,
43264
,
49152
,
64000
,
64256
,
102400
,
102656
,
128000
,
128256
,
]
]
SEED
=
[
0xabcdabcd987
]
SEED
=
[
0xabcdabcd987
]
CUDA_DEVICES
=
[
CUDA_DEVICES
=
[
...
...
tests/lora/test_quant_model.py
0 → 100644
View file @
99b471c2
# Adapted from
# https://github.com/fmmoret/vllm/blob/fm-support-lora-on-quantized-models/tests/lora/test_llama.py
from dataclasses import dataclass
from typing import List, Optional

import pytest

import vllm
from vllm.lora.request import LoRARequest

from .conftest import cleanup
@dataclass
class ModelWithQuantization:
    """A model identifier paired with the quantization method to load it with."""

    # Passed as ``model=`` when the engine is constructed.
    model_path: str
    # Passed as ``quantization=``; the test also handles ``None`` here, so the
    # annotation must admit it.
    quantization: Optional[str]


# Models the quantized-LoRA test is parametrized over.
MODELS: List[ModelWithQuantization] = [
    ModelWithQuantization(
        model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",
        quantization="AWQ",
    ),
    ModelWithQuantization(
        model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
        quantization="GPTQ",
    ),
]
def do_sample(llm, lora_path: str, lora_id: int, max_tokens=256):
    """Run two fixed colour prompts through ``llm`` and return the texts.

    Args:
        llm: Object whose ``generate(prompts, sampling_params,
            lora_request=...)`` method is called once.
        lora_path: Path handed to the ``LoRARequest``.
        lora_id: Adapter id; a falsy value (e.g. ``0``) sends no LoRA request.
        max_tokens: Forwarded to ``vllm.SamplingParams``.

    Returns:
        A list with the text of the first output of each generation result.
    """
    questions = (
        "Give me an orange-ish brown color",
        "Give me a neon pink color",
    )
    # Wrap each question in the chat markers before sending it to the model.
    prompts = [
        f"<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n"
        for question in questions
    ]

    sampling_params = vllm.SamplingParams(temperature=0,
                                          max_tokens=max_tokens,
                                          stop=["<|im_end|>"])

    # Only attach an adapter when a non-zero id was requested.
    request = None
    if lora_id:
        request = LoRARequest(str(lora_id), lora_id, lora_path)

    results = llm.generate(prompts, sampling_params, lora_request=request)

    # Echo every prompt/completion pair while collecting the completions.
    texts = []
    for result in results:
        shown_prompt = result.prompt
        text = result.outputs[0].text
        texts.append(text)
        print(f"Prompt: {shown_prompt!r}, Generated text: {text!r}")
    return texts
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", [1])
def test_quant_model_lora(tinyllama_lora_files, model, tp_size):
    """Check that a LoRA adapter changes a quantized model's output.

    Four generations are run on one engine, alternating between no adapter
    (``lora_id=0``) and the adapter registered under ids 1 and 2, and each
    result is compared with the expectation for the model's quantization.
    """
    # Cannot use as it will initialize torch.cuda too early...
    # if torch.cuda.device_count() < tp_size:
    #    pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")

    engine_options = dict(
        model=model.model_path,
        enable_lora=True,
        max_num_seqs=16,
        max_loras=4,
        max_model_len=400,
        tensor_parallel_size=tp_size,
        quantization=model.quantization,
        trust_remote_code=True,
    )
    llm = vllm.LLM(**engine_options)

    # Expected completions per quantization scheme.
    if model.quantization is None:
        expected_no_lora_output = [
            "Here are some examples of orange-brown colors",
            "I'm sorry, I don't have"
        ]
        expected_lora_output = [
            "#ff8050",
            "#ff8080",
        ]
    elif model.quantization == "AWQ":
        expected_no_lora_output = [
            "I'm sorry, I don't understand",
            "I'm sorry, I don't understand",
        ]
        expected_lora_output = [
            "#f07700: A v",
            "#f00000: A v",
        ]
    elif model.quantization == "GPTQ":
        expected_no_lora_output = [
            "I'm sorry, I don't have",
            "I'm sorry, I don't have",
        ]
        expected_lora_output = [
            "#f08800: This is",
            "#f07788\n#",
        ]

    def expect_match(output, expected_output):
        # The identity check singles out calls made with the LoRA expectation.
        relaxed_check = (model.quantization == "GPTQ"
                         and expected_output is expected_lora_output)
        if not relaxed_check:
            assert output == expected_output
            return
        # HACK: GPTQ lora outputs are just incredibly unstable.
        # Assert that the outputs changed.
        assert output != expected_no_lora_output
        for i, o in enumerate(output):
            assert o.startswith(
                '#'), f"Expected example {i} to start with # but got {o}"

    max_tokens = 10

    # (banner to print, adapter id, expectation), run in this order.
    steps = (
        ("lora adapter created", 0, expected_no_lora_output),
        ("lora 1", 1, expected_lora_output),
        ("no lora", 0, expected_no_lora_output),
        ("lora 2", 2, expected_lora_output),
    )
    for banner, adapter_id, expected in steps:
        print(banner)
        output = do_sample(llm,
                           tinyllama_lora_files,
                           lora_id=adapter_id,
                           max_tokens=max_tokens)
        expect_match(output, expected)

    print("removing lora")

    # Drop the engine before calling the shared cleanup helper.
    del llm
    cleanup()
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.skip("Requires multiple GPUs")
def test_quant_model_tp_equality(tinyllama_lora_files, model):
    """Check LoRA output is identical with tensor parallel size 1 and 2.

    The test is skipped unconditionally by the ``skip`` marker above.
    Both engines are built by one helper so that ``tensor_parallel_size`` is
    the only setting that differs between the two runs.
    """
    # Cannot use as it will initialize torch.cuda too early...
    # if torch.cuda.device_count() < 2:
    #    pytest.skip(f"Not enough GPUs for tensor parallelism {2}")

    def sample_with_tp(tp_size):
        """Build an engine with the given TP size, sample with adapter 1."""
        llm = vllm.LLM(model=model.model_path,
                       enable_lora=True,
                       max_num_seqs=16,
                       max_loras=4,
                       tensor_parallel_size=tp_size,
                       quantization=model.quantization,
                       trust_remote_code=True)
        try:
            return do_sample(llm, tinyllama_lora_files, lora_id=1)
        finally:
            # Release the engine even if sampling fails, so the next engine
            # does not start while this one is still referenced.
            del llm
            cleanup()

    output_tp1 = sample_with_tp(1)
    output_tp2 = sample_with_tp(2)

    assert output_tp1 == output_tp2
tests/lora/test_worker.py
View file @
99b471c2
...
@@ -3,8 +3,8 @@ import random
...
@@ -3,8 +3,8 @@ import random
import
tempfile
import
tempfile
from
unittest.mock
import
patch
from
unittest.mock
import
patch
from
vllm.config
import
(
Devic
eConfig
,
LoRA
Config
,
Model
Config
,
Parallel
Config
,
from
vllm.config
import
(
Cach
eConfig
,
Device
Config
,
Load
Config
,
LoRA
Config
,
SchedulerConfig
)
ModelConfig
,
ParallelConfig
,
SchedulerConfig
)
from
vllm.lora.models
import
LoRAMapping
from
vllm.lora.models
import
LoRAMapping
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.request
import
LoRARequest
from
vllm.worker.worker
import
Worker
from
vllm.worker.worker
import
Worker
...
@@ -18,15 +18,21 @@ def test_worker_apply_lora(sql_lora_files):
...
@@ -18,15 +18,21 @@ def test_worker_apply_lora(sql_lora_files):
"meta-llama/Llama-2-7b-hf"
,
"meta-llama/Llama-2-7b-hf"
,
tokenizer_mode
=
"auto"
,
tokenizer_mode
=
"auto"
,
trust_remote_code
=
False
,
trust_remote_code
=
False
,
download_dir
=
None
,
load_format
=
"dummy"
,
seed
=
0
,
seed
=
0
,
dtype
=
"float16"
,
dtype
=
"float16"
,
revision
=
None
,
revision
=
None
,
),
),
load_config
=
LoadConfig
(
download_dir
=
None
,
load_format
=
"dummy"
,
),
parallel_config
=
ParallelConfig
(
1
,
1
,
False
),
parallel_config
=
ParallelConfig
(
1
,
1
,
False
),
scheduler_config
=
SchedulerConfig
(
32
,
32
,
32
),
scheduler_config
=
SchedulerConfig
(
32
,
32
,
32
),
device_config
=
DeviceConfig
(
"cuda"
),
device_config
=
DeviceConfig
(
"cuda"
),
cache_config
=
CacheConfig
(
block_size
=
16
,
gpu_memory_utilization
=
1.
,
swap_space
=
0
,
cache_dtype
=
"auto"
),
local_rank
=
0
,
local_rank
=
0
,
rank
=
0
,
rank
=
0
,
lora_config
=
LoRAConfig
(
max_lora_rank
=
8
,
max_cpu_loras
=
32
,
lora_config
=
LoRAConfig
(
max_lora_rank
=
8
,
max_cpu_loras
=
32
,
...
...
tests/model_executor/weight_utils.py
0 → 100644
View file @
99b471c2
import
os
import
huggingface_hub.constants
import
pytest
from
vllm.model_executor.model_loader.weight_utils
import
enable_hf_transfer
def test_hf_transfer_auto_activation():
    """Check ``enable_hf_transfer`` turns the hub flag on iff hf_transfer imports.

    Skipped when ``HF_HUB_ENABLE_HF_TRANSFER`` is already present in the
    environment.
    """
    if "HF_HUB_ENABLE_HF_TRANSFER" in os.environ:
        # in case it is already set, we can't test the auto activation
        pytest.skip(
            "HF_HUB_ENABLE_HF_TRANSFER is set, can't test auto activation")

    enable_hf_transfer()

    # The expected flag value mirrors whether the optional package imports.
    try:
        import hf_transfer  # type: ignore # noqa
    except ImportError:
        expected_active = False
    else:
        expected_active = True

    assert (
        huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER == expected_active)
# Allow running the check directly as a script.
if __name__ == "__main__":
    test_hf_transfer_auto_activation()
tests/models/test_aqlm.py
0 → 100644
View file @
99b471c2
"""Compare the outputs of a AQLM model between vLLM and HF Transformers
Run `pytest tests/models/test_aqlm.py`.
"""
import
pytest
import
torch
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
# Fold the (major, minor) pair returned by torch into one integer
# (major * 10 + minor) so it can be compared with a minimum capability.
capability = torch.cuda.get_device_capability()
capability = capability[0] * 10 + capability[1]
# True when the device capability is below the minimum reported for "aqlm";
# used as the skip condition of the test below.
aqlm_not_supported = (capability <
                      QUANTIZATION_METHODS["aqlm"].get_min_capability())
# In this test we hardcode prompts and generations for the model so we don't
# need to require the AQLM package as a dependency
# NOTE(review): test_models below takes a parameter that is also named
# example_prompts, so inside the test that argument shadows this list.
# Confirm the two stay in sync.
example_prompts = [
    'vLLM is a high-throughput and memory-efficient inference and serving '
    'engine for LLMs.\n',
    'Briefly describe the major milestones in the development of artificial '
    'intelligence from 1950 to 2020.\n',
    'Compare and contrast artificial intelligence with human intelligence in '
    'terms of processing information.\n',
    'Describe the basic components of a neural network and how it can be '
    'trained.\n',
    'Write a short story about a robot that dreams for the first time.\n',
    'Analyze the impact of the COVID-19 pandemic on global economic structures '
    'and future business models.\n',
    'Explain the cultural significance of the Mona Lisa painting, and how its '
    'perception might vary in Western versus Eastern societies.\n',
    "Translate the following English sentence into Japanese, French, and "
    "Swahili: 'The early bird catches the worm.'\n"
]
# These ground truth generations were generated using `transformers==4.38.1
# aqlm==1.1.0 torch==2.2.0`
# and the below code:
# ```python
# from transformers import AutoTokenizer, AutoModelForCausalLM
# model_id = "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"
# quantized_model = AutoModelForCausalLM.from_pretrained(model_id,
#     torch_dtype="auto", device_map="cuda").cuda()
# tokenizer = AutoTokenizer.from_pretrained(model_id)
# outputs = []
# for prompt in example_prompts:
#     input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to("cuda")
#     hf_outputs = quantized_model.generate(input_ids, max_new_tokens=32)
#     outputs.append(tokenizer.decode(hf_outputs[0][input_ids.shape[1]:]))
# print(outputs)
# ```
# The test compares entry i of this list with the output for prompt i.
ground_truth_generations = [
    '\n### Features\n\n- **High-throughput**: v',
    'The major milestones in the development of artificial intelligence from '
    '195',
    'Compare and contrast artificial intelligence with human intelligence in '
    'terms of processing information. The',
    'Explain the difference between supervised and unsupervised learning.'
    '\nExplain',
    'Write a short story about a robot that dreams for the first time. The',
    'Analyze the impact of the COVID-19 pandemic on global economic',
    'The Mona Lisa is a painting by Leonardo da Vinci, and it',
    'The early bird catches the worm.\nThe early bird catches the'
]
@pytest.mark.skipif(aqlm_not_supported,
                    reason="AQLM is not supported on this GPU type.")
@pytest.mark.parametrize("model", ["ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [16])
@pytest.mark.parametrize("num_logprobs", [1])
def test_models(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Compare greedy vLLM generations with the hardcoded reference strings.

    Each prompt's generated text must equal the entry at the same index in
    ``ground_truth_generations``.
    """
    runner = vllm_runner(model, dtype=dtype)
    generations = runner.generate_greedy_logprobs(example_prompts, max_tokens,
                                                  num_logprobs)

    # loop through the prompts to compare against the ground truth generations
    for prompt_idx, prompt in enumerate(example_prompts):
        # Each result unpacks to (token ids, text, logprobs); only the text
        # is checked.
        _, generated_text, _ = generations[prompt_idx]
        reference = ground_truth_generations[prompt_idx]

        print("Prompt: ", repr(prompt))
        print("Reference output:", repr(reference))
        print("Output output: ", repr(generated_text))

        assert generated_text == reference
tests/models/test_marlin.py
View file @
99b471c2
...
@@ -16,13 +16,12 @@ from dataclasses import dataclass
...
@@ -16,13 +16,12 @@ from dataclasses import dataclass
import
pytest
import
pytest
import
torch
import
torch
from
vllm.model_executor.layers.quantization
import
(
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
_QUANTIZATION_CONFIG_REGISTRY
)
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
marlin_not_supported
=
(
marlin_not_supported
=
(
capability
<
capability
<
_QUANTIZATION_CONFIG_REGISTRY
[
"marlin"
].
get_min_capability
())
QUANTIZATION_METHODS
[
"marlin"
].
get_min_capability
())
@
dataclass
@
dataclass
...
@@ -47,7 +46,7 @@ model_pairs = [
...
@@ -47,7 +46,7 @@ model_pairs = [
@
pytest
.
mark
.
parametrize
(
"model_pair"
,
model_pairs
)
@
pytest
.
mark
.
parametrize
(
"model_pair"
,
model_pairs
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
32
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
32
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
3
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
def
test_models
(
def
test_models
(
vllm_runner
,
vllm_runner
,
example_prompts
,
example_prompts
,
...
...
tests/models/test_models.py
View file @
99b471c2
...
@@ -12,7 +12,7 @@ MODELS = [
...
@@ -12,7 +12,7 @@ MODELS = [
"gpt2"
,
"gpt2"
,
"bigcode/tiny_starcoder_py"
,
"bigcode/tiny_starcoder_py"
,
"EleutherAI/pythia-70m"
,
"EleutherAI/pythia-70m"
,
"bigscience/bloom-560m"
,
"bigscience/bloom-560m"
,
# Testing alibi slopes.
"microsoft/phi-2"
,
"microsoft/phi-2"
,
"stabilityai/stablelm-3b-4e1t"
,
"stabilityai/stablelm-3b-4e1t"
,
# "allenai/OLMo-1B", # Broken
# "allenai/OLMo-1B", # Broken
...
...
tests/models/test_oot_registration.py
0 → 100644
View file @
99b471c2
import
torch
from
vllm
import
LLM
,
ModelRegistry
,
SamplingParams
from
vllm.model_executor.models.opt
import
OPTForCausalLM
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
class MyOPTForCausalLM(OPTForCausalLM):
    """OPT subclass whose logits are overwritten so column 0 always wins."""

    def compute_logits(self, hidden_states: torch.Tensor,
                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
        """Return the parent's logits tensor, zeroed except for column 0."""
        # this dummy model always predicts the first token
        scores = super().compute_logits(hidden_states, sampling_metadata)
        # Wipe the real scores in place, then raise column 0 above the rest.
        scores.zero_()
        scores[:, 0] += 1.0
        return scores
def test_oot_registration():
    """Register the dummy OPT class and check the engine actually uses it.

    With the dummy model every generated token should be token id 0, so each
    completion must consist only of repetitions of that token's text.
    """
    # register our dummy model
    ModelRegistry.register_model("OPTForCausalLM", MyOPTForCausalLM)

    llm = LLM(model="facebook/opt-125m")
    first_token = llm.get_tokenizer().decode(0)

    results = llm.generate(
        ["Hello, my name is", "The text does not matter"],
        SamplingParams(temperature=0),
    )
    for result in results:
        completion = result.outputs[0].text
        # make sure only the first token is generated
        assert completion.replace(first_token, "") == ""
tests/quantization/test_autogptq_marlin_configs.py
0 → 100644
View file @
99b471c2
"""Tests whether Marlin models can be loaded from the autogptq config.
Run `pytest tests/quantization/test_autogptq_marlin_configs.py --forked`.
"""
from
dataclasses
import
dataclass
import
pytest
from
vllm.config
import
ModelConfig
@dataclass
class ModelPair:
    """Two model identifiers: a Marlin checkpoint and a GPTQ checkpoint."""
    # Identifier of the Marlin-format model.
    model_marlin: str
    # Identifier of the GPTQ-format model.
    model_gptq: str
# Model Id // Expected Kernel
# Each entry is a (model id, expected quantization) pair; the test below
# unpacks it and compares the second item with ModelConfig.quantization.
MODELS_QUANT_TYPE = [
    # compat: autogptq <=0.7.1 is_marlin_format: bool
    ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "marlin"),
    ("TheBloke/Llama-2-7B-Chat-GPTQ", "gptq"),
    # compat: autogptq >=0.8.0 use checkpoint_format: str
    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "marlin"),
    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "gptq")
]
@pytest.mark.parametrize("model_quant_type", MODELS_QUANT_TYPE)
def test_auto_gptq(model_quant_type: tuple) -> None:
    """Check which quantization ``ModelConfig`` settles on for a checkpoint.

    The config is built twice, once with ``quantization=None`` and once with
    ``quantization="gptq"``; both must report the expected quantization.

    Args:
        model_quant_type: A ``(model_id, expected_quantization)`` pair from
            ``MODELS_QUANT_TYPE``. (It was previously annotated as ``str``,
            which did not match the tuple that is unpacked here.)
    """
    model_path, quant_type = model_quant_type

    def build_config(quantization_arg):
        """Build a ModelConfig that differs only in its quantization arg."""
        return ModelConfig(model_path,
                           model_path,
                           tokenizer_mode="auto",
                           trust_remote_code=False,
                           seed=0,
                           dtype="float16",
                           revision=None,
                           quantization=quantization_arg)

    # Both configs are built before either assertion runs.
    model_config_no_quant_arg = build_config(None)  # case 1
    model_config_quant_arg = build_config("gptq")  # case 2

    assert model_config_no_quant_arg.quantization == quant_type, (
        f"Expected quant_type == {quant_type} for {model_path}, "
        f"but found {model_config_no_quant_arg.quantization} "
        "for no --quantization None case")

    assert model_config_quant_arg.quantization == quant_type, (
        f"Expected quant_type == {quant_type} for {model_path}, "
        f"but found {model_config_quant_arg.quantization} "
        "for --quantization gptq case")
tests/quantization/test_fp8.py
0 → 100644
View file @
99b471c2
"""Tests whether FP8 computation is enabled correctly.
Run `pytest tests/quantization/test_fp8.py --forked`.
"""
import
pytest
import
torch
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
from
vllm.model_executor.layers.quantization.fp8
import
Fp8LinearMethod
# Fold the (major, minor) pair returned by torch into one integer
# (major * 10 + minor); the skip condition below compares it with the
# minimum capability reported for "fp8".
capability = torch.cuda.get_device_capability()
capability = capability[0] * 10 + capability[1]
@pytest.mark.skipif(
    capability < QUANTIZATION_METHODS["fp8"].get_min_capability(),
    reason="FP8 is not supported on this GPU type.")
def test_load_fp16_model(vllm_runner) -> None:
    """Load opt-125m with ``quantization="fp8"`` and inspect one layer.

    The first decoder layer's ``fc1`` must use ``Fp8LinearMethod`` and hold
    its weight as ``torch.float8_e4m3fn``.
    """
    runner = vllm_runner("facebook/opt-125m", quantization="fp8")

    # Walk from the runner down to the loaded model object.
    engine = runner.model.llm_engine
    model_runner = engine.model_executor.driver_worker.model_runner
    loaded_model = model_runner.model

    first_fc1 = loaded_model.model.decoder.layers[0].fc1
    assert isinstance(first_fc1.linear_method, Fp8LinearMethod)
    assert first_fc1.weight.dtype == torch.float8_e4m3fn
tests/samplers/test_logits_processor.py
0 → 100644
View file @
99b471c2
import
pytest
import
torch
from
vllm
import
SamplingParams
# Models the logits-processor test is parametrized over.
MODELS = ["facebook/opt-125m"]
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_logits_processor_force_generate(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
) -> None:
    """Check a logits processor can force a fixed completion.

    Three requests are queued: one with the processor and prompt logprobs,
    one with prompt logprobs only, one with neither. Only the first
    request's text is asserted, and it must be the enforced answer repeated
    ``repeat_times`` times.
    """
    vllm_model = vllm_runner(model, dtype=dtype)
    engine = vllm_model.model
    tokenizer = engine.get_tokenizer()

    repeat_times = 2
    enforced_answers = " vLLM"
    vllm_token_ids = tokenizer.encode(enforced_answers,
                                      add_special_tokens=False)
    max_tokens = len(vllm_token_ids) * repeat_times

    def pick_vllm(token_ids, logits):
        # Cycle through the enforced token ids using the count of tokens
        # passed in, and give the chosen id the largest finite logit.
        position = len(token_ids) % len(vllm_token_ids)
        logits[vllm_token_ids[position]] = torch.finfo(logits.dtype).max
        return logits

    params_with_logprobs = SamplingParams(
        logits_processors=[pick_vllm],
        prompt_logprobs=3,
        max_tokens=max_tokens,
    )

    # One sampling-params object per example prompt, queued in this order.
    queued_params = (
        # test logits_processors when prompt_logprobs is not None
        params_with_logprobs,
        # test prompt_logprobs is not None
        SamplingParams(
            prompt_logprobs=3,
            max_tokens=max_tokens,
        ),
        # test grouped requests
        SamplingParams(max_tokens=max_tokens),
    )
    for prompt_idx, params in enumerate(queued_params):
        engine._add_request(
            prompt=example_prompts[prompt_idx],
            sampling_params=params,
            prompt_token_ids=None,
        )

    outputs = engine._run_engine(False)

    assert outputs[0].outputs[0].text == enforced_answers * repeat_times
tests/samplers/test_rejection_sampler.py
View file @
99b471c2
...
@@ -91,12 +91,16 @@ def test_correct_output_format(which_tokens_accepted: str, seed: int,
...
@@ -91,12 +91,16 @@ def test_correct_output_format(which_tokens_accepted: str, seed: int,
bonus_token_ids
,
bonus_token_ids
,
)
)
# Bonus tokens are currently disabled. Verify they're set to -1.
# See https://github.com/vllm-project/vllm/issues/4212
expected_bonus_token_ids
=
bonus_token_ids
.
clone
()
*
0
-
1
if
which_tokens_accepted
==
"all_tokens_accepted"
:
if
which_tokens_accepted
==
"all_tokens_accepted"
:
# Expect all tokens to be equal to draft tokens.
# Expect all tokens to be equal to draft tokens.
assert
torch
.
equal
(
output_token_ids
[:,
:
-
1
],
draft_token_ids
)
assert
torch
.
equal
(
output_token_ids
[:,
:
-
1
],
draft_token_ids
)
# Expect all bonus tokens to be included.
# Expect all bonus tokens to be included.
assert
torch
.
equal
(
output_token_ids
[:,
-
1
:],
bonus_token_ids
)
assert
torch
.
equal
(
output_token_ids
[:,
-
1
:],
expected_
bonus_token_ids
)
elif
which_tokens_accepted
==
"no_tokens_accepted"
:
elif
which_tokens_accepted
==
"no_tokens_accepted"
:
# Expect first token to be equal to recovered tokens.
# Expect first token to be equal to recovered tokens.
assert
torch
.
equal
(
output_token_ids
[:,
0
],
recovered_token_ids
[:,
0
])
assert
torch
.
equal
(
output_token_ids
[:,
0
],
recovered_token_ids
[:,
0
])
...
@@ -106,7 +110,7 @@ def test_correct_output_format(which_tokens_accepted: str, seed: int,
...
@@ -106,7 +110,7 @@ def test_correct_output_format(which_tokens_accepted: str, seed: int,
torch
.
ones_like
(
output_token_ids
[:,
1
:])
*
-
1
)
torch
.
ones_like
(
output_token_ids
[:,
1
:])
*
-
1
)
elif
which_tokens_accepted
==
"some_tokens_accepted"
:
elif
which_tokens_accepted
==
"some_tokens_accepted"
:
recovered_plus_bonus
=
torch
.
cat
(
recovered_plus_bonus
=
torch
.
cat
(
(
recovered_token_ids
,
bonus_token_ids
),
dim
=-
1
)
(
recovered_token_ids
,
expected_
bonus_token_ids
),
dim
=-
1
)
# Assert first rejected token is a recovered token or bonus token.
# Assert first rejected token is a recovered token or bonus token.
assert
torch
.
equal
(
assert
torch
.
equal
(
recovered_plus_bonus
[
torch
.
arange
(
0
,
batch_size
),
recovered_plus_bonus
[
torch
.
arange
(
0
,
batch_size
),
...
...
tests/samplers/test_sampler.py
View file @
99b471c2
import
itertools
import
random
import
random
from
typing
import
List
,
Optional
,
Tuple
from
typing
import
List
,
Optional
,
Tuple
from
unittest.mock
import
patch
from
unittest.mock
import
patch
...
@@ -31,7 +32,12 @@ def _prepare_test(
...
@@ -31,7 +32,12 @@ def _prepare_test(
1e-2
,
1e-2
,
dtype
=
input_tensor
.
dtype
)
dtype
=
input_tensor
.
dtype
)
sampler
=
MockLogitsSampler
(
fake_logits
)
sampler
=
MockLogitsSampler
(
fake_logits
)
model_runner
=
ModelRunner
(
None
,
None
,
None
,
None
,
None
)
model_runner
=
ModelRunner
(
model_config
=
None
,
parallel_config
=
None
,
scheduler_config
=
None
,
device_config
=
None
,
load_config
=
None
,
lora_config
=
None
)
return
input_tensor
,
fake_logits
,
sampler
,
model_runner
return
input_tensor
,
fake_logits
,
sampler
,
model_runner
...
@@ -194,11 +200,15 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):
...
@@ -194,11 +200,15 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):
def
create_sampling_params
(
min_tokens
,
def
create_sampling_params
(
min_tokens
,
eos_token_id
=
0
,
eos_token_id
=
0
,
stop_token_ids
=
None
):
*
,
stop_token_ids
:
Optional
[
List
[
str
]]
=
None
,
prompt_logprobs
:
Optional
[
int
]
=
None
):
sampling_params
=
SamplingParams
(
sampling_params
=
SamplingParams
(
min_tokens
=
min_tokens
,
min_tokens
=
min_tokens
,
max_tokens
=
9999
,
# keep higher than max of min_tokens
max_tokens
=
9999
,
# keep higher than max of min_tokens
stop_token_ids
=
stop_token_ids
,
stop_token_ids
=
stop_token_ids
,
# requesting prompt_logprobs changes the structure of `logits`
prompt_logprobs
=
prompt_logprobs
,
)
)
sampling_params
.
eos_token_id
=
eos_token_id
sampling_params
.
eos_token_id
=
eos_token_id
return
sampling_params
return
sampling_params
...
@@ -217,9 +227,9 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):
...
@@ -217,9 +227,9 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):
expected_penalization
=
[]
expected_penalization
=
[]
sequence_metadata_list
=
[]
sequence_metadata_list
=
[]
# 20% chance to generate seq group metadata list with all prompts
is_prompt
=
random
.
random
()
<
0.2
while
batch_size
>
0
:
while
batch_size
>
0
:
# 20% chance to generate prompt seq group with single sequence
is_prompt
=
random
.
random
()
<
0.2
num_seqs
=
1
if
is_prompt
else
random
.
randint
(
1
,
batch_size
)
num_seqs
=
1
if
is_prompt
else
random
.
randint
(
1
,
batch_size
)
eos_token_id
=
random
.
randint
(
0
,
VOCAB_SIZE
-
1
)
eos_token_id
=
random
.
randint
(
0
,
VOCAB_SIZE
-
1
)
...
@@ -240,7 +250,7 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):
...
@@ -240,7 +250,7 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):
seq_group_penalization
=
[]
seq_group_penalization
=
[]
for
_
in
range
(
num_seqs
):
for
_
in
range
(
num_seqs
):
num_input
=
random
.
randint
(
1
,
100
)
num_input
=
random
.
randint
(
1
,
100
)
num_generated
=
random
.
randint
(
1
,
100
)
if
not
is_prompt
else
0
num_generated
=
0
if
is_prompt
else
random
.
randint
(
1
,
100
)
seq_data
[
next
(
seq_id_counter
)]
=
create_sequence_data
(
seq_data
[
next
(
seq_id_counter
)]
=
create_sequence_data
(
num_input
=
num_input
,
num_generated
=
num_generated
)
num_input
=
num_input
,
num_generated
=
num_generated
)
seq_group_penalization
.
append
(
num_generated
<
min_tokens
)
seq_group_penalization
.
append
(
num_generated
<
min_tokens
)
...
@@ -292,6 +302,21 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):
...
@@ -292,6 +302,21 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):
]
]
}
}
prompt_with_penalization_and_prompt_logprobs
=
{
"expected_penalization"
:
[
False
,
False
,
True
],
"seq_group_metadata_list"
:
[
SequenceGroupMetadata
(
request_id
=
"test_1"
,
is_prompt
=
True
,
seq_data
=
{
next
(
seq_id_counter
):
create_sequence_data
(
num_input
=
3
),
},
sampling_params
=
create_sampling_params
(
1
,
prompt_logprobs
=
3
),
block_tables
=
{},
),
]
}
stop_penalizing_after_min_tokens
=
{
stop_penalizing_after_min_tokens
=
{
"expected_penalization"
:
[
False
],
"expected_penalization"
:
[
False
],
"seq_group_metadata_list"
:
[
"seq_group_metadata_list"
:
[
...
@@ -309,8 +334,34 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):
...
@@ -309,8 +334,34 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):
}
}
stop_token_ids
=
[
42
,
99
,
42
,
0
]
# intentional duplication
stop_token_ids
=
[
42
,
99
,
42
,
0
]
# intentional duplication
simple_combination
=
{
prompt_combination
=
{
"expected_penalization"
:
[
True
,
False
,
False
],
"expected_penalization"
:
[
False
,
True
,
False
],
"seq_group_metadata_list"
:
[
SequenceGroupMetadata
(
request_id
=
"test_2"
,
is_prompt
=
True
,
seq_data
=
{
next
(
seq_id_counter
):
create_sequence_data
(
num_input
=
2
),
},
sampling_params
=
create_sampling_params
(
1
,
prompt_logprobs
=
3
),
block_tables
=
{},
),
SequenceGroupMetadata
(
request_id
=
"test_3"
,
is_prompt
=
True
,
seq_data
=
{
next
(
seq_id_counter
):
create_sequence_data
(),
},
sampling_params
=
create_sampling_params
(
0
,
stop_token_ids
=
stop_token_ids
),
block_tables
=
{},
)
]
}
stop_token_ids
=
[
1
,
999
,
37
,
37
]
# intentional duplication
decode_combination
=
{
"expected_penalization"
:
[
True
,
False
,
False
,
True
,
False
],
"seq_group_metadata_list"
:
[
"seq_group_metadata_list"
:
[
SequenceGroupMetadata
(
SequenceGroupMetadata
(
request_id
=
"test_1"
,
request_id
=
"test_1"
,
...
@@ -327,14 +378,19 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):
...
@@ -327,14 +378,19 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):
),
),
SequenceGroupMetadata
(
SequenceGroupMetadata
(
request_id
=
"test_2"
,
request_id
=
"test_2"
,
is_prompt
=
Tru
e
,
is_prompt
=
Fals
e
,
seq_data
=
{
seq_data
=
{
next
(
seq_id_counter
):
create_sequence_data
(),
next
(
seq_id_counter
):
create_sequence_data
(
num_generated
=
20
),
next
(
seq_id_counter
):
create_sequence_data
(
num_generated
=
1
),
next
(
seq_id_counter
):
create_sequence_data
(
num_generated
=
10
),
},
},
sampling_params
=
create_sampling_params
(
sampling_params
=
create_sampling_params
(
0
,
stop_token_ids
=
stop_token_ids
),
10
,
prompt_logprobs
=
5
,
stop_token_ids
=
stop_token_ids
),
block_tables
=
{},
block_tables
=
{},
)
)
,
]
]
}
}
...
@@ -342,8 +398,10 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):
...
@@ -342,8 +398,10 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):
test_cases
=
[
test_cases
=
[
prompt_without_penalization
,
prompt_without_penalization
,
prompt_with_penalization
,
prompt_with_penalization
,
prompt_with_penalization_and_prompt_logprobs
,
stop_penalizing_after_min_tokens
,
stop_penalizing_after_min_tokens
,
simple_combination
,
prompt_combination
,
decode_combination
,
]
]
else
:
else
:
test_cases
=
[
generate_test_case
()]
test_cases
=
[
generate_test_case
()]
...
@@ -351,30 +409,49 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):
...
@@ -351,30 +409,49 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):
def
run_test_case
(
*
,
def
run_test_case
(
*
,
expected_penalization
=
None
,
expected_penalization
=
None
,
seq_group_metadata_list
=
None
):
seq_group_metadata_list
=
None
):
assert
expected_penalization
,
"Invalid test case"
assert
expected_penalization
,
\
assert
seq_group_metadata_list
,
"Invalid test case"
"Invalid test case, need expected_penalization"
assert
seq_group_metadata_list
,
\
"Invalid test case, need seq_group_metadata_list"
batch_size
=
0
batch_size
=
0
prompt_lens
=
[]
prompt_lens
=
[]
sampling_params_per_
seq
=
[]
sampling_params_per_
row
=
[]
for
sgm
in
seq_group_metadata_list
:
for
sgm
in
seq_group_metadata_list
:
num_seqs
=
len
(
sgm
.
seq_data
)
batch_size
+=
num_seqs
sampling_params
=
sgm
.
sampling_params
sampling_params
=
sgm
.
sampling_params
for
seq_id
in
sgm
.
seq_data
:
prompt_lens
.
append
(
sgm
.
seq_data
[
seq_id
].
get_prompt_len
())
num_rows
=
len
(
sgm
.
seq_data
)
sampling_params_per_seq
.
append
(
sampling_params
)
if
sgm
.
is_prompt
:
# a prompt seq_group has only one sequence
seq_data
=
next
(
iter
(
sgm
.
seq_data
.
values
()))
prompt_len
=
seq_data
.
get_prompt_len
()
prompt_lens
.
append
(
prompt_len
)
if
sgm
.
sampling_params
.
prompt_logprobs
:
# with prompt_logprobs each token in the prompt has a row in
# logits
num_rows
=
prompt_len
batch_size
+=
num_rows
sampling_params_per_row
.
extend
(
itertools
.
repeat
(
sampling_params
,
num_rows
))
assert
len
(
expected_penalization
)
==
batch_size
,
\
(
"Invalid test case, expected_penalization does not match computed"
"batch size"
)
_
,
fake_logits
,
sampler
,
model_runner
=
_prepare_test
(
batch_size
)
_
,
fake_logits
,
sampler
,
model_runner
=
_prepare_test
(
batch_size
)
sampling_metadata
=
model_runner
.
_prepare_sample
(
sampling_metadata
=
model_runner
.
_prepare_sample
(
seq_group_metadata_list
,
seq_group_metadata_list
,
prompt_lens
=
prompt_lens
,
prompt_lens
=
prompt_lens
if
prompt_lens
else
None
,
subquery_lens
=
prompt_lens
)
subquery_lens
=
prompt_lens
if
prompt_lens
else
None
)
# the logits tensor is modified in-place by the sampler
# the logits tensor is modified in-place by the sampler
_
=
sampler
(
logits
=
fake_logits
,
sampling_metadata
=
sampling_metadata
)
_
=
sampler
(
logits
=
fake_logits
,
sampling_metadata
=
sampling_metadata
)
for
logits_idx
,
(
should_penalize
,
sampling_params
)
in
enumerate
(
for
logits_idx
,
(
should_penalize
,
sampling_params
)
in
enumerate
(
zip
(
expected_penalization
,
sampling_params_per_
seq
)):
zip
(
expected_penalization
,
sampling_params_per_
row
)):
tokens_to_check
=
[
sampling_params
.
eos_token_id
]
tokens_to_check
=
[
sampling_params
.
eos_token_id
]
if
sampling_params
.
stop_token_ids
:
if
sampling_params
.
stop_token_ids
:
...
@@ -519,7 +596,12 @@ def test_sampler_top_k_top_p(seed: int, device: str):
...
@@ -519,7 +596,12 @@ def test_sampler_top_k_top_p(seed: int, device: str):
device
=
input_tensor
.
device
,
device
=
input_tensor
.
device
,
dtype
=
input_tensor
.
dtype
)
dtype
=
input_tensor
.
dtype
)
sampler
=
MockLogitsSampler
(
fake_logits
)
sampler
=
MockLogitsSampler
(
fake_logits
)
model_runner
=
ModelRunner
(
None
,
None
,
None
,
None
,
None
)
model_runner
=
ModelRunner
(
model_config
=
None
,
parallel_config
=
None
,
scheduler_config
=
None
,
device_config
=
None
,
load_config
=
None
,
lora_config
=
None
)
generation_model
=
GenerationMixin
()
generation_model
=
GenerationMixin
()
generation_config
=
GenerationConfig
(
top_k
=
top_k
,
generation_config
=
GenerationConfig
(
top_k
=
top_k
,
...
@@ -554,7 +636,8 @@ def test_sampler_top_k_top_p(seed: int, device: str):
...
@@ -554,7 +636,8 @@ def test_sampler_top_k_top_p(seed: int, device: str):
def
mock_sample
(
probs
,
*
args
,
**
kwargs
):
def
mock_sample
(
probs
,
*
args
,
**
kwargs
):
nonlocal
sample_probs
nonlocal
sample_probs
sample_probs
=
probs
sample_probs
=
probs
return
[[
prob
.
topk
(
1
,
dim
=-
1
).
indices
.
tolist
(),
[
0
]]
for
prob
in
probs
]
return
([[
prob
.
topk
(
1
,
dim
=-
1
).
indices
.
tolist
(),
[
0
]]
for
prob
in
probs
],
None
)
with
patch
(
"vllm.model_executor.layers.sampler._sample"
,
mock_sample
):
with
patch
(
"vllm.model_executor.layers.sampler._sample"
,
mock_sample
):
sampler
(
logits
=
fake_logits
,
sampling_metadata
=
sampling_metadata
)
sampler
(
logits
=
fake_logits
,
sampling_metadata
=
sampling_metadata
)
...
...
vllm/model_executor/parallel_utils
/__init__.py
→
tests/spec_decode/e2e
/__init__.py
View file @
99b471c2
File moved
tests/spec_decode/e2e/conftest.py
0 → 100644
View file @
99b471c2
from
typing
import
List
,
Tuple
import
pytest
from
tests.conftest
import
cleanup
from
vllm
import
LLM
from
vllm.model_executor.utils
import
set_random_seed
@pytest.fixture
def baseline_llm_generator(request, common_llm_kwargs,
                           per_test_common_llm_kwargs, baseline_llm_kwargs,
                           seed):
    """Fixture returning the generator factory for the "baseline" LLM."""
    return create_llm_generator(
        baseline_or_test="baseline",
        request=request,
        common_llm_kwargs=common_llm_kwargs,
        per_test_common_llm_kwargs=per_test_common_llm_kwargs,
        distinct_llm_kwargs=baseline_llm_kwargs,
        seed=seed,
    )
@pytest.fixture
def test_llm_generator(request, common_llm_kwargs, per_test_common_llm_kwargs,
                       test_llm_kwargs, seed):
    """Fixture returning the generator factory for the "test" LLM."""
    return create_llm_generator(
        baseline_or_test="test",
        request=request,
        common_llm_kwargs=common_llm_kwargs,
        per_test_common_llm_kwargs=per_test_common_llm_kwargs,
        distinct_llm_kwargs=test_llm_kwargs,
        seed=seed,
    )
def create_llm_generator(baseline_or_test, request, common_llm_kwargs,
                         per_test_common_llm_kwargs, distinct_llm_kwargs,
                         seed):
    """Return a zero-argument callable that lazily builds and yields an LLM.

    The three kwargs mappings are merged with later ones overriding earlier
    ones. No engine is created until the returned callable's generator is
    iterated.

    Args:
        baseline_or_test: Label printed when the engine is created.
        request: Object whose ``node.name`` is printed as the test name.
        common_llm_kwargs: Lowest-priority engine kwargs.
        per_test_common_llm_kwargs: Kwargs overriding the common ones.
        distinct_llm_kwargs: Highest-priority engine kwargs.
        seed: Value passed to ``set_random_seed`` after engine creation.
    """
    # The local names `kwargs` and `test_name` appear verbatim in the printed
    # message through the `{name=}` f-string form.
    kwargs = dict(common_llm_kwargs)
    kwargs.update(per_test_common_llm_kwargs)
    kwargs.update(distinct_llm_kwargs)
    test_name = request.node.name

    def generator_inner():
        print(f'Creating {baseline_or_test=} LLM for {test_name=}. {kwargs=}')
        llm = LLM(**kwargs)

        set_random_seed(seed)

        yield llm
        # Runs when the consumer resumes the generator: drop the engine
        # reference, then call the shared cleanup helper.
        del llm
        cleanup()

    def generator_outer():
        for engine in generator_inner():
            yield engine
            # Drop this frame's reference before the inner generator resumes.
            del engine

    return generator_outer
def get_output_from_llm_generator(
        llm_generator, prompts,
        sampling_params) -> Tuple[List[str], List[List[int]]]:
    """Run ``generate`` on every LLM yielded by ``llm_generator``.

    Args:
        llm_generator: Zero-argument callable returning an iterable of LLMs.
        prompts: Prompts forwarded to ``llm.generate``.
        sampling_params: Sampling parameters forwarded to ``llm.generate``.

    Returns:
        ``(tokens, token_ids)`` taken from the first completion of each
        output of the last LLM yielded; two empty lists if none is yielded.
    """
    tokens = []
    token_ids = []
    for llm in llm_generator():
        results = llm.generate(prompts, sampling_params, use_tqdm=True)
        first_completions = [result.outputs[0] for result in results]
        token_ids = [completion.token_ids for completion in first_completions]
        tokens = [completion.text for completion in first_completions]
        # Drop the local reference before the generator is resumed.
        del llm

    return tokens, token_ids
Prev
1
…
4
5
6
7
8
9
10
11
12
…
17
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment