Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
66b809cc
Commit
66b809cc
authored
Feb 08, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.7.2' into v0.7.2-dev
parents
37b63c24
0408efc6
Changes
1000
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
156 additions
and
0 deletions
+156
-0
tests/lora/test_punica_ops_sizes.py
tests/lora/test_punica_ops_sizes.py
+1
-0
tests/lora/test_punica_ops_variation.py
tests/lora/test_punica_ops_variation.py
+1
-0
tests/lora/test_quant_model.py
tests/lora/test_quant_model.py
+2
-0
tests/lora/test_qwen2vl.py
tests/lora/test_qwen2vl.py
+2
-0
tests/lora/test_tokenizer_group.py
tests/lora/test_tokenizer_group.py
+2
-0
tests/lora/test_ultravox.py
tests/lora/test_ultravox.py
+123
-0
tests/lora/test_utils.py
tests/lora/test_utils.py
+2
-0
tests/lora/test_worker.py
tests/lora/test_worker.py
+2
-0
tests/lora/utils.py
tests/lora/utils.py
+2
-0
tests/metrics/test_metrics.py
tests/metrics/test_metrics.py
+2
-0
tests/model_executor/conftest.py
tests/model_executor/conftest.py
+2
-0
tests/model_executor/test_enabled_custom_ops.py
tests/model_executor/test_enabled_custom_ops.py
+2
-0
tests/model_executor/test_guided_processors.py
tests/model_executor/test_guided_processors.py
+2
-0
tests/model_executor/test_model_load_with_params.py
tests/model_executor/test_model_load_with_params.py
+2
-0
tests/model_executor/weight_utils.py
tests/model_executor/weight_utils.py
+2
-0
tests/models/decoder_only/audio_language/test_ultravox.py
tests/models/decoder_only/audio_language/test_ultravox.py
+2
-0
tests/models/decoder_only/language/test_aqlm.py
tests/models/decoder_only/language/test_aqlm.py
+1
-0
tests/models/decoder_only/language/test_fp8.py
tests/models/decoder_only/language/test_fp8.py
+2
-0
tests/models/decoder_only/language/test_gguf.py
tests/models/decoder_only/language/test_gguf.py
+1
-0
tests/models/decoder_only/language/test_gptq_marlin.py
tests/models/decoder_only/language/test_gptq_marlin.py
+1
-0
No files found.
Too many changes to show.
To preserve performance only
1000 of 1000+
files are displayed.
Plain diff
Email patch
tests/lora/test_punica_ops_sizes.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
"""
"""
This script is mainly used to test various hidden_sizes. We have collected the
This script is mainly used to test various hidden_sizes. We have collected the
hidden_sizes included in the LoRA models currently supported by vLLM. It tests
hidden_sizes included in the LoRA models currently supported by vLLM. It tests
...
...
tests/lora/test_punica_ops_variation.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
"""
"""
This script is mainly used to test whether Triton kernels can run normally
This script is mainly used to test whether Triton kernels can run normally
under different conditions, including various batches, numbers of LoRA, and
under different conditions, including various batches, numbers of LoRA, and
...
...
tests/lora/test_quant_model.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
# Adapted from
# Adapted from
# https://github.com/fmmoret/vllm/blob/fm-support-lora-on-quantized-models/tests/lora/test_llama.py
# https://github.com/fmmoret/vllm/blob/fm-support-lora-on-quantized-models/tests/lora/test_llama.py
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
...
...
tests/lora/test_qwen2vl.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
typing
import
List
from
typing
import
List
import
pytest
import
pytest
...
...
tests/lora/test_tokenizer_group.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
pytest
import
pytest
from
transformers
import
AutoTokenizer
,
PreTrainedTokenizerBase
from
transformers
import
AutoTokenizer
,
PreTrainedTokenizerBase
...
...
tests/lora/test_ultravox.py
0 → 100644
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
shutil
from
os
import
path
from
tempfile
import
TemporaryDirectory
from
typing
import
List
,
Tuple
import
torch
from
huggingface_hub
import
snapshot_download
from
safetensors.torch
import
load_file
,
save_file
from
transformers
import
AutoTokenizer
from
vllm.lora.request
import
LoRARequest
from
..models.utils
import
check_outputs_equal
# Model identifiers handed to the vLLM runner and to the tokenizer loader.
ULTRAVOX_MODEL_NAME = "fixie-ai/ultravox-v0_3"
LLMA_MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"

# Token repeated once per audio clip at the front of the user message.
VLLM_PLACEHOLDER = "<|reserved_special_token_0|>"

# Question sent to both models so their outputs can be compared.
PROMPT = "Tell me about a Fool's mate move in 20 words. Provide the moves!"
def llama3_1_8b_chess_lora_path():
    """Return the snapshot location of the Llama 3.1 8B chess LoRA adapter.

    The value is whatever ``snapshot_download`` returns for the
    ``mkopecki/chess-lora-adapter-llama-3.1-8b`` repository.
    """
    chess_lora_repo = "mkopecki/chess-lora-adapter-llama-3.1-8b"
    return snapshot_download(repo_id=chess_lora_repo)
# A Llama LoRA adapter cannot be used with Ultravox as-is: Ultravox nests the
# language model, so the adapter's module names have to be rewritten first.
def transform_module_names_for_ultravox(state_dict):
    """Return a copy of ``state_dict`` with its keys moved under ``language_model``.

    Every occurrence of ``"base_model.model"`` in a key is replaced with
    ``"base_model.model.language_model"``. Values are carried over untouched
    and the input mapping itself is not modified.
    """
    return {
        name.replace("base_model.model", "base_model.model.language_model"):
        tensor
        for name, tensor in state_dict.items()
    }
def mk_llama3_1_8b_ultravox_chess_lora(source_repo, target_path):
    """Write an Ultravox-compatible copy of a Llama LoRA adapter.

    Loads ``adapter_model.safetensors`` from ``source_repo``, renames its keys
    with ``transform_module_names_for_ultravox`` and saves the result under
    the same file name in ``target_path``. ``adapter_config.json`` is copied
    across unchanged.

    Returns:
        ``target_path``, exactly as it was passed in.
    """
    tensor_file = "adapter_model.safetensors"
    config_file = "adapter_config.json"

    # Weights: load, rename the keys, then write to the target directory.
    source_tensors = load_file(path.join(source_repo, tensor_file))
    renamed_tensors = transform_module_names_for_ultravox(source_tensors)
    save_file(renamed_tensors, path.join(target_path, tensor_file))

    # Config: copied verbatim.
    shutil.copyfile(path.join(source_repo, config_file),
                    path.join(target_path, config_file))
    return target_path
def _get_prompt(audio_count, question, placeholder, model_name) -> str:
    """Render a single-turn user prompt with the chat template of ``model_name``.

    The user message is ``placeholder`` followed by a newline, repeated
    ``audio_count`` times, and then ``question``. The template is applied
    without tokenizing and with the generation prompt appended.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    audio_prefix = f"{placeholder}\n" * audio_count
    messages = [{'role': 'user', 'content': f"{audio_prefix}{question}"}]

    return tokenizer.apply_chat_template(messages,
                                         tokenize=False,
                                         add_generation_prompt=True)
def test_ultravox_lora(vllm_runner):
    """
    Check that Ultravox with a re-keyed Llama LoRA matches Llama with the
    original LoRA under greedy decoding.

    TODO: Train an Ultravox LoRA instead of using a Llama LoRA.
    """
    # Whisper currently hits a device mismatch unless the default device is
    # the CPU. This can go once the fix lands upstream in transformers:
    # https://github.com/huggingface/transformers/pull/35866
    torch.set_default_device("cpu")

    # Both engines are started with exactly the same settings.
    runner_kwargs = dict(
        enforce_eager=True,
        max_num_seqs=2,
        enable_lora=True,
        max_loras=1,
        max_lora_rank=128,
        dtype="bfloat16",
        max_model_len=1024,
    )
    max_tokens = 256

    llama3_1_8b_chess_lora = llama3_1_8b_chess_lora_path()

    # The re-keyed adapter lives in a temporary directory, so the Ultravox run
    # has to finish before that directory is cleaned up.
    with TemporaryDirectory() as temp_ultravox_lora_dir:
        llama3_1_8b_ultravox_chess_lora = mk_llama3_1_8b_ultravox_chess_lora(
            llama3_1_8b_chess_lora, temp_ultravox_lora_dir)

        with vllm_runner(ULTRAVOX_MODEL_NAME,
                         **runner_kwargs) as vllm_model:
            ultravox_outputs: List[Tuple[List[int], str]] = (
                vllm_model.generate_greedy(
                    [
                        _get_prompt(0, PROMPT, VLLM_PLACEHOLDER,
                                    ULTRAVOX_MODEL_NAME)
                    ],
                    max_tokens,
                    lora_request=LoRARequest(
                        str(1), 1, llama3_1_8b_ultravox_chess_lora),
                ))

    # Reference run: Llama with the original adapter, to compare against the
    # Ultravox outputs collected above.
    with vllm_runner(LLMA_MODEL_NAME, **runner_kwargs) as vllm_model:
        llama_outputs: List[Tuple[List[int], str]] = (
            vllm_model.generate_greedy(
                [_get_prompt(0, PROMPT, VLLM_PLACEHOLDER, LLMA_MODEL_NAME)],
                max_tokens,
                lora_request=LoRARequest(str(1), 1, llama3_1_8b_chess_lora),
            ))

    check_outputs_equal(
        outputs_0_lst=ultravox_outputs,
        outputs_1_lst=llama_outputs,
        name_0="ultravox",
        name_1="llama",
    )
tests/lora/test_utils.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
collections
import
OrderedDict
from
collections
import
OrderedDict
from
unittest.mock
import
patch
from
unittest.mock
import
patch
...
...
tests/lora/test_worker.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
os
import
os
import
random
import
random
import
tempfile
import
tempfile
...
...
tests/lora/utils.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
typing
import
Dict
,
List
,
Optional
from
typing
import
Dict
,
List
,
Optional
import
torch
import
torch
...
...
tests/metrics/test_metrics.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
os
import
os
import
time
import
time
from
typing
import
List
from
typing
import
List
...
...
tests/model_executor/conftest.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
pytest
import
pytest
...
...
tests/model_executor/test_enabled_custom_ops.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
typing
import
List
from
typing
import
List
import
pytest
import
pytest
...
...
tests/model_executor/test_guided_processors.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
pickle
import
pickle
import
pytest
import
pytest
...
...
tests/model_executor/test_model_load_with_params.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
os
import
os
import
pytest
import
pytest
...
...
tests/model_executor/weight_utils.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
os
import
os
import
tempfile
import
tempfile
...
...
tests/models/decoder_only/audio_language/test_ultravox.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
typing
import
List
,
Optional
,
Tuple
,
Type
from
typing
import
List
,
Optional
,
Tuple
,
Type
import
numpy
as
np
import
numpy
as
np
...
...
tests/models/decoder_only/language/test_aqlm.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
"""Compare the outputs of an AQLM model between vLLM and HF Transformers
"""Compare the outputs of an AQLM model between vLLM and HF Transformers
Run `pytest tests/models/test_aqlm.py`.
Run `pytest tests/models/test_aqlm.py`.
...
...
tests/models/decoder_only/language/test_fp8.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
# flake8: noqa
# flake8: noqa
"""Tests fp8 models against ground truth generation
"""Tests fp8 models against ground truth generation
Note: these tests will only pass on L4 GPU.
Note: these tests will only pass on L4 GPU.
...
...
tests/models/decoder_only/language/test_gguf.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
"""
"""
Tests gguf models against unquantized models generations
Tests gguf models against unquantized models generations
Note: To pass the test, quantization higher than Q4 should be used
Note: To pass the test, quantization higher than Q4 should be used
...
...
tests/models/decoder_only/language/test_gptq_marlin.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
"""Compares the outputs of gptq vs gptq_marlin
"""Compares the outputs of gptq vs gptq_marlin
Note: GPTQ and Marlin do not have bitwise correctness.
Note: GPTQ and Marlin do not have bitwise correctness.
As a result, in this test, we just confirm that the top selected tokens of the
As a result, in this test, we just confirm that the top selected tokens of the
...
...
Prev
1
…
12
13
14
15
16
17
18
19
20
…
50
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment