Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
4b377d6f
Unverified
Commit
4b377d6f
authored
Sep 27, 2024
by
Nick Hill
Committed by
GitHub
Sep 26, 2024
Browse files
[BugFix] Fix test breakages from transformers 4.45 upgrade (#8829)
parent
71d21c73
Changes
13
Show whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
62 additions
and
49 deletions
+62
-49
.buildkite/test-pipeline.yaml
.buildkite/test-pipeline.yaml
+3
-6
tests/conftest.py
tests/conftest.py
+0
-1
tests/distributed/test_pipeline_parallel.py
tests/distributed/test_pipeline_parallel.py
+0
-7
tests/engine/test_custom_executor.py
tests/engine/test_custom_executor.py
+4
-4
tests/entrypoints/openai/test_serving_chat.py
tests/entrypoints/openai/test_serving_chat.py
+6
-0
tests/lora/test_tokenizer_group.py
tests/lora/test_tokenizer_group.py
+2
-2
tests/models/decoder_only/language/test_granite.py
tests/models/decoder_only/language/test_granite.py
+0
-4
tests/models/decoder_only/vision_language/test_llava_next_video.py
...els/decoder_only/vision_language/test_llava_next_video.py
+0
-5
tests/models/decoder_only/vision_language/test_llava_onevision.py
...dels/decoder_only/vision_language/test_llava_onevision.py
+5
-8
tests/models/test_registry.py
tests/models/test_registry.py
+0
-6
tests/samplers/test_sampler.py
tests/samplers/test_sampler.py
+15
-3
vllm/entrypoints/openai/serving_chat.py
vllm/entrypoints/openai/serving_chat.py
+2
-2
vllm/transformers_utils/tokenizer.py
vllm/transformers_utils/tokenizer.py
+25
-1
No files found.
.buildkite/test-pipeline.yaml
View file @
4b377d6f
...
@@ -83,7 +83,6 @@ steps:
...
@@ -83,7 +83,6 @@ steps:
-
label
:
Entrypoints Test
# 20min
-
label
:
Entrypoints Test
# 20min
working_dir
:
"
/vllm-workspace/tests"
working_dir
:
"
/vllm-workspace/tests"
soft_fail
:
true
fast_check
:
true
fast_check
:
true
mirror_hardwares
:
[
amd
]
mirror_hardwares
:
[
amd
]
source_file_dependencies
:
source_file_dependencies
:
...
@@ -96,7 +95,8 @@ steps:
...
@@ -96,7 +95,8 @@ steps:
-
pytest -v -s entrypoints/llm/test_generate.py
# it needs a clean process
-
pytest -v -s entrypoints/llm/test_generate.py
# it needs a clean process
-
pytest -v -s entrypoints/llm/test_generate_multiple_loras.py
# it needs a clean process
-
pytest -v -s entrypoints/llm/test_generate_multiple_loras.py
# it needs a clean process
-
pytest -v -s entrypoints/llm/test_guided_generate.py
# it needs a clean process
-
pytest -v -s entrypoints/llm/test_guided_generate.py
# it needs a clean process
-
pytest -v -s entrypoints/openai
-
pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py
-
pytest -v -s entrypoints/openai/test_oot_registration.py
# it needs a clean process
-
pytest -v -s entrypoints/test_chat_utils.py
-
pytest -v -s entrypoints/test_chat_utils.py
-
pytest -v -s entrypoints/offline_mode
# Needs to avoid interference with other tests
-
pytest -v -s entrypoints/offline_mode
# Needs to avoid interference with other tests
...
@@ -178,7 +178,6 @@ steps:
...
@@ -178,7 +178,6 @@ steps:
-
pytest -v -s prefix_caching
-
pytest -v -s prefix_caching
-
label
:
Samplers Test
# 18min
-
label
:
Samplers Test
# 18min
soft_fail
:
true
source_file_dependencies
:
source_file_dependencies
:
-
vllm/model_executor/layers
-
vllm/model_executor/layers
-
vllm/sampling_metadata.py
-
vllm/sampling_metadata.py
...
@@ -206,7 +205,6 @@ steps:
...
@@ -206,7 +205,6 @@ steps:
-
label
:
LoRA Test %N
# 30min each
-
label
:
LoRA Test %N
# 30min each
mirror_hardwares
:
[
amd
]
mirror_hardwares
:
[
amd
]
soft_fail
:
true
source_file_dependencies
:
source_file_dependencies
:
-
vllm/lora
-
vllm/lora
-
tests/lora
-
tests/lora
...
@@ -311,7 +309,6 @@ steps:
...
@@ -311,7 +309,6 @@ steps:
-
pytest -v -s models/decoder_only/language
-
pytest -v -s models/decoder_only/language
-
label
:
Decoder-only Multi-Modal Models Test
# 56min
-
label
:
Decoder-only Multi-Modal Models Test
# 56min
soft_fail
:
true
#mirror_hardwares: [amd]
#mirror_hardwares: [amd]
source_file_dependencies
:
source_file_dependencies
:
-
vllm/
-
vllm/
...
@@ -463,7 +460,7 @@ steps:
...
@@ -463,7 +460,7 @@ steps:
# NOTE: don't test llama model here, it seems hf implementation is buggy
# NOTE: don't test llama model here, it seems hf implementation is buggy
# see https://github.com/vllm-project/vllm/pull/5689 for details
# see https://github.com/vllm-project/vllm/pull/5689 for details
-
pytest -v -s distributed/test_custom_all_reduce.py
-
pytest -v -s distributed/test_custom_all_reduce.py
-
TARGET_TEST_SUITE=A100 pytest -v -s distributed
/test_basic_distributed_correctness.py
-
TARGET_TEST_SUITE=A100 pytest
basic_correctness/
-v -s
-m
distributed
_2_gpus
-
pytest -v -s -x lora/test_mixtral.py
-
pytest -v -s -x lora/test_mixtral.py
-
label
:
LM Eval Large Models
# optional
-
label
:
LM Eval Large Models
# optional
...
...
tests/conftest.py
View file @
4b377d6f
...
@@ -699,7 +699,6 @@ class VllmRunner:
...
@@ -699,7 +699,6 @@ class VllmRunner:
if
videos
is
not
None
:
if
videos
is
not
None
:
for
i
,
video
in
enumerate
(
videos
):
for
i
,
video
in
enumerate
(
videos
):
inputs
[
i
][
"multi_modal_data"
]
=
{
"video"
:
video
}
inputs
[
i
][
"multi_modal_data"
]
=
{
"video"
:
video
}
print
(
f
"[INPUTS!!!!]:
{
inputs
}
,
{
sampling_params
}
"
)
req_outputs
=
self
.
model
.
generate
(
inputs
,
req_outputs
=
self
.
model
.
generate
(
inputs
,
sampling_params
=
sampling_params
)
sampling_params
=
sampling_params
)
...
...
tests/distributed/test_pipeline_parallel.py
View file @
4b377d6f
...
@@ -8,8 +8,6 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
...
@@ -8,8 +8,6 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
import
os
import
os
import
pytest
import
pytest
from
packaging
import
version
from
transformers
import
__version__
as
transformers_version
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
...
@@ -49,11 +47,6 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL,
...
@@ -49,11 +47,6 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL,
pytest
.
skip
(
"Skipping multi-node pipeline parallel test for "
pytest
.
skip
(
"Skipping multi-node pipeline parallel test for "
"multiprocessing distributed backend"
)
"multiprocessing distributed backend"
)
# Skip tests that require transformers>=4.45.0
if
"Qwen2-VL"
in
MODEL_NAME
and
version
.
parse
(
transformers_version
)
<
version
.
parse
(
"4.45.0.dev0"
):
pytest
.
skip
(
"This test requires transformers>=4.45.0"
)
pp_args
=
[
pp_args
=
[
# use half precision for speed and memory savings in CI environment
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"--dtype"
,
...
...
tests/engine/test_custom_executor.py
View file @
4b377d6f
...
@@ -48,9 +48,9 @@ def test_custom_executor_type_checking(model):
...
@@ -48,9 +48,9 @@ def test_custom_executor_type_checking(model):
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"facebook/opt-125m"
])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"facebook/opt-125m"
])
def
test_custom_executor
(
model
,
tmp
dir
):
def
test_custom_executor
(
model
,
tmp
_path
):
cwd
=
os
.
path
.
abspath
(
"."
)
cwd
=
os
.
path
.
abspath
(
"."
)
os
.
chdir
(
tmp
dir
)
os
.
chdir
(
tmp
_path
)
try
:
try
:
assert
not
os
.
path
.
exists
(
".marker"
)
assert
not
os
.
path
.
exists
(
".marker"
)
...
@@ -68,9 +68,9 @@ def test_custom_executor(model, tmpdir):
...
@@ -68,9 +68,9 @@ def test_custom_executor(model, tmpdir):
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"facebook/opt-125m"
])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"facebook/opt-125m"
])
def
test_custom_executor_async
(
model
,
tmp
dir
):
def
test_custom_executor_async
(
model
,
tmp
_path
):
cwd
=
os
.
path
.
abspath
(
"."
)
cwd
=
os
.
path
.
abspath
(
"."
)
os
.
chdir
(
tmp
dir
)
os
.
chdir
(
tmp
_path
)
try
:
try
:
assert
not
os
.
path
.
exists
(
".marker"
)
assert
not
os
.
path
.
exists
(
".marker"
)
...
...
tests/entrypoints/openai/test_serving_chat.py
View file @
4b377d6f
...
@@ -15,6 +15,11 @@ CHAT_TEMPLATE = "Dummy chat template for testing {}"
...
@@ -15,6 +15,11 @@ CHAT_TEMPLATE = "Dummy chat template for testing {}"
BASE_MODEL_PATHS
=
[
BaseModelPath
(
name
=
MODEL_NAME
,
model_path
=
MODEL_NAME
)]
BASE_MODEL_PATHS
=
[
BaseModelPath
(
name
=
MODEL_NAME
,
model_path
=
MODEL_NAME
)]
@
dataclass
class
MockHFConfig
:
model_type
:
str
=
"any"
@
dataclass
@
dataclass
class
MockModelConfig
:
class
MockModelConfig
:
tokenizer
=
MODEL_NAME
tokenizer
=
MODEL_NAME
...
@@ -24,6 +29,7 @@ class MockModelConfig:
...
@@ -24,6 +29,7 @@ class MockModelConfig:
tokenizer_revision
=
None
tokenizer_revision
=
None
embedding_mode
=
False
embedding_mode
=
False
multimodal_config
=
MultiModalConfig
()
multimodal_config
=
MultiModalConfig
()
hf_config
=
MockHFConfig
()
@
dataclass
@
dataclass
...
...
tests/lora/test_tokenizer_group.py
View file @
4b377d6f
...
@@ -41,7 +41,7 @@ async def test_tokenizer_group_lora(sql_lora_files, tokenizer_group_type):
...
@@ -41,7 +41,7 @@ async def test_tokenizer_group_lora(sql_lora_files, tokenizer_group_type):
lora_request
)
lora_request
)
def
test_get_lora_tokenizer
(
sql_lora_files
,
tmp
dir
):
def
test_get_lora_tokenizer
(
sql_lora_files
,
tmp
_path
):
lora_request
=
None
lora_request
=
None
tokenizer
=
get_lora_tokenizer
(
lora_request
)
tokenizer
=
get_lora_tokenizer
(
lora_request
)
assert
not
tokenizer
assert
not
tokenizer
...
@@ -50,6 +50,6 @@ def test_get_lora_tokenizer(sql_lora_files, tmpdir):
...
@@ -50,6 +50,6 @@ def test_get_lora_tokenizer(sql_lora_files, tmpdir):
tokenizer
=
get_lora_tokenizer
(
lora_request
)
tokenizer
=
get_lora_tokenizer
(
lora_request
)
assert
tokenizer
.
get_added_vocab
()
assert
tokenizer
.
get_added_vocab
()
lora_request
=
LoRARequest
(
"1"
,
1
,
str
(
tmp
dir
))
lora_request
=
LoRARequest
(
"1"
,
1
,
str
(
tmp
_path
))
tokenizer
=
get_lora_tokenizer
(
lora_request
)
tokenizer
=
get_lora_tokenizer
(
lora_request
)
assert
not
tokenizer
assert
not
tokenizer
tests/models/decoder_only/language/test_granite.py
View file @
4b377d6f
...
@@ -3,7 +3,6 @@
...
@@ -3,7 +3,6 @@
Run `pytest tests/models/test_granite.py`.
Run `pytest tests/models/test_granite.py`.
"""
"""
import
pytest
import
pytest
import
transformers
from
...utils
import
check_logprobs_close
from
...utils
import
check_logprobs_close
...
@@ -12,9 +11,6 @@ MODELS = [
...
@@ -12,9 +11,6 @@ MODELS = [
]
]
# GraniteForCausalLM will be in transformers >= 4.45
@
pytest
.
mark
.
skipif
(
transformers
.
__version__
<
"4.45"
,
reason
=
"granite model test requires transformers >= 4.45"
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
64
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
64
])
...
...
tests/models/decoder_only/vision_language/test_llava_next_video.py
View file @
4b377d6f
from
typing
import
List
,
Optional
,
Tuple
,
Type
,
overload
from
typing
import
List
,
Optional
,
Tuple
,
Type
,
overload
import
pytest
import
pytest
import
transformers
from
transformers
import
AutoConfig
,
AutoModelForVision2Seq
,
AutoTokenizer
from
transformers
import
AutoConfig
,
AutoModelForVision2Seq
,
AutoTokenizer
from
vllm.multimodal.utils
import
(
rescale_video_size
,
resize_video
,
from
vllm.multimodal.utils
import
(
rescale_video_size
,
resize_video
,
...
@@ -158,8 +157,6 @@ def run_test(
...
@@ -158,8 +157,6 @@ def run_test(
)
)
@
pytest
.
mark
.
skipif
(
transformers
.
__version__
<
"4.45"
,
reason
=
"Waiting for next transformers release"
)
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"size_factors"
,
"size_factors"
,
...
@@ -203,8 +200,6 @@ def test_models(hf_runner, vllm_runner, video_assets, model, size_factors,
...
@@ -203,8 +200,6 @@ def test_models(hf_runner, vllm_runner, video_assets, model, size_factors,
)
)
@
pytest
.
mark
.
skipif
(
transformers
.
__version__
<
"4.45"
,
reason
=
"Waiting for next transformers release"
)
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"sizes"
,
"sizes"
,
...
...
tests/models/decoder_only/vision_language/test_llava_onevision.py
View file @
4b377d6f
from
typing
import
List
,
Optional
,
Tuple
,
Type
,
overload
from
typing
import
List
,
Optional
,
Tuple
,
Type
,
overload
import
pytest
import
pytest
import
transformers
from
transformers
import
(
AutoConfig
,
AutoModelForVision2Seq
,
AutoTokenizer
,
from
transformers
import
(
AutoConfig
,
AutoModelForVision2Seq
,
AutoTokenizer
,
BatchEncoding
)
BatchEncoding
)
...
@@ -166,8 +165,6 @@ def run_video_test(
...
@@ -166,8 +165,6 @@ def run_video_test(
)
)
@
pytest
.
mark
.
skipif
(
transformers
.
__version__
<
"4.45"
,
reason
=
"Waiting for next transformers release"
)
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"size_factors"
,
"size_factors"
,
...
@@ -211,8 +208,6 @@ def test_models(hf_runner, vllm_runner, video_assets, model, size_factors,
...
@@ -211,8 +208,6 @@ def test_models(hf_runner, vllm_runner, video_assets, model, size_factors,
)
)
@
pytest
.
mark
.
skipif
(
transformers
.
__version__
<
"4.45"
,
reason
=
"Waiting for next transformers release"
)
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"sizes"
,
"sizes"
,
...
@@ -259,7 +254,9 @@ def run_image_test(
...
@@ -259,7 +254,9 @@ def run_image_test(
# max_model_len should be greater than image_feature_size
# max_model_len should be greater than image_feature_size
with
vllm_runner
(
model
,
with
vllm_runner
(
model
,
dtype
=
dtype
,
dtype
=
dtype
,
max_model_len
=
32768
,
max_num_seqs
=
1
,
max_model_len
=
16384
,
gpu_memory_utilization
=
0.98
,
tensor_parallel_size
=
tensor_parallel_size
,
tensor_parallel_size
=
tensor_parallel_size
,
distributed_executor_backend
=
distributed_executor_backend
,
distributed_executor_backend
=
distributed_executor_backend
,
enforce_eager
=
True
,
enforce_eager
=
True
,
...
@@ -305,8 +302,8 @@ def run_image_test(
...
@@ -305,8 +302,8 @@ def run_image_test(
)
)
@
pytest
.
mark
.
skipif
(
transformers
.
__version__
<
"4.45"
,
# FIXME: Swap to a smaller model for this architecture
reason
=
"Waiting for next transformers release
"
)
@
pytest
.
mark
.
skip
(
reason
=
"Model OOMing on CI
"
)
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
...
...
tests/models/test_registry.py
View file @
4b377d6f
import
pytest
import
pytest
import
transformers
from
vllm.model_executor.models
import
_MODELS
,
ModelRegistry
from
vllm.model_executor.models
import
_MODELS
,
ModelRegistry
@
pytest
.
mark
.
parametrize
(
"model_cls"
,
_MODELS
)
@
pytest
.
mark
.
parametrize
(
"model_cls"
,
_MODELS
)
def
test_registry_imports
(
model_cls
):
def
test_registry_imports
(
model_cls
):
if
(
model_cls
in
(
"LlavaOnevisionForConditionalGeneration"
,
"Qwen2VLForConditionalGeneration"
)
and
transformers
.
__version__
<
"4.45"
):
pytest
.
skip
(
"Waiting for next transformers release"
)
# Ensure all model classes can be imported successfully
# Ensure all model classes can be imported successfully
ModelRegistry
.
resolve_model_cls
([
model_cls
])
ModelRegistry
.
resolve_model_cls
([
model_cls
])
tests/samplers/test_sampler.py
View file @
4b377d6f
import
itertools
import
itertools
import
random
import
random
from
dataclasses
import
dataclass
from
typing
import
Dict
,
List
,
Optional
,
Tuple
from
typing
import
Dict
,
List
,
Optional
,
Tuple
from
unittest.mock
import
Mock
,
patch
from
unittest.mock
import
Mock
,
patch
...
@@ -596,8 +597,19 @@ def test_sampler_top_k_top_p(seed: int, device: str):
...
@@ -596,8 +597,19 @@ def test_sampler_top_k_top_p(seed: int, device: str):
generation_config
=
GenerationConfig
(
top_k
=
top_k
,
generation_config
=
GenerationConfig
(
top_k
=
top_k
,
top_p
=
top_p
,
top_p
=
top_p
,
do_sample
=
True
)
do_sample
=
True
)
warpers
=
generation_model
.
_get_logits_warper
(
generation_config
,
device
)
assert
len
(
warpers
)
==
2
# top_p and top_k
@
dataclass
class
MockConfig
:
is_encoder_decoder
:
bool
=
False
generation_model
.
config
=
MockConfig
()
# needed by the following method
generation_model
.
_prepare_special_tokens
(
generation_config
,
device
=
device
)
processors
=
generation_model
.
_get_logits_processor
(
generation_config
,
None
,
None
,
None
,
[],
device
=
device
)
assert
len
(
processors
)
==
2
# top_p and top_k
seq_group_metadata_list
:
List
[
SequenceGroupMetadata
]
=
[]
seq_group_metadata_list
:
List
[
SequenceGroupMetadata
]
=
[]
seq_lens
:
List
[
int
]
=
[]
seq_lens
:
List
[
int
]
=
[]
...
@@ -639,7 +651,7 @@ def test_sampler_top_k_top_p(seed: int, device: str):
...
@@ -639,7 +651,7 @@ def test_sampler_top_k_top_p(seed: int, device: str):
assert
sample_probs
is
not
None
assert
sample_probs
is
not
None
hf_probs
=
warpe
rs
(
torch
.
zeros_like
(
fake_logits
),
fake_logits
.
clone
())
hf_probs
=
processo
rs
(
torch
.
zeros_like
(
fake_logits
),
fake_logits
.
clone
())
hf_probs
=
torch
.
softmax
(
hf_probs
,
dim
=-
1
,
dtype
=
torch
.
float
)
hf_probs
=
torch
.
softmax
(
hf_probs
,
dim
=-
1
,
dtype
=
torch
.
float
)
torch
.
testing
.
assert_close
(
hf_probs
,
sample_probs
,
rtol
=
0.0
,
atol
=
1e-5
)
torch
.
testing
.
assert_close
(
hf_probs
,
sample_probs
,
rtol
=
0.0
,
atol
=
1e-5
)
assert
torch
.
equal
(
hf_probs
.
eq
(
0
),
sample_probs
.
eq
(
0
))
assert
torch
.
equal
(
hf_probs
.
eq
(
0
),
sample_probs
.
eq
(
0
))
...
...
vllm/entrypoints/openai/serving_chat.py
View file @
4b377d6f
...
@@ -152,13 +152,13 @@ class OpenAIServingChat(OpenAIServing):
...
@@ -152,13 +152,13 @@ class OpenAIServingChat(OpenAIServing):
**
(
request
.
chat_template_kwargs
or
{}),
**
(
request
.
chat_template_kwargs
or
{}),
)
)
except
Exception
as
e
:
except
Exception
as
e
:
logger
.
e
rror
(
"Error in applying chat template from request
: %s"
,
e
)
logger
.
e
xception
(
"Error in applying chat template from request
"
)
return
self
.
create_error_response
(
str
(
e
))
return
self
.
create_error_response
(
str
(
e
))
try
:
try
:
mm_data
=
await
mm_data_future
mm_data
=
await
mm_data_future
except
Exception
as
e
:
except
Exception
as
e
:
logger
.
e
rror
(
"Error in loading multi-modal data
: %s"
,
e
)
logger
.
e
xception
(
"Error in loading multi-modal data
"
)
return
self
.
create_error_response
(
str
(
e
))
return
self
.
create_error_response
(
str
(
e
))
# validation for OpenAI tools
# validation for OpenAI tools
...
...
vllm/transformers_utils/tokenizer.py
View file @
4b377d6f
import
os
import
os
import
warnings
import
warnings
from
pathlib
import
Path
from
pathlib
import
Path
from
types
import
MethodType
from
typing
import
Optional
,
Union
from
typing
import
Optional
,
Union
import
huggingface_hub
import
huggingface_hub
...
@@ -152,6 +153,29 @@ def get_tokenizer(
...
@@ -152,6 +153,29 @@ def get_tokenizer(
else
:
else
:
raise
e
raise
e
# NOTE: We can remove this after https://github.com/THUDM/ChatGLM3/issues/1324
if
type
(
tokenizer
).
__name__
in
(
"ChatGLMTokenizer"
,
"ChatGLM4Tokenizer"
):
assert
isinstance
(
tokenizer
,
PreTrainedTokenizer
)
orig_pad
=
tokenizer
.
_pad
# Patch _pad method to accept `padding_side`
def
_pad
(
self
:
PreTrainedTokenizer
,
*
args
,
padding_side
:
Optional
[
str
]
=
None
,
**
kwargs
,
):
if
(
padding_side
is
not
None
and
padding_side
!=
self
.
padding_side
):
msg
=
(
"`padding_side` argument is not supported by "
"ChatGLMTokenizer and will be ignored."
)
warnings
.
warn
(
msg
,
stacklevel
=
2
)
return
orig_pad
(
*
args
,
**
kwargs
)
tokenizer
.
_pad
=
MethodType
(
_pad
,
tokenizer
)
if
not
isinstance
(
tokenizer
,
PreTrainedTokenizerFast
):
if
not
isinstance
(
tokenizer
,
PreTrainedTokenizerFast
):
logger
.
warning
(
logger
.
warning
(
"Using a slow tokenizer. This might cause a significant "
"Using a slow tokenizer. This might cause a significant "
...
@@ -167,7 +191,7 @@ def get_lora_tokenizer(lora_request: LoRARequest, *args,
...
@@ -167,7 +191,7 @@ def get_lora_tokenizer(lora_request: LoRARequest, *args,
return
None
return
None
try
:
try
:
tokenizer
=
get_tokenizer
(
lora_request
.
lora_path
,
*
args
,
**
kwargs
)
tokenizer
=
get_tokenizer
(
lora_request
.
lora_path
,
*
args
,
**
kwargs
)
except
OSError
as
e
:
except
Exception
as
e
:
# No tokenizer was found in the LoRA folder,
# No tokenizer was found in the LoRA folder,
# use base model tokenizer
# use base model tokenizer
logger
.
warning
(
logger
.
warning
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment