Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
73b9083e
Unverified
Commit
73b9083e
authored
Nov 10, 2024
by
youkaichao
Committed by
GitHub
Nov 11, 2024
Browse files
[misc] improve cloudpickle registration and tests (#10202)
Signed-off-by:
youkaichao
<
youkaichao@gmail.com
>
parent
20cf2f55
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
50 additions
and
31 deletions
+50
-31
tests/distributed/test_pipeline_parallel.py
tests/distributed/test_pipeline_parallel.py
+20
-6
vllm/engine/arg_utils.py
vllm/engine/arg_utils.py
+0
-4
vllm/transformers_utils/config.py
vllm/transformers_utils/config.py
+30
-21
No files found.
tests/distributed/test_pipeline_parallel.py
View file @
73b9083e
...
...
@@ -32,6 +32,8 @@ class PPTestOptions(NamedTuple):
multi_node_only
:
bool
trust_remote_code
:
bool
tokenizer_mode
:
Optional
[
str
]
load_format
:
Optional
[
str
]
=
None
hf_overrides
:
Optional
[
str
]
=
None
@
dataclass
...
...
@@ -50,6 +52,8 @@ class PPTestSettings:
task
:
TaskOption
=
"auto"
,
trust_remote_code
:
bool
=
False
,
tokenizer_mode
:
Optional
[
str
]
=
None
,
load_format
:
Optional
[
str
]
=
None
,
hf_overrides
:
Optional
[
str
]
=
None
,
):
return
PPTestSettings
(
parallel_setups
=
[
...
...
@@ -78,7 +82,9 @@ class PPTestSettings:
task
=
task
,
test_options
=
PPTestOptions
(
multi_node_only
=
multi_node_only
,
trust_remote_code
=
trust_remote_code
,
tokenizer_mode
=
tokenizer_mode
),
tokenizer_mode
=
tokenizer_mode
,
load_format
=
load_format
,
hf_overrides
=
hf_overrides
),
)
@
staticmethod
...
...
@@ -90,6 +96,8 @@ class PPTestSettings:
multi_node_only
:
bool
=
False
,
trust_remote_code
:
bool
=
False
,
tokenizer_mode
:
Optional
[
str
]
=
None
,
load_format
:
Optional
[
str
]
=
None
,
hf_overrides
:
Optional
[
str
]
=
None
,
):
return
PPTestSettings
(
parallel_setups
=
[
...
...
@@ -102,7 +110,9 @@ class PPTestSettings:
task
=
task
,
test_options
=
PPTestOptions
(
multi_node_only
=
multi_node_only
,
trust_remote_code
=
trust_remote_code
,
tokenizer_mode
=
tokenizer_mode
),
tokenizer_mode
=
tokenizer_mode
,
load_format
=
load_format
,
hf_overrides
=
hf_overrides
),
)
def
iter_params
(
self
,
model_name
:
str
):
...
...
@@ -161,9 +171,8 @@ TEXT_GENERATION_MODELS = {
"facebook/opt-iml-max-1.3b"
:
PPTestSettings
.
fast
(),
"OrionStarAI/Orion-14B-Chat"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
"microsoft/phi-2"
:
PPTestSettings
.
fast
(),
"microsoft/Phi-3
-mini-4k
-instruct"
:
PPTestSettings
.
detailed
(
trust_remote_code
=
True
,
multi_node_only
=
True
),
# noqa: E501
"microsoft/Phi-3
.5-MoE
-instruct"
:
PPTestSettings
.
detailed
(
trust_remote_code
=
True
,
multi_node_only
=
True
,
load_format
=
"dummy"
,
hf_overrides
=
'{"num_hidden_layers": 4, "hidden_size": 512, "intermediate_size": 800, "num_attention_heads": 4, "num_key_value_heads": 1}'
),
# noqa: E501
"microsoft/Phi-3-small-8k-instruct"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
# noqa: E501
"microsoft/Phi-3.5-MoE-instruct"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
# noqa: E501
"adept/persimmon-8b-chat"
:
PPTestSettings
.
fast
(),
"Qwen/Qwen-7B-Chat"
:
PPTestSettings
.
fast
(
trust_remote_code
=
True
),
"Qwen/Qwen2-7B-Instruct"
:
PPTestSettings
.
fast
(),
...
...
@@ -214,9 +223,9 @@ MULTIMODAL_MODELS = {
# NOTE: You can update this on your local machine to run specific tests
TEST_MODELS
=
[
# [LANGUAGE GENERATION]
"microsoft/Phi-3.5-MoE-instruct"
,
"meta-llama/Meta-Llama-3-8B"
,
"ibm/PowerLM-3b"
,
"microsoft/Phi-3-mini-4k-instruct"
,
# [LANGUAGE EMBEDDING]
"intfloat/e5-mistral-7b-instruct"
,
"BAAI/bge-multilingual-gemma2"
,
...
...
@@ -238,7 +247,8 @@ def _compare_tp(
method
:
Literal
[
"generate"
,
"encode"
],
):
tp_size
,
pp_size
,
eager_mode
,
chunked_prefill
=
parallel_setup
multi_node_only
,
trust_remote_code
,
tokenizer_mode
=
test_options
multi_node_only
,
trust_remote_code
,
tokenizer_mode
,
\
load_format
,
hf_overrides
=
test_options
if
num_gpus_available
<
tp_size
*
pp_size
:
pytest
.
skip
(
f
"Need at least
{
tp_size
}
x
{
pp_size
}
GPUs"
)
...
...
@@ -267,6 +277,10 @@ def _compare_tp(
common_args
.
append
(
"--trust-remote-code"
)
if
tokenizer_mode
:
common_args
.
extend
([
"--tokenizer-mode"
,
tokenizer_mode
])
if
load_format
:
common_args
.
extend
([
"--load-format"
,
load_format
])
if
hf_overrides
:
common_args
.
extend
([
"--hf-overrides"
,
hf_overrides
])
if
(
distributed_backend
==
"ray"
and
tp_size
==
2
and
pp_size
==
2
and
chunked_prefill
):
...
...
vllm/engine/arg_utils.py
View file @
73b9083e
...
...
@@ -19,8 +19,6 @@ from vllm.logger import init_logger
from
vllm.model_executor.layers.pooler
import
PoolingType
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
from
vllm.platforms
import
current_platform
from
vllm.transformers_utils.config
import
(
maybe_register_config_serialize_by_value
)
from
vllm.transformers_utils.utils
import
check_gguf_file
from
vllm.utils
import
FlexibleArgumentParser
,
StoreBoolean
...
...
@@ -1013,8 +1011,6 @@ class EngineArgs:
"supported for multimodal models and has been disabled."
)
self
.
enable_prefix_caching
=
False
maybe_register_config_serialize_by_value
(
self
.
trust_remote_code
)
cache_config
=
CacheConfig
(
# neuron needs block_size = max_model_len
block_size
=
self
.
block_size
if
self
.
device
!=
"neuron"
else
...
...
vllm/transformers_utils/config.py
View file @
73b9083e
...
...
@@ -234,6 +234,9 @@ def get_config(
patch_rope_scaling
(
config
)
if
trust_remote_code
:
maybe_register_config_serialize_by_value
()
return
config
...
...
@@ -389,33 +392,39 @@ def get_sentence_transformer_tokenizer_config(model: str,
return
None
def
maybe_register_config_serialize_by_value
(
trust_remote_code
:
bool
)
->
None
:
def
maybe_register_config_serialize_by_value
()
->
None
:
"""Try to register HF model configuration class to serialize by value
With trust_remote_code, the config class is typically an instance of a
custom class imported from the HF modules cache. The class will not be
importable in spawned workers by default (and won't exist at all on
other nodes), which breaks serialization of the config.
If trust_remote_code is set, and the model's config file specifies an
`AutoConfig` class, then the config class is typically an instance of
a custom class imported from the HF modules cache.
Examples:
>>> from transformers import AutoConfig
>>> klass = AutoConfig.from_pretrained('meta-llama/Meta-Llama-3-8B', trust_remote_code=True)
>>> klass.__class__ # transformers.models.llama.configuration_llama.LlamaConfig
>>> import transformers_modules # error, not initialized
>>> klass = AutoConfig.from_pretrained('deepseek-ai/DeepSeek-V2.5', trust_remote_code=True)
>>> import transformers_modules # success, initialized
>>> klass.__class__ # transformers_modules.deepseek-ai.DeepSeek-V2.5.98b11844770b2c3ffc18b175c758a803640f4e77.configuration_deepseek.DeepseekV2Config
In the DeepSeek example, the config class is an instance of a custom
class that is not serializable by default. This class will not be
importable in spawned workers, and won't exist at all on
other nodes, which breaks serialization of the config.
In this function we tell the cloudpickle serialization library to pass
instances of these generated classes by value instead of by reference,
i.e. the class definition is serialized along with its data so that the
class module does not need to be importable on the receiving end. This
registration only works if the modules cache has already been
initialized.
class module does not need to be importable on the receiving end.
See: https://github.com/cloudpipe/cloudpickle?tab=readme-ov-file#overriding-pickles-serialization-mechanism-for-importable-constructs
"""
if
not
trust_remote_code
:
return
"""
# noqa
try
:
import
transformers_modules
except
ImportError
:
logger
.
debug
(
"Could not import transformers_modules used for remote"
" code. If remote code is not needed remove"
" `--trust-remote-code`."
)
# the config does not need trust_remote_code
return
try
:
...
...
@@ -428,19 +437,19 @@ def maybe_register_config_serialize_by_value(trust_remote_code: bool) -> None:
ray
.
cloudpickle
.
register_pickle_by_value
(
transformers_modules
)
# multiprocessing uses pickle to serialize arguments when using spawn
# Here we get pickle to use cloudpickle to serialize
ModelC
onfig objects
# Here we get pickle to use cloudpickle to serialize
c
onfig objects
# that contain instances of the custom config class to avoid
# serialization problems if the generated module (and model) has a `.`
# in its name
import
multiprocessing
import
pickle
from
vllm.config
import
Model
Config
from
vllm.config
import
Vllm
Config
def
_reduce_
model
config
(
mc
:
Model
Config
):
return
(
pickle
.
loads
,
(
cloudpickle
.
dumps
(
m
c
),
))
def
_reduce_config
(
config
:
Vllm
Config
):
return
(
pickle
.
loads
,
(
cloudpickle
.
dumps
(
c
onfig
),
))
multiprocessing
.
reducer
.
register
(
Model
Config
,
_reduce_
model
config
)
multiprocessing
.
reducer
.
register
(
Vllm
Config
,
_reduce_config
)
except
Exception
as
e
:
logger
.
warning
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment