Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
c222f479
Unverified
Commit
c222f479
authored
Jan 20, 2025
by
youkaichao
Committed by
GitHub
Jan 20, 2025
Browse files
[core][bugfix] configure env var during import vllm (#12209)
Signed-off-by:
youkaichao
<
youkaichao@gmail.com
>
parent
170eb350
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
37 additions
and
45 deletions
+37
-45
examples/offline_inference/rlhf.py
examples/offline_inference/rlhf.py
+1
-6
vllm/__init__.py
vllm/__init__.py
+13
-36
vllm/plugins/__init__.py
vllm/plugins/__init__.py
+23
-0
vllm/worker/worker_base.py
vllm/worker/worker_base.py
+0
-3
No files found.
examples/offline_inference/rlhf.py
View file @
c222f479
...
...
@@ -19,7 +19,7 @@ from ray.util.placement_group import placement_group
from
ray.util.scheduling_strategies
import
PlacementGroupSchedulingStrategy
from
transformers
import
AutoModelForCausalLM
from
vllm
import
LLM
,
SamplingParams
,
configure_as_vllm_process
from
vllm
import
LLM
,
SamplingParams
from
vllm.utils
import
get_ip
,
get_open_port
from
vllm.worker.worker
import
Worker
...
...
@@ -98,12 +98,7 @@ class MyLLM(LLM):
"""
Start the training process, here we use huggingface transformers
as an example to hold a model on GPU 0.
It is important for all the processes outside of vLLM to call
`configure_as_vllm_process` to set some common environment variables
the same as vLLM workers.
"""
configure_as_vllm_process
()
train_model
=
AutoModelForCausalLM
.
from_pretrained
(
"facebook/opt-125m"
)
train_model
.
to
(
"cuda:0"
)
...
...
vllm/__init__.py
View file @
c222f479
"""vLLM: a high-throughput and memory-efficient inference engine for LLMs"""
import
os
import
torch
from
vllm.engine.arg_utils
import
AsyncEngineArgs
,
EngineArgs
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
...
...
@@ -17,43 +20,18 @@ from vllm.sampling_params import SamplingParams
from
.version
import
__version__
,
__version_tuple__
# set some common config/environment variables that should be set
# for all processes created by vllm and all processes
# that interact with vllm workers.
# they are executed whenever `import vllm` is called.
def
configure_as_vllm_process
():
"""
set some common config/environment variables that should be set
for all processes created by vllm and all processes
that interact with vllm workers.
"""
import
os
import
torch
# see https://github.com/NVIDIA/nccl/issues/1234
os
.
environ
[
'NCCL_CUMEM_ENABLE'
]
=
'0'
# see https://github.com/vllm-project/vllm/issues/10480
os
.
environ
[
'TORCHINDUCTOR_COMPILE_THREADS'
]
=
'1'
# see https://github.com/vllm-project/vllm/issues/10619
torch
.
_inductor
.
config
.
compile_threads
=
1
from
vllm.platforms
import
current_platform
if
current_platform
.
is_xpu
():
# see https://github.com/pytorch/pytorch/blob/43c5f59/torch/_dynamo/config.py#L158
torch
.
_dynamo
.
config
.
disable
=
True
elif
current_platform
.
is_hpu
():
# NOTE(kzawora): PT HPU lazy backend (PT_HPU_LAZY_MODE = 1)
# does not support torch.compile
# Eager backend (PT_HPU_LAZY_MODE = 0) must be selected for
# torch.compile support
is_lazy
=
os
.
environ
.
get
(
'PT_HPU_LAZY_MODE'
,
'1'
)
==
'1'
if
is_lazy
:
torch
.
_dynamo
.
config
.
disable
=
True
# NOTE(kzawora) multi-HPU inference with HPUGraphs (lazy-only)
# requires enabling lazy collectives
# see https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html # noqa: E501
os
.
environ
[
'PT_HPU_ENABLE_LAZY_COLLECTIVES'
]
=
'true'
# see https://github.com/NVIDIA/nccl/issues/1234
os
.
environ
[
'NCCL_CUMEM_ENABLE'
]
=
'0'
# see https://github.com/vllm-project/vllm/issues/10480
os
.
environ
[
'TORCHINDUCTOR_COMPILE_THREADS'
]
=
'1'
# see https://github.com/vllm-project/vllm/issues/10619
torch
.
_inductor
.
config
.
compile_threads
=
1
__all__
=
[
"__version__"
,
...
...
@@ -80,5 +58,4 @@ __all__ = [
"AsyncEngineArgs"
,
"initialize_ray_cluster"
,
"PoolingParams"
,
"configure_as_vllm_process"
,
]
vllm/plugins/__init__.py
View file @
c222f479
import
logging
import
os
from
typing
import
Callable
,
Dict
import
torch
import
vllm.envs
as
envs
logger
=
logging
.
getLogger
(
__name__
)
...
...
@@ -51,6 +54,26 @@ def load_general_plugins():
if
plugins_loaded
:
return
plugins_loaded
=
True
# some platform-specific configurations
from
vllm.platforms
import
current_platform
if
current_platform
.
is_xpu
():
# see https://github.com/pytorch/pytorch/blob/43c5f59/torch/_dynamo/config.py#L158
torch
.
_dynamo
.
config
.
disable
=
True
elif
current_platform
.
is_hpu
():
# NOTE(kzawora): PT HPU lazy backend (PT_HPU_LAZY_MODE = 1)
# does not support torch.compile
# Eager backend (PT_HPU_LAZY_MODE = 0) must be selected for
# torch.compile support
is_lazy
=
os
.
environ
.
get
(
'PT_HPU_LAZY_MODE'
,
'1'
)
==
'1'
if
is_lazy
:
torch
.
_dynamo
.
config
.
disable
=
True
# NOTE(kzawora) multi-HPU inference with HPUGraphs (lazy-only)
# requires enabling lazy collectives
# see https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html # noqa: E501
os
.
environ
[
'PT_HPU_ENABLE_LAZY_COLLECTIVES'
]
=
'true'
plugins
=
load_plugins_by_group
(
group
=
'vllm.general_plugins'
)
# general plugins, we only need to execute the loaded functions
for
func
in
plugins
.
values
():
...
...
vllm/worker/worker_base.py
View file @
c222f479
...
...
@@ -535,9 +535,6 @@ class WorkerWrapperBase:
kwargs
=
all_kwargs
[
self
.
rpc_rank
]
enable_trace_function_call_for_thread
(
self
.
vllm_config
)
from
vllm
import
configure_as_vllm_process
configure_as_vllm_process
()
from
vllm.plugins
import
load_general_plugins
load_general_plugins
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment