Unverified commit 4c69e228, authored by Rui Qiao, committed by GitHub
Browse files

[Misc] Increase RayDistributedExecutor RAY_CGRAPH_get_timeout (#15301)


Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
parent 790b7975
......@@ -561,6 +561,15 @@ class RayDistributedExecutor(DistributedExecutorBase):
envs.VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL)
logger.info("VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM = %s",
envs.VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM)
# Enlarge the default value of "RAY_CGRAPH_get_timeout" to 300 seconds
# (it is 10 seconds by default). This is a Ray environment variable to
# control the timeout of getting result from a compiled graph execution,
# i.e., the distributed execution that includes model forward runs and
# intermediate tensor communications, in the case of vllm.
os.environ.setdefault("RAY_CGRAPH_get_timeout", "300") # noqa: SIM112
logger.info("RAY_CGRAPH_get_timeout is set to %s",
os.environ["RAY_CGRAPH_get_timeout"]) # noqa: SIM112
with InputNode() as input_data:
# Example DAG: PP=2, TP=4
#
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment