Unverified commit e9ba519f, authored by shaharmor98 and committed by GitHub
Browse files

[DP][Ray] Pin DP control bundle to same node as first GPU bundle (#39167)


Signed-off-by: Shahar Mor <smor@nvidia.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
parent 5ef33ab2
......@@ -80,6 +80,21 @@ class EngineHandshakeMetadata:
parallel_config: dict[str, int | str | list[int]]
def _make_control_bundle(node_ip: str) -> dict[str, float]:
# The engine actor is scheduled on the final CPU-only bundle. Keep that
# bundle colocated with the group's first GPU bundle so the actor does not
# float to an unrelated node and reorder worker ranks away from the
# advertised DP bootstrap host.
return {"CPU": 1.0, "node:" + node_ip: 0.001}
def _get_bundle_node_ip(bundle: dict[str, float]) -> str:
for key in bundle:
if key.startswith("node:"):
return key.split(":", 1)[1]
raise ValueError(f"Missing node affinity in placement bundle: {bundle}")
class CoreEngineProcManager:
"""
Utility class to handle creation, readiness, and shutdown
......@@ -597,10 +612,20 @@ class CoreEngineActorManager:
if len(collected_bundles) < world_size:
continue
bundles = collected_bundles + [{"CPU": 1.0}]
control_node_ip = _get_bundle_node_ip(collected_bundles[0])
bundles = collected_bundles + [
_make_control_bundle(control_node_ip)
]
collected_bundles = []
else:
bundles = device_bundle * world_size + [{"CPU": 1.0}]
# STRICT_PACK already keeps every bundle in the placement
# group on one node, so the explicit node affinity on the
# control bundle is redundant for correctness here. Keep it
# anyway for consistency with the span path and to preserve
# intent if this scheduling strategy changes later.
bundles = device_bundle * world_size + [
_make_control_bundle(node_ip)
]
pg = ray.util.placement_group(
name=f"dp_rank_{len(placement_groups)}",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment