Unverified commit 93f71673, authored by Li, Jiang and committed by GitHub
Browse files

[BugFix][CPU] Fix x86 SHM distributed module initialization (#18536)


Signed-off-by: jiang.li <jiang1.li@intel.com>
parent 3f505233
...@@ -22,8 +22,10 @@ class CpuCommunicator(DeviceCommunicatorBase): ...@@ -22,8 +22,10 @@ class CpuCommunicator(DeviceCommunicatorBase):
super().__init__(cpu_group, device, device_group, unique_name) super().__init__(cpu_group, device, device_group, unique_name)
self.dist_module = torch.distributed self.dist_module = torch.distributed
if (current_platform.get_cpu_architecture() == CpuArchEnum.X86) \ if (current_platform.get_cpu_architecture()
and hasattr(torch.ops._C, "init_shm_manager"): == CpuArchEnum.X86) and hasattr(
torch.ops._C,
"init_shm_manager") and unique_name.startswith("tp"):
self.dist_module = _CPUSHMDistributed(self) self.dist_module = _CPUSHMDistributed(self)
def all_reduce(self, input_): def all_reduce(self, input_):
...@@ -96,6 +98,8 @@ class _CPUSHMDistributed: ...@@ -96,6 +98,8 @@ class _CPUSHMDistributed:
def __init__(self, communicator: CpuCommunicator): def __init__(self, communicator: CpuCommunicator):
instance_identifier = os.environ["VLLM_DIST_IDENT"] instance_identifier = os.environ["VLLM_DIST_IDENT"]
unique_name = communicator.unique_name
instance_identifier = f"{instance_identifier}-{unique_name}"
self.communicator = communicator self.communicator = communicator
group_ranks = [str(rank) for rank in self.communicator.ranks] group_ranks = [str(rank) for rank in self.communicator.ranks]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment