Unverified commit 733446dd, authored by pansicheng, committed by GitHub
Browse files

fix io group (#9154)


Co-authored-by: Zhiqiang Xie <xiezhq@stanford.edu>
parent 4c22897a
......@@ -296,6 +296,9 @@ class HiCacheController:
self.prefetch_tp_group = torch.distributed.new_group(
group_ranks, backend="gloo"
)
+self.prefetch_io_tp_group = torch.distributed.new_group(
+group_ranks, backend="gloo"
+)
self.backup_tp_group = torch.distributed.new_group(
group_ranks, backend="gloo"
)
......@@ -602,7 +605,7 @@ class HiCacheController:
if self.tp_world_size > 1:
# to ensure all TP workers release the host memory at the same time
-torch.distributed.barrier(group=self.prefetch_tp_group)
+torch.distributed.barrier(group=self.prefetch_io_tp_group)
# operation terminated by controller, release pre-allocated memory
self.mem_pool_host.free(
operation.host_indices[operation.completed_tokens :]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment