Unverified Commit 733446dd authored by pansicheng, committed by GitHub

fix io group (#9154)


Co-authored-by: Zhiqiang Xie <xiezhq@stanford.edu>
parent 4c22897a
@@ -296,6 +296,9 @@ class HiCacheController:
             self.prefetch_tp_group = torch.distributed.new_group(
                 group_ranks, backend="gloo"
             )
+            self.prefetch_io_tp_group = torch.distributed.new_group(
+                group_ranks, backend="gloo"
+            )
             self.backup_tp_group = torch.distributed.new_group(
                 group_ranks, backend="gloo"
             )
@@ -602,7 +605,7 @@ class HiCacheController:
         if self.tp_world_size > 1:
             # to ensure all TP workers release the host memory at the same time
-            torch.distributed.barrier(group=self.prefetch_tp_group)
+            torch.distributed.barrier(group=self.prefetch_io_tp_group)
         # operation terminated by controller, release pre-allocated memory
         self.mem_pool_host.free(
             operation.host_indices[operation.completed_tokens :]
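The sketch below illustrates the pattern the patch adopts: giving the IO-release barrier its own gloo process group so it cannot interleave with collectives issued on the main prefetch group from another thread. This is a minimal, hypothetical example, not the project's code; the group names, port, and worker logic are illustrative assumptions.

# Minimal sketch of "one process group per concurrent caller".
# Assumption: two threads per rank issue collectives concurrently, mirroring
# prefetch_tp_group vs. prefetch_io_tp_group in the patch above.
import threading

import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def run(rank: int, world_size: int) -> None:
    dist.init_process_group(
        backend="gloo",
        init_method="tcp://127.0.0.1:29501",  # illustrative rendezvous address
        rank=rank,
        world_size=world_size,
    )
    ranks = list(range(world_size))
    # One group per logical caller: collectives on separate groups use
    # separate communicators and cannot be interleaved with each other.
    prefetch_group = dist.new_group(ranks, backend="gloo")
    prefetch_io_group = dist.new_group(ranks, backend="gloo")

    def prefetch_worker() -> None:
        # Background thread: aggregates progress with an all_reduce on its
        # own group, roughly analogous to the prefetch data path.
        completed = torch.tensor([rank + 1])
        dist.all_reduce(completed, op=dist.ReduceOp.MIN, group=prefetch_group)

    t = threading.Thread(target=prefetch_worker)
    t.start()
    # Controller thread: synchronizes host-memory release on the dedicated
    # IO group, so it cannot collide with the worker's collective.
    dist.barrier(group=prefetch_io_group)
    t.join()
    dist.destroy_process_group()


if __name__ == "__main__":
    mp.spawn(run, args=(2,), nprocs=2)

If both callers shared one group, the two ranks could issue the all_reduce and the barrier in different orders and hang; a dedicated group for the IO barrier avoids that ordering hazard.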