Unverified commit 397448eb, authored by Lianmin Zheng, committed by GitHub
Browse files

[Auto Sync] Update parallel_state.py, few_shot_gsm8k.py (20250903) (#9986)


Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: Leon Gao <leon.gao19@gmail.com>
parent 66d5d042
...@@ -879,17 +879,16 @@ class GroupCoordinator: ...@@ -879,17 +879,16 @@ class GroupCoordinator:
size_tensor = torch.tensor( size_tensor = torch.tensor(
[object_tensor.numel()], [object_tensor.numel()],
dtype=torch.long, dtype=torch.long,
device=torch.cuda.current_device(), device="cpu",
) )
# Send object size # Send object size
torch.distributed.send( torch.distributed.send(size_tensor, dst=self.ranks[dst], group=self.cpu_group)
size_tensor, dst=self.ranks[dst], group=self.device_group
)
# Send object # Send object
torch.distributed.send( torch.distributed.send(
object_tensor, dst=self.ranks[dst], group=self.device_group object_tensor,
dst=self.ranks[dst],
group=self.device_group,
) )
return None return None
...@@ -904,13 +903,11 @@ class GroupCoordinator: ...@@ -904,13 +903,11 @@ class GroupCoordinator:
src != self.rank_in_group src != self.rank_in_group
), "Invalid source rank. Source rank is the same as the current rank." ), "Invalid source rank. Source rank is the same as the current rank."
size_tensor = torch.empty( size_tensor = torch.empty(1, dtype=torch.long, device="cpu")
1, dtype=torch.long, device=torch.cuda.current_device()
)
# Receive object size # Receive object size
rank_size = torch.distributed.recv( rank_size = torch.distributed.recv(
size_tensor, src=self.ranks[src], group=self.device_group size_tensor, src=self.ranks[src], group=self.cpu_group
) )
# Tensor to receive serialized objects into. # Tensor to receive serialized objects into.
...@@ -928,7 +925,7 @@ class GroupCoordinator: ...@@ -928,7 +925,7 @@ class GroupCoordinator:
rank_object == rank_size rank_object == rank_size
), "Received object sender rank does not match the size sender rank." ), "Received object sender rank does not match the size sender rank."
obj = pickle.loads(object_tensor.cpu().numpy().tobytes()) obj = pickle.loads(object_tensor.cpu().numpy())
return obj return obj
......
...@@ -129,6 +129,7 @@ def run_eval(args): ...@@ -129,6 +129,7 @@ def run_eval(args):
return { return {
"accuracy": acc, "accuracy": acc,
"invalid": invalid,
"latency": latency, "latency": latency,
"output_throughput": output_throughput, "output_throughput": output_throughput,
} }
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment