"vscode:/vscode.git/clone" did not exist on "d904350a2c9a9fb2e476b45a486cc72fa6c2bd8f"
Unverified commit 05826c88, authored by youkaichao and committed by GitHub
Browse files

[misc] fix custom allreduce p2p cache file generation (#7853)

parent dd9857f5
...@@ -4,6 +4,7 @@ import os ...@@ -4,6 +4,7 @@ import os
import pickle import pickle
import subprocess import subprocess
import sys import sys
import tempfile
from itertools import product from itertools import product
from typing import Dict, List, Optional, Sequence from typing import Dict, List, Optional, Sequence
...@@ -211,7 +212,13 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool: ...@@ -211,7 +212,13 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
# However, `can_actually_p2p` requires spawn method. # However, `can_actually_p2p` requires spawn method.
# The fix is, we use `subprocess` to call the function, # The fix is, we use `subprocess` to call the function,
# where we have `if __name__ == "__main__":` in this file. # where we have `if __name__ == "__main__":` in this file.
input_bytes = pickle.dumps((batch_src, batch_tgt))
# use a temporary file to store the result
# we don't use the output of the subprocess directly,
# because the subprocess might produce logging output
with tempfile.NamedTemporaryFile() as output_file:
input_bytes = pickle.dumps(
(batch_src, batch_tgt, output_file.name))
returned = subprocess.run([sys.executable, __file__], returned = subprocess.run([sys.executable, __file__],
input=input_bytes, input=input_bytes,
capture_output=True) capture_output=True)
...@@ -224,7 +231,8 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool: ...@@ -224,7 +231,8 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
f"Error happened when batch testing " f"Error happened when batch testing "
f"peer-to-peer access from {batch_src} to {batch_tgt}:\n" f"peer-to-peer access from {batch_src} to {batch_tgt}:\n"
f"{returned.stderr.decode()}") from e f"{returned.stderr.decode()}") from e
result = pickle.loads(returned.stdout) with open(output_file.name, "rb") as f:
result = pickle.load(f)
for _i, _j, r in zip(batch_src, batch_tgt, result): for _i, _j, r in zip(batch_src, batch_tgt, result):
cache[f"{_i}->{_j}"] = r cache[f"{_i}->{_j}"] = r
with open(path, "w") as f: with open(path, "w") as f:
...@@ -241,6 +249,7 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool: ...@@ -241,6 +249,7 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
__all__ = ["gpu_p2p_access_check"] __all__ = ["gpu_p2p_access_check"]
if __name__ == "__main__": if __name__ == "__main__":
batch_src, batch_tgt = pickle.loads(sys.stdin.buffer.read()) batch_src, batch_tgt, output_file = pickle.loads(sys.stdin.buffer.read())
result = can_actually_p2p(batch_src, batch_tgt) result = can_actually_p2p(batch_src, batch_tgt)
sys.stdout.buffer.write(pickle.dumps(result)) with open(output_file, "wb") as f:
f.write(pickle.dumps(result))
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment