Unverified Commit faeee0a7 authored by Yang Wang's avatar Yang Wang Committed by GitHub
Browse files

Auto generate ibstat file for topo aware traffic pattern (#381)

An enhancement for topo-aware IB performance validation #373.
This PR auto-generates the required ibstat file `ib_traffic_topo_aware_ibstat.txt`, which is used as input to build a graph.
parent b5c7c85d
...@@ -18,7 +18,7 @@ steps: ...@@ -18,7 +18,7 @@ steps:
echo "##vso[task.prependpath]$HOME/.local/bin" echo "##vso[task.prependpath]$HOME/.local/bin"
displayName: Export path displayName: Export path
- script: | - script: |
python3 -m pip install .[test,nvidia,torch,ort] python3 -m pip install .[test,nvidia,torch,ort,mpi]
make postinstall make postinstall
displayName: Install dependencies displayName: Install dependencies
- script: | - script: |
......
...@@ -128,6 +128,6 @@ ADD third_party third_party ...@@ -128,6 +128,6 @@ ADD third_party third_party
RUN make -C third_party cuda RUN make -C third_party cuda
ADD . . ADD . .
RUN python3 -m pip install .[nvidia,torch,ort] && \ RUN python3 -m pip install .[nvidia,torch,ort,mpi] && \
make cppbuild && \ make cppbuild && \
make postinstall make postinstall
...@@ -124,6 +124,6 @@ ADD third_party third_party ...@@ -124,6 +124,6 @@ ADD third_party third_party
RUN make -C third_party rocm RUN make -C third_party rocm
ADD . . ADD . .
RUN python3 -m pip install .[torch,ort] && \ RUN python3 -m pip install .[torch,ort,mpi] && \
make cppbuild && \ make cppbuild && \
make postinstall make postinstall
...@@ -139,6 +139,6 @@ ADD third_party third_party ...@@ -139,6 +139,6 @@ ADD third_party third_party
RUN make ROCBLAS_BRANCH=release/rocm-rel-5.1 -C third_party rocm RUN make ROCBLAS_BRANCH=release/rocm-rel-5.1 -C third_party rocm
ADD . . ADD . .
RUN python3 -m pip install .[torch,ort] && \ RUN python3 -m pip install .[torch,ort,mpi] && \
make cppbuild && \ make cppbuild && \
make postinstall make postinstall
...@@ -191,6 +191,7 @@ def run(self): ...@@ -191,6 +191,7 @@ def run(self):
'torchvision>=0.8.0a0', 'torchvision>=0.8.0a0',
'transformers>=4.3.3', 'transformers>=4.3.3',
], ],
'mpi': ['mpi4py>=3.1.3'],
}, },
include_package_data=True, include_package_data=True,
entry_points={ entry_points={
......
...@@ -4,7 +4,11 @@ ...@@ -4,7 +4,11 @@
"""Topology Aware Utilities.""" """Topology Aware Utilities."""
import re import re
import os
from pathlib import Path
import networkx as nx import networkx as nx
from superbench.common.utils import logger from superbench.common.utils import logger
...@@ -31,6 +35,39 @@ def search(self, pattern, string, flags=0): ...@@ -31,6 +35,39 @@ def search(self, pattern, string, flags=0):
return self.matched return self.matched
def gen_ibstat_file(ibstat_file):
    """Generate ibstat file for each node with specified path.

    Each calling MPI process runs ``ibstat`` locally, prefixes the extracted
    "System image GUID" values with a ``VM_hostname <processor name>`` line,
    exchanges that text with every other process in ``MPI.COMM_WORLD`` via
    ``allgather``, and then writes the de-duplicated entries to
    ``ibstat_file``. Every process performs the write; there is no rank
    check.

    MPI is initialized here if it has not been already, and is finalized
    before returning.

    Args:
        ibstat_file (str): path of ibstat output.
    """
    from mpi4py import MPI
    if not MPI.Is_initialized():
        MPI.Init()
    comm = MPI.COMM_WORLD
    name = MPI.Get_processor_name()

    # The command to fetch ibstat info
    cmd = r"ibstat | grep -Po 'System image GUID: \K\S+$'"
    # Use the pipe as a context manager so it is closed once the output is read.
    with os.popen(cmd) as output:
        guids = output.read()
    ibstat = 'VM_hostname ' + name + '\n' + guids

    # Fetch all ibstat info from each node
    ibstats = comm.allgather(ibstat)

    # Filter the duplicate info (processes on the same node report identical
    # text); sort so the file content is reproducible across runs.
    ibstat_infos = sorted(set(ibstats))
    with Path(ibstat_file).open(mode='w') as f:
        f.writelines(ibstat_infos)

    # NOTE(review): MPI is finalized even when it was initialized before this
    # call - confirm that no caller needs MPI afterwards.
    MPI.Finalize()
def gen_topo_aware_config(host_list, ibstat_file, ibnetdiscover_file, min_dist, max_dist): # noqa: C901 def gen_topo_aware_config(host_list, ibstat_file, ibnetdiscover_file, min_dist, max_dist): # noqa: C901
"""Generate topology aware config list in specified distance range. """Generate topology aware config list in specified distance range.
...@@ -47,15 +84,24 @@ def gen_topo_aware_config(host_list, ibstat_file, ibnetdiscover_file, min_dist, ...@@ -47,15 +84,24 @@ def gen_topo_aware_config(host_list, ibstat_file, ibnetdiscover_file, min_dist,
topology distance (#hops). topology distance (#hops).
""" """
config = [] config = []
if not ibstat_file or not ibnetdiscover_file: # Check validity of input parameters
logger.error('Either ibstat or ibnetdiscover not specified.') if not ibnetdiscover_file:
logger.error('ibnetdiscover file is not specified.')
return config
if not ibstat_file:
ibstat_file = os.path.join(os.environ.get('SB_WORKSPACE', '.'), 'ib_traffic_topo_aware_ibstat.txt')
gen_ibstat_file(ibstat_file)
if not Path(ibstat_file).exists():
logger.error('ibstat file does not exist.')
return config return config
if min_dist > max_dist: if min_dist > max_dist:
logger.error('Specified minimum distane ({}) is larger than maximum distance ({}).'.format(min_dist, max_dist)) logger.error('Specified minimum distane ({}) is larger than maximum distance ({}).'.format(min_dist, max_dist))
return config return config
# index each hostname in hostfile # Index each hostname in hostfile
host_idx = dict() host_idx = dict()
idx = 0 idx = 0
for h in host_list: for h in host_list:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment