Unverified Commit 9ca5e7a9 authored by one's avatar one Committed by GitHub
Browse files

Support GPU system info collection via hy-smi (#4)

* Support GPU system info collection via hy-smi

* Fix typos in docs
parent 6d08a565
......@@ -22,7 +22,7 @@ This tool is to collect the system information automatically on the tested GPU n
2. Collect the system info by running the `sb node info --output-dir ${output-dir}` command with root privileges.
3. After the command finished, you can find the output system info json file `sys-info.json` of local node under \${output_dir}.
3. After the command finishes, you can find the output system info JSON file `sys_info.json` for the local node under \${output_dir}.
### Usage on multiple remote machines
......@@ -38,7 +38,7 @@ This tool is to collect the system information automatically on the tested GPU n
sb run --get-info -f host.ini --output-dir ${output-dir} -C superbench.enable=none
```
4. After the command finished, you can find the output system info json file `sys-info.json` of each node under \${output_dir}/nodes/${node_name}.
4. After the command finishes, you can find the output system info JSON file `sys_info.json` for each node under \${output_dir}/nodes/${node_name}.
## Parameter and Details
......
......@@ -286,11 +286,69 @@ def get_gpu_amd(self):
return gpu_dict
def _merge_hygon_gpu_json_info(self, gpu_info, command):
"""Merge Hygon GPU info from json command output.
Args:
gpu_info (dict): GPU info keyed by card id.
command (str): Command to get GPU info in json format.
"""
command_output = self._run_cmd(command)
command_info = json.loads(command_output)
for card, card_info in command_info.items():
if not card.startswith('card'):
continue
if card not in gpu_info:
gpu_info[card] = {}
for key, value in card_info.items():
if key:
gpu_info[card][key] = value
def get_gpu_hygon(self):
    """Get hygon gpu info.

    Runs ``hy-smi --json`` once per query option and merges the per-card results into
    ``rocm_info``, then stores the raw output of ``hy-smi --showtopo``. Every command is
    best-effort: a failure is logged and collection continues with the remaining commands.

    Returns:
        dict: Hygon GPU info with the keys ``accelerator_vendor`` (always 'hygon'),
            ``rocm_info`` (per-card info filled by ``_merge_hygon_gpu_json_info``),
            ``gpu_count`` (number of cards in ``rocm_info``) and, when the topology
            command succeeded, ``topo``.
    """
    # The result is built from hy-smi output only. get_gpu_amd() is intentionally not
    # called: its return value would be discarded by the assignment below.
    gpu_dict = {
        'accelerator_vendor': 'hygon',
        'rocm_info': {},
    }
    hygon_json_info_options = [
        '--showid',
        '--showproductname',
        '--showserial',
        '--showvbios',
        '--showfwinfo',
        '--showbus',
        '--showtoponuma',
        '--showreplaycount',
        '--showmeminfo vram',
        '--showmemavailable',
        '--showmemvendor',
        '--showmemuse',
        '--showmemeccinfo',
        '--showmemoverdrive',
        '--showclocks',
        '--showperflevel',
        '--showoverdrive',
        '--showpower',
        '--showmaxpower',
        '--showvoltage',
        '--showtemp',
        '--showuse',
        '--showbw',
    ]
    for option in hygon_json_info_options:
        command = 'hy-smi --json {}'.format(option)
        try:
            self._merge_hygon_gpu_json_info(gpu_dict['rocm_info'], command)
        except Exception:
            # One unsupported or failing option must not prevent the others from being collected.
            logger.exception('Error: get hygon gpu info failed with command: %s', command)
    try:
        gpu_dict['topo'] = self._run_cmd('hy-smi --showtopo')
    except Exception:
        logger.exception('Error: get hygon gpu topology info failed')
    # Count cards after all merges so every card reported by any command is included.
    gpu_dict['gpu_count'] = len(gpu_dict['rocm_info'])
    return gpu_dict
def get_gpu(self):
......@@ -430,6 +488,7 @@ def get_nic(self):
nic_list.append(nic_info)
except Exception:
logger.exception('Error: get nic info failed')
return nic_list
def get_network(self):
"""Get network info, including nic info, ib info and ofed version.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment