Unverified Commit 2d85781b authored by Yuting Jiang's avatar Yuting Jiang Committed by GitHub
Browse files

Tool: Fix bug - Fix function naming issue in system info (#200)

**Description**
Fix function naming issue in system info.

**Major Revision**
- fix function naming issue in system info 
- save to json file
- add timeout for subprocess.run
- revise error handling to print exception message
parent 9106dcf4
......@@ -4,15 +4,19 @@
"""Generate system config."""
import json
import os
import subprocess
import xmltodict
from pathlib import Path
import xmltodict
from superbench.common.utils import logger
class SystemInfo(): # pragma: no cover
"""Systsem info class."""
def run_cmd(self, command):
"""Run the command as root or non-root user and return the stdout string..
def _run_cmd(self, command):
"""Run the command and return the stdout string.
Args:
command (string): the command to run in terminal.
......@@ -21,11 +25,17 @@ def run_cmd(self, command):
string: the stdout string of the command.
"""
output = subprocess.run(
command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True, check=False, universal_newlines=True
command,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
shell=True,
check=False,
universal_newlines=True,
timeout=300
)
return output.stdout
def count_prefix_indent(self, content, symbol='\t'):
def __count_prefix_indent(self, content, symbol='\t'):
r"""Count the number of a specific symbol in the content.
Args:
......@@ -43,7 +53,7 @@ def count_prefix_indent(self, content, symbol='\t'):
break
return count
def parse_key_value_lines(self, lines, required_keywords=None, omitted_values=None, symbol=':'): # noqa: C901
def _parse_key_value_lines(self, lines, required_keywords=None, omitted_values=None, symbol=':'): # noqa: C901
"""Parse the lines like "key:value" and convert them to dict.
if required_keywords is None, include all line. Otherwise,
......@@ -83,7 +93,7 @@ def parse_key_value_lines(self, lines, required_keywords=None, omitted_values=No
while next_indent_index < length and self.__count_prefix_indent(lines[next_indent_index]) > indent:
next_indent_index += 1
value = self.__parse_key_value_lines(lines[i + 1:next_indent_index])
value = self._parse_key_value_lines(lines[i + 1:next_indent_index])
i = next_indent_index - 1
# split line by symbol
elif symbol in line:
......@@ -112,7 +122,7 @@ def parse_key_value_lines(self, lines, required_keywords=None, omitted_values=No
i += 1
return dict
def parse_table_lines(self, lines, key):
def _parse_table_lines(self, lines, key):
"""Parse lines like a table and extract the colomns whose table index are the same as key to list of dict.
Args:
......@@ -125,22 +135,19 @@ def parse_table_lines(self, lines, key):
index = []
list = []
valid = False
try:
for line in lines:
line = line.split()
if key[0] in line:
for i in range(len(key)):
index.append(line.index(key[i]))
valid = True
continue
if valid:
dict = {}
for i in range(len(key)):
if index[i] < len(line):
dict[key[i]] = line[index[i]]
list.append(dict)
except Exception:
print('Error: key error in __parse_table_lines')
for line in lines:
line = line.split()
if key[0] in line:
for i in range(len(key)):
index.append(line.index(key[i]))
valid = True
continue
if valid:
dict = {}
for i in range(len(key)):
if index[i] < len(line):
dict[key[i]] = line[index[i]]
list.append(dict)
return list
def get_cpu(self):
......@@ -152,13 +159,13 @@ def get_cpu(self):
lscpu_dict = {}
try:
# get general cpu information from lscpu
lscpu = self.__run_cmd('lscpu').splitlines()
lscpu = self._run_cmd('lscpu').splitlines()
# get distinct max_speed and current_speed of cpus from dmidecode
speed = self.__run_cmd(r'dmidecode -t processor | grep "Speed"').splitlines()
lscpu_dict = self.__parse_key_value_lines(lscpu)
lscpu_dict.update(self.__parse_key_value_lines(speed))
speed = self._run_cmd(r'dmidecode -t processor | grep "Speed"').splitlines()
lscpu_dict = self._parse_key_value_lines(lscpu)
lscpu_dict.update(self._parse_key_value_lines(speed))
except Exception:
print('Error: get CPU info failed')
logger.exception('Error: get CPU info failed')
return lscpu_dict
def get_system(self):
......@@ -169,27 +176,27 @@ def get_system(self):
"""
system_dict = {}
try:
lsmod = self.__run_cmd('lsmod').splitlines()
lsmod = self.__parse_table_lines(lsmod, key=['Module', 'Size', 'Used', 'by'])
sysctl = self.__run_cmd('sysctl -a').splitlines()
sysctl = self.__parse_key_value_lines(sysctl, None, None, '=')
system_dict['system_manufacturer'] = self.__run_cmd('dmidecode -s system-manufacturer').strip()
system_dict['system_product'] = self.__run_cmd('dmidecode -s system-product-name').strip()
system_dict['os'] = self.__run_cmd('cat /proc/version').strip()
system_dict['uname'] = self.__run_cmd('uname -a').strip()
system_dict['docker'] = self.__get_docker_version()
lsmod = self._run_cmd('lsmod').splitlines()
lsmod = self._parse_table_lines(lsmod, key=['Module', 'Size', 'Used', 'by'])
sysctl = self._run_cmd('sysctl -a').splitlines()
sysctl = self._parse_key_value_lines(sysctl, None, None, '=')
system_dict['system_manufacturer'] = self._run_cmd('dmidecode -s system-manufacturer').strip()
system_dict['system_product'] = self._run_cmd('dmidecode -s system-product-name').strip()
system_dict['os'] = self._run_cmd('cat /proc/version').strip()
system_dict['uname'] = self._run_cmd('uname -a').strip()
system_dict['docker'] = self.get_docker_version()
system_dict['kernel_parameters'] = sysctl
system_dict['kernel_modules'] = lsmod
system_dict['dmidecode'] = self.__run_cmd('dmidecode').strip()
system_dict['dmidecode'] = self._run_cmd('dmidecode').strip()
if system_dict['system_product'] == 'Virtual Machine':
lsvmbus = self.__run_cmd('lsvmbus').splitlines()
lsvmbus = self.__parse_key_value_lines(lsvmbus)
lsvmbus = self._run_cmd('lsvmbus').splitlines()
lsvmbus = self._parse_key_value_lines(lsvmbus)
system_dict['vmbus'] = lsvmbus
except Exception:
print('Error: get system info failed')
logger.exception('Error: get system info failed')
return system_dict
def __get_docker_version(self):
def get_docker_version(self):
"""Get docker version info.
Returns:
......@@ -197,7 +204,7 @@ def __get_docker_version(self):
"""
docker_version_dict = {}
try:
docker_version = self.__run_cmd('docker version')
docker_version = self._run_cmd('docker version')
lines = docker_version.splitlines()
key = ''
......@@ -209,7 +216,7 @@ def __get_docker_version(self):
elif 'Version' in line and key not in docker_version_dict:
docker_version_dict[key] = line.split(':')[1].strip().strip('\t')
except Exception:
print('Error: get docker info failed')
logger.exception('Error: get docker info failed')
return docker_version_dict
def get_memory(self):
......@@ -220,14 +227,14 @@ def get_memory(self):
"""
memory_dict = {}
try:
lsmem = self.__run_cmd('lsmem')
lsmem = self._run_cmd('lsmem')
lsmem = lsmem.splitlines()
lsmem = self.__parse_key_value_lines(lsmem)
lsmem = self._parse_key_value_lines(lsmem)
memory_dict['block_size'] = lsmem.get('Memory block size', '')
memory_dict['total_capacity'] = lsmem.get('Total online memory', '')
dmidecode_memory = self.__run_cmd('dmidecode --type memory')
dmidecode_memory = self._run_cmd('dmidecode --type memory')
dmidecode_memory = dmidecode_memory.splitlines()
model = self.__parse_key_value_lines(
model = self._parse_key_value_lines(
dmidecode_memory, ['Manufacturer', 'Part Number', 'Type', 'Speed', 'Number Of Devices'],
omitted_values=['other', 'unknown']
)
......@@ -236,52 +243,47 @@ def get_memory(self):
memory_dict['clock_frequency'] = model.get('Speed', '')
memory_dict['model'] = model.get('Manufacturer', [''])[0] + ' ' + model.get('Part Number', [''])[0]
except Exception:
print('Error: get memory info failed')
logger.exception('Error: get memory info failed')
return memory_dict
def __get_gpu_nvidia(self):
def get_gpu_nvidia(self):
"""Get nvidia gpu info.
Returns:
dict: nvidia gpu info dict.
"""
gpu_dict = {}
try:
gpu_query = self.__run_cmd('nvidia-smi -q -x')
gpu_query = xmltodict.parse(gpu_query).get('nvidia_smi_log', '')
gpu_dict['gpu_count'] = gpu_query.get('attached_gpus', '')
gpu_dict['nvidia_info'] = gpu_query
gpu_dict['topo'] = self.__run_cmd('nvidia-smi topo -m')
gpu_dict['nvidia-container-runtime_version'] = self.__run_cmd('nvidia-container-runtime -v').strip()
gpu_dict['nvidia-fabricmanager_version'] = self.__run_cmd('nv-fabricmanager --version').strip()
gpu_dict['nv_peer_mem_version'] = self.__run_cmd(
'dpkg -l | grep \'nvidia-peer-memory \' | awk \'$2=="nvidia-peer-memory" {print $3}\''
).strip()
except Exception:
print('Error: get nvidia gpu info failed')
gpu_query = self._run_cmd('nvidia-smi -q -x')
gpu_query = xmltodict.parse(gpu_query).get('nvidia_smi_log', '')
gpu_dict['gpu_count'] = gpu_query.get('attached_gpus', '')
gpu_dict['nvidia_info'] = gpu_query
gpu_dict['topo'] = self._run_cmd('nvidia-smi topo -m')
gpu_dict['nvidia-container-runtime_version'] = self._run_cmd('nvidia-container-runtime -v').strip()
gpu_dict['nvidia-fabricmanager_version'] = self._run_cmd('nv-fabricmanager --version').strip()
gpu_dict['nv_peer_mem_version'] = self._run_cmd(
'dpkg -l | grep \'nvidia-peer-memory \' | awk \'$2=="nvidia-peer-memory" {print $3}\''
).strip()
return gpu_dict
def __get_gpu_amd(self):
def get_gpu_amd(self):
"""Get amd gpu info.
Returns:
dict: amd gpu info dict.
"""
gpu_dict = {}
try:
gpu_query = self.__run_cmd('rocm-smi -a --json')
gpu_query = json.loads(gpu_query)
gpu_per_node = list(filter(lambda x: 'card' in x, gpu_query.keys()))
gpu_dict['gpu_count'] = len(gpu_per_node)
gpu_mem_info = self.__run_cmd('rocm-smi --showmeminfo vram --json')
gpu_mem_info = json.loads(gpu_mem_info)
for card in gpu_per_node:
gpu_query[card].update(gpu_mem_info.get(card))
gpu_dict['rocm_info'] = gpu_query
gpu_dict['topo'] = self.__run_cmd('rocm-smi --showtopo')
except Exception:
print('Error: get amd gpu info failed')
gpu_query = self._run_cmd('rocm-smi -a --json')
gpu_query = json.loads(gpu_query)
gpu_per_node = list(filter(lambda x: 'card' in x, gpu_query.keys()))
gpu_dict['gpu_count'] = len(gpu_per_node)
gpu_mem_info = self._run_cmd('rocm-smi --showmeminfo vram --json')
gpu_mem_info = json.loads(gpu_mem_info)
for card in gpu_per_node:
gpu_query[card].update(gpu_mem_info.get(card))
gpu_dict['rocm_info'] = gpu_query
gpu_dict['topo'] = self._run_cmd('rocm-smi --showtopo')
return gpu_dict
def get_gpu(self):
......@@ -290,10 +292,13 @@ def get_gpu(self):
Returns:
dict: gpu info dict.
"""
if Path('/dev/nvidiactl').is_char_device() and Path('/dev/nvidia-uvm').is_char_device():
return self.__get_gpu_nvidia()
if Path('/dev/kfd').is_char_device() and Path('/dev/dri').is_dir():
return self.__get_gpu_amd()
try:
if Path('/dev/nvidiactl').is_char_device() and Path('/dev/nvidia-uvm').is_char_device():
return self.get_gpu_nvidia()
if Path('/dev/kfd').is_char_device() and Path('/dev/dri').is_dir():
return self.get_gpu_amd()
except Exception:
logger.exception('Error: get gpu info failed')
print('Warning: no gpu detected')
return {}
......@@ -305,10 +310,10 @@ def get_pcie(self):
"""
pcie_dict = {}
try:
pcie_dict['pcie_topo'] = self.__run_cmd('lspci -t -vvv')
pcie_dict['pcie_info'] = self.__run_cmd('lspci -vvv')
pcie_dict['pcie_topo'] = self._run_cmd('lspci -t -vvv')
pcie_dict['pcie_info'] = self._run_cmd('lspci -vvv')
except Exception:
print('Error: get pcie gpu info failed')
logger.exception('Error: get pcie info failed')
return pcie_dict
def get_storage(self): # noqa: C901
......@@ -319,44 +324,45 @@ def get_storage(self): # noqa: C901
"""
storage_dict = {}
try:
fs_info = self.__run_cmd("df -Th | grep -v \'^/dev/loop\'").splitlines()
fs_list = self.__parse_table_lines(fs_info, key=['Filesystem', 'Type', 'Size', 'Avail', 'Mounted'])
fs_info = self._run_cmd("df -Th | grep -v \'^/dev/loop\'").splitlines()
fs_list = self._parse_table_lines(fs_info, key=['Filesystem', 'Type', 'Size', 'Avail', 'Mounted'])
for fs in fs_list:
fs_device = fs.get('Filesystem', 'UNKNOWN')
if fs_device.startswith('/dev'):
fs['Block_size'] = self.__run_cmd('blockdev --getbsz {}'.format(fs_device)).strip()
fs['Block_size'] = self._run_cmd('blockdev --getbsz {}'.format(fs_device)).strip()
fs['4k_alignment'] = ''
partition_ids = self.__run_cmd(
'parted {} print | grep -oE "^[[:blank:]]*[0-9]+"'.format(fs_device)
partition_ids = self._run_cmd(
'yes Cancel | parted {} print | grep -oE "^[[:blank:]]*[0-9]+"'.format(fs_device)
).splitlines()
for id in partition_ids:
fs['4k_alignment'] += self.__run_cmd('parted {} align-check opt {}'.format(fs_device,
id)).strip()
fs['4k_alignment'] += self._run_cmd(
'yes Cancel | parted {} align-check opt {}'.format(fs_device, id)
).strip()
storage_dict['file_system'] = fs_list
except Exception:
print('Error: get file system info failed')
logger.exception('Error: get file system info failed')
try:
disk_info = self.__run_cmd("lsblk -e 7 -o NAME,ROTA,SIZE,MODEL | grep -v \'^/dev/loop\'").splitlines()
disk_list = self.__parse_table_lines(disk_info, key=['NAME', 'ROTA', 'SIZE', 'MODEL'])
disk_info = self._run_cmd("lsblk -e 7 -o NAME,ROTA,SIZE,MODEL | grep -v \'^/dev/loop\'").splitlines()
disk_list = self._parse_table_lines(disk_info, key=['NAME', 'ROTA', 'SIZE', 'MODEL'])
for disk in disk_list:
block_device = disk.get('NAME', 'UNKNOWN').strip('\u251c\u2500').strip('\u2514\u2500')
disk['NAME'] = block_device
disk['Rotational'] = disk.pop('ROTA')
disk['Block_size'] = self.__run_cmd('fdisk -l -u /dev/{} | grep "Sector size"'.format(block_device)
).strip()
disk['Block_size'] = self._run_cmd('fdisk -l -u /dev/{} | grep "Sector size"'.format(block_device)
).strip()
if 'nvme' in block_device:
nvme_info = self.__run_cmd('nvme list | grep {}'.format(block_device)).strip().split()
nvme_info = self._run_cmd('nvme list | grep {}'.format(block_device)).strip().split()
if len(nvme_info) >= 15:
disk['Nvme_usage'] = nvme_info[-11] + nvme_info[-10]
storage_dict['block_device'] = disk_list
storage_dict['mapping_bwtween_filesystem_and_blockdevice'] = self.__run_cmd('mount')
storage_dict['mapping_bwtween_filesystem_and_blockdevice'] = self._run_cmd('mount')
except Exception:
print('Error: get block device info failed')
logger.exception('Error: get block device info failed')
return storage_dict
def __get_ib(self):
def get_ib(self):
"""Get available IB devices info.
Return:
......@@ -364,19 +370,19 @@ def __get_ib(self):
"""
ib_dict = {}
try:
ibstat = self.__run_cmd('ibstat').splitlines()
ib_dict['ib_device_status'] = self.__parse_key_value_lines(ibstat)
ibv_devinfo = self.__run_cmd('ibv_devinfo -v').splitlines()
ibstat = self._run_cmd('ibstat').splitlines()
ib_dict['ib_device_status'] = self._parse_key_value_lines(ibstat)
ibv_devinfo = self._run_cmd('ibv_devinfo -v').splitlines()
for i in range(len(ibv_devinfo) - 1, -1, -1):
if ':' not in ibv_devinfo[i]:
ibv_devinfo[i - 1] = ibv_devinfo[i - 1] + ',' + ibv_devinfo[i].strip('\t')
ibv_devinfo.remove(ibv_devinfo[i])
ib_dict['ib_device_info'] = self.__parse_key_value_lines(ibv_devinfo)
except Exception as e:
print('Error: get ib info failed. message: {}.'.format(str(e)))
ib_dict['ib_device_info'] = self._parse_key_value_lines(ibv_devinfo)
except Exception:
logger.exception('Error: get ib info failed')
return ib_dict
def __get_nic(self):
def get_nic(self):
"""Get nic info.
Returns:
......@@ -384,8 +390,10 @@ def __get_nic(self):
"""
nic_list = []
try:
lsnic_xml = self.__run_cmd('lshw -c network -xml')
lsnic_xml = self._run_cmd('lshw -c network -xml')
lsnic_list = xmltodict.parse(lsnic_xml).get('list', {}).get('node', [])
if not isinstance(lsnic_list, list):
lsnic_list = [lsnic_list]
lsnic_list = list(filter(lambda x: 'logicalname' in x, lsnic_list))
for nic in lsnic_list:
......@@ -404,15 +412,15 @@ def __get_nic(self):
'driverversion', ''
)
nic_info['firmware'] = configuration_dict.get('firmware', '')
speed = self.__run_cmd('cat /sys/class/net/{}/speed'.format(nic_info['logical_name'])).strip()
speed = self._run_cmd('cat /sys/class/net/{}/speed'.format(nic_info['logical_name'])).strip()
if speed.isdigit():
nic_info['speed'] = str(int(speed) / 1000) + ' Gbit/s'
except Exception:
print('Error: get nic device {} info failed'.format(nic_info['logical_name']))
logger.exception('Error: get nic device {} info failed')
nic_list.append(nic_info)
except Exception:
print('Error: get nic info failed')
return nic_list
logger.exception('Error: get nic info failed')
def get_network(self):
"""Get network info, including nic info, ib info and ofed version.
......@@ -421,15 +429,21 @@ def get_network(self):
dict: dict of network info.
"""
network_dict = {}
network_dict['nic'] = self.__get_nic()
network_dict['ib'] = self.__get_ib()
ofed_version = self.__run_cmd('ofed_info -s').strip()
network_dict['ofed_version'] = ofed_version
try:
network_dict['nic'] = self.get_nic()
network_dict['ib'] = self.get_ib()
ofed_version = self._run_cmd('ofed_info -s').strip()
network_dict['ofed_version'] = ofed_version
except Exception:
logger.exception('Error: get network info failed')
return network_dict
def get_all(self):
"""Get all system info and save them to file in json format."""
sum_dict = {}
if os.geteuid() != 0:
logger.error('You need to be as a root user to run this tool.')
return sum_dict
sum_dict['System'] = self.get_system()
sum_dict['CPU'] = self.get_cpu()
sum_dict['Memory'] = self.get_memory()
......
......@@ -84,4 +84,4 @@ def test_sb_run_nonexist_host_file(self):
def test_sb_node_info(self):
"""Test sb node info, should fail."""
self.cmd('sb node info', expect_failure=True)
self.cmd('sb node info', expect_failure=False)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment