system_info.py 17.7 KB
Newer Older
1
2
3
4
5
6
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

"""Generate system config."""

import json
7
import os
8
9
10
import subprocess
from pathlib import Path

11
12
13
14
import xmltodict

from superbench.common.utils import logger

15
16
17

class SystemInfo():    # pragma: no cover
    """Systsem info class."""
18
19
    def _run_cmd(self, command):
        """Run the command and return the stdout string.
20
21
22
23
24
25
26
27

        Args:
            command (string): the command to run in terminal.

        Returns:
            string: the stdout string of the command.
        """
        output = subprocess.run(
28
29
30
31
32
33
34
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            shell=True,
            check=False,
            universal_newlines=True,
            timeout=300
35
36
37
        )
        return output.stdout

38
    def __count_prefix_indent(self, content, symbol='\t'):
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
        r"""Count the number of a specific symbol in the content.

        Args:
            content (string): the content for counting the indent.
            symbol (str, optional): the symbol of the indent. Defaults to '\t'.

        Returns:
            int: the indent count of the symbol in the beginning of the content.
        """
        count = 0
        for char in content:
            if char == symbol:
                count += 1
            else:
                break
        return count

56
    def _parse_key_value_lines(self, lines, required_keywords=None, omitted_values=None, symbol=':'):    # noqa: C901
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
        """Parse the lines like "key:value" and convert them to dict.

        if required_keywords is None, include all line. Otherwise,
        only include the line containing one of the keyword in required_keywords.
        If omitted_values is None, accept any value in dict,
        otherwise drop the item whose value in omitted_values.

        Args:
            lines (list): the lines to parse.
            required_keywords (list, optional): list of select keys. Defaults to None.
            omitted_values (list, optional): list of omitted values. Defaults to None.

        Returns:
            dict: the result in dict.
        """
        dict = {}
        key = ''
        value = ''
        i = 0
        length = len(lines)
        while i < length:
            line = lines[i]
            is_selected = True
            if required_keywords is not None:
                is_selected = False
                for key in required_keywords:
                    if key in line:
                        is_selected = True
            if not is_selected:
                i += 1
                continue
            # process with indent recursively
            indent = self.__count_prefix_indent(lines[i])
            if i + 1 < length and self.__count_prefix_indent(lines[i + 1]) > indent:
                key = lines[i].strip().strip('\t')
                next_indent_index = i + 1
                while next_indent_index < length and self.__count_prefix_indent(lines[next_indent_index]) > indent:
                    next_indent_index += 1

96
                value = self._parse_key_value_lines(lines[i + 1:next_indent_index])
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
                i = next_indent_index - 1
            # split line by symbol
            elif symbol in line:
                symbol_index = line.index(symbol)
                line = [line[:symbol_index], line[symbol_index + 1:]]
                key = line[0].strip().strip('\t')
                if len(line) > 1:
                    value = line[1].strip().strip('\t')
                else:
                    value = ''

            is_omit = False
            if omitted_values is not None:
                for omit in omitted_values:
                    if omit in value.lower():
                        is_omit = True
            if not is_omit:
                # save key,value into the dict and merge same key
                if key not in dict:
                    dict[key] = value
                elif dict[key] is not value:
                    if not isinstance(dict[key], list):
                        dict[key] = [dict[key]]
                    if value not in dict[key]:
                        dict[key].append(value)
            i += 1
        return dict

125
    def _parse_table_lines(self, lines, key):
126
127
128
129
130
131
132
133
134
135
136
137
        """Parse lines like a table and extract the colomns whose table index are the same as key to list of dict.

        Args:
            lines (list): the lines to parse.
            key ([type]): A subset of the the table index.

        Returns:
            list: the result in list of dict.
        """
        index = []
        list = []
        valid = False
138
139
140
141
142
143
144
145
146
147
148
149
150
        for line in lines:
            line = line.split()
            if key[0] in line:
                for i in range(len(key)):
                    index.append(line.index(key[i]))
                valid = True
                continue
            if valid:
                dict = {}
                for i in range(len(key)):
                    if index[i] < len(line):
                        dict[key[i]] = line[index[i]]
                list.append(dict)
151
152
153
154
155
156
157
158
159
160
161
        return list

    def get_cpu(self):
        """Get CPU info.

        Returns:
            dict: cpu info dict.
        """
        lscpu_dict = {}
        try:
            # get general cpu information from lscpu
162
            lscpu = self._run_cmd('lscpu').splitlines()
163
            # get distinct max_speed and current_speed of cpus from dmidecode
164
165
166
            speed = self._run_cmd(r'dmidecode -t processor | grep "Speed"').splitlines()
            lscpu_dict = self._parse_key_value_lines(lscpu)
            lscpu_dict.update(self._parse_key_value_lines(speed))
167
        except Exception:
168
            logger.exception('Error: get CPU info failed')
169
170
171
172
173
174
175
176
177
178
        return lscpu_dict

    def get_system(self):
        """Get system info.

        Returns:
            dict: system info dict.
        """
        system_dict = {}
        try:
179
180
181
182
183
184
185
186
187
            lsmod = self._run_cmd('lsmod').splitlines()
            lsmod = self._parse_table_lines(lsmod, key=['Module', 'Size', 'Used', 'by'])
            sysctl = self._run_cmd('sysctl -a').splitlines()
            sysctl = self._parse_key_value_lines(sysctl, None, None, '=')
            system_dict['system_manufacturer'] = self._run_cmd('dmidecode -s system-manufacturer').strip()
            system_dict['system_product'] = self._run_cmd('dmidecode -s system-product-name').strip()
            system_dict['os'] = self._run_cmd('cat /proc/version').strip()
            system_dict['uname'] = self._run_cmd('uname -a').strip()
            system_dict['docker'] = self.get_docker_version()
188
189
            system_dict['kernel_parameters'] = sysctl
            system_dict['kernel_modules'] = lsmod
190
            system_dict['dmidecode'] = self._run_cmd('dmidecode').strip()
191
            if system_dict['system_product'] == 'Virtual Machine':
192
193
                lsvmbus = self._run_cmd('lsvmbus').splitlines()
                lsvmbus = self._parse_key_value_lines(lsvmbus)
194
195
                system_dict['vmbus'] = lsvmbus
        except Exception:
196
            logger.exception('Error: get system info failed')
197
198
        return system_dict

199
    def get_docker_version(self):
200
201
202
203
204
205
206
        """Get docker version info.

        Returns:
            dict: docker version info dict.
        """
        docker_version_dict = {}
        try:
207
            docker_version = self._run_cmd('docker version')
208
209
210
211
212
213
214
215
216
217
218
            lines = docker_version.splitlines()

            key = ''
            for line in lines:
                if 'Client' in line:
                    key = 'docker_client_version'
                elif 'Server' in line:
                    key = 'docker_daemon_version'
                elif 'Version' in line and key not in docker_version_dict:
                    docker_version_dict[key] = line.split(':')[1].strip().strip('\t')
        except Exception:
219
            logger.exception('Error: get docker info failed')
220
221
222
223
224
225
226
227
228
229
        return docker_version_dict

    def get_memory(self):
        """Get memory info.

        Returns:
            dict: memory info dict.
        """
        memory_dict = {}
        try:
230
            lsmem = self._run_cmd('lsmem')
231
            lsmem = lsmem.splitlines()
232
            lsmem = self._parse_key_value_lines(lsmem)
233
234
            memory_dict['block_size'] = lsmem.get('Memory block size', '')
            memory_dict['total_capacity'] = lsmem.get('Total online memory', '')
235
            dmidecode_memory = self._run_cmd('dmidecode --type memory')
236
            dmidecode_memory = dmidecode_memory.splitlines()
237
            model = self._parse_key_value_lines(
238
239
240
241
242
243
244
245
                dmidecode_memory, ['Manufacturer', 'Part Number', 'Type', 'Speed', 'Number Of Devices'],
                omitted_values=['other', 'unknown']
            )
            memory_dict['channels'] = model.get('Number Of Devices', '')
            memory_dict['type'] = model.get('Type', '')
            memory_dict['clock_frequency'] = model.get('Speed', '')
            memory_dict['model'] = model.get('Manufacturer', [''])[0] + ' ' + model.get('Part Number', [''])[0]
        except Exception:
246
            logger.exception('Error: get memory info failed')
247
248
        return memory_dict

249
    def get_gpu_nvidia(self):
        """Get nvidia gpu info.

        Collects the parsed `nvidia-smi -q -x` report, the GPU topology matrix and
        the versions of nvidia-container-runtime, fabric manager and
        nvidia-peer-memory. Exceptions are not handled here and propagate to the caller.

        Returns:
            dict: nvidia gpu info dict.
        """
        gpu_dict = {}
        smi_report = xmltodict.parse(self._run_cmd('nvidia-smi -q -x'))
        # fall back to an empty dict (not a string) so the lookup below cannot fail
        gpu_query = smi_report.get('nvidia_smi_log') or {}
        gpu_dict['gpu_count'] = gpu_query.get('attached_gpus', '')
        gpu_dict['nvidia_info'] = gpu_query
        gpu_dict['topo'] = self._run_cmd('nvidia-smi topo -m')
        gpu_dict['nvidia-container-runtime_version'] = self._run_cmd('nvidia-container-runtime -v').strip()
        gpu_dict['nvidia-fabricmanager_version'] = self._run_cmd('nv-fabricmanager --version').strip()
        gpu_dict['nv_peer_mem_version'] = self._run_cmd(
            'dpkg -l | grep \'nvidia-peer-memory \' | awk \'$2=="nvidia-peer-memory" {print $3}\''
        ).strip()

        return gpu_dict

269
    def get_gpu_amd(self):
270
271
272
273
274
275
        """Get amd gpu info.

        Returns:
            dict: amd gpu info dict.
        """
        gpu_dict = {}
276
277
278
279
280
281
282
283
284
285
286
        gpu_query = self._run_cmd('rocm-smi -a --json')
        gpu_query = json.loads(gpu_query)
        gpu_per_node = list(filter(lambda x: 'card' in x, gpu_query.keys()))
        gpu_dict['gpu_count'] = len(gpu_per_node)
        gpu_mem_info = self._run_cmd('rocm-smi --showmeminfo vram --json')
        gpu_mem_info = json.loads(gpu_mem_info)
        for card in gpu_per_node:
            gpu_query[card].update(gpu_mem_info.get(card))
        gpu_dict['rocm_info'] = gpu_query
        gpu_dict['topo'] = self._run_cmd('rocm-smi --showtopo')

287
288
289
290
291
292
293
294
        return gpu_dict

    def get_gpu(self):
        """Get gpu info and identify gpu type(nvidia/amd).

        Returns:
            dict: gpu info dict.
        """
295
296
297
298
299
300
301
        try:
            if Path('/dev/nvidiactl').is_char_device() and Path('/dev/nvidia-uvm').is_char_device():
                return self.get_gpu_nvidia()
            if Path('/dev/kfd').is_char_device() and Path('/dev/dri').is_dir():
                return self.get_gpu_amd()
        except Exception:
            logger.exception('Error: get gpu info failed')
302
303
304
305
306
307
308
309
310
311
312
        print('Warning: no gpu detected')
        return {}

    def get_pcie(self):
        """Get pcie info dict.

        Returns:
            dict: pcie info dict.
        """
        pcie_dict = {}
        try:
313
314
            pcie_dict['pcie_topo'] = self._run_cmd('lspci -t -vvv')
            pcie_dict['pcie_info'] = self._run_cmd('lspci -vvv')
315
        except Exception:
316
            logger.exception('Error: get pcie info failed')
317
318
319
320
321
322
323
324
325
326
        return pcie_dict

    def get_storage(self):    # noqa: C901
        """Get storage info dict, including file system info, blocl device info and their mapping.

        Returns:
            dict: storage info dict.
        """
        storage_dict = {}
        try:
327
328
            fs_info = self._run_cmd("df -Th | grep -v \'^/dev/loop\'").splitlines()
            fs_list = self._parse_table_lines(fs_info, key=['Filesystem', 'Type', 'Size', 'Avail', 'Mounted'])
329
330
331
            for fs in fs_list:
                fs_device = fs.get('Filesystem', 'UNKNOWN')
                if fs_device.startswith('/dev'):
332
                    fs['Block_size'] = self._run_cmd('blockdev --getbsz {}'.format(fs_device)).strip()
333
                    fs['4k_alignment'] = ''
334
335
                    partition_ids = self._run_cmd(
                        'yes Cancel | parted {} print | grep -oE "^[[:blank:]]*[0-9]+"'.format(fs_device)
336
337
                    ).splitlines()
                    for id in partition_ids:
338
339
340
                        fs['4k_alignment'] += self._run_cmd(
                            'yes Cancel | parted {} align-check opt {}'.format(fs_device, id)
                        ).strip()
341
342
            storage_dict['file_system'] = fs_list
        except Exception:
343
            logger.exception('Error: get file system info failed')
344
345

        try:
346
347
            disk_info = self._run_cmd("lsblk -e 7 -o NAME,ROTA,SIZE,MODEL | grep -v \'^/dev/loop\'").splitlines()
            disk_list = self._parse_table_lines(disk_info, key=['NAME', 'ROTA', 'SIZE', 'MODEL'])
348
349
350
351
            for disk in disk_list:
                block_device = disk.get('NAME', 'UNKNOWN').strip('\u251c\u2500').strip('\u2514\u2500')
                disk['NAME'] = block_device
                disk['Rotational'] = disk.pop('ROTA')
352
353
                disk['Block_size'] = self._run_cmd('fdisk -l -u /dev/{} | grep "Sector size"'.format(block_device)
                                                   ).strip()
354
                if 'nvme' in block_device:
355
                    nvme_info = self._run_cmd('nvme list | grep {}'.format(block_device)).strip().split()
356
357
358
                    if len(nvme_info) >= 15:
                        disk['Nvme_usage'] = nvme_info[-11] + nvme_info[-10]
            storage_dict['block_device'] = disk_list
359
            storage_dict['mapping_bwtween_filesystem_and_blockdevice'] = self._run_cmd('mount')
360
        except Exception:
361
            logger.exception('Error: get block device info failed')
362
363
364

        return storage_dict

365
    def get_ib(self):
366
367
368
369
370
371
372
        """Get available IB devices info.

        Return:
            list: list of available IB device info dict.
        """
        ib_dict = {}
        try:
373
374
375
            ibstat = self._run_cmd('ibstat').splitlines()
            ib_dict['ib_device_status'] = self._parse_key_value_lines(ibstat)
            ibv_devinfo = self._run_cmd('ibv_devinfo -v').splitlines()
376
377
378
379
            for i in range(len(ibv_devinfo) - 1, -1, -1):
                if ':' not in ibv_devinfo[i]:
                    ibv_devinfo[i - 1] = ibv_devinfo[i - 1] + ',' + ibv_devinfo[i].strip('\t')
                    ibv_devinfo.remove(ibv_devinfo[i])
380
381
382
            ib_dict['ib_device_info'] = self._parse_key_value_lines(ibv_devinfo)
        except Exception:
            logger.exception('Error: get ib info failed')
383
384
        return ib_dict

385
    def get_nic(self):
        """Get nic info.

        Parses the XML output of `lshw -c network` and reports every node that has a
        logical name: model, description, driver, firmware and, when
        /sys/class/net/<name>/speed holds a number, the link speed. A failure on one
        device is logged and the partial entry for that device is still reported;
        a failure of the whole query is logged and the entries gathered so far are returned.

        Returns:
            list: list of available nic info dict.
        """
        nic_list = []
        try:
            lsnic_xml = self._run_cmd('lshw -c network -xml')
            lsnic_list = (xmltodict.parse(lsnic_xml).get('list') or {}).get('node') or []
            # a single node is parsed as a dict rather than a list of dicts
            if not isinstance(lsnic_list, list):
                lsnic_list = [lsnic_list]

            for nic in lsnic_list:
                if 'logicalname' not in nic:
                    continue
                nic_info = {}
                try:
                    nic_info['logical_name'] = nic['logicalname']
                    nic_info['disabled'] = nic.get('@disabled', False)
                    nic_info['model'] = nic.get('vendor', '') + ' ' + nic.get('product', '')
                    nic_info['description'] = nic.get('description', '')
                    settings = (nic.get('configuration') or {}).get('setting') or []
                    # same single-element quirk as above
                    if not isinstance(settings, list):
                        settings = [settings]
                    configuration_dict = {config['@id']: config.get('@value', '') for config in settings}
                    if configuration_dict:
                        nic_info['driver'] = configuration_dict.get('driver', '') + ' ' + configuration_dict.get(
                            'driverversion', ''
                        )
                        nic_info['firmware'] = configuration_dict.get('firmware', '')
                    speed = self._run_cmd('cat /sys/class/net/{}/speed'.format(nic_info['logical_name'])).strip()
                    if speed.isdigit():
                        nic_info['speed'] = str(int(speed) / 1000) + ' Gbit/s'
                except Exception:
                    logger.exception('Error: get nic device {} info failed'.format(nic.get('logicalname', '')))

                nic_list.append(nic_info)
        except Exception:
            logger.exception('Error: get nic info failed')
        return nic_list
424
425
426
427
428
429
430
431

    def get_network(self):
        """Get network info, including nic info, ib info and ofed version.

        Returns:
            dict: dict of network info.
        """
        network_dict = {}
432
433
434
435
436
437
438
        try:
            network_dict['nic'] = self.get_nic()
            network_dict['ib'] = self.get_ib()
            ofed_version = self._run_cmd('ofed_info  -s').strip()
            network_dict['ofed_version'] = ofed_version
        except Exception:
            logger.exception('Error: get network info failed')
439
440
441
442
443
        return network_dict

    def get_all(self):
        """Collect every category of system info into a single dict.

        Requires root: when the effective user id is not 0, an error is logged and
        an empty dict is returned.

        Returns:
            dict: the info of each category keyed by category name.
        """
        if os.geteuid() != 0:
            logger.error('You need to be as a root user to run this tool.')
            return {}
        # collectors run in this order
        collectors = (
            ('System', self.get_system),
            ('CPU', self.get_cpu),
            ('Memory', self.get_memory),
            ('Storage', self.get_storage),
            ('Network', self.get_network),
            ('PCIe', self.get_pcie),
            ('Accelerator', self.get_gpu),
        )
        return {section: collect() for section, collect in collectors}