system_info.py 20 KB
Newer Older
1
2
3
4
5
6
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

"""Generate system config."""

import json
7
import os
8
9
10
import subprocess
from pathlib import Path

11
12
13
14
import xmltodict

from superbench.common.utils import logger

15
16
17

class SystemInfo():    # pragma: no cover
    """Systsem info class."""
18
19
    def _run_cmd(self, command):
        """Run the command and return the stdout string.
20
21
22
23
24
25
26
27

        Args:
            command (string): the command to run in terminal.

        Returns:
            string: the stdout string of the command.
        """
        output = subprocess.run(
28
29
30
31
32
33
34
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            shell=True,
            check=False,
            universal_newlines=True,
            timeout=300
35
36
37
        )
        return output.stdout

38
    def __count_prefix_indent(self, content, symbol='\t'):
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
        r"""Count the number of a specific symbol in the content.

        Args:
            content (string): the content for counting the indent.
            symbol (str, optional): the symbol of the indent. Defaults to '\t'.

        Returns:
            int: the indent count of the symbol in the beginning of the content.
        """
        count = 0
        for char in content:
            if char == symbol:
                count += 1
            else:
                break
        return count

56
    def _parse_key_value_lines(self, lines, required_keywords=None, omitted_values=None, symbol=':'):    # noqa: C901
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
        """Parse the lines like "key:value" and convert them to dict.

        if required_keywords is None, include all line. Otherwise,
        only include the line containing one of the keyword in required_keywords.
        If omitted_values is None, accept any value in dict,
        otherwise drop the item whose value in omitted_values.

        Args:
            lines (list): the lines to parse.
            required_keywords (list, optional): list of select keys. Defaults to None.
            omitted_values (list, optional): list of omitted values. Defaults to None.

        Returns:
            dict: the result in dict.
        """
        dict = {}
        key = ''
        value = ''
        i = 0
        length = len(lines)
        while i < length:
            line = lines[i]
            is_selected = True
            if required_keywords is not None:
                is_selected = False
                for key in required_keywords:
                    if key in line:
                        is_selected = True
            if not is_selected:
                i += 1
                continue
            # process with indent recursively
            indent = self.__count_prefix_indent(lines[i])
            if i + 1 < length and self.__count_prefix_indent(lines[i + 1]) > indent:
                key = lines[i].strip().strip('\t')
                next_indent_index = i + 1
                while next_indent_index < length and self.__count_prefix_indent(lines[next_indent_index]) > indent:
                    next_indent_index += 1

96
                value = self._parse_key_value_lines(lines[i + 1:next_indent_index])
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
                i = next_indent_index - 1
            # split line by symbol
            elif symbol in line:
                symbol_index = line.index(symbol)
                line = [line[:symbol_index], line[symbol_index + 1:]]
                key = line[0].strip().strip('\t')
                if len(line) > 1:
                    value = line[1].strip().strip('\t')
                else:
                    value = ''

            is_omit = False
            if omitted_values is not None:
                for omit in omitted_values:
                    if omit in value.lower():
                        is_omit = True
            if not is_omit:
                # save key,value into the dict and merge same key
                if key not in dict:
                    dict[key] = value
                elif dict[key] is not value:
                    if not isinstance(dict[key], list):
                        dict[key] = [dict[key]]
                    if value not in dict[key]:
                        dict[key].append(value)
            i += 1
        return dict

125
    def _parse_table_lines(self, lines, key):
126
127
128
129
130
131
132
133
134
135
136
137
        """Parse lines like a table and extract the colomns whose table index are the same as key to list of dict.

        Args:
            lines (list): the lines to parse.
            key ([type]): A subset of the the table index.

        Returns:
            list: the result in list of dict.
        """
        index = []
        list = []
        valid = False
138
139
140
141
142
143
144
145
146
147
148
149
150
        for line in lines:
            line = line.split()
            if key[0] in line:
                for i in range(len(key)):
                    index.append(line.index(key[i]))
                valid = True
                continue
            if valid:
                dict = {}
                for i in range(len(key)):
                    if index[i] < len(line):
                        dict[key[i]] = line[index[i]]
                list.append(dict)
151
152
153
154
155
156
157
158
159
160
161
        return list

    def get_cpu(self):
        """Get CPU info.

        Returns:
            dict: cpu info dict.
        """
        lscpu_dict = {}
        try:
            # get general cpu information from lscpu
162
            lscpu = self._run_cmd('lscpu').splitlines()
163
            # get distinct max_speed and current_speed of cpus from dmidecode
164
165
166
            speed = self._run_cmd(r'dmidecode -t processor | grep "Speed"').splitlines()
            lscpu_dict = self._parse_key_value_lines(lscpu)
            lscpu_dict.update(self._parse_key_value_lines(speed))
167
        except Exception:
168
            logger.exception('Error: get CPU info failed')
169
170
171
172
173
174
175
176
177
178
        return lscpu_dict

    def get_system(self):
        """Get system info.

        Returns:
            dict: system info dict.
        """
        system_dict = {}
        try:
179
180
181
182
183
184
185
186
187
            lsmod = self._run_cmd('lsmod').splitlines()
            lsmod = self._parse_table_lines(lsmod, key=['Module', 'Size', 'Used', 'by'])
            sysctl = self._run_cmd('sysctl -a').splitlines()
            sysctl = self._parse_key_value_lines(sysctl, None, None, '=')
            system_dict['system_manufacturer'] = self._run_cmd('dmidecode -s system-manufacturer').strip()
            system_dict['system_product'] = self._run_cmd('dmidecode -s system-product-name').strip()
            system_dict['os'] = self._run_cmd('cat /proc/version').strip()
            system_dict['uname'] = self._run_cmd('uname -a').strip()
            system_dict['docker'] = self.get_docker_version()
188
189
            system_dict['kernel_parameters'] = sysctl
            system_dict['kernel_modules'] = lsmod
190
            system_dict['dmidecode'] = self._run_cmd('dmidecode').strip()
191
            if system_dict['system_product'] == 'Virtual Machine':
192
193
                lsvmbus = self._run_cmd('lsvmbus').splitlines()
                lsvmbus = self._parse_key_value_lines(lsvmbus)
194
195
                system_dict['vmbus'] = lsvmbus
        except Exception:
196
            logger.exception('Error: get system info failed')
197
198
        return system_dict

199
    def get_docker_version(self):
200
201
202
203
204
205
206
        """Get docker version info.

        Returns:
            dict: docker version info dict.
        """
        docker_version_dict = {}
        try:
207
            docker_version = self._run_cmd('docker version')
208
209
210
211
212
213
214
215
216
217
218
            lines = docker_version.splitlines()

            key = ''
            for line in lines:
                if 'Client' in line:
                    key = 'docker_client_version'
                elif 'Server' in line:
                    key = 'docker_daemon_version'
                elif 'Version' in line and key not in docker_version_dict:
                    docker_version_dict[key] = line.split(':')[1].strip().strip('\t')
        except Exception:
219
            logger.exception('Error: get docker info failed')
220
221
222
223
224
225
226
227
228
229
        return docker_version_dict

    def get_memory(self):
        """Get memory info.

        Returns:
            dict: memory info dict.
        """
        memory_dict = {}
        try:
230
            lsmem = self._run_cmd('lsmem')
231
            lsmem = lsmem.splitlines()
232
            lsmem = self._parse_key_value_lines(lsmem)
233
234
            memory_dict['block_size'] = lsmem.get('Memory block size', '')
            memory_dict['total_capacity'] = lsmem.get('Total online memory', '')
235
            dmidecode_memory = self._run_cmd('dmidecode --type memory')
236
            dmidecode_memory = dmidecode_memory.splitlines()
237
            model = self._parse_key_value_lines(
238
239
240
241
242
243
244
245
                dmidecode_memory, ['Manufacturer', 'Part Number', 'Type', 'Speed', 'Number Of Devices'],
                omitted_values=['other', 'unknown']
            )
            memory_dict['channels'] = model.get('Number Of Devices', '')
            memory_dict['type'] = model.get('Type', '')
            memory_dict['clock_frequency'] = model.get('Speed', '')
            memory_dict['model'] = model.get('Manufacturer', [''])[0] + ' ' + model.get('Part Number', [''])[0]
        except Exception:
246
            logger.exception('Error: get memory info failed')
247
248
        return memory_dict

249
    def get_gpu_nvidia(self):
        """Get nvidia gpu info.

        Combines the parsed `nvidia-smi -q -x` report with the topology matrix and the versions
        of nvidia-container-runtime, nv-fabricmanager and nvidia-peer-memory.

        Returns:
            dict: nvidia gpu info dict.
        """
        gpu_dict = {}
        # fall back to an empty dict (not a string) so the .get() below stays valid
        # when the report has no 'nvidia_smi_log' root
        gpu_query = xmltodict.parse(self._run_cmd('nvidia-smi -q -x')).get('nvidia_smi_log') or {}
        gpu_dict['gpu_count'] = gpu_query.get('attached_gpus', '')
        gpu_dict['nvidia_info'] = gpu_query
        gpu_dict['topo'] = self._run_cmd('nvidia-smi topo -m')
        gpu_dict['nvidia-container-runtime_version'] = self._run_cmd('nvidia-container-runtime -v').strip()
        gpu_dict['nvidia-fabricmanager_version'] = self._run_cmd('nv-fabricmanager --version').strip()
        gpu_dict['nv_peer_mem_version'] = self._run_cmd(
            'dpkg -l | grep \'nvidia-peer-memory \' | awk \'$2=="nvidia-peer-memory" {print $3}\''
        ).strip()

        return gpu_dict

269
    def get_gpu_amd(self):
270
271
272
273
274
275
        """Get amd gpu info.

        Returns:
            dict: amd gpu info dict.
        """
        gpu_dict = {}
276
277
278
279
280
281
282
283
284
285
286
        gpu_query = self._run_cmd('rocm-smi -a --json')
        gpu_query = json.loads(gpu_query)
        gpu_per_node = list(filter(lambda x: 'card' in x, gpu_query.keys()))
        gpu_dict['gpu_count'] = len(gpu_per_node)
        gpu_mem_info = self._run_cmd('rocm-smi --showmeminfo vram --json')
        gpu_mem_info = json.loads(gpu_mem_info)
        for card in gpu_per_node:
            gpu_query[card].update(gpu_mem_info.get(card))
        gpu_dict['rocm_info'] = gpu_query
        gpu_dict['topo'] = self._run_cmd('rocm-smi --showtopo')

287
288
        return gpu_dict

289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
    def _merge_hygon_gpu_json_info(self, gpu_info, command):
        """Merge Hygon GPU info from json command output.

        Args:
            gpu_info (dict): GPU info keyed by card id.
            command (str): Command to get GPU info in json format.
        """
        command_output = self._run_cmd(command)
        command_info = json.loads(command_output)
        for card, card_info in command_info.items():
            if not card.startswith('card'):
                continue
            if card not in gpu_info:
                gpu_info[card] = {}
            for key, value in card_info.items():
                if key:
                    gpu_info[card][key] = value

one's avatar
one committed
307
308
    def get_gpu_hygon(self):
        """Get hygon gpu info."""
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
        gpu_dict = {
            'accelerator_vendor': 'hygon',
            'rocm_info': {},
        }

        hygon_json_info_options = [
            '--showid',
            '--showproductname',
            '--showserial',
            '--showvbios',
            '--showfwinfo',
            '--showbus',
            '--showtoponuma',
            '--showreplaycount',
            '--showmeminfo vram',
            '--showmemavailable',
            '--showmemvendor',
            '--showmemuse',
            '--showmemeccinfo',
            '--showmemoverdrive',
            '--showclocks',
            '--showperflevel',
            '--showoverdrive',
            '--showpower',
            '--showmaxpower',
            '--showvoltage',
            '--showtemp',
            '--showuse',
            '--showbw',
        ]
        for option in hygon_json_info_options:
            command = 'hy-smi --json {}'.format(option)
            try:
                self._merge_hygon_gpu_json_info(gpu_dict['rocm_info'], command)
            except Exception:
                logger.exception('Error: get hygon gpu info failed with command: %s', command)

        try:
            gpu_dict['topo'] = self._run_cmd('hy-smi --showtopo')
        except Exception:
            logger.exception('Error: get hygon gpu topology info failed')

        gpu_dict['gpu_count'] = len(gpu_dict['rocm_info'])
one's avatar
one committed
352
353
        return gpu_dict

354
355
356
357
358
359
    def get_gpu(self):
        """Get gpu info and identify gpu type(nvidia/amd).

        Returns:
            dict: gpu info dict.
        """
360
361
362
363
        try:
            if Path('/dev/nvidiactl').is_char_device() and Path('/dev/nvidia-uvm').is_char_device():
                return self.get_gpu_nvidia()
            if Path('/dev/kfd').is_char_device() and Path('/dev/dri').is_dir():
one's avatar
one committed
364
365
                if Path('/usr/local/hyhal').exists() or Path('/opt/hyhal').exists():
                    return self.get_gpu_hygon()
366
367
368
                return self.get_gpu_amd()
        except Exception:
            logger.exception('Error: get gpu info failed')
369
370
371
372
373
374
375
376
377
378
379
        print('Warning: no gpu detected')
        return {}

    def get_pcie(self):
        """Get pcie info dict.

        Returns:
            dict: pcie info dict.
        """
        pcie_dict = {}
        try:
380
381
            pcie_dict['pcie_topo'] = self._run_cmd('lspci -t -vvv')
            pcie_dict['pcie_info'] = self._run_cmd('lspci -vvv')
382
        except Exception:
383
            logger.exception('Error: get pcie info failed')
384
385
386
387
388
389
390
391
392
393
        return pcie_dict

    def get_storage(self):    # noqa: C901
        """Get storage info dict, including file system info, blocl device info and their mapping.

        Returns:
            dict: storage info dict.
        """
        storage_dict = {}
        try:
394
395
            fs_info = self._run_cmd("df -Th | grep -v \'^/dev/loop\'").splitlines()
            fs_list = self._parse_table_lines(fs_info, key=['Filesystem', 'Type', 'Size', 'Avail', 'Mounted'])
396
397
398
            for fs in fs_list:
                fs_device = fs.get('Filesystem', 'UNKNOWN')
                if fs_device.startswith('/dev'):
399
                    fs['Block_size'] = self._run_cmd('blockdev --getbsz {}'.format(fs_device)).strip()
400
                    fs['4k_alignment'] = ''
401
402
                    partition_ids = self._run_cmd(
                        'yes Cancel | parted {} print | grep -oE "^[[:blank:]]*[0-9]+"'.format(fs_device)
403
404
                    ).splitlines()
                    for id in partition_ids:
405
406
407
                        fs['4k_alignment'] += self._run_cmd(
                            'yes Cancel | parted {} align-check opt {}'.format(fs_device, id)
                        ).strip()
408
409
            storage_dict['file_system'] = fs_list
        except Exception:
410
            logger.exception('Error: get file system info failed')
411
412

        try:
413
414
            disk_info = self._run_cmd("lsblk -e 7 -o NAME,ROTA,SIZE,MODEL | grep -v \'^/dev/loop\'").splitlines()
            disk_list = self._parse_table_lines(disk_info, key=['NAME', 'ROTA', 'SIZE', 'MODEL'])
415
416
417
418
            for disk in disk_list:
                block_device = disk.get('NAME', 'UNKNOWN').strip('\u251c\u2500').strip('\u2514\u2500')
                disk['NAME'] = block_device
                disk['Rotational'] = disk.pop('ROTA')
419
420
                disk['Block_size'] = self._run_cmd('fdisk -l -u /dev/{} | grep "Sector size"'.format(block_device)
                                                   ).strip()
421
                if 'nvme' in block_device:
422
                    nvme_info = self._run_cmd('nvme list | grep {}'.format(block_device)).strip().split()
423
424
425
                    if len(nvme_info) >= 15:
                        disk['Nvme_usage'] = nvme_info[-11] + nvme_info[-10]
            storage_dict['block_device'] = disk_list
426
            storage_dict['mapping_bwtween_filesystem_and_blockdevice'] = self._run_cmd('mount')
427
        except Exception:
428
            logger.exception('Error: get block device info failed')
429
430
431

        return storage_dict

432
    def get_ib(self):
433
434
435
436
437
438
439
        """Get available IB devices info.

        Return:
            list: list of available IB device info dict.
        """
        ib_dict = {}
        try:
440
441
442
            ibstat = self._run_cmd('ibstat').splitlines()
            ib_dict['ib_device_status'] = self._parse_key_value_lines(ibstat)
            ibv_devinfo = self._run_cmd('ibv_devinfo -v').splitlines()
443
444
445
446
            for i in range(len(ibv_devinfo) - 1, -1, -1):
                if ':' not in ibv_devinfo[i]:
                    ibv_devinfo[i - 1] = ibv_devinfo[i - 1] + ',' + ibv_devinfo[i].strip('\t')
                    ibv_devinfo.remove(ibv_devinfo[i])
447
448
449
            ib_dict['ib_device_info'] = self._parse_key_value_lines(ibv_devinfo)
        except Exception:
            logger.exception('Error: get ib info failed')
450
451
        return ib_dict

452
    def get_nic(self):
        """Get nic info.

        Parses `lshw -c network -xml` and keeps the nodes that have a logical name. For each of
        them the name, disabled flag, model, description, driver and firmware are recorded, plus
        the link speed read from /sys/class/net when it is numeric. A failure on one nic is
        logged and the partially filled entry is still returned.

        Returns:
            list: list of available nic info dict.
        """
        nic_list = []
        try:
            lsnic_xml = self._run_cmd('lshw -c network -xml')
            # an empty <list/> element is parsed as None, so guard both lookups
            lsnic_list = (xmltodict.parse(lsnic_xml).get('list') or {}).get('node') or []
            # a single <node> is parsed as a dict instead of a list
            if not isinstance(lsnic_list, list):
                lsnic_list = [lsnic_list]
            lsnic_list = [nic for nic in lsnic_list if 'logicalname' in nic]

            for nic in lsnic_list:
                nic_info = {}
                try:
                    nic_info['logical_name'] = nic['logicalname']
                    nic_info['disabled'] = nic.get('@disabled', False)
                    nic_info['model'] = nic.get('vendor', '') + ' ' + nic.get('product', '')
                    nic_info['description'] = nic.get('description', '')
                    settings = (nic.get('configuration') or {}).get('setting') or []
                    # a single <setting> is parsed as a dict instead of a list
                    if not isinstance(settings, list):
                        settings = [settings]
                    configuration_dict = {setting['@id']: setting.get('@value', '') for setting in settings}
                    if configuration_dict:
                        nic_info['driver'] = configuration_dict.get('driver', '') + ' ' + configuration_dict.get(
                            'driverversion', ''
                        )
                        nic_info['firmware'] = configuration_dict.get('firmware', '')
                    speed = self._run_cmd('cat /sys/class/net/{}/speed'.format(nic_info['logical_name'])).strip()
                    if speed.isdigit():
                        # the value is divided by 1000 and reported as Gbit/s
                        nic_info['speed'] = str(int(speed) / 1000) + ' Gbit/s'
                except Exception:
                    logger.exception(
                        'Error: get nic device %s info failed', nic_info.get('logical_name', 'UNKNOWN')
                    )

                nic_list.append(nic_info)
        except Exception:
            logger.exception('Error: get nic info failed')
        return nic_list
492
493
494
495
496
497
498
499

    def get_network(self):
        """Get network info, including nic info, ib info and ofed version.

        Returns:
            dict: dict of network info.
        """
        network_dict = {}
500
501
502
503
504
505
506
        try:
            network_dict['nic'] = self.get_nic()
            network_dict['ib'] = self.get_ib()
            ofed_version = self._run_cmd('ofed_info  -s').strip()
            network_dict['ofed_version'] = ofed_version
        except Exception:
            logger.exception('Error: get network info failed')
507
508
509
510
511
        return network_dict

    def get_all(self):
        """Collect every category of system info into one dict.

        Requires an effective uid of 0; otherwise an error is logged and an empty
        dict is returned.

        Returns:
            dict: info dicts keyed by category name.
        """
        sum_dict = {}
        if os.geteuid() != 0:
            logger.error('You need to be as a root user to run this tool.')
            return sum_dict

        # categories are collected in this order
        collectors = (
            ('System', self.get_system),
            ('CPU', self.get_cpu),
            ('Memory', self.get_memory),
            ('Storage', self.get_storage),
            ('Network', self.get_network),
            ('PCIe', self.get_pcie),
            ('Accelerator', self.get_gpu),
        )
        for category, collect in collectors:
            sum_dict[category] = collect()
        return sum_dict