system_info.py 17.7 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

"""Generate system config."""

import json
import subprocess
import xmltodict
from pathlib import Path


class SystemInfo():    # pragma: no cover
    """System info class."""
    def run_cmd(self, command):
        """Run the command as root or non-root user and return the stdout string..

        Args:
            command (string): the command to run in terminal.

        Returns:
            string: the stdout string of the command.
        """
        output = subprocess.run(
            command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True, check=False, universal_newlines=True
        )
        return output.stdout

    def count_prefix_indent(self, content, symbol='\t'):
        r"""Count the number of a specific symbol in the content.

        Args:
            content (string): the content for counting the indent.
            symbol (str, optional): the symbol of the indent. Defaults to '\t'.

        Returns:
            int: the indent count of the symbol in the beginning of the content.
        """
        count = 0
        for char in content:
            if char == symbol:
                count += 1
            else:
                break
        return count

    def parse_key_value_lines(self, lines, required_keywords=None, omitted_values=None, symbol=':'):    # noqa: C901
        """Parse the lines like "key:value" and convert them to dict.

        if required_keywords is None, include all line. Otherwise,
        only include the line containing one of the keyword in required_keywords.
        If omitted_values is None, accept any value in dict,
        otherwise drop the item whose value in omitted_values.

        Args:
            lines (list): the lines to parse.
            required_keywords (list, optional): list of select keys. Defaults to None.
            omitted_values (list, optional): list of omitted values. Defaults to None.

        Returns:
            dict: the result in dict.
        """
        dict = {}
        key = ''
        value = ''
        i = 0
        length = len(lines)
        while i < length:
            line = lines[i]
            is_selected = True
            if required_keywords is not None:
                is_selected = False
                for key in required_keywords:
                    if key in line:
                        is_selected = True
            if not is_selected:
                i += 1
                continue
            # process with indent recursively
            indent = self.__count_prefix_indent(lines[i])
            if i + 1 < length and self.__count_prefix_indent(lines[i + 1]) > indent:
                key = lines[i].strip().strip('\t')
                next_indent_index = i + 1
                while next_indent_index < length and self.__count_prefix_indent(lines[next_indent_index]) > indent:
                    next_indent_index += 1

                value = self.__parse_key_value_lines(lines[i + 1:next_indent_index])
                i = next_indent_index - 1
            # split line by symbol
            elif symbol in line:
                symbol_index = line.index(symbol)
                line = [line[:symbol_index], line[symbol_index + 1:]]
                key = line[0].strip().strip('\t')
                if len(line) > 1:
                    value = line[1].strip().strip('\t')
                else:
                    value = ''

            is_omit = False
            if omitted_values is not None:
                for omit in omitted_values:
                    if omit in value.lower():
                        is_omit = True
            if not is_omit:
                # save key,value into the dict and merge same key
                if key not in dict:
                    dict[key] = value
                elif dict[key] is not value:
                    if not isinstance(dict[key], list):
                        dict[key] = [dict[key]]
                    if value not in dict[key]:
                        dict[key].append(value)
            i += 1
        return dict

    def parse_table_lines(self, lines, key):
        """Parse lines like a table and extract the colomns whose table index are the same as key to list of dict.

        Args:
            lines (list): the lines to parse.
            key ([type]): A subset of the the table index.

        Returns:
            list: the result in list of dict.
        """
        index = []
        list = []
        valid = False
        try:
            for line in lines:
                line = line.split()
                if key[0] in line:
                    for i in range(len(key)):
                        index.append(line.index(key[i]))
                    valid = True
                    continue
                if valid:
                    dict = {}
                    for i in range(len(key)):
                        if index[i] < len(line):
                            dict[key[i]] = line[index[i]]
                    list.append(dict)
        except Exception:
            print('Error: key error in __parse_table_lines')
        return list

    def get_cpu(self):
        """Get CPU info.

        Returns:
            dict: cpu info dict.
        """
        lscpu_dict = {}
        try:
            # get general cpu information from lscpu
            lscpu = self.__run_cmd('lscpu').splitlines()
            # get distinct max_speed and current_speed of cpus from dmidecode
            speed = self.__run_cmd(r'dmidecode -t processor | grep "Speed"').splitlines()
            lscpu_dict = self.__parse_key_value_lines(lscpu)
            lscpu_dict.update(self.__parse_key_value_lines(speed))
        except Exception:
            print('Error: get CPU info failed')
        return lscpu_dict

    def get_system(self):
        """Get system info.

        Returns:
            dict: system info dict.
        """
        system_dict = {}
        try:
            lsmod = self.__run_cmd('lsmod').splitlines()
            lsmod = self.__parse_table_lines(lsmod, key=['Module', 'Size', 'Used', 'by'])
            sysctl = self.__run_cmd('sysctl -a').splitlines()
            sysctl = self.__parse_key_value_lines(sysctl, None, None, '=')
            system_dict['system_manufacturer'] = self.__run_cmd('dmidecode -s system-manufacturer').strip()
            system_dict['system_product'] = self.__run_cmd('dmidecode -s system-product-name').strip()
            system_dict['os'] = self.__run_cmd('cat /proc/version').strip()
            system_dict['uname'] = self.__run_cmd('uname -a').strip()
            system_dict['docker'] = self.__get_docker_version()
            system_dict['kernel_parameters'] = sysctl
            system_dict['kernel_modules'] = lsmod
            system_dict['dmidecode'] = self.__run_cmd('dmidecode').strip()
            if system_dict['system_product'] == 'Virtual Machine':
                lsvmbus = self.__run_cmd('lsvmbus').splitlines()
                lsvmbus = self.__parse_key_value_lines(lsvmbus)
                system_dict['vmbus'] = lsvmbus
        except Exception:
            print('Error: get system info failed')
        return system_dict

    def __get_docker_version(self):
        """Get docker version info.

        Returns:
            dict: docker version info dict.
        """
        docker_version_dict = {}
        try:
            docker_version = self.__run_cmd('docker version')
            lines = docker_version.splitlines()

            key = ''
            for line in lines:
                if 'Client' in line:
                    key = 'docker_client_version'
                elif 'Server' in line:
                    key = 'docker_daemon_version'
                elif 'Version' in line and key not in docker_version_dict:
                    docker_version_dict[key] = line.split(':')[1].strip().strip('\t')
        except Exception:
            print('Error: get docker info failed')
        return docker_version_dict

    def get_memory(self):
        """Get memory info.

        Returns:
            dict: memory info dict.
        """
        memory_dict = {}
        try:
            lsmem = self.__run_cmd('lsmem')
            lsmem = lsmem.splitlines()
            lsmem = self.__parse_key_value_lines(lsmem)
            memory_dict['block_size'] = lsmem.get('Memory block size', '')
            memory_dict['total_capacity'] = lsmem.get('Total online memory', '')
            dmidecode_memory = self.__run_cmd('dmidecode --type memory')
            dmidecode_memory = dmidecode_memory.splitlines()
            model = self.__parse_key_value_lines(
                dmidecode_memory, ['Manufacturer', 'Part Number', 'Type', 'Speed', 'Number Of Devices'],
                omitted_values=['other', 'unknown']
            )
            memory_dict['channels'] = model.get('Number Of Devices', '')
            memory_dict['type'] = model.get('Type', '')
            memory_dict['clock_frequency'] = model.get('Speed', '')
            memory_dict['model'] = model.get('Manufacturer', [''])[0] + ' ' + model.get('Part Number', [''])[0]
        except Exception:
            print('Error: get memory info failed')
        return memory_dict

    def __get_gpu_nvidia(self):
        """Get nvidia gpu info.

        Stores the parsed `nvidia-smi -q -x` report, the attached GPU count taken from it, the
        `nvidia-smi topo -m` output and the versions reported for nvidia-container-runtime,
        nv-fabricmanager and the nvidia-peer-memory package.

        Returns:
            dict: nvidia gpu info dict; partially filled or empty if collecting the info fails.
        """
        gpu_dict = {}
        try:
            gpu_query = self.run_cmd('nvidia-smi -q -x')
            # Fall back to an empty mapping so a report without 'nvidia_smi_log' does not abort
            # the remaining queries.
            gpu_query = xmltodict.parse(gpu_query).get('nvidia_smi_log') or {}
            gpu_dict['gpu_count'] = gpu_query.get('attached_gpus', '')
            gpu_dict['nvidia_info'] = gpu_query
            gpu_dict['topo'] = self.run_cmd('nvidia-smi topo -m')
            gpu_dict['nvidia-container-runtime_version'] = self.run_cmd('nvidia-container-runtime -v').strip()
            gpu_dict['nvidia-fabricmanager_version'] = self.run_cmd('nv-fabricmanager --version').strip()
            gpu_dict['nv_peer_mem_version'] = self.run_cmd(
                'dpkg -l | grep \'nvidia-peer-memory \' | awk \'$2=="nvidia-peer-memory" {print $3}\''
            ).strip()
        except Exception:
            print('Error: get nvidia gpu info failed')

        return gpu_dict

    def __get_gpu_amd(self):
        """Get amd gpu info.

        Returns:
            dict: amd gpu info dict.
        """
        gpu_dict = {}
        try:
            gpu_query = self.__run_cmd('rocm-smi -a --json')
            gpu_query = json.loads(gpu_query)
            gpu_per_node = list(filter(lambda x: 'card' in x, gpu_query.keys()))
            gpu_dict['gpu_count'] = len(gpu_per_node)
            gpu_mem_info = self.__run_cmd('rocm-smi --showmeminfo vram --json')
            gpu_mem_info = json.loads(gpu_mem_info)
            for card in gpu_per_node:
                gpu_query[card].update(gpu_mem_info.get(card))
            gpu_dict['rocm_info'] = gpu_query
            gpu_dict['topo'] = self.__run_cmd('rocm-smi --showtopo')
        except Exception:
            print('Error: get amd gpu info failed')
        return gpu_dict

    def get_gpu(self):
        """Get gpu info and identify gpu type(nvidia/amd).

        Returns:
            dict: gpu info dict.
        """
        if Path('/dev/nvidiactl').is_char_device() and Path('/dev/nvidia-uvm').is_char_device():
            return self.__get_gpu_nvidia()
        if Path('/dev/kfd').is_char_device() and Path('/dev/dri').is_dir():
            return self.__get_gpu_amd()
        print('Warning: no gpu detected')
        return {}

    def get_pcie(self):
        """Get pcie info dict.

        Returns:
            dict: pcie info dict.
        """
        pcie_dict = {}
        try:
            pcie_dict['pcie_topo'] = self.__run_cmd('lspci -t -vvv')
            pcie_dict['pcie_info'] = self.__run_cmd('lspci -vvv')
        except Exception:
            print('Error: get pcie gpu info failed')
        return pcie_dict

    def get_storage(self):    # noqa: C901
        """Get storage info dict, including file system info, blocl device info and their mapping.

        Returns:
            dict: storage info dict.
        """
        storage_dict = {}
        try:
            fs_info = self.__run_cmd("df -Th | grep -v \'^/dev/loop\'").splitlines()
            fs_list = self.__parse_table_lines(fs_info, key=['Filesystem', 'Type', 'Size', 'Avail', 'Mounted'])
            for fs in fs_list:
                fs_device = fs.get('Filesystem', 'UNKNOWN')
                if fs_device.startswith('/dev'):
                    fs['Block_size'] = self.__run_cmd('blockdev --getbsz {}'.format(fs_device)).strip()
                    fs['4k_alignment'] = ''
                    partition_ids = self.__run_cmd(
                        'parted {} print | grep -oE "^[[:blank:]]*[0-9]+"'.format(fs_device)
                    ).splitlines()
                    for id in partition_ids:
                        fs['4k_alignment'] += self.__run_cmd('parted {} align-check opt {}'.format(fs_device,
                                                                                                   id)).strip()
            storage_dict['file_system'] = fs_list
        except Exception:
            print('Error: get file system info failed')

        try:
            disk_info = self.__run_cmd("lsblk -e 7 -o NAME,ROTA,SIZE,MODEL | grep -v \'^/dev/loop\'").splitlines()
            disk_list = self.__parse_table_lines(disk_info, key=['NAME', 'ROTA', 'SIZE', 'MODEL'])
            for disk in disk_list:
                block_device = disk.get('NAME', 'UNKNOWN').strip('\u251c\u2500').strip('\u2514\u2500')
                disk['NAME'] = block_device
                disk['Rotational'] = disk.pop('ROTA')
                disk['Block_size'] = self.__run_cmd('fdisk -l -u /dev/{} | grep "Sector size"'.format(block_device)
                                                    ).strip()
                if 'nvme' in block_device:
                    nvme_info = self.__run_cmd('nvme list | grep {}'.format(block_device)).strip().split()
                    if len(nvme_info) >= 15:
                        disk['Nvme_usage'] = nvme_info[-11] + nvme_info[-10]
            storage_dict['block_device'] = disk_list
            storage_dict['mapping_bwtween_filesystem_and_blockdevice'] = self.__run_cmd('mount')
        except Exception:
            print('Error: get block device info failed')

        return storage_dict

    def __get_ib(self):
        """Get available IB devices info.

        Return:
            list: list of available IB device info dict.
        """
        ib_dict = {}
        try:
            ibstat = self.__run_cmd('ibstat').splitlines()
            ib_dict['ib_device_status'] = self.__parse_key_value_lines(ibstat)
            ibv_devinfo = self.__run_cmd('ibv_devinfo -v').splitlines()
            for i in range(len(ibv_devinfo) - 1, -1, -1):
                if ':' not in ibv_devinfo[i]:
                    ibv_devinfo[i - 1] = ibv_devinfo[i - 1] + ',' + ibv_devinfo[i].strip('\t')
                    ibv_devinfo.remove(ibv_devinfo[i])
            ib_dict['ib_device_info'] = self.__parse_key_value_lines(ibv_devinfo)
        except Exception as e:
            print('Error: get ib info failed. message: {}.'.format(str(e)))
        return ib_dict

    def __get_nic(self):
        """Get nic info.

        Reads `lshw -c network -xml` and, for every node that has a logical name, records the
        name, disabled flag, model, description, driver and firmware, plus the link speed read
        from /sys/class/net/<name>/speed when that file holds a number.

        Returns:
            list: list of available nic info dict; a device whose details fail to parse is kept
                with whatever was collected before the failure.
        """
        nic_list = []
        try:
            lsnic_xml = self.run_cmd('lshw -c network -xml')
            lsnic_list = (xmltodict.parse(lsnic_xml).get('list') or {}).get('node') or []
            # xmltodict returns a single mapping rather than a list when only one <node> exists.
            if not isinstance(lsnic_list, list):
                lsnic_list = [lsnic_list]
            lsnic_list = [nic for nic in lsnic_list if 'logicalname' in nic]

            for nic in lsnic_list:
                nic_info = {}
                try:
                    nic_info['logical_name'] = nic['logicalname']
                    nic_info['disabled'] = nic.get('@disabled', False)
                    nic_info['model'] = nic.get('vendor', '') + ' ' + nic.get('product', '')
                    nic_info['description'] = nic.get('description', '')
                    # Same single-vs-list normalisation for the <setting> entries; none at all is fine.
                    settings = (nic.get('configuration') or {}).get('setting') or []
                    if not isinstance(settings, list):
                        settings = [settings]
                    configuration_dict = {config['@id']: config.get('@value', '') for config in settings}
                    if configuration_dict:
                        nic_info['driver'] = configuration_dict.get('driver', '') + ' ' + configuration_dict.get(
                            'driverversion', ''
                        )
                        nic_info['firmware'] = configuration_dict.get('firmware', '')
                    speed = self.run_cmd('cat /sys/class/net/{}/speed'.format(nic_info['logical_name'])).strip()
                    if speed.isdigit():
                        nic_info['speed'] = str(int(speed) / 1000) + ' Gbit/s'
                except Exception:
                    print('Error: get nic device {} info failed'.format(nic_info['logical_name']))
                nic_list.append(nic_info)
        except Exception:
            print('Error: get nic info failed')
        return nic_list

    def get_network(self):
        """Get network info, including nic info, ib info and ofed version.

        Returns:
            dict: dict of network info.
        """
        network_dict = {}
        network_dict['nic'] = self.__get_nic()
        network_dict['ib'] = self.__get_ib()
        ofed_version = self.__run_cmd('ofed_info  -s').strip()
        network_dict['ofed_version'] = ofed_version
        return network_dict

    def get_all(self):
        """Get all system info and save them to file in json format."""
        sum_dict = {}
        sum_dict['System'] = self.get_system()
        sum_dict['CPU'] = self.get_cpu()
        sum_dict['Memory'] = self.get_memory()
        sum_dict['Storage'] = self.get_storage()
        sum_dict['Network'] = self.get_network()
        sum_dict['PCIe'] = self.get_pcie()
        sum_dict['Accelerator'] = self.get_gpu()
        return sum_dict