Unverified Commit a9b45a07 authored by guoshzhao's avatar guoshzhao Committed by GitHub
Browse files

Monitor - Support cgroup V2 when reading system metrics. (#491)

**Description**
Ubuntu 22.04 uses cgroup V2, which changes the file structure of the cgroup metric files.
Modify the monitor so that it reads system metrics correctly under both cgroup V1 and V2.
parent dbeba805
...@@ -129,7 +129,7 @@ def get_device_row_remapped_info(self, idx): ...@@ -129,7 +129,7 @@ def get_device_row_remapped_info(self, idx):
Return: Return:
remapped_metrics (dict): the row remapped information, None means failed to get the data. remapped_metrics (dict): the row remapped information, None means failed to get the data.
""" """
output = process.run_command('nvidia-smi -i {} -q'.format(idx), quite=True) output = process.run_command('nvidia-smi -i {} -q'.format(idx), quiet=True)
if output.returncode == 0: if output.returncode == 0:
begin = output.stdout.find('Remapped Rows') begin = output.stdout.find('Remapped Rows')
end = output.stdout.find('Temperature', begin) end = output.stdout.find('Temperature', begin)
......
...@@ -10,12 +10,12 @@ ...@@ -10,12 +10,12 @@
from superbench.common.utils import stdout_logger from superbench.common.utils import stdout_logger
def run_command(command, quite=False, flush_output=False): def run_command(command, quiet=False, flush_output=False):
"""Run command in string format, return the result with stdout and stderr. """Run command in string format, return the result with stdout and stderr.
Args: Args:
command (str): command to run. command (str): command to run.
quite (bool): no stdout display of the command if quite is True. quiet (bool): no stdout display of the command if quiet is True.
flush_output (bool): enable real-time output flush or not when running the command. flush_output (bool): enable real-time output flush or not when running the command.
Return: Return:
...@@ -31,7 +31,7 @@ def run_command(command, quite=False, flush_output=False): ...@@ -31,7 +31,7 @@ def run_command(command, quite=False, flush_output=False):
output = '' output = ''
for line in process.stdout: for line in process.stdout:
output += line output += line
if not quite: if not quiet:
stdout_logger.log(line) stdout_logger.log(line)
process.wait() process.wait()
retcode = process.poll() retcode = process.poll()
...@@ -45,6 +45,6 @@ def run_command(command, quite=False, flush_output=False): ...@@ -45,6 +45,6 @@ def run_command(command, quite=False, flush_output=False):
result = subprocess.run( result = subprocess.run(
command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True, check=False, universal_newlines=True command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True, check=False, universal_newlines=True
) )
if not quite: if not quiet:
stdout_logger.log(result.stdout) stdout_logger.log(result.stdout)
return result return result
...@@ -39,6 +39,16 @@ def __init__(self, container_name, sample_duration, sample_interval, output_file ...@@ -39,6 +39,16 @@ def __init__(self, container_name, sample_duration, sample_interval, output_file
self.__output_handler = open(self.__output_file, 'a') self.__output_handler = open(self.__output_file, 'a')
self.__cgroup = 1
output = run_command('grep cgroup /proc/filesystems', quiet=True)
if output.returncode != 0:
logger.error('Failed to check the cgroup version, will assume using cgroup V1.')
else:
if 'cgroup2' in output.stdout:
self.__cgroup = 2
logger.info('cgroup version: {}.'.format(self.__cgroup))
def __preprocess(self): def __preprocess(self):
"""Preprocess/preparation operations before the monitoring. """Preprocess/preparation operations before the monitoring.
...@@ -67,11 +77,20 @@ def __preprocess(self): ...@@ -67,11 +77,20 @@ def __preprocess(self):
container_pid = output.stdout container_pid = output.stdout
try: try:
if self.__cgroup == 1:
self._cpu_file = glob.glob('/sys/fs/cgroup/cpuacct/docker/{}*/cpuacct.stat'.format(container_id))[0] self._cpu_file = glob.glob('/sys/fs/cgroup/cpuacct/docker/{}*/cpuacct.stat'.format(container_id))[0]
self._mem_file = glob.glob( self._mem_file = glob.glob(
'/sys/fs/cgroup/memory/docker/{}*/memory.usage_in_bytes'.format(container_id) '/sys/fs/cgroup/memory/docker/{}*/memory.usage_in_bytes'.format(container_id)
)[0] )[0]
self._net_file = '/proc/{}/net/dev'.format(container_pid) self._net_file = '/proc/{}/net/dev'.format(container_pid)
else:
self._cpu_file = glob.glob(
'/sys/fs/cgroup/system.slice/docker-{}*.scope/cpu.stat'.format(container_id)
)[0]
self._mem_file = glob.glob(
'/sys/fs/cgroup/system.slice/docker-{}*.scope/memory.stat'.format(container_id)
)[0]
self._net_file = '/proc/net/dev'
except BaseException as e: except BaseException as e:
logger.error( logger.error(
'Faild to get the cpu/mem/net file - container: {}, error message: {}'.format( 'Faild to get the cpu/mem/net file - container: {}, error message: {}'.format(
...@@ -80,8 +99,12 @@ def __preprocess(self): ...@@ -80,8 +99,12 @@ def __preprocess(self):
) )
return False return False
else: else:
if self.__cgroup == 1:
self._cpu_file = '/sys/fs/cgroup/cpuacct/cpuacct.stat' self._cpu_file = '/sys/fs/cgroup/cpuacct/cpuacct.stat'
self._mem_file = '/sys/fs/cgroup/memory/memory.usage_in_bytes' self._mem_file = '/sys/fs/cgroup/memory/memory.usage_in_bytes'
else:
self._cpu_file = '/sys/fs/cgroup/cpu.stat'
self._mem_file = '/sys/fs/cgroup/memory.stat'
self._net_file = '/proc/net/dev' self._net_file = '/proc/net/dev'
return True return True
...@@ -215,12 +238,20 @@ def __get_process_cpu_ticks(self): ...@@ -215,12 +238,20 @@ def __get_process_cpu_ticks(self):
system_time = 0 system_time = 0
try: try:
with open(self._cpu_file, 'r') as f: with open(self._cpu_file, 'r') as f:
if self.__cgroup == 1:
for line in f: for line in f:
items = line.split() items = line.split()
if items[0] == 'user': if items[0] == 'user':
user_time = int(items[1]) user_time = int(items[1])
elif items[1] == 'system': elif items[0] == 'system':
system_time = int(items[1]) system_time = int(items[1])
else:
for line in f:
items = line.split()
if items[0] == 'user_usec':
user_time = int(items[1]) / 10000
elif items[0] == 'system_usec':
system_time = int(items[1]) / 10000
return user_time + system_time return user_time + system_time
except BaseException as e: except BaseException as e:
logger.error('Failed to read process cpu ticks information - error message: {}'.format(str(e))) logger.error('Failed to read process cpu ticks information - error message: {}'.format(str(e)))
......
...@@ -4,16 +4,31 @@ ...@@ -4,16 +4,31 @@
"""Tests for Monitor module.""" """Tests for Monitor module."""
import numbers import numbers
import tempfile
import unittest
import shutil
import pathlib
from tests.helper import decorator from tests.helper import decorator
from superbench.monitor import Monitor from superbench.monitor import Monitor
from superbench.monitor import MonitorRecord from superbench.monitor import MonitorRecord
@decorator.cuda_test class MonitorTestCase(unittest.TestCase):
def test_monitor(): """A class for Monitor test cases."""
def setUp(self):
"""Hook method for setting up the test fixture before exercising it."""
self.sb_output_dir = tempfile.mkdtemp()
def tearDown(self):
"""Hook method for deconstructing the test fixture after testing it."""
shutil.rmtree(self.sb_output_dir)
@decorator.cuda_test
def test_monitor(self):
"""Test the module Monitor.""" """Test the module Monitor."""
monitor = Monitor(None, 1, 10, 'file') log_file = pathlib.Path(self.sb_output_dir) / 'monitor.log'
monitor = Monitor(None, 1, 10, str(log_file))
monitor._Monitor__preprocess() monitor._Monitor__preprocess()
record = MonitorRecord() record = MonitorRecord()
monitor._Monitor__sample_host_metrics(record) monitor._Monitor__sample_host_metrics(record)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment