Commit 8329d30f authored by SparkSnail's avatar SparkSnail Committed by Yan Ni
Browse files

nnictl experiment cleanup (#1186)

* fix remote bug

* add document

* add document

* update

* update

* update

* update

* fix remote issue

* fix forEach

* update doc according to comments

* update

* update

* update

* remove 'any more'

* init

* update for remote and hdfs

* remove unused code

* fix comments

* fix comments

* fix comments

* fix comments
parent c84ba257
...@@ -15,6 +15,7 @@ nnictl support commands: ...@@ -15,6 +15,7 @@ nnictl support commands:
* [nnictl trial](#trial) * [nnictl trial](#trial)
* [nnictl top](#top) * [nnictl top](#top)
* [nnictl experiment](#experiment) * [nnictl experiment](#experiment)
* [nnictl platform](#platform)
* [nnictl config](#config) * [nnictl config](#config)
* [nnictl log](#log) * [nnictl log](#log)
* [nnictl webui](#webui) * [nnictl webui](#webui)
...@@ -370,6 +371,26 @@ Debug mode will disable version check function in Trialkeeper. ...@@ -370,6 +371,26 @@ Debug mode will disable version check function in Trialkeeper.
nnictl experiment list nnictl experiment list
``` ```
* __nnictl experiment delete__
* Description
Delete one or all experiments, it includes log, result, environment information and cache. It uses to delete useless experiment result, or save disk space.
* Usage
```bash
nnictl experiment delete [OPTIONS]
```
* Options
|Name, shorthand|Required|Default|Description|
|------|------|------ |------|
|id| False| |ID of the experiment|
<a name="export"></a> <a name="export"></a>
* __nnictl experiment export__ * __nnictl experiment export__
...@@ -456,6 +477,32 @@ Debug mode will disable version check function in Trialkeeper. ...@@ -456,6 +477,32 @@ Debug mode will disable version check function in Trialkeeper.
nnictl experiment import [experiment_id] -f experiment_data.json nnictl experiment import [experiment_id] -f experiment_data.json
``` ```
<a name="platform"></a>
![](https://placehold.it/15/1589F0/000000?text=+) `Manage platform information`
* __nnictl platform clean__
* Description
It uses to clean up disk on a target platform. The provided YAML file includes the information of target platform, and it follows the same schema as the NNI configuration file.
* Note
if the target platform is being used by other users, it may cause unexpected errors to others.
* Usage
```bash
nnictl platform clean [OPTIONS]
```
* Options
|Name, shorthand|Required|Default|Description|
|------|------|------ |------|
|--config| True| |the path of yaml config file used when create an experiment|
<a name="config"></a> <a name="config"></a>
![](https://placehold.it/15/1589F0/000000?text=+) `nnictl config show` ![](https://placehold.it/15/1589F0/000000?text=+) `nnictl config show`
......
...@@ -121,6 +121,19 @@ def parse_args(): ...@@ -121,6 +121,19 @@ def parse_args():
parser_experiment_list = parser_experiment_subparsers.add_parser('list', help='list all of running experiment ids') parser_experiment_list = parser_experiment_subparsers.add_parser('list', help='list all of running experiment ids')
parser_experiment_list.add_argument('all', nargs='?', help='list all of experiments') parser_experiment_list.add_argument('all', nargs='?', help='list all of experiments')
parser_experiment_list.set_defaults(func=experiment_list) parser_experiment_list.set_defaults(func=experiment_list)
parser_experiment_clean = parser_experiment_subparsers.add_parser('delete', help='clean up the experiment data')
parser_experiment_clean.add_argument('id', nargs='?', help='the id of experiment')
parser_experiment_clean.add_argument('--all', action='store_true', default=False, help='delete all of experiments')
parser_experiment_clean.set_defaults(func=experiment_clean)
#parse experiment command
parser_platform = subparsers.add_parser('platform', help='get platform information')
#add subparsers for parser_experiment
parser_platform_subparsers = parser_platform.add_subparsers()
parser_platform_clean = parser_platform_subparsers.add_parser('clean', help='clean up the platform data')
parser_platform_clean.add_argument('--config', '-c', required=True, dest='config', help='the path of yaml config file')
parser_platform_clean.set_defaults(func=platform_clean)
#import tuning data #import tuning data
parser_import_data = parser_experiment_subparsers.add_parser('import', help='import additional data') parser_import_data = parser_experiment_subparsers.add_parser('import', help='import additional data')
parser_import_data.add_argument('id', nargs='?', help='the id of experiment') parser_import_data.add_argument('id', nargs='?', help='the id of experiment')
......
...@@ -24,6 +24,10 @@ import psutil ...@@ -24,6 +24,10 @@ import psutil
import json import json
import datetime import datetime
import time import time
import re
from pathlib import Path
from pyhdfs import HdfsClient, HdfsFileNotFoundException
import shutil
from subprocess import call, check_output from subprocess import call, check_output
from nni_annotation import expand_annotations from nni_annotation import expand_annotations
from .rest_utils import rest_get, rest_delete, check_rest_server_quick, check_response from .rest_utils import rest_get, rest_delete, check_rest_server_quick, check_response
...@@ -31,8 +35,9 @@ from .url_utils import trial_jobs_url, experiment_url, trial_job_id_url, export_ ...@@ -31,8 +35,9 @@ from .url_utils import trial_jobs_url, experiment_url, trial_job_id_url, export_
from .config_utils import Config, Experiments from .config_utils import Config, Experiments
from .constants import NNICTL_HOME_DIR, EXPERIMENT_INFORMATION_FORMAT, EXPERIMENT_DETAIL_FORMAT, \ from .constants import NNICTL_HOME_DIR, EXPERIMENT_INFORMATION_FORMAT, EXPERIMENT_DETAIL_FORMAT, \
EXPERIMENT_MONITOR_INFO, TRIAL_MONITOR_HEAD, TRIAL_MONITOR_CONTENT, TRIAL_MONITOR_TAIL, REST_TIME_OUT EXPERIMENT_MONITOR_INFO, TRIAL_MONITOR_HEAD, TRIAL_MONITOR_CONTENT, TRIAL_MONITOR_TAIL, REST_TIME_OUT
from .common_utils import print_normal, print_error, print_warning, detect_process from .common_utils import print_normal, print_error, print_warning, detect_process, get_yml_content
from .command_utils import check_output_command, kill_command from .command_utils import check_output_command, kill_command
from .ssh_utils import create_ssh_sftp_client, remove_remote_directory
def get_experiment_time(port): def get_experiment_time(port):
'''get the startTime and endTime of an experiment''' '''get the startTime and endTime of an experiment'''
...@@ -73,9 +78,10 @@ def update_experiment(): ...@@ -73,9 +78,10 @@ def update_experiment():
if status: if status:
experiment_config.update_experiment(key, 'status', status) experiment_config.update_experiment(key, 'status', status)
def check_experiment_id(args): def check_experiment_id(args, update=True):
'''check if the id is valid '''check if the id is valid
''' '''
if update:
update_experiment() update_experiment()
experiment_config = Experiments() experiment_config = Experiments()
experiment_dict = experiment_config.get_all_experiments() experiment_dict = experiment_config.get_all_experiments()
...@@ -170,7 +176,7 @@ def get_config_filename(args): ...@@ -170,7 +176,7 @@ def get_config_filename(args):
'''get the file name of config file''' '''get the file name of config file'''
experiment_id = check_experiment_id(args) experiment_id = check_experiment_id(args)
if experiment_id is None: if experiment_id is None:
print_error('Please set the experiment id!') print_error('Please set correct experiment id!')
exit(1) exit(1)
experiment_config = Experiments() experiment_config = Experiments()
experiment_dict = experiment_config.get_all_experiments() experiment_dict = experiment_config.get_all_experiments()
...@@ -180,7 +186,7 @@ def get_experiment_port(args): ...@@ -180,7 +186,7 @@ def get_experiment_port(args):
'''get the port of experiment''' '''get the port of experiment'''
experiment_id = check_experiment_id(args) experiment_id = check_experiment_id(args)
if experiment_id is None: if experiment_id is None:
print_error('Please set the experiment id!') print_error('Please set correct experiment id!')
exit(1) exit(1)
experiment_config = Experiments() experiment_config = Experiments()
experiment_dict = experiment_config.get_all_experiments() experiment_dict = experiment_config.get_all_experiments()
...@@ -373,6 +379,166 @@ def webui_url(args): ...@@ -373,6 +379,166 @@ def webui_url(args):
nni_config = Config(get_config_filename(args)) nni_config = Config(get_config_filename(args))
print_normal('{0} {1}'.format('Web UI url:', ' '.join(nni_config.get_config('webuiUrl')))) print_normal('{0} {1}'.format('Web UI url:', ' '.join(nni_config.get_config('webuiUrl'))))
def local_clean(directory):
'''clean up local data'''
print_normal('removing folder {0}'.format(directory))
try:
shutil.rmtree(directory)
except FileNotFoundError as err:
print_error('{0} does not exist!'.format(directory))
def remote_clean(machine_list, experiment_id=None):
'''clean up remote data'''
for machine in machine_list:
passwd = machine.get('passwd')
userName = machine.get('username')
host = machine.get('ip')
port = machine.get('port')
if experiment_id:
remote_dir = '/' + '/'.join(['tmp', 'nni', 'experiments', experiment_id])
else:
remote_dir = '/' + '/'.join(['tmp', 'nni', 'experiments'])
sftp = create_ssh_sftp_client(host, port, userName, passwd)
print_normal('removing folder {0}'.format(host + ':' + str(port) + remote_dir))
remove_remote_directory(sftp, remote_dir)
def hdfs_clean(host, user_name, output_dir, experiment_id=None):
'''clean up hdfs data'''
hdfs_client = HdfsClient(hosts='{0}:80'.format(host), user_name=user_name, webhdfs_path='/webhdfs/api/v1', timeout=5)
if experiment_id:
full_path = '/' + '/'.join([user_name, 'nni', 'experiments', experiment_id])
else:
full_path = '/' + '/'.join([user_name, 'nni', 'experiments'])
print_normal('removing folder {0} in hdfs'.format(full_path))
hdfs_client.delete(full_path, recursive=True)
if output_dir:
pattern = re.compile('hdfs://(?P<host>([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(?P<baseDir>/.*)?')
match_result = pattern.match(output_dir)
if match_result:
output_host = match_result.group('host')
output_dir = match_result.group('baseDir')
#check if the host is valid
if output_host != host:
print_warning('The host in {0} is not consistent with {1}'.format(output_dir, host))
else:
if experiment_id:
output_dir = output_dir + '/' + experiment_id
print_normal('removing folder {0} in hdfs'.format(output_dir))
hdfs_client.delete(output_dir, recursive=True)
def experiment_clean(args):
'''clean up the experiment data'''
experiment_id_list = []
experiment_config = Experiments()
experiment_dict = experiment_config.get_all_experiments()
if args.all:
experiment_id_list = list(experiment_dict.keys())
else:
if args.id is None:
print_error('please set experiment id!')
exit(1)
if args.id not in experiment_dict:
print_error('can not find id {0}!'.format(args.id))
exit(1)
experiment_id_list.append(args.id)
while True:
print('INFO: This action will delete experiment {0}, and it’s not recoverable.'.format(' '.join(experiment_id_list)))
inputs = input('INFO: do you want to continue?[y/N]:')
if not inputs.lower() or inputs.lower() in ['n', 'no']:
exit(0)
elif inputs.lower() not in ['y', 'n', 'yes', 'no']:
print_warning('please input Y or N!')
else:
break
for experiment_id in experiment_id_list:
nni_config = Config(experiment_dict[experiment_id]['fileName'])
platform = nni_config.get_config('experimentConfig').get('trainingServicePlatform')
experiment_id = nni_config.get_config('experimentId')
if platform == 'remote':
machine_list = nni_config.get_config('experimentConfig').get('machineList')
remote_clean(machine_list, experiment_id)
elif platform == 'pai':
host = nni_config.get_config('experimentConfig').get('paiConfig').get('host')
user_name = nni_config.get_config('experimentConfig').get('paiConfig').get('userName')
output_dir = nni_config.get_config('experimentConfig').get('trial').get('outputDir')
hdfs_clean(host, user_name, output_dir, experiment_id)
elif platform != 'local':
#TODO: support all platforms
print_warning('platform {0} clean up not supported yet!'.format(platform))
exit(0)
#clean local data
home = str(Path.home())
local_dir = nni_config.get_config('experimentConfig').get('logDir')
if not local_dir:
local_dir = os.path.join(home, 'nni', 'experiments', experiment_id)
local_clean(local_dir)
experiment_config = Experiments()
print_normal('removing metadata of experiment {0}'.format(experiment_id))
experiment_config.remove_experiment(experiment_id)
print_normal('Finish!')
def get_platform_dir(config_content):
'''get the dir list to be deleted'''
platform = config_content.get('trainingServicePlatform')
dir_list = []
if platform == 'remote':
machine_list = config_content.get('machineList')
for machine in machine_list:
host = machine.get('ip')
port = machine.get('port')
dir_list.append(host + ':' + str(port) + '/tmp/nni/experiments')
elif platform == 'pai':
pai_config = config_content.get('paiConfig')
host = config_content.get('paiConfig').get('host')
user_name = config_content.get('paiConfig').get('userName')
output_dir = config_content.get('trial').get('outputDir')
dir_list.append('hdfs://{0}:9000/{1}/nni/experiments'.format(host, user_name))
if output_dir:
dir_list.append(output_dir)
return dir_list
def platform_clean(args):
'''clean up the experiment data'''
config_path = os.path.abspath(args.config)
if not os.path.exists(config_path):
print_error('Please set correct config path!')
exit(1)
config_content = get_yml_content(config_path)
platform = config_content.get('trainingServicePlatform')
if platform not in ['remote', 'pai']:
print_normal('platform {0} not supported!'.format(platform))
exit(0)
experiment_config = Experiments()
experiment_dict = experiment_config.get_all_experiments()
update_experiment()
id_list = list(experiment_dict.keys())
dir_list = get_platform_dir(config_content)
if not dir_list:
print_normal('No folder of NNI caches is found!')
exit(1)
while True:
print_normal('This command will remove below folders of NNI caches. If other users are using experiments on below hosts, it will be broken.')
for dir in dir_list:
print(' ' + dir)
inputs = input('INFO: do you want to continue?[y/N]:')
if not inputs.lower() or inputs.lower() in ['n', 'no']:
exit(0)
elif inputs.lower() not in ['y', 'n', 'yes', 'no']:
print_warning('please input Y or N!')
else:
break
if platform == 'remote':
machine_list = config_content.get('machineList')
for machine in machine_list:
remote_clean(machine_list, None)
elif platform == 'pai':
pai_config = config_content.get('paiConfig')
host = config_content.get('paiConfig').get('host')
user_name = config_content.get('paiConfig').get('userName')
output_dir = config_content.get('trial').get('outputDir')
hdfs_clean(host, user_name, output_dir, None)
print_normal('Done!')
def experiment_list(args): def experiment_list(args):
'''get the information of all experiments''' '''get the information of all experiments'''
experiment_config = Experiments() experiment_config = Experiments()
...@@ -393,7 +559,6 @@ def experiment_list(args): ...@@ -393,7 +559,6 @@ def experiment_list(args):
print_warning('There is no experiment running...\nYou can use \'nnictl experiment list all\' to list all stopped experiments!') print_warning('There is no experiment running...\nYou can use \'nnictl experiment list all\' to list all stopped experiments!')
experiment_information = "" experiment_information = ""
for key in experiment_id_list: for key in experiment_id_list:
experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], experiment_dict[key]['port'],\ experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], experiment_dict[key]['port'],\
experiment_dict[key].get('platform'), experiment_dict[key]['startTime'], experiment_dict[key]['endTime'])) experiment_dict[key].get('platform'), experiment_dict[key]['startTime'], experiment_dict[key]['endTime']))
print(EXPERIMENT_INFORMATION_FORMAT % experiment_information) print(EXPERIMENT_INFORMATION_FORMAT % experiment_information)
......
...@@ -57,3 +57,17 @@ def create_ssh_sftp_client(host_ip, port, username, password): ...@@ -57,3 +57,17 @@ def create_ssh_sftp_client(host_ip, port, username, password):
return sftp return sftp
except Exception as exception: except Exception as exception:
print_error('Create ssh client error %s\n' % exception) print_error('Create ssh client error %s\n' % exception)
def remove_remote_directory(sftp, directory):
'''remove a directory in remote machine'''
try:
files = sftp.listdir(directory)
for file in files:
filepath = '/'.join([directory, file])
try:
sftp.remove(filepath)
except IOError:
remove_remote_directory(sftp, filepath)
sftp.rmdir(directory)
except IOError as err:
print_error(err)
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment