Commit cfda0dae authored by demianzhang, committed by SparkSnail

NNI on Windows for NNI Local mode (#937)

parent 88ceed71
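
Editor's note: this change routes most shell interaction through new platform-dispatch helpers imported from `../common/util` (execMkdir, getScriptName, execScript, execKill, and friends) whose implementations are not part of this diff. Below is a minimal sketch of how such helpers could look, assuming Node's child_process / child-process-promise as used elsewhere in the commit; the names mirror the imports, but the bodies are illustrative only.

// Sketch only: hypothetical implementations of the platform-dispatch helpers
// imported from '../common/util'; the real ones are not shown in this diff.
import * as cp from 'child_process';
import * as cpp from 'child-process-promise';

// Choose a script suffix per platform: PowerShell on Windows, bash elsewhere.
export function getScriptName(fileNamePrefix: string): string {
    return process.platform === 'win32' ? `${fileNamePrefix}.ps1` : `${fileNamePrefix}.sh`;
}

// Create a directory with the platform's native command.
export async function execMkdir(directory: string): Promise<void> {
    if (process.platform === 'win32') {
        await cpp.exec(`powershell.exe New-Item -Path "${directory}" -ItemType "directory" -Force`);
    } else {
        await cpp.exec(`mkdir -p ${directory}`);
    }
}

// Launch a generated script with the matching interpreter and hand back the child process.
export function execScript(filePath: string): cp.ChildProcess {
    if (process.platform === 'win32') {
        return cp.exec(`powershell.exe -file "${filePath}"`);
    }
    return cp.exec(`bash ${filePath}`);
}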
......@@ -25,9 +25,10 @@ import * as fs from 'fs';
import * as os from 'os';
import * as path from 'path';
import { String } from 'typescript-string-operations';
import { execMkdir, getScriptName, getgpuMetricsCollectorScriptContent, execScript, execTail, execRemove, execKill } from '../common/util'
import { getLogger, Logger } from '../../common/log';
import { delay } from '../../common/utils';
import { GPU_INFO_COLLECTOR_FORMAT, GPUInfo, GPUSummary } from '../common/gpuData';
import { GPUInfo, GPUSummary } from '../common/gpuData';
/**
* GPUScheduler for local training service
......@@ -57,6 +58,19 @@ class GPUScheduler {
}
}
/**
* Generate the gpu metrics collector script on the local machine and run it;
* the generated script folder is cleaned up when the scheduler is stopped.
*/
private async runGpuMetricsCollectorScript(): Promise<void> {
await execMkdir(this.gpuMetricCollectorScriptFolder);
//generate gpu_metrics_collector script
let gpuMetricsCollectorScriptPath: string = path.join(this.gpuMetricCollectorScriptFolder, getScriptName('gpu_metrics_collector'));
const gpuMetricsCollectorScriptContent: string = getgpuMetricsCollectorScriptContent(this.gpuMetricCollectorScriptFolder);
await fs.promises.writeFile(gpuMetricsCollectorScriptPath, gpuMetricsCollectorScriptContent, { encoding: 'utf8' });
execScript(gpuMetricsCollectorScriptPath)
}
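
Editor's note: on the local side the collector script body now comes from `getgpuMetricsCollectorScriptContent`, also imported from `../common/util` and not shown in this diff. A hedged sketch of how it might branch per platform follows; only the output files (`gpu_metrics`, `pid`) and the `nni_gpu_tool.gpu_metrics_collector` module name are taken from the surrounding code, the script bodies themselves are assumptions.

// Illustrative only: build the collector script content for the scheduler's script folder.
import * as path from 'path';

export function getgpuMetricsCollectorScriptContent(scriptFolder: string): string {
    if (process.platform === 'win32') {
        // PowerShell flavour: start the collector module, redirect metrics, record its pid.
        return [
            `$process = Start-Process python -ArgumentList "-m nni_gpu_tool.gpu_metrics_collector" -PassThru -NoNewWindow ` +
            `-RedirectStandardOutput ${path.join(scriptFolder, 'gpu_metrics')}`,
            `Write $process.Id | Out-File ${path.join(scriptFolder, 'pid')} -NoNewline`
        ].join('\r\n');
    }
    // bash flavour: background the collector and write its pid next to the metrics file.
    return [
        '#!/bin/bash',
        `python3 -m nni_gpu_tool.gpu_metrics_collector > ${path.join(scriptFolder, 'gpu_metrics')} 2>/dev/null &`,
        `echo $! > ${path.join(scriptFolder, 'pid')}`
    ].join('\n');
}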
public getAvailableGPUIndices(): number[] {
if (this.gpuSummary !== undefined) {
return this.gpuSummary.gpuInfos.filter((info: GPUInfo) => info.activeProcessNum === 0)
......@@ -78,33 +92,16 @@ class GPUScheduler {
this.stopping = true;
try {
const pid: string = await fs.promises.readFile(path.join(this.gpuMetricCollectorScriptFolder, 'pid'), 'utf8');
await cpp.exec(`pkill -P ${pid}`);
await cpp.exec(`rm -rf ${this.gpuMetricCollectorScriptFolder}`);
await execKill(pid);
await execRemove(this.gpuMetricCollectorScriptFolder);
} catch (error) {
this.log.error(`GPU scheduler error: ${error}`);
}
}
/**
* Generate gpu metric collector shell script in local machine,
* used to run in remote machine, and will be deleted after uploaded from local.
*/
private async runGpuMetricsCollectorScript(): Promise<void> {
await cpp.exec(`mkdir -p ${this.gpuMetricCollectorScriptFolder}`);
//generate gpu_metrics_collector.sh
const gpuMetricsCollectorScriptPath: string = path.join(this.gpuMetricCollectorScriptFolder, 'gpu_metrics_collector.sh');
const gpuMetricsCollectorScriptContent: string = String.Format(
GPU_INFO_COLLECTOR_FORMAT,
this.gpuMetricCollectorScriptFolder,
path.join(this.gpuMetricCollectorScriptFolder, 'pid')
);
await fs.promises.writeFile(gpuMetricsCollectorScriptPath, gpuMetricsCollectorScriptContent, { encoding: 'utf8' });
cp.exec(`bash ${gpuMetricsCollectorScriptPath}`);
}
private async updateGPUSummary(): Promise<void> {
const cmdresult: cpp.childProcessPromise.Result =
await cpp.exec(`tail -n 1 ${path.join(this.gpuMetricCollectorScriptFolder, 'gpu_metrics')}`);
await execTail(path.join(this.gpuMetricCollectorScriptFolder, 'gpu_metrics'));
if (cmdresult && cmdresult.stdout) {
this.gpuSummary = <GPUSummary>JSON.parse(cmdresult.stdout);
} else {
......
......@@ -18,7 +18,6 @@
*/
'use strict';
import * as cpp from 'child-process-promise';
import * as cp from 'child_process';
import { EventEmitter } from 'events';
......@@ -32,7 +31,8 @@ import {
HostJobApplicationForm, HyperParameters, JobApplicationForm, TrainingService, TrialJobApplicationForm,
TrialJobDetail, TrialJobMetric, TrialJobStatus
} from '../../common/trainingService';
import { delay, generateParamFileName, getExperimentRootDir, getJobCancelStatus, uniqueString } from '../../common/utils';
import { delay, generateParamFileName, getExperimentRootDir, getJobCancelStatus, uniqueString, isAlive, getNewLine } from '../../common/utils';
import { execMkdir, getScriptName, execScript, setEnvironmentVariable, execNewFile } from '../common/util'
import { TrialConfig } from '../common/trialConfig';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
import { GPUScheduler } from './gpuScheduler';
......@@ -169,14 +169,7 @@ class LocalTrainingService implements TrainingService {
return this.getHostJob(trialJobId);
}
if (trialJob.status === 'RUNNING') {
let alive: boolean = false;
try {
await cpp.exec(`kill -0 ${trialJob.pid}`);
alive = true;
} catch (error) {
//ignore
}
let alive: boolean = await isAlive(trialJob.pid);
if (!alive) {
trialJob.endTime = Date.now();
this.setTrialJobStatus(trialJob, 'FAILED');
......@@ -284,7 +277,9 @@ class LocalTrainingService implements TrainingService {
public async setClusterMetadata(key: string, value: string): Promise<void> {
if (!this.initialized) {
this.rootDir = getExperimentRootDir();
await cpp.exec(`mkdir -p ${this.rootDir}`);
if(!fs.existsSync(this.rootDir)){
await cpp.exec(`powershell.exe mkdir ${this.rootDir}`);
}
this.initialized = true;
}
switch (key) {
......@@ -381,7 +376,7 @@ class LocalTrainingService implements TrainingService {
envVariables.push({
key: 'CUDA_VISIBLE_DEVICES',
value: this.gpuScheduler === undefined ? '' : resource.gpuIndices.join(',')
value: this.gpuScheduler === undefined ? '-1' : resource.gpuIndices.join(',')
});
return envVariables;
......@@ -465,36 +460,52 @@ class LocalTrainingService implements TrainingService {
}
}
private getScript(localTrailConfig: TrialConfig, workingDirectory: string): string[] {
    const script: string[] = [];
    if (process.platform === "win32") {
        // Windows: run the command through cmd, then let PowerShell write
        // "<exit code> <epoch ms>" into .nni/state (seconds since 1970 with "000" appended).
        script.push(
            `cmd /c ${localTrailConfig.command} 2>${path.join(workingDirectory, 'stderr')}`,
            `$NOW_DATE = [int64](([datetime]::UtcNow)-(get-date "1/1/1970")).TotalSeconds`,
            `$NOW_DATE = "$NOW_DATE" + "000"`,
            `Write $LASTEXITCODE " " $NOW_DATE | Out-File ${path.join(workingDirectory, '.nni', 'state')} -NoNewline -encoding utf8`);
    } else {
        // Linux/macOS: eval the command and append "<exit code> <epoch ms>" to .nni/state.
        script.push(
            `eval ${localTrailConfig.command} 2>${path.join(workingDirectory, 'stderr')}`,
            `echo $? \`date +%s000\` >${path.join(workingDirectory, '.nni', 'state')}`);
    }

    return script;
}
private async runTrialJob(trialJobId: string, resource: {gpuIndices: number[]}): Promise<void> {
const trialJobDetail: LocalTrialJobDetail = <LocalTrialJobDetail>this.jobMap.get(trialJobId);
const variables: { key: string; value: string }[] = this.getEnvironmentVariables(trialJobDetail, resource);
const runScriptLines: string[] = [];
if (!this.localTrailConfig) {
throw new Error('trial config is not initialized');
}
runScriptLines.push(
'#!/bin/bash',
`cd ${this.localTrailConfig.codeDir}`);
const runScriptLines: string[] = [];
if (process.platform !== "win32"){
runScriptLines.push('#!/bin/bash');
}
runScriptLines.push(`cd ${this.localTrailConfig.codeDir}`);
for (const variable of variables) {
runScriptLines.push(`export ${variable.key}=${variable.value}`);
runScriptLines.push(setEnvironmentVariable(variable));
}
runScriptLines.push(
`eval ${this.localTrailConfig.command} 2>${path.join(trialJobDetail.workingDirectory, 'stderr')}`,
`echo $? \`date +%s000\` >${path.join(trialJobDetail.workingDirectory, '.nni', 'state')}`);
await cpp.exec(`mkdir -p ${trialJobDetail.workingDirectory}`);
await cpp.exec(`mkdir -p ${path.join(trialJobDetail.workingDirectory, '.nni')}`);
await cpp.exec(`touch ${path.join(trialJobDetail.workingDirectory, '.nni', 'metrics')}`);
await fs.promises.writeFile(
path.join(trialJobDetail.workingDirectory, 'run.sh'), runScriptLines.join('\n'), { encoding: 'utf8', mode: 0o777 });
const scripts: string[] = this.getScript(this.localTrailConfig, trialJobDetail.workingDirectory);
scripts.forEach(script => {
runScriptLines.push(script);
});
await execMkdir(trialJobDetail.workingDirectory);
await execMkdir(path.join(trialJobDetail.workingDirectory, '.nni'));
await execNewFile(path.join(trialJobDetail.workingDirectory, '.nni', 'metrics'));
const scriptName: string = getScriptName('run');
await fs.promises.writeFile(path.join(trialJobDetail.workingDirectory, scriptName), runScriptLines.join(getNewLine()), { encoding: 'utf8', mode: 0o777 });
await this.writeParameterFile(trialJobDetail.workingDirectory, (<TrialJobApplicationForm>trialJobDetail.form).hyperParameters);
const process: cp.ChildProcess = cp.exec(`bash ${path.join(trialJobDetail.workingDirectory, 'run.sh')}`);
const trialJobProcess: cp.ChildProcess = execScript(path.join(trialJobDetail.workingDirectory, scriptName));
this.setTrialJobStatus(trialJobDetail, 'RUNNING');
trialJobDetail.startTime = Date.now();
trialJobDetail.pid = process.pid;
trialJobDetail.pid = trialJobProcess.pid;
this.setExtraProperties(trialJobDetail, resource);
let buffer: Buffer = Buffer.alloc(0);
......
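
Editor's note: the inline `kill -0` probe and the hard-coded `\n` separator are replaced by `isAlive` and `getNewLine` from `../../common/utils`, whose implementations are outside this diff. A minimal sketch under the same platform-split assumption (bodies are illustrative, not the library's actual code):

import * as cpp from 'child-process-promise';

// Sketch only: report whether a pid still refers to a running process.
export async function isAlive(pid: number): Promise<boolean> {
    try {
        if (process.platform === 'win32') {
            // tasklist exits 0 even with no match, so inspect its output instead.
            const result: cpp.childProcessPromise.Result = await cpp.exec(`tasklist /FI "PID eq ${pid}"`);
            return result.stdout !== undefined && result.stdout.includes(String(pid));
        }
        await cpp.exec(`kill -0 ${pid}`);  // throws if the process no longer exists
        return true;
    } catch (error) {
        return false;
    }
}

// Line separator for generated scripts: CRLF for PowerShell, LF for bash.
export function getNewLine(): string {
    return process.platform === 'win32' ? '\r\n' : '\n';
}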
......@@ -46,7 +46,7 @@ import {
RemoteMachineScheduleInfo, RemoteMachineScheduleResult, SSHClient, SSHClientManager,
RemoteMachineTrialJobDetail, ScheduleResultType, REMOTEMACHINE_TRIAL_COMMAND_FORMAT
} from './remoteMachineData';
import { GPU_INFO_COLLECTOR_FORMAT } from '../common/gpuData';
import { GPU_INFO_COLLECTOR_FORMAT_LINUX } from '../common/gpuData';
import { SSHClientUtility } from './sshClientUtility';
import { validateCodeDir } from '../common/util';
import { RemoteMachineJobRestServer } from './remoteMachineJobRestServer';
......@@ -452,7 +452,7 @@ class RemoteMachineTrainingService implements TrainingService {
let gpuMetricsCollectorScriptPath: string = path.join(gpuMetricCollectorScriptFolder, userName, 'gpu_metrics_collector.sh');
const remoteGPUScriptsDir: string = this.getRemoteScriptsPath(userName); // This directory is used to store gpu_metrics and pid created by script
const gpuMetricsCollectorScriptContent: string = String.Format(
GPU_INFO_COLLECTOR_FORMAT,
GPU_INFO_COLLECTOR_FORMAT_LINUX,
remoteGPUScriptsDir,
path.join(remoteGPUScriptsDir, 'pid'),
);
......
......@@ -31,7 +31,7 @@ import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
import { LocalTrainingService } from '../local/localTrainingService';
// TODO: copy mockedTrail.py to local folder
const localCodeDir: string = tmp.dirSync().name
const localCodeDir: string = tmp.dirSync().name.split('\\').join('\\\\');
const mockedTrialPath: string = './training_service/test/mockedTrial.py'
fs.copyFileSync(mockedTrialPath, localCodeDir + '/mockedTrial.py')
......
......@@ -33,7 +33,8 @@ log_level_map = {
'debug': logging.DEBUG
}
_time_format = '%m/%d/%Y, %I:%M:%S %P'
_time_format = '%m/%d/%Y, %I:%M:%S %p'
class _LoggerFileWrapper(TextIOBase):
def __init__(self, logger_file):
self.file = logger_file
......
......@@ -19,6 +19,7 @@
# ==================================================================================================
import os
import sys
import json
import time
import subprocess
......@@ -87,6 +88,10 @@ def send_metric(string):
assert len(data) < 1000000, 'Metric too long'
_metric_file.write(b'ME%06d%b' % (len(data), data))
_metric_file.flush()
if sys.platform == "win32":
    file = open(_metric_file.name)
    file.close()
else:
    subprocess.run(['touch', _metric_file.name], check=True)
def get_sequence_id():
......
......@@ -18,6 +18,8 @@
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import sys
import glob
import argparse
from utils import get_yml_content, dump_yml_content
......@@ -69,6 +71,19 @@ def update_training_service_config(args):
dump_yml_content(TRAINING_SERVICE_FILE, config)
def convert_command():
    '''convert command by platform'''
    if sys.platform != 'win32':
        return None
    config_files = glob.glob('./**/*.yml') + glob.glob('./**/**/*.yml')
    for config_file in config_files:
        print('processing {}'.format(config_file))
        yml_content = get_yml_content(config_file)
        if yml_content.get('trial'):
            if yml_content['trial'].get('command'):
                yml_content['trial']['command'] = yml_content['trial']['command'].replace('python3', 'python')
                dump_yml_content(config_file, yml_content)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--ts", type=str, choices=['pai', 'kubeflow', 'remote'], default='pai')
......@@ -96,3 +111,4 @@ if __name__ == '__main__':
args = parser.parse_args()
update_training_service_config(args)
convert_command()
jobs:
- job: 'Test'
  steps:
  - script: |
      powershell.exe -file install.ps1
    displayName: 'Install nni toolkit via source code'
  - script: |
      python -m pip install scikit-learn==0.20.0 --user
      python -m pip install keras==2.1.6 --user
      python -m pip install https://download.pytorch.org/whl/cu90/torch-0.4.1-cp36-cp36m-win_amd64.whl --user
      python -m pip install torchvision --user
      python -m pip install tensorflow-gpu==1.11.0 --user
    displayName: 'Install dependencies for integration tests'
  - script: |
      cd test
      python generate_ts_config.py
    displayName: 'generate config files'
  - script: |
      cd test
      python config_test.py --ts local --local_gpu --exclude smac,bohb
    displayName: 'Examples and advanced features tests on local machine'
  - script: |
      cd test
      powershell.exe -file unittest.ps1
    displayName: 'unit test'
  - script: |
      cd test
      python naive_test.py
    displayName: 'Naive test'
  - script: |
      cd test
      python tuner_test.py
    displayName: 'Built-in tuners / assessors tests'
  - script: |
      cd test
      python metrics_test.py
    displayName: 'Trial job metrics test'
$CWD = $PWD
# -------------For python unittest-------------
## ------Run annotation test------
echo ""
echo "===========================Testing: nni_annotation==========================="
cd $CWD/../tools/
python -m unittest -v nni_annotation/test_annotation.py
## Export certain environment variables for unittest code to work
$env:NNI_TRIAL_JOB_ID="test_trial_job_id"
$env:NNI_PLATFORM="unittest"
## ------Run sdk test------
echo ""
echo "===========================Testing: nni_sdk==========================="
cd $CWD/../src/sdk/pynni/
python -m unittest discover -v tests
# -------------For typescript unittest-------------
cd $CWD/../src/nni_manager
echo ""
echo "===========================Testing: nni_manager==========================="
npm run test
......@@ -22,6 +22,7 @@ import contextlib
import collections
import json
import os
import sys
import subprocess
import requests
import ruamel.yaml as yaml
......@@ -65,7 +66,7 @@ def dump_yml_content(file_path, content):
def setup_experiment(installed=True):
'''setup the experiment if nni is not installed'''
if not installed:
os.environ['PATH'] = os.environ['PATH'] + ':' + os.environ['PWD']
os.environ['PATH'] = os.environ['PATH'] + ':' + os.getcwd()
sdk_path = os.path.abspath('../src/sdk/pynni')
cmd_path = os.path.abspath('../tools')
pypath = os.environ.get('PYTHONPATH')
......@@ -79,7 +80,7 @@ def fetch_nni_log_path(experiment_url):
'''get nni's log path from nni's experiment url'''
experiment_profile = requests.get(experiment_url)
experiment_id = json.loads(experiment_profile.text)['id']
experiment_path = os.path.join(os.environ['HOME'], 'nni/experiments', experiment_id)
experiment_path = os.path.join(os.path.expanduser('~'), 'nni', 'experiments', experiment_id)
nnimanager_log_path = os.path.join(experiment_path, 'log', 'nnimanager.log')
return nnimanager_log_path
......@@ -87,6 +88,9 @@ def fetch_nni_log_path(experiment_url):
def is_experiment_done(nnimanager_log_path):
'''check if the experiment is done successfully'''
assert os.path.exists(nnimanager_log_path), 'Experiment starts failed'
if sys.platform == "win32":
    cmds = ['type', nnimanager_log_path, '|', 'find', EXPERIMENT_DONE_SIGNAL]
else:
    cmds = ['cat', nnimanager_log_path, '|', 'grep', EXPERIMENT_DONE_SIGNAL]
completed_process = subprocess.run(' '.join(cmds), shell=True)
......@@ -112,6 +116,9 @@ def print_stderr(trial_jobs_url):
for trial_job in trial_jobs:
if trial_job['status'] == 'FAILED':
stderr_path = trial_job['stderrPath'].split(':')[-1]
if sys.platform == "win32":
    subprocess.run(['type', stderr_path], shell=True)
else:
    subprocess.run(['cat', stderr_path])
def parse_max_duration_time(max_exec_duration):
......
......@@ -20,6 +20,7 @@
import os
import sys
import shutil
from . import code_generator
......@@ -28,6 +29,9 @@ from . import search_space_generator
__all__ = ['generate_search_space', 'expand_annotations']
slash = '/'
if sys.platform == "win32":
    slash = '\\'
def generate_search_space(code_dir):
"""Generate search space from Python source code.
......@@ -36,7 +40,7 @@ def generate_search_space(code_dir):
"""
search_space = {}
if code_dir.endswith('/'):
if code_dir.endswith(slash):
code_dir = code_dir[:-1]
for subdir, _, files in os.walk(code_dir):
......@@ -44,9 +48,9 @@ def generate_search_space(code_dir):
if subdir == code_dir:
package = ''
else:
assert subdir.startswith(code_dir + '/'), subdir
assert subdir.startswith(code_dir + slash), subdir
prefix_len = len(code_dir) + 1
package = subdir[prefix_len:].replace('/', '.') + '.'
package = subdir[prefix_len:].replace(slash, '.') + '.'
for file_name in files:
if file_name.endswith('.py'):
......@@ -76,9 +80,10 @@ def expand_annotations(src_dir, dst_dir):
src_dir: directory path of user code (str)
dst_dir: directory to place generated files (str)
"""
if src_dir[-1] == '/':
if src_dir[-1] == slash:
src_dir = src_dir[:-1]
if dst_dir[-1] == '/':
if dst_dir[-1] == slash:
dst_dir = dst_dir[:-1]
annotated = False
......
from subprocess import call, check_output
import sys
import os
import signal
import psutil
from .common_utils import print_error, print_normal, print_warning
def check_output_command(file_path, head=None, tail=None):
    '''call check_output command to read content from a file'''
    if os.path.exists(file_path):
        if sys.platform == 'win32':
            cmds = ['powershell.exe', 'type', file_path]
            if head:
                cmds += ['|', 'select', '-first', str(head)]
            elif tail:
                cmds += ['|', 'select', '-last', str(tail)]
            return check_output(cmds, shell=True).decode('utf-8')
        else:
            cmds = ['cat', file_path]
            if head:
                cmds = ['head', '-' + str(head), file_path]
            elif tail:
                cmds = ['tail', '-' + str(tail), file_path]
            return check_output(cmds, shell=False).decode('utf-8')
    else:
        print_error('{0} does not exist!'.format(file_path))
        exit(1)

def kill_command(pid):
    '''kill command'''
    if sys.platform == 'win32':
        process = psutil.Process(pid=pid)
        process.send_signal(signal.CTRL_BREAK_EVENT)
    else:
        cmds = ['kill', str(pid)]
        call(cmds)

def install_package_command(package_name):
    '''install python package from pip'''
    #TODO refactor python logic
    if sys.platform == "win32":
        cmds = 'python -m pip install --user {0}'.format(package_name)
    else:
        cmds = 'python3 -m pip install --user {0}'.format(package_name)
    call(cmds, shell=True)

def install_requirements_command(requirements_path):
    '''install requirements.txt'''
    cmds = 'cd ' + requirements_path + ' && {0} -m pip install --user -r requirements.txt'
    #TODO refactor python logic
    if sys.platform == "win32":
        cmds = cmds.format('python')
    else:
        cmds = cmds.format('python3')
    call(cmds, shell=True)
......@@ -18,10 +18,13 @@
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import os
import sys
import json
import ruamel.yaml as yaml
import psutil
import socket
from pathlib import Path
from .constants import ERROR_INFO, NORMAL_INFO, WARNING_INFO, COLOR_RED_FORMAT, COLOR_YELLOW_FORMAT
def get_yml_content(file_path):
......@@ -71,3 +74,15 @@ def detect_port(port):
return True
except:
return False
def get_user():
    if sys.platform == 'win32':
        return os.environ['USERNAME']
    else:
        return os.environ['USER']

def get_python_dir(sitepackages_path):
    if sys.platform == "win32":
        return str(Path(sitepackages_path))
    else:
        return str(Path(sitepackages_path).parents[2])
\ No newline at end of file
......@@ -19,8 +19,9 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import os
from colorama import Fore
NNICTL_HOME_DIR = os.path.join(os.environ['HOME'], '.local', 'nnictl')
NNICTL_HOME_DIR = os.path.join(os.path.expanduser('~'), '.local', 'nnictl')
ERROR_INFO = 'ERROR: %s'
......@@ -32,7 +33,7 @@ DEFAULT_REST_PORT = 8080
REST_TIME_OUT = 20
EXPERIMENT_SUCCESS_INFO = '\033[1;32;32mSuccessfully started experiment!\n\033[0m' \
EXPERIMENT_SUCCESS_INFO = Fore.GREEN + 'Successfully started experiment!\n' + Fore.RESET + \
'-----------------------------------------------------------------------\n' \
'The experiment id is %s\n'\
'The Web UI urls are: %s\n' \
......@@ -94,11 +95,11 @@ TUNERS_NO_NEED_TO_IMPORT_DATA = {
'Hyperband'
}
COLOR_RED_FORMAT = '\033[1;31;31m%s\033[0m'
COLOR_RED_FORMAT = Fore.RED + '%s'
COLOR_GREEN_FORMAT = '\033[1;32;32m%s\033[0m'
COLOR_GREEN_FORMAT = Fore.GREEN + '%s'
COLOR_YELLOW_FORMAT = '\033[1;33;33m%s\033[0m'
COLOR_YELLOW_FORMAT = Fore.YELLOW + '%s'
SCHEMA_TYPE_ERROR = '%s should be %s type!'
......
......@@ -32,12 +32,13 @@ from .launcher_utils import validate_all_content
from .rest_utils import rest_put, rest_post, check_rest_server, check_rest_server_quick, check_response
from .url_utils import cluster_metadata_url, experiment_url, get_local_urls
from .config_utils import Config, Experiments
from .common_utils import get_yml_content, get_json_content, print_error, print_normal, print_warning, detect_process, detect_port
from .common_utils import get_yml_content, get_json_content, print_error, print_normal, print_warning, detect_process, detect_port, get_user, get_python_dir
from .constants import *
import random
import site
import time
from pathlib import Path
from .command_utils import check_output_command, kill_command
def get_log_path(config_file_name):
'''generate stdout and stderr log path'''
......@@ -49,14 +50,10 @@ def print_log_content(config_file_name):
'''print log information'''
stdout_full_path, stderr_full_path = get_log_path(config_file_name)
print_normal(' Stdout:')
stdout_cmds = ['cat', stdout_full_path]
stdout_content = check_output(stdout_cmds)
print(stdout_content.decode('utf-8'))
print(check_output_command(stdout_full_path))
print('\n\n')
print_normal(' Stderr:')
stderr_cmds = ['cat', stderr_full_path]
stderr_content = check_output(stderr_cmds)
print(stderr_content.decode('utf-8'))
print(check_output_command(stderr_full_path))
def get_nni_installation_path():
''' Find nni lib from the following locations in order
......@@ -67,7 +64,7 @@ def get_nni_installation_path():
Return None if nothing is found
'''
def _generate_installation_path(sitepackages_path):
python_dir = str(Path(sitepackages_path).parents[2])
python_dir = get_python_dir(sitepackages_path)
entry_file = os.path.join(python_dir, 'nni', 'main.js')
if os.path.isfile(entry_file):
return python_dir
......@@ -132,6 +129,10 @@ def start_rest_server(port, platform, mode, config_file_name, experiment_id=None
log_header = LOG_HEADER % str(time_now)
stdout_file.write(log_header)
stderr_file.write(log_header)
if sys.platform == 'win32':
    from subprocess import CREATE_NEW_PROCESS_GROUP
    process = Popen(cmds, cwd=entry_dir, stdout=stdout_file, stderr=stderr_file, creationflags=CREATE_NEW_PROCESS_GROUP)
else:
    process = Popen(cmds, cwd=entry_dir, stdout=stdout_file, stderr=stderr_file)
return process, str(time_now)
......@@ -357,7 +358,7 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen
nni_config.set_config('restServerPid', rest_process.pid)
# Deal with annotation
if experiment_config.get('useAnnotation'):
path = os.path.join(tempfile.gettempdir(), os.environ['USER'], 'nni', 'annotation')
path = os.path.join(tempfile.gettempdir(), get_user(), 'nni', 'annotation')
if not os.path.isdir(path):
os.makedirs(path)
path = tempfile.mkdtemp(dir=path)
......@@ -380,8 +381,7 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen
print_error('Restful server start failed!')
print_log_content(config_file_name)
try:
cmds = ['kill', str(rest_process.pid)]
call(cmds)
kill_command(rest_process.pid)
except Exception:
raise Exception(ERROR_INFO % 'Rest server stopped!')
exit(1)
......@@ -395,8 +395,7 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen
else:
print_error('Failed! Error is: {}'.format(err_msg))
try:
cmds = ['kill', str(rest_process.pid)]
call(cmds)
kill_command(rest_process.pid)
except Exception:
raise Exception(ERROR_INFO % 'Rest server stopped!')
exit(1)
......@@ -409,8 +408,7 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen
else:
print_error('Set local config failed!')
try:
cmds = ['kill', str(rest_process.pid)]
call(cmds)
kill_command(rest_process.pid)
except Exception:
raise Exception(ERROR_INFO % 'Rest server stopped!')
exit(1)
......@@ -425,8 +423,7 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen
if err_msg:
print_error('Failed! Error is: {}'.format(err_msg))
try:
cmds = ['kill', str(rest_process.pid)]
call(cmds)
kill_command(rest_process.pid)
except Exception:
raise Exception(ERROR_INFO % 'Restful server stopped!')
exit(1)
......@@ -441,8 +438,7 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen
if err_msg:
print_error('Failed! Error is: {}'.format(err_msg))
try:
cmds = ['pkill', str(rest_process.pid)]
call(cmds)
kill_command(rest_process.pid)
except Exception:
raise Exception(ERROR_INFO % 'Restful server stopped!')
exit(1)
......@@ -457,8 +453,7 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen
if err_msg:
print_error('Failed! Error is: {}'.format(err_msg))
try:
cmds = ['pkill', str(rest_process.pid)]
call(cmds)
kill_command(rest_process.pid)
except Exception:
raise Exception(ERROR_INFO % 'Restful server stopped!')
exit(1)
......@@ -477,8 +472,7 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen
print_error('Start experiment failed!')
print_log_content(config_file_name)
try:
cmds = ['kill', str(rest_process.pid)]
call(cmds)
kill_command(rest_process.pid)
except Exception:
raise Exception(ERROR_INFO % 'Restful server stopped!')
exit(1)
......
......@@ -27,6 +27,8 @@ from .nnictl_utils import *
from .package_management import *
from .constants import *
from .tensorboard_utils import *
from colorama import init
init(autoreset=True)
if os.environ.get('COVERAGE_PROCESS_START'):
import coverage
......
......@@ -24,7 +24,6 @@ import psutil
import json
import datetime
import time
from subprocess import call, check_output
from .rest_utils import rest_get, rest_delete, check_rest_server_quick, check_response
from .config_utils import Config, Experiments
......@@ -32,6 +31,7 @@ from .url_utils import trial_jobs_url, experiment_url, trial_job_id_url
from .constants import NNICTL_HOME_DIR, EXPERIMENT_INFORMATION_FORMAT, EXPERIMENT_DETAIL_FORMAT, \
EXPERIMENT_MONITOR_INFO, TRIAL_MONITOR_HEAD, TRIAL_MONITOR_CONTENT, TRIAL_MONITOR_TAIL, REST_TIME_OUT
from .common_utils import print_normal, print_error, print_warning, detect_process
from .command_utils import check_output_command, kill_command
def get_experiment_time(port):
'''get the startTime and endTime of an experiment'''
......@@ -219,14 +219,12 @@ def stop_experiment(args):
rest_port = nni_config.get_config('restServerPort')
rest_pid = nni_config.get_config('restServerPid')
if rest_pid:
stop_rest_cmds = ['kill', str(rest_pid)]
call(stop_rest_cmds)
kill_command(rest_pid)
tensorboard_pid_list = nni_config.get_config('tensorboardPidList')
if tensorboard_pid_list:
for tensorboard_pid in tensorboard_pid_list:
try:
cmds = ['kill', '-9', str(tensorboard_pid)]
call(cmds)
kill_command(tensorboard_pid)
except Exception as exception:
print_error(exception)
nni_config.set_config('tensorboardPidList', [])
......@@ -303,14 +301,6 @@ def experiment_status(args):
else:
print(json.dumps(json.loads(response.text), indent=4, sort_keys=True, separators=(',', ':')))
def get_log_content(file_name, cmds):
'''use cmds to read config content'''
if os.path.exists(file_name):
rest = check_output(cmds)
print(rest.decode('utf-8'))
else:
print_normal('NULL!')
def log_internal(args, filetype):
'''internal function to call get_log_content'''
file_name = get_config_filename(args)
......@@ -318,14 +308,7 @@ def log_internal(args, filetype):
file_full_path = os.path.join(NNICTL_HOME_DIR, file_name, 'stdout')
else:
file_full_path = os.path.join(NNICTL_HOME_DIR, file_name, 'stderr')
if args.head:
get_log_content(file_full_path, ['head', '-' + str(args.head), file_full_path])
elif args.tail:
get_log_content(file_full_path, ['tail', '-' + str(args.tail), file_full_path])
elif args.path:
print_normal('The path of stdout file is: ' + file_full_path)
else:
get_log_content(file_full_path, ['cat', file_full_path])
print(check_output_command(file_full_path, head=args.head, tail=args.tail))
def log_stdout(args):
'''get stdout log'''
......
......@@ -20,17 +20,18 @@
import nni
import os
import sys
from subprocess import call
from .constants import PACKAGE_REQUIREMENTS
from .common_utils import print_normal, print_error
from .command_utils import install_requirements_command
def process_install(package_name):
if PACKAGE_REQUIREMENTS.get(package_name) is None:
print_error('{0} is not supported!'.format(package_name))
else:
requirements_path = os.path.join(nni.__path__[0], PACKAGE_REQUIREMENTS[package_name])
cmds = 'cd ' + requirements_path + ' && python3 -m pip install --user -r requirements.txt'
call(cmds, shell=True)
install_requirements_command(requirements_path)
def package_install(args):
'''install packages'''
......
......@@ -21,14 +21,14 @@
import os
from .common_utils import print_error
from subprocess import call
from .command_utils import install_package_command
def check_environment():
'''check if paramiko is installed'''
try:
import paramiko
except:
cmds = 'python3 -m pip install --user paramiko'
call(cmds, shell=True)
install_package_command('paramiko')
def copy_remote_directory_to_local(sftp, remote_path, local_path):
'''copy remote directory to local machine'''
......
......@@ -25,6 +25,9 @@ import time
from xml.dom import minidom
def check_ready_to_run():
#TODO check process in windows
if sys.platform == 'win32':
    return True
pgrep_output =subprocess.check_output('pgrep -fx \'python3 -m nni_gpu_tool.gpu_metrics_collector\'', shell=True)
pidList = []
for pid in pgrep_output.splitlines():
......