Unverified Commit d9c83c0c authored by chicm-ms's avatar chicm-ms Committed by GitHub
Browse files

Configurable nniManager log path and log level (#644)

* Pull code (#22)

* Support distributed job for frameworkcontroller (#612)

support distributed job for frameworkcontroller

* Multiphase doc (#519)

* multiPhase doc

* updates

* updates

* Add time parser for 'nnictl update duration' (#632)

Currently, `nnictl update duration` only supports seconds as the unit; add a parser for this command to support the units {s, m, h, d}.

* fix experiment state bug (#629)

* update top README.md (#622)

* Update README.md

* update (#634)

* Integration tests refactoring (#625)

* Integration test refactoring (#21) (#616)

* Integration test refactoring (#21)

* Refactoring integration tests

* test metrics

* update azure pipeline

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* update trigger

* Integration test refactoring (#618)

* updates

* updates

* update pipeline (#619)

* update pipeline

* updates

* updates

* updates

* updates

* updates

* test pipeline (#623)

* test pipeline

* updates

* updates

* updates

* Update integration test (#624)

* Update integration test

* updates

* updates

* updates

* updates

* updates

* updates

* Revert "Pull code (#22)"

This reverts commit 62fc165ad7b2ba724eead3b99f010aa34491e2c7.

* Configurable nniManager log path

* Configure log level

* add --debug command line for nnictl

* updates
parent 0f9fbf87
...@@ -177,8 +177,18 @@ machineList: ...@@ -177,8 +177,18 @@ machineList:
__nniManagerIp__ set the IP address of the machine on which nni manager process runs. This field is optional, and if it's not set, eth0 device IP will be used instead. __nniManagerIp__ set the IP address of the machine on which nni manager process runs. This field is optional, and if it's not set, eth0 device IP will be used instead.
Note: run ifconfig on NNI manager's machine to check if eth0 device exists. If not, we recommend to set nnimanagerIp explicitly. Note: run ifconfig on NNI manager's machine to check if eth0 device exists. If not, we recommend to set nnimanagerIp explicitly.
* __logDir__
* Description
__logDir__ configures the directory in which the logs and data of the experiment are stored; a sub-directory named after the experiment ID is created inside it. The default value is `<user home directory>/nni/experiments`.
* __logLevel__
* Description
__logLevel__ sets the log level for the experiment. Available log levels are: `trace, debug, info, warning, error, fatal`. The default value is `info`.
* __tuner__ * __tuner__
* Description * Description
......
...@@ -43,6 +43,7 @@ nnictl --version ...@@ -43,6 +43,7 @@ nnictl --version
| ------ | ------ | ------ |------ | | ------ | ------ | ------ |------ |
| --config, -c| True| |yaml configure file of the experiment| | --config, -c| True| |yaml configure file of the experiment|
| --port, -p | False| |the port of restful server| | --port, -p | False| |the port of restful server|
| --debug, -d | False| |Set log level to debug|
* __nnictl resume__ * __nnictl resume__
...@@ -62,6 +63,7 @@ nnictl --version ...@@ -62,6 +63,7 @@ nnictl --version
| ------ | ------ | ------ |------ | | ------ | ------ | ------ |------ |
| id| False| |The id of the experiment you want to resume| | id| False| |The id of the experiment you want to resume|
| --port, -p| False| |Rest port of the experiment you want to resume| | --port, -p| False| |Rest port of the experiment you want to resume|
| --debug, -d | False| |Set log level to debug|
* __nnictl stop__ * __nnictl stop__
* Description * Description
......
...@@ -20,6 +20,8 @@ ...@@ -20,6 +20,8 @@
'use strict'; 'use strict';
import * as assert from 'assert'; import * as assert from 'assert';
import * as os from 'os';
import * as path from 'path';
import * as component from '../common/component'; import * as component from '../common/component';
@component.Singleton @component.Singleton
...@@ -29,8 +31,10 @@ class ExperimentStartupInfo { ...@@ -29,8 +31,10 @@ class ExperimentStartupInfo {
private basePort: number = -1; private basePort: number = -1;
private initialized: boolean = false; private initialized: boolean = false;
private initTrialSequenceID: number = 0; private initTrialSequenceID: number = 0;
private logDir: string = '';
private logLevel: string = '';
public setStartupInfo(newExperiment: boolean, experimentId: string, basePort: number): void { public setStartupInfo(newExperiment: boolean, experimentId: string, basePort: number, logDir?: string, logLevel?: string): void {
assert(!this.initialized); assert(!this.initialized);
assert(experimentId.trim().length > 0); assert(experimentId.trim().length > 0);
...@@ -38,6 +42,16 @@ class ExperimentStartupInfo { ...@@ -38,6 +42,16 @@ class ExperimentStartupInfo {
this.experimentId = experimentId; this.experimentId = experimentId;
this.basePort = basePort; this.basePort = basePort;
this.initialized = true; this.initialized = true;
if (logDir !== undefined && logDir.length > 0) {
this.logDir = path.join(logDir, getExperimentId());
} else {
this.logDir = path.join(os.homedir(), 'nni', 'experiments', getExperimentId());
}
if (logLevel !== undefined && logLevel.length > 1) {
this.logLevel = logLevel;
}
} }
public getExperimentId(): string { public getExperimentId(): string {
...@@ -58,6 +72,18 @@ class ExperimentStartupInfo { ...@@ -58,6 +72,18 @@ class ExperimentStartupInfo {
return this.newExperiment; return this.newExperiment;
} }
/**
 * Returns the directory computed by setStartupInfo for this experiment:
 * `<logDir>/<experimentId>` when a non-empty logDir was supplied, otherwise
 * `<home directory>/nni/experiments/<experimentId>`.
 * Asserts that setStartupInfo has already been called.
 */
public getLogDir(): string {
assert(this.initialized);
return this.logDir;
}
/**
 * Returns the log level name stored by setStartupInfo, or the empty string
 * when no log level was supplied.
 * Asserts that setStartupInfo has already been called.
 */
public getLogLevel(): string {
assert(this.initialized);
return this.logLevel;
}
public setInitTrialSequenceId(initSequenceId: number): void { public setInitTrialSequenceId(initSequenceId: number): void {
assert(this.initialized); assert(this.initialized);
this.initTrialSequenceID = initSequenceId; this.initTrialSequenceID = initSequenceId;
...@@ -90,9 +116,15 @@ function getInitTrialSequenceId(): number { ...@@ -90,9 +116,15 @@ function getInitTrialSequenceId(): number {
return component.get<ExperimentStartupInfo>(ExperimentStartupInfo).getInitTrialSequenceId(); return component.get<ExperimentStartupInfo>(ExperimentStartupInfo).getInitTrialSequenceId();
} }
function setExperimentStartupInfo(newExperiment: boolean, experimentId: string, basePort: number): void { function getExperimentStartupInfo(): ExperimentStartupInfo {
component.get<ExperimentStartupInfo>(ExperimentStartupInfo).setStartupInfo(newExperiment, experimentId, basePort); return component.get<ExperimentStartupInfo>(ExperimentStartupInfo);
}
function setExperimentStartupInfo(
newExperiment: boolean, experimentId: string, basePort: number, logDir?: string, logLevel?: string): void {
component.get<ExperimentStartupInfo>(ExperimentStartupInfo)
.setStartupInfo(newExperiment, experimentId, basePort, logDir, logLevel);
} }
export { ExperimentStartupInfo, getBasePort, getExperimentId, isNewExperiment, export { ExperimentStartupInfo, getBasePort, getExperimentId, isNewExperiment, getExperimentStartupInfo,
setExperimentStartupInfo, setInitTrialSequenceId, getInitTrialSequenceId }; setExperimentStartupInfo, setInitTrialSequenceId, getInitTrialSequenceId };
...@@ -26,13 +26,18 @@ import { Writable } from 'stream'; ...@@ -26,13 +26,18 @@ import { Writable } from 'stream';
import { WritableStreamBuffer } from 'stream-buffers'; import { WritableStreamBuffer } from 'stream-buffers';
import { format } from 'util'; import { format } from 'util';
import * as component from '../common/component'; import * as component from '../common/component';
import { getExperimentStartupInfo } from './experimentStartupInfo';
import { getLogDir } from './utils'; import { getLogDir } from './utils';
const CRITICAL: number = 1; const FATAL: number = 1;
const ERROR: number = 2; const ERROR: number = 2;
const WARNING: number = 3; const WARNING: number = 3;
const INFO: number = 4; const INFO: number = 4;
const DEBUG: number = 5; const DEBUG: number = 5;
const TRACE: number = 6;
// Maps the lower-case log level names accepted in configuration to the numeric
// level constants above. A message is emitted when the logger's level is
// greater than or equal to the message's constant, so a larger number means
// more verbose output.
const logLevelNameMap: Map<string, number> = new Map([['fatal', FATAL],
['error', ERROR], ['warning', WARNING], ['info', INFO], ['debug', DEBUG], ['trace', TRACE]]);
class BufferSerialEmitter { class BufferSerialEmitter {
private buffer: Buffer; private buffer: Buffer;
...@@ -83,12 +88,25 @@ class Logger { ...@@ -83,12 +88,25 @@ class Logger {
autoClose: true autoClose: true
}); });
this.bufferSerialEmitter = new BufferSerialEmitter(this.writable); this.bufferSerialEmitter = new BufferSerialEmitter(this.writable);
const logLevelName: string = getExperimentStartupInfo()
.getLogLevel();
const logLevel: number | undefined = logLevelNameMap.get(logLevelName);
if (logLevel !== undefined) {
this.level = logLevel;
}
} }
public close() { public close() {
this.writable.destroy(); this.writable.destroy();
} }
/**
 * Writes a record tagged 'TRACE' when the logger's level is at least TRACE;
 * otherwise does nothing.
 *
 * @param param values forwarded unchanged to the internal log routine
 */
public trace(...param: any[]): void {
    // Guard clause: bail out unless the configured level admits TRACE output.
    if (!(this.level >= TRACE)) {
        return;
    }
    this.log('TRACE', param);
}
public debug(...param: any[]): void { public debug(...param: any[]): void {
if (this.level >= DEBUG) { if (this.level >= DEBUG) {
this.log('DEBUG', param); this.log('DEBUG', param);
...@@ -113,8 +131,8 @@ class Logger { ...@@ -113,8 +131,8 @@ class Logger {
} }
} }
public critical(...param: any[]): void { public fatal(...param: any[]): void {
this.log('CRITICAL', param); this.log('FATAL', param);
} }
private log(level: string, param: any[]): void { private log(level: string, param: any[]): void {
......
...@@ -30,13 +30,14 @@ import { Container } from 'typescript-ioc'; ...@@ -30,13 +30,14 @@ import { Container } from 'typescript-ioc';
import * as util from 'util'; import * as util from 'util';
import { Database, DataStore } from './datastore'; import { Database, DataStore } from './datastore';
import { ExperimentStartupInfo, getExperimentId, setExperimentStartupInfo } from './experimentStartupInfo'; import { ExperimentStartupInfo, getExperimentId, getExperimentStartupInfo, setExperimentStartupInfo } from './experimentStartupInfo';
import { Manager } from './manager'; import { Manager } from './manager';
import { HyperParameters, TrainingService, TrialJobStatus } from './trainingService'; import { HyperParameters, TrainingService, TrialJobStatus } from './trainingService';
import { getLogger } from './log'; import { getLogger } from './log';
function getExperimentRootDir(): string { function getExperimentRootDir(): string {
return path.join(os.homedir(), 'nni', 'experiments', getExperimentId()); return getExperimentStartupInfo()
.getLogDir();
} }
function getLogDir(): string{ function getLogDir(): string{
......
...@@ -35,7 +35,7 @@ import { ...@@ -35,7 +35,7 @@ import {
import { import {
TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, TrialJobStatus TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, TrialJobStatus
} from '../common/trainingService'; } from '../common/trainingService';
import { delay, getCheckpointDir, getLogDir, getMsgDispatcherCommand, mkDirP } from '../common/utils'; import { delay, getCheckpointDir, getExperimentRootDir, getLogDir, getMsgDispatcherCommand, mkDirP } from '../common/utils';
import { import {
ADD_CUSTOMIZED_TRIAL_JOB, INITIALIZE, INITIALIZED, KILL_TRIAL_JOB, NEW_TRIAL_JOB, NO_MORE_TRIAL_JOBS, PING, ADD_CUSTOMIZED_TRIAL_JOB, INITIALIZE, INITIALIZED, KILL_TRIAL_JOB, NEW_TRIAL_JOB, NO_MORE_TRIAL_JOBS, PING,
REPORT_METRIC_DATA, REQUEST_TRIAL_JOBS, SEND_TRIAL_JOB_PARAMETER, TERMINATE, TRIAL_END, UPDATE_SEARCH_SPACE REPORT_METRIC_DATA, REQUEST_TRIAL_JOBS, SEND_TRIAL_JOB_PARAMETER, TERMINATE, TRIAL_END, UPDATE_SEARCH_SPACE
...@@ -670,7 +670,7 @@ class NNIManager implements Manager { ...@@ -670,7 +670,7 @@ class NNIManager implements Manager {
id: getExperimentId(), id: getExperimentId(),
revision: 0, revision: 0,
execDuration: 0, execDuration: 0,
logDir: getLogDir(), logDir: getExperimentRootDir(),
maxSequenceId: 0, maxSequenceId: 0,
params: { params: {
authorName: '', authorName: '',
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
import { Container, Scope } from 'typescript-ioc'; import { Container, Scope } from 'typescript-ioc';
import * as component from './common/component'; import * as component from './common/component';
import * as fs from 'fs';
import { Database, DataStore } from './common/datastore'; import { Database, DataStore } from './common/datastore';
import { setExperimentStartupInfo } from './common/experimentStartupInfo'; import { setExperimentStartupInfo } from './common/experimentStartupInfo';
import { getLogger, Logger } from './common/log'; import { getLogger, Logger } from './common/log';
...@@ -40,10 +41,10 @@ import { PAITrainingService } from './training_service/pai/paiTrainingService'; ...@@ -40,10 +41,10 @@ import { PAITrainingService } from './training_service/pai/paiTrainingService';
import { KubeflowTrainingService } from './training_service/kubernetes/kubeflow/kubeflowTrainingService'; import { KubeflowTrainingService } from './training_service/kubernetes/kubeflow/kubeflowTrainingService';
import { FrameworkControllerTrainingService } from './training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService'; import { FrameworkControllerTrainingService } from './training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService';
function initStartupInfo(startExpMode: string, resumeExperimentId: string, basePort: number) { function initStartupInfo(startExpMode: string, resumeExperimentId: string, basePort: number, logDirectory: string, experimentLogLevel: string) {
const createNew: boolean = (startExpMode === 'new'); const createNew: boolean = (startExpMode === 'new');
const expId: string = createNew ? uniqueString(8) : resumeExperimentId; const expId: string = createNew ? uniqueString(8) : resumeExperimentId;
setExperimentStartupInfo(createNew, expId, basePort); setExperimentStartupInfo(createNew, expId, basePort, logDirectory, experimentLogLevel);
} }
async function initContainer(platformMode: string): Promise<void> { async function initContainer(platformMode: string): Promise<void> {
...@@ -102,7 +103,19 @@ if (startMode === 'resume' && experimentId.trim().length < 1) { ...@@ -102,7 +103,19 @@ if (startMode === 'resume' && experimentId.trim().length < 1) {
process.exit(1); process.exit(1);
} }
initStartupInfo(startMode, experimentId, port); const logDir: string = parseArg(['--log_dir', '-ld']);
if (logDir.length > 0) {
if (!fs.existsSync(logDir)) {
console.log(`FATAL: log_dir ${logDir} does not exist`);
}
}
// Optional log level passed on the command line (empty string when absent).
// The accepted names must match the levels the logger understands
// ('trace' ... 'fatal'). The previous list still contained the removed
// 'critical' level and rejected the valid 'trace' and 'fatal' levels.
const logLevel: string = parseArg(['--log_level', '-ll']);
if (logLevel.length > 0 && !['trace', 'debug', 'info', 'warning', 'error', 'fatal'].includes(logLevel)) {
    console.log(`FATAL: invalid log_level: ${logLevel}`);
}
initStartupInfo(startMode, experimentId, port, logDir, logLevel);
mkDirP(getLogDir()).then(async () => { mkDirP(getLogDir()).then(async () => {
const log: Logger = getLogger(); const log: Logger = getLogger();
......
...@@ -105,7 +105,7 @@ class NNIRestHandler { ...@@ -105,7 +105,7 @@ class NNIRestHandler {
// If it's a fatal error, exit process // If it's a fatal error, exit process
if(isFatal) { if(isFatal) {
this.log.critical(err); this.log.fatal(err);
process.exit(1); process.exit(1);
} }
......
...@@ -33,6 +33,8 @@ Optional('searchSpacePath'): os.path.exists, ...@@ -33,6 +33,8 @@ Optional('searchSpacePath'): os.path.exists,
Optional('multiPhase'): bool, Optional('multiPhase'): bool,
Optional('multiThread'): bool, Optional('multiThread'): bool,
Optional('nniManagerIp'): str, Optional('nniManagerIp'): str,
Optional('logDir'): os.path.isdir,
Optional('logLevel'): Or('trace', 'debug', 'info', 'warning', 'error', 'fatal'),
'useAnnotation': bool, 'useAnnotation': bool,
Optional('advisor'): Or({ Optional('advisor'): Or({
'builtinAdvisorName': Or('Hyperband'), 'builtinAdvisorName': Or('Hyperband'),
......
...@@ -98,7 +98,7 @@ def get_nni_installation_path(): ...@@ -98,7 +98,7 @@ def get_nni_installation_path():
print_error('Fail to find nni under python library') print_error('Fail to find nni under python library')
exit(1) exit(1)
def start_rest_server(port, platform, mode, config_file_name, experiment_id=None): def start_rest_server(port, platform, mode, config_file_name, experiment_id=None, log_dir=None, log_level=None):
'''Run nni manager process''' '''Run nni manager process'''
nni_config = Config(config_file_name) nni_config = Config(config_file_name)
if detect_port(port): if detect_port(port):
...@@ -118,6 +118,10 @@ def start_rest_server(port, platform, mode, config_file_name, experiment_id=None ...@@ -118,6 +118,10 @@ def start_rest_server(port, platform, mode, config_file_name, experiment_id=None
entry_file = os.path.join(entry_dir, 'main.js') entry_file = os.path.join(entry_dir, 'main.js')
cmds = ['node', entry_file, '--port', str(port), '--mode', platform, '--start_mode', mode] cmds = ['node', entry_file, '--port', str(port), '--mode', platform, '--start_mode', mode]
if log_dir is not None:
cmds += ['--log_dir', log_dir]
if log_level is not None:
cmds += ['--log_level', log_level]
if mode == 'resume': if mode == 'resume':
cmds += ['--experiment_id', experiment_id] cmds += ['--experiment_id', experiment_id]
stdout_full_path, stderr_full_path = get_log_path(config_file_name) stdout_full_path, stderr_full_path = get_log_path(config_file_name)
...@@ -317,9 +321,12 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen ...@@ -317,9 +321,12 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen
except ModuleNotFoundError as e: except ModuleNotFoundError as e:
print_error('The tuner %s should be installed through nnictl'%(tuner_name)) print_error('The tuner %s should be installed through nnictl'%(tuner_name))
exit(1) exit(1)
log_dir = experiment_config['logDir'] if experiment_config.get('logDir') else None
log_level = experiment_config['logLevel'] if experiment_config.get('logLevel') else None
if log_level not in ['trace', 'debug'] and args.debug:
log_level = 'debug'
# start rest server # start rest server
rest_process, start_time = start_rest_server(args.port, experiment_config['trainingServicePlatform'], mode, config_file_name, experiment_id) rest_process, start_time = start_rest_server(args.port, experiment_config['trainingServicePlatform'], mode, config_file_name, experiment_id, log_dir, log_level)
nni_config.set_config('restServerPid', rest_process.pid) nni_config.set_config('restServerPid', rest_process.pid)
# Deal with annotation # Deal with annotation
if experiment_config.get('useAnnotation'): if experiment_config.get('useAnnotation'):
......
...@@ -51,12 +51,14 @@ def parse_args(): ...@@ -51,12 +51,14 @@ def parse_args():
parser_start = subparsers.add_parser('create', help='create a new experiment') parser_start = subparsers.add_parser('create', help='create a new experiment')
parser_start.add_argument('--config', '-c', required=True, dest='config', help='the path of yaml config file') parser_start.add_argument('--config', '-c', required=True, dest='config', help='the path of yaml config file')
parser_start.add_argument('--port', '-p', default=DEFAULT_REST_PORT, dest='port', help='the port of restful server') parser_start.add_argument('--port', '-p', default=DEFAULT_REST_PORT, dest='port', help='the port of restful server')
parser_start.add_argument('--debug', '-d', action='store_true', help=' set log level to debug')
parser_start.set_defaults(func=create_experiment) parser_start.set_defaults(func=create_experiment)
# parse resume command # parse resume command
parser_resume = subparsers.add_parser('resume', help='resume a new experiment') parser_resume = subparsers.add_parser('resume', help='resume a new experiment')
parser_resume.add_argument('id', nargs='?', help='The id of the experiment you want to resume') parser_resume.add_argument('id', nargs='?', help='The id of the experiment you want to resume')
parser_resume.add_argument('--port', '-p', default=DEFAULT_REST_PORT, dest='port', help='the port of restful server') parser_resume.add_argument('--port', '-p', default=DEFAULT_REST_PORT, dest='port', help='the port of restful server')
parser_resume.add_argument('--debug', '-d', action='store_true', help=' set log level to debug')
parser_resume.set_defaults(func=resume_experiment) parser_resume.set_defaults(func=resume_experiment)
# parse update command # parse update command
......
...@@ -38,11 +38,12 @@ from .url_utils import gen_send_stdout_url ...@@ -38,11 +38,12 @@ from .url_utils import gen_send_stdout_url
@unique @unique
class LogType(Enum): class LogType(Enum):
Trace = 'TRACE'
Debug = 'DEBUG' Debug = 'DEBUG'
Info = 'INFO' Info = 'INFO'
Warning = 'WARNING' Warning = 'WARNING'
Error = 'ERROR' Error = 'ERROR'
Critical = 'CRITICAL' Fatal = 'FATAL'
@unique @unique
class StdOutputType(Enum): class StdOutputType(Enum):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment