"src/git@developer.sourcefind.cn:gaoqiong/migraphx.git" did not exist on "836b2c34226b2394d60b03013ae94b4ecdb15b21"
Unverified Commit d9c83c0c authored by chicm-ms's avatar chicm-ms Committed by GitHub
Browse files

Configurable nniManager log path and log level (#644)

* Pull code (#22)

* Support distributed job for frameworkcontroller (#612)

support distributed job for frameworkcontroller

* Multiphase doc (#519)

* multiPhase doc

* updates

* updates

* Add time parser for 'nnictl update duration' (#632)

Current nnictl update duration only support seconds unit, add a parser for this command to support {s, m, h, d}

* fix experiment state bug (#629)

* update top README.md (#622)

* Update README.md

* update (#634)

* Integration tests refactoring (#625)

* Integration test refactoring (#21) (#616)

* Integration test refactoring (#21)

* Refactoring integration tests

* test metrics

* update azure pipeline

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* update trigger

* Integration test refactoring (#618)

* updates

* updates

* update pipeline (#619)

* update pipeline

* updates

* updates

* updates

* updates

* updates

* test pipeline (#623)

* test pipeline

* updates

* updates

* updates

* Update integration test (#624)

* Update integration test

* updates

* updates

* updates

* updates

* updates

* updates

* Revert "Pull code (#22)"

This reverts commit 62fc165ad7b2ba724eead3b99f010aa34491e2c7.

* Configurable nniManager log path

* Configure log level

* add --debug command line for nnictl

* updates
parent 0f9fbf87
......@@ -178,6 +178,16 @@ machineList:
Note: run ifconfig on NNI manager's machine to check if eth0 device exists. If not, we recommend to set nnimanagerIp explicitly.
* __logDir__
* Description
__logDir__ configures the directory to store logs and data of the experiment. The default value is `<user home directory>/nni/experiment`
* __logLevel__
* Description
__logLevel__ sets log level for the experiment, available log levels are: `trace, debug, info, warning, error, fatal`. The default value is `info`.
* __tuner__
* Description
......
......@@ -43,6 +43,7 @@ nnictl --version
| ------ | ------ | ------ |------ |
| --config, -c| True| |yaml configure file of the experiment|
| --port, -p | False| |the port of restful server|
| --debug, -d | False| |Set log level to debug|
* __nnictl resume__
......@@ -62,6 +63,7 @@ nnictl --version
| ------ | ------ | ------ |------ |
| id| False| |The id of the experiment you want to resume|
| --port, -p| False| |Rest port of the experiment you want to resume|
| --debug, -d | False| |Set log level to debug|
* __nnictl stop__
* Description
......
......@@ -20,6 +20,8 @@
'use strict';
import * as assert from 'assert';
import * as os from 'os';
import * as path from 'path';
import * as component from '../common/component';
@component.Singleton
......@@ -29,8 +31,10 @@ class ExperimentStartupInfo {
private basePort: number = -1;
private initialized: boolean = false;
private initTrialSequenceID: number = 0;
private logDir: string = '';
private logLevel: string = '';
public setStartupInfo(newExperiment: boolean, experimentId: string, basePort: number): void {
public setStartupInfo(newExperiment: boolean, experimentId: string, basePort: number, logDir?: string, logLevel?: string): void {
assert(!this.initialized);
assert(experimentId.trim().length > 0);
......@@ -38,6 +42,16 @@ class ExperimentStartupInfo {
this.experimentId = experimentId;
this.basePort = basePort;
this.initialized = true;
if (logDir !== undefined && logDir.length > 0) {
this.logDir = path.join(logDir, getExperimentId());
} else {
this.logDir = path.join(os.homedir(), 'nni', 'experiments', getExperimentId());
}
if (logLevel !== undefined && logLevel.length > 1) {
this.logLevel = logLevel;
}
}
public getExperimentId(): string {
......@@ -58,6 +72,18 @@ class ExperimentStartupInfo {
return this.newExperiment;
}
public getLogDir(): string {
assert(this.initialized);
return this.logDir;
}
public getLogLevel(): string {
assert(this.initialized);
return this.logLevel;
}
public setInitTrialSequenceId(initSequenceId: number): void {
assert(this.initialized);
this.initTrialSequenceID = initSequenceId;
......@@ -90,9 +116,15 @@ function getInitTrialSequenceId(): number {
return component.get<ExperimentStartupInfo>(ExperimentStartupInfo).getInitTrialSequenceId();
}
function setExperimentStartupInfo(newExperiment: boolean, experimentId: string, basePort: number): void {
component.get<ExperimentStartupInfo>(ExperimentStartupInfo).setStartupInfo(newExperiment, experimentId, basePort);
function getExperimentStartupInfo(): ExperimentStartupInfo {
return component.get<ExperimentStartupInfo>(ExperimentStartupInfo);
}
function setExperimentStartupInfo(
newExperiment: boolean, experimentId: string, basePort: number, logDir?: string, logLevel?: string): void {
component.get<ExperimentStartupInfo>(ExperimentStartupInfo)
.setStartupInfo(newExperiment, experimentId, basePort, logDir, logLevel);
}
export { ExperimentStartupInfo, getBasePort, getExperimentId, isNewExperiment,
export { ExperimentStartupInfo, getBasePort, getExperimentId, isNewExperiment, getExperimentStartupInfo,
setExperimentStartupInfo, setInitTrialSequenceId, getInitTrialSequenceId };
......@@ -26,13 +26,18 @@ import { Writable } from 'stream';
import { WritableStreamBuffer } from 'stream-buffers';
import { format } from 'util';
import * as component from '../common/component';
import { getExperimentStartupInfo } from './experimentStartupInfo';
import { getLogDir } from './utils';
const CRITICAL: number = 1;
const FATAL: number = 1;
const ERROR: number = 2;
const WARNING: number = 3;
const INFO: number = 4;
const DEBUG: number = 5;
const TRACE: number = 6;
const logLevelNameMap: Map<string, number> = new Map([['fatal', FATAL],
['error', ERROR], ['warning', WARNING], ['info', INFO], ['debug', DEBUG], ['trace', TRACE]]);
class BufferSerialEmitter {
private buffer: Buffer;
......@@ -83,12 +88,25 @@ class Logger {
autoClose: true
});
this.bufferSerialEmitter = new BufferSerialEmitter(this.writable);
const logLevelName: string = getExperimentStartupInfo()
.getLogLevel();
const logLevel: number | undefined = logLevelNameMap.get(logLevelName);
if (logLevel !== undefined) {
this.level = logLevel;
}
}
public close() {
this.writable.destroy();
}
public trace(...param: any[]): void {
if (this.level >= TRACE) {
this.log('TRACE', param);
}
}
public debug(...param: any[]): void {
if (this.level >= DEBUG) {
this.log('DEBUG', param);
......@@ -113,8 +131,8 @@ class Logger {
}
}
public critical(...param: any[]): void {
this.log('CRITICAL', param);
public fatal(...param: any[]): void {
this.log('FATAL', param);
}
private log(level: string, param: any[]): void {
......
......@@ -30,13 +30,14 @@ import { Container } from 'typescript-ioc';
import * as util from 'util';
import { Database, DataStore } from './datastore';
import { ExperimentStartupInfo, getExperimentId, setExperimentStartupInfo } from './experimentStartupInfo';
import { ExperimentStartupInfo, getExperimentId, getExperimentStartupInfo, setExperimentStartupInfo } from './experimentStartupInfo';
import { Manager } from './manager';
import { HyperParameters, TrainingService, TrialJobStatus } from './trainingService';
import { getLogger } from './log';
function getExperimentRootDir(): string {
return path.join(os.homedir(), 'nni', 'experiments', getExperimentId());
return getExperimentStartupInfo()
.getLogDir();
}
function getLogDir(): string{
......
......@@ -35,7 +35,7 @@ import {
import {
TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, TrialJobStatus
} from '../common/trainingService';
import { delay, getCheckpointDir, getLogDir, getMsgDispatcherCommand, mkDirP } from '../common/utils';
import { delay, getCheckpointDir, getExperimentRootDir, getLogDir, getMsgDispatcherCommand, mkDirP } from '../common/utils';
import {
ADD_CUSTOMIZED_TRIAL_JOB, INITIALIZE, INITIALIZED, KILL_TRIAL_JOB, NEW_TRIAL_JOB, NO_MORE_TRIAL_JOBS, PING,
REPORT_METRIC_DATA, REQUEST_TRIAL_JOBS, SEND_TRIAL_JOB_PARAMETER, TERMINATE, TRIAL_END, UPDATE_SEARCH_SPACE
......@@ -670,7 +670,7 @@ class NNIManager implements Manager {
id: getExperimentId(),
revision: 0,
execDuration: 0,
logDir: getLogDir(),
logDir: getExperimentRootDir(),
maxSequenceId: 0,
params: {
authorName: '',
......
......@@ -22,6 +22,7 @@
import { Container, Scope } from 'typescript-ioc';
import * as component from './common/component';
import * as fs from 'fs';
import { Database, DataStore } from './common/datastore';
import { setExperimentStartupInfo } from './common/experimentStartupInfo';
import { getLogger, Logger } from './common/log';
......@@ -40,10 +41,10 @@ import { PAITrainingService } from './training_service/pai/paiTrainingService';
import { KubeflowTrainingService } from './training_service/kubernetes/kubeflow/kubeflowTrainingService';
import { FrameworkControllerTrainingService } from './training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService';
function initStartupInfo(startExpMode: string, resumeExperimentId: string, basePort: number) {
function initStartupInfo(startExpMode: string, resumeExperimentId: string, basePort: number, logDirectory: string, experimentLogLevel: string) {
const createNew: boolean = (startExpMode === 'new');
const expId: string = createNew ? uniqueString(8) : resumeExperimentId;
setExperimentStartupInfo(createNew, expId, basePort);
setExperimentStartupInfo(createNew, expId, basePort, logDirectory, experimentLogLevel);
}
async function initContainer(platformMode: string): Promise<void> {
......@@ -102,7 +103,19 @@ if (startMode === 'resume' && experimentId.trim().length < 1) {
process.exit(1);
}
initStartupInfo(startMode, experimentId, port);
const logDir: string = parseArg(['--log_dir', '-ld']);
if (logDir.length > 0) {
if (!fs.existsSync(logDir)) {
console.log(`FATAL: log_dir ${logDir} does not exist`);
}
}
const logLevel: string = parseArg(['--log_level', '-ll']);
if (logLevel.length > 0 && !['debug', 'info', 'error', 'warning', 'critical'].includes(logLevel)) {
console.log(`FATAL: invalid log_level: ${logLevel}`);
}
initStartupInfo(startMode, experimentId, port, logDir, logLevel);
mkDirP(getLogDir()).then(async () => {
const log: Logger = getLogger();
......
......@@ -105,7 +105,7 @@ class NNIRestHandler {
// If it's a fatal error, exit process
if(isFatal) {
this.log.critical(err);
this.log.fatal(err);
process.exit(1);
}
......
......@@ -33,6 +33,8 @@ Optional('searchSpacePath'): os.path.exists,
Optional('multiPhase'): bool,
Optional('multiThread'): bool,
Optional('nniManagerIp'): str,
Optional('logDir'): os.path.isdir,
Optional('logLevel'): Or('trace', 'debug', 'info', 'warning', 'error', 'fatal'),
'useAnnotation': bool,
Optional('advisor'): Or({
'builtinAdvisorName': Or('Hyperband'),
......
......@@ -98,7 +98,7 @@ def get_nni_installation_path():
print_error('Fail to find nni under python library')
exit(1)
def start_rest_server(port, platform, mode, config_file_name, experiment_id=None):
def start_rest_server(port, platform, mode, config_file_name, experiment_id=None, log_dir=None, log_level=None):
'''Run nni manager process'''
nni_config = Config(config_file_name)
if detect_port(port):
......@@ -118,6 +118,10 @@ def start_rest_server(port, platform, mode, config_file_name, experiment_id=None
entry_file = os.path.join(entry_dir, 'main.js')
cmds = ['node', entry_file, '--port', str(port), '--mode', platform, '--start_mode', mode]
if log_dir is not None:
cmds += ['--log_dir', log_dir]
if log_level is not None:
cmds += ['--log_level', log_level]
if mode == 'resume':
cmds += ['--experiment_id', experiment_id]
stdout_full_path, stderr_full_path = get_log_path(config_file_name)
......@@ -317,9 +321,12 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen
except ModuleNotFoundError as e:
print_error('The tuner %s should be installed through nnictl'%(tuner_name))
exit(1)
log_dir = experiment_config['logDir'] if experiment_config.get('logDir') else None
log_level = experiment_config['logLevel'] if experiment_config.get('logLevel') else None
if log_level not in ['trace', 'debug'] and args.debug:
log_level = 'debug'
# start rest server
rest_process, start_time = start_rest_server(args.port, experiment_config['trainingServicePlatform'], mode, config_file_name, experiment_id)
rest_process, start_time = start_rest_server(args.port, experiment_config['trainingServicePlatform'], mode, config_file_name, experiment_id, log_dir, log_level)
nni_config.set_config('restServerPid', rest_process.pid)
# Deal with annotation
if experiment_config.get('useAnnotation'):
......
......@@ -51,12 +51,14 @@ def parse_args():
parser_start = subparsers.add_parser('create', help='create a new experiment')
parser_start.add_argument('--config', '-c', required=True, dest='config', help='the path of yaml config file')
parser_start.add_argument('--port', '-p', default=DEFAULT_REST_PORT, dest='port', help='the port of restful server')
parser_start.add_argument('--debug', '-d', action='store_true', help=' set log level to debug')
parser_start.set_defaults(func=create_experiment)
# parse resume command
parser_resume = subparsers.add_parser('resume', help='resume a new experiment')
parser_resume.add_argument('id', nargs='?', help='The id of the experiment you want to resume')
parser_resume.add_argument('--port', '-p', default=DEFAULT_REST_PORT, dest='port', help='the port of restful server')
parser_resume.add_argument('--debug', '-d', action='store_true', help=' set log level to debug')
parser_resume.set_defaults(func=resume_experiment)
# parse update command
......
......@@ -38,11 +38,12 @@ from .url_utils import gen_send_stdout_url
@unique
class LogType(Enum):
Trace = 'TRACE'
Debug = 'DEBUG'
Info = 'INFO'
Warning = 'WARNING'
Error = 'ERROR'
Critical = 'CRITICAL'
Fatal = 'FATAL'
@unique
class StdOutputType(Enum):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment