"ml/git@developer.sourcefind.cn:OpenDAS/ollama.git" did not exist on "548a9f56a6315316481a9a901f77922cb77e0f68"
Unverified Commit d9c83c0c authored by chicm-ms's avatar chicm-ms Committed by GitHub
Browse files

Configurable nniManager log path and log level (#644)

* Pull code (#22)

* Support distributed job for frameworkcontroller (#612)

support distributed job for frameworkcontroller

* Multiphase doc (#519)

* multiPhase doc

* updates

* updates

* Add time parser for 'nnictl update duration' (#632)

Currently, `nnictl update duration` only supports a value in seconds; add a parser so that the command also accepts the unit suffixes {s, m, h, d}.

* fix experiment state bug (#629)

* update top README.md (#622)

* Update README.md

* update (#634)

* Integration tests refactoring (#625)

* Integration test refactoring (#21) (#616)

* Integration test refactoring (#21)

* Refactoring integration tests

* test metrics

* update azure pipeline

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* updates

* update trigger

* Integration test refactoring (#618)

* updates

* updates

* update pipeline (#619)

* update pipeline

* updates

* updates

* updates

* updates

* updates

* test pipeline (#623)

* test pipeline

* updates

* updates

* updates

* Update integration test (#624)

* Update integration test

* updates

* updates

* updates

* updates

* updates

* updates

* Revert "Pull code (#22)"

This reverts commit 62fc165ad7b2ba724eead3b99f010aa34491e2c7.

* Configurable nniManager log path

* Configure log level

* add --debug command line for nnictl

* updates
parent 0f9fbf87
...@@ -178,6 +178,16 @@ machineList: ...@@ -178,6 +178,16 @@ machineList:
Note: run ifconfig on NNI manager's machine to check if eth0 device exists. If not, we recommend to set nnimanagerIp explicitly. Note: run ifconfig on NNI manager's machine to check if eth0 device exists. If not, we recommend to set nnimanagerIp explicitly.
* __logDir__
* Description
__logDir__ configures the directory in which the logs and data of the experiment are stored. The default value is `<user home directory>/nni/experiments`
* __logLevel__
* Description
__logLevel__ sets the log level for the experiment. Available log levels are: `trace, debug, info, warning, error, fatal`. The default value is `info`.
* __tuner__ * __tuner__
* Description * Description
......
...@@ -43,6 +43,7 @@ nnictl --version ...@@ -43,6 +43,7 @@ nnictl --version
| ------ | ------ | ------ |------ | | ------ | ------ | ------ |------ |
| --config, -c| True| |yaml configure file of the experiment| | --config, -c| True| |yaml configure file of the experiment|
| --port, -p | False| |the port of restful server| | --port, -p | False| |the port of restful server|
| --debug, -d | False| |Set log level to debug|
* __nnictl resume__ * __nnictl resume__
...@@ -62,6 +63,7 @@ nnictl --version ...@@ -62,6 +63,7 @@ nnictl --version
| ------ | ------ | ------ |------ | | ------ | ------ | ------ |------ |
| id| False| |The id of the experiment you want to resume| | id| False| |The id of the experiment you want to resume|
| --port, -p| False| |Rest port of the experiment you want to resume| | --port, -p| False| |Rest port of the experiment you want to resume|
| --debug, -d | False| |Set log level to debug|
* __nnictl stop__ * __nnictl stop__
* Description * Description
......
...@@ -20,6 +20,8 @@ ...@@ -20,6 +20,8 @@
'use strict'; 'use strict';
import * as assert from 'assert'; import * as assert from 'assert';
import * as os from 'os';
import * as path from 'path';
import * as component from '../common/component'; import * as component from '../common/component';
@component.Singleton @component.Singleton
...@@ -29,8 +31,10 @@ class ExperimentStartupInfo { ...@@ -29,8 +31,10 @@ class ExperimentStartupInfo {
private basePort: number = -1; private basePort: number = -1;
private initialized: boolean = false; private initialized: boolean = false;
private initTrialSequenceID: number = 0; private initTrialSequenceID: number = 0;
private logDir: string = '';
private logLevel: string = '';
public setStartupInfo(newExperiment: boolean, experimentId: string, basePort: number): void { public setStartupInfo(newExperiment: boolean, experimentId: string, basePort: number, logDir?: string, logLevel?: string): void {
assert(!this.initialized); assert(!this.initialized);
assert(experimentId.trim().length > 0); assert(experimentId.trim().length > 0);
...@@ -38,6 +42,16 @@ class ExperimentStartupInfo { ...@@ -38,6 +42,16 @@ class ExperimentStartupInfo {
this.experimentId = experimentId; this.experimentId = experimentId;
this.basePort = basePort; this.basePort = basePort;
this.initialized = true; this.initialized = true;
if (logDir !== undefined && logDir.length > 0) {
this.logDir = path.join(logDir, getExperimentId());
} else {
this.logDir = path.join(os.homedir(), 'nni', 'experiments', getExperimentId());
}
if (logLevel !== undefined && logLevel.length > 1) {
this.logLevel = logLevel;
}
} }
public getExperimentId(): string { public getExperimentId(): string {
...@@ -58,6 +72,18 @@ class ExperimentStartupInfo { ...@@ -58,6 +72,18 @@ class ExperimentStartupInfo {
return this.newExperiment; return this.newExperiment;
} }
/**
 * Returns the log directory stored on this instance.
 * Asserts that the startup info has already been initialized.
 */
public getLogDir(): string {
    assert(this.initialized);

    const resolvedDir: string = this.logDir;

    return resolvedDir;
}
/**
 * Returns the log level name stored on this instance
 * (an empty string when none was supplied to setStartupInfo).
 * Asserts that the startup info has already been initialized.
 */
public getLogLevel(): string {
    assert(this.initialized);

    const levelName: string = this.logLevel;

    return levelName;
}
public setInitTrialSequenceId(initSequenceId: number): void { public setInitTrialSequenceId(initSequenceId: number): void {
assert(this.initialized); assert(this.initialized);
this.initTrialSequenceID = initSequenceId; this.initTrialSequenceID = initSequenceId;
...@@ -90,9 +116,15 @@ function getInitTrialSequenceId(): number { ...@@ -90,9 +116,15 @@ function getInitTrialSequenceId(): number {
return component.get<ExperimentStartupInfo>(ExperimentStartupInfo).getInitTrialSequenceId(); return component.get<ExperimentStartupInfo>(ExperimentStartupInfo).getInitTrialSequenceId();
} }
function setExperimentStartupInfo(newExperiment: boolean, experimentId: string, basePort: number): void { function getExperimentStartupInfo(): ExperimentStartupInfo {
component.get<ExperimentStartupInfo>(ExperimentStartupInfo).setStartupInfo(newExperiment, experimentId, basePort); return component.get<ExperimentStartupInfo>(ExperimentStartupInfo);
}
function setExperimentStartupInfo(
newExperiment: boolean, experimentId: string, basePort: number, logDir?: string, logLevel?: string): void {
component.get<ExperimentStartupInfo>(ExperimentStartupInfo)
.setStartupInfo(newExperiment, experimentId, basePort, logDir, logLevel);
} }
export { ExperimentStartupInfo, getBasePort, getExperimentId, isNewExperiment, export { ExperimentStartupInfo, getBasePort, getExperimentId, isNewExperiment, getExperimentStartupInfo,
setExperimentStartupInfo, setInitTrialSequenceId, getInitTrialSequenceId }; setExperimentStartupInfo, setInitTrialSequenceId, getInitTrialSequenceId };
...@@ -26,13 +26,18 @@ import { Writable } from 'stream'; ...@@ -26,13 +26,18 @@ import { Writable } from 'stream';
import { WritableStreamBuffer } from 'stream-buffers'; import { WritableStreamBuffer } from 'stream-buffers';
import { format } from 'util'; import { format } from 'util';
import * as component from '../common/component'; import * as component from '../common/component';
import { getExperimentStartupInfo } from './experimentStartupInfo';
import { getLogDir } from './utils'; import { getLogDir } from './utils';
const CRITICAL: number = 1; const FATAL: number = 1;
const ERROR: number = 2; const ERROR: number = 2;
const WARNING: number = 3; const WARNING: number = 3;
const INFO: number = 4; const INFO: number = 4;
const DEBUG: number = 5; const DEBUG: number = 5;
const TRACE: number = 6;
const logLevelNameMap: Map<string, number> = new Map([['fatal', FATAL],
['error', ERROR], ['warning', WARNING], ['info', INFO], ['debug', DEBUG], ['trace', TRACE]]);
class BufferSerialEmitter { class BufferSerialEmitter {
private buffer: Buffer; private buffer: Buffer;
...@@ -83,12 +88,25 @@ class Logger { ...@@ -83,12 +88,25 @@ class Logger {
autoClose: true autoClose: true
}); });
this.bufferSerialEmitter = new BufferSerialEmitter(this.writable); this.bufferSerialEmitter = new BufferSerialEmitter(this.writable);
const logLevelName: string = getExperimentStartupInfo()
.getLogLevel();
const logLevel: number | undefined = logLevelNameMap.get(logLevelName);
if (logLevel !== undefined) {
this.level = logLevel;
}
} }
public close() { public close() {
this.writable.destroy(); this.writable.destroy();
} }
/**
 * Writes a TRACE entry, but only when the logger's current level
 * is at least TRACE; otherwise the call is a no-op.
 */
public trace(...param: any[]): void {
    const traceEnabled: boolean = this.level >= TRACE;
    if (!traceEnabled) {
        return;
    }

    this.log('TRACE', param);
}
public debug(...param: any[]): void { public debug(...param: any[]): void {
if (this.level >= DEBUG) { if (this.level >= DEBUG) {
this.log('DEBUG', param); this.log('DEBUG', param);
...@@ -113,8 +131,8 @@ class Logger { ...@@ -113,8 +131,8 @@ class Logger {
} }
} }
public critical(...param: any[]): void { public fatal(...param: any[]): void {
this.log('CRITICAL', param); this.log('FATAL', param);
} }
private log(level: string, param: any[]): void { private log(level: string, param: any[]): void {
......
...@@ -30,13 +30,14 @@ import { Container } from 'typescript-ioc'; ...@@ -30,13 +30,14 @@ import { Container } from 'typescript-ioc';
import * as util from 'util'; import * as util from 'util';
import { Database, DataStore } from './datastore'; import { Database, DataStore } from './datastore';
import { ExperimentStartupInfo, getExperimentId, setExperimentStartupInfo } from './experimentStartupInfo'; import { ExperimentStartupInfo, getExperimentId, getExperimentStartupInfo, setExperimentStartupInfo } from './experimentStartupInfo';
import { Manager } from './manager'; import { Manager } from './manager';
import { HyperParameters, TrainingService, TrialJobStatus } from './trainingService'; import { HyperParameters, TrainingService, TrialJobStatus } from './trainingService';
import { getLogger } from './log'; import { getLogger } from './log';
function getExperimentRootDir(): string { function getExperimentRootDir(): string {
return path.join(os.homedir(), 'nni', 'experiments', getExperimentId()); return getExperimentStartupInfo()
.getLogDir();
} }
function getLogDir(): string{ function getLogDir(): string{
......
...@@ -35,7 +35,7 @@ import { ...@@ -35,7 +35,7 @@ import {
import { import {
TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, TrialJobStatus TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, TrialJobStatus
} from '../common/trainingService'; } from '../common/trainingService';
import { delay, getCheckpointDir, getLogDir, getMsgDispatcherCommand, mkDirP } from '../common/utils'; import { delay, getCheckpointDir, getExperimentRootDir, getLogDir, getMsgDispatcherCommand, mkDirP } from '../common/utils';
import { import {
ADD_CUSTOMIZED_TRIAL_JOB, INITIALIZE, INITIALIZED, KILL_TRIAL_JOB, NEW_TRIAL_JOB, NO_MORE_TRIAL_JOBS, PING, ADD_CUSTOMIZED_TRIAL_JOB, INITIALIZE, INITIALIZED, KILL_TRIAL_JOB, NEW_TRIAL_JOB, NO_MORE_TRIAL_JOBS, PING,
REPORT_METRIC_DATA, REQUEST_TRIAL_JOBS, SEND_TRIAL_JOB_PARAMETER, TERMINATE, TRIAL_END, UPDATE_SEARCH_SPACE REPORT_METRIC_DATA, REQUEST_TRIAL_JOBS, SEND_TRIAL_JOB_PARAMETER, TERMINATE, TRIAL_END, UPDATE_SEARCH_SPACE
...@@ -670,7 +670,7 @@ class NNIManager implements Manager { ...@@ -670,7 +670,7 @@ class NNIManager implements Manager {
id: getExperimentId(), id: getExperimentId(),
revision: 0, revision: 0,
execDuration: 0, execDuration: 0,
logDir: getLogDir(), logDir: getExperimentRootDir(),
maxSequenceId: 0, maxSequenceId: 0,
params: { params: {
authorName: '', authorName: '',
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
import { Container, Scope } from 'typescript-ioc'; import { Container, Scope } from 'typescript-ioc';
import * as component from './common/component'; import * as component from './common/component';
import * as fs from 'fs';
import { Database, DataStore } from './common/datastore'; import { Database, DataStore } from './common/datastore';
import { setExperimentStartupInfo } from './common/experimentStartupInfo'; import { setExperimentStartupInfo } from './common/experimentStartupInfo';
import { getLogger, Logger } from './common/log'; import { getLogger, Logger } from './common/log';
...@@ -40,10 +41,10 @@ import { PAITrainingService } from './training_service/pai/paiTrainingService'; ...@@ -40,10 +41,10 @@ import { PAITrainingService } from './training_service/pai/paiTrainingService';
import { KubeflowTrainingService } from './training_service/kubernetes/kubeflow/kubeflowTrainingService'; import { KubeflowTrainingService } from './training_service/kubernetes/kubeflow/kubeflowTrainingService';
import { FrameworkControllerTrainingService } from './training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService'; import { FrameworkControllerTrainingService } from './training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService';
function initStartupInfo(startExpMode: string, resumeExperimentId: string, basePort: number) { function initStartupInfo(startExpMode: string, resumeExperimentId: string, basePort: number, logDirectory: string, experimentLogLevel: string) {
const createNew: boolean = (startExpMode === 'new'); const createNew: boolean = (startExpMode === 'new');
const expId: string = createNew ? uniqueString(8) : resumeExperimentId; const expId: string = createNew ? uniqueString(8) : resumeExperimentId;
setExperimentStartupInfo(createNew, expId, basePort); setExperimentStartupInfo(createNew, expId, basePort, logDirectory, experimentLogLevel);
} }
async function initContainer(platformMode: string): Promise<void> { async function initContainer(platformMode: string): Promise<void> {
...@@ -102,7 +103,19 @@ if (startMode === 'resume' && experimentId.trim().length < 1) { ...@@ -102,7 +103,19 @@ if (startMode === 'resume' && experimentId.trim().length < 1) {
process.exit(1); process.exit(1);
} }
initStartupInfo(startMode, experimentId, port); const logDir: string = parseArg(['--log_dir', '-ld']);
if (logDir.length > 0) {
if (!fs.existsSync(logDir)) {
console.log(`FATAL: log_dir ${logDir} does not exist`);
}
}
// Validate --log_level against the level names the logger maps to numeric levels
// (trace, debug, info, warning, error, fatal). The previous list omitted 'trace' and
// 'fatal' and accepted 'critical', which is no longer a level name.
// An invalid value is only reported here; the process is not terminated.
const logLevel: string = parseArg(['--log_level', '-ll']);
if (logLevel.length > 0 && !['trace', 'debug', 'info', 'warning', 'error', 'fatal'].includes(logLevel)) {
    console.log(`FATAL: invalid log_level: ${logLevel}`);
}
initStartupInfo(startMode, experimentId, port, logDir, logLevel);
mkDirP(getLogDir()).then(async () => { mkDirP(getLogDir()).then(async () => {
const log: Logger = getLogger(); const log: Logger = getLogger();
......
...@@ -105,7 +105,7 @@ class NNIRestHandler { ...@@ -105,7 +105,7 @@ class NNIRestHandler {
// If it's a fatal error, exit process // If it's a fatal error, exit process
if(isFatal) { if(isFatal) {
this.log.critical(err); this.log.fatal(err);
process.exit(1); process.exit(1);
} }
......
...@@ -33,6 +33,8 @@ Optional('searchSpacePath'): os.path.exists, ...@@ -33,6 +33,8 @@ Optional('searchSpacePath'): os.path.exists,
Optional('multiPhase'): bool, Optional('multiPhase'): bool,
Optional('multiThread'): bool, Optional('multiThread'): bool,
Optional('nniManagerIp'): str, Optional('nniManagerIp'): str,
Optional('logDir'): os.path.isdir,
Optional('logLevel'): Or('trace', 'debug', 'info', 'warning', 'error', 'fatal'),
'useAnnotation': bool, 'useAnnotation': bool,
Optional('advisor'): Or({ Optional('advisor'): Or({
'builtinAdvisorName': Or('Hyperband'), 'builtinAdvisorName': Or('Hyperband'),
......
...@@ -98,7 +98,7 @@ def get_nni_installation_path(): ...@@ -98,7 +98,7 @@ def get_nni_installation_path():
print_error('Fail to find nni under python library') print_error('Fail to find nni under python library')
exit(1) exit(1)
def start_rest_server(port, platform, mode, config_file_name, experiment_id=None): def start_rest_server(port, platform, mode, config_file_name, experiment_id=None, log_dir=None, log_level=None):
'''Run nni manager process''' '''Run nni manager process'''
nni_config = Config(config_file_name) nni_config = Config(config_file_name)
if detect_port(port): if detect_port(port):
...@@ -118,6 +118,10 @@ def start_rest_server(port, platform, mode, config_file_name, experiment_id=None ...@@ -118,6 +118,10 @@ def start_rest_server(port, platform, mode, config_file_name, experiment_id=None
entry_file = os.path.join(entry_dir, 'main.js') entry_file = os.path.join(entry_dir, 'main.js')
cmds = ['node', entry_file, '--port', str(port), '--mode', platform, '--start_mode', mode] cmds = ['node', entry_file, '--port', str(port), '--mode', platform, '--start_mode', mode]
if log_dir is not None:
cmds += ['--log_dir', log_dir]
if log_level is not None:
cmds += ['--log_level', log_level]
if mode == 'resume': if mode == 'resume':
cmds += ['--experiment_id', experiment_id] cmds += ['--experiment_id', experiment_id]
stdout_full_path, stderr_full_path = get_log_path(config_file_name) stdout_full_path, stderr_full_path = get_log_path(config_file_name)
...@@ -317,9 +321,12 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen ...@@ -317,9 +321,12 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen
except ModuleNotFoundError as e: except ModuleNotFoundError as e:
print_error('The tuner %s should be installed through nnictl'%(tuner_name)) print_error('The tuner %s should be installed through nnictl'%(tuner_name))
exit(1) exit(1)
log_dir = experiment_config['logDir'] if experiment_config.get('logDir') else None
log_level = experiment_config['logLevel'] if experiment_config.get('logLevel') else None
if log_level not in ['trace', 'debug'] and args.debug:
log_level = 'debug'
# start rest server # start rest server
rest_process, start_time = start_rest_server(args.port, experiment_config['trainingServicePlatform'], mode, config_file_name, experiment_id) rest_process, start_time = start_rest_server(args.port, experiment_config['trainingServicePlatform'], mode, config_file_name, experiment_id, log_dir, log_level)
nni_config.set_config('restServerPid', rest_process.pid) nni_config.set_config('restServerPid', rest_process.pid)
# Deal with annotation # Deal with annotation
if experiment_config.get('useAnnotation'): if experiment_config.get('useAnnotation'):
......
...@@ -51,12 +51,14 @@ def parse_args(): ...@@ -51,12 +51,14 @@ def parse_args():
parser_start = subparsers.add_parser('create', help='create a new experiment') parser_start = subparsers.add_parser('create', help='create a new experiment')
parser_start.add_argument('--config', '-c', required=True, dest='config', help='the path of yaml config file') parser_start.add_argument('--config', '-c', required=True, dest='config', help='the path of yaml config file')
parser_start.add_argument('--port', '-p', default=DEFAULT_REST_PORT, dest='port', help='the port of restful server') parser_start.add_argument('--port', '-p', default=DEFAULT_REST_PORT, dest='port', help='the port of restful server')
parser_start.add_argument('--debug', '-d', action='store_true', help=' set log level to debug')
parser_start.set_defaults(func=create_experiment) parser_start.set_defaults(func=create_experiment)
# parse resume command # parse resume command
parser_resume = subparsers.add_parser('resume', help='resume a new experiment') parser_resume = subparsers.add_parser('resume', help='resume a new experiment')
parser_resume.add_argument('id', nargs='?', help='The id of the experiment you want to resume') parser_resume.add_argument('id', nargs='?', help='The id of the experiment you want to resume')
parser_resume.add_argument('--port', '-p', default=DEFAULT_REST_PORT, dest='port', help='the port of restful server') parser_resume.add_argument('--port', '-p', default=DEFAULT_REST_PORT, dest='port', help='the port of restful server')
parser_resume.add_argument('--debug', '-d', action='store_true', help=' set log level to debug')
parser_resume.set_defaults(func=resume_experiment) parser_resume.set_defaults(func=resume_experiment)
# parse update command # parse update command
......
...@@ -38,11 +38,12 @@ from .url_utils import gen_send_stdout_url ...@@ -38,11 +38,12 @@ from .url_utils import gen_send_stdout_url
@unique @unique
class LogType(Enum): class LogType(Enum):
Trace = 'TRACE'
Debug = 'DEBUG' Debug = 'DEBUG'
Info = 'INFO' Info = 'INFO'
Warning = 'WARNING' Warning = 'WARNING'
Error = 'ERROR' Error = 'ERROR'
Critical = 'CRITICAL' Fatal = 'FATAL'
@unique @unique
class StdOutputType(Enum): class StdOutputType(Enum):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment