Unverified Commit 8f716170 authored by SparkSnail's avatar SparkSnail Committed by GitHub
Browse files

Fix nni stop (#368)

Fix "nnictl stop"
parent d6c07948
...@@ -26,6 +26,7 @@ import { Deferred } from 'ts-deferred'; ...@@ -26,6 +26,7 @@ import { Deferred } from 'ts-deferred';
import { getLogger, Logger } from './log'; import { getLogger, Logger } from './log';
import { getBasePort } from './experimentStartupInfo'; import { getBasePort } from './experimentStartupInfo';
/** /**
* Abstraction class to create a RestServer * Abstraction class to create a RestServer
* The module who wants to use a RestServer could <b>extends</b> this abstract class * The module who wants to use a RestServer could <b>extends</b> this abstract class
...@@ -90,6 +91,10 @@ export abstract class RestServer { ...@@ -90,6 +91,10 @@ export abstract class RestServer {
} else { } else {
this.startTask.promise.then( this.startTask.promise.then(
() => { // Started () => { // Started
//Stops the server from accepting new connections while keeping existing connections open.
//This call is asynchronous: the server is only fully closed once all connections
//have ended, at which point it emits a 'close' event.
//See https://nodejs.org/docs/latest/api/net.html#net_server_close_callback
this.server.close().on('close', () => { this.server.close().on('close', () => {
this.log.info('Rest server stopped.'); this.log.info('Rest server stopped.');
this.stopTask.resolve(); this.stopTask.resolve();
...@@ -103,7 +108,7 @@ export abstract class RestServer { ...@@ -103,7 +108,7 @@ export abstract class RestServer {
} }
); );
} }
this.stopTask.resolve()
return this.stopTask.promise; return this.stopTask.promise;
} }
......
...@@ -219,7 +219,6 @@ class NNIManager implements Manager { ...@@ -219,7 +219,6 @@ class NNIManager implements Manager {
public async stopExperiment(): Promise<void> { public async stopExperiment(): Promise<void> {
this.status.status = 'STOPPING'; this.status.status = 'STOPPING';
await this.experimentDoneCleanUp();
} }
public async getMetricData(trialJobId?: string, metricType?: MetricType): Promise<MetricDataRecord[]> { public async getMetricData(trialJobId?: string, metricType?: MetricType): Promise<MetricDataRecord[]> {
......
...@@ -110,10 +110,20 @@ mkDirP(getLogDir()).then(async () => { ...@@ -110,10 +110,20 @@ mkDirP(getLogDir()).then(async () => {
}); });
process.on('SIGTERM', async () => { process.on('SIGTERM', async () => {
const ds: DataStore = component.get(DataStore);
await ds.close();
const restServer: NNIRestServer = component.get(NNIRestServer);
await restServer.stop();
const log: Logger = getLogger(); const log: Logger = getLogger();
log.close(); let hasError: boolean = false;
try{
const nniManager: Manager = component.get(Manager);
await nniManager.stopExperiment();
const ds: DataStore = component.get(DataStore);
await ds.close();
const restServer: NNIRestServer = component.get(NNIRestServer);
await restServer.stop();
}catch(err){
hasError = true;
log.error(`${err.stack}`);
}finally{
await log.close();
process.exit(hasError?1:0);
}
}) })
\ No newline at end of file
...@@ -65,7 +65,6 @@ class NNIRestHandler { ...@@ -65,7 +65,6 @@ class NNIRestHandler {
this.getExperimentProfile(router); this.getExperimentProfile(router);
this.updateExperimentProfile(router); this.updateExperimentProfile(router);
this.startExperiment(router); this.startExperiment(router);
this.stopExperiment(router);
this.getTrialJobStatistics(router); this.getTrialJobStatistics(router);
this.setClusterMetaData(router); this.setClusterMetaData(router);
this.listTrialJobs(router); this.listTrialJobs(router);
...@@ -158,18 +157,6 @@ class NNIRestHandler { ...@@ -158,18 +157,6 @@ class NNIRestHandler {
}); });
} }
/**
 * Registers the `DELETE /experiment` route on the given router.
 *
 * The handler awaits `this.tb.cleanUp()` and then `this.nniManager.stopExperiment()`;
 * when both complete it replies with an empty response body.
 * Any error thrown by either step is forwarded to `this.handle_error` together
 * with the response object.
 *
 * NOTE(review): `this.tb` is presumably the tensorboard helper - confirm against
 * the class definition, which is not visible here.
 *
 * @param router Router on which the route is registered.
 */
private stopExperiment(router: Router): void {
router.delete('/experiment', async (req: Request, res: Response) => {
try {
// Order matters as written: `this.tb.cleanUp()` is awaited before the manager is asked to stop.
await this.tb.cleanUp();
await this.nniManager.stopExperiment();
// No payload: `send()` is called without arguments.
res.send();
} catch (err) {
this.handle_error(err, res);
}
});
}
private getTrialJobStatistics(router: Router): void { private getTrialJobStatistics(router: Router): void {
router.get('/job-statistics', (req: Request, res: Response) => { router.get('/job-statistics', (req: Request, res: Response) => {
this.nniManager.getTrialJobStatistics().then((statistics: TrialJobStatistics[]) => { this.nniManager.getTrialJobStatistics().then((statistics: TrialJobStatistics[]) => {
......
...@@ -179,18 +179,6 @@ def stop_experiment(args): ...@@ -179,18 +179,6 @@ def stop_experiment(args):
print_normal('Experiment is not running...') print_normal('Experiment is not running...')
experiment_config.update_experiment(experiment_id, 'status', 'stopped') experiment_config.update_experiment(experiment_id, 'status', 'stopped')
return return
running, _ = check_rest_server_quick(rest_port)
stop_rest_result = True
if running:
response = rest_delete(experiment_url(rest_port), 20)
if not response or not check_response(response):
if response:
print_error(response.text)
else:
print_error('No response from restful server!')
stop_rest_result = False
#sleep to wait rest handler done
time.sleep(3)
rest_pid = nni_config.get_config('restServerPid') rest_pid = nni_config.get_config('restServerPid')
if rest_pid: if rest_pid:
stop_rest_cmds = ['kill', str(rest_pid)] stop_rest_cmds = ['kill', str(rest_pid)]
...@@ -204,8 +192,7 @@ def stop_experiment(args): ...@@ -204,8 +192,7 @@ def stop_experiment(args):
except Exception as exception: except Exception as exception:
print_error(exception) print_error(exception)
nni_config.set_config('tensorboardPidList', []) nni_config.set_config('tensorboardPidList', [])
if stop_rest_result: print_normal('Stop experiment success!')
print_normal('Stop experiment success!')
experiment_config.update_experiment(experiment_id, 'status', 'stopped') experiment_config.update_experiment(experiment_id, 'status', 'stopped')
time_now = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())) time_now = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))
experiment_config.update_experiment(experiment_id, 'endTime', str(time_now)) experiment_config.update_experiment(experiment_id, 'endTime', str(time_now))
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment