Unverified Commit 7a1f05ae authored by liuzhe-lz's avatar liuzhe-lz Committed by GitHub
Browse files

Merge pull request #3444 from microsoft/v2.1

V2.1 merge back to master
parents 539a7cd7 a0ae02e6
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import collections import collections
from typing import Dict, Any, List from typing import Dict, Any, List
from ..graph import Model from ..graph import Model
......
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
""" """
Entrypoint for trials. Entrypoint for trials.
......
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import inspect import inspect
import warnings import warnings
from collections import defaultdict from collections import defaultdict
......
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
import json import json
import os import os
from pathlib import Path
import sys import sys
import string import string
import random import random
...@@ -590,14 +591,14 @@ def create_experiment(args): ...@@ -590,14 +591,14 @@ def create_experiment(args):
except Exception: except Exception:
print_warning('Validation with V1 schema failed. Trying to convert from V2 format...') print_warning('Validation with V1 schema failed. Trying to convert from V2 format...')
try: try:
config = ExperimentConfig(**experiment_config) config = ExperimentConfig(_base_path=Path(config_path).parent, **experiment_config)
experiment_config = convert.to_v1_yaml(config) experiment_config = convert.to_v1_yaml(config)
except Exception as e: except Exception as e:
print_error(f'Conversion from v2 format failed: {repr(e)}') print_error(f'Config in v2 format validation failed, the config error in v2 format is: {repr(e)}')
try: try:
validate_all_content(experiment_config, config_path) validate_all_content(experiment_config, config_path)
except Exception as e: except Exception as e:
print_error(f'Config in v1 format validation failed. {repr(e)}') print_error(f'Config in v1 format validation failed, the config error in v1 format is: {repr(e)}')
exit(1) exit(1)
try: try:
......
authorName: nni authorName: nni
experimentName: default_test experimentName: default_test
maxExecDuration: 5m maxExecDuration: 10m
maxTrialNum: 8 maxTrialNum: 8
trialConcurrency: 8 trialConcurrency: 8
searchSpacePath: ../naive_trial/search_space.json searchSpacePath: ../naive_trial/search_space.json
......
...@@ -260,9 +260,9 @@ def run(args): ...@@ -260,9 +260,9 @@ def run(args):
continue continue
# remote mode need more time to cleanup # remote mode need more time to cleanup
if args.ts == 'remote': if args.ts == 'remote':
wait_for_port_available(8080, 180) wait_for_port_available(8080, 240)
else: else:
wait_for_port_available(8080, 30) wait_for_port_available(8080, 60)
# adl mode need more time to cleanup PVC # adl mode need more time to cleanup PVC
if args.ts == 'adl' and name == 'nnictl-resume-2': if args.ts == 'adl' and name == 'nnictl-resume-2':
......
...@@ -326,22 +326,26 @@ class NNIManager implements Manager { ...@@ -326,22 +326,26 @@ class NNIManager implements Manager {
} }
public async stopExperimentBottomHalf(): Promise<void> { public async stopExperimentBottomHalf(): Promise<void> {
const trialJobList: TrialJobDetail[] = await this.trainingService.listTrialJobs(); try {
const trialJobList: TrialJobDetail[] = await this.trainingService.listTrialJobs();
// DON'T try to make it in parallel, the training service may not handle it well.
// If there is performance concern, consider to support batch cancellation on training service. // DON'T try to make it in parallel, the training service may not handle it well.
for (const trialJob of trialJobList) { // If there is performance concern, consider to support batch cancellation on training service.
if (trialJob.status === 'RUNNING' || for (const trialJob of trialJobList) {
trialJob.status === 'WAITING') { if (trialJob.status === 'RUNNING' ||
try { trialJob.status === 'WAITING') {
this.log.info(`cancelTrialJob: ${trialJob.id}`); try {
await this.trainingService.cancelTrialJob(trialJob.id); this.log.info(`cancelTrialJob: ${trialJob.id}`);
} catch (error) { await this.trainingService.cancelTrialJob(trialJob.id);
this.log.debug(`ignorable error on canceling trial ${trialJob.id}. ${error}`); } catch (error) {
this.log.debug(`ignorable error on canceling trial ${trialJob.id}. ${error}`);
}
} }
} }
await this.trainingService.cleanUp();
} catch (err) {
this.log.error(`${err.stack}`);
} }
await this.trainingService.cleanUp();
if (this.experimentProfile.endTime === undefined) { if (this.experimentProfile.endTime === undefined) {
this.setEndtime(); this.setEndtime();
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment