Unverified Commit a5efb4e6 authored by SparkSnail's avatar SparkSnail Committed by GitHub
Browse files

Fix v2 config version_check and log_collection (#3575)

parent e19f5d26
...@@ -131,12 +131,39 @@ def set_adl_config(experiment_config, port, config_file_name): ...@@ -131,12 +131,39 @@ def set_adl_config(experiment_config, port, config_file_name):
with open(stderr_full_path, 'a+') as fout: with open(stderr_full_path, 'a+') as fout:
fout.write(json.dumps(json.loads(err_message), indent=4, sort_keys=True, separators=(',', ':'))) fout.write(json.dumps(json.loads(err_message), indent=4, sort_keys=True, separators=(',', ':')))
return False, err_message return False, err_message
set_V1_common_config(experiment_config, port, config_file_name)
result, message = setNNIManagerIp(experiment_config, port, config_file_name) result, message = setNNIManagerIp(experiment_config, port, config_file_name)
if not result: if not result:
return result, message return result, message
#set trial_config #set trial_config
return set_trial_config(experiment_config, port, config_file_name), None return set_trial_config(experiment_config, port, config_file_name), None
def validate_response(response, config_file_name):
    '''Abort the process unless ``response`` is a successful (HTTP 200) REST reply.

    Parameters
    ----------
    response
        Object returned by the REST helper; may be ``None`` when no reply
        was obtained. It must expose ``status_code`` and ``text``.
    config_file_name
        Passed to ``get_log_path`` to locate the log file that receives the
        error body (presumably the experiment's stderr log -- confirm
        against ``get_log_path``).

    Returns
    -------
    None
        Returns normally only for a truthy response whose status code is 200.
        In every other case the error is printed and ``exit(1)`` is called.
    '''
    if response and response.status_code == 200:
        return
    # Without a reply there is no body to report. Use a fixed message so the
    # string concatenation below cannot fail on None.
    err_message = 'no response from the REST server'
    if response is not None:
        err_message = response.text
        _, stderr_full_path = get_log_path(config_file_name)
        with open(stderr_full_path, 'a+') as fout:
            try:
                formatted = json.dumps(json.loads(err_message), indent=4, sort_keys=True, separators=(',', ':'))
            except ValueError:
                # The body is not valid JSON (json.JSONDecodeError is a
                # ValueError); log it verbatim rather than lose the error.
                formatted = err_message
            fout.write(formatted)
    print_error('Error:' + err_message)
    exit(1)
# hack to fix v1 version_check and log_collection bug, need refactor
def set_V1_common_config(experiment_config, port, config_file_name):
    '''Send the version-check flag and, if configured, the log-collection setting.

    The version-check flag defaults to True. A non-None ``debug`` entry sets
    it to ``not debug``; a non-None ``versionCheck`` entry takes precedence
    over both. Each setting is sent with ``rest_put`` to
    ``cluster_metadata_url(port)`` and the reply is checked by
    ``validate_response``.
    '''
    debug_flag = experiment_config.get('debug')
    explicit_check = experiment_config.get('versionCheck')
    if explicit_check is not None:
        # An explicit versionCheck setting wins over the debug-derived value.
        enable_check = explicit_check
    elif debug_flag is not None:
        # Debug mode disables the version check.
        enable_check = not debug_flag
    else:
        enable_check = True

    check_payload = json.dumps({'version_check': enable_check})
    reply = rest_put(cluster_metadata_url(port), check_payload, REST_TIME_OUT)
    validate_response(reply, config_file_name)

    log_setting = experiment_config.get('logCollection')
    if log_setting:
        log_payload = json.dumps({'log_collection': log_setting})
        reply = rest_put(cluster_metadata_url(port), log_payload, REST_TIME_OUT)
        validate_response(reply, config_file_name)
def setNNIManagerIp(experiment_config, port, config_file_name): def setNNIManagerIp(experiment_config, port, config_file_name):
'''set nniManagerIp''' '''set nniManagerIp'''
if experiment_config.get('nniManagerIp') is None: if experiment_config.get('nniManagerIp') is None:
...@@ -167,6 +194,7 @@ def set_kubeflow_config(experiment_config, port, config_file_name): ...@@ -167,6 +194,7 @@ def set_kubeflow_config(experiment_config, port, config_file_name):
with open(stderr_full_path, 'a+') as fout: with open(stderr_full_path, 'a+') as fout:
fout.write(json.dumps(json.loads(err_message), indent=4, sort_keys=True, separators=(',', ':'))) fout.write(json.dumps(json.loads(err_message), indent=4, sort_keys=True, separators=(',', ':')))
return False, err_message return False, err_message
set_V1_common_config(experiment_config, port, config_file_name)
result, message = setNNIManagerIp(experiment_config, port, config_file_name) result, message = setNNIManagerIp(experiment_config, port, config_file_name)
if not result: if not result:
return result, message return result, message
...@@ -186,6 +214,7 @@ def set_frameworkcontroller_config(experiment_config, port, config_file_name): ...@@ -186,6 +214,7 @@ def set_frameworkcontroller_config(experiment_config, port, config_file_name):
with open(stderr_full_path, 'a+') as fout: with open(stderr_full_path, 'a+') as fout:
fout.write(json.dumps(json.loads(err_message), indent=4, sort_keys=True, separators=(',', ':'))) fout.write(json.dumps(json.loads(err_message), indent=4, sort_keys=True, separators=(',', ':')))
return False, err_message return False, err_message
set_V1_common_config(experiment_config, port, config_file_name)
result, message = setNNIManagerIp(experiment_config, port, config_file_name) result, message = setNNIManagerIp(experiment_config, port, config_file_name)
if not result: if not result:
return result, message return result, message
......
...@@ -197,6 +197,8 @@ export namespace ValidationSchemas { ...@@ -197,6 +197,8 @@ export namespace ValidationSchemas {
nni_manager_ip: joi.object({ // eslint-disable-line @typescript-eslint/camelcase nni_manager_ip: joi.object({ // eslint-disable-line @typescript-eslint/camelcase
nniManagerIp: joi.string().min(1) nniManagerIp: joi.string().min(1)
}), }),
version_check: joi.boolean(), // eslint-disable-line @typescript-eslint/camelcase
log_collection: joi.string(), // eslint-disable-line @typescript-eslint/camelcase
remote_config: joi.object({ // eslint-disable-line @typescript-eslint/camelcase remote_config: joi.object({ // eslint-disable-line @typescript-eslint/camelcase
reuse: joi.boolean() reuse: joi.boolean()
}), }),
......
...@@ -19,6 +19,7 @@ import {validateCodeDir} from '../../common/util'; ...@@ -19,6 +19,7 @@ import {validateCodeDir} from '../../common/util';
import {NFSConfig} from '../kubernetesConfig'; import {NFSConfig} from '../kubernetesConfig';
import {KubernetesTrialJobDetail} from '../kubernetesData'; import {KubernetesTrialJobDetail} from '../kubernetesData';
import {KubernetesTrainingService} from '../kubernetesTrainingService'; import {KubernetesTrainingService} from '../kubernetesTrainingService';
import {KubernetesJobRestServer} from '../kubernetesJobRestServer';
import {FrameworkControllerClientFactory} from './frameworkcontrollerApiClient'; import {FrameworkControllerClientFactory} from './frameworkcontrollerApiClient';
import { import {
FrameworkControllerClusterConfig, FrameworkControllerClusterConfig,
...@@ -52,7 +53,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -52,7 +53,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
} }
public async run(): Promise<void> { public async run(): Promise<void> {
this.kubernetesJobRestServer = component.get(FrameworkControllerJobRestServer); this.kubernetesJobRestServer = new KubernetesJobRestServer(this);
if (this.kubernetesJobRestServer === undefined) { if (this.kubernetesJobRestServer === undefined) {
throw new Error('kubernetesJobRestServer not initialized!'); throw new Error('kubernetesJobRestServer not initialized!');
} }
......
...@@ -19,6 +19,7 @@ import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey'; ...@@ -19,6 +19,7 @@ import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey';
import { validateCodeDir } from '../../common/util'; import { validateCodeDir } from '../../common/util';
import { NFSConfig } from '../kubernetesConfig'; import { NFSConfig } from '../kubernetesConfig';
import { KubernetesTrialJobDetail } from '../kubernetesData'; import { KubernetesTrialJobDetail } from '../kubernetesData';
import { KubernetesJobRestServer } from '../kubernetesJobRestServer';
import { KubernetesTrainingService } from '../kubernetesTrainingService'; import { KubernetesTrainingService } from '../kubernetesTrainingService';
import { KubeflowOperatorClientFactory } from './kubeflowApiClient'; import { KubeflowOperatorClientFactory } from './kubeflowApiClient';
import { KubeflowClusterConfig, KubeflowClusterConfigAzure, KubeflowClusterConfigFactory, KubeflowClusterConfigNFS, import { KubeflowClusterConfig, KubeflowClusterConfigAzure, KubeflowClusterConfigFactory, KubeflowClusterConfigNFS,
...@@ -46,7 +47,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -46,7 +47,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
public async run(): Promise<void> { public async run(): Promise<void> {
this.log.info('Run Kubeflow training service.'); this.log.info('Run Kubeflow training service.');
this.kubernetesJobRestServer = component.get(KubeflowJobRestServer); this.kubernetesJobRestServer = new KubernetesJobRestServer(this);
if (this.kubernetesJobRestServer === undefined) { if (this.kubernetesJobRestServer === undefined) {
throw new Error('kubernetesJobRestServer not initialized!'); throw new Error('kubernetesJobRestServer not initialized!');
} }
......
...@@ -16,7 +16,6 @@ import { KubernetesTrainingService } from './kubernetesTrainingService'; ...@@ -16,7 +16,6 @@ import { KubernetesTrainingService } from './kubernetesTrainingService';
export class KubernetesJobRestServer extends ClusterJobRestServer { export class KubernetesJobRestServer extends ClusterJobRestServer {
@Inject @Inject
private readonly kubernetesTrainingService? : KubernetesTrainingService; private readonly kubernetesTrainingService? : KubernetesTrainingService;
/** /**
* constructor to provide NNIRestServer's own rest property, e.g. port * constructor to provide NNIRestServer's own rest property, e.g. port
*/ */
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment