"...composable_kernel_rocm.git" did not exist on "0c7b35c4f893b97aa70088c194b57a7a41790fff"
Unverified commit b42f85f5, authored by SparkSnail and committed by GitHub
Browse files

Fix kubeflow & frameworkcontroller pipeline (#3991)

parent aa2cc922
......@@ -128,5 +128,3 @@ def validate_all_content(experiment_config, config_path):
if 'maxExecDuration' in experiment_config:
experiment_config['maxExecDuration'] = parse_time(experiment_config['maxExecDuration'])
if 'maxTrialDuration' in experiment_config:
experiment_config['maxTrialDuration'] = parse_time(experiment_config['maxTrialDuration'])
......@@ -13,6 +13,9 @@ def rest_put(url, data, timeout, show_error=False):
response = requests.put(url, headers={'Accept': 'application/json', 'Content-Type': 'application/json'},\
data=data, timeout=timeout)
return response
except requests.exceptions.Timeout:
print_error("Connect %s timeout." % url)
return None
except Exception as exception:
if show_error:
print_error(exception)
......@@ -24,6 +27,9 @@ def rest_post(url, data, timeout, show_error=False):
response = requests.post(url, headers={'Accept': 'application/json', 'Content-Type': 'application/json'},\
data=data, timeout=timeout)
return response
except requests.exceptions.Timeout:
print_error("Connect %s timeout." % url)
return None
except Exception as exception:
if show_error:
print_error(exception)
......@@ -34,6 +40,9 @@ def rest_get(url, timeout, show_error=False):
try:
response = requests.get(url, timeout=timeout)
return response
except requests.exceptions.Timeout:
print_error("Connect %s timeout." % url)
return None
except Exception as exception:
if show_error:
print_error(exception)
......@@ -44,6 +53,9 @@ def rest_delete(url, timeout, show_error=False):
try:
response = requests.delete(url, timeout=timeout)
return response
except requests.exceptions.Timeout:
print_error("Connect %s timeout." % url)
return None
except Exception as exception:
if show_error:
print_error(exception)
......
......@@ -457,6 +457,9 @@ class NNIManager implements Manager {
} else if (platform === 'local') {
const module_ = await import('../training_service/local/localTrainingService');
return new module_.LocalTrainingService(config);
} else if (platform === 'kubeflow') {
const module_ = await import('../training_service/kubernetes/kubeflow/kubeflowTrainingService');
return new module_.KubeflowTrainingService();
} else if (platform === 'frameworkcontroller') {
const module_ = await import('../training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService');
return new module_.FrameworkControllerTrainingService();
......
......@@ -21,7 +21,8 @@ import { MetricType } from '../common/datastore';
import { ProfileUpdateType } from '../common/manager';
import { TrialJobStatus } from '../common/trainingService';
const expressJoi = require('express-joi-validator');
// TODO: fix expressJoi
//const expressJoi = require('express-joi-validator');
class NNIRestHandler {
private restServer: NNIRestServer;
......@@ -205,7 +206,7 @@ class NNIRestHandler {
private setClusterMetaData(router: Router): void {
router.put(
'/experiment/cluster-metadata', expressJoi(ValidationSchemas.SETCLUSTERMETADATA),
'/experiment/cluster-metadata', //TODO: Fix validation expressJoi(ValidationSchemas.SETCLUSTERMETADATA),
async (req: Request, res: Response) => {
const metadata: any = req.body;
const keys: string[] = Object.keys(metadata);
......
......@@ -226,6 +226,7 @@ export namespace ValidationSchemas {
trainingServicePlatform: joi.string(),
searchSpace: joi.string().required(),
maxExecDuration: joi.number().min(0).required(),
maxTrialDuration: joi.number().min(0).required(),
multiPhase: joi.boolean(),
multiThread: joi.boolean(),
nniManagerIp: joi.string(),
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment