Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
6f272049
Unverified
Commit
6f272049
authored
Jan 26, 2021
by
SparkSnail
Committed by
GitHub
Jan 26, 2021
Browse files
Remove paiYarn mode (#3327)
parent
f11aea0d
Changes
21
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
350 additions
and
1385 deletions
+350
-1385
nni/runtime/platform/__init__.py
nni/runtime/platform/__init__.py
+1
-1
nni/tools/nnictl/config_schema.py
nni/tools/nnictl/config_schema.py
+2
-15
nni/tools/nnictl/launcher.py
nni/tools/nnictl/launcher.py
+0
-26
nni/tools/trial_tool/trial_keeper.py
nni/tools/trial_tool/trial_keeper.py
+0
-2
test/config/training_service.yml
test/config/training_service.yml
+0
-16
test/nni_test/nnitest/generate_ts_config.py
test/nni_test/nnitest/generate_ts_config.py
+0
-15
ts/nni_manager/main.ts
ts/nni_manager/main.ts
+2
-7
ts/nni_manager/training_service/pai/paiConfig.ts
ts/nni_manager/training_service/pai/paiConfig.ts
+35
-0
ts/nni_manager/training_service/pai/paiK8S/paiK8SConfig.ts
ts/nni_manager/training_service/pai/paiK8S/paiK8SConfig.ts
+0
-33
ts/nni_manager/training_service/pai/paiK8S/paiK8SData.ts
ts/nni_manager/training_service/pai/paiK8S/paiK8SData.ts
+0
-20
ts/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts
...ager/training_service/pai/paiK8S/paiK8STrainingService.ts
+0
-317
ts/nni_manager/training_service/pai/paiTrainingService.ts
ts/nni_manager/training_service/pai/paiTrainingService.ts
+305
-40
ts/nni_manager/training_service/pai/paiYarn/hdfsClientUtility.ts
...manager/training_service/pai/paiYarn/hdfsClientUtility.ts
+0
-226
ts/nni_manager/training_service/pai/paiYarn/paiYarnConfig.ts
ts/nni_manager/training_service/pai/paiYarn/paiYarnConfig.ts
+0
-127
ts/nni_manager/training_service/pai/paiYarn/paiYarnData.ts
ts/nni_manager/training_service/pai/paiYarn/paiYarnData.ts
+0
-24
ts/nni_manager/training_service/pai/paiYarn/paiYarnTrainingService.ts
...er/training_service/pai/paiYarn/paiYarnTrainingService.ts
+0
-357
ts/nni_manager/training_service/pai/paiYarn/paiYarnTrialConfig.ts
...anager/training_service/pai/paiYarn/paiYarnTrialConfig.ts
+0
-27
ts/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts
...ervice/reusable/environments/openPaiEnvironmentService.ts
+3
-3
ts/nni_manager/training_service/reusable/routerTrainingService.ts
...anager/training_service/reusable/routerTrainingService.ts
+2
-2
ts/nni_manager/training_service/test/hdfsClientUtility.test.ts
...i_manager/training_service/test/hdfsClientUtility.test.ts
+0
-127
No files found.
nni/runtime/platform/__init__.py
View file @
6f272049
...
...
@@ -9,7 +9,7 @@ if trial_env_vars.NNI_PLATFORM is None:
from
.standalone
import
*
elif
trial_env_vars
.
NNI_PLATFORM
==
'unittest'
:
from
.test
import
*
elif
trial_env_vars
.
NNI_PLATFORM
in
(
'local'
,
'remote'
,
'pai'
,
'kubeflow'
,
'frameworkcontroller'
,
'paiYarn'
,
'dlts'
,
'aml'
,
'adl'
,
'hybrid'
):
elif
trial_env_vars
.
NNI_PLATFORM
in
(
'local'
,
'remote'
,
'pai'
,
'kubeflow'
,
'frameworkcontroller'
,
'dlts'
,
'aml'
,
'adl'
,
'hybrid'
):
from
.local
import
*
else
:
raise
RuntimeError
(
'Unknown platform %s'
%
trial_env_vars
.
NNI_PLATFORM
)
nni/tools/nnictl/config_schema.py
View file @
6f272049
...
...
@@ -124,7 +124,7 @@ common_schema = {
Optional
(
'maxExecDuration'
):
And
(
Regex
(
r
'^[1-9][0-9]*[s|m|h|d]$'
,
error
=
'ERROR: maxExecDuration format is [digit]{s,m,h,d}'
)),
Optional
(
'maxTrialNum'
):
setNumberRange
(
'maxTrialNum'
,
int
,
1
,
99999
),
'trainingServicePlatform'
:
setChoice
(
'trainingServicePlatform'
,
'remote'
,
'local'
,
'pai'
,
'kubeflow'
,
'frameworkcontroller'
,
'paiYarn'
,
'dlts'
,
'aml'
,
'adl'
,
'hybrid'
),
'trainingServicePlatform'
,
'remote'
,
'local'
,
'pai'
,
'kubeflow'
,
'frameworkcontroller'
,
'dlts'
,
'aml'
,
'adl'
,
'hybrid'
),
Optional
(
'searchSpacePath'
):
And
(
os
.
path
.
exists
,
error
=
SCHEMA_PATH_ERROR
%
'searchSpacePath'
),
Optional
(
'multiPhase'
):
setType
(
'multiPhase'
,
bool
),
Optional
(
'multiThread'
):
setType
(
'multiThread'
,
bool
),
...
...
@@ -178,18 +178,6 @@ pai_yarn_trial_schema = {
}
}
pai_yarn_config_schema
=
{
'paiYarnConfig'
:
Or
({
'userName'
:
setType
(
'userName'
,
str
),
'passWord'
:
setType
(
'passWord'
,
str
),
'host'
:
setType
(
'host'
,
str
)
},
{
'userName'
:
setType
(
'userName'
,
str
),
'token'
:
setType
(
'token'
,
str
),
'host'
:
setType
(
'host'
,
str
)
})
}
pai_trial_schema
=
{
'trial'
:
{
...
...
@@ -456,7 +444,6 @@ training_service_schema_dict = {
'local'
:
Schema
({
**
common_schema
,
**
common_trial_schema
}),
'remote'
:
Schema
({
**
common_schema
,
**
common_trial_schema
,
**
machine_list_schema
,
**
remote_config_schema
}),
'pai'
:
Schema
({
**
common_schema
,
**
pai_trial_schema
,
**
pai_config_schema
}),
'paiYarn'
:
Schema
({
**
common_schema
,
**
pai_yarn_trial_schema
,
**
pai_yarn_config_schema
}),
'kubeflow'
:
Schema
({
**
common_schema
,
**
kubeflow_trial_schema
,
**
kubeflow_config_schema
}),
'frameworkcontroller'
:
Schema
({
**
common_schema
,
**
frameworkcontroller_trial_schema
,
**
frameworkcontroller_config_schema
}),
'aml'
:
Schema
({
**
common_schema
,
**
aml_trial_schema
,
**
aml_config_schema
}),
...
...
@@ -569,7 +556,7 @@ class NNIConfigSchema:
def
validate_pai_trial_conifg
(
self
,
experiment_config
):
'''validate the trial config in pai platform'''
if
experiment_config
.
get
(
'trainingServicePlatform'
)
in
[
'pai'
,
'paiYarn'
]:
if
experiment_config
.
get
(
'trainingServicePlatform'
)
in
[
'pai'
]:
if
experiment_config
.
get
(
'trial'
).
get
(
'shmMB'
)
and
\
experiment_config
[
'trial'
][
'shmMB'
]
>
experiment_config
[
'trial'
][
'memoryMB'
]:
raise
SchemaError
(
'shmMB should be no more than memoryMB!'
)
...
...
nni/tools/nnictl/launcher.py
View file @
6f272049
...
...
@@ -205,25 +205,6 @@ def set_pai_config(experiment_config, port, config_file_name):
#set trial_config
return
set_trial_config
(
experiment_config
,
port
,
config_file_name
),
err_message
def
set_pai_yarn_config
(
experiment_config
,
port
,
config_file_name
):
'''set paiYarn configuration'''
pai_yarn_config_data
=
dict
()
pai_yarn_config_data
[
'pai_yarn_config'
]
=
experiment_config
[
'paiYarnConfig'
]
response
=
rest_put
(
cluster_metadata_url
(
port
),
json
.
dumps
(
pai_yarn_config_data
),
REST_TIME_OUT
)
err_message
=
None
if
not
response
or
not
response
.
status_code
==
200
:
if
response
is
not
None
:
err_message
=
response
.
text
_
,
stderr_full_path
=
get_log_path
(
config_file_name
)
with
open
(
stderr_full_path
,
'a+'
)
as
fout
:
fout
.
write
(
json
.
dumps
(
json
.
loads
(
err_message
),
indent
=
4
,
sort_keys
=
True
,
separators
=
(
','
,
':'
)))
return
False
,
err_message
result
,
message
=
setNNIManagerIp
(
experiment_config
,
port
,
config_file_name
)
if
not
result
:
return
result
,
message
#set trial_config
return
set_trial_config
(
experiment_config
,
port
,
config_file_name
),
err_message
def
set_kubeflow_config
(
experiment_config
,
port
,
config_file_name
):
'''set kubeflow configuration'''
kubeflow_config_data
=
dict
()
...
...
@@ -395,11 +376,6 @@ def set_experiment(experiment_config, mode, port, config_file_name):
{
'key'
:
'pai_config'
,
'value'
:
experiment_config
[
'paiConfig'
]})
request_data
[
'clusterMetaData'
].
append
(
{
'key'
:
'trial_config'
,
'value'
:
experiment_config
[
'trial'
]})
elif
experiment_config
[
'trainingServicePlatform'
]
==
'paiYarn'
:
request_data
[
'clusterMetaData'
].
append
(
{
'key'
:
'pai_yarn_config'
,
'value'
:
experiment_config
[
'paiYarnConfig'
]})
request_data
[
'clusterMetaData'
].
append
(
{
'key'
:
'trial_config'
,
'value'
:
experiment_config
[
'trial'
]})
elif
experiment_config
[
'trainingServicePlatform'
]
==
'kubeflow'
:
request_data
[
'clusterMetaData'
].
append
(
{
'key'
:
'kubeflow_config'
,
'value'
:
experiment_config
[
'kubeflowConfig'
]})
...
...
@@ -453,8 +429,6 @@ def set_platform_config(platform, experiment_config, port, config_file_name, res
config_result
,
err_msg
=
set_remote_config
(
experiment_config
,
port
,
config_file_name
)
elif
platform
==
'pai'
:
config_result
,
err_msg
=
set_pai_config
(
experiment_config
,
port
,
config_file_name
)
elif
platform
==
'paiYarn'
:
config_result
,
err_msg
=
set_pai_yarn_config
(
experiment_config
,
port
,
config_file_name
)
elif
platform
==
'kubeflow'
:
config_result
,
err_msg
=
set_kubeflow_config
(
experiment_config
,
port
,
config_file_name
)
elif
platform
==
'frameworkcontroller'
:
...
...
nni/tools/trial_tool/trial_keeper.py
View file @
6f272049
...
...
@@ -254,8 +254,6 @@ if __name__ == '__main__':
exit
(
1
)
check_version
(
args
)
try
:
if
NNI_PLATFORM
==
'paiYarn'
and
is_multi_phase
():
fetch_parameter_file
(
args
)
if
NNI_PLATFORM
==
'adl'
:
_set_adaptdl_signal_handler
()
main_loop
(
args
)
...
...
test/config/training_service.yml
View file @
6f272049
...
...
@@ -59,22 +59,6 @@ frameworkcontroller:
local
:
trainingServicePlatform
:
local
paiYarn
:
nniManagerIp
:
maxExecDuration
:
15m
paiYarnConfig
:
host
:
passWord
:
userName
:
trainingServicePlatform
:
paiYarn
trial
:
gpuNum
:
1
cpuNum
:
1
dataDir
:
image
:
memoryMB
:
8192
outputDir
:
virtualCluster
:
pai
:
nniManagerIp
:
maxExecDuration
:
15m
...
...
test/nni_test/nnitest/generate_ts_config.py
View file @
6f272049
...
...
@@ -13,21 +13,6 @@ def update_training_service_config(args):
config
=
get_yml_content
(
TRAINING_SERVICE_FILE
)
if
args
.
nni_manager_ip
is
not
None
:
config
[
args
.
ts
][
'nniManagerIp'
]
=
args
.
nni_manager_ip
if
args
.
ts
==
'paiYarn'
:
if
args
.
pai_user
is
not
None
:
config
[
args
.
ts
][
'paiYarnConfig'
][
'userName'
]
=
args
.
pai_user
if
args
.
pai_pwd
is
not
None
:
config
[
args
.
ts
][
'paiYarnConfig'
][
'passWord'
]
=
args
.
pai_pwd
if
args
.
pai_host
is
not
None
:
config
[
args
.
ts
][
'paiYarnConfig'
][
'host'
]
=
args
.
pai_host
if
args
.
nni_docker_image
is
not
None
:
config
[
args
.
ts
][
'trial'
][
'image'
]
=
args
.
nni_docker_image
if
args
.
data_dir
is
not
None
:
config
[
args
.
ts
][
'trial'
][
'dataDir'
]
=
args
.
data_dir
if
args
.
output_dir
is
not
None
:
config
[
args
.
ts
][
'trial'
][
'outputDir'
]
=
args
.
output_dir
if
args
.
vc
is
not
None
:
config
[
args
.
ts
][
'trial'
][
'virtualCluster'
]
=
args
.
vc
if
args
.
ts
==
'pai'
:
if
args
.
pai_user
is
not
None
:
config
[
args
.
ts
][
'paiConfig'
][
'userName'
]
=
args
.
pai_user
...
...
ts/nni_manager/main.ts
View file @
6f272049
...
...
@@ -25,7 +25,6 @@ import { AdlTrainingService } from './training_service/kubernetes/adl/adlTrainin
import
{
KubeflowTrainingService
}
from
'
./training_service/kubernetes/kubeflow/kubeflowTrainingService
'
;
import
{
LocalTrainingService
}
from
'
./training_service/local/localTrainingService
'
;
import
{
RouterTrainingService
}
from
'
./training_service/reusable/routerTrainingService
'
;
import
{
PAIYarnTrainingService
}
from
'
./training_service/pai/paiYarn/paiYarnTrainingService
'
;
import
{
DLTSTrainingService
}
from
'
./training_service/dlts/dltsTrainingService
'
;
...
...
@@ -46,10 +45,6 @@ async function initContainer(foreground: boolean, platformMode: string, logFileN
Container
.
bind
(
TrainingService
)
.
to
(
LocalTrainingService
)
.
scope
(
Scope
.
Singleton
);
}
else
if
(
platformMode
===
'
paiYarn
'
)
{
Container
.
bind
(
TrainingService
)
.
to
(
PAIYarnTrainingService
)
.
scope
(
Scope
.
Singleton
);
}
else
if
(
platformMode
===
'
kubeflow
'
)
{
Container
.
bind
(
TrainingService
)
.
to
(
KubeflowTrainingService
)
...
...
@@ -97,7 +92,7 @@ async function initContainer(foreground: boolean, platformMode: string, logFileN
function
usage
():
void
{
console
.
info
(
'
usage: node main.js --port <port> --mode
\
<local/remote/pai/kubeflow/frameworkcontroller/
paiYarn/
aml/adl/hybrid> --start_mode <new/resume> --experiment_id <id> --foreground <true/false>
'
);
<local/remote/pai/kubeflow/frameworkcontroller/aml/adl/hybrid> --start_mode <new/resume> --experiment_id <id> --foreground <true/false>
'
);
}
const
strPort
:
string
=
parseArg
([
'
--port
'
,
'
-p
'
]);
...
...
@@ -117,7 +112,7 @@ const foreground: boolean = foregroundArg.toLowerCase() === 'true' ? true : fals
const
port
:
number
=
parseInt
(
strPort
,
10
);
const
mode
:
string
=
parseArg
([
'
--mode
'
,
'
-m
'
]);
if
(
!
[
'
local
'
,
'
remote
'
,
'
pai
'
,
'
kubeflow
'
,
'
frameworkcontroller
'
,
'
paiYarn
'
,
'
dlts
'
,
'
aml
'
,
'
adl
'
,
'
hybrid
'
].
includes
(
mode
))
{
if
(
!
[
'
local
'
,
'
remote
'
,
'
pai
'
,
'
kubeflow
'
,
'
frameworkcontroller
'
,
'
dlts
'
,
'
aml
'
,
'
adl
'
,
'
hybrid
'
].
includes
(
mode
))
{
console
.
log
(
`FATAL: unknown mode:
${
mode
}
`
);
usage
();
process
.
exit
(
1
);
...
...
ts/nni_manager/training_service/pai/paiConfig.ts
View file @
6f272049
...
...
@@ -4,6 +4,7 @@
'
use strict
'
;
import
{
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobStatus
}
from
'
../../common/trainingService
'
;
import
{
TrialConfig
}
from
'
../common/trialConfig
'
;
export
class
PAIClusterConfig
{
public
readonly
userName
:
string
;
...
...
@@ -71,3 +72,37 @@ export class PAITrialJobDetail implements TrialJobDetail {
this
.
paiJobDetailUrl
=
paiJobDetailUrl
;
}
}
export
const
PAI_TRIAL_COMMAND_FORMAT
:
string
=
`export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4} MULTI_PHASE={5} \
&& NNI_CODE_DIR={6} && mkdir -p $NNI_SYS_DIR/code && cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR/code && sh $NNI_SYS_DIR/install_nni.sh \
&& cd $NNI_SYS_DIR/code && python3 -m nni.tools.trial_tool.trial_keeper --trial_command '{7}' --nnimanager_ip '{8}' --nnimanager_port '{9}' \
--nni_manager_version '{10}' --log_collection '{11}' | tee $NNI_OUTPUT_DIR/trial.log`
;
/**
* PAI trial configuration
*/
export
class
NNIPAITrialConfig
extends
TrialConfig
{
public
readonly
cpuNum
:
number
;
public
readonly
memoryMB
:
number
;
public
readonly
image
:
string
;
public
virtualCluster
?:
string
;
public
readonly
nniManagerNFSMountPath
:
string
;
public
readonly
containerNFSMountPath
:
string
;
public
readonly
paiStorageConfigName
:
string
;
public
readonly
paiConfigPath
?:
string
;
constructor
(
command
:
string
,
codeDir
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
,
nniManagerNFSMountPath
:
string
,
containerNFSMountPath
:
string
,
paiStorageConfigName
:
string
,
virtualCluster
?:
string
,
paiConfigPath
?:
string
)
{
super
(
command
,
codeDir
,
gpuNum
);
this
.
cpuNum
=
cpuNum
;
this
.
memoryMB
=
memoryMB
;
this
.
image
=
image
;
this
.
virtualCluster
=
virtualCluster
;
this
.
nniManagerNFSMountPath
=
nniManagerNFSMountPath
;
this
.
containerNFSMountPath
=
containerNFSMountPath
;
this
.
paiStorageConfigName
=
paiStorageConfigName
;
this
.
paiConfigPath
=
paiConfigPath
;
}
}
ts/nni_manager/training_service/pai/paiK8S/paiK8SConfig.ts
deleted
100644 → 0
View file @
f11aea0d
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
'
use strict
'
;
import
{
TrialConfig
}
from
'
../../common/trialConfig
'
;
/**
* PAI trial configuration
*/
export
class
NNIPAIK8STrialConfig
extends
TrialConfig
{
public
readonly
cpuNum
:
number
;
public
readonly
memoryMB
:
number
;
public
readonly
image
:
string
;
public
virtualCluster
?:
string
;
public
readonly
nniManagerNFSMountPath
:
string
;
public
readonly
containerNFSMountPath
:
string
;
public
readonly
paiStorageConfigName
:
string
;
public
readonly
paiConfigPath
?:
string
;
constructor
(
command
:
string
,
codeDir
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
,
nniManagerNFSMountPath
:
string
,
containerNFSMountPath
:
string
,
paiStorageConfigName
:
string
,
virtualCluster
?:
string
,
paiConfigPath
?:
string
)
{
super
(
command
,
codeDir
,
gpuNum
);
this
.
cpuNum
=
cpuNum
;
this
.
memoryMB
=
memoryMB
;
this
.
image
=
image
;
this
.
virtualCluster
=
virtualCluster
;
this
.
nniManagerNFSMountPath
=
nniManagerNFSMountPath
;
this
.
containerNFSMountPath
=
containerNFSMountPath
;
this
.
paiStorageConfigName
=
paiStorageConfigName
;
this
.
paiConfigPath
=
paiConfigPath
;
}
}
ts/nni_manager/training_service/pai/paiK8S/paiK8SData.ts
deleted
100644 → 0
View file @
f11aea0d
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
'
use strict
'
;
export
const
PAI_INSTALL_NNI_SHELL_FORMAT
:
string
=
`#!/bin/bash
if python3 -c 'import nni' > /dev/null 2>&1; then
# nni module is already installed, skip
return
else
# Install nni
python3 -m pip install --user nni
fi`
;
export
const
PAI_K8S_TRIAL_COMMAND_FORMAT
:
string
=
`export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4} MULTI_PHASE={5} \
&& NNI_CODE_DIR={6} && mkdir -p $NNI_SYS_DIR/code && cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR/code && sh $NNI_SYS_DIR/install_nni.sh \
&& cd $NNI_SYS_DIR/code && python3 -m nni.tools.trial_tool.trial_keeper --trial_command '{7}' --nnimanager_ip '{8}' --nnimanager_port '{9}' \
--nni_manager_version '{10}' --log_collection '{11}' | tee $NNI_OUTPUT_DIR/trial.log`
;
ts/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts
deleted
100644 → 0
View file @
f11aea0d
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
'
use strict
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
path
from
'
path
'
;
// tslint:disable-next-line:no-implicit-dependencies
import
*
as
request
from
'
request
'
;
import
*
as
component
from
'
../../../common/component
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
String
}
from
'
typescript-string-operations
'
;
import
{
HyperParameters
,
NNIManagerIpConfig
,
TrialJobApplicationForm
,
TrialJobDetail
}
from
'
../../../common/trainingService
'
;
import
{
generateParamFileName
,
getIPV4Address
,
getVersion
,
uniqueString
}
from
'
../../../common/utils
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../../common/containerJobData
'
;
import
{
TrialConfigMetadataKey
}
from
'
../../common/trialConfigMetadataKey
'
;
import
{
execMkdir
,
validateCodeDir
,
execCopydir
}
from
'
../../common/util
'
;
import
{
PAI_K8S_TRIAL_COMMAND_FORMAT
}
from
'
./paiK8SData
'
;
import
{
NNIPAIK8STrialConfig
}
from
'
./paiK8SConfig
'
;
import
{
PAITrainingService
}
from
'
../paiTrainingService
'
;
import
{
PAIClusterConfig
,
PAITrialJobDetail
}
from
'
../paiConfig
'
;
import
{
PAIJobRestServer
}
from
'
../paiJobRestServer
'
;
const
yaml
=
require
(
'
js-yaml
'
);
/**
* Training Service implementation for OpenPAI (Open Platform for AI)
* Refer https://github.com/Microsoft/pai for more info about OpenPAI
*/
@
component
.
Singleton
class
PAIK8STrainingService
extends
PAITrainingService
{
protected
paiTrialConfig
:
NNIPAIK8STrialConfig
|
undefined
;
private
copyExpCodeDirPromise
?:
Promise
<
void
>
;
private
paiJobConfig
:
any
;
private
nniVersion
:
string
|
undefined
;
constructor
()
{
super
();
}
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
switch
(
key
)
{
case
TrialConfigMetadataKey
.
NNI_MANAGER_IP
:
this
.
nniManagerIpConfig
=
<
NNIManagerIpConfig
>
JSON
.
parse
(
value
);
break
;
case
TrialConfigMetadataKey
.
PAI_CLUSTER_CONFIG
:
this
.
paiJobRestServer
=
new
PAIJobRestServer
(
component
.
get
(
PAIK8STrainingService
));
this
.
paiClusterConfig
=
<
PAIClusterConfig
>
JSON
.
parse
(
value
);
this
.
paiClusterConfig
.
host
=
this
.
formatPAIHost
(
this
.
paiClusterConfig
.
host
);
this
.
paiToken
=
this
.
paiClusterConfig
.
token
;
break
;
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
{
if
(
this
.
paiClusterConfig
===
undefined
)
{
this
.
log
.
error
(
'
pai cluster config is not initialized
'
);
break
;
}
this
.
paiTrialConfig
=
<
NNIPAIK8STrialConfig
>
JSON
.
parse
(
value
);
// Validate to make sure codeDir doesn't have too many files
await
validateCodeDir
(
this
.
paiTrialConfig
.
codeDir
);
const
nniManagerNFSExpCodeDir
=
path
.
join
(
this
.
paiTrialConfig
.
nniManagerNFSMountPath
,
this
.
experimentId
,
'
nni-code
'
);
await
execMkdir
(
nniManagerNFSExpCodeDir
);
//Copy codeDir files to local working folder
this
.
log
.
info
(
`Starting copy codeDir data from
${
this
.
paiTrialConfig
.
codeDir
}
to
${
nniManagerNFSExpCodeDir
}
`
);
this
.
copyExpCodeDirPromise
=
execCopydir
(
this
.
paiTrialConfig
.
codeDir
,
nniManagerNFSExpCodeDir
);
if
(
this
.
paiTrialConfig
.
paiConfigPath
)
{
this
.
paiJobConfig
=
yaml
.
safeLoad
(
fs
.
readFileSync
(
this
.
paiTrialConfig
.
paiConfigPath
,
'
utf8
'
));
}
break
;
}
case
TrialConfigMetadataKey
.
VERSION_CHECK
:
this
.
versionCheck
=
(
value
===
'
true
'
||
value
===
'
True
'
);
this
.
nniVersion
=
this
.
versionCheck
?
await
getVersion
()
:
''
;
break
;
case
TrialConfigMetadataKey
.
LOG_COLLECTION
:
this
.
logCollection
=
value
;
break
;
case
TrialConfigMetadataKey
.
MULTI_PHASE
:
this
.
isMultiPhase
=
(
value
===
'
true
'
||
value
===
'
True
'
);
break
;
default
:
//Reject for unknown keys
this
.
log
.
error
(
`Uknown key:
${
key
}
`
);
}
}
// update trial parameters for multi-phase
public
async
updateTrialJob
(
trialJobId
:
string
,
form
:
TrialJobApplicationForm
):
Promise
<
TrialJobDetail
>
{
const
trialJobDetail
:
PAITrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
if
(
trialJobDetail
===
undefined
)
{
throw
new
Error
(
`updateTrialJob failed:
${
trialJobId
}
not found`
);
}
// Write file content ( parameter.cfg ) to working folders
await
this
.
writeParameterFile
(
trialJobDetail
.
logPath
,
form
.
hyperParameters
);
return
trialJobDetail
;
}
public
async
submitTrialJob
(
form
:
TrialJobApplicationForm
):
Promise
<
TrialJobDetail
>
{
if
(
this
.
paiClusterConfig
===
undefined
)
{
throw
new
Error
(
`paiClusterConfig not initialized!`
);
}
if
(
this
.
paiTrialConfig
===
undefined
)
{
throw
new
Error
(
`paiTrialConfig not initialized!`
);
}
this
.
log
.
info
(
`submitTrialJob: form:
${
JSON
.
stringify
(
form
)}
`
);
const
trialJobId
:
string
=
uniqueString
(
5
);
//TODO: use HDFS working folder instead
const
trialWorkingFolder
:
string
=
path
.
join
(
this
.
expRootDir
,
'
trials
'
,
trialJobId
);
const
paiJobName
:
string
=
`nni_exp_
${
this
.
experimentId
}
_trial_
${
trialJobId
}
`
;
const
logPath
:
string
=
path
.
join
(
this
.
paiTrialConfig
.
nniManagerNFSMountPath
,
this
.
experimentId
,
trialJobId
);
const
paiJobDetailUrl
:
string
=
`
${
this
.
protocol
}
://
${
this
.
paiClusterConfig
.
host
}
/job-detail.html?username=
${
this
.
paiClusterConfig
.
userName
}
&jobName=
${
paiJobName
}
`
;
const
trialJobDetail
:
PAITrialJobDetail
=
new
PAITrialJobDetail
(
trialJobId
,
'
WAITING
'
,
paiJobName
,
Date
.
now
(),
trialWorkingFolder
,
form
,
logPath
,
paiJobDetailUrl
);
this
.
trialJobsMap
.
set
(
trialJobId
,
trialJobDetail
);
this
.
jobQueue
.
push
(
trialJobId
);
return
trialJobDetail
;
}
private
generateNNITrialCommand
(
trialJobDetail
:
PAITrialJobDetail
,
command
:
string
):
string
{
if
(
this
.
paiTrialConfig
===
undefined
)
{
throw
new
Error
(
'
trial config is not initialized
'
);
}
const
containerNFSExpCodeDir
=
`
${
this
.
paiTrialConfig
.
containerNFSMountPath
}
/
${
this
.
experimentId
}
/nni-code`
;
const
containerWorkingDir
:
string
=
`
${
this
.
paiTrialConfig
.
containerNFSMountPath
}
/
${
this
.
experimentId
}
/
${
trialJobDetail
.
id
}
`
;
const
nniManagerIp
:
string
=
this
.
nniManagerIpConfig
?
this
.
nniManagerIpConfig
.
nniManagerIp
:
getIPV4Address
();
const
nniPaiTrialCommand
:
string
=
String
.
Format
(
PAI_K8S_TRIAL_COMMAND_FORMAT
,
`
${
containerWorkingDir
}
`
,
`
${
containerWorkingDir
}
/nnioutput`
,
trialJobDetail
.
id
,
this
.
experimentId
,
trialJobDetail
.
form
.
sequenceId
,
this
.
isMultiPhase
,
containerNFSExpCodeDir
,
command
,
nniManagerIp
,
this
.
paiRestServerPort
,
this
.
nniVersion
,
this
.
logCollection
)
.
replace
(
/
\r\n
|
\n
|
\r
/gm
,
''
);
return
nniPaiTrialCommand
;
}
private
generateJobConfigInYamlFormat
(
trialJobDetail
:
PAITrialJobDetail
):
any
{
if
(
this
.
paiTrialConfig
===
undefined
)
{
throw
new
Error
(
'
trial config is not initialized
'
);
}
const
jobName
=
`nni_exp_
${
this
.
experimentId
}
_trial_
${
trialJobDetail
.
id
}
`
let
nniJobConfig
:
any
=
undefined
;
if
(
this
.
paiTrialConfig
.
paiConfigPath
)
{
nniJobConfig
=
JSON
.
parse
(
JSON
.
stringify
(
this
.
paiJobConfig
));
//Trick for deep clone in Typescript
nniJobConfig
.
name
=
jobName
;
// Each taskRole will generate new command in NNI's command format
// Each command will be formatted to NNI style
for
(
const
taskRoleIndex
in
nniJobConfig
.
taskRoles
)
{
const
commands
=
nniJobConfig
.
taskRoles
[
taskRoleIndex
].
commands
const
nniTrialCommand
=
this
.
generateNNITrialCommand
(
trialJobDetail
,
commands
.
join
(
"
&&
"
).
replace
(
/
([
"'$`
\\])
/g
,
'
\\
$1
'
));
nniJobConfig
.
taskRoles
[
taskRoleIndex
].
commands
=
[
nniTrialCommand
]
}
}
else
{
nniJobConfig
=
{
protocolVersion
:
2
,
name
:
jobName
,
type
:
'
job
'
,
jobRetryCount
:
0
,
prerequisites
:
[
{
type
:
'
dockerimage
'
,
uri
:
this
.
paiTrialConfig
.
image
,
name
:
'
docker_image_0
'
}
],
taskRoles
:
{
taskrole
:
{
instances
:
1
,
completion
:
{
minFailedInstances
:
1
,
minSucceededInstances
:
-
1
},
taskRetryCount
:
0
,
dockerImage
:
'
docker_image_0
'
,
resourcePerInstance
:
{
gpu
:
this
.
paiTrialConfig
.
gpuNum
,
cpu
:
this
.
paiTrialConfig
.
cpuNum
,
memoryMB
:
this
.
paiTrialConfig
.
memoryMB
},
commands
:
[
this
.
generateNNITrialCommand
(
trialJobDetail
,
this
.
paiTrialConfig
.
command
)
]
}
},
extras
:
{
'
storages
'
:
[
{
name
:
this
.
paiTrialConfig
.
paiStorageConfigName
}
],
submitFrom
:
'
submit-job-v2
'
}
}
if
(
this
.
paiTrialConfig
.
virtualCluster
)
{
nniJobConfig
.
defaults
=
{
virtualCluster
:
this
.
paiTrialConfig
.
virtualCluster
}
}
}
return
yaml
.
safeDump
(
nniJobConfig
);
}
protected
async
submitTrialJobToPAI
(
trialJobId
:
string
):
Promise
<
boolean
>
{
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
const
trialJobDetail
:
PAITrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
if
(
trialJobDetail
===
undefined
)
{
throw
new
Error
(
`Failed to find PAITrialJobDetail for job
${
trialJobId
}
`
);
}
if
(
this
.
paiClusterConfig
===
undefined
)
{
throw
new
Error
(
'
PAI Cluster config is not initialized
'
);
}
if
(
this
.
paiTrialConfig
===
undefined
)
{
throw
new
Error
(
'
trial config is not initialized
'
);
}
if
(
this
.
paiToken
===
undefined
)
{
throw
new
Error
(
'
PAI token is not initialized
'
);
}
if
(
this
.
paiJobRestServer
===
undefined
)
{
throw
new
Error
(
'
paiJobRestServer is not initialized
'
);
}
// Make sure experiment code files is copied from local to NFS
if
(
this
.
copyExpCodeDirPromise
!==
undefined
)
{
await
this
.
copyExpCodeDirPromise
;
this
.
log
.
info
(
`Copy codeDir data finished.`
);
// All trials share same destination NFS code folder, only copy codeDir once for an experiment.
// After copy data finished, set copyExpCodeDirPromise be undefined to avoid log content duplicated.
this
.
copyExpCodeDirPromise
=
undefined
;
}
this
.
paiRestServerPort
=
this
.
paiJobRestServer
.
clusterRestServerPort
;
// Step 1. Prepare PAI job configuration
//create trial local working folder locally.
await
execMkdir
(
trialJobDetail
.
logPath
);
// Write NNI installation file to local files
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialJobDetail
.
logPath
,
'
install_nni.sh
'
),
CONTAINER_INSTALL_NNI_SHELL_FORMAT
,
{
encoding
:
'
utf8
'
});
// Write file content ( parameter.cfg ) to local working folders
if
(
trialJobDetail
.
form
!==
undefined
)
{
await
this
.
writeParameterFile
(
trialJobDetail
.
logPath
,
trialJobDetail
.
form
.
hyperParameters
);
}
//Generate Job Configuration in yaml format
const
paiJobConfig
=
this
.
generateJobConfigInYamlFormat
(
trialJobDetail
);
this
.
log
.
debug
(
paiJobConfig
);
// Step 2. Submit PAI job via Rest call
// Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API
const
submitJobRequest
:
request
.
Options
=
{
uri
:
`
${
this
.
protocol
}
://
${
this
.
paiClusterConfig
.
host
}
/rest-server/api/v2/jobs`
,
method
:
'
POST
'
,
body
:
paiJobConfig
,
followAllRedirects
:
true
,
headers
:
{
'
Content-Type
'
:
'
text/yaml
'
,
Authorization
:
`Bearer
${
this
.
paiToken
}
`
}
};
request
(
submitJobRequest
,
(
error
:
Error
,
response
:
request
.
Response
,
body
:
any
)
=>
{
// If submit success, will get status code 202. refer: https://github.com/microsoft/pai/blob/master/src/rest-server/docs/swagger.yaml
if
((
error
!==
undefined
&&
error
!==
null
)
||
response
.
statusCode
>=
400
)
{
const
errorMessage
:
string
=
(
error
!==
undefined
&&
error
!==
null
)
?
error
.
message
:
`Submit trial
${
trialJobId
}
failed, http code:
${
response
.
statusCode
}
, http body:
${
body
}
`
;
this
.
log
.
error
(
errorMessage
);
trialJobDetail
.
status
=
'
FAILED
'
;
deferred
.
reject
(
errorMessage
);
}
else
{
trialJobDetail
.
submitTime
=
Date
.
now
();
}
deferred
.
resolve
(
true
);
});
return
deferred
.
promise
;
}
private
async
writeParameterFile
(
directory
:
string
,
hyperParameters
:
HyperParameters
):
Promise
<
void
>
{
const
filepath
:
string
=
path
.
join
(
directory
,
generateParamFileName
(
hyperParameters
));
await
fs
.
promises
.
writeFile
(
filepath
,
hyperParameters
.
value
,
{
encoding
:
'
utf8
'
});
}
}
export
{
PAIK8STrainingService
};
ts/nni_manager/training_service/pai/paiTrainingService.ts
View file @
6f272049
...
...
@@ -3,6 +3,7 @@
'
use strict
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
path
from
'
path
'
;
import
*
as
request
from
'
request
'
;
import
*
as
component
from
'
../../common/component
'
;
...
...
@@ -13,41 +14,53 @@ import { getExperimentId } from '../../common/experimentStartupInfo';
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
MethodNotImplementedError
}
from
'
../../common/errors
'
;
import
{
NNIManagerIpConfig
,
TrainingService
,
HyperParameters
,
NNIManagerIpConfig
,
TrainingService
,
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobMetric
,
LogType
}
from
'
../../common/trainingService
'
;
import
{
delay
}
from
'
../../common/utils
'
;
import
{
PAIJobInfoCollector
}
from
'
./paiJobInfoCollector
'
;
import
{
PAIJobRestServer
}
from
'
./paiJobRestServer
'
;
import
{
PAIClusterConfig
,
PAITrialJobDetail
}
from
'
./paiConfig
'
;
import
{
PAIClusterConfig
,
PAITrialJobDetail
,
PAI_TRIAL_COMMAND_FORMAT
,
NNIPAITrialConfig
}
from
'
./paiConfig
'
;
import
{
String
}
from
'
typescript-string-operations
'
;
import
{
generateParamFileName
,
getIPV4Address
,
getVersion
,
uniqueString
}
from
'
../../common/utils
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../common/containerJobData
'
;
import
{
TrialConfigMetadataKey
}
from
'
../common/trialConfigMetadataKey
'
;
import
{
execMkdir
,
validateCodeDir
,
execCopydir
}
from
'
../common/util
'
;
const
yaml
=
require
(
'
js-yaml
'
);
/**
* Training Service implementation for OpenPAI (Open Platform for AI)
* Refer https://github.com/Microsoft/pai for more info about OpenPAI
*/
@
component
.
Singleton
abstract
class
PAITrainingService
implements
TrainingService
{
protected
readonly
log
!
:
Logger
;
protected
readonly
metricsEmitter
:
EventEmitter
;
protected
readonly
trialJobsMap
:
Map
<
string
,
PAITrialJobDetail
>
;
protected
readonly
expRootDir
:
string
;
protected
paiClusterConfig
?:
PAIClusterConfig
;
protected
readonly
jobQueue
:
string
[];
protected
stopping
:
boolean
=
false
;
protected
paiToken
?:
string
;
protected
paiTokenUpdateTime
?:
number
;
protected
readonly
paiTokenUpdateInterval
:
number
;
protected
readonly
experimentId
!
:
string
;
protected
readonly
paiJobCollector
:
PAIJobInfoCollector
;
protected
paiRestServerPort
?:
number
;
protected
nniManagerIpConfig
?:
NNIManagerIpConfig
;
protected
versionCheck
:
boolean
=
true
;
protected
logCollection
:
string
;
protected
isMultiPhase
:
boolean
=
false
;
protected
authFileHdfsPath
:
string
|
undefined
=
undefined
;
protected
portList
?:
string
|
undefined
;
protected
paiJobRestServer
?:
PAIJobRestServer
;
protected
protocol
:
string
=
'
http
'
;
class
PAITrainingService
implements
TrainingService
{
private
readonly
log
!
:
Logger
;
private
readonly
metricsEmitter
:
EventEmitter
;
private
readonly
trialJobsMap
:
Map
<
string
,
PAITrialJobDetail
>
;
private
readonly
expRootDir
:
string
;
private
paiClusterConfig
?:
PAIClusterConfig
;
private
readonly
jobQueue
:
string
[];
private
stopping
:
boolean
=
false
;
private
paiToken
?:
string
;
private
paiTokenUpdateTime
?:
number
;
private
readonly
paiTokenUpdateInterval
:
number
;
private
readonly
experimentId
!
:
string
;
private
readonly
paiJobCollector
:
PAIJobInfoCollector
;
private
paiRestServerPort
?:
number
;
private
nniManagerIpConfig
?:
NNIManagerIpConfig
;
private
versionCheck
:
boolean
=
true
;
private
logCollection
:
string
;
private
isMultiPhase
:
boolean
=
false
;
private
paiJobRestServer
?:
PAIJobRestServer
;
private
protocol
:
string
=
'
http
'
;
private
copyExpCodeDirPromise
?:
Promise
<
void
>
;
private
paiJobConfig
:
any
;
private
nniVersion
:
string
|
undefined
;
private
paiTrialConfig
:
NNIPAITrialConfig
|
undefined
;
constructor
()
{
this
.
log
=
getLogger
();
...
...
@@ -76,18 +89,6 @@ abstract class PAITrainingService implements TrainingService {
this
.
log
.
info
(
'
PAI training service exit.
'
);
}
public
async
submitTrialJob
(
_form
:
TrialJobApplicationForm
):
Promise
<
any
>
{
throw
new
Error
(
'
Not implemented!
'
);
}
public
async
updateTrialJob
(
_trialJobId
:
string
,
_form
:
TrialJobApplicationForm
):
Promise
<
TrialJobDetail
>
{
throw
new
Error
(
'
Not implemented!
'
);
}
protected
async
submitTrialJobToPAI
(
_trialJobId
:
string
):
Promise
<
boolean
>
{
throw
new
Error
(
'
Not implemented!
'
);
}
protected
async
submitJobLoop
():
Promise
<
void
>
{
while
(
!
this
.
stopping
)
{
while
(
!
this
.
stopping
&&
this
.
jobQueue
.
length
>
0
)
{
...
...
@@ -104,10 +105,6 @@ abstract class PAITrainingService implements TrainingService {
}
}
public
async
setClusterMetadata
(
_key
:
string
,
_value
:
string
):
Promise
<
void
>
{
throw
new
Error
(
'
Not implemented!
'
);
}
public
async
listTrialJobs
():
Promise
<
TrialJobDetail
[]
>
{
const
jobs
:
TrialJobDetail
[]
=
[];
...
...
@@ -311,6 +308,274 @@ abstract class PAITrainingService implements TrainingService {
return
Promise
.
race
([
timeoutDelay
,
deferred
.
promise
])
.
finally
(()
=>
{
clearTimeout
(
timeoutId
);
});
}
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
switch
(
key
)
{
case
TrialConfigMetadataKey
.
NNI_MANAGER_IP
:
this
.
nniManagerIpConfig
=
<
NNIManagerIpConfig
>
JSON
.
parse
(
value
);
break
;
case
TrialConfigMetadataKey
.
PAI_CLUSTER_CONFIG
:
this
.
paiJobRestServer
=
new
PAIJobRestServer
(
component
.
get
(
PAITrainingService
));
this
.
paiClusterConfig
=
<
PAIClusterConfig
>
JSON
.
parse
(
value
);
this
.
paiClusterConfig
.
host
=
this
.
formatPAIHost
(
this
.
paiClusterConfig
.
host
);
this
.
paiToken
=
this
.
paiClusterConfig
.
token
;
break
;
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
{
if
(
this
.
paiClusterConfig
===
undefined
)
{
this
.
log
.
error
(
'
pai cluster config is not initialized
'
);
break
;
}
this
.
paiTrialConfig
=
<
NNIPAITrialConfig
>
JSON
.
parse
(
value
);
// Validate to make sure codeDir doesn't have too many files
await
validateCodeDir
(
this
.
paiTrialConfig
.
codeDir
);
const
nniManagerNFSExpCodeDir
=
path
.
join
(
this
.
paiTrialConfig
.
nniManagerNFSMountPath
,
this
.
experimentId
,
'
nni-code
'
);
await
execMkdir
(
nniManagerNFSExpCodeDir
);
//Copy codeDir files to local working folder
this
.
log
.
info
(
`Starting copy codeDir data from
${
this
.
paiTrialConfig
.
codeDir
}
to
${
nniManagerNFSExpCodeDir
}
`
);
this
.
copyExpCodeDirPromise
=
execCopydir
(
this
.
paiTrialConfig
.
codeDir
,
nniManagerNFSExpCodeDir
);
if
(
this
.
paiTrialConfig
.
paiConfigPath
)
{
this
.
paiJobConfig
=
yaml
.
safeLoad
(
fs
.
readFileSync
(
this
.
paiTrialConfig
.
paiConfigPath
,
'
utf8
'
));
}
break
;
}
case
TrialConfigMetadataKey
.
VERSION_CHECK
:
this
.
versionCheck
=
(
value
===
'
true
'
||
value
===
'
True
'
);
this
.
nniVersion
=
this
.
versionCheck
?
await
getVersion
()
:
''
;
break
;
case
TrialConfigMetadataKey
.
LOG_COLLECTION
:
this
.
logCollection
=
value
;
break
;
case
TrialConfigMetadataKey
.
MULTI_PHASE
:
this
.
isMultiPhase
=
(
value
===
'
true
'
||
value
===
'
True
'
);
break
;
default
:
//Reject for unknown keys
this
.
log
.
error
(
`Uknown key:
${
key
}
`
);
}
}
// update trial parameters for multi-phase
public
async
updateTrialJob
(
trialJobId
:
string
,
form
:
TrialJobApplicationForm
):
Promise
<
TrialJobDetail
>
{
const
trialJobDetail
:
PAITrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
if
(
trialJobDetail
===
undefined
)
{
throw
new
Error
(
`updateTrialJob failed:
${
trialJobId
}
not found`
);
}
// Write file content ( parameter.cfg ) to working folders
await
this
.
writeParameterFile
(
trialJobDetail
.
logPath
,
form
.
hyperParameters
);
return
trialJobDetail
;
}
public
async
submitTrialJob
(
form
:
TrialJobApplicationForm
):
Promise
<
TrialJobDetail
>
{
if
(
this
.
paiClusterConfig
===
undefined
)
{
throw
new
Error
(
`paiClusterConfig not initialized!`
);
}
if
(
this
.
paiTrialConfig
===
undefined
)
{
throw
new
Error
(
`paiTrialConfig not initialized!`
);
}
this
.
log
.
info
(
`submitTrialJob: form:
${
JSON
.
stringify
(
form
)}
`
);
const
trialJobId
:
string
=
uniqueString
(
5
);
//TODO: use HDFS working folder instead
const
trialWorkingFolder
:
string
=
path
.
join
(
this
.
expRootDir
,
'
trials
'
,
trialJobId
);
const
paiJobName
:
string
=
`nni_exp_
${
this
.
experimentId
}
_trial_
${
trialJobId
}
`
;
const
logPath
:
string
=
path
.
join
(
this
.
paiTrialConfig
.
nniManagerNFSMountPath
,
this
.
experimentId
,
trialJobId
);
const
paiJobDetailUrl
:
string
=
`
${
this
.
protocol
}
://
${
this
.
paiClusterConfig
.
host
}
/job-detail.html?username=
${
this
.
paiClusterConfig
.
userName
}
&jobName=
${
paiJobName
}
`
;
const
trialJobDetail
:
PAITrialJobDetail
=
new
PAITrialJobDetail
(
trialJobId
,
'
WAITING
'
,
paiJobName
,
Date
.
now
(),
trialWorkingFolder
,
form
,
logPath
,
paiJobDetailUrl
);
this
.
trialJobsMap
.
set
(
trialJobId
,
trialJobDetail
);
this
.
jobQueue
.
push
(
trialJobId
);
return
trialJobDetail
;
}
private
generateNNITrialCommand
(
trialJobDetail
:
PAITrialJobDetail
,
command
:
string
):
string
{
if
(
this
.
paiTrialConfig
===
undefined
)
{
throw
new
Error
(
'
trial config is not initialized
'
);
}
const
containerNFSExpCodeDir
=
`
${
this
.
paiTrialConfig
.
containerNFSMountPath
}
/
${
this
.
experimentId
}
/nni-code`
;
const
containerWorkingDir
:
string
=
`
${
this
.
paiTrialConfig
.
containerNFSMountPath
}
/
${
this
.
experimentId
}
/
${
trialJobDetail
.
id
}
`
;
const
nniManagerIp
:
string
=
this
.
nniManagerIpConfig
?
this
.
nniManagerIpConfig
.
nniManagerIp
:
getIPV4Address
();
const
nniPaiTrialCommand
:
string
=
String
.
Format
(
PAI_TRIAL_COMMAND_FORMAT
,
`
${
containerWorkingDir
}
`
,
`
${
containerWorkingDir
}
/nnioutput`
,
trialJobDetail
.
id
,
this
.
experimentId
,
trialJobDetail
.
form
.
sequenceId
,
this
.
isMultiPhase
,
containerNFSExpCodeDir
,
command
,
nniManagerIp
,
this
.
paiRestServerPort
,
this
.
nniVersion
,
this
.
logCollection
)
.
replace
(
/
\r\n
|
\n
|
\r
/gm
,
''
);
return
nniPaiTrialCommand
;
}
private
generateJobConfigInYamlFormat
(
trialJobDetail
:
PAITrialJobDetail
):
any
{
if
(
this
.
paiTrialConfig
===
undefined
)
{
throw
new
Error
(
'
trial config is not initialized
'
);
}
const
jobName
=
`nni_exp_
${
this
.
experimentId
}
_trial_
${
trialJobDetail
.
id
}
`
let
nniJobConfig
:
any
=
undefined
;
if
(
this
.
paiTrialConfig
.
paiConfigPath
)
{
nniJobConfig
=
JSON
.
parse
(
JSON
.
stringify
(
this
.
paiJobConfig
));
//Trick for deep clone in Typescript
nniJobConfig
.
name
=
jobName
;
// Each taskRole will generate new command in NNI's command format
// Each command will be formatted to NNI style
for
(
const
taskRoleIndex
in
nniJobConfig
.
taskRoles
)
{
const
commands
=
nniJobConfig
.
taskRoles
[
taskRoleIndex
].
commands
const
nniTrialCommand
=
this
.
generateNNITrialCommand
(
trialJobDetail
,
commands
.
join
(
"
&&
"
).
replace
(
/
([
"'$`
\\])
/g
,
'
\\
$1
'
));
nniJobConfig
.
taskRoles
[
taskRoleIndex
].
commands
=
[
nniTrialCommand
]
}
}
else
{
nniJobConfig
=
{
protocolVersion
:
2
,
name
:
jobName
,
type
:
'
job
'
,
jobRetryCount
:
0
,
prerequisites
:
[
{
type
:
'
dockerimage
'
,
uri
:
this
.
paiTrialConfig
.
image
,
name
:
'
docker_image_0
'
}
],
taskRoles
:
{
taskrole
:
{
instances
:
1
,
completion
:
{
minFailedInstances
:
1
,
minSucceededInstances
:
-
1
},
taskRetryCount
:
0
,
dockerImage
:
'
docker_image_0
'
,
resourcePerInstance
:
{
gpu
:
this
.
paiTrialConfig
.
gpuNum
,
cpu
:
this
.
paiTrialConfig
.
cpuNum
,
memoryMB
:
this
.
paiTrialConfig
.
memoryMB
},
commands
:
[
this
.
generateNNITrialCommand
(
trialJobDetail
,
this
.
paiTrialConfig
.
command
)
]
}
},
extras
:
{
'
storages
'
:
[
{
name
:
this
.
paiTrialConfig
.
paiStorageConfigName
}
],
submitFrom
:
'
submit-job-v2
'
}
}
if
(
this
.
paiTrialConfig
.
virtualCluster
)
{
nniJobConfig
.
defaults
=
{
virtualCluster
:
this
.
paiTrialConfig
.
virtualCluster
}
}
}
return
yaml
.
safeDump
(
nniJobConfig
);
}
protected
async
submitTrialJobToPAI
(
trialJobId
:
string
):
Promise
<
boolean
>
{
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
const
trialJobDetail
:
PAITrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
if
(
trialJobDetail
===
undefined
)
{
throw
new
Error
(
`Failed to find PAITrialJobDetail for job
${
trialJobId
}
`
);
}
if
(
this
.
paiClusterConfig
===
undefined
)
{
throw
new
Error
(
'
PAI Cluster config is not initialized
'
);
}
if
(
this
.
paiTrialConfig
===
undefined
)
{
throw
new
Error
(
'
trial config is not initialized
'
);
}
if
(
this
.
paiToken
===
undefined
)
{
throw
new
Error
(
'
PAI token is not initialized
'
);
}
if
(
this
.
paiJobRestServer
===
undefined
)
{
throw
new
Error
(
'
paiJobRestServer is not initialized
'
);
}
// Make sure experiment code files is copied from local to NFS
if
(
this
.
copyExpCodeDirPromise
!==
undefined
)
{
await
this
.
copyExpCodeDirPromise
;
this
.
log
.
info
(
`Copy codeDir data finished.`
);
// All trials share same destination NFS code folder, only copy codeDir once for an experiment.
// After copy data finished, set copyExpCodeDirPromise be undefined to avoid log content duplicated.
this
.
copyExpCodeDirPromise
=
undefined
;
}
this
.
paiRestServerPort
=
this
.
paiJobRestServer
.
clusterRestServerPort
;
// Step 1. Prepare PAI job configuration
//create trial local working folder locally.
await
execMkdir
(
trialJobDetail
.
logPath
);
// Write NNI installation file to local files
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialJobDetail
.
logPath
,
'
install_nni.sh
'
),
CONTAINER_INSTALL_NNI_SHELL_FORMAT
,
{
encoding
:
'
utf8
'
});
// Write file content ( parameter.cfg ) to local working folders
if
(
trialJobDetail
.
form
!==
undefined
)
{
await
this
.
writeParameterFile
(
trialJobDetail
.
logPath
,
trialJobDetail
.
form
.
hyperParameters
);
}
//Generate Job Configuration in yaml format
const
paiJobConfig
=
this
.
generateJobConfigInYamlFormat
(
trialJobDetail
);
this
.
log
.
debug
(
paiJobConfig
);
// Step 2. Submit PAI job via Rest call
// Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API
const
submitJobRequest
:
request
.
Options
=
{
uri
:
`
${
this
.
protocol
}
://
${
this
.
paiClusterConfig
.
host
}
/rest-server/api/v2/jobs`
,
method
:
'
POST
'
,
body
:
paiJobConfig
,
followAllRedirects
:
true
,
headers
:
{
'
Content-Type
'
:
'
text/yaml
'
,
Authorization
:
`Bearer
${
this
.
paiToken
}
`
}
};
request
(
submitJobRequest
,
(
error
:
Error
,
response
:
request
.
Response
,
body
:
any
)
=>
{
// If submit success, will get status code 202. refer: https://github.com/microsoft/pai/blob/master/src/rest-server/docs/swagger.yaml
if
((
error
!==
undefined
&&
error
!==
null
)
||
response
.
statusCode
>=
400
)
{
const
errorMessage
:
string
=
(
error
!==
undefined
&&
error
!==
null
)
?
error
.
message
:
`Submit trial
${
trialJobId
}
failed, http code:
${
response
.
statusCode
}
, http body:
${
body
}
`
;
this
.
log
.
error
(
errorMessage
);
trialJobDetail
.
status
=
'
FAILED
'
;
deferred
.
reject
(
errorMessage
);
}
else
{
trialJobDetail
.
submitTime
=
Date
.
now
();
}
deferred
.
resolve
(
true
);
});
return
deferred
.
promise
;
}
private
async
writeParameterFile
(
directory
:
string
,
hyperParameters
:
HyperParameters
):
Promise
<
void
>
{
const
filepath
:
string
=
path
.
join
(
directory
,
generateParamFileName
(
hyperParameters
));
await
fs
.
promises
.
writeFile
(
filepath
,
hyperParameters
.
value
,
{
encoding
:
'
utf8
'
});
}
}
export
{
PAITrainingService
};
ts/nni_manager/training_service/pai/paiYarn/hdfsClientUtility.ts
deleted
100644 → 0
View file @
f11aea0d
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
import
*
as
fs
from
'
fs
'
;
import
*
as
path
from
'
path
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
getExperimentId
}
from
'
../../../common/experimentStartupInfo
'
;
import
{
getLogger
}
from
'
../../../common/log
'
;
import
{
unixPathJoin
}
from
'
../../../common/utils
'
;
/**
* HDFS client utility, including copy file/directory
*/
export
namespace
HDFSClientUtility
{
/**
* Get NNI experiment root directory
* @param hdfsUserName HDFS user name
*/
export
function
hdfsExpRootDir
(
hdfsUserName
:
string
):
string
{
return
'
/
'
+
unixPathJoin
(
hdfsUserName
,
'
nni
'
,
'
experiments
'
,
getExperimentId
());
}
/**
* Get NNI experiment code directory
* @param hdfsUserName HDFS user name
*/
export
function
getHdfsExpCodeDir
(
hdfsUserName
:
string
):
string
{
return
unixPathJoin
(
hdfsExpRootDir
(
hdfsUserName
),
'
codeDir
'
);
}
/**
* Get NNI trial working directory
* @param hdfsUserName HDFS user name
* @param trialId NNI trial ID
*/
export
function
getHdfsTrialWorkDir
(
hdfsUserName
:
string
,
trialId
:
string
):
string
{
const
root
:
string
=
hdfsExpRootDir
(
hdfsUserName
);
return
unixPathJoin
(
root
,
'
trials
'
,
trialId
);
}
/**
* Copy a local file to hdfs directory
*
* @param localFilePath local file path(source)
* @param hdfsFilePath hdfs file path(target)
* @param hdfsClient hdfs client
*/
export
async
function
copyFileToHdfs
(
localFilePath
:
string
,
hdfsFilePath
:
string
,
hdfsClient
:
any
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
fs
.
exists
(
localFilePath
,
(
exists
:
boolean
)
=>
{
// Detect if local file exist
if
(
exists
)
{
const
localFileStream
:
fs
.
ReadStream
=
fs
.
createReadStream
(
localFilePath
);
const
hdfsFileStream
:
any
=
hdfsClient
.
createWriteStream
(
hdfsFilePath
);
localFileStream
.
pipe
(
hdfsFileStream
);
hdfsFileStream
.
on
(
'
finish
'
,
()
=>
{
deferred
.
resolve
();
});
hdfsFileStream
.
on
(
'
error
'
,
(
err
:
any
)
=>
{
getLogger
()
.
error
(
`HDFSCientUtility:copyFileToHdfs, copy file failed, err is
${
err
.
message
}
`
);
deferred
.
reject
(
err
);
});
}
else
{
getLogger
()
.
error
(
`HDFSCientUtility:copyFileToHdfs,
${
localFilePath
}
doesn't exist locally`
);
deferred
.
reject
(
'
file not exist!
'
);
}
});
return
deferred
.
promise
;
}
/**
* Recursively copy local directory to hdfs directory
*
* @param localDirectory local directory
* @param hdfsDirectory HDFS directory
* @param hdfsClient HDFS client
*/
export
async
function
copyDirectoryToHdfs
(
localDirectory
:
string
,
hdfsDirectory
:
string
,
hdfsClient
:
any
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
// TODO: fs.readdirSync doesn't support ~($HOME)
const
fileNameArray
:
string
[]
=
fs
.
readdirSync
(
localDirectory
);
for
(
const
fileName
of
fileNameArray
)
{
const
fullFilePath
:
string
=
path
.
join
(
localDirectory
,
fileName
);
try
{
if
(
fs
.
lstatSync
(
fullFilePath
)
.
isFile
())
{
await
copyFileToHdfs
(
fullFilePath
,
path
.
join
(
hdfsDirectory
,
fileName
),
hdfsClient
);
}
else
{
// If filePath is a directory, recuisively copy it to remote directory
await
copyDirectoryToHdfs
(
fullFilePath
,
path
.
join
(
hdfsDirectory
,
fileName
),
hdfsClient
);
}
}
catch
(
error
)
{
deferred
.
reject
(
error
);
}
}
// All files/directories are copied successfully, resolve
deferred
.
resolve
();
return
deferred
.
promise
;
}
/**
* Check if an HDFS path already exists
*
* @param hdfsPath target path need to check in HDFS
* @param hdfsClient HDFS client
*/
export
async
function
pathExists
(
hdfsPath
:
string
,
hdfsClient
:
any
):
Promise
<
boolean
>
{
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
hdfsClient
.
exists
(
hdfsPath
,
(
exist
:
boolean
)
=>
{
deferred
.
resolve
(
exist
);
});
let
timeoutId
:
NodeJS
.
Timer
;
const
delayTimeout
:
Promise
<
boolean
>
=
new
Promise
<
boolean
>
((
resolve
:
Function
,
reject
:
Function
):
void
=>
{
// Set timeout and reject the promise once reach timeout (5 seconds)
timeoutId
=
setTimeout
(()
=>
{
reject
(
`Check HDFS path
${
hdfsPath
}
exists timeout`
);
},
5000
);
});
return
Promise
.
race
([
deferred
.
promise
,
delayTimeout
])
.
finally
(()
=>
{
clearTimeout
(
timeoutId
);
});
}
/**
* Read content from HDFS file
*
* @param hdfsPath HDFS file path
* @param hdfsClient HDFS client
*/
export
async
function
readFileFromHDFS
(
hdfsPath
:
string
,
hdfsClient
:
any
):
Promise
<
Buffer
>
{
const
deferred
:
Deferred
<
Buffer
>
=
new
Deferred
<
Buffer
>
();
let
buffer
:
Buffer
=
Buffer
.
alloc
(
0
);
const
exist
:
boolean
=
await
pathExists
(
hdfsPath
,
hdfsClient
);
if
(
!
exist
)
{
deferred
.
reject
(
`
${
hdfsPath
}
doesn't exists`
);
}
const
remoteFileStream
:
any
=
hdfsClient
.
createReadStream
(
hdfsPath
);
remoteFileStream
.
on
(
'
error
'
,
(
err
:
any
)
=>
{
// Reject with the error
deferred
.
reject
(
err
);
});
remoteFileStream
.
on
(
'
data
'
,
(
chunk
:
any
)
=>
{
// Concat the data chunk to buffer
buffer
=
Buffer
.
concat
([
buffer
,
chunk
]);
});
remoteFileStream
.
on
(
'
finish
'
,
()
=>
{
// Upload is done, resolve
deferred
.
resolve
(
buffer
);
});
return
deferred
.
promise
;
}
/**
* Mkdir in HDFS, use default permission 755
*
* @param hdfsPath the path in HDFS. It could be either file or directory
* @param hdfsClient HDFS client
*/
export
function
mkdir
(
hdfsPath
:
string
,
hdfsClient
:
any
):
Promise
<
boolean
>
{
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
hdfsClient
.
mkdir
(
hdfsPath
,
(
err
:
any
)
=>
{
if
(
!
err
)
{
deferred
.
resolve
(
true
);
}
else
{
deferred
.
reject
(
err
.
message
);
}
});
return
deferred
.
promise
;
}
/**
* Read directory contents
*
* @param hdfsPath the path in HDFS. It could be either file or directory
* @param hdfsClient HDFS client
*/
export
async
function
readdir
(
hdfsPath
:
string
,
hdfsClient
:
any
):
Promise
<
string
[]
>
{
const
deferred
:
Deferred
<
string
[]
>
=
new
Deferred
<
string
[]
>
();
const
exist
:
boolean
=
await
pathExists
(
hdfsPath
,
hdfsClient
);
if
(
!
exist
)
{
deferred
.
reject
(
`
${
hdfsPath
}
doesn't exists`
);
}
hdfsClient
.
readdir
(
hdfsPath
,
(
err
:
any
,
files
:
any
[])
=>
{
if
(
err
)
{
deferred
.
reject
(
err
);
}
deferred
.
resolve
(
files
);
});
return
deferred
.
promise
;
}
/**
* Delete HDFS path
* @param hdfsPath the path in HDFS. It could be either file or directory
* @param hdfsClient HDFS client
* @param recursive Mark if need to delete recursively
*/
export
function
deletePath
(
hdfsPath
:
string
,
hdfsClient
:
any
,
recursive
:
boolean
=
true
):
Promise
<
boolean
>
{
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
hdfsClient
.
unlink
(
hdfsPath
,
recursive
,
(
err
:
any
)
=>
{
if
(
!
err
)
{
deferred
.
resolve
(
true
);
}
else
{
deferred
.
reject
(
err
.
message
);
}
});
return
deferred
.
promise
;
}
}
ts/nni_manager/training_service/pai/paiYarn/paiYarnConfig.ts
deleted
100644 → 0
View file @
f11aea0d
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
'
use strict
'
;
import
{
TrialConfig
}
from
'
../../common/trialConfig
'
;
/**
* Task role for PAI
*/
export
class
PAITaskRole
{
// Name for the task role
public
readonly
name
:
string
;
// Number of tasks for the task role, no less than 1
public
readonly
taskNumber
:
number
;
// CPU number for one task in the task role, no less than 1
public
readonly
cpuNumber
:
number
;
// Memory for one task in the task role, no less than 100
public
readonly
memoryMB
:
number
;
// GPU number for one task in the task role, no less than 0
public
readonly
gpuNumber
:
number
;
// Executable command for tasks in the task role, can not be empty
public
readonly
command
:
string
;
//Shared memory for one task in the task role
public
readonly
shmMB
?:
number
;
//portList to specify the port used in container
public
portList
?:
PortListMetaData
[];
/**
* Constructor
* @param name Name for the task role
* @param taskNumber Number of tasks for the task role, no less than 1
* @param cpuNumber CPU number for one task in the task role, no less than 1
* @param memoryMB Memory for one task in the task role, no less than 100
* @param gpuNumber GPU number for one task in the task role, no less than 0
* @param command Executable command for tasks in the task role, can not be empty
*/
constructor
(
name
:
string
,
taskNumber
:
number
,
cpuNumber
:
number
,
memoryMB
:
number
,
gpuNumber
:
number
,
command
:
string
,
shmMB
?:
number
,
portList
?:
PortListMetaData
[])
{
this
.
name
=
name
;
this
.
taskNumber
=
taskNumber
;
this
.
cpuNumber
=
cpuNumber
;
this
.
memoryMB
=
memoryMB
;
this
.
gpuNumber
=
gpuNumber
;
this
.
command
=
command
;
this
.
shmMB
=
shmMB
;
this
.
portList
=
portList
;
}
}
/**
* Trial job configuration submitted to PAI
*/
export
class
PAIJobConfig
{
// Name for the job, need to be unique
public
readonly
jobName
:
string
;
// URL pointing to the Docker image for all tasks in the job
public
readonly
image
:
string
;
// Code directory on HDFS
public
readonly
codeDir
:
string
;
//authentication file used for private Docker registry
public
readonly
authFile
?:
string
;
// List of taskRole, one task role at least
public
taskRoles
:
PAITaskRole
[];
//The virtual cluster job runs on.
public
readonly
virtualCluster
:
string
;
/**
* Constructor
* @param jobName Name for the job, need to be unique
* @param image URL pointing to the Docker image for all tasks in the job
* @param dataDir Data directory existing on HDFS
* @param outputDir Output directory on HDFS
* @param taskRoles List of taskRole, one task role at least
*/
constructor
(
jobName
:
string
,
image
:
string
,
codeDir
:
string
,
taskRoles
:
PAITaskRole
[],
virtualCluster
:
string
,
authFile
?:
string
)
{
this
.
jobName
=
jobName
;
this
.
image
=
image
;
this
.
codeDir
=
codeDir
;
this
.
taskRoles
=
taskRoles
;
this
.
virtualCluster
=
virtualCluster
;
this
.
authFile
=
authFile
;
}
}
/**
* portList data structure used in PAI taskRole
*/
export
class
PortListMetaData
{
public
readonly
label
:
string
=
''
;
public
readonly
beginAt
:
number
=
0
;
public
readonly
portNumber
:
number
=
0
;
}
/**
* PAI trial configuration
*/
export
class
NNIPAITrialConfig
extends
TrialConfig
{
public
readonly
cpuNum
:
number
;
public
readonly
memoryMB
:
number
;
public
readonly
image
:
string
;
//The virtual cluster job runs on. If omitted, the job will run on default virtual cluster
public
virtualCluster
?:
string
;
//Shared memory for one task in the task role
public
shmMB
?:
number
;
//authentication file used for private Docker registry
public
authFile
?:
string
;
//portList to specify the port used in container
public
portList
?:
PortListMetaData
[];
constructor
(
command
:
string
,
codeDir
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
,
virtualCluster
?:
string
,
shmMB
?:
number
,
authFile
?:
string
,
portList
?:
PortListMetaData
[])
{
super
(
command
,
codeDir
,
gpuNum
);
this
.
cpuNum
=
cpuNum
;
this
.
memoryMB
=
memoryMB
;
this
.
image
=
image
;
this
.
virtualCluster
=
virtualCluster
;
this
.
shmMB
=
shmMB
;
this
.
authFile
=
authFile
;
this
.
portList
=
portList
;
}
}
ts/nni_manager/training_service/pai/paiYarn/paiYarnData.ts
deleted
100644 → 0
View file @
f11aea0d
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
'
use strict
'
;
export
const
PAI_INSTALL_NNI_SHELL_FORMAT
:
string
=
`#!/bin/bash
if python3 -c 'import nni' > /dev/null 2>&1; then
# nni module is already installed, skip
return
else
# Install nni
python3 -m pip install --user nni
fi`
;
export
const
PAI_TRIAL_COMMAND_FORMAT
:
string
=
`export NNI_PLATFORM=paiYarn NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4} MULTI_PHASE={5} \
&& cd $NNI_SYS_DIR && sh install_nni.sh \
&& python3 -m nni.tools.trial_tool.trial_keeper --trial_command '{6}' --nnimanager_ip '{7}' --nnimanager_port '{8}' \
--pai_hdfs_output_dir '{9}' --pai_hdfs_host '{10}' --pai_user_name {11} --nni_hdfs_exp_dir '{12}' --webhdfs_path '/webhdfs/api/v1' \
--nni_manager_version '{13}' --log_collection '{14}'`
;
export
const
PAI_LOG_PATH_FORMAT
:
string
=
`http://{0}/webhdfs/explorer.html#{1}`
;
ts/nni_manager/training_service/pai/paiYarn/paiYarnTrainingService.ts
deleted
100644 → 0
View file @
f11aea0d
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
'
use strict
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
path
from
'
path
'
;
import
*
as
request
from
'
request
'
;
import
*
as
component
from
'
../../../common/component
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
String
}
from
'
typescript-string-operations
'
;
import
{
HyperParameters
,
NNIManagerIpConfig
,
TrialJobApplicationForm
,
TrialJobDetail
}
from
'
../../../common/trainingService
'
;
import
{
generateParamFileName
,
getExperimentRootDir
,
getIPV4Address
,
getVersion
,
uniqueString
,
unixPathJoin
}
from
'
../../../common/utils
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../../common/containerJobData
'
;
import
{
TrialConfigMetadataKey
}
from
'
../../common/trialConfigMetadataKey
'
;
import
{
execMkdir
,
validateCodeDir
}
from
'
../../common/util
'
;
import
{
HDFSClientUtility
}
from
'
./hdfsClientUtility
'
;
import
{
NNIPAITrialConfig
,
PAIJobConfig
,
PAITaskRole
}
from
'
./paiYarnConfig
'
;
import
{
PAI_LOG_PATH_FORMAT
,
PAI_TRIAL_COMMAND_FORMAT
}
from
'
./paiYarnData
'
;
import
{
PAITrainingService
}
from
'
../paiTrainingService
'
;
import
{
PAIClusterConfig
,
PAITrialJobDetail
}
from
'
../paiConfig
'
;
import
*
as
WebHDFS
from
'
webhdfs
'
;
import
{
PAIJobRestServer
,
ParameterFileMeta
}
from
'
../paiJobRestServer
'
;
/**
* Training Service implementation for OpenPAI (Open Platform for AI)
* Refer https://github.com/Microsoft/pai for more info about OpenPAI
*/
@
component
.
Singleton
class
PAIYarnTrainingService
extends
PAITrainingService
{
private
hdfsClient
:
any
;
private
copyExpCodeDirPromise
?:
Promise
<
void
>
;
private
copyAuthFilePromise
?:
Promise
<
void
>
;
private
paiTrialConfig
?:
NNIPAITrialConfig
;
constructor
()
{
super
();
}
public
async
submitTrialJob
(
form
:
TrialJobApplicationForm
):
Promise
<
TrialJobDetail
>
{
if
(
this
.
paiClusterConfig
===
undefined
)
{
throw
new
Error
(
`paiBaseClusterConfig not initialized!`
);
}
this
.
log
.
info
(
`submitTrialJob: form:
${
JSON
.
stringify
(
form
)}
`
);
const
trialJobId
:
string
=
uniqueString
(
5
);
//TODO: use HDFS working folder instead
const
trialWorkingFolder
:
string
=
path
.
join
(
this
.
expRootDir
,
'
trials
'
,
trialJobId
);
const
paiJobName
:
string
=
`nni_exp_
${
this
.
experimentId
}
_trial_
${
trialJobId
}
`
;
const
hdfsCodeDir
:
string
=
HDFSClientUtility
.
getHdfsTrialWorkDir
(
this
.
paiClusterConfig
.
userName
,
trialJobId
);
const
hdfsOutputDir
:
string
=
unixPathJoin
(
hdfsCodeDir
,
'
nnioutput
'
);
const
hdfsLogPath
:
string
=
String
.
Format
(
PAI_LOG_PATH_FORMAT
,
this
.
paiClusterConfig
.
host
,
hdfsOutputDir
);
const
trialJobDetail
:
PAITrialJobDetail
=
new
PAITrialJobDetail
(
trialJobId
,
'
WAITING
'
,
paiJobName
,
Date
.
now
(),
trialWorkingFolder
,
form
,
hdfsLogPath
);
this
.
trialJobsMap
.
set
(
trialJobId
,
trialJobDetail
);
this
.
jobQueue
.
push
(
trialJobId
);
return
trialJobDetail
;
}
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
switch
(
key
)
{
case
TrialConfigMetadataKey
.
NNI_MANAGER_IP
:
this
.
nniManagerIpConfig
=
<
NNIManagerIpConfig
>
JSON
.
parse
(
value
);
break
;
case
TrialConfigMetadataKey
.
PAI_YARN_CLUSTER_CONFIG
:
this
.
paiJobRestServer
=
new
PAIJobRestServer
(
component
.
get
(
PAIYarnTrainingService
));
this
.
paiClusterConfig
=
<
PAIClusterConfig
>
JSON
.
parse
(
value
);
this
.
paiClusterConfig
.
host
=
this
.
formatPAIHost
(
this
.
paiClusterConfig
.
host
);
this
.
hdfsClient
=
WebHDFS
.
createClient
({
user
:
this
.
paiClusterConfig
.
userName
,
// Refer PAI document for Pylon mapping https://github.com/Microsoft/pai/tree/master/docs/pylon
port
:
80
,
path
:
'
/webhdfs/api/v1
'
,
host
:
this
.
paiClusterConfig
.
host
});
this
.
paiClusterConfig
.
host
=
this
.
formatPAIHost
(
this
.
paiClusterConfig
.
host
);
if
(
this
.
paiClusterConfig
.
passWord
)
{
// Get PAI authentication token
await
this
.
updatePaiToken
();
}
else
if
(
this
.
paiClusterConfig
.
token
)
{
this
.
paiToken
=
this
.
paiClusterConfig
.
token
;
}
else
{
throw
new
Error
(
'
pai cluster config format error, please set password or token!
'
);
}
break
;
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
if
(
this
.
paiClusterConfig
===
undefined
)
{
this
.
log
.
error
(
'
pai cluster config is not initialized
'
);
break
;
}
this
.
paiTrialConfig
=
<
NNIPAITrialConfig
>
JSON
.
parse
(
value
);
// Validate to make sure codeDir doesn't have too many files
await
validateCodeDir
(
this
.
paiTrialConfig
.
codeDir
);
// Copy experiment files from local folder to HDFS
this
.
copyExpCodeDirPromise
=
HDFSClientUtility
.
copyDirectoryToHdfs
(
this
.
paiTrialConfig
.
codeDir
,
HDFSClientUtility
.
getHdfsExpCodeDir
(
this
.
paiClusterConfig
.
userName
),
this
.
hdfsClient
);
// Upload authFile to hdfs
if
(
this
.
paiTrialConfig
.
authFile
)
{
this
.
authFileHdfsPath
=
unixPathJoin
(
HDFSClientUtility
.
hdfsExpRootDir
(
this
.
paiClusterConfig
.
userName
),
'
authFile
'
);
this
.
copyAuthFilePromise
=
HDFSClientUtility
.
copyFileToHdfs
(
this
.
paiTrialConfig
.
authFile
,
this
.
authFileHdfsPath
,
this
.
hdfsClient
);
}
break
;
case
TrialConfigMetadataKey
.
VERSION_CHECK
:
this
.
versionCheck
=
(
value
===
'
true
'
||
value
===
'
True
'
);
break
;
case
TrialConfigMetadataKey
.
LOG_COLLECTION
:
this
.
logCollection
=
value
;
break
;
case
TrialConfigMetadataKey
.
MULTI_PHASE
:
this
.
isMultiPhase
=
(
value
===
'
true
'
||
value
===
'
True
'
);
break
;
default
:
//Reject for unknown keys
throw
new
Error
(
`Uknown key:
${
key
}
`
);
}
}
protected
async
submitTrialJobToPAI
(
trialJobId
:
string
):
Promise
<
boolean
>
{
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
const
trialJobDetail
:
PAITrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
if
(
trialJobDetail
===
undefined
)
{
throw
new
Error
(
`Failed to find PAITrialJobDetail for job
${
trialJobId
}
`
);
}
if
(
this
.
paiClusterConfig
===
undefined
)
{
throw
new
Error
(
'
PAI Cluster config is not initialized
'
);
}
if
(
this
.
paiTrialConfig
===
undefined
)
{
throw
new
Error
(
'
trial config is not initialized
'
);
}
if
(
this
.
paiToken
===
undefined
)
{
throw
new
Error
(
'
PAI token is not initialized
'
);
}
if
(
this
.
paiJobRestServer
===
undefined
)
{
throw
new
Error
(
'
paiJobRestServer is not initialized
'
);
}
this
.
paiRestServerPort
=
this
.
paiJobRestServer
.
clusterRestServerPort
;
// Make sure experiment code files is copied from local to HDFS
if
(
this
.
copyExpCodeDirPromise
!==
undefined
)
{
await
this
.
copyExpCodeDirPromise
;
}
//Make sure authFile is copied from local to HDFS
if
(
this
.
paiTrialConfig
.
authFile
)
{
await
this
.
copyAuthFilePromise
;
}
// Step 1. Prepare PAI job configuration
const
trialLocalTempFolder
:
string
=
path
.
join
(
getExperimentRootDir
(),
'
trials-local
'
,
trialJobId
);
//create tmp trial working folder locally.
await
execMkdir
(
trialLocalTempFolder
);
const
runScriptContent
:
string
=
CONTAINER_INSTALL_NNI_SHELL_FORMAT
;
// Write NNI installation file to local tmp files
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
install_nni.sh
'
),
runScriptContent
,
{
encoding
:
'
utf8
'
});
// Write file content ( parameter.cfg ) to local tmp folders
if
(
trialJobDetail
.
form
!==
undefined
)
{
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
generateParamFileName
(
trialJobDetail
.
form
.
hyperParameters
)),
trialJobDetail
.
form
.
hyperParameters
.
value
,
{
encoding
:
'
utf8
'
}
);
}
const
hdfsCodeDir
:
string
=
HDFSClientUtility
.
getHdfsTrialWorkDir
(
this
.
paiClusterConfig
.
userName
,
trialJobId
);
const
hdfsOutputDir
:
string
=
unixPathJoin
(
hdfsCodeDir
,
'
nnioutput
'
);
const
nniManagerIp
:
string
=
this
.
nniManagerIpConfig
?
this
.
nniManagerIpConfig
.
nniManagerIp
:
getIPV4Address
();
const
version
:
string
=
this
.
versionCheck
?
await
getVersion
()
:
''
;
const
nniPaiTrialCommand
:
string
=
String
.
Format
(
PAI_TRIAL_COMMAND_FORMAT
,
// PAI will copy job's codeDir into /root directory
`$PWD/
${
trialJobId
}
`
,
`$PWD/
${
trialJobId
}
/nnioutput`
,
trialJobId
,
this
.
experimentId
,
trialJobDetail
.
form
.
sequenceId
,
this
.
isMultiPhase
,
this
.
paiTrialConfig
.
command
,
nniManagerIp
,
this
.
paiRestServerPort
,
hdfsOutputDir
,
this
.
paiClusterConfig
.
host
,
this
.
paiClusterConfig
.
userName
,
HDFSClientUtility
.
getHdfsExpCodeDir
(
this
.
paiClusterConfig
.
userName
),
version
,
this
.
logCollection
)
.
replace
(
/
\r\n
|
\n
|
\r
/gm
,
''
);
this
.
log
.
info
(
`nniPAItrial command is
${
nniPaiTrialCommand
.
trim
()}
`
);
const
paiTaskRoles
:
PAITaskRole
[]
=
[
new
PAITaskRole
(
`nni_trail_
${
trialJobId
}
`
,
// Task role number
1
,
// Task CPU number
this
.
paiTrialConfig
.
cpuNum
,
// Task memory
this
.
paiTrialConfig
.
memoryMB
,
// Task GPU number
this
.
paiTrialConfig
.
gpuNum
,
// Task command
nniPaiTrialCommand
,
// Task shared memory
this
.
paiTrialConfig
.
shmMB
,
// Task portList
this
.
paiTrialConfig
.
portList
)
];
const
paiJobConfig
:
PAIJobConfig
=
new
PAIJobConfig
(
// Job name
trialJobDetail
.
paiJobName
,
// Docker image
this
.
paiTrialConfig
.
image
,
// codeDir
`$PAI_DEFAULT_FS_URI
${
hdfsCodeDir
}
`
,
// PAI Task roles
paiTaskRoles
,
// Add Virutal Cluster
this
.
paiTrialConfig
.
virtualCluster
===
undefined
?
'
default
'
:
this
.
paiTrialConfig
.
virtualCluster
.
toString
(),
//Task auth File
this
.
authFileHdfsPath
);
// Step 2. Upload code files in codeDir onto HDFS
try
{
await
HDFSClientUtility
.
copyDirectoryToHdfs
(
trialLocalTempFolder
,
hdfsCodeDir
,
this
.
hdfsClient
);
}
catch
(
error
)
{
this
.
log
.
error
(
`PAI Training service: copy
${
this
.
paiTrialConfig
.
codeDir
}
to HDFS
${
hdfsCodeDir
}
failed, error is
${
error
}
`
);
trialJobDetail
.
status
=
'
FAILED
'
;
// eslint-disable-line require-atomic-updates
return
true
;
}
// Step 3. Submit PAI job via Rest call
// Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API
const
submitJobRequest
:
request
.
Options
=
{
uri
:
`
${
this
.
protocol
}
://
${
this
.
paiClusterConfig
.
host
}
/rest-server/api/v1/user/
${
this
.
paiClusterConfig
.
userName
}
/jobs`
,
method
:
'
POST
'
,
json
:
true
,
body
:
paiJobConfig
,
headers
:
{
'
Content-Type
'
:
'
application/json
'
,
Authorization
:
`Bearer
${
this
.
paiToken
}
`
}
};
request
(
submitJobRequest
,
(
error
:
Error
,
response
:
request
.
Response
,
_body
:
any
)
=>
{
if
((
error
!==
undefined
&&
error
!==
null
)
||
response
.
statusCode
>=
400
)
{
const
errorMessage
:
string
=
(
error
!==
undefined
&&
error
!==
null
)
?
error
.
message
:
`Submit trial
${
trialJobId
}
failed, http code:
${
response
.
statusCode
}
, http body:
${
response
.
body
.
message
}
`
;
this
.
log
.
error
(
errorMessage
);
trialJobDetail
.
status
=
'
FAILED
'
;
deferred
.
resolve
(
true
);
}
else
{
trialJobDetail
.
submitTime
=
Date
.
now
();
deferred
.
resolve
(
true
);
}
});
return
deferred
.
promise
;
}
public
async
updateTrialJob
(
trialJobId
:
string
,
form
:
TrialJobApplicationForm
):
Promise
<
TrialJobDetail
>
{
const
trialJobDetail
:
undefined
|
TrialJobDetail
=
this
.
trialJobsMap
.
get
(
trialJobId
);
if
(
trialJobDetail
===
undefined
)
{
throw
new
Error
(
`updateTrialJob failed:
${
trialJobId
}
not found`
);
}
await
this
.
writeParameterFile
(
trialJobId
,
form
.
hyperParameters
);
return
trialJobDetail
;
}
protected
async
writeParameterFile
(
trialJobId
:
string
,
hyperParameters
:
HyperParameters
):
Promise
<
void
>
{
if
(
this
.
paiClusterConfig
===
undefined
)
{
throw
new
Error
(
'
PAI Cluster config is not initialized
'
);
}
if
(
this
.
paiTrialConfig
===
undefined
)
{
throw
new
Error
(
'
PAI trial config is not initialized
'
);
}
const
trialLocalTempFolder
:
string
=
path
.
join
(
getExperimentRootDir
(),
'
trials-local
'
,
trialJobId
);
const
hpFileName
:
string
=
generateParamFileName
(
hyperParameters
);
const
localFilepath
:
string
=
path
.
join
(
trialLocalTempFolder
,
hpFileName
);
await
fs
.
promises
.
writeFile
(
localFilepath
,
hyperParameters
.
value
,
{
encoding
:
'
utf8
'
});
const
hdfsCodeDir
:
string
=
HDFSClientUtility
.
getHdfsTrialWorkDir
(
this
.
paiClusterConfig
.
userName
,
trialJobId
);
const
hdfsHpFilePath
:
string
=
path
.
join
(
hdfsCodeDir
,
hpFileName
);
await
HDFSClientUtility
.
copyFileToHdfs
(
localFilepath
,
hdfsHpFilePath
,
this
.
hdfsClient
);
await
this
.
postParameterFileMeta
({
experimentId
:
this
.
experimentId
,
trialId
:
trialJobId
,
filePath
:
hdfsHpFilePath
});
}
protected
postParameterFileMeta
(
parameterFileMeta
:
ParameterFileMeta
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
if
(
this
.
paiJobRestServer
===
undefined
)
{
throw
new
Error
(
'
paiJobRestServer not implemented!
'
);
}
const
req
:
request
.
Options
=
{
uri
:
`
${
this
.
paiJobRestServer
.
endPoint
}${
this
.
paiJobRestServer
.
apiRootUrl
}
/parameter-file-meta`
,
method
:
'
POST
'
,
json
:
true
,
body
:
parameterFileMeta
};
request
(
req
,
(
err
:
Error
,
_res
:
request
.
Response
)
=>
{
if
(
err
)
{
deferred
.
reject
(
err
);
}
else
{
deferred
.
resolve
();
}
});
return
deferred
.
promise
;
}
}
export
{
PAIYarnTrainingService
};
ts/nni_manager/training_service/pai/paiYarn/paiYarnTrialConfig.ts
deleted
100644 → 0
View file @
f11aea0d
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
'
use strict
'
;
import
{
TrialConfig
}
from
'
../../common/trialConfig
'
;
/**
* PAI configuration to run trials
*/
export
class
PAIYarnTrialConfig
extends
TrialConfig
{
public
readonly
cpuNum
:
number
;
public
readonly
memoryMB
:
number
;
public
readonly
image
:
string
;
public
readonly
dataDir
:
string
;
public
readonly
outputDir
:
string
;
constructor
(
command
:
string
,
codeDir
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
,
dataDir
:
string
,
outputDir
:
string
)
{
super
(
command
,
codeDir
,
gpuNum
);
this
.
cpuNum
=
cpuNum
;
this
.
memoryMB
=
memoryMB
;
this
.
image
=
image
;
this
.
dataDir
=
dataDir
;
this
.
outputDir
=
outputDir
;
}
}
ts/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts
View file @
6f272049
...
...
@@ -12,7 +12,7 @@ import { getExperimentId } from '../../../common/experimentStartupInfo';
import
{
getLogger
,
Logger
}
from
'
../../../common/log
'
;
import
{
TrialConfigMetadataKey
}
from
'
../../common/trialConfigMetadataKey
'
;
import
{
PAIClusterConfig
}
from
'
../../pai/paiConfig
'
;
import
{
NNIPAI
K8S
TrialConfig
}
from
'
../../pai/pai
K8S/paiK8S
Config
'
;
import
{
NNIPAITrialConfig
}
from
'
../../pai/paiConfig
'
;
import
{
EnvironmentInformation
,
EnvironmentService
}
from
'
../environment
'
;
import
{
StorageService
}
from
'
../storageService
'
;
...
...
@@ -25,7 +25,7 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
private
readonly
log
:
Logger
=
getLogger
();
private
paiClusterConfig
:
PAIClusterConfig
|
undefined
;
private
paiTrialConfig
:
NNIPAI
K8S
TrialConfig
|
undefined
;
private
paiTrialConfig
:
NNIPAITrialConfig
|
undefined
;
private
paiJobConfig
:
any
;
private
paiToken
?:
string
;
private
protocol
:
string
=
'
http
'
;
...
...
@@ -62,7 +62,7 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
this
.
log
.
error
(
'
pai cluster config is not initialized
'
);
break
;
}
this
.
paiTrialConfig
=
<
NNIPAI
K8S
TrialConfig
>
JSON
.
parse
(
value
);
this
.
paiTrialConfig
=
<
NNIPAITrialConfig
>
JSON
.
parse
(
value
);
// Validate to make sure codeDir doesn't have too many files
const
storageService
=
component
.
get
<
StorageService
>
(
StorageService
);
...
...
ts/nni_manager/training_service/reusable/routerTrainingService.ts
View file @
6f272049
...
...
@@ -11,7 +11,7 @@ import { TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetri
import
{
delay
}
from
'
../../common/utils
'
;
import
{
TrialConfigMetadataKey
}
from
'
../common/trialConfigMetadataKey
'
;
import
{
PAIClusterConfig
}
from
'
../pai/paiConfig
'
;
import
{
PAI
K8S
TrainingService
}
from
'
../pai/pai
K8S/paiK8S
TrainingService
'
;
import
{
PAITrainingService
}
from
'
../pai/paiTrainingService
'
;
import
{
RemoteMachineTrainingService
}
from
'
../remote_machine/remoteMachineTrainingService
'
;
import
{
MountedStorageService
}
from
'
./storages/mountedStorageService
'
;
import
{
StorageService
}
from
'
./storageService
'
;
...
...
@@ -131,7 +131,7 @@ class RouterTrainingService implements TrainingService {
await
this
.
internalTrainingService
.
setClusterMetadata
(
'
platform_list
'
,
'
pai
'
);
}
else
{
this
.
log
.
debug
(
`caching metadata key:{} value:{}, as training service is not determined.`
);
this
.
internalTrainingService
=
component
.
get
(
PAI
K8S
TrainingService
);
this
.
internalTrainingService
=
component
.
get
(
PAITrainingService
);
}
}
else
if
(
key
===
TrialConfigMetadataKey
.
AML_CLUSTER_CONFIG
)
{
this
.
internalTrainingService
=
component
.
get
(
TrialDispatcher
);
...
...
ts/nni_manager/training_service/test/hdfsClientUtility.test.ts
deleted
100644 → 0
View file @
f11aea0d
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
'
use strict
'
;
import
*
as
chai
from
'
chai
'
;
import
*
as
chaiAsPromised
from
'
chai-as-promised
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
os
from
'
os
'
;
import
*
as
path
from
'
path
'
;
import
*
as
tmp
from
'
tmp
'
;
import
{
cleanupUnitTest
,
prepareUnitTest
,
uniqueString
}
from
'
../../common/utils
'
;
import
{
HDFSClientUtility
}
from
'
../pai/paiYarn/hdfsClientUtility
'
;
var
WebHDFS
=
require
(
'
webhdfs
'
);
var
rmdir
=
require
(
'
rmdir
'
);
describe
(
'
WebHDFS
'
,
function
()
{
/*
To enable web HDFS client unit test, HDFS information needs to be configured in:
Default/.vscode/hdfsInfo.json, whose content looks like:
{
"user": "user1",
"port": 50070,
"host": "10.0.0.0"
}
*/
let
skip
:
boolean
=
false
;
let
testHDFSInfo
:
any
;
let
hdfsClient
:
any
;
try
{
testHDFSInfo
=
JSON
.
parse
(
fs
.
readFileSync
(
'
../../.vscode/hdfsInfo.json
'
,
'
utf8
'
));
console
.
log
(
testHDFSInfo
);
hdfsClient
=
WebHDFS
.
createClient
({
user
:
testHDFSInfo
.
user
,
port
:
testHDFSInfo
.
port
,
host
:
testHDFSInfo
.
host
});
}
catch
(
err
)
{
console
.
log
(
'
Please configure rminfo.json to enable remote machine unit test.
'
);
skip
=
true
;
}
before
(()
=>
{
chai
.
should
();
chai
.
use
(
chaiAsPromised
);
tmp
.
setGracefulCleanup
();
prepareUnitTest
();
});
after
(()
=>
{
cleanupUnitTest
();
});
it
(
'
Test HDFS utility path functions
'
,
async
()
=>
{
if
(
skip
)
{
return
;
}
const
testPath
:
string
=
'
/nni_unittest_
'
+
uniqueString
(
6
);
let
exists
:
boolean
=
await
HDFSClientUtility
.
pathExists
(
testPath
,
hdfsClient
);
// The new random named path is expected to not exist
chai
.
expect
(
exists
).
to
.
be
.
equals
(
false
);
const
mkdirResult
:
boolean
=
await
HDFSClientUtility
.
mkdir
(
testPath
,
hdfsClient
);
// Mkdir is expected to be successful
chai
.
expect
(
mkdirResult
).
to
.
be
.
equals
(
true
);
exists
=
await
HDFSClientUtility
.
pathExists
(
testPath
,
hdfsClient
);
// The newly created path is expected to exist
chai
.
expect
(
exists
).
to
.
be
.
equals
(
true
);
const
deleteResult
:
boolean
=
await
HDFSClientUtility
.
deletePath
(
testPath
,
hdfsClient
);
// Delete path is expected to be successful
chai
.
expect
(
deleteResult
).
to
.
be
.
equals
(
true
);
exists
=
await
HDFSClientUtility
.
pathExists
(
testPath
,
hdfsClient
);
// The deleted path is not expected to exist
chai
.
expect
(
exists
).
to
.
be
.
equals
(
false
);
});
it
(
'
Test HDFS utility copyFileToHdfs
'
,
async
()
=>
{
if
(
skip
)
{
return
;
}
// Prepare local directory and files
const
tmpLocalDirectoryPath
:
string
=
path
.
join
(
os
.
tmpdir
(),
'
nni_unittest_dir_
'
+
uniqueString
(
6
));
const
tmpDataFilePath
:
string
=
path
.
join
(
tmpLocalDirectoryPath
,
'
file_
'
+
uniqueString
(
6
));
const
testFileData
:
string
=
'
TestContent123
'
;
fs
.
mkdirSync
(
tmpLocalDirectoryPath
);
fs
.
writeFileSync
(
tmpDataFilePath
,
testFileData
);
const
testHDFSFilePath
:
string
=
'
/nni_unittest_
'
+
uniqueString
(
6
);
let
exists
:
boolean
=
await
HDFSClientUtility
.
pathExists
(
testHDFSFilePath
,
hdfsClient
);
// The new random named path is expected to not exist
chai
.
expect
(
exists
).
to
.
be
.
equals
(
false
);
await
HDFSClientUtility
.
copyFileToHdfs
(
tmpDataFilePath
,
testHDFSFilePath
,
hdfsClient
);
exists
=
await
HDFSClientUtility
.
pathExists
(
testHDFSFilePath
,
hdfsClient
);
// After copy local file to HDFS, the target file path in HDFS is expected to exist
chai
.
expect
(
exists
).
to
.
be
.
equals
(
true
);
const
buffer
:
Buffer
=
await
HDFSClientUtility
.
readFileFromHDFS
(
testHDFSFilePath
,
hdfsClient
);
const
actualFileData
:
string
=
buffer
.
toString
(
'
utf8
'
);
// The file content read from HDFS is expected to equal to the content of local file
chai
.
expect
(
actualFileData
).
to
.
be
.
equals
(
testFileData
);
const
testHDFSDirPath
:
string
=
path
.
join
(
'
/nni_unittest_
'
+
uniqueString
(
6
)
+
'
_dir
'
);
await
HDFSClientUtility
.
copyDirectoryToHdfs
(
tmpLocalDirectoryPath
,
testHDFSDirPath
,
hdfsClient
);
const
files
:
any
[]
=
await
HDFSClientUtility
.
readdir
(
testHDFSDirPath
,
hdfsClient
);
// Expected file count under HDFS target directory is 1
chai
.
expect
(
files
.
length
).
to
.
be
.
equals
(
1
);
// Expected file name under HDFS target directory is equal to local file name
chai
.
expect
(
files
[
0
].
pathSuffix
).
to
.
be
.
equals
(
path
.
parse
(
tmpDataFilePath
).
base
);
// Cleanup
rmdir
(
tmpLocalDirectoryPath
);
let
deleteRestult
:
boolean
=
await
HDFSClientUtility
.
deletePath
(
testHDFSFilePath
,
hdfsClient
);
chai
.
expect
(
deleteRestult
).
to
.
be
.
equals
(
true
);
deleteRestult
=
await
HDFSClientUtility
.
deletePath
(
testHDFSDirPath
,
hdfsClient
);
chai
.
expect
(
deleteRestult
).
to
.
be
.
equals
(
true
);
});
});
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment