Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
22993e5d
Commit
22993e5d
authored
Jun 20, 2019
by
demianzhang
Committed by
chicm-ms
Jun 20, 2019
Browse files
Pass tslint for training service (#1177)
* fix local and remote training services tslint
parent
ae7a72bc
Changes
44
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
888 additions
and
740 deletions
+888
-740
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts
...ng_service/kubernetes/kubeflow/kubeflowTrainingService.ts
+198
-185
src/nni_manager/training_service/kubernetes/kubernetesApiClient.ts
...anager/training_service/kubernetes/kubernetesApiClient.ts
+29
-20
src/nni_manager/training_service/kubernetes/kubernetesConfig.ts
...i_manager/training_service/kubernetes/kubernetesConfig.ts
+33
-27
src/nni_manager/training_service/kubernetes/kubernetesData.ts
...nni_manager/training_service/kubernetes/kubernetesData.ts
+3
-4
src/nni_manager/training_service/kubernetes/kubernetesJobInfoCollector.ts
...training_service/kubernetes/kubernetesJobInfoCollector.ts
+7
-8
src/nni_manager/training_service/kubernetes/kubernetesJobRestServer.ts
...er/training_service/kubernetes/kubernetesJobRestServer.ts
+7
-6
src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts
.../training_service/kubernetes/kubernetesTrainingService.ts
+146
-124
src/nni_manager/training_service/local/gpuScheduler.ts
src/nni_manager/training_service/local/gpuScheduler.ts
+28
-26
src/nni_manager/training_service/local/localTrainingService.ts
...ni_manager/training_service/local/localTrainingService.ts
+49
-45
src/nni_manager/training_service/pai/hdfsClientUtility.ts
src/nni_manager/training_service/pai/hdfsClientUtility.ts
+43
-31
src/nni_manager/training_service/pai/paiConfig.ts
src/nni_manager/training_service/pai/paiConfig.ts
+20
-8
src/nni_manager/training_service/pai/paiData.ts
src/nni_manager/training_service/pai/paiData.ts
+12
-7
src/nni_manager/training_service/pai/paiJobInfoCollector.ts
src/nni_manager/training_service/pai/paiJobInfoCollector.ts
+30
-22
src/nni_manager/training_service/pai/paiJobRestServer.ts
src/nni_manager/training_service/pai/paiJobRestServer.ts
+5
-4
src/nni_manager/training_service/pai/paiTrainingService.ts
src/nni_manager/training_service/pai/paiTrainingService.ts
+63
-51
src/nni_manager/training_service/pai/paiTrialConfig.ts
src/nni_manager/training_service/pai/paiTrialConfig.ts
+8
-4
src/nni_manager/training_service/remote_machine/gpuScheduler.ts
...i_manager/training_service/remote_machine/gpuScheduler.ts
+31
-23
src/nni_manager/training_service/remote_machine/remoteMachineData.ts
...ager/training_service/remote_machine/remoteMachineData.ts
+71
-63
src/nni_manager/training_service/remote_machine/remoteMachineJobRestServer.ts
...ning_service/remote_machine/remoteMachineJobRestServer.ts
+5
-4
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
...ng_service/remote_machine/remoteMachineTrainingService.ts
+100
-78
No files found.
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts
View file @
22993e5d
...
...
@@ -17,35 +17,34 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'
use strict
'
'
use strict
'
;
import
*
as
assert
from
'
assert
'
;
import
*
as
component
from
'
../../../common/component
'
;
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
path
from
'
path
'
;
import
*
as
component
from
'
../../../common/component
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../../common/containerJobData
'
;
import
{
getExperimentId
}
from
'
../../../common/experimentStartupInfo
'
;
import
{
TrialConfigMetadataKey
}
from
'
../../common/trialConfigMetadataKey
'
;
import
{
JobApplicationForm
,
TrialJobApplicationForm
,
TrialJobDetail
,
NNIManagerIpConfig
JobApplicationForm
,
NNIManagerIpConfig
,
TrialJobApplicationForm
,
TrialJobDetail
}
from
'
../../../common/trainingService
'
;
import
{
delay
,
generateParamFileName
,
getExperimentRootDir
,
uniqueString
}
from
'
../../../common/utils
'
;
import
{
KubeflowClusterConfigNFS
,
KubeflowClusterConfigAzure
,
KubeflowTrialConfigPytorch
,
KubeflowTrialConfigTensorflow
,
KubeflowClusterConfigFactory
,
KubeflowTrialConfigFactory
,
KubeflowTrialConfig
,
KubeflowClusterConfig
}
from
'
./kubeflowConfig
'
;
import
{
NFSConfig
}
from
'
../kubernetesConfig
'
import
{
KubernetesTrialJobDetail
}
from
'
../kubernetesData
'
;
import
{
KubeflowJobRestServer
}
from
'
./kubeflowJobRestServer
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../../common/containerJobData
'
;
import
{
TrialConfigMetadataKey
}
from
'
../../common/trialConfigMetadataKey
'
;
import
{
validateCodeDir
}
from
'
../../common/util
'
;
import
{
AzureStorageClientUtility
}
from
'
../azureStorageClientUtils
'
;
import
{
NFSConfig
}
from
'
../kubernetesConfig
'
;
import
{
KubernetesTrialJobDetail
}
from
'
../kubernetesData
'
;
import
{
KubernetesTrainingService
}
from
'
../kubernetesTrainingService
'
;
import
{
KubeflowOperatorClient
}
from
'
./kubeflowApiClient
'
;
import
{
KubernetesTrainingService
}
from
'
../kubernetesTrainingService
'
import
{
KubeflowClusterConfig
,
KubeflowClusterConfigAzure
,
KubeflowClusterConfigFactory
,
KubeflowClusterConfigNFS
,
KubeflowTrialConfig
,
KubeflowTrialConfigFactory
,
KubeflowTrialConfigPytorch
,
KubeflowTrialConfigTensorflow
}
from
'
./kubeflowConfig
'
;
import
{
KubeflowJobInfoCollector
}
from
'
./kubeflowJobInfoCollector
'
;
import
{
KubeflowJobRestServer
}
from
'
./kubeflowJobRestServer
'
;
// tslint:disable: no-unsafe-any no-any
/**
* Training Service implementation for Kubeflow
* Refer to https://github.com/kubeflow/kubeflow for more info about Kubeflow
...
...
@@ -54,7 +53,7 @@ import { KubeflowJobInfoCollector } from './kubeflowJobInfoCollector';
class
KubeflowTrainingService
extends
KubernetesTrainingService
implements
KubernetesTrainingService
{
private
kubeflowClusterConfig
?:
KubeflowClusterConfig
;
private
kubeflowTrialConfig
?:
KubeflowTrialConfig
;
private
kubeflowJobInfoCollector
:
KubeflowJobInfoCollector
;
private
readonly
kubeflowJobInfoCollector
:
KubeflowJobInfoCollector
;
constructor
()
{
super
();
...
...
@@ -67,7 +66,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
public
async
run
():
Promise
<
void
>
{
this
.
log
.
info
(
'
Run Kubeflow training service.
'
);
this
.
kubernetesJobRestServer
=
component
.
get
(
KubeflowJobRestServer
);
if
(
!
this
.
kubernetesJobRestServer
)
{
if
(
this
.
kubernetesJobRestServer
===
undefined
)
{
throw
new
Error
(
'
kubernetesJobRestServer not initialized!
'
);
}
await
this
.
kubernetesJobRestServer
.
start
();
...
...
@@ -77,7 +76,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
// collect metrics for Kubeflow jobs by interacting with Kubernetes API server
await
delay
(
3000
);
await
this
.
kubeflowJobInfoCollector
.
retrieveTrialStatus
(
this
.
kubernetesCRDClient
);
if
(
this
.
kubernetesJobRestServer
.
getErrorMessage
)
{
if
(
this
.
kubernetesJobRestServer
.
getErrorMessage
!==
undefined
)
{
throw
new
Error
(
this
.
kubernetesJobRestServer
.
getErrorMessage
);
this
.
stopping
=
true
;
}
...
...
@@ -86,17 +85,17 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
}
public
async
submitTrialJob
(
form
:
JobApplicationForm
):
Promise
<
TrialJobDetail
>
{
if
(
!
this
.
kubernetesCRDClient
)
{
if
(
this
.
kubernetesCRDClient
===
undefined
)
{
throw
new
Error
(
'
Kubeflow job operator client is undefined
'
);
}
if
(
!
this
.
kubernetesRestServerPort
)
{
if
(
this
.
kubernetesRestServerPort
===
undefined
)
{
const
restServer
:
KubeflowJobRestServer
=
component
.
get
(
KubeflowJobRestServer
);
this
.
kubernetesRestServerPort
=
restServer
.
clusterRestServerPort
;
}
const
trialJobId
:
string
=
uniqueString
(
5
);
const
trialWorkingFolder
:
string
=
path
.
join
(
this
.
CONTAINER_MOUNT_PATH
,
'
nni
'
,
getExperimentId
(),
trialJobId
);
const
kubeflowJobName
=
`nni-exp-
${
this
.
experimentId
}
-trial-
${
trialJobId
}
`
.
toLowerCase
();
const
kubeflowJobName
:
string
=
`nni-exp-
${
this
.
experimentId
}
-trial-
${
trialJobId
}
`
.
toLowerCase
();
const
curTrialSequenceId
:
number
=
this
.
generateSequenceId
();
const
trialLocalTempFolder
:
string
=
path
.
join
(
getExperimentRootDir
(),
'
trials-local
'
,
trialJobId
);
//prepare the runscript
...
...
@@ -125,6 +124,72 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
return
Promise
.
resolve
(
trialJobDetail
);
}
// tslint:disable:no-redundant-jsdoc
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
switch
(
key
)
{
case
TrialConfigMetadataKey
.
NNI_MANAGER_IP
:
this
.
nniManagerIpConfig
=
<
NNIManagerIpConfig
>
JSON
.
parse
(
value
);
break
;
case
TrialConfigMetadataKey
.
KUBEFLOW_CLUSTER_CONFIG
:
const
kubeflowClusterJsonObject
:
object
=
JSON
.
parse
(
value
);
this
.
kubeflowClusterConfig
=
KubeflowClusterConfigFactory
.
generateKubeflowClusterConfig
(
kubeflowClusterJsonObject
);
if
(
this
.
kubeflowClusterConfig
.
storageType
===
'
azureStorage
'
)
{
const
azureKubeflowClusterConfig
:
KubeflowClusterConfigAzure
=
<
KubeflowClusterConfigAzure
>
this
.
kubeflowClusterConfig
;
this
.
azureStorageAccountName
=
azureKubeflowClusterConfig
.
azureStorage
.
accountName
;
this
.
azureStorageShare
=
azureKubeflowClusterConfig
.
azureStorage
.
azureShare
;
await
this
.
createAzureStorage
(
azureKubeflowClusterConfig
.
keyVault
.
vaultName
,
azureKubeflowClusterConfig
.
keyVault
.
name
,
azureKubeflowClusterConfig
.
azureStorage
.
accountName
,
azureKubeflowClusterConfig
.
azureStorage
.
azureShare
);
}
else
if
(
this
.
kubeflowClusterConfig
.
storageType
===
'
nfs
'
)
{
const
nfsKubeflowClusterConfig
:
KubeflowClusterConfigNFS
=
<
KubeflowClusterConfigNFS
>
this
.
kubeflowClusterConfig
;
await
this
.
createNFSStorage
(
nfsKubeflowClusterConfig
.
nfs
.
server
,
nfsKubeflowClusterConfig
.
nfs
.
path
);
}
this
.
kubernetesCRDClient
=
KubeflowOperatorClient
.
generateOperatorClient
(
this
.
kubeflowClusterConfig
.
operator
,
this
.
kubeflowClusterConfig
.
apiVersion
);
break
;
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
if
(
this
.
kubeflowClusterConfig
===
undefined
)
{
this
.
log
.
error
(
'
kubeflow cluster config is not initialized
'
);
return
Promise
.
reject
(
new
Error
(
'
kubeflow cluster config is not initialized
'
));
}
assert
(
this
.
kubeflowClusterConfig
!==
undefined
);
const
kubeflowTrialJsonObjsect
:
object
=
JSON
.
parse
(
value
);
this
.
kubeflowTrialConfig
=
KubeflowTrialConfigFactory
.
generateKubeflowTrialConfig
(
kubeflowTrialJsonObjsect
,
this
.
kubeflowClusterConfig
.
operator
);
// Validate to make sure codeDir doesn't have too many files
try
{
await
validateCodeDir
(
this
.
kubeflowTrialConfig
.
codeDir
);
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
return
Promise
.
reject
(
new
Error
(
error
));
}
break
;
case
TrialConfigMetadataKey
.
VERSION_CHECK
:
this
.
versionCheck
=
(
value
===
'
true
'
||
value
===
'
True
'
);
break
;
case
TrialConfigMetadataKey
.
LOG_COLLECTION
:
this
.
logCollection
=
value
;
break
;
default
:
}
return
Promise
.
resolve
();
}
/**
* upload code files to nfs or azureStorage
* @param trialJobId
...
...
@@ -132,209 +197,156 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
* return: trialJobOutputUrl
*/
private
async
uploadCodeFiles
(
trialJobId
:
string
,
trialLocalTempFolder
:
string
):
Promise
<
string
>
{
if
(
!
this
.
kubeflowClusterConfig
)
{
if
(
this
.
kubeflowClusterConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
}
let
trialJobOutputUrl
:
string
=
''
;
assert
(
!
this
.
kubeflowClusterConfig
.
storage
assert
(
this
.
kubeflowClusterConfig
.
storage
===
undefined
||
this
.
kubeflowClusterConfig
.
storage
===
'
azureStorage
'
||
this
.
kubeflowClusterConfig
.
storage
===
'
nfs
'
);
if
(
this
.
kubeflowClusterConfig
.
storage
===
'
azureStorage
'
)
{
try
{
if
(
this
.
kubeflowClusterConfig
.
storage
===
'
azureStorage
'
)
{
if
(
this
.
azureStorageClient
===
undefined
)
{
throw
new
Error
(
'
azureStorageClient is not initialized
'
);
}
try
{
//upload local files to azure storage
await
AzureStorageClientUtility
.
uploadDirectory
(
this
.
azureStorageClient
,
`nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
,
this
.
azureStorageShare
,
`
${
trialLocalTempFolder
}
`
);
`nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
,
this
.
azureStorageShare
,
`
${
trialLocalTempFolder
}
`
);
trialJobOutputUrl
=
`https://
${
this
.
azureStorageAccountName
}
.file.core.windows.net/
${
this
.
azureStorageShare
}
/
${
path
.
join
(
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
}
catch
(
error
){
trialJobOutputUrl
=
`https://
${
this
.
azureStorageAccountName
}
.file.core.windows.net/
${
this
.
azureStorageShare
}
\
/
${
path
.
join
(
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
;
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
return
Promise
.
reject
(
error
);
}
}
else
if
(
this
.
kubeflowClusterConfig
.
storage
===
'
nfs
'
||
this
.
kubeflowClusterConfig
.
storage
===
undefined
)
{
le
t
nfsKubeflowClusterConfig
:
KubeflowClusterConfigNFS
=
<
KubeflowClusterConfigNFS
>
this
.
kubeflowClusterConfig
;
}
else
if
(
this
.
kubeflowClusterConfig
.
storage
===
'
nfs
'
||
this
.
kubeflowClusterConfig
.
storage
===
undefined
)
{
cons
t
nfsKubeflowClusterConfig
:
KubeflowClusterConfigNFS
=
<
KubeflowClusterConfigNFS
>
this
.
kubeflowClusterConfig
;
// Create work dir for current trial in NFS directory
await
cpp
.
exec
(
`mkdir -p
${
this
.
trialLocalNFSTempFolder
}
/nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
);
// Copy code files from local dir to NFS mounted dir
await
cpp
.
exec
(
`cp -r
${
trialLocalTempFolder
}
/*
${
this
.
trialLocalNFSTempFolder
}
/nni/
${
getExperimentId
()}
/
${
trialJobId
}
/.`
);
const
nfsConfig
:
NFSConfig
=
nfsKubeflowClusterConfig
.
nfs
;
trialJobOutputUrl
=
`nfs://
${
nfsConfig
.
server
}
:
${
path
.
join
(
nfsConfig
.
path
,
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
trialJobOutputUrl
=
`nfs://
${
nfsConfig
.
server
}
:
${
path
.
join
(
nfsConfig
.
path
,
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
;
}
return
Promise
.
resolve
(
trialJobOutputUrl
);
}
private
async
prepareRunScript
(
trialLocalTempFolder
:
string
,
trialJobId
:
string
,
trialWorkingFolder
:
string
,
curTrialSequenceId
:
number
,
form
:
JobApplicationForm
):
Promise
<
void
>
{
if
(
!
this
.
kubeflowClusterConfig
)
{
private
async
prepareRunScript
(
trialLocalTempFolder
:
string
,
trialJobId
:
string
,
trialWorkingFolder
:
string
,
curTrialSequenceId
:
number
,
form
:
JobApplicationForm
):
Promise
<
void
>
{
if
(
this
.
kubeflowClusterConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
}
// initialize kubeflow trial config to specific type
let
kubeflowTrialConfig
;
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
let
kubeflowTrialConfig
:
any
;
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
kubeflowTrialConfig
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
}
else
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
pytorch-operator
'
){
}
else
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
pytorch-operator
'
)
{
kubeflowTrialConfig
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
}
else
{
throw
Error
(
`operator
${
this
.
kubeflowClusterConfig
.
operator
}
is invalid`
)
}
else
{
throw
Error
(
`operator
${
this
.
kubeflowClusterConfig
.
operator
}
is invalid`
)
;
}
//create tmp trial working folder locally.
await
cpp
.
exec
(
`mkdir -p
${
path
.
dirname
(
trialLocalTempFolder
)}
`
);
await
cpp
.
exec
(
`cp -r
${
kubeflowTrialConfig
.
codeDir
}
${
trialLocalTempFolder
}
`
);
const
runScriptContent
:
string
=
CONTAINER_INSTALL_NNI_SHELL_FORMAT
;
// Write NNI installation file to local tmp files
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
install_nni.sh
'
),
runScriptContent
,
{
encoding
:
'
utf8
'
});
// Create tmp trial working folder locally.
await
cpp
.
exec
(
`mkdir -p
${
trialLocalTempFolder
}
`
);
// Write worker file content run_worker.sh to local tmp folders
if
(
kubeflowTrialConfig
.
worker
)
{
//create tmp trial working folder locally.
await
cpp
.
exec
(
`mkdir -p
${
path
.
dirname
(
trialLocalTempFolder
)}
`
);
await
cpp
.
exec
(
`cp -r
${
kubeflowTrialConfig
.
codeDir
}
${
trialLocalTempFolder
}
`
);
const
runScriptContent
:
string
=
CONTAINER_INSTALL_NNI_SHELL_FORMAT
;
// Write NNI installation file to local tmp files
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
install_nni.sh
'
),
runScriptContent
,
{
encoding
:
'
utf8
'
});
// Create tmp trial working folder locally.
await
cpp
.
exec
(
`mkdir -p
${
trialLocalTempFolder
}
`
);
// Write worker file content run_worker.sh to local tmp folders
if
(
kubeflowTrialConfig
.
worker
!==
undefined
)
{
const
workerRunScriptContent
:
string
=
await
this
.
generateRunScript
(
'
kubeflow
'
,
trialJobId
,
trialWorkingFolder
,
kubeflowTrialConfig
.
worker
.
command
,
curTrialSequenceId
.
toString
(),
'
worker
'
,
kubeflowTrialConfig
.
worker
.
gpuNum
);
kubeflowTrialConfig
.
worker
.
command
,
curTrialSequenceId
.
toString
(),
'
worker
'
,
kubeflowTrialConfig
.
worker
.
gpuNum
);
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
run_worker.sh
'
),
workerRunScriptContent
,
{
encoding
:
'
utf8
'
});
}
// Write parameter server file content run_ps.sh to local tmp folders
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
le
t
tensorflowTrialConfig
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
if
(
tensorflowTrialConfig
.
ps
)
{
}
// Write parameter server file content run_ps.sh to local tmp folders
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
cons
t
tensorflowTrialConfig
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
if
(
tensorflowTrialConfig
.
ps
!==
undefined
)
{
const
psRunScriptContent
:
string
=
await
this
.
generateRunScript
(
'
kubeflow
'
,
trialJobId
,
trialWorkingFolder
,
tensorflowTrialConfig
.
ps
.
command
,
curTrialSequenceId
.
toString
(),
'
ps
'
,
tensorflowTrialConfig
.
ps
.
gpuNum
);
tensorflowTrialConfig
.
ps
.
command
,
curTrialSequenceId
.
toString
(),
'
ps
'
,
tensorflowTrialConfig
.
ps
.
gpuNum
);
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
run_ps.sh
'
),
psRunScriptContent
,
{
encoding
:
'
utf8
'
});
}
}
else
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
pytorch-operator
'
)
{
let
pytorchTrialConfig
:
KubeflowTrialConfigPytorch
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
if
(
pytorchTrialConfig
.
master
){
}
else
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
pytorch-operator
'
)
{
const
pytorchTrialConfig
:
KubeflowTrialConfigPytorch
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
if
(
pytorchTrialConfig
.
master
!==
undefined
)
{
const
masterRunScriptContent
:
string
=
await
this
.
generateRunScript
(
'
kubeflow
'
,
trialJobId
,
trialWorkingFolder
,
pytorchTrialConfig
.
master
.
command
,
curTrialSequenceId
.
toString
(),
'
master
'
,
pytorchTrialConfig
.
master
.
gpuNum
);
pytorchTrialConfig
.
master
.
command
,
curTrialSequenceId
.
toString
(),
'
master
'
,
pytorchTrialConfig
.
master
.
gpuNum
);
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
run_master.sh
'
),
masterRunScriptContent
,
{
encoding
:
'
utf8
'
});
}
}
// Write file content ( parameter.cfg ) to local tmp folders
const
trialForm
:
TrialJobApplicationForm
=
(
<
TrialJobApplicationForm
>
form
)
if
(
trialForm
&&
trialForm
.
hyperParameters
)
{
}
// Write file content ( parameter.cfg ) to local tmp folders
const
trialForm
:
TrialJobApplicationForm
=
(
<
TrialJobApplicationForm
>
form
)
;
if
(
trialForm
!==
undefined
&&
trialForm
.
hyperParameters
!==
undefined
)
{
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
generateParamFileName
(
trialForm
.
hyperParameters
)),
trialForm
.
hyperParameters
.
value
,
{
encoding
:
'
utf8
'
});
}
trialForm
.
hyperParameters
.
value
,
{
encoding
:
'
utf8
'
});
}
}
private
async
prepareKubeflowConfig
(
trialJobId
:
string
,
trialWorkingFolder
:
string
,
kubeflowJobName
:
string
):
Promise
<
any
>
{
if
(
!
this
.
kubeflowClusterConfig
)
{
if
(
this
.
kubeflowClusterConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
}
if
(
!
this
.
kubeflowTrialConfig
)
{
if
(
this
.
kubeflowTrialConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow trial config is not initialized
'
);
}
// initialize kubeflow trial config to specific type
let
kubeflowTrialConfig
;
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
let
kubeflowTrialConfig
:
any
;
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
kubeflowTrialConfig
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
}
else
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
pytorch-operator
'
){
}
else
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
pytorch-operator
'
)
{
kubeflowTrialConfig
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
}
else
{
throw
Error
(
`operator
${
this
.
kubeflowClusterConfig
.
operator
}
is invalid`
)
}
else
{
throw
Error
(
`operator
${
this
.
kubeflowClusterConfig
.
operator
}
is invalid`
)
;
}
const
workerPodResources
:
any
=
{};
if
(
kubeflowTrialConfig
.
worker
)
{
if
(
kubeflowTrialConfig
.
worker
!==
undefined
)
{
workerPodResources
.
requests
=
this
.
generatePodResource
(
kubeflowTrialConfig
.
worker
.
memoryMB
,
kubeflowTrialConfig
.
worker
.
cpuNum
,
kubeflowTrialConfig
.
worker
.
gpuNum
)
kubeflowTrialConfig
.
worker
.
gpuNum
)
;
}
workerPodResources
.
limits
=
Object
.
assign
({},
workerPodResources
.
requests
)
;
workerPodResources
.
limits
=
{...
workerPodResources
.
requests
}
;
le
t
nonWorkerResources
:
any
=
{};
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
le
t
tensorflowTrialConfig
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
if
(
tensorflowTrialConfig
.
ps
)
{
cons
t
nonWorkerResources
:
any
=
{};
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
cons
t
tensorflowTrialConfig
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
if
(
tensorflowTrialConfig
.
ps
!==
undefined
)
{
nonWorkerResources
.
requests
=
this
.
generatePodResource
(
tensorflowTrialConfig
.
ps
.
memoryMB
,
tensorflowTrialConfig
.
ps
.
cpuNum
,
tensorflowTrialConfig
.
ps
.
gpuNum
)
nonWorkerResources
.
limits
=
Object
.
assign
({},
nonWorkerResources
.
requests
)
;
tensorflowTrialConfig
.
ps
.
gpuNum
)
;
nonWorkerResources
.
limits
=
{...
nonWorkerResources
.
requests
}
;
}
}
else
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
pytorch-operator
'
){
le
t
pyTorchTrialConfig
:
KubeflowTrialConfigPytorch
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
}
else
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
pytorch-operator
'
)
{
cons
t
pyTorchTrialConfig
:
KubeflowTrialConfigPytorch
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
nonWorkerResources
.
requests
=
this
.
generatePodResource
(
pyTorchTrialConfig
.
master
.
memoryMB
,
pyTorchTrialConfig
.
master
.
cpuNum
,
pyTorchTrialConfig
.
master
.
gpuNum
)
nonWorkerResources
.
limits
=
Object
.
assign
({},
nonWorkerResources
.
requests
);
pyTorchTrialConfig
.
master
.
gpuNum
);
nonWorkerResources
.
limits
=
{...
nonWorkerResources
.
requests
};
}
// Generate kubeflow job resource config object
const
kubeflowJobConfig
:
any
=
this
.
generateKubeflowJobConfig
(
trialJobId
,
trialWorkingFolder
,
kubeflowJobName
,
workerPodResources
,
nonWorkerResources
);
const
kubeflowJobConfig
:
any
=
this
.
generateKubeflowJobConfig
(
trialJobId
,
trialWorkingFolder
,
kubeflowJobName
,
workerPodResources
,
nonWorkerResources
);
return
Promise
.
resolve
(
kubeflowJobConfig
);
}
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
switch
(
key
)
{
case
TrialConfigMetadataKey
.
NNI_MANAGER_IP
:
this
.
nniManagerIpConfig
=
<
NNIManagerIpConfig
>
JSON
.
parse
(
value
);
break
;
case
TrialConfigMetadataKey
.
KUBEFLOW_CLUSTER_CONFIG
:
let
kubeflowClusterJsonObject
=
JSON
.
parse
(
value
);
this
.
kubeflowClusterConfig
=
KubeflowClusterConfigFactory
.
generateKubeflowClusterConfig
(
kubeflowClusterJsonObject
);
if
(
this
.
kubeflowClusterConfig
.
storageType
===
'
azureStorage
'
)
{
let
azureKubeflowClusterConfig
=
<
KubeflowClusterConfigAzure
>
this
.
kubeflowClusterConfig
;
this
.
azureStorageAccountName
=
azureKubeflowClusterConfig
.
azureStorage
.
accountName
;
this
.
azureStorageShare
=
azureKubeflowClusterConfig
.
azureStorage
.
azureShare
;
await
this
.
createAzureStorage
(
azureKubeflowClusterConfig
.
keyVault
.
vaultName
,
azureKubeflowClusterConfig
.
keyVault
.
name
,
azureKubeflowClusterConfig
.
azureStorage
.
accountName
,
azureKubeflowClusterConfig
.
azureStorage
.
azureShare
);
}
else
if
(
this
.
kubeflowClusterConfig
.
storageType
===
'
nfs
'
)
{
let
nfsKubeflowClusterConfig
=
<
KubeflowClusterConfigNFS
>
this
.
kubeflowClusterConfig
;
await
this
.
createNFSStorage
(
nfsKubeflowClusterConfig
.
nfs
.
server
,
nfsKubeflowClusterConfig
.
nfs
.
path
);
}
this
.
kubernetesCRDClient
=
KubeflowOperatorClient
.
generateOperatorClient
(
this
.
kubeflowClusterConfig
.
operator
,
this
.
kubeflowClusterConfig
.
apiVersion
);
break
;
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
if
(
!
this
.
kubeflowClusterConfig
){
this
.
log
.
error
(
'
kubeflow cluster config is not initialized
'
);
return
Promise
.
reject
(
new
Error
(
'
kubeflow cluster config is not initialized
'
));
}
assert
(
this
.
kubeflowClusterConfig
!==
undefined
)
let
kubeflowTrialJsonObjsect
=
JSON
.
parse
(
value
);
this
.
kubeflowTrialConfig
=
KubeflowTrialConfigFactory
.
generateKubeflowTrialConfig
(
kubeflowTrialJsonObjsect
,
this
.
kubeflowClusterConfig
.
operator
);
// Validate to make sure codeDir doesn't have too many files
try
{
await
validateCodeDir
(
this
.
kubeflowTrialConfig
.
codeDir
);
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
return
Promise
.
reject
(
new
Error
(
error
));
}
break
;
case
TrialConfigMetadataKey
.
VERSION_CHECK
:
this
.
versionCheck
=
(
value
===
'
true
'
||
value
===
'
True
'
);
break
;
case
TrialConfigMetadataKey
.
LOG_COLLECTION
:
this
.
logCollection
=
value
;
break
;
default
:
break
;
}
return
Promise
.
resolve
();
}
/**
* Generate kubeflow resource config file
* @param trialJobId trial job id
...
...
@@ -343,43 +355,42 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
* @param workerPodResources worker pod template
* @param nonWorkerPodResources non-worker pod template, like ps or master
*/
private
generateKubeflowJobConfig
(
trialJobId
:
string
,
trialWorkingFolder
:
string
,
kubeflowJobName
:
string
,
workerPodResources
:
any
,
nonWorkerPodResources
?:
any
)
:
any
{
if
(
!
this
.
kubeflowClusterConfig
)
{
private
generateKubeflowJobConfig
(
trialJobId
:
string
,
trialWorkingFolder
:
string
,
kubeflowJobName
:
string
,
workerPodResources
:
any
,
nonWorkerPodResources
?:
any
)
:
any
{
if
(
this
.
kubeflowClusterConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
}
if
(
!
this
.
kubeflowTrialConfig
)
{
if
(
this
.
kubeflowTrialConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow trial config is not initialized
'
);
}
if
(
!
this
.
kubernetesCRDClient
)
{
if
(
this
.
kubernetesCRDClient
===
undefined
)
{
throw
new
Error
(
'
Kubeflow operator client is not initialized
'
);
}
const
replicaSpecsObj
:
any
=
{};
le
t
replicaSpecsObjMap
=
new
Map
<
string
,
object
>
();
cons
t
replicaSpecsObjMap
:
Map
<
string
,
object
>
=
new
Map
<
string
,
object
>
();
if
(
this
.
kubeflowTrialConfig
.
operatorType
===
'
tf-operator
'
)
{
le
t
tensorflowTrialConfig
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
if
(
this
.
kubeflowTrialConfig
.
operatorType
===
'
tf-operator
'
)
{
cons
t
tensorflowTrialConfig
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
replicaSpecsObj
.
Worker
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
tensorflowTrialConfig
.
worker
.
replicas
,
tensorflowTrialConfig
.
worker
.
image
,
'
run_worker.sh
'
,
workerPodResources
);
if
(
tensorflowTrialConfig
.
ps
){
tensorflowTrialConfig
.
worker
.
image
,
'
run_worker.sh
'
,
workerPodResources
);
if
(
tensorflowTrialConfig
.
ps
!==
undefined
)
{
replicaSpecsObj
.
Ps
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
tensorflowTrialConfig
.
ps
.
replicas
,
tensorflowTrialConfig
.
ps
.
image
,
'
run_ps.sh
'
,
nonWorkerPodResources
);
tensorflowTrialConfig
.
ps
.
image
,
'
run_ps.sh
'
,
nonWorkerPodResources
);
}
replicaSpecsObjMap
.
set
(
this
.
kubernetesCRDClient
.
jobKind
,
{
'
tfReplicaSpecs
'
:
replicaSpecsObj
})
}
else
if
(
this
.
kubeflowTrialConfig
.
operatorType
===
'
pytorch-operator
'
)
{
let
pytorchTrialConfig
:
KubeflowTrialConfigPytorch
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
if
(
pytorchTrialConfig
.
worker
)
{
replicaSpecsObjMap
.
set
(
this
.
kubernetesCRDClient
.
jobKind
,
{
tfReplicaSpecs
:
replicaSpecsObj
});
}
else
if
(
this
.
kubeflowTrialConfig
.
operatorType
===
'
pytorch-operator
'
)
{
const
pytorchTrialConfig
:
KubeflowTrialConfigPytorch
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
if
(
pytorchTrialConfig
.
worker
!==
undefined
)
{
replicaSpecsObj
.
Worker
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
pytorchTrialConfig
.
worker
.
replicas
,
pytorchTrialConfig
.
worker
.
image
,
'
run_worker.sh
'
,
workerPodResources
);
pytorchTrialConfig
.
worker
.
image
,
'
run_worker.sh
'
,
workerPodResources
);
}
replicaSpecsObj
.
Master
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
pytorchTrialConfig
.
master
.
replicas
,
pytorchTrialConfig
.
master
.
image
,
'
run_master.sh
'
,
nonWorkerPodResources
);
pytorchTrialConfig
.
master
.
image
,
'
run_master.sh
'
,
nonWorkerPodResources
);
replicaSpecsObjMap
.
set
(
this
.
kubernetesCRDClient
.
jobKind
,
{
'
pytorchReplicaSpecs
'
:
replicaSpecsObj
})
replicaSpecsObjMap
.
set
(
this
.
kubernetesCRDClient
.
jobKind
,
{
pytorchReplicaSpecs
:
replicaSpecsObj
})
;
}
return
{
...
...
@@ -406,21 +417,22 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
* @param runScriptFile script file name
* @param podResources pod resource config section
*/
private
generateReplicaConfig
(
trialWorkingFolder
:
string
,
replicaNumber
:
number
,
replicaImage
:
string
,
runScriptFile
:
string
,
podResources
:
any
):
any
{
if
(
!
this
.
kubeflowClusterConfig
)
{
private
generateReplicaConfig
(
trialWorkingFolder
:
string
,
replicaNumber
:
number
,
replicaImage
:
string
,
runScriptFile
:
string
,
podResources
:
any
):
any
{
if
(
this
.
kubeflowClusterConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
}
if
(
!
this
.
kubeflowTrialConfig
)
{
if
(
this
.
kubeflowTrialConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow trial config is not initialized
'
);
}
if
(
!
this
.
kubernetesCRDClient
)
{
if
(
this
.
kubernetesCRDClient
===
undefined
)
{
throw
new
Error
(
'
Kubeflow operator client is not initialized
'
);
}
le
t
volumeSpecMap
=
new
Map
<
string
,
object
>
();
if
(
this
.
kubeflowClusterConfig
.
storageType
===
'
azureStorage
'
){
cons
t
volumeSpecMap
:
Map
<
string
,
object
>
=
new
Map
<
string
,
object
>
();
if
(
this
.
kubeflowClusterConfig
.
storageType
===
'
azureStorage
'
)
{
volumeSpecMap
.
set
(
'
nniVolumes
'
,
[
{
name
:
'
nni-vol
'
,
...
...
@@ -429,9 +441,9 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
shareName
:
`
${
this
.
azureStorageShare
}
`
,
readonly
:
false
}
}])
}
else
{
le
t
nfsKubeflowClusterConfig
:
KubeflowClusterConfigNFS
=
<
KubeflowClusterConfigNFS
>
this
.
kubeflowClusterConfig
;
}])
;
}
else
{
cons
t
nfsKubeflowClusterConfig
:
KubeflowClusterConfigNFS
=
<
KubeflowClusterConfigNFS
>
this
.
kubeflowClusterConfig
;
volumeSpecMap
.
set
(
'
nniVolumes
'
,
[
{
name
:
'
nni-vol
'
,
...
...
@@ -439,13 +451,14 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
server
:
`
${
nfsKubeflowClusterConfig
.
nfs
.
server
}
`
,
path
:
`
${
nfsKubeflowClusterConfig
.
nfs
.
path
}
`
}
}])
}])
;
}
return
{
replicas
:
replicaNumber
,
template
:
{
metadata
:
{
// tslint:disable-next-line:no-null-keyword
creationTimestamp
:
null
},
spec
:
{
...
...
@@ -455,7 +468,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
// TODO: change the name based on operator's type
name
:
this
.
kubernetesCRDClient
.
containerName
,
image
:
replicaImage
,
args
:
[
"
sh
"
,
`
${
path
.
join
(
trialWorkingFolder
,
runScriptFile
)}
`
],
args
:
[
'
sh
'
,
`
${
path
.
join
(
trialWorkingFolder
,
runScriptFile
)}
`
],
volumeMounts
:
[
{
name
:
'
nni-vol
'
,
...
...
@@ -470,5 +483,5 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
};
}
}
export
{
KubeflowTrainingService
}
// tslint:enable: no-unsafe-any no-any
export
{
KubeflowTrainingService
}
;
src/nni_manager/training_service/kubernetes/kubernetesApiClient.ts
View file @
22993e5d
...
...
@@ -19,44 +19,46 @@
'
use strict
'
;
import
*
as
os
from
'
os
'
import
*
as
path
from
'
path
'
;
import
{
Client1_10
,
config
}
from
'
kubernetes-client
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
var
K8SClient
=
require
(
'
kubernetes-client
'
).
Client
;
var
K8SConfig
=
require
(
'
kubernetes-client
'
).
config
;
/**
* Generic Kubernetes client, target version >= 1.9
*/
// tslint:disable: no-any no-unsafe-any
class
GeneralK8sClient
{
protected
readonly
client
:
any
;
protected
readonly
log
:
Logger
=
getLogger
();
constructor
()
{
this
.
client
=
new
K8S
Client
({
config
:
K8SC
onfig
.
fromKubeconfig
(),
version
:
'
1.9
'
});
this
.
client
=
new
Client
1_10
({
config
:
c
onfig
.
fromKubeconfig
(),
version
:
'
1.9
'
});
this
.
client
.
loadSpec
();
}
public
async
createSecret
(
secretManifest
:
any
):
Promise
<
boolean
>
{
let
result
:
Promise
<
boolean
>
;
const
response
:
any
=
await
this
.
client
.
api
.
v1
.
namespaces
(
'
default
'
).
secrets
.
post
({
body
:
secretManifest
});
if
(
response
.
statusCode
&&
(
response
.
statusCode
>=
200
&&
response
.
statusCode
<=
299
))
{
const
response
:
any
=
await
this
.
client
.
api
.
v1
.
namespaces
(
'
default
'
).
secrets
.
post
({
body
:
secretManifest
});
if
(
response
.
statusCode
&&
(
response
.
statusCode
>=
200
&&
response
.
statusCode
<=
299
))
{
result
=
Promise
.
resolve
(
true
);
}
else
{
result
=
Promise
.
reject
(
`Create secrets failed, statusCode is
${
response
.
statusCode
}
`
);
}
return
result
;
}
}
/**
* Kubernetes CRD client
*/
abstract
class
KubernetesCRDClient
{
protected
readonly
client
:
any
;
protected
readonly
log
:
Logger
=
getLogger
();
protected
crdSchema
:
any
;
constructor
()
{
this
.
client
=
new
K8S
Client
({
config
:
K8SC
onfig
.
fromKubeconfig
()
});
this
.
client
=
new
Client
1_10
({
config
:
c
onfig
.
fromKubeconfig
()
});
this
.
client
.
loadSpec
();
}
...
...
@@ -65,7 +67,7 @@ abstract class KubernetesCRDClient {
public
abstract
get
containerName
():
string
;
public
get
jobKind
():
string
{
if
(
this
.
crdSchema
if
(
this
.
crdSchema
&&
this
.
crdSchema
.
spec
&&
this
.
crdSchema
.
spec
.
names
&&
this
.
crdSchema
.
spec
.
names
.
kind
)
{
...
...
@@ -76,7 +78,7 @@ abstract class KubernetesCRDClient {
}
public
get
apiVersion
():
string
{
if
(
this
.
crdSchema
if
(
this
.
crdSchema
&&
this
.
crdSchema
.
spec
&&
this
.
crdSchema
.
spec
.
version
)
{
return
this
.
crdSchema
.
spec
.
version
;
...
...
@@ -88,43 +90,50 @@ abstract class KubernetesCRDClient {
public
async
createKubernetesJob
(
jobManifest
:
any
):
Promise
<
boolean
>
{
let
result
:
Promise
<
boolean
>
;
const
response
:
any
=
await
this
.
operator
.
post
({
body
:
jobManifest
});
if
(
response
.
statusCode
&&
(
response
.
statusCode
>=
200
&&
response
.
statusCode
<=
299
))
{
if
(
response
.
statusCode
&&
(
response
.
statusCode
>=
200
&&
response
.
statusCode
<=
299
))
{
result
=
Promise
.
resolve
(
true
);
}
else
{
result
=
Promise
.
reject
(
`Create kubernetes job failed, statusCode is
${
response
.
statusCode
}
`
);
}
return
result
;
}
//TODO : replace any
public
async
getKubernetesJob
(
kubeflowJobName
:
string
):
Promise
<
any
>
{
let
result
:
Promise
<
any
>
;
const
response
:
any
=
await
this
.
operator
(
kubeflowJobName
).
get
();
if
(
response
.
statusCode
&&
(
response
.
statusCode
>=
200
&&
response
.
statusCode
<=
299
))
{
const
response
:
any
=
await
this
.
operator
(
kubeflowJobName
)
.
get
();
if
(
response
.
statusCode
&&
(
response
.
statusCode
>=
200
&&
response
.
statusCode
<=
299
))
{
result
=
Promise
.
resolve
(
response
.
body
);
}
else
{
result
=
Promise
.
reject
(
`KubeflowOperatorClient get tfjobs failed, statusCode is
${
response
.
statusCode
}
`
);
}
return
result
;
}
public
async
deleteKubernetesJob
(
labels
:
Map
<
string
,
string
>
):
Promise
<
boolean
>
{
let
result
:
Promise
<
boolean
>
;
// construct match query from labels for deleting tfjob
const
matchQuery
:
string
=
Array
.
from
(
labels
.
keys
()).
map
(
labelKey
=>
`
${
labelKey
}
=
${
labels
.
get
(
labelKey
)}
`
).
join
(
'
,
'
);
const
matchQuery
:
string
=
Array
.
from
(
labels
.
keys
())
.
map
((
labelKey
:
string
)
=>
`
${
labelKey
}
=
${
labels
.
get
(
labelKey
)}
`
)
.
join
(
'
,
'
);
try
{
const
deleteResult
:
any
=
await
this
.
operator
().
delete
({
const
deleteResult
:
any
=
await
this
.
operator
()
.
delete
({
qs
:
{
labelSelector
:
matchQuery
,
propagationPolicy
:
"
Background
"
propagationPolicy
:
'
Background
'
}
});
if
(
deleteResult
.
statusCode
&&
deleteResult
.
statusCode
>=
200
&&
deleteResult
.
statusCode
<=
299
)
{
if
(
deleteResult
.
statusCode
&&
deleteResult
.
statusCode
>=
200
&&
deleteResult
.
statusCode
<=
299
)
{
result
=
Promise
.
resolve
(
true
);
}
else
{
result
=
Promise
.
reject
(
`KubeflowOperatorClient, delete labels
${
matchQuery
}
get wrong statusCode
${
deleteResult
.
statusCode
}
`
);
result
=
Promise
.
reject
(
`KubeflowOperatorClient, delete labels
${
matchQuery
}
get wrong statusCode
${
deleteResult
.
statusCode
}
`
);
}
}
catch
(
err
)
{
}
catch
(
err
)
{
result
=
Promise
.
reject
(
err
);
}
...
...
src/nni_manager/training_service/kubernetes/kubernetesConfig.ts
View file @
22993e5d
...
...
@@ -22,6 +22,7 @@
export
type
KubernetesStorageKind
=
'
nfs
'
|
'
azureStorage
'
;
import
{
MethodNotImplementedError
}
from
'
../../common/errors
'
;
// tslint:disable: completed-docs function-name
export
abstract
class
KubernetesClusterConfig
{
public
readonly
storage
?:
KubernetesStorageKind
;
public
readonly
apiVersion
:
string
;
...
...
@@ -31,7 +32,7 @@ export abstract class KubernetesClusterConfig {
this
.
apiVersion
=
apiVersion
;
}
public
get
storageType
():
KubernetesStorageKind
{
public
get
storageType
():
KubernetesStorageKind
{
throw
new
MethodNotImplementedError
();
}
}
...
...
@@ -56,12 +57,13 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig {
this
.
nfs
=
nfs
;
}
public
get
storageType
():
KubernetesStorageKind
{
public
get
storageType
():
KubernetesStorageKind
{
return
'
nfs
'
;
}
public
static
getInstance
(
jsonObject
:
object
):
KubernetesClusterConfigNFS
{
let
kubernetesClusterConfigObjectNFS
=
<
KubernetesClusterConfigNFS
>
jsonObject
;
const
kubernetesClusterConfigObjectNFS
:
KubernetesClusterConfigNFS
=
<
KubernetesClusterConfigNFS
>
jsonObject
;
return
new
KubernetesClusterConfigNFS
(
kubernetesClusterConfigObjectNFS
.
apiVersion
,
kubernetesClusterConfigObjectNFS
.
nfs
,
...
...
@@ -71,12 +73,12 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig {
}
export
class
KubernetesClusterConfigAzure
extends
KubernetesClusterConfig
{
public
readonly
keyVault
:
k
eyVaultConfig
;
public
readonly
keyVault
:
K
eyVaultConfig
;
public
readonly
azureStorage
:
AzureStorage
;
constructor
(
apiVersion
:
string
,
keyVault
:
k
eyVaultConfig
,
keyVault
:
K
eyVaultConfig
,
azureStorage
:
AzureStorage
,
storage
?:
KubernetesStorageKind
)
{
...
...
@@ -85,12 +87,13 @@ export class KubernetesClusterConfigAzure extends KubernetesClusterConfig {
this
.
azureStorage
=
azureStorage
;
}
public
get
storageType
():
KubernetesStorageKind
{
public
get
storageType
():
KubernetesStorageKind
{
return
'
azureStorage
'
;
}
public
static
getInstance
(
jsonObject
:
object
):
KubernetesClusterConfigAzure
{
let
kubernetesClusterConfigObjectAzure
=
<
KubernetesClusterConfigAzure
>
jsonObject
;
const
kubernetesClusterConfigObjectAzure
:
KubernetesClusterConfigAzure
=
<
KubernetesClusterConfigAzure
>
jsonObject
;
return
new
KubernetesClusterConfigAzure
(
kubernetesClusterConfigObjectAzure
.
apiVersion
,
kubernetesClusterConfigObjectAzure
.
keyVault
,
...
...
@@ -100,17 +103,20 @@ export class KubernetesClusterConfigAzure extends KubernetesClusterConfig {
}
}
// tslint:disable-next-line:no-unnecessary-class
export
class
KubernetesClusterConfigFactory
{
public
static
generateKubernetesClusterConfig
(
jsonObject
:
object
):
KubernetesClusterConfig
{
let
s
torageConfig
=
<
StorageConfig
>
jsonObject
;
switch
(
storageConfig
.
storage
)
{
const
storageConfig
:
S
torageConfig
=
<
StorageConfig
>
jsonObject
;
switch
(
storageConfig
.
storage
)
{
case
'
azureStorage
'
:
return
KubernetesClusterConfigAzure
.
getInstance
(
jsonObject
);
case
'
nfs
'
||
undefined
:
case
'
nfs
'
:
case
undefined
:
return
KubernetesClusterConfigNFS
.
getInstance
(
jsonObject
);
default
:
throw
new
Error
(
`Invalid json object
${
jsonObject
}
`
);
}
throw
new
Error
(
`Invalid json object
${
jsonObject
}
`
);
}
}
...
...
@@ -118,9 +124,9 @@ export class KubernetesClusterConfigFactory {
* NFS configuration to store Kubeflow job related files
*/
export
class
NFSConfig
{
/
**
IP Address of NFS server
*/
/
/
IP Address of NFS server
public
readonly
server
:
string
;
/
**
exported NFS path on NFS server
*/
/
/
exported NFS path on NFS server
public
readonly
path
:
string
;
constructor
(
server
:
string
,
path
:
string
)
{
...
...
@@ -133,13 +139,13 @@ export class NFSConfig {
* KeyVault configuration to store the key of Azure Storage Service
* Refer https://docs.microsoft.com/en-us/azure/key-vault/key-vault-manage-with-cli2
*/
export
class
k
eyVaultConfig
{
/
**
The vault-name to specify vault
*/
export
class
K
eyVaultConfig
{
/
/
The vault-name to specify vault
public
readonly
vaultName
:
string
;
/
**
The name to specify private key
*/
/
/
The name to specify private key
public
readonly
name
:
string
;
constructor
(
vaultName
:
string
,
name
:
string
){
constructor
(
vaultName
:
string
,
name
:
string
)
{
this
.
vaultName
=
vaultName
;
this
.
name
=
name
;
}
...
...
@@ -149,12 +155,12 @@ export class keyVaultConfig {
* Azure Storage Service
*/
export
class
AzureStorage
{
/
**
The azure share to storage files
*/
/
/
The azure share to storage files
public
readonly
azureShare
:
string
;
/
**
The account name of storage service
*/
/
/
The account name of storage service
public
readonly
accountName
:
string
;
constructor
(
azureShare
:
string
,
accountName
:
string
){
constructor
(
azureShare
:
string
,
accountName
:
string
)
{
this
.
azureShare
=
azureShare
;
this
.
accountName
=
accountName
;
}
...
...
@@ -164,23 +170,23 @@ export class AzureStorage {
* Trial job configuration for Kubernetes
*/
export
class
KubernetesTrialConfigTemplate
{
/
**
CPU number
*/
/
/
CPU number
public
readonly
cpuNum
:
number
;
/
**
Memory
*/
/
/
Memory
public
readonly
memoryMB
:
number
;
/
**
Docker image
*/
/
/
Docker image
public
readonly
image
:
string
;
/
**
Trial command
*/
/
/
Trial command
public
readonly
command
:
string
;
/
**
Required GPU number for trial job. The number should be in [0,100]
*/
/
/
Required GPU number for trial job. The number should be in [0,100]
public
readonly
gpuNum
:
number
;
constructor
(
command
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
)
{
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
)
{
this
.
command
=
command
;
this
.
gpuNum
=
gpuNum
;
this
.
cpuNum
=
cpuNum
;
...
...
@@ -195,4 +201,4 @@ export class KubernetesTrialConfig {
constructor
(
codeDir
:
string
)
{
this
.
codeDir
=
codeDir
;
}
}
\ No newline at end of file
}
src/nni_manager/training_service/kubernetes/kubernetesData.ts
View file @
22993e5d
...
...
@@ -24,7 +24,6 @@ import { JobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../commo
/**
* KubeflowTrialJobDetail
*/
// tslint:disable-next-line:max-classes-per-file
export
class
KubernetesTrialJobDetail
implements
TrialJobDetail
{
public
id
:
string
;
public
status
:
TrialJobStatus
;
...
...
@@ -55,7 +54,7 @@ export class KubernetesTrialJobDetail implements TrialJobDetail {
}
}
export
const
K
ubernetesScriptFormat
=
export
const
k
ubernetesScriptFormat
:
string
=
`#!/bin/bash
export NNI_PLATFORM={0}
export NNI_SYS_DIR=$PWD/nni/{1}
...
...
@@ -71,5 +70,5 @@ mkdir -p $NNI_OUTPUT_DIR
cp -rT $NNI_CODE_DIR $NNI_SYS_DIR
cd $NNI_SYS_DIR
sh install_nni.sh
python3 -m nni_trial_tool.trial_keeper --trial_command '{8}' --nnimanager_ip {9} --nnimanager_port {10}
--nni_manager_version '{11}' --log_collection '{12}'`
+
`
1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr`
python3 -m nni_trial_tool.trial_keeper --trial_command '{8}' --nnimanager_ip {9} --nnimanager_port {10}
\
--nni_manager_version '{11}' --log_collection '{12}'
1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr`
;
src/nni_manager/training_service/kubernetes/kubernetesJobInfoCollector.ts
View file @
22993e5d
...
...
@@ -20,11 +20,10 @@
'
use strict
'
;
import
*
as
assert
from
'
assert
'
;
import
{
MethodNotImplementedError
,
NNIError
,
NNIErrorNames
}
from
'
../../common/errors
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
NNIError
,
NNIErrorNames
}
from
'
../../common/errors
'
;
import
{
TrialJobStatus
}
from
'
../../common/trainingService
'
;
import
{
KubernetesCRDClient
}
from
'
./kubernetesApiClient
'
;
import
{
MethodNotImplementedError
}
from
'
../../common/errors
'
;
import
{
KubernetesTrialJobDetail
}
from
'
./kubernetesData
'
;
/**
...
...
@@ -43,22 +42,22 @@ export class KubernetesJobInfoCollector {
public
async
retrieveTrialStatus
(
kubernetesCRDClient
:
KubernetesCRDClient
|
undefined
)
:
Promise
<
void
>
{
assert
(
kubernetesCRDClient
!==
undefined
);
const
updateKubernetesTrialJobs
:
Promise
<
void
>
[]
=
[];
for
(
le
t
[
trialJobId
,
kubernetesTrialJob
]
of
this
.
trialJobsMap
)
{
if
(
!
kubernetesTrialJob
)
{
for
(
cons
t
[
trialJobId
,
kubernetesTrialJob
]
of
this
.
trialJobsMap
)
{
if
(
kubernetesTrialJob
===
undefined
)
{
throw
new
NNIError
(
NNIErrorNames
.
NOT_FOUND
,
`trial job id
${
trialJobId
}
not found`
);
}
// Since Kubeflow needs some delay to schedule jobs, we provide 20 seconds buffer time to check kubeflow job's status
if
(
Date
.
now
()
-
kubernetesTrialJob
.
submitTime
<
20
*
1000
)
{
if
(
Date
.
now
()
-
kubernetesTrialJob
.
submitTime
<
20
*
1000
)
{
return
Promise
.
resolve
();
}
updateKubernetesTrialJobs
.
push
(
this
.
retrieveSingleTrialJobInfo
(
kubernetesCRDClient
,
kubernetesTrialJob
))
updateKubernetesTrialJobs
.
push
(
this
.
retrieveSingleTrialJobInfo
(
kubernetesCRDClient
,
kubernetesTrialJob
))
;
}
await
Promise
.
all
(
updateKubernetesTrialJobs
);
}
protected
async
retrieveSingleTrialJobInfo
(
kubernetesCRDClient
:
KubernetesCRDClient
|
undefined
,
kubernetesTrialJob
:
KubernetesTrialJobDetail
)
:
Promise
<
void
>
{
kubernetesTrialJob
:
KubernetesTrialJobDetail
)
:
Promise
<
void
>
{
throw
new
MethodNotImplementedError
();
}
}
\ No newline at end of file
}
src/nni_manager/training_service/kubernetes/kubernetesJobRestServer.ts
View file @
22993e5d
...
...
@@ -19,19 +19,19 @@
'
use strict
'
;
import
*
as
component
from
'
../../common/component
'
;
import
{
Inject
}
from
'
typescript-ioc
'
;
import
*
as
component
from
'
../../common/component
'
;
import
{
ClusterJobRestServer
}
from
'
../common/clusterJobRestServer
'
;
import
{
KubernetesTrainingService
}
from
'
./kubernetesTrainingService
'
;
import
{
ClusterJobRestServer
}
from
'
../common/clusterJobRestServer
'
/**
* Kubeflow Training service Rest server, provides rest API to support kubeflow job metrics update
*
*/
@
component
.
Singleton
export
class
KubernetesJobRestServer
extends
ClusterJobRestServer
{
export
class
KubernetesJobRestServer
extends
ClusterJobRestServer
{
@
Inject
private
kubernetesTrainingService
?
:
KubernetesTrainingService
;
private
readonly
kubernetesTrainingService
?
:
KubernetesTrainingService
;
/**
* constructor to provide NNIRestServer's own rest property, e.g. port
...
...
@@ -41,8 +41,9 @@ export class KubernetesJobRestServer extends ClusterJobRestServer{
this
.
kubernetesTrainingService
=
kubernetesTrainingService
;
}
// tslint:disable-next-line:no-any
protected
handleTrialMetrics
(
jobId
:
string
,
metrics
:
any
[])
:
void
{
if
(
!
this
.
kubernetesTrainingService
)
{
if
(
this
.
kubernetesTrainingService
===
undefined
)
{
throw
Error
(
'
kubernetesTrainingService not initialized!
'
);
}
// Split metrics array into single metric, then emit
...
...
@@ -54,4 +55,4 @@ export class KubernetesJobRestServer extends ClusterJobRestServer{
});
}
}
}
\ No newline at end of file
}
src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts
View file @
22993e5d
...
...
@@ -17,35 +17,36 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'
use strict
'
'
use strict
'
;
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
path
from
'
path
'
;
import
*
as
azureStorage
from
'
azure-storage
'
;
import
{
EventEmitter
}
from
'
events
'
;
import
{
Base64
}
from
'
js-base64
'
;
import
{
String
}
from
'
typescript-string-operations
'
;
import
{
getExperimentId
,
getInitTrialSequenceId
}
from
'
../../common/experimentStartupInfo
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
getExperimentRootDir
,
uniqueString
,
getJobCancelStatus
,
getIPV4Address
,
getVersion
}
from
'
../../common/utils
'
;
import
{
TrialJobDetail
,
TrialJobMetric
,
NNIManagerIpConfig
NNIManagerIpConfig
,
TrialJobDetail
,
TrialJobMetric
}
from
'
../../common/trainingService
'
;
import
{
KubernetesTrialJobDetail
,
KubernetesScriptFormat
}
from
'
./kubernetesData
'
;
import
{
KubernetesClusterConfig
}
from
'
./kubernetesConfig
'
;
import
{
GeneralK8sClient
,
KubernetesCRDClient
}
from
'
./kubernetesApiClient
'
;
import
{
getExperimentRootDir
,
getIPV4Address
,
getJobCancelStatus
,
getVersion
,
uniqueString
}
from
'
../../common/utils
'
;
import
{
AzureStorageClientUtility
}
from
'
./azureStorageClientUtils
'
;
import
{
GeneralK8sClient
,
KubernetesCRDClient
}
from
'
./kubernetesApiClient
'
;
import
{
KubernetesClusterConfig
}
from
'
./kubernetesConfig
'
;
import
{
kubernetesScriptFormat
,
KubernetesTrialJobDetail
}
from
'
./kubernetesData
'
;
import
{
KubernetesJobRestServer
}
from
'
./kubernetesJobRestServer
'
;
import
{
String
}
from
'
typescript-string-operations
'
;
import
*
as
azureStorage
from
'
azure-storage
'
;
var
azure
=
require
(
'
azure-storage
'
);
var
base64
=
require
(
'
js-base64
'
).
Base64
;
/**
* Training Service implementation for Kubernetes
*/
abstract
class
KubernetesTrainingService
{
protected
readonly
NNI_KUBERNETES_TRIAL_LABEL
:
string
=
'
nni-kubernetes-trial
'
;
protected
readonly
log
!
:
Logger
;
protected
readonly
metricsEmitter
:
EventEmitter
;
protected
readonly
trialJobsMap
:
Map
<
string
,
KubernetesTrialJobDetail
>
;
/
**
experiment root dir in NFS
*/
/
/
experiment root dir in NFS
protected
readonly
trialLocalNFSTempFolder
:
string
;
protected
stopping
:
boolean
=
false
;
protected
experimentId
!
:
string
;
...
...
@@ -76,13 +77,14 @@ abstract class KubernetesTrainingService {
this
.
logCollection
=
'
none
'
;
}
public
generatePodResource
(
memory
:
number
,
cpuNum
:
number
,
gpuNum
:
number
)
{
// tslint:disable:no-any
public
generatePodResource
(
memory
:
number
,
cpuNum
:
number
,
gpuNum
:
number
):
any
{
return
{
'
memory
'
:
`
${
memory
}
Mi`
,
'
cpu
'
:
`
${
cpuNum
}
`
,
memory
:
`
${
memory
}
Mi`
,
cpu
:
`
${
cpuNum
}
`
,
'
nvidia.com/gpu
'
:
`
${
gpuNum
}
`
}
}
}
;
}
// tslint:enable:no-any
public
async
listTrialJobs
():
Promise
<
TrialJobDetail
[]
>
{
const
jobs
:
TrialJobDetail
[]
=
[];
...
...
@@ -91,7 +93,7 @@ abstract class KubernetesTrainingService {
if
(
value
.
form
.
jobType
===
'
TRIAL
'
)
{
jobs
.
push
(
await
this
.
getTrialJob
(
key
));
}
}
;
}
return
Promise
.
resolve
(
jobs
);
}
...
...
@@ -100,18 +102,18 @@ abstract class KubernetesTrainingService {
const
kubernetesTrialJob
:
TrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
if
(
!
kubernetesTrialJob
)
{
return
Promise
.
reject
(
`trial job
${
trialJobId
}
not found`
)
if
(
kubernetesTrialJob
===
undefined
)
{
return
Promise
.
reject
(
`trial job
${
trialJobId
}
not found`
)
;
}
return
Promise
.
resolve
(
kubernetesTrialJob
);
}
public
addTrialJobMetricListener
(
listener
:
(
metric
:
TrialJobMetric
)
=>
void
)
{
public
addTrialJobMetricListener
(
listener
:
(
metric
:
TrialJobMetric
)
=>
void
)
:
void
{
this
.
metricsEmitter
.
on
(
'
metric
'
,
listener
);
}
public
removeTrialJobMetricListener
(
listener
:
(
metric
:
TrialJobMetric
)
=>
void
)
{
public
removeTrialJobMetricListener
(
listener
:
(
metric
:
TrialJobMetric
)
=>
void
)
:
void
{
this
.
metricsEmitter
.
off
(
'
metric
'
,
listener
);
}
...
...
@@ -127,6 +129,96 @@ abstract class KubernetesTrainingService {
return
this
.
metricsEmitter
;
}
public
async
cancelTrialJob
(
trialJobId
:
string
,
isEarlyStopped
:
boolean
=
false
):
Promise
<
void
>
{
const
trialJobDetail
:
KubernetesTrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
if
(
trialJobDetail
===
undefined
)
{
const
errorMessage
:
string
=
`CancelTrialJob: trial job id
${
trialJobId
}
not found`
;
this
.
log
.
error
(
errorMessage
);
return
Promise
.
reject
(
errorMessage
);
}
if
(
this
.
kubernetesCRDClient
===
undefined
)
{
const
errorMessage
:
string
=
`CancelTrialJob: trial job id
${
trialJobId
}
failed because operatorClient is undefined`
;
this
.
log
.
error
(
errorMessage
);
return
Promise
.
reject
(
errorMessage
);
}
try
{
await
this
.
kubernetesCRDClient
.
deleteKubernetesJob
(
new
Map
(
[
[
'
app
'
,
this
.
NNI_KUBERNETES_TRIAL_LABEL
],
[
'
expId
'
,
getExperimentId
()],
[
'
trialId
'
,
trialJobId
]
]
));
}
catch
(
err
)
{
const
errorMessage
:
string
=
`Delete trial
${
trialJobId
}
failed:
${
err
}
`
;
this
.
log
.
error
(
errorMessage
);
return
Promise
.
reject
(
errorMessage
);
}
trialJobDetail
.
endTime
=
Date
.
now
();
trialJobDetail
.
status
=
getJobCancelStatus
(
isEarlyStopped
);
return
Promise
.
resolve
();
}
public
async
cleanUp
():
Promise
<
void
>
{
this
.
stopping
=
true
;
// First, cancel all running kubernetes jobs
for
(
const
[
trialJobId
,
kubernetesTrialJob
]
of
this
.
trialJobsMap
)
{
if
([
'
RUNNING
'
,
'
WAITING
'
,
'
UNKNOWN
'
].
includes
(
kubernetesTrialJob
.
status
))
{
try
{
await
this
.
cancelTrialJob
(
trialJobId
);
}
catch
(
error
)
{
// DONT throw error during cleanup
}
kubernetesTrialJob
.
status
=
'
SYS_CANCELED
'
;
}
}
// Delete all kubernetes jobs whose expId label is current experiment id
try
{
if
(
this
.
kubernetesCRDClient
!==
undefined
)
{
await
this
.
kubernetesCRDClient
.
deleteKubernetesJob
(
new
Map
(
[
[
'
app
'
,
this
.
NNI_KUBERNETES_TRIAL_LABEL
],
[
'
expId
'
,
getExperimentId
()]
]
));
}
}
catch
(
error
)
{
this
.
log
.
error
(
`Delete kubernetes job with label: app=
${
this
.
NNI_KUBERNETES_TRIAL_LABEL
}
,\
expId=
${
getExperimentId
()}
failed, error is
${
error
}
`
);
}
// Unmount NFS
try
{
await
cpp
.
exec
(
`sudo umount
${
this
.
trialLocalNFSTempFolder
}
`
);
}
catch
(
error
)
{
this
.
log
.
error
(
`Unmount
${
this
.
trialLocalNFSTempFolder
}
failed, error is
${
error
}
`
);
}
// Stop kubernetes rest server
if
(
this
.
kubernetesJobRestServer
===
undefined
)
{
throw
new
Error
(
'
kubernetesJobRestServer not initialized!
'
);
}
try
{
await
this
.
kubernetesJobRestServer
.
stop
();
this
.
log
.
info
(
'
Kubernetes Training service rest server stopped successfully.
'
);
}
catch
(
error
)
{
// tslint:disable-next-line: no-unsafe-any
this
.
log
.
error
(
`Kubernetes Training service rest server stopped failed, error:
${
error
.
message
}
`
);
return
Promise
.
reject
(
error
);
}
return
Promise
.
resolve
();
}
protected
generateSequenceId
():
number
{
if
(
this
.
nextTrialSequenceId
===
-
1
)
{
this
.
nextTrialSequenceId
=
getInitTrialSequenceId
();
...
...
@@ -135,20 +227,26 @@ abstract class KubernetesTrainingService {
return
this
.
nextTrialSequenceId
++
;
}
// tslint:disable: no-unsafe-any no-any
protected
async
createAzureStorage
(
vaultName
:
string
,
valutKeyName
:
string
,
accountName
:
string
,
azureShare
:
string
):
Promise
<
void
>
{
try
{
const
result
=
await
cpp
.
exec
(
`az keyvault secret show --name
${
valutKeyName
}
--vault-name
${
vaultName
}
`
);
if
(
result
.
stderr
)
{
const
result
:
any
=
await
cpp
.
exec
(
`az keyvault secret show --name
${
valutKeyName
}
--vault-name
${
vaultName
}
`
);
if
(
result
.
stderr
)
{
const
errorMessage
:
string
=
result
.
stderr
;
this
.
log
.
error
(
errorMessage
);
return
Promise
.
reject
(
errorMessage
);
}
const
storageAccountKey
=
JSON
.
parse
(
result
.
stdout
).
value
;
const
storageAccountKey
:
any
=
JSON
.
parse
(
result
.
stdout
).
value
;
if
(
this
.
azureStorageAccountName
===
undefined
)
{
throw
new
Error
(
'
azureStorageAccountName not initialized!
'
);
}
//create storage client
this
.
azureStorageClient
=
azure
.
createFileService
(
this
.
azureStorageAccountName
,
storageAccountKey
);
this
.
azureStorageClient
=
azure
Storage
.
createFileService
(
this
.
azureStorageAccountName
,
storageAccountKey
);
await
AzureStorageClientUtility
.
createShare
(
this
.
azureStorageClient
,
this
.
azureStorageShare
);
//create storage secret
this
.
azureStorageSecretName
=
'
nni-secret-
'
+
uniqueString
(
8
).
toLowerCase
();
this
.
azureStorageSecretName
=
String
.
Format
(
'
nni-secret-{0}
'
,
uniqueString
(
8
)
.
toLowerCase
());
await
this
.
genericK8sClient
.
createSecret
(
{
apiVersion
:
'
v1
'
,
...
...
@@ -163,38 +261,42 @@ abstract class KubernetesTrainingService {
},
type
:
'
Opaque
'
,
data
:
{
azurestorageaccountname
:
b
ase64
.
encode
(
this
.
azureStorageAccountName
),
azurestorageaccountkey
:
b
ase64
.
encode
(
storageAccountKey
)
azurestorageaccountname
:
B
ase64
.
encode
(
this
.
azureStorageAccountName
),
azurestorageaccountkey
:
B
ase64
.
encode
(
storageAccountKey
)
}
}
);
}
catch
(
error
)
{
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
return
Promise
.
reject
(
error
);
}
return
Promise
.
resolve
();
}
// tslint:enable: no-unsafe-any no-any
/**
/**
* Generate run script for different roles(like worker or ps)
* @param trialJobId trial job id
* @param trialWorkingFolder working folder
* @param command
* @param command
command
* @param trialSequenceId sequence id
*/
protected
async
generateRunScript
(
platform
:
string
,
trialJobId
:
string
,
trialWorkingFolder
:
string
,
command
:
string
,
trialSequenceId
:
string
,
roleName
:
string
,
gpuNum
:
number
):
Promise
<
string
>
{
let
nvidia
_s
cript
:
string
=
''
;
command
:
string
,
trialSequenceId
:
string
,
roleName
:
string
,
gpuNum
:
number
):
Promise
<
string
>
{
let
nvidia
S
cript
:
string
=
''
;
// Nvidia device plugin for K8S has a known issue that requesting zero GPUs allocates all GPUs
// Refer https://github.com/NVIDIA/k8s-device-plugin/issues/61
// So we have to explicitly set CUDA_VISIBLE_DEVICES to empty if user sets gpuNum to 0 in NNI config file
if
(
gpuNum
===
0
)
{
nvidia
_s
cript
=
`export CUDA_VISIBLE_DEVICES='0'`
;
if
(
gpuNum
===
0
)
{
nvidia
S
cript
=
`export CUDA_VISIBLE_DEVICES='0'`
;
}
const
nniManagerIp
=
this
.
nniManagerIpConfig
?
this
.
nniManagerIpConfig
.
nniManagerIp
:
getIPV4Address
();
const
version
=
this
.
versionCheck
?
await
getVersion
():
''
;
// tslint:disable-next-line: strict-boolean-expressions
const
nniManagerIp
:
string
=
this
.
nniManagerIpConfig
?
this
.
nniManagerIpConfig
.
nniManagerIp
:
getIPV4Address
();
const
version
:
string
=
this
.
versionCheck
?
await
getVersion
()
:
''
;
const
runScript
:
string
=
String
.
Format
(
K
ubernetesScriptFormat
,
k
ubernetesScriptFormat
,
platform
,
trialJobId
,
path
.
join
(
trialWorkingFolder
,
'
output
'
,
`
${
roleName
}
_output`
),
...
...
@@ -202,108 +304,28 @@ abstract class KubernetesTrainingService {
getExperimentId
(),
trialWorkingFolder
,
trialSequenceId
,
nvidia
_s
cript
,
nvidia
S
cript
,
command
,
nniManagerIp
,
this
.
kubernetesRestServerPort
,
version
,
this
.
logCollection
);
return
Promise
.
resolve
(
runScript
);
}
protected
async
createNFSStorage
(
nfsServer
:
string
,
nfsPath
:
string
):
Promise
<
void
>
{
await
cpp
.
exec
(
`mkdir -p
${
this
.
trialLocalNFSTempFolder
}
`
);
try
{
await
cpp
.
exec
(
`sudo mount
${
nfsServer
}
:
${
nfsPath
}
${
this
.
trialLocalNFSTempFolder
}
`
);
}
catch
(
error
)
{
}
catch
(
error
)
{
const
mountError
:
string
=
`Mount NFS
${
nfsServer
}
:
${
nfsPath
}
to
${
this
.
trialLocalNFSTempFolder
}
failed, error is
${
error
}
`
;
this
.
log
.
error
(
mountError
);
return
Promise
.
reject
(
mountError
);
}
return
Promise
.
resolve
();
}
public
async
cancelTrialJob
(
trialJobId
:
string
,
isEarlyStopped
:
boolean
=
false
):
Promise
<
void
>
{
const
trialJobDetail
:
KubernetesTrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
if
(
!
trialJobDetail
)
{
const
errorMessage
:
string
=
`CancelTrialJob: trial job id
${
trialJobId
}
not found`
;
this
.
log
.
error
(
errorMessage
);
return
Promise
.
reject
(
errorMessage
);
}
if
(
!
this
.
kubernetesCRDClient
)
{
const
errorMessage
:
string
=
`CancelTrialJob: trial job id
${
trialJobId
}
failed because operatorClient is undefined`
;
this
.
log
.
error
(
errorMessage
);
return
Promise
.
reject
(
errorMessage
);
}
try
{
await
this
.
kubernetesCRDClient
.
deleteKubernetesJob
(
new
Map
(
[
[
'
app
'
,
this
.
NNI_KUBERNETES_TRIAL_LABEL
],
[
'
expId
'
,
getExperimentId
()],
[
'
trialId
'
,
trialJobId
]
]
));
}
catch
(
err
)
{
const
errorMessage
:
string
=
`Delete trial
${
trialJobId
}
failed:
${
err
}
`
;
this
.
log
.
error
(
errorMessage
);
return
Promise
.
reject
(
errorMessage
);
}
trialJobDetail
.
endTime
=
Date
.
now
();
trialJobDetail
.
status
=
getJobCancelStatus
(
isEarlyStopped
);
return
Promise
.
resolve
();
}
/**
 * Best-effort shutdown of the training service.
 *
 * Marks the service as stopping, cancels every trial that is still
 * RUNNING / WAITING / UNKNOWN, deletes all kubernetes jobs labelled with the
 * current experiment id, unmounts the local NFS temp folder and finally stops
 * the kubernetes job REST server.
 *
 * Failures of the cancel / delete / unmount steps are logged and do not abort
 * the cleanup. The returned promise rejects when the REST server was never
 * initialized or when stopping it fails.
 */
public async cleanUp(): Promise<void> {
    this.stopping = true;

    // First, cancel all running kubernetes jobs
    for (const [trialJobId, kubernetesTrialJob] of this.trialJobsMap) {
        if (['RUNNING', 'WAITING', 'UNKNOWN'].includes(kubernetesTrialJob.status)) {
            try {
                await this.cancelTrialJob(trialJobId);
            } catch (error) {
                // DONT throw error during cleanup: cancelTrialJob logs its own
                // failure before rejecting, so the rejection is deliberately dropped here.
            }
            kubernetesTrialJob.status = 'SYS_CANCELED';
        }
    }

    // Delete all kubernetes jobs whose expId label is current experiment id
    try {
        if (this.kubernetesCRDClient) {
            await this.kubernetesCRDClient.deleteKubernetesJob(new Map(
                [
                    ['app', this.NNI_KUBERNETES_TRIAL_LABEL],
                    ['expId', getExperimentId()]
                ]
            ));
        }
    } catch (error) {
        this.log.error(`Delete kubernetes job with label: app=${this.NNI_KUBERNETES_TRIAL_LABEL},expId=${getExperimentId()} failed, error is ${error}`);
    }

    // Unmount NFS
    try {
        await cpp.exec(`sudo umount ${this.trialLocalNFSTempFolder}`);
    } catch (error) {
        this.log.error(`Unmount ${this.trialLocalNFSTempFolder} failed, error is ${error}`);
    }

    // Stop kubernetes rest server
    if (!this.kubernetesJobRestServer) {
        throw new Error('kubernetesJobRestServer not initialized!');
    }
    try {
        await this.kubernetesJobRestServer.stop();
        this.log.info('Kubernetes Training service rest server stopped successfully.');
    } catch (error) {
        this.log.error(`Kubernetes Training service rest server stopped failed, error: ${error.message}`);

        // Reject exactly once, with the error that was actually caught.
        return Promise.reject(error);
    }

    return Promise.resolve();
}
}
// Single export of the training service class (the name must only be exported once).
export { KubernetesTrainingService };
src/nni_manager/training_service/local/gpuScheduler.ts
View file @
22993e5d
...
...
@@ -25,10 +25,10 @@ import * as fs from 'fs';
import
*
as
os
from
'
os
'
;
import
*
as
path
from
'
path
'
;
import
{
String
}
from
'
typescript-string-operations
'
;
import
{
execMkdir
,
getScriptName
,
getgpuMetricsCollectorScriptContent
,
execScript
,
execTail
,
execRemove
,
execKill
}
from
'
../common/util
'
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
delay
}
from
'
../../common/utils
'
;
import
{
GPUInfo
,
GPUSummary
}
from
'
../common/gpuData
'
;
import
{
execKill
,
execMkdir
,
execRemove
,
execTail
,
getgpuMetricsCollectorScriptContent
,
getScriptName
,
runScript
}
from
'
../common/util
'
;
/**
* GPUScheduler for local training service
...
...
@@ -37,8 +37,8 @@ class GPUScheduler {
private
gpuSummary
!
:
GPUSummary
;
private
stopping
:
boolean
;
private
log
:
Logger
;
private
gpuMetricCollectorScriptFolder
:
string
;
private
readonly
log
:
Logger
;
private
readonly
gpuMetricCollectorScriptFolder
:
string
;
constructor
()
{
this
.
stopping
=
false
;
...
...
@@ -58,28 +58,15 @@ class GPUScheduler {
}
}
/**
 * Create the collector script folder, write the gpu_metrics_collector script
 * into it and start the script through execScript (the call is not awaited).
 */
private async runGpuMetricsCollectorScript(): Promise<void> {
    const scriptFolder: string = this.gpuMetricCollectorScriptFolder;
    await execMkdir(scriptFolder);

    // generate gpu_metrics_collector script
    const scriptFile: string = path.join(scriptFolder, getScriptName('gpu_metrics_collector'));
    const scriptBody: string = getgpuMetricsCollectorScriptContent(scriptFolder);
    await fs.promises.writeFile(scriptFile, scriptBody, { encoding: 'utf8' });

    execScript(scriptFile);
}
public
getAvailableGPUIndices
(
useActiveGpu
:
boolean
,
occupiedGpuIndexNumMap
:
Map
<
number
,
number
>
):
number
[]
{
if
(
this
.
gpuSummary
!==
undefined
)
{
if
(
process
.
platform
===
'
win32
'
||
useActiveGpu
)
{
if
(
process
.
platform
===
'
win32
'
||
useActiveGpu
)
{
return
this
.
gpuSummary
.
gpuInfos
.
map
((
info
:
GPUInfo
)
=>
info
.
index
);
}
else
{
return
this
.
gpuSummary
.
gpuInfos
.
filter
((
info
:
GPUInfo
)
=>
occupiedGpuIndexNumMap
.
get
(
info
.
index
)
=
==
undefined
&&
info
.
activeProcessNum
===
0
||
occupiedGpuIndexNumMap
.
get
(
info
.
index
)
!==
undefined
)
.
map
((
info
:
GPUInfo
)
=>
info
.
index
);
}
else
{
return
this
.
gpuSummary
.
gpuInfos
.
filter
((
info
:
GPUInfo
)
=>
occupiedGpuIndexNumMap
.
get
(
info
.
index
)
===
undefined
&&
info
.
activeProcessNum
===
0
||
occupiedGpuIndexNumMap
.
get
(
info
.
index
)
!
==
undefined
)
.
map
((
info
:
GPUInfo
)
=>
info
.
index
);
}
}
...
...
@@ -105,17 +92,32 @@ class GPUScheduler {
}
}
/**
 * Create the collector script folder, write the gpu_metrics_collector script
 * into it and start the script through runScript (the call is not awaited).
 */
private async runGpuMetricsCollectorScript(): Promise<void> {
    const scriptFolder: string = this.gpuMetricCollectorScriptFolder;
    await execMkdir(scriptFolder);

    // generate gpu_metrics_collector script
    const scriptFile: string = path.join(scriptFolder, getScriptName('gpu_metrics_collector'));
    const scriptBody: string = getgpuMetricsCollectorScriptContent(scriptFolder);
    await fs.promises.writeFile(scriptFile, scriptBody, { encoding: 'utf8' });

    runScript(scriptFile);
}
// tslint:disable:non-literal-fs-path
/**
 * Refresh this.gpuSummary from the gpu_metrics file in the collector folder.
 *
 * The output of execTail on that file is parsed as JSON. A missing file is
 * logged as a warning; a missing or empty tail output is logged as an error
 * and leaves the previous summary untouched.
 */
private async updateGPUSummary(): Promise<void> {
    const gpuMetricPath: string = path.join(this.gpuMetricCollectorScriptFolder, 'gpu_metrics');
    if (!fs.existsSync(gpuMetricPath)) {
        this.log.warning('gpu_metrics file does not exist!');

        return;
    }

    const cmdresult: cpp.childProcessPromise.Result = await execTail(gpuMetricPath);
    // An empty stdout must be rejected as well: JSON.parse('') throws a SyntaxError.
    if (cmdresult !== undefined && cmdresult.stdout !== undefined && cmdresult.stdout !== '') {
        this.gpuSummary = <GPUSummary>JSON.parse(cmdresult.stdout);
    } else {
        this.log.error('Could not get gpu metrics information!');
    }
}
}
...
...
src/nni_manager/training_service/local/localTrainingService.ts
View file @
22993e5d
...
...
@@ -24,6 +24,7 @@ import { EventEmitter } from 'events';
import
*
as
fs
from
'
fs
'
;
import
*
as
path
from
'
path
'
;
import
*
as
ts
from
'
tail-stream
'
;
import
*
as
tkill
from
'
tree-kill
'
;
import
{
NNIError
,
NNIErrorNames
}
from
'
../../common/errors
'
;
import
{
getInitTrialSequenceId
}
from
'
../../common/experimentStartupInfo
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
...
...
@@ -31,14 +32,14 @@ import {
HostJobApplicationForm
,
HyperParameters
,
JobApplicationForm
,
TrainingService
,
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobMetric
,
TrialJobStatus
}
from
'
../../common/trainingService
'
;
import
{
delay
,
generateParamFileName
,
getExperimentRootDir
,
getJobCancelStatus
,
uniqueString
,
isAlive
,
getNewLine
}
from
'
../../common/utils
'
;
import
{
execMkdir
,
getScriptName
,
execScript
,
setEnvironmentVariable
,
execNewFile
}
from
'
../common/util
'
import
{
delay
,
generateParamFileName
,
getExperimentRootDir
,
getJobCancelStatus
,
getNewLine
,
isAlive
,
uniqueString
}
from
'
../../common/utils
'
;
import
{
TrialConfig
}
from
'
../common/trialConfig
'
;
import
{
TrialConfigMetadataKey
}
from
'
../common/trialConfigMetadataKey
'
;
import
{
execMkdir
,
execNewFile
,
getScriptName
,
runScript
,
setEnvironmentVariable
}
from
'
../common/util
'
;
import
{
GPUScheduler
}
from
'
./gpuScheduler
'
;
const
tkill
=
require
(
'
tree-kill
'
);
/**
* Decode a command
* @param Buffer binary incoming data
...
...
@@ -46,7 +47,7 @@ const tkill = require('tree-kill');
* success: true if the buffer contains at least one complete command; otherwise false
* remain: remaining data after the first command
*/
// tslint:disable
-next-line:
informative-docs
// tslint:disable
:newline-per-chained-call
informative-docs
function
decodeCommand
(
data
:
Buffer
):
[
boolean
,
string
,
string
,
Buffer
]
{
if
(
data
.
length
<
8
)
{
return
[
false
,
''
,
''
,
data
];
...
...
@@ -61,6 +62,7 @@ function decodeCommand(data: Buffer): [boolean, string, string, Buffer] {
return
[
true
,
commandType
,
content
,
remain
];
}
// tslint:enable:newline-per-chained-call informative-docs
/**
* LocalTrialJobDetail
...
...
@@ -117,21 +119,21 @@ class LocalConfig {
* Local machine training service
*/
class
LocalTrainingService
implements
TrainingService
{
private
eventEmitter
:
EventEmitter
;
private
jobMap
:
Map
<
string
,
LocalTrialJobDetail
>
;
private
jobQueue
:
string
[];
private
readonly
eventEmitter
:
EventEmitter
;
private
readonly
jobMap
:
Map
<
string
,
LocalTrialJobDetail
>
;
private
readonly
jobQueue
:
string
[];
private
initialized
:
boolean
;
private
stopping
:
boolean
;
private
rootDir
!
:
string
;
private
trialSequenceId
:
number
;
private
gpuScheduler
!
:
GPUScheduler
;
private
occupiedGpuIndexNumMap
:
Map
<
number
,
number
>
;
private
readonly
occupiedGpuIndexNumMap
:
Map
<
number
,
number
>
;
private
designatedGpuIndices
!
:
Set
<
number
>
;
private
log
:
Logger
;
private
readonly
log
:
Logger
;
private
localTrailConfig
?:
TrialConfig
;
private
localConfig
?:
LocalConfig
;
private
isMultiPhase
:
boolean
;
private
jobStreamMap
:
Map
<
string
,
ts
.
Stream
>
;
private
readonly
jobStreamMap
:
Map
<
string
,
ts
.
Stream
>
;
private
maxTrialNumPerGpu
:
number
;
private
useActiveGpu
:
boolean
;
...
...
@@ -182,7 +184,7 @@ class LocalTrainingService implements TrainingService {
return
this
.
getHostJob
(
trialJobId
);
}
if
(
trialJob
.
status
===
'
RUNNING
'
)
{
le
t
alive
:
boolean
=
await
isAlive
(
trialJob
.
pid
);
cons
t
alive
:
boolean
=
await
isAlive
(
trialJob
.
pid
);
if
(
!
alive
)
{
trialJob
.
endTime
=
Date
.
now
();
this
.
setTrialJobStatus
(
trialJob
,
'
FAILED
'
);
...
...
@@ -276,7 +278,7 @@ class LocalTrainingService implements TrainingService {
return
Promise
.
resolve
();
}
if
(
trialJob
.
form
.
jobType
===
'
TRIAL
'
)
{
await
tkill
(
trialJob
.
pid
,
'
SIGKILL
'
);
tkill
(
trialJob
.
pid
,
'
SIGKILL
'
);
}
else
if
(
trialJob
.
form
.
jobType
===
'
HOST
'
)
{
await
cpp
.
exec
(
`pkill -9 -P
${
trialJob
.
pid
}
`
);
}
else
{
...
...
@@ -290,7 +292,8 @@ class LocalTrainingService implements TrainingService {
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
if
(
!
this
.
initialized
)
{
this
.
rootDir
=
getExperimentRootDir
();
if
(
!
fs
.
existsSync
(
this
.
rootDir
)){
// tslint:disable-next-line:non-literal-fs-path
if
(
!
fs
.
existsSync
(
this
.
rootDir
))
{
await
cpp
.
exec
(
`powershell.exe mkdir
${
this
.
rootDir
}
`
);
}
this
.
initialized
=
true
;
...
...
@@ -299,7 +302,7 @@ class LocalTrainingService implements TrainingService {
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
this
.
localTrailConfig
=
<
TrialConfig
>
JSON
.
parse
(
value
);
// Parse trial config failed, throw Error
if
(
!
this
.
localTrailConfig
)
{
if
(
this
.
localTrailConfig
===
undefined
)
{
throw
new
Error
(
'
trial config parsed failed
'
);
}
this
.
log
.
info
(
`required GPU number is
${
this
.
localTrailConfig
.
gpuNum
}
`
);
...
...
@@ -336,10 +339,10 @@ class LocalTrainingService implements TrainingService {
switch
(
key
)
{
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
let
getResult
:
Promise
<
string
>
;
if
(
!
this
.
localTrailConfig
)
{
if
(
this
.
localTrailConfig
===
undefined
)
{
getResult
=
Promise
.
reject
(
new
NNIError
(
NNIErrorNames
.
NOT_FOUND
,
`
${
key
}
is never set yet`
));
}
else
{
getResult
=
Promise
.
resolve
(
!
this
.
localTrailConfig
?
''
:
JSON
.
stringify
(
this
.
localTrailConfig
));
getResult
=
Promise
.
resolve
(
JSON
.
stringify
(
this
.
localTrailConfig
));
}
return
getResult
;
...
...
@@ -366,7 +369,7 @@ class LocalTrainingService implements TrainingService {
if
([
'
SUCCEEDED
'
,
'
FAILED
'
,
'
USER_CANCELED
'
,
'
SYS_CANCELED
'
,
'
EARLY_STOPPED
'
].
includes
(
trialJob
.
status
))
{
if
(
this
.
jobStreamMap
.
has
(
trialJob
.
id
))
{
const
stream
:
ts
.
Stream
|
undefined
=
this
.
jobStreamMap
.
get
(
trialJob
.
id
);
if
(
!
stream
)
{
if
(
stream
===
undefined
)
{
throw
new
Error
(
`Could not find stream in trial
${
trialJob
.
id
}
`
);
}
stream
.
destroy
();
...
...
@@ -376,13 +379,13 @@ class LocalTrainingService implements TrainingService {
if
(
trialJob
.
gpuIndices
!==
undefined
&&
trialJob
.
gpuIndices
.
length
>
0
&&
this
.
gpuScheduler
!==
undefined
)
{
if
(
oldStatus
===
'
RUNNING
'
&&
trialJob
.
status
!==
'
RUNNING
'
)
{
for
(
const
index
of
trialJob
.
gpuIndices
)
{
le
t
num
:
number
|
undefined
=
this
.
occupiedGpuIndexNumMap
.
get
(
index
);
if
(
num
===
undefined
)
{
cons
t
num
:
number
|
undefined
=
this
.
occupiedGpuIndexNumMap
.
get
(
index
);
if
(
num
===
undefined
)
{
throw
new
Error
(
`gpu resource schedule error`
);
}
else
if
(
num
===
1
)
{
}
else
if
(
num
===
1
)
{
this
.
occupiedGpuIndexNumMap
.
delete
(
index
);
}
else
{
this
.
occupiedGpuIndexNumMap
.
set
(
index
,
num
-
1
)
this
.
occupiedGpuIndexNumMap
.
set
(
index
,
num
-
1
)
;
}
}
}
...
...
@@ -424,10 +427,10 @@ class LocalTrainingService implements TrainingService {
}
let
selectedGPUIndices
:
number
[]
=
[];
le
t
availableGpuIndices
:
number
[]
=
this
.
gpuScheduler
.
getAvailableGPUIndices
(
this
.
useActiveGpu
,
this
.
occupiedGpuIndexNumMap
);
for
(
le
t
index
of
availableGpuIndices
)
{
le
t
num
:
number
|
undefined
=
this
.
occupiedGpuIndexNumMap
.
get
(
index
);
if
(
num
===
undefined
||
num
<
this
.
maxTrialNumPerGpu
)
{
cons
t
availableGpuIndices
:
number
[]
=
this
.
gpuScheduler
.
getAvailableGPUIndices
(
this
.
useActiveGpu
,
this
.
occupiedGpuIndexNumMap
);
for
(
cons
t
index
of
availableGpuIndices
)
{
cons
t
num
:
number
|
undefined
=
this
.
occupiedGpuIndexNumMap
.
get
(
index
);
if
(
num
===
undefined
||
num
<
this
.
maxTrialNumPerGpu
)
{
selectedGPUIndices
.
push
(
index
);
}
}
...
...
@@ -461,11 +464,11 @@ class LocalTrainingService implements TrainingService {
private
occupyResource
(
resource
:
{
gpuIndices
:
number
[]}):
void
{
if
(
this
.
gpuScheduler
!==
undefined
)
{
for
(
const
index
of
resource
.
gpuIndices
)
{
le
t
num
:
number
|
undefined
=
this
.
occupiedGpuIndexNumMap
.
get
(
index
);
if
(
num
===
undefined
)
{
this
.
occupiedGpuIndexNumMap
.
set
(
index
,
1
)
cons
t
num
:
number
|
undefined
=
this
.
occupiedGpuIndexNumMap
.
get
(
index
);
if
(
num
===
undefined
)
{
this
.
occupiedGpuIndexNumMap
.
set
(
index
,
1
)
;
}
else
{
this
.
occupiedGpuIndexNumMap
.
set
(
index
,
num
+
1
)
this
.
occupiedGpuIndexNumMap
.
set
(
index
,
num
+
1
)
;
}
}
}
...
...
@@ -498,20 +501,20 @@ class LocalTrainingService implements TrainingService {
}
}
/**
 * Build the platform-specific script lines that run the trial command.
 *
 * On win32 the command is run through `cmd /c`; elsewhere through `eval`.
 * In both cases stderr is redirected to `<workingDirectory>/stderr`, and the
 * exit code followed by a millisecond timestamp is written to
 * `<workingDirectory>/.nni/state`.
 *
 * @param localTrailConfig trial config providing the command to run
 * @param workingDirectory working directory of the trial job
 * @returns the script lines, in execution order
 */
private getScript(localTrailConfig: TrialConfig, workingDirectory: string): string[] {
    const script: string[] = [];
    if (process.platform === 'win32') {
        script.push(
            `cmd /c ${localTrailConfig.command} 2>${path.join(workingDirectory, 'stderr')}`,
            `$NOW_DATE = [int64](([datetime]::UtcNow)-(get-date "1/1/1970")).TotalSeconds`,
            `$NOW_DATE = "$NOW_DATE" + "000"`,
            `Write $LASTEXITCODE " " $NOW_DATE | Out-File ${path.join(workingDirectory, '.nni', 'state')} -NoNewline -encoding utf8`);
    } else {
        script.push(
            `eval ${localTrailConfig.command} 2>${path.join(workingDirectory, 'stderr')}`,
            `echo $? \`date +%s000\` >${path.join(workingDirectory, '.nni', 'state')}`);
    }

    return script;
}
...
...
@@ -519,28 +522,29 @@ class LocalTrainingService implements TrainingService {
const
trialJobDetail
:
LocalTrialJobDetail
=
<
LocalTrialJobDetail
>
this
.
jobMap
.
get
(
trialJobId
);
const
variables
:
{
key
:
string
;
value
:
string
}[]
=
this
.
getEnvironmentVariables
(
trialJobDetail
,
resource
);
if
(
!
this
.
localTrailConfig
)
{
if
(
this
.
localTrailConfig
===
undefined
)
{
throw
new
Error
(
'
trial config is not initialized
'
);
}
const
runScript
Lines
:
string
[]
=
[];
if
(
process
.
platform
!==
"
win32
"
)
{
runScript
Lines
.
push
(
'
#!/bin/bash
'
);
const
runScript
Content
:
string
[]
=
[];
if
(
process
.
platform
!==
'
win32
'
)
{
runScript
Content
.
push
(
'
#!/bin/bash
'
);
}
runScript
Lines
.
push
(
`cd
${
this
.
localTrailConfig
.
codeDir
}
`
);
runScript
Content
.
push
(
`cd
${
this
.
localTrailConfig
.
codeDir
}
`
);
for
(
const
variable
of
variables
)
{
runScript
Lines
.
push
(
setEnvironmentVariable
(
variable
));
runScript
Content
.
push
(
setEnvironmentVariable
(
variable
));
}
const
scripts
:
string
[]
=
this
.
getScript
(
this
.
localTrailConfig
,
trialJobDetail
.
workingDirectory
);
scripts
.
forEach
(
script
=>
{
runScript
Lines
.
push
(
script
);
scripts
.
forEach
(
(
script
:
string
)
=>
{
runScript
Content
.
push
(
script
);
});
await
execMkdir
(
trialJobDetail
.
workingDirectory
);
await
execMkdir
(
path
.
join
(
trialJobDetail
.
workingDirectory
,
'
.nni
'
));
await
execNewFile
(
path
.
join
(
trialJobDetail
.
workingDirectory
,
'
.nni
'
,
'
metrics
'
));
const
scriptName
:
string
=
getScriptName
(
'
run
'
);
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialJobDetail
.
workingDirectory
,
scriptName
),
runScriptLines
.
join
(
getNewLine
()),
{
encoding
:
'
utf8
'
,
mode
:
0o777
});
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialJobDetail
.
workingDirectory
,
scriptName
),
runScriptContent
.
join
(
getNewLine
()),
{
encoding
:
'
utf8
'
,
mode
:
0o777
});
await
this
.
writeParameterFile
(
trialJobDetail
.
workingDirectory
,
(
<
TrialJobApplicationForm
>
trialJobDetail
.
form
).
hyperParameters
);
const
trialJobProcess
:
cp
.
ChildProcess
=
exec
Script
(
path
.
join
(
trialJobDetail
.
workingDirectory
,
scriptName
));
const
trialJobProcess
:
cp
.
ChildProcess
=
run
Script
(
path
.
join
(
trialJobDetail
.
workingDirectory
,
scriptName
));
this
.
setTrialJobStatus
(
trialJobDetail
,
'
RUNNING
'
);
trialJobDetail
.
startTime
=
Date
.
now
();
trialJobDetail
.
pid
=
trialJobProcess
.
pid
;
...
...
src/nni_manager/training_service/pai/hdfsClientUtility.ts
View file @
22993e5d
...
...
@@ -17,12 +17,12 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
import
*
as
path
from
'
path
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
path
from
'
path
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
getExperimentId
}
from
'
../../common/experimentStartupInfo
'
;
import
{
getLogger
}
from
'
../../common/log
'
;
import
{
unixPathJoin
}
from
'
../../common/utils
'
import
{
unixPathJoin
}
from
'
../../common/utils
'
;
/**
* HDFS client utility, including copy file/directory
...
...
@@ -33,6 +33,7 @@ export namespace HDFSClientUtility {
* @param hdfsUserName HDFS user name
*/
function hdfsExpRootDir(hdfsUserName: string): string {
    // <user>/nni/experiments/<experiment id>, joined with unix separators
    const relativeRoot: string = unixPathJoin(hdfsUserName, 'nni', 'experiments', getExperimentId());

    // Prefix with a single slash to form the root path
    return `/${relativeRoot}`;
}
...
...
@@ -50,8 +51,8 @@ export namespace HDFSClientUtility {
* @param trialId NNI trial ID
*/
export function getHdfsTrialWorkDir(hdfsUserName: string, trialId: string): string {
    // Experiment root for this user; the trial directory lives under <root>/trials/<trialId>
    const root: string = hdfsExpRootDir(hdfsUserName);

    return unixPathJoin(root, 'trials', trialId);
}
...
...
@@ -62,26 +63,31 @@ export namespace HDFSClientUtility {
* @param hdfsFilePath hdfs file path(target)
* @param hdfsClient hdfs client
*/
// tslint:disable: no-unsafe-any non-literal-fs-path no-any
export async function copyFileToHdfs(localFilePath: string, hdfsFilePath: string, hdfsClient: any): Promise<void> {
    const deferred: Deferred<void> = new Deferred<void>();
    fs.exists(localFilePath, (exists: boolean) => {
        // Detect if local file exist
        if (!exists) {
            getLogger()
              .error(`HDFSCientUtility:copyFileToHdfs, ${localFilePath} doesn't exist locally`);
            deferred.reject('file not exist!');

            return;
        }

        const localFileStream: fs.ReadStream = fs.createReadStream(localFilePath);
        const hdfsFileStream: any = hdfsClient.createWriteStream(hdfsFilePath);

        // A failing read never reaches the write stream's handlers; without this
        // handler the returned promise would stay pending forever.
        localFileStream.on('error', (err: Error) => {
            getLogger()
              .error(`HDFSCientUtility:copyFileToHdfs, copy file failed, err is ${err.message}`);
            deferred.reject(err);
        });

        localFileStream.pipe(hdfsFileStream);

        // Resolve once the HDFS side reports that everything was written
        hdfsFileStream.on('finish', () => {
            deferred.resolve();
        });
        hdfsFileStream.on('error', (err: any) => {
            getLogger()
              .error(`HDFSCientUtility:copyFileToHdfs, copy file failed, err is ${err.message}`);
            deferred.reject(err);
        });
    });

    return deferred.promise;
}
...
...
@@ -92,21 +98,23 @@ export namespace HDFSClientUtility {
* @param hdfsDirectory HDFS directory
* @param hdfsClient HDFS client
*/
export
async
function
copyDirectoryToHdfs
(
localDirectory
:
string
,
hdfsDirectory
:
string
,
hdfsClient
:
any
)
:
Promise
<
void
>
{
export
async
function
copyDirectoryToHdfs
(
localDirectory
:
string
,
hdfsDirectory
:
string
,
hdfsClient
:
any
)
:
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
// TODO: fs.readdirSync doesn't support ~($HOME)
const
fileNameArray
:
string
[]
=
fs
.
readdirSync
(
localDirectory
);
for
(
var
fileName
of
fileNameArray
){
for
(
const
fileName
of
fileNameArray
)
{
const
fullFilePath
:
string
=
path
.
join
(
localDirectory
,
fileName
);
try
{
if
(
fs
.
lstatSync
(
fullFilePath
).
isFile
())
{
// tslint:disable-next-line:non-literal-fs-path
if
(
fs
.
lstatSync
(
fullFilePath
)
.
isFile
())
{
await
copyFileToHdfs
(
fullFilePath
,
path
.
join
(
hdfsDirectory
,
fileName
),
hdfsClient
);
}
else
{
// If filePath is a directory, recursively copy it to the remote directory
await
copyDirectoryToHdfs
(
fullFilePath
,
path
.
join
(
hdfsDirectory
,
fileName
),
hdfsClient
);
}
}
catch
(
error
)
{
}
catch
(
error
)
{
deferred
.
reject
(
error
);
}
}
...
...
@@ -122,16 +130,16 @@ export namespace HDFSClientUtility {
* @param hdfsPath HDFS file path
* @param hdfsClient HDFS client
*/
export
async
function
readFileFromHDFS
(
hdfsPath
:
string
,
hdfsClient
:
any
)
:
Promise
<
Buffer
>
{
export
async
function
readFileFromHDFS
(
hdfsPath
:
string
,
hdfsClient
:
any
)
:
Promise
<
Buffer
>
{
const
deferred
:
Deferred
<
Buffer
>
=
new
Deferred
<
Buffer
>
();
let
buffer
:
Buffer
=
Buffer
.
alloc
(
0
);
const
exist
:
boolean
=
await
pathExists
(
hdfsPath
,
hdfsClient
);
if
(
!
exist
)
{
if
(
!
exist
)
{
deferred
.
reject
(
`
${
hdfsPath
}
doesn't exists`
);
}
const
remoteFileStream
=
hdfsClient
.
createReadStream
(
hdfsPath
);
const
remoteFileStream
:
any
=
hdfsClient
.
createReadStream
(
hdfsPath
);
remoteFileStream
.
on
(
'
error
'
,
(
err
:
any
)
=>
{
// Reject with the error
deferred
.
reject
(
err
);
...
...
@@ -142,7 +150,7 @@ export namespace HDFSClientUtility {
buffer
=
Buffer
.
concat
([
buffer
,
chunk
]);
});
remoteFileStream
.
on
(
'
finish
'
,
function
onFinish
()
{
remoteFileStream
.
on
(
'
finish
'
,
()
=>
{
// Upload is done, resolve
deferred
.
resolve
(
buffer
);
});
...
...
@@ -158,30 +166,32 @@ export namespace HDFSClientUtility {
*/
export async function pathExists(hdfsPath: string, hdfsClient: any): Promise<boolean> {
    // Handle of the single timeout timer, cleared once the race settles
    let timeoutId: ReturnType<typeof setTimeout> | undefined;

    // Settles with whatever the client's exists() callback reports
    const existsResult: Promise<boolean> = new Promise<boolean>((resolve: (value: boolean) => void): void => {
        hdfsClient.exists(hdfsPath, (exist: boolean) => {
            resolve(exist);
        });
    });

    // Set timeout and reject the promise once the timeout (5 seconds) is reached
    const delayTimeout: Promise<boolean> = new Promise<boolean>(
        (_resolve: (value: boolean) => void, reject: (reason: string) => void): void => {
            timeoutId = setTimeout(() => { reject(`Check HDFS path ${hdfsPath} exists timeout`); }, 5000);
        });

    return Promise.race([existsResult, delayTimeout])
      .finally(() => {
        if (timeoutId !== undefined) {
            clearTimeout(timeoutId);
        }
      });
}
/**
* Mkdir in HDFS, use default permission 755
*
* @param hdfsPath the path in HDFS. It could be either file or directory
* @param hdfsClient
* @param hdfsClient
HDFS client
*/
export
function
mkdir
(
hdfsPath
:
string
,
hdfsClient
:
any
)
:
Promise
<
boolean
>
{
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
hdfsClient
.
mkdir
(
hdfsPath
,
(
err
:
any
)
=>
{
if
(
!
err
)
{
hdfsClient
.
mkdir
(
hdfsPath
,
(
err
:
any
)
=>
{
if
(
!
err
)
{
deferred
.
resolve
(
true
);
}
else
{
deferred
.
reject
(
err
.
message
);
...
...
@@ -195,17 +205,17 @@ export namespace HDFSClientUtility {
* Read directory contents
*
* @param hdfsPath the path in HDFS. It could be either file or directory
* @param hdfsClient
* @param hdfsClient
HDFS client
*/
export
async
function
readdir
(
hdfsPath
:
string
,
hdfsClient
:
any
)
:
Promise
<
string
[]
>
{
const
deferred
:
Deferred
<
string
[]
>
=
new
Deferred
<
string
[]
>
();
const
exist
:
boolean
=
await
pathExists
(
hdfsPath
,
hdfsClient
);
if
(
!
exist
)
{
if
(
!
exist
)
{
deferred
.
reject
(
`
${
hdfsPath
}
doesn't exists`
);
}
hdfsClient
.
readdir
(
hdfsPath
,
(
err
:
any
,
files
:
any
[]
)
=>
{
if
(
err
)
{
hdfsClient
.
readdir
(
hdfsPath
,
(
err
:
any
,
files
:
any
[])
=>
{
if
(
err
)
{
deferred
.
reject
(
err
);
}
...
...
@@ -218,18 +228,20 @@ export namespace HDFSClientUtility {
/**
* Delete HDFS path
* @param hdfsPath the path in HDFS. It could be either file or directory
* @param hdfsClient
* @param hdfsClient
HDFS client
* @param recursive Mark if need to delete recursively
*/
export function deletePath(hdfsPath: string, hdfsClient: any, recursive: boolean = true): Promise<boolean> {
    const deferred: Deferred<boolean> = new Deferred<boolean>();
    // Resolves to true on success; rejects with the error's message otherwise
    hdfsClient.unlink(hdfsPath, recursive, (err: any) => {
        if (!err) {
            deferred.resolve(true);
        } else {
            deferred.reject(err.message);
        }
    });

    return deferred.promise;
}
// tslint:enable: no-unsafe-any non-literal-fs-path no-any
}
src/nni_manager/training_service/pai/paiConfig.ts
View file @
22993e5d
...
...
@@ -19,8 +19,11 @@
'
use strict
'
;
import
{
TrialConfig
}
from
'
../common/trialConfig
'
import
{
TrialConfig
}
from
'
../common/trialConfig
'
;
/**
* Task role for PAI
*/
export
class
PAITaskRole
{
// Name for the task role
public
readonly
name
:
string
;
...
...
@@ -46,7 +49,8 @@ export class PAITaskRole {
* @param gpuNumber GPU number for one task in the task role, no less than 0
* @param command Executable command for tasks in the task role, can not be empty
*/
constructor
(
name
:
string
,
taskNumber
:
number
,
cpuNumber
:
number
,
memoryMB
:
number
,
gpuNumber
:
number
,
command
:
string
,
shmMB
?:
number
)
{
constructor
(
name
:
string
,
taskNumber
:
number
,
cpuNumber
:
number
,
memoryMB
:
number
,
gpuNumber
:
number
,
command
:
string
,
shmMB
?:
number
)
{
this
.
name
=
name
;
this
.
taskNumber
=
taskNumber
;
this
.
cpuNumber
=
cpuNumber
;
...
...
@@ -57,7 +61,10 @@ export class PAITaskRole {
}
}
export
class
PAIJobConfig
{
/**
* Trial job configuration submitted to PAI
*/
export
class
PAIJobConfig
{
// Name for the job, need to be unique
public
readonly
jobName
:
string
;
// URL pointing to the Docker image for all tasks in the job
...
...
@@ -84,7 +91,7 @@ export class PAIJobConfig{
* @param taskRoles List of taskRole, one task role at least
*/
constructor
(
jobName
:
string
,
image
:
string
,
dataDir
:
string
,
outputDir
:
string
,
codeDir
:
string
,
taskRoles
:
PAITaskRole
[],
virtualCluster
:
string
)
{
taskRoles
:
PAITaskRole
[],
virtualCluster
:
string
)
{
this
.
jobName
=
jobName
;
this
.
image
=
image
;
this
.
dataDir
=
dataDir
;
...
...
@@ -95,6 +102,9 @@ export class PAIJobConfig{
}
}
/**
* PAI cluster configuration
*/
export
class
PAIClusterConfig
{
public
readonly
userName
:
string
;
public
readonly
passWord
:
string
;
...
...
@@ -106,14 +116,17 @@ export class PAIClusterConfig {
* @param passWord password of PAI Cluster
* @param host Host IP of PAI Cluster
*/
constructor
(
userName
:
string
,
passWord
:
string
,
host
:
string
){
constructor
(
userName
:
string
,
passWord
:
string
,
host
:
string
)
{
this
.
userName
=
userName
;
this
.
passWord
=
passWord
;
this
.
host
=
host
;
}
}
export
class
NNIPAITrialConfig
extends
TrialConfig
{
/**
* PAI trial configuration
*/
export
class
NNIPAITrialConfig
extends
TrialConfig
{
public
readonly
cpuNum
:
number
;
public
readonly
memoryMB
:
number
;
public
readonly
image
:
string
;
...
...
@@ -126,7 +139,7 @@ export class NNIPAITrialConfig extends TrialConfig{
public
shmMB
?:
number
;
constructor
(
command
:
string
,
codeDir
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
,
dataDir
:
string
,
outputDir
:
string
,
virtualCluster
?:
string
,
shmMB
?:
number
)
{
image
:
string
,
dataDir
:
string
,
outputDir
:
string
,
virtualCluster
?:
string
,
shmMB
?:
number
)
{
super
(
command
,
codeDir
,
gpuNum
);
this
.
cpuNum
=
cpuNum
;
this
.
memoryMB
=
memoryMB
;
...
...
@@ -137,4 +150,3 @@ export class NNIPAITrialConfig extends TrialConfig{
this
.
shmMB
=
shmMB
;
}
}
src/nni_manager/training_service/pai/paiData.ts
View file @
22993e5d
...
...
@@ -19,8 +19,11 @@
'
use strict
'
;
import
{
JobApplicationForm
,
TrialJobDetail
,
TrialJobStatus
}
from
'
common/trainingService
'
;
import
{
JobApplicationForm
,
TrialJobDetail
,
TrialJobStatus
}
from
'
../../
common/trainingService
'
;
/**
* PAI trial job detail
*/
export
class
PAITrialJobDetail
implements
TrialJobDetail
{
public
id
:
string
;
public
status
:
TrialJobStatus
;
...
...
@@ -37,7 +40,7 @@ export class PAITrialJobDetail implements TrialJobDetail {
public
isEarlyStopped
?:
boolean
;
constructor
(
id
:
string
,
status
:
TrialJobStatus
,
paiJobName
:
string
,
submitTime
:
number
,
workingDirectory
:
string
,
form
:
JobApplicationForm
,
sequenceId
:
number
,
hdfsLogPath
:
string
)
{
submitTime
:
number
,
workingDirectory
:
string
,
form
:
JobApplicationForm
,
sequenceId
:
number
,
hdfsLogPath
:
string
)
{
this
.
id
=
id
;
this
.
status
=
status
;
this
.
paiJobName
=
paiJobName
;
...
...
@@ -61,13 +64,15 @@ else
fi`
;
export
const
PAI_TRIAL_COMMAND_FORMAT
:
string
=
`export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4}
&& cd $NNI_SYS_DIR && sh install_nni.sh
&& python3 -m nni_trial_tool.trial_keeper --trial_command '{5}' --nnimanager_ip '{6}' --nnimanager_port '{7}'
--pai_hdfs_output_dir '{8}' --pai_hdfs_host '{9}' --pai_user_name {10} --nni_hdfs_exp_dir '{11}' --webhdfs_path '/webhdfs/api/v1' --nni_manager_version '{12}' --log_collection '{13}'`
;
`export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4} \
&& cd $NNI_SYS_DIR && sh install_nni.sh \
&& python3 -m nni_trial_tool.trial_keeper --trial_command '{5}' --nnimanager_ip '{6}' --nnimanager_port '{7}' \
--pai_hdfs_output_dir '{8}' --pai_hdfs_host '{9}' --pai_user_name {10} --nni_hdfs_exp_dir '{11}' --webhdfs_path '/webhdfs/api/v1' \
--nni_manager_version '{12}' --log_collection '{13}'`
;
export
const
PAI_OUTPUT_DIR_FORMAT
:
string
=
`hdfs://{0}:9000/`
;
// tslint:disable:no-http-string
export
const
PAI_LOG_PATH_FORMAT
:
string
=
`http://{0}/webhdfs/explorer.html#{1}`
`http://{0}/webhdfs/explorer.html#{1}`
;
src/nni_manager/training_service/pai/paiJobInfoCollector.ts
View file @
22993e5d
...
...
@@ -19,13 +19,14 @@
'
use strict
'
;
// tslint:disable-next-line:no-implicit-dependencies
import
*
as
request
from
'
request
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
NNIError
,
NNIErrorNames
}
from
'
../../common/errors
'
;
import
{
PAITrialJobDetail
}
from
'
./paiData
'
;
import
{
PAIClusterConfig
}
from
'
./paiConfig
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
TrialJobStatus
}
from
'
../../common/trainingService
'
;
import
{
PAIClusterConfig
}
from
'
./paiConfig
'
;
import
{
PAITrialJobDetail
}
from
'
./paiData
'
;
/**
* Collects PAI job info from the PAI cluster and updates the PAI job status locally
...
...
@@ -43,59 +44,64 @@ export class PAIJobInfoCollector {
}
public
async
retrieveTrialStatus
(
paiToken
?
:
string
,
paiClusterConfig
?:
PAIClusterConfig
)
:
Promise
<
void
>
{
if
(
!
paiClusterConfig
||
!
paiToken
)
{
if
(
paiClusterConfig
===
undefined
||
paiToken
===
undefined
)
{
return
Promise
.
resolve
();
}
const
updatePaiTrialJobs
:
Promise
<
void
>
[]
=
[];
for
(
le
t
[
trialJobId
,
paiTrialJob
]
of
this
.
trialJobsMap
)
{
if
(
!
paiTrialJob
)
{
for
(
cons
t
[
trialJobId
,
paiTrialJob
]
of
this
.
trialJobsMap
)
{
if
(
paiTrialJob
===
undefined
)
{
throw
new
NNIError
(
NNIErrorNames
.
NOT_FOUND
,
`trial job id
${
trialJobId
}
not found`
);
}
updatePaiTrialJobs
.
push
(
this
.
getSinglePAITrialJobInfo
(
paiTrialJob
,
paiToken
,
paiClusterConfig
))
updatePaiTrialJobs
.
push
(
this
.
getSinglePAITrialJobInfo
(
paiTrialJob
,
paiToken
,
paiClusterConfig
))
;
}
await
Promise
.
all
(
updatePaiTrialJobs
);
}
private
getSinglePAITrialJobInfo
(
paiTrialJob
:
PAITrialJobDetail
,
paiToken
:
string
,
paiClusterConfig
:
PAIClusterConfig
)
:
Promise
<
void
>
{
private
getSinglePAITrialJobInfo
(
paiTrialJob
:
PAITrialJobDetail
,
paiToken
:
string
,
paiClusterConfig
:
PAIClusterConfig
)
:
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
if
(
!
this
.
statusesNeedToCheck
.
includes
(
paiTrialJob
.
status
))
{
deferred
.
resolve
();
return
deferred
.
promise
;
}
// Rest call to get PAI job info and update status
// Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API
const
getJobInfoRequest
:
request
.
Options
=
{
// tslint:disable-next-line:no-http-string
uri
:
`http://
${
paiClusterConfig
.
host
}
/rest-server/api/v1/user/
${
paiClusterConfig
.
userName
}
/jobs/
${
paiTrialJob
.
paiJobName
}
`
,
method
:
'
GET
'
,
json
:
true
,
headers
:
{
"
Content-Type
"
:
"
application/json
"
,
"
Authorization
"
:
'
Bearer
'
+
paiToken
'
Content-Type
'
:
'
application/json
'
,
Authorization
:
`
Bearer
${
paiToken
}
`
}
};
// tslint:disable: no-unsafe-any no-any cyclomatic-complexity
//TODO : pass in request timeout param?
request
(
getJobInfoRequest
,
(
error
:
Error
,
response
:
request
.
Response
,
body
:
any
)
=>
{
if
(
error
||
response
.
statusCode
>=
500
)
{
if
(
(
error
!==
undefined
&&
error
!==
null
)
||
response
.
statusCode
>=
500
)
{
this
.
log
.
error
(
`PAI Training service: get job info for trial
${
paiTrialJob
.
id
}
from PAI Cluster failed!`
);
// Queried PAI job info failed, set job status to UNKNOWN
if
(
paiTrialJob
.
status
===
'
WAITING
'
||
paiTrialJob
.
status
===
'
RUNNING
'
)
{
if
(
paiTrialJob
.
status
===
'
WAITING
'
||
paiTrialJob
.
status
===
'
RUNNING
'
)
{
paiTrialJob
.
status
=
'
UNKNOWN
'
;
}
}
else
{
if
(
response
.
body
.
jobStatus
&&
response
.
body
.
jobStatus
.
state
)
{
switch
(
response
.
body
.
jobStatus
.
state
)
{
if
(
response
.
body
.
jobStatus
&&
response
.
body
.
jobStatus
.
state
)
{
switch
(
response
.
body
.
jobStatus
.
state
)
{
case
'
WAITING
'
:
paiTrialJob
.
status
=
'
WAITING
'
;
break
;
case
'
RUNNING
'
:
paiTrialJob
.
status
=
'
RUNNING
'
;
if
(
!
paiTrialJob
.
startTime
)
{
if
(
paiTrialJob
.
startTime
===
undefined
)
{
paiTrialJob
.
startTime
=
response
.
body
.
jobStatus
.
appLaunchedTime
;
}
if
(
!
paiTrialJob
.
url
)
{
if
(
paiTrialJob
.
url
===
undefined
)
{
paiTrialJob
.
url
=
response
.
body
.
jobStatus
.
appTrackingUrl
;
}
break
;
...
...
@@ -107,7 +113,9 @@ export class PAIJobInfoCollector {
paiTrialJob
.
status
=
paiTrialJob
.
isEarlyStopped
===
true
?
'
EARLY_STOPPED
'
:
'
USER_CANCELED
'
;
}
else
{
// if paiTrialJob's isEarlyStopped is undefined, that mean we didn't stop it via cancellation, mark it as SYS_CANCELLED by PAI
/* if paiTrialJob's isEarlyStopped is undefined, that means we didn't stop it via cancellation,
* so mark it as SYS_CANCELED (canceled by PAI)
*/
paiTrialJob
.
status
=
'
SYS_CANCELED
'
;
}
break
;
...
...
@@ -116,18 +124,17 @@ export class PAIJobInfoCollector {
break
;
default
:
paiTrialJob
.
status
=
'
UNKNOWN
'
;
break
;
}
// For final job statuses, update startTime, endTime and url
if
(
this
.
finalStatuses
.
includes
(
paiTrialJob
.
status
))
{
if
(
!
paiTrialJob
.
startTime
)
{
if
(
this
.
finalStatuses
.
includes
(
paiTrialJob
.
status
))
{
if
(
paiTrialJob
.
startTime
===
undefined
)
{
paiTrialJob
.
startTime
=
response
.
body
.
jobStatus
.
appLaunchedTime
;
}
if
(
!
paiTrialJob
.
endTime
)
{
if
(
paiTrialJob
.
endTime
===
undefined
)
{
paiTrialJob
.
endTime
=
response
.
body
.
jobStatus
.
completedTime
;
}
// Set pai trial job's url to WebHDFS output path
if
(
paiTrialJob
.
hdfsLogPath
)
{
if
(
paiTrialJob
.
hdfsLogPath
!==
undefined
)
{
paiTrialJob
.
url
+=
`,
${
paiTrialJob
.
hdfsLogPath
}
`
;
}
}
...
...
@@ -138,4 +145,5 @@ export class PAIJobInfoCollector {
return
deferred
.
promise
;
}
// tslint:enable: no-unsafe-any no-any
}
src/nni_manager/training_service/pai/paiJobRestServer.ts
View file @
22993e5d
...
...
@@ -19,17 +19,17 @@
'
use strict
'
;
import
*
as
component
from
'
../../common/component
'
;
import
{
Inject
}
from
'
typescript-ioc
'
;
import
*
as
component
from
'
../../common/component
'
;
import
{
ClusterJobRestServer
}
from
'
../common/clusterJobRestServer
'
;
import
{
PAITrainingService
}
from
'
./paiTrainingService
'
;
import
{
ClusterJobRestServer
}
from
'
../common/clusterJobRestServer
'
/**
* PAI Training service Rest server, provides rest API to support pai job metrics update
*
*/
@
component
.
Singleton
export
class
PAIJobRestServer
extends
ClusterJobRestServer
{
export
class
PAIJobRestServer
extends
ClusterJobRestServer
{
@
Inject
private
readonly
paiTrainingService
:
PAITrainingService
;
...
...
@@ -41,6 +41,7 @@ export class PAIJobRestServer extends ClusterJobRestServer{
this
.
paiTrainingService
=
component
.
get
(
PAITrainingService
);
}
// tslint:disable-next-line:no-any
protected
handleTrialMetrics
(
jobId
:
string
,
metrics
:
any
[])
:
void
{
// Split metrics array into single metric, then emit
// Warning: if the metrics are not split into single ones, the behavior will be UNKNOWN
...
...
@@ -51,4 +52,4 @@ export class PAIJobRestServer extends ClusterJobRestServer{
});
}
}
}
\ No newline at end of file
}
src/nni_manager/training_service/pai/paiTrainingService.ts
View file @
22993e5d
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
...
...
@@ -23,6 +22,7 @@
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
path
from
'
path
'
;
// tslint:disable-next-line:no-implicit-dependencies
import
*
as
request
from
'
request
'
;
import
*
as
component
from
'
../../common/component
'
;
...
...
@@ -37,18 +37,17 @@ import {
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobMetric
}
from
'
../../common/trainingService
'
;
import
{
delay
,
generateParamFileName
,
getExperimentRootDir
,
getIPV4Address
,
getVersion
,
uniqueString
}
from
'
../../common/utils
'
;
getExperimentRootDir
,
getIPV4Address
,
getVersion
,
uniqueString
,
unixPathJoin
}
from
'
../../common/utils
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../common/containerJobData
'
;
import
{
TrialConfigMetadataKey
}
from
'
../common/trialConfigMetadataKey
'
;
import
{
validateCodeDir
,
execMkdir
}
from
'
../common/util
'
;
import
{
unixPathJoin
}
from
'
../../common/utils
'
import
{
execMkdir
,
validateCodeDir
}
from
'
../common/util
'
;
import
{
HDFSClientUtility
}
from
'
./hdfsClientUtility
'
;
import
{
NNIPAITrialConfig
,
PAIClusterConfig
,
PAIJobConfig
,
PAITaskRole
}
from
'
./paiConfig
'
;
import
{
PAI_LOG_PATH_FORMAT
,
PAI_OUTPUT_DIR_FORMAT
,
PAI_TRIAL_COMMAND_FORMAT
,
PAITrialJobDetail
}
from
'
./paiData
'
;
import
{
PAIJobInfoCollector
}
from
'
./paiJobInfoCollector
'
;
import
{
PAIJobRestServer
}
from
'
./paiJobRestServer
'
;
const
WebHDFS
=
require
(
'
webhdfs
'
)
;
import
*
as
WebHDFS
from
'
webhdfs
'
;
/**
* Training Service implementation for OpenPAI (Open Platform for AI)
...
...
@@ -62,13 +61,14 @@ class PAITrainingService implements TrainingService {
private
readonly
expRootDir
:
string
;
private
paiTrialConfig
:
NNIPAITrialConfig
|
undefined
;
private
paiClusterConfig
?:
PAIClusterConfig
;
private
jobQueue
:
string
[];
private
readonly
jobQueue
:
string
[];
private
stopping
:
boolean
=
false
;
// tslint:disable-next-line:no-any
private
hdfsClient
:
any
;
private
paiToken
?
:
string
;
private
paiTokenUpdateTime
?:
number
;
private
paiTokenUpdateInterval
:
number
;
private
experimentId
!
:
string
;
private
readonly
paiTokenUpdateInterval
:
number
;
private
readonly
experimentId
!
:
string
;
private
readonly
paiJobCollector
:
PAIJobInfoCollector
;
private
readonly
hdfsDirPattern
:
string
;
private
hdfsBaseDir
:
string
|
undefined
;
...
...
@@ -121,13 +121,13 @@ class PAITrainingService implements TrainingService {
}
public
async
getTrialJob
(
trialJobId
:
string
):
Promise
<
TrialJobDetail
>
{
if
(
!
this
.
paiClusterConfig
)
{
if
(
this
.
paiClusterConfig
===
undefined
)
{
throw
new
Error
(
'
PAI Cluster config is not initialized
'
);
}
const
paiTrialJob
:
PAITrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
if
(
!
paiTrialJob
)
{
if
(
paiTrialJob
===
undefined
)
{
return
Promise
.
reject
(
`trial job
${
trialJobId
}
not found`
);
}
...
...
@@ -144,7 +144,7 @@ class PAITrainingService implements TrainingService {
public
async
submitTrialJob
(
form
:
JobApplicationForm
):
Promise
<
TrialJobDetail
>
{
const
deferred
:
Deferred
<
PAITrialJobDetail
>
=
new
Deferred
<
PAITrialJobDetail
>
();
if
(
!
this
.
hdfsBaseDir
)
{
if
(
this
.
hdfsBaseDir
===
undefined
)
{
throw
new
Error
(
'
hdfsBaseDir is not initialized
'
);
}
...
...
@@ -187,24 +187,26 @@ class PAITrainingService implements TrainingService {
return
false
;
}
// tslint:disable:no-http-string
public
cancelTrialJob
(
trialJobId
:
string
,
isEarlyStopped
:
boolean
=
false
):
Promise
<
void
>
{
const
trialJobDetail
:
PAITrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
if
(
!
trialJobDetail
)
{
if
(
trialJobDetail
===
undefined
)
{
this
.
log
.
error
(
`cancelTrialJob: trial job id
${
trialJobId
}
not found`
);
return
Promise
.
reject
();
}
if
(
!
this
.
paiClusterConfig
)
{
if
(
this
.
paiClusterConfig
===
undefined
)
{
throw
new
Error
(
'
PAI Cluster config is not initialized
'
);
}
if
(
!
this
.
paiToken
)
{
if
(
this
.
paiToken
===
undefined
)
{
throw
new
Error
(
'
PAI token is not initialized
'
);
}
const
stopJobRequest
:
request
.
Options
=
{
uri
:
`http://
${
this
.
paiClusterConfig
.
host
}
/rest-server/api/v1/user/
${
this
.
paiClusterConfig
.
userName
}
/jobs/
${
trialJobDetail
.
paiJobName
}
/executionType`
,
uri
:
`http://
${
this
.
paiClusterConfig
.
host
}
/rest-server/api/v1/user/
${
this
.
paiClusterConfig
.
userName
}
\
/jobs/
${
trialJobDetail
.
paiJobName
}
/executionType`
,
method
:
'
PUT
'
,
json
:
true
,
body
:
{
value
:
'
STOP
'
},
...
...
@@ -217,10 +219,12 @@ class PAITrainingService implements TrainingService {
// Set trialjobDetail's early stopped field, to mark the job's cancellation source
trialJobDetail
.
isEarlyStopped
=
isEarlyStopped
;
// tslint:disable-next-line:no-any
request
(
stopJobRequest
,
(
error
:
Error
,
response
:
request
.
Response
,
body
:
any
)
=>
{
if
(
error
||
response
.
statusCode
>=
400
)
{
if
(
(
error
!==
undefined
&&
error
!==
null
)
||
response
.
statusCode
>=
400
)
{
this
.
log
.
error
(
`PAI Training service: stop trial
${
trialJobId
}
to PAI Cluster failed!`
);
deferred
.
reject
(
error
?
error
.
message
:
`Stop trial failed, http code:
${
response
.
statusCode
}
`
);
deferred
.
reject
((
error
!==
undefined
&&
error
!==
null
)
?
error
.
message
:
`Stop trial failed, http code:
${
response
.
statusCode
}
`
);
}
else
{
deferred
.
resolve
();
}
...
...
@@ -229,6 +233,7 @@ class PAITrainingService implements TrainingService {
return
deferred
.
promise
;
}
// tslint:disable: no-unsafe-any no-any
// tslint:disable-next-line:max-func-body-length
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
...
...
@@ -256,47 +261,47 @@ class PAITrainingService implements TrainingService {
break
;
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
if
(
!
this
.
paiClusterConfig
)
{
if
(
this
.
paiClusterConfig
===
undefined
)
{
this
.
log
.
error
(
'
pai cluster config is not initialized
'
);
deferred
.
reject
(
new
Error
(
'
pai cluster config is not initialized
'
));
break
;
}
this
.
paiTrialConfig
=
<
NNIPAITrialConfig
>
JSON
.
parse
(
value
);
//paiTrialConfig.outputDir could be null if it is not set in nnictl
if
(
this
.
paiTrialConfig
.
outputDir
===
undefined
||
this
.
paiTrialConfig
.
outputDir
===
null
){
if
(
this
.
paiTrialConfig
.
outputDir
===
undefined
||
this
.
paiTrialConfig
.
outputDir
===
null
)
{
this
.
paiTrialConfig
.
outputDir
=
String
.
Format
(
PAI_OUTPUT_DIR_FORMAT
,
this
.
paiClusterConfig
.
host
).
replace
(
/
\r\n
|
\n
|
\r
/gm
,
''
);
)
.
replace
(
/
\r\n
|
\n
|
\r
/gm
,
''
);
}
// Validate to make sure codeDir doesn't have too many files
try
{
await
validateCodeDir
(
this
.
paiTrialConfig
.
codeDir
);
}
catch
(
error
)
{
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
deferred
.
reject
(
new
Error
(
error
));
break
;
}
const
hdfsDirContent
=
this
.
paiTrialConfig
.
outputDir
.
match
(
this
.
hdfsDirPattern
);
const
hdfsDirContent
:
any
=
this
.
paiTrialConfig
.
outputDir
.
match
(
this
.
hdfsDirPattern
);
if
(
hdfsDirContent
===
null
)
{
throw
new
Error
(
'
Trial outputDir format Error
'
);
}
const
groups
=
hdfsDirContent
.
groups
;
const
groups
:
any
=
hdfsDirContent
.
groups
;
if
(
groups
===
undefined
)
{
throw
new
Error
(
'
Trial outputDir format Error
'
);
}
this
.
hdfsOutputHost
=
groups
[
'
host
'
];
this
.
hdfsOutputHost
=
groups
.
host
;
//TODO: choose to use /${username} as baseDir
this
.
hdfsBaseDir
=
groups
[
'
baseDir
'
]
;
if
(
this
.
hdfsBaseDir
===
undefined
)
{
this
.
hdfsBaseDir
=
groups
.
baseDir
;
if
(
this
.
hdfsBaseDir
===
undefined
)
{
this
.
hdfsBaseDir
=
'
/
'
;
}
let
dataOutputHdfsClient
;
let
dataOutputHdfsClient
:
any
;
if
(
this
.
paiClusterConfig
.
host
===
this
.
hdfsOutputHost
&&
this
.
hdfsClient
)
{
dataOutputHdfsClient
=
this
.
hdfsClient
;
}
else
{
...
...
@@ -338,6 +343,7 @@ class PAITrainingService implements TrainingService {
return
deferred
.
promise
;
}
// tslint:enable: no-unsafe-any
public
getClusterMetadata
(
key
:
string
):
Promise
<
string
>
{
const
deferred
:
Deferred
<
string
>
=
new
Deferred
<
string
>
();
...
...
@@ -358,6 +364,7 @@ class PAITrainingService implements TrainingService {
deferred
.
resolve
();
this
.
log
.
info
(
'
PAI Training service rest server stopped successfully.
'
);
}
catch
(
error
)
{
// tslint:disable-next-line: no-unsafe-any
this
.
log
.
error
(
`PAI Training service rest server stopped failed, error:
${
error
.
message
}
`
);
deferred
.
reject
(
error
);
}
...
...
@@ -374,35 +381,35 @@ class PAITrainingService implements TrainingService {
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
const
trialJobDetail
:
PAITrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
if
(
!
trialJobDetail
)
{
if
(
trialJobDetail
===
undefined
)
{
throw
new
Error
(
`Failed to find PAITrialJobDetail for job
${
trialJobId
}
`
);
}
if
(
!
this
.
paiClusterConfig
)
{
if
(
this
.
paiClusterConfig
===
undefined
)
{
throw
new
Error
(
'
PAI Cluster config is not initialized
'
);
}
if
(
!
this
.
paiTrialConfig
)
{
if
(
this
.
paiTrialConfig
===
undefined
)
{
throw
new
Error
(
'
trial config is not initialized
'
);
}
if
(
!
this
.
paiToken
)
{
if
(
this
.
paiToken
===
undefined
)
{
throw
new
Error
(
'
PAI token is not initialized
'
);
}
if
(
!
this
.
hdfsBaseDir
)
{
if
(
this
.
hdfsBaseDir
===
undefined
)
{
throw
new
Error
(
'
hdfsBaseDir is not initialized
'
);
}
if
(
!
this
.
hdfsOutputHost
)
{
if
(
this
.
hdfsOutputHost
===
undefined
)
{
throw
new
Error
(
'
hdfsOutputHost is not initialized
'
);
}
if
(
!
this
.
paiRestServerPort
)
{
if
(
this
.
paiRestServerPort
===
undefined
)
{
const
restServer
:
PAIJobRestServer
=
component
.
get
(
PAIJobRestServer
);
this
.
paiRestServerPort
=
restServer
.
clusterRestServerPort
;
}
// Make sure experiment code files is copied from local to HDFS
if
(
this
.
copyExpCodeDirPromise
)
{
if
(
this
.
copyExpCodeDirPromise
!==
undefined
)
{
await
this
.
copyExpCodeDirPromise
;
}
...
...
@@ -420,13 +427,14 @@ class PAITrainingService implements TrainingService {
// Write file content ( parameter.cfg ) to local tmp folders
const
trialForm
:
TrialJobApplicationForm
=
(
<
TrialJobApplicationForm
>
trialJobDetail
.
form
);
if
(
trialForm
)
{
if
(
trialForm
!==
undefined
)
{
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
generateParamFileName
(
trialForm
.
hyperParameters
)),
trialForm
.
hyperParameters
.
value
,
{
encoding
:
'
utf8
'
}
);
}
// tslint:disable-next-line: strict-boolean-expressions
const
nniManagerIp
:
string
=
this
.
nniManagerIpConfig
?
this
.
nniManagerIpConfig
.
nniManagerIp
:
getIPV4Address
();
const
version
:
string
=
this
.
versionCheck
?
await
getVersion
()
:
''
;
const
nniPaiTrialCommand
:
string
=
String
.
Format
(
...
...
@@ -446,8 +454,10 @@ class PAITrainingService implements TrainingService {
HDFSClientUtility
.
getHdfsExpCodeDir
(
this
.
paiClusterConfig
.
userName
),
version
,
this
.
logCollection
).
replace
(
/
\r\n
|
\n
|
\r
/gm
,
''
);
)
.
replace
(
/
\r\n
|
\n
|
\r
/gm
,
''
);
// tslint:disable-next-line:no-console
console
.
log
(
`nniPAItrial command is
${
nniPaiTrialCommand
.
trim
()}
`
);
const
paiTaskRoles
:
PAITaskRole
[]
=
[
new
PAITaskRole
(
...
...
@@ -507,9 +517,10 @@ class PAITrainingService implements TrainingService {
Authorization
:
`Bearer
${
this
.
paiToken
}
`
}
};
// tslint:disable:no-any no-unsafe-any
request
(
submitJobRequest
,
(
error
:
Error
,
response
:
request
.
Response
,
body
:
any
)
=>
{
if
(
error
||
response
.
statusCode
>=
400
)
{
const
errorMessage
:
string
=
error
?
error
.
message
:
if
(
(
error
!==
undefined
&&
error
!==
null
)
||
response
.
statusCode
>=
400
)
{
const
errorMessage
:
string
=
(
error
!==
undefined
&&
error
!==
null
)
?
error
.
message
:
`Submit trial
${
trialJobId
}
failed, http code:
${
response
.
statusCode
}
, http body:
${
response
.
body
}
`
;
this
.
log
.
error
(
errorMessage
);
trialJobDetail
.
status
=
'
FAILED
'
;
...
...
@@ -533,18 +544,18 @@ class PAITrainingService implements TrainingService {
private
async
statusCheckingLoop
():
Promise
<
void
>
{
while
(
!
this
.
stopping
)
{
try
{
try
{
await
this
.
updatePaiToken
();
}
catch
(
error
){
}
catch
(
error
)
{
this
.
log
.
error
(
`
${
error
}
`
);
// only throw an error when initializing paiToken for the first time
if
(
!
this
.
paiToken
)
{
if
(
this
.
paiToken
===
undefined
)
{
throw
new
Error
(
error
);
}
}
await
this
.
paiJobCollector
.
retrieveTrialStatus
(
this
.
paiToken
,
this
.
paiClusterConfig
);
const
restServer
:
PAIJobRestServer
=
component
.
get
(
PAIJobRestServer
);
if
(
restServer
.
getErrorMessage
)
{
if
(
restServer
.
getErrorMessage
!==
undefined
)
{
throw
new
Error
(
restServer
.
getErrorMessage
);
}
await
delay
(
3000
);
...
...
@@ -575,17 +586,17 @@ class PAITrainingService implements TrainingService {
const
currentTime
:
number
=
new
Date
().
getTime
();
// If the PAI token is initialized and the update interval has not elapsed yet, do not update
if
(
this
.
paiTokenUpdateTime
&&
(
currentTime
-
this
.
paiTokenUpdateTime
)
<
this
.
paiTokenUpdateInterval
){
if
(
this
.
paiTokenUpdateTime
!==
undefined
&&
(
currentTime
-
this
.
paiTokenUpdateTime
)
<
this
.
paiTokenUpdateInterval
)
{
return
Promise
.
resolve
();
}
if
(
!
this
.
paiClusterConfig
)
{
if
(
this
.
paiClusterConfig
===
undefined
)
{
const
paiClusterConfigError
:
string
=
`pai cluster config not initialized!`
;
this
.
log
.
error
(
`
${
paiClusterConfigError
}
`
);
throw
Error
(
`
${
paiClusterConfigError
}
`
);
}
const
authentication
_r
eq
:
request
.
Options
=
{
const
authentication
R
eq
:
request
.
Options
=
{
uri
:
`http://
${
this
.
paiClusterConfig
.
host
}
/rest-server/api/v1/token`
,
method
:
'
POST
'
,
json
:
true
,
...
...
@@ -595,12 +606,12 @@ class PAITrainingService implements TrainingService {
}
};
request
(
authentication
_r
eq
,
(
error
:
Error
,
response
:
request
.
Response
,
body
:
any
)
=>
{
if
(
error
)
{
request
(
authentication
R
eq
,
(
error
:
Error
,
response
:
request
.
Response
,
body
:
any
)
=>
{
if
(
error
!==
undefined
&&
error
!==
null
)
{
this
.
log
.
error
(
`Get PAI token failed:
${
error
.
message
}
`
);
deferred
.
reject
(
new
Error
(
`Get PAI token failed:
${
error
.
message
}
`
));
}
else
{
if
(
response
.
statusCode
!==
200
){
if
(
response
.
statusCode
!==
200
)
{
this
.
log
.
error
(
`Get PAI token failed: get PAI Rest return code
${
response
.
statusCode
}
`
);
deferred
.
reject
(
new
Error
(
`Get PAI token failed:
${
response
.
body
}
, please check paiConfig username or password`
));
}
...
...
@@ -619,8 +630,9 @@ class PAITrainingService implements TrainingService {
});
return
Promise
.
race
([
timeoutDelay
,
deferred
.
promise
])
.
finally
(()
=>
clearTimeout
(
timeoutId
));
.
finally
(()
=>
{
clearTimeout
(
timeoutId
)
;
}
);
}
// tslint:enable:no-any no-unsafe-any no-http-string
}
export
{
PAITrainingService
};
src/nni_manager/training_service/pai/paiTrialConfig.ts
View file @
22993e5d
...
...
@@ -19,16 +19,20 @@
'
use strict
'
;
import
{
TrialConfig
}
from
'
../common/trialConfig
'
import
{
TrialConfig
}
from
'
../common/trialConfig
'
;
export
class
PAITrialConfig
extends
TrialConfig
{
/**
* PAI configuration to run trials
*/
export
class
PAITrialConfig
extends
TrialConfig
{
public
readonly
cpuNum
:
number
;
public
readonly
memoryMB
:
number
;
public
readonly
image
:
string
;
public
readonly
dataDir
:
string
;
public
readonly
outputDir
:
string
;
constructor
(
command
:
string
,
codeDir
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
,
dataDir
:
string
,
outputDir
:
string
)
{
constructor
(
command
:
string
,
codeDir
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
,
dataDir
:
string
,
outputDir
:
string
)
{
super
(
command
,
codeDir
,
gpuNum
);
this
.
cpuNum
=
cpuNum
;
this
.
memoryMB
=
memoryMB
;
...
...
@@ -36,4 +40,4 @@ export class PAITrialConfig extends TrialConfig{
this
.
dataDir
=
dataDir
;
this
.
outputDir
=
outputDir
;
}
}
\ No newline at end of file
}
src/nni_manager/training_service/remote_machine/gpuScheduler.ts
View file @
22993e5d
...
...
@@ -21,10 +21,12 @@
import
*
as
assert
from
'
assert
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
TrialJobDetail
}
from
'
../../common/trainingService
'
;
import
{
randomSelect
}
from
'
../../common/utils
'
;
import
{
GPUInfo
}
from
'
../common/gpuData
'
;
import
{
RemoteMachineTrialJobDetail
,
parseGpuIndices
,
RemoteMachineMeta
,
RemoteMachineScheduleResult
,
ScheduleResultType
,
SSHClientManager
}
from
'
./remoteMachineData
'
;
import
{
TrialJobDetail
}
from
'
common/trainingService
'
;
import
{
parseGpuIndices
,
RemoteMachineMeta
,
RemoteMachineScheduleResult
,
RemoteMachineTrialJobDetail
,
ScheduleResultType
,
SSHClientManager
}
from
'
./remoteMachineData
'
;
/**
* A simple GPU scheduler implementation
...
...
@@ -32,7 +34,7 @@ import { TrialJobDetail } from 'common/trainingService';
export
class
GPUScheduler
{
private
readonly
machineSSHClientMap
:
Map
<
RemoteMachineMeta
,
SSHClientManager
>
;
private
log
:
Logger
=
getLogger
();
private
readonly
log
:
Logger
=
getLogger
();
/**
* Constructor
...
...
@@ -89,21 +91,21 @@ export class GPUScheduler {
* remove the job's gpu reservation
*/
public
removeGpuReservation
(
trialJobId
:
string
,
trialJobMap
:
Map
<
string
,
RemoteMachineTrialJobDetail
>
):
void
{
le
t
trialJobDetail
:
RemoteMachineTrialJobDetail
|
undefined
=
trialJobMap
.
get
(
trialJobId
);
if
(
trialJobDetail
===
undefined
)
{
cons
t
trialJobDetail
:
RemoteMachineTrialJobDetail
|
undefined
=
trialJobMap
.
get
(
trialJobId
);
if
(
trialJobDetail
===
undefined
)
{
throw
new
Error
(
`could not get trialJobDetail by id
${
trialJobId
}
`
);
}
if
(
trialJobDetail
.
rmMeta
!==
undefined
&&
trialJobDetail
.
rmMeta
.
occupiedGpuIndexMap
!==
undefined
&&
trialJobDetail
.
gpuIndices
!==
undefined
&&
}
if
(
trialJobDetail
.
rmMeta
!==
undefined
&&
trialJobDetail
.
rmMeta
.
occupiedGpuIndexMap
!==
undefined
&&
trialJobDetail
.
gpuIndices
!==
undefined
&&
trialJobDetail
.
gpuIndices
.
length
>
0
)
{
for
(
const
gpuInfo
of
trialJobDetail
.
gpuIndices
)
{
le
t
num
:
number
|
undefined
=
trialJobDetail
.
rmMeta
.
occupiedGpuIndexMap
.
get
(
gpuInfo
.
index
);
if
(
num
!==
undefined
)
{
if
(
num
===
1
)
{
cons
t
num
:
number
|
undefined
=
trialJobDetail
.
rmMeta
.
occupiedGpuIndexMap
.
get
(
gpuInfo
.
index
);
if
(
num
!==
undefined
)
{
if
(
num
===
1
)
{
trialJobDetail
.
rmMeta
.
occupiedGpuIndexMap
.
delete
(
gpuInfo
.
index
);
}
else
{
trialJobDetail
.
rmMeta
.
occupiedGpuIndexMap
.
set
(
gpuInfo
.
index
,
num
-
1
)
trialJobDetail
.
rmMeta
.
occupiedGpuIndexMap
.
set
(
gpuInfo
.
index
,
num
-
1
)
;
}
}
}
...
...
@@ -116,7 +118,6 @@ export class GPUScheduler {
const
totalResourceMap
:
Map
<
RemoteMachineMeta
,
GPUInfo
[]
>
=
this
.
gpuResourceDetection
();
const
qualifiedRMs
:
RemoteMachineMeta
[]
=
[];
totalResourceMap
.
forEach
((
gpuInfos
:
GPUInfo
[],
rmMeta
:
RemoteMachineMeta
)
=>
{
if
(
gpuInfos
!==
undefined
&&
gpuInfos
.
length
>=
requiredGPUNum
)
{
qualifiedRMs
.
push
(
rmMeta
);
}
...
...
@@ -154,6 +155,7 @@ export class GPUScheduler {
}
}
this
.
log
.
debug
(
`designated gpu indices:
${
designatedGpuIndices
}
`
);
// tslint:disable: strict-boolean-expressions
rmMeta
.
gpuSummary
.
gpuInfos
.
forEach
((
gpuInfo
:
GPUInfo
)
=>
{
// if the GPU has active process, OR be reserved by a job,
// or index not in gpuIndices configuration in machineList,
...
...
@@ -161,10 +163,10 @@ export class GPUScheduler {
// We should NOT allocate this GPU
// if users set useActiveGpu, use the gpu whether there is another activeProcess
if
(
designatedGpuIndices
===
undefined
||
designatedGpuIndices
.
has
(
gpuInfo
.
index
))
{
if
(
rmMeta
.
occupiedGpuIndexMap
!==
undefined
)
{
le
t
num
=
rmMeta
.
occupiedGpuIndexMap
.
get
(
gpuInfo
.
index
);
le
t
maxTrialNumPerGpu
:
number
=
rmMeta
.
maxTrialNumPerGpu
?
rmMeta
.
maxTrialNumPerGpu
:
1
;
if
((
num
===
undefined
&&
(
!
rmMeta
.
useActiveGpu
&&
gpuInfo
.
activeProcessNum
===
0
||
rmMeta
.
useActiveGpu
))
||
if
(
rmMeta
.
occupiedGpuIndexMap
!==
undefined
)
{
cons
t
num
:
number
|
undefined
=
rmMeta
.
occupiedGpuIndexMap
.
get
(
gpuInfo
.
index
);
cons
t
maxTrialNumPerGpu
:
number
=
rmMeta
.
maxTrialNumPerGpu
?
rmMeta
.
maxTrialNumPerGpu
:
1
;
if
((
num
===
undefined
&&
(
!
rmMeta
.
useActiveGpu
&&
gpuInfo
.
activeProcessNum
===
0
||
rmMeta
.
useActiveGpu
))
||
(
num
!==
undefined
&&
num
<
maxTrialNumPerGpu
))
{
availableGPUs
.
push
(
gpuInfo
);
}
...
...
@@ -179,6 +181,7 @@ export class GPUScheduler {
return
totalResourceMap
;
}
// tslint:enable: strict-boolean-expressions
private
selectMachine
(
rmMetas
:
RemoteMachineMeta
[]):
RemoteMachineMeta
{
assert
(
rmMetas
!==
undefined
&&
rmMetas
.
length
>
0
);
...
...
@@ -196,23 +199,28 @@ export class GPUScheduler {
assert
(
gpuInfos
.
length
>=
requiredGPUNum
);
const
allocatedGPUs
:
GPUInfo
[]
=
this
.
selectGPUsForTrial
(
gpuInfos
,
requiredGPUNum
);
allocatedGPUs
.
forEach
((
gpuInfo
:
GPUInfo
)
=>
{
if
(
rmMeta
.
occupiedGpuIndexMap
!==
undefined
)
{
let
num
=
rmMeta
.
occupiedGpuIndexMap
.
get
(
gpuInfo
.
index
);
if
(
num
===
undefined
)
{
if
(
rmMeta
.
occupiedGpuIndexMap
!==
undefined
)
{
let
num
:
number
|
undefined
=
rmMeta
.
occupiedGpuIndexMap
.
get
(
gpuInfo
.
index
);
if
(
num
===
undefined
)
{
num
=
0
;
}
rmMeta
.
occupiedGpuIndexMap
.
set
(
gpuInfo
.
index
,
num
+
1
);
}
else
{
}
else
{
throw
new
Error
(
`Machine
${
rmMeta
.
ip
}
occupiedGpuIndexMap initialize error!`
);
}
});
trialJobDetail
.
gpuIndices
=
allocatedGPUs
;
trialJobDetail
.
rmMeta
=
rmMeta
;
return
{
resultType
:
ScheduleResultType
.
SUCCEED
,
scheduleInfo
:
{
rmMeta
:
rmMeta
,
cuda_visible_device
:
allocatedGPUs
.
map
((
gpuInfo
:
GPUInfo
)
=>
{
return
gpuInfo
.
index
;
}).
join
(
'
,
'
)
cuda_visible_device
:
allocatedGPUs
.
map
((
gpuInfo
:
GPUInfo
)
=>
{
return
gpuInfo
.
index
;
})
.
join
(
'
,
'
)
}
};
}
...
...
src/nni_manager/training_service/remote_machine/remoteMachineData.ts
View file @
22993e5d
...
...
@@ -23,7 +23,7 @@ import * as fs from 'fs';
import
{
Client
,
ConnectConfig
}
from
'
ssh2
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
JobApplicationForm
,
TrialJobDetail
,
TrialJobStatus
}
from
'
../../common/trainingService
'
;
import
{
GPU
Summary
,
GPUInfo
}
from
'
../common/gpuData
'
;
import
{
GPU
Info
,
GPUSummary
}
from
'
../common/gpuData
'
;
/**
* Metadata of remote machine for configuration and status query
...
...
@@ -73,7 +73,6 @@ export class RemoteCommandResult {
/**
* RemoteMachineTrialJobDetail
*/
// tslint:disable-next-line:max-classes-per-file
export
class
RemoteMachineTrialJobDetail
implements
TrialJobDetail
{
public
id
:
string
;
public
status
:
TrialJobStatus
;
...
...
@@ -98,7 +97,7 @@ export class RemoteMachineTrialJobDetail implements TrialJobDetail {
this
.
form
=
form
;
this
.
sequenceId
=
sequenceId
;
this
.
tags
=
[];
this
.
gpuIndices
=
[]
this
.
gpuIndices
=
[]
;
}
}
...
...
@@ -121,17 +120,20 @@ export class SSHClient {
return
this
.
usedConnectionNumber
;
}
public
addUsedConnectionNumber
()
{
public
addUsedConnectionNumber
()
:
void
{
this
.
usedConnectionNumber
+=
1
;
}
public
minusUsedConnectionNumber
()
{
public
minusUsedConnectionNumber
()
:
void
{
this
.
usedConnectionNumber
-=
1
;
}
}
/**
* The remote machine ssh client manager
*/
export
class
SSHClientManager
{
private
sshClientArray
:
SSHClient
[];
private
readonly
sshClientArray
:
SSHClient
[];
private
readonly
maxTrialNumberPerConnection
:
number
;
private
readonly
rmMeta
:
RemoteMachineMeta
;
constructor
(
sshClientArray
:
SSHClient
[],
maxTrialNumberPerConnection
:
number
,
rmMeta
:
RemoteMachineMeta
)
{
...
...
@@ -140,122 +142,128 @@ export class SSHClientManager {
this
.
maxTrialNumberPerConnection
=
maxTrialNumberPerConnection
;
}
/**
* Create a new ssh connection client and initialize it
*/
private
initNewSSHClient
():
Promise
<
Client
>
{
const
deferred
:
Deferred
<
Client
>
=
new
Deferred
<
Client
>
();
const
conn
:
Client
=
new
Client
();
let
connectConfig
:
ConnectConfig
=
{
host
:
this
.
rmMeta
.
ip
,
port
:
this
.
rmMeta
.
port
,
username
:
this
.
rmMeta
.
username
};
if
(
this
.
rmMeta
.
passwd
)
{
connectConfig
.
password
=
this
.
rmMeta
.
passwd
;
}
else
if
(
this
.
rmMeta
.
sshKeyPath
)
{
if
(
!
fs
.
existsSync
(
this
.
rmMeta
.
sshKeyPath
))
{
//SSh key path is not a valid file, reject
deferred
.
reject
(
new
Error
(
`
${
this
.
rmMeta
.
sshKeyPath
}
does not exist.`
));
}
const
privateKey
:
string
=
fs
.
readFileSync
(
this
.
rmMeta
.
sshKeyPath
,
'
utf8
'
);
connectConfig
.
privateKey
=
privateKey
;
connectConfig
.
passphrase
=
this
.
rmMeta
.
passphrase
;
}
else
{
deferred
.
reject
(
new
Error
(
`No valid passwd or sshKeyPath is configed.`
));
}
conn
.
on
(
'
ready
'
,
()
=>
{
this
.
addNewSSHClient
(
conn
);
deferred
.
resolve
(
conn
);
}).
on
(
'
error
'
,
(
err
:
Error
)
=>
{
// SSH connection error, reject with error message
deferred
.
reject
(
new
Error
(
err
.
message
));
}).
connect
(
connectConfig
);
return
deferred
.
promise
;
}
/**
* find an available ssh client in the ssh client array; if none is available, initialize a new one
*/
public
async
getAvailableSSHClient
():
Promise
<
Client
>
{
const
deferred
:
Deferred
<
Client
>
=
new
Deferred
<
Client
>
();
for
(
const
index
in
this
.
sshClientArray
)
{
le
t
connectionNumber
:
number
=
this
.
sshClientArray
[
index
].
getUsedConnectionNumber
;
if
(
connectionNumber
<
this
.
maxTrialNumberPerConnection
)
{
for
(
const
index
of
this
.
sshClientArray
.
keys
()
)
{
cons
t
connectionNumber
:
number
=
this
.
sshClientArray
[
index
].
getUsedConnectionNumber
;
if
(
connectionNumber
<
this
.
maxTrialNumberPerConnection
)
{
this
.
sshClientArray
[
index
].
addUsedConnectionNumber
();
deferred
.
resolve
(
this
.
sshClientArray
[
index
].
getSSHClientInstance
);
return
deferred
.
promise
;
}
};
}
//init a new ssh client if could not get an available one
return
await
this
.
initNewSSHClient
();
return
this
.
initNewSSHClient
();
}
/**
* add a new ssh client to sshClientArray
* @param sshClient
* @param sshClient
SSH Client
*/
public
addNewSSHClient
(
client
:
Client
)
{
public
addNewSSHClient
(
client
:
Client
)
:
void
{
this
.
sshClientArray
.
push
(
new
SSHClient
(
client
,
1
));
}
/**
* first ssh cli
l
ent instance is used for gpu collector and host job
* first ssh client instance is used for gpu collector and host job
*/
public
getFirstSSHClient
()
{
public
getFirstSSHClient
()
:
Client
{
return
this
.
sshClientArray
[
0
].
getSSHClientInstance
;
}
/**
* close all of ssh client
*/
public
closeAllSSHClient
()
{
for
(
le
t
sshClient
of
this
.
sshClientArray
)
{
public
closeAllSSHClient
()
:
void
{
for
(
cons
t
sshClient
of
this
.
sshClientArray
)
{
sshClient
.
getSSHClientInstance
.
end
();
}
}
/**
* retrieve resource, minus a number for given ssh client
* @param client
* @param client
SSH Client
*/
public
releaseConnection
(
client
:
Client
|
undefined
)
{
if
(
!
client
)
{
public
releaseConnection
(
client
:
Client
|
undefined
)
:
void
{
if
(
client
===
undefined
)
{
throw
new
Error
(
`could not release a undefined ssh client`
);
}
for
(
le
t
index
in
this
.
sshClientArray
)
{
if
(
this
.
sshClientArray
[
index
].
getSSHClientInstance
===
client
)
{
for
(
cons
t
index
of
this
.
sshClientArray
.
keys
()
)
{
if
(
this
.
sshClientArray
[
index
].
getSSHClientInstance
===
client
)
{
this
.
sshClientArray
[
index
].
minusUsedConnectionNumber
();
break
;
}
}
}
}
/**
* Create a new ssh connection client and initialize it
*/
// tslint:disable:non-literal-fs-path
private
initNewSSHClient
():
Promise
<
Client
>
{
const
deferred
:
Deferred
<
Client
>
=
new
Deferred
<
Client
>
();
const
conn
:
Client
=
new
Client
();
const
connectConfig
:
ConnectConfig
=
{
host
:
this
.
rmMeta
.
ip
,
port
:
this
.
rmMeta
.
port
,
username
:
this
.
rmMeta
.
username
};
if
(
this
.
rmMeta
.
passwd
!==
undefined
)
{
connectConfig
.
password
=
this
.
rmMeta
.
passwd
;
}
else
if
(
this
.
rmMeta
.
sshKeyPath
!==
undefined
)
{
if
(
!
fs
.
existsSync
(
this
.
rmMeta
.
sshKeyPath
))
{
//SSh key path is not a valid file, reject
deferred
.
reject
(
new
Error
(
`
${
this
.
rmMeta
.
sshKeyPath
}
does not exist.`
));
}
const
privateKey
:
string
=
fs
.
readFileSync
(
this
.
rmMeta
.
sshKeyPath
,
'
utf8
'
);
connectConfig
.
privateKey
=
privateKey
;
connectConfig
.
passphrase
=
this
.
rmMeta
.
passphrase
;
}
else
{
deferred
.
reject
(
new
Error
(
`No valid passwd or sshKeyPath is configed.`
));
}
conn
.
on
(
'
ready
'
,
()
=>
{
this
.
addNewSSHClient
(
conn
);
deferred
.
resolve
(
conn
);
})
.
on
(
'
error
'
,
(
err
:
Error
)
=>
{
// SSH connection error, reject with error message
deferred
.
reject
(
new
Error
(
err
.
message
));
})
.
connect
(
connectConfig
);
return
deferred
.
promise
;
}
}
export
type
RemoteMachineScheduleResult
=
{
scheduleInfo
:
RemoteMachineScheduleInfo
|
undefined
;
resultType
:
ScheduleResultType
};
export
type
RemoteMachineScheduleInfo
=
{
rmMeta
:
RemoteMachineMeta
;
cuda_visible_device
:
string
};
export
enum
ScheduleResultType
{
/
*
Schedule succeeded
*/
/
/
Schedule succeeded
SUCCEED
,
/
*
Temporarily, no enough available GPU right now
*/
/
/
Temporarily, no enough available GPU right now
TMP_NO_AVAILABLE_GPU
,
/
*
Cannot match requirement even if all GPU are a
*/
/
/
Cannot match requirement even if all GPU are a
REQUIRE_EXCEED_TOTAL
}
export
const
REMOTEMACHINE_TRIAL_COMMAND_FORMAT
:
string
=
`#!/bin/bash
export NNI_PLATFORM=remote NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4} export MULTI_PHASE={5}
export NNI_PLATFORM=remote NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} \
NNI_TRIAL_SEQ_ID={4} export MULTI_PHASE={5}
cd $NNI_SYS_DIR
sh install_nni.sh
echo $$ >{6}
python3 -m nni_trial_tool.trial_keeper --trial_command '{7}' --nnimanager_ip '{8}' --nnimanager_port '{9}' --nni_manager_version '{10}' --log_collection '{11}' 1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr
python3 -m nni_trial_tool.trial_keeper --trial_command '{7}' --nnimanager_ip '{8}' --nnimanager_port '{9}' \
--nni_manager_version '{10}' --log_collection '{11}' 1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr
echo $?
\`
date +%s%3N
\`
>{12}`
;
export
const
HOST_JOB_SHELL_FORMAT
:
string
=
...
...
src/nni_manager/training_service/remote_machine/remoteMachineJobRestServer.ts
View file @
22993e5d
...
...
@@ -19,17 +19,17 @@
'
use strict
'
;
import
*
as
component
from
'
../../common/component
'
;
import
{
Inject
}
from
'
typescript-ioc
'
;
import
*
as
component
from
'
../../common/component
'
;
import
{
ClusterJobRestServer
}
from
'
../common/clusterJobRestServer
'
;
import
{
RemoteMachineTrainingService
}
from
'
./remoteMachineTrainingService
'
;
import
{
ClusterJobRestServer
}
from
'
../common/clusterJobRestServer
'
/**
* RemoteMachine Training service Rest server, provides rest RemoteMachine to support remotemachine job metrics update
*
*/
@
component
.
Singleton
export
class
RemoteMachineJobRestServer
extends
ClusterJobRestServer
{
export
class
RemoteMachineJobRestServer
extends
ClusterJobRestServer
{
@
Inject
private
readonly
remoteMachineTrainingService
:
RemoteMachineTrainingService
;
...
...
@@ -41,6 +41,7 @@ export class RemoteMachineJobRestServer extends ClusterJobRestServer{
this
.
remoteMachineTrainingService
=
component
.
get
(
RemoteMachineTrainingService
);
}
// tslint:disable-next-line:no-any
protected
handleTrialMetrics
(
jobId
:
string
,
metrics
:
any
[])
:
void
{
// Split metrics array into single metric, then emit
// Warning: If not split metrics into single ones, the behavior will be UNKNOWNls
...
...
@@ -51,4 +52,4 @@ export class RemoteMachineJobRestServer extends ClusterJobRestServer{
});
}
}
}
\ No newline at end of file
}
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
View file @
22993e5d
...
...
@@ -34,42 +34,45 @@ import { getExperimentId, getInitTrialSequenceId } from '../../common/experiment
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
ObservableTimer
}
from
'
../../common/observableTimer
'
;
import
{
HostJobApplicationForm
,
HyperParameters
,
JobApplicationForm
,
TrainingService
,
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobMetric
,
NNIManagerIpConfig
HostJobApplicationForm
,
HyperParameters
,
JobApplicationForm
,
NNIManagerIpConfig
,
TrainingService
,
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobMetric
}
from
'
../../common/trainingService
'
;
import
{
delay
,
generateParamFileName
,
getExperimentRootDir
,
uniqueString
,
getJobCancelStatus
,
getRemoteTmpDir
,
getIPV4Address
,
getVersion
,
unixPathJoin
}
from
'
../../common/utils
'
;
import
{
GPUSummary
}
from
'
../common/gpuData
'
;
import
{
delay
,
generateParamFileName
,
getExperimentRootDir
,
getIPV4Address
,
getJobCancelStatus
,
getRemoteTmpDir
,
getVersion
,
uniqueString
,
unixPathJoin
}
from
'
../../common/utils
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../common/containerJobData
'
;
import
{
GPU_INFO_COLLECTOR_FORMAT_LINUX
,
GPUSummary
}
from
'
../common/gpuData
'
;
import
{
TrialConfig
}
from
'
../common/trialConfig
'
;
import
{
TrialConfigMetadataKey
}
from
'
../common/trialConfigMetadataKey
'
;
import
{
execCopydir
,
execMkdir
,
execRemove
,
validateCodeDir
}
from
'
../common/util
'
;
import
{
GPUScheduler
}
from
'
./gpuScheduler
'
;
import
{
HOST_JOB_SHELL_FORMAT
,
RemoteCommandResult
,
RemoteMachineMeta
,
RemoteMachineScheduleInfo
,
RemoteMachineScheduleResult
,
SSHClient
,
SSHClientManager
,
RemoteMachineTrialJobDetail
,
ScheduleResultType
,
REMOTEMACHINE_TRIAL_COMMAND_FORMAT
HOST_JOB_SHELL_FORMAT
,
RemoteCommandResult
,
REMOTEMACHINE_TRIAL_COMMAND_FORMAT
,
RemoteMachineMeta
,
RemoteMachineScheduleInfo
,
RemoteMachineScheduleResult
,
RemoteMachineTrialJobDetail
,
ScheduleResultType
,
SSHClient
,
SSHClientManager
}
from
'
./remoteMachineData
'
;
import
{
GPU_INFO_COLLECTOR_FORMAT_LINUX
}
from
'
../common/gpuData
'
;
import
{
SSHClientUtility
}
from
'
./sshClientUtility
'
;
import
{
validateCodeDir
,
execRemove
,
execMkdir
,
execCopydir
}
from
'
../common/util
'
;
import
{
RemoteMachineJobRestServer
}
from
'
./remoteMachineJobRestServer
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../common/containerJobData
'
;
import
{
SSHClientUtility
}
from
'
./sshClientUtility
'
;
/**
* Training Service implementation for Remote Machine (Linux)
*/
@
component
.
Singleton
class
RemoteMachineTrainingService
implements
TrainingService
{
private
machineSSHClientMap
:
Map
<
RemoteMachineMeta
,
SSHClientManager
>
;
//machine ssh client map
private
trialSSHClientMap
:
Map
<
string
,
Client
>
;
//trial ssh client map
private
trialJobsMap
:
Map
<
string
,
RemoteMachineTrialJobDetail
>
;
private
readonly
MAX_TRIAL_NUMBER_PER_SSHCONNECTION
:
number
=
5
// every ssh client has a max trial concurrency number
private
expRootDir
:
string
;
private
remoteExpRootDir
:
string
;
private
readonly
machineSSHClientMap
:
Map
<
RemoteMachineMeta
,
SSHClientManager
>
;
//machine ssh client map
private
readonly
trialSSHClientMap
:
Map
<
string
,
Client
>
;
//trial ssh client map
private
readonly
trialJobsMap
:
Map
<
string
,
RemoteMachineTrialJobDetail
>
;
private
readonly
MAX_TRIAL_NUMBER_PER_SSHCONNECTION
:
number
=
5
;
// every ssh client has a max trial concurrency number
private
readonly
expRootDir
:
string
;
private
readonly
remoteExpRootDir
:
string
;
private
trialConfig
:
TrialConfig
|
undefined
;
private
gpuScheduler
:
GPUScheduler
;
private
jobQueue
:
string
[];
private
timer
:
ObservableTimer
;
private
readonly
gpuScheduler
:
GPUScheduler
;
private
readonly
jobQueue
:
string
[];
private
readonly
timer
:
ObservableTimer
;
private
stopping
:
boolean
=
false
;
private
metricsEmitter
:
EventEmitter
;
private
log
:
Logger
;
private
readonly
metricsEmitter
:
EventEmitter
;
private
readonly
log
:
Logger
;
private
isMultiPhase
:
boolean
=
false
;
private
trialSequenceId
:
number
;
private
remoteRestServerPort
?:
number
;
...
...
@@ -117,7 +120,7 @@ class RemoteMachineTrainingService implements TrainingService {
break
;
}
}
if
(
restServer
.
getErrorMessage
)
{
if
(
restServer
.
getErrorMessage
!==
undefined
)
{
throw
new
Error
(
restServer
.
getErrorMessage
);
this
.
stopping
=
true
;
}
...
...
@@ -128,33 +131,34 @@ class RemoteMachineTrainingService implements TrainingService {
/**
* give trial a ssh connection
* @param trial
* @param trial
remote machine trial job detail
*/
public
async
allocateSSHClientForTrial
(
trial
:
RemoteMachineTrialJobDetail
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
if
(
!
trial
.
rmMeta
)
{
if
(
trial
.
rmMeta
===
undefined
)
{
throw
new
Error
(
`rmMeta not set in trial
${
trial
.
id
}
`
);
}
le
t
sshClientManager
:
SSHClientManager
|
undefined
=
this
.
machineSSHClientMap
.
get
(
trial
.
rmMeta
);
if
(
!
sshClientManager
)
{
cons
t
sshClientManager
:
SSHClientManager
|
undefined
=
this
.
machineSSHClientMap
.
get
(
trial
.
rmMeta
);
if
(
sshClientManager
===
undefined
)
{
throw
new
Error
(
`remoteSSHClient not initialized`
);
}
le
t
sshClient
:
Client
=
await
sshClientManager
.
getAvailableSSHClient
();
cons
t
sshClient
:
Client
=
await
sshClientManager
.
getAvailableSSHClient
();
this
.
trialSSHClientMap
.
set
(
trial
.
id
,
sshClient
);
deferred
.
resolve
();
return
deferred
.
promise
;
}
/**
* If a trial is finished, release the connection resource
* @param trial
* @param trial
remote machine trial job detail
*/
public
releaseTrialSSHClient
(
trial
:
RemoteMachineTrialJobDetail
):
void
{
if
(
!
trial
.
rmMeta
)
{
if
(
trial
.
rmMeta
===
undefined
)
{
throw
new
Error
(
`rmMeta not set in trial
${
trial
.
id
}
`
);
}
le
t
sshClientManager
:
SSHClientManager
|
undefined
=
this
.
machineSSHClientMap
.
get
(
trial
.
rmMeta
);
if
(
!
sshClientManager
)
{
cons
t
sshClientManager
:
SSHClientManager
|
undefined
=
this
.
machineSSHClientMap
.
get
(
trial
.
rmMeta
);
if
(
sshClientManager
===
undefined
)
{
throw
new
Error
(
`sshClientManager not initialized`
);
}
sshClientManager
.
releaseConnection
(
this
.
trialSSHClientMap
.
get
(
trial
.
id
));
...
...
@@ -171,7 +175,7 @@ class RemoteMachineTrainingService implements TrainingService {
if
(
value
.
form
.
jobType
===
'
TRIAL
'
)
{
jobs
.
push
(
await
this
.
getTrialJob
(
key
));
}
}
;
}
deferred
.
resolve
(
jobs
);
return
deferred
.
promise
;
...
...
@@ -183,7 +187,7 @@ class RemoteMachineTrainingService implements TrainingService {
*/
public
async
getTrialJob
(
trialJobId
:
string
):
Promise
<
TrialJobDetail
>
{
const
trialJob
:
RemoteMachineTrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
if
(
!
trialJob
)
{
if
(
trialJob
===
undefined
)
{
throw
new
NNIError
(
NNIErrorNames
.
NOT_FOUND
,
`trial job id
${
trialJobId
}
not found`
);
}
//TO DO: add another job status, and design new job status change logic
...
...
@@ -193,7 +197,7 @@ class RemoteMachineTrainingService implements TrainingService {
throw
new
Error
(
`rmMeta not set for submitted job
${
trialJobId
}
`
);
}
const
sshClient
:
Client
|
undefined
=
this
.
trialSSHClientMap
.
get
(
trialJob
.
id
);
if
(
!
sshClient
)
{
if
(
sshClient
===
undefined
)
{
throw
new
Error
(
`Invalid job id:
${
trialJobId
}
, cannot find ssh client`
);
}
...
...
@@ -223,8 +227,9 @@ class RemoteMachineTrainingService implements TrainingService {
* Submit trial job
* @param form trial job description form
*/
// tslint:disable-next-line:informative-docs
public
async
submitTrialJob
(
form
:
JobApplicationForm
):
Promise
<
TrialJobDetail
>
{
if
(
!
this
.
trialConfig
)
{
if
(
this
.
trialConfig
===
undefined
)
{
throw
new
Error
(
'
trial config is not initialized
'
);
}
...
...
@@ -276,17 +281,6 @@ class RemoteMachineTrainingService implements TrainingService {
return
trialJobDetail
;
}
/**
* remove gpu reversion when job is not running
*/
private
updateGpuReservation
()
{
for
(
const
[
key
,
value
]
of
this
.
trialJobsMap
)
{
if
(
!
[
'
WAITING
'
,
'
RUNNING
'
].
includes
(
value
.
status
))
{
this
.
gpuScheduler
.
removeGpuReservation
(
key
,
this
.
trialJobsMap
);
}
};
}
/**
* Is multiphase job supported in current training service
*/
...
...
@@ -298,10 +292,11 @@ class RemoteMachineTrainingService implements TrainingService {
* Cancel trial job
* @param trialJobId ID of trial job
*/
// tslint:disable:informative-docs no-unsafe-any
public
async
cancelTrialJob
(
trialJobId
:
string
,
isEarlyStopped
:
boolean
=
false
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
trialJob
:
RemoteMachineTrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
if
(
!
trialJob
)
{
if
(
trialJob
===
undefined
)
{
deferred
.
reject
();
throw
new
Error
(
`trial job id
${
trialJobId
}
not found`
);
}
...
...
@@ -316,7 +311,7 @@ class RemoteMachineTrainingService implements TrainingService {
if
(
trialJob
.
rmMeta
!==
undefined
)
{
// If the trial job is already scheduled, check its status and kill the trial process in remote machine
const
sshClient
:
Client
|
undefined
=
this
.
trialSSHClientMap
.
get
(
trialJob
.
id
);
if
(
!
sshClient
)
{
if
(
sshClient
===
undefined
)
{
deferred
.
reject
();
throw
new
Error
(
`Invalid job id
${
trialJobId
}
, cannot find ssh client`
);
}
...
...
@@ -358,19 +353,22 @@ class RemoteMachineTrainingService implements TrainingService {
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
const
remoteMachineTrailConfig
:
TrialConfig
=
<
TrialConfig
>
JSON
.
parse
(
value
);
// Parse trial config failed, throw Error
if
(
!
remoteMachineTrailConfig
)
{
if
(
remoteMachineTrailConfig
===
undefined
)
{
throw
new
Error
(
'
trial config parsed failed
'
);
}
// codeDir is not a valid directory, throw Error
if
(
!
fs
.
lstatSync
(
remoteMachineTrailConfig
.
codeDir
).
isDirectory
())
{
// tslint:disable-next-line:non-literal-fs-path
if
(
!
fs
.
lstatSync
(
remoteMachineTrailConfig
.
codeDir
)
.
isDirectory
())
{
throw
new
Error
(
`codeDir
${
remoteMachineTrailConfig
.
codeDir
}
is not a directory`
);
}
// Validate to make sure codeDir doesn't have too many files
try
{
await
validateCodeDir
(
remoteMachineTrailConfig
.
codeDir
);
}
catch
(
error
)
{
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
return
Promise
.
reject
(
new
Error
(
error
));
}
...
...
@@ -410,21 +408,32 @@ class RemoteMachineTrainingService implements TrainingService {
await
Promise
.
race
([
delay
(
10000
),
this
.
cleanupConnections
()]);
}
/**
* remove gpu reversion when job is not running
*/
private
updateGpuReservation
():
void
{
for
(
const
[
key
,
value
]
of
this
.
trialJobsMap
)
{
if
(
!
[
'
WAITING
'
,
'
RUNNING
'
].
includes
(
value
.
status
))
{
this
.
gpuScheduler
.
removeGpuReservation
(
key
,
this
.
trialJobsMap
);
}
}
}
/**
* stop gpu_metric_collector process in remote machine and remove unused scripts
*/
private
async
cleanupConnections
():
Promise
<
void
>
{
try
{
try
{
for
(
const
[
rmMeta
,
sshClientManager
]
of
this
.
machineSSHClientMap
.
entries
())
{
le
t
jobpidPath
:
string
=
unixPathJoin
(
this
.
getRemoteScriptsPath
(
rmMeta
.
username
),
'
pid
'
);
le
t
client
:
Client
|
undefined
=
sshClientManager
.
getFirstSSHClient
();
if
(
client
)
{
cons
t
jobpidPath
:
string
=
unixPathJoin
(
this
.
getRemoteScriptsPath
(
rmMeta
.
username
),
'
pid
'
);
cons
t
client
:
Client
|
undefined
=
sshClientManager
.
getFirstSSHClient
();
if
(
client
!==
undefined
)
{
await
SSHClientUtility
.
remoteExeCommand
(
`pkill -P
\`
cat
${
jobpidPath
}
\`
`
,
client
);
await
SSHClientUtility
.
remoteExeCommand
(
`rm -rf
${
this
.
getRemoteScriptsPath
(
rmMeta
.
username
)}
`
,
client
);
}
sshClientManager
.
closeAllSSHClient
();
}
}
catch
(
error
)
{
}
catch
(
error
)
{
//ignore error, this function is called to cleanup remote connections when experiment is stopping
this
.
log
.
error
(
`Cleanup connection exception, error is
${
error
.
message
}
`
);
}
...
...
@@ -436,7 +445,8 @@ class RemoteMachineTrainingService implements TrainingService {
* Generate gpu metric collector directory to store temp gpu metric collector script files
*/
private
getLocalGpuMetricCollectorDir
():
string
{
let
userName
:
string
=
path
.
basename
(
os
.
homedir
());
//get current user name of os
const
userName
:
string
=
path
.
basename
(
os
.
homedir
());
//get current user name of os
return
path
.
join
(
os
.
tmpdir
(),
userName
,
'
nni
'
,
'
scripts
'
);
}
...
...
@@ -445,15 +455,16 @@ class RemoteMachineTrainingService implements TrainingService {
* used to run in remote machine, and will be deleted after uploaded from local.
*/
private
async
generateGpuMetricsCollectorScript
(
userName
:
string
):
Promise
<
void
>
{
le
t
gpuMetricCollectorScriptFolder
:
string
=
this
.
getLocalGpuMetricCollectorDir
();
cons
t
gpuMetricCollectorScriptFolder
:
string
=
this
.
getLocalGpuMetricCollectorDir
();
await
execMkdir
(
path
.
join
(
gpuMetricCollectorScriptFolder
,
userName
));
//generate gpu_metrics_collector.sh
let
gpuMetricsCollectorScriptPath
:
string
=
path
.
join
(
gpuMetricCollectorScriptFolder
,
userName
,
'
gpu_metrics_collector.sh
'
);
const
remoteGPUScriptsDir
:
string
=
this
.
getRemoteScriptsPath
(
userName
);
// This directory is used to store gpu_metrics and pid created by script
const
gpuMetricsCollectorScriptPath
:
string
=
path
.
join
(
gpuMetricCollectorScriptFolder
,
userName
,
'
gpu_metrics_collector.sh
'
);
// This directory is used to store gpu_metrics and pid created by script
const
remoteGPUScriptsDir
:
string
=
this
.
getRemoteScriptsPath
(
userName
);
const
gpuMetricsCollectorScriptContent
:
string
=
String
.
Format
(
GPU_INFO_COLLECTOR_FORMAT_LINUX
,
remoteGPUScriptsDir
,
unixPathJoin
(
remoteGPUScriptsDir
,
'
pid
'
)
,
unixPathJoin
(
remoteGPUScriptsDir
,
'
pid
'
)
);
await
fs
.
promises
.
writeFile
(
gpuMetricsCollectorScriptPath
,
gpuMetricsCollectorScriptContent
,
{
encoding
:
'
utf8
'
});
}
...
...
@@ -467,39 +478,44 @@ class RemoteMachineTrainingService implements TrainingService {
rmMetaList
.
forEach
(
async
(
rmMeta
:
RemoteMachineMeta
)
=>
{
rmMeta
.
occupiedGpuIndexMap
=
new
Map
<
number
,
number
>
();
le
t
sshClientManager
:
SSHClientManager
=
new
SSHClientManager
([],
this
.
MAX_TRIAL_NUMBER_PER_SSHCONNECTION
,
rmMeta
);
le
t
sshClient
:
Client
=
await
sshClientManager
.
getAvailableSSHClient
();
cons
t
sshClientManager
:
SSHClientManager
=
new
SSHClientManager
([],
this
.
MAX_TRIAL_NUMBER_PER_SSHCONNECTION
,
rmMeta
);
cons
t
sshClient
:
Client
=
await
sshClientManager
.
getAvailableSSHClient
();
this
.
machineSSHClientMap
.
set
(
rmMeta
,
sshClientManager
);
await
this
.
initRemoteMachineOnConnected
(
rmMeta
,
sshClient
);
if
(
++
connectedRMNum
===
rmMetaList
.
length
)
{
deferred
.
resolve
();
}
});
return
deferred
.
promise
;
}
private
async
initRemoteMachineOnConnected
(
rmMeta
:
RemoteMachineMeta
,
conn
:
Client
):
Promise
<
void
>
{
// Create root working directory after ssh connection is ready
await
this
.
generateGpuMetricsCollectorScript
(
rmMeta
.
username
);
//generate gpu script in local machine first, will copy to remote machine later
// generate gpu script in local machine first, will copy to remote machine later
await
this
.
generateGpuMetricsCollectorScript
(
rmMeta
.
username
);
const
nniRootDir
:
string
=
unixPathJoin
(
getRemoteTmpDir
(
this
.
remoteOS
),
'
nni
'
);
await
SSHClientUtility
.
remoteExeCommand
(
`mkdir -p
${
this
.
remoteExpRootDir
}
`
,
conn
);
// Copy NNI scripts to remote expeirment working directory
const
localGpuScriptCollectorDir
:
string
=
this
.
getLocalGpuMetricCollectorDir
();
const
remoteGpuScriptCollectorDir
:
string
=
this
.
getRemoteScriptsPath
(
rmMeta
.
username
);
//the directory to store temp scripts in remote machine
// the directory to store temp scripts in remote machine
const
remoteGpuScriptCollectorDir
:
string
=
this
.
getRemoteScriptsPath
(
rmMeta
.
username
);
await
SSHClientUtility
.
remoteExeCommand
(
`mkdir -p
${
remoteGpuScriptCollectorDir
}
`
,
conn
);
await
SSHClientUtility
.
remoteExeCommand
(
`chmod 777
${
nniRootDir
}
${
nniRootDir
}
/*
${
nniRootDir
}
/scripts/*`
,
conn
);
//copy gpu_metrics_collector.sh to remote
await
SSHClientUtility
.
copyFileToRemote
(
path
.
join
(
localGpuScriptCollectorDir
,
rmMeta
.
username
,
'
gpu_metrics_collector.sh
'
),
unixPathJoin
(
remoteGpuScriptCollectorDir
,
'
gpu_metrics_collector.sh
'
),
conn
);
await
SSHClientUtility
.
copyFileToRemote
(
path
.
join
(
localGpuScriptCollectorDir
,
rmMeta
.
username
,
'
gpu_metrics_collector.sh
'
),
unixPathJoin
(
remoteGpuScriptCollectorDir
,
'
gpu_metrics_collector.sh
'
),
conn
);
//Begin to execute gpu_metrics_collection scripts
// tslint:disable-next-line: no-floating-promises
SSHClientUtility
.
remoteExeCommand
(
`bash
${
unixPathJoin
(
remoteGpuScriptCollectorDir
,
'
gpu_metrics_collector.sh
'
)}
`
,
conn
);
this
.
timer
.
subscribe
(
async
(
tick
:
number
)
=>
{
const
cmdresult
:
RemoteCommandResult
=
await
SSHClientUtility
.
remoteExeCommand
(
`tail -n 1
${
unixPathJoin
(
remoteGpuScriptCollectorDir
,
'
gpu_metrics
'
)}
`
,
conn
);
if
(
cmdresult
&&
cmdresult
.
stdout
)
{
if
(
cmdresult
!==
undefined
&&
cmdresult
.
stdout
!==
undefined
)
{
rmMeta
.
gpuSummary
=
<
GPUSummary
>
JSON
.
parse
(
cmdresult
.
stdout
);
}
}
...
...
@@ -509,7 +525,7 @@ class RemoteMachineTrainingService implements TrainingService {
private
async
prepareTrialJob
(
trialJobId
:
string
):
Promise
<
boolean
>
{
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
if
(
!
this
.
trialConfig
)
{
if
(
this
.
trialConfig
===
undefined
)
{
throw
new
Error
(
'
trial config is not initialized
'
);
}
const
trialJobDetail
:
RemoteMachineTrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
...
...
@@ -519,6 +535,7 @@ class RemoteMachineTrainingService implements TrainingService {
// If job is not WATIING, Don't prepare and resolve true immediately
if
(
trialJobDetail
.
status
!==
'
WAITING
'
)
{
deferred
.
resolve
(
true
);
return
deferred
.
promise
;
}
// get an ssh client from scheduler
...
...
@@ -557,7 +574,7 @@ class RemoteMachineTrainingService implements TrainingService {
private
async
launchTrialOnScheduledMachine
(
trialJobId
:
string
,
trialWorkingFolder
:
string
,
form
:
TrialJobApplicationForm
,
rmScheduleInfo
:
RemoteMachineScheduleInfo
):
Promise
<
void
>
{
if
(
!
this
.
trialConfig
)
{
if
(
this
.
trialConfig
===
undefined
)
{
throw
new
Error
(
'
trial config is not initialized
'
);
}
const
cuda_visible_device
:
string
=
rmScheduleInfo
.
cuda_visible_device
;
...
...
@@ -584,18 +601,19 @@ class RemoteMachineTrainingService implements TrainingService {
let
command
:
string
;
// Set CUDA_VISIBLE_DEVICES environment variable based on cuda_visible_device
// If no valid cuda_visible_device is defined, set CUDA_VISIBLE_DEVICES to empty string to hide GPU device
if
(
typeof
cuda_visible_device
===
'
string
'
&&
cuda_visible_device
.
length
>
0
)
{
if
(
typeof
cuda_visible_device
===
'
string
'
&&
cuda_visible_device
.
length
>
0
)
{
command
=
`CUDA_VISIBLE_DEVICES=
${
cuda_visible_device
}
${
this
.
trialConfig
.
command
}
`
;
}
else
{
command
=
`CUDA_VISIBLE_DEVICES=" "
${
this
.
trialConfig
.
command
}
`
;
}
const
nniManagerIp
=
this
.
nniManagerIpConfig
?
this
.
nniManagerIpConfig
.
nniManagerIp
:
getIPV4Address
();
if
(
!
this
.
remoteRestServerPort
)
{
// tslint:disable-next-line: strict-boolean-expressions
const
nniManagerIp
:
string
=
this
.
nniManagerIpConfig
?
this
.
nniManagerIpConfig
.
nniManagerIp
:
getIPV4Address
();
if
(
this
.
remoteRestServerPort
===
undefined
)
{
const
restServer
:
RemoteMachineJobRestServer
=
component
.
get
(
RemoteMachineJobRestServer
);
this
.
remoteRestServerPort
=
restServer
.
clusterRestServerPort
;
}
const
version
=
this
.
versionCheck
?
await
getVersion
():
''
;
const
version
:
string
=
this
.
versionCheck
?
await
getVersion
()
:
''
;
const
runScriptTrialContent
:
string
=
String
.
Format
(
REMOTEMACHINE_TRIAL_COMMAND_FORMAT
,
trialWorkingFolder
,
...
...
@@ -611,7 +629,7 @@ class RemoteMachineTrainingService implements TrainingService {
version
,
this
.
logCollection
,
unixPathJoin
(
trialWorkingFolder
,
'
.nni
'
,
'
code
'
)
)
)
;
//create tmp trial working folder locally.
await
execMkdir
(
path
.
join
(
trialLocalTempFolder
,
'
.nni
'
));
...
...
@@ -627,6 +645,7 @@ class RemoteMachineTrainingService implements TrainingService {
// Copy files in codeDir to remote working directory
await
SSHClientUtility
.
copyDirectoryToRemote
(
trialLocalTempFolder
,
trialWorkingFolder
,
sshClient
,
this
.
remoteOS
);
// Execute command in remote machine
// tslint:disable-next-line: no-floating-promises
SSHClientUtility
.
remoteExeCommand
(
`bash
${
unixPathJoin
(
trialWorkingFolder
,
'
run.sh
'
)}
`
,
sshClient
);
}
...
...
@@ -636,7 +655,7 @@ class RemoteMachineTrainingService implements TrainingService {
if
(
sshClientManager
===
undefined
)
{
throw
new
Error
(
'
sshClient not found.
'
);
}
le
t
sshClient
:
Client
=
sshClientManager
.
getFirstSSHClient
();
cons
t
sshClient
:
Client
=
sshClientManager
.
getFirstSSHClient
();
const
jobId
:
string
=
uniqueString
(
5
);
const
localDir
:
string
=
path
.
join
(
this
.
expRootDir
,
'
hostjobs-local
'
,
jobId
);
const
remoteDir
:
string
=
this
.
getHostJobRemoteDir
(
jobId
);
...
...
@@ -648,6 +667,7 @@ class RemoteMachineTrainingService implements TrainingService {
await
fs
.
promises
.
writeFile
(
path
.
join
(
localDir
,
'
run.sh
'
),
runScriptContent
,
{
encoding
:
'
utf8
'
});
await
SSHClientUtility
.
copyFileToRemote
(
path
.
join
(
localDir
,
'
run.sh
'
),
unixPathJoin
(
remoteDir
,
'
run.sh
'
),
sshClient
);
// tslint:disable-next-line: no-floating-promises
SSHClientUtility
.
remoteExeCommand
(
`bash
${
unixPathJoin
(
remoteDir
,
'
run.sh
'
)}
`
,
sshClient
);
const
jobDetail
:
RemoteMachineTrialJobDetail
=
new
RemoteMachineTrialJobDetail
(
...
...
@@ -680,8 +700,9 @@ class RemoteMachineTrainingService implements TrainingService {
if
(
killResult
!==
0
)
{
const
trailReturnCode
:
string
=
await
SSHClientUtility
.
getRemoteFileContent
(
trialReturnCodeFilePath
,
sshClient
);
this
.
log
.
debug
(
`trailjob
${
trialJob
.
id
}
return code:
${
trailReturnCode
}
`
);
const
match
:
RegExpMatchArray
|
null
=
trailReturnCode
.
trim
().
match
(
/^
(\d
+
)\s
+
(\d
+
)
$/
);
if
(
match
)
{
const
match
:
RegExpMatchArray
|
null
=
trailReturnCode
.
trim
()
.
match
(
/^
(\d
+
)\s
+
(\d
+
)
$/
);
if
(
match
!==
null
)
{
const
{
1
:
code
,
2
:
timestamp
}
=
match
;
// Update trial job's status based on result code
if
(
parseInt
(
code
,
10
)
===
0
)
{
...
...
@@ -709,6 +730,7 @@ class RemoteMachineTrainingService implements TrainingService {
deferred
.
resolve
(
trialJob
);
}
}
return
deferred
.
promise
;
}
...
...
@@ -720,7 +742,7 @@ class RemoteMachineTrainingService implements TrainingService {
return
unixPathJoin
(
this
.
remoteExpRootDir
,
'
hostjobs
'
,
jobId
);
}
private
getRemoteExperimentRootDir
():
string
{
private
getRemoteExperimentRootDir
():
string
{
return
unixPathJoin
(
getRemoteTmpDir
(
this
.
remoteOS
),
'
nni
'
,
'
experiments
'
,
getExperimentId
());
}
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment