Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
d7456c16
Unverified
Commit
d7456c16
authored
May 19, 2020
by
SparkSnail
Committed by
GitHub
May 19, 2020
Browse files
Refactor code storage logic for trial (#2403)
parent
bd7edf36
Changes
11
Show whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
149 additions
and
120 deletions
+149
-120
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts
...frameworkcontroller/frameworkcontrollerTrainingService.ts
+28
-30
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts
...ng_service/kubernetes/kubeflow/kubeflowTrainingService.ts
+17
-19
src/nni_manager/training_service/kubernetes/kubernetesData.ts
...nni_manager/training_service/kubernetes/kubernetesData.ts
+2
-2
src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts
.../training_service/kubernetes/kubernetesTrainingService.ts
+26
-29
src/nni_manager/training_service/local/localTrainingService.ts
...ni_manager/training_service/local/localTrainingService.ts
+23
-16
src/nni_manager/training_service/pai/paiK8S/paiK8SData.ts
src/nni_manager/training_service/pai/paiK8S/paiK8SData.ts
+3
-4
src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts
...ager/training_service/pai/paiK8S/paiK8STrainingService.ts
+16
-5
src/nni_manager/training_service/remote_machine/remoteMachineData.ts
...ager/training_service/remote_machine/remoteMachineData.ts
+6
-5
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
...ng_service/remote_machine/remoteMachineTrainingService.ts
+24
-8
src/nni_manager/training_service/remote_machine/shellExecutor.ts
..._manager/training_service/remote_machine/shellExecutor.ts
+3
-2
test/nni_test/nnitest/run_tests.py
test/nni_test/nnitest/run_tests.py
+1
-0
No files found.
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts
View file @
d7456c16
...
@@ -3,6 +3,7 @@
...
@@ -3,6 +3,7 @@
'
use strict
'
;
'
use strict
'
;
import
*
as
assert
from
'
assert
'
;
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
path
from
'
path
'
;
import
*
as
path
from
'
path
'
;
...
@@ -72,6 +73,11 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
...
@@ -72,6 +73,11 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
this
.
kubernetesRestServerPort
=
restServer
.
clusterRestServerPort
;
this
.
kubernetesRestServerPort
=
restServer
.
clusterRestServerPort
;
}
}
// wait upload of code Dir to finish
if
(
this
.
copyExpCodeDirPromise
!==
undefined
)
{
await
this
.
copyExpCodeDirPromise
;
}
const
trialJobId
:
string
=
uniqueString
(
5
);
const
trialJobId
:
string
=
uniqueString
(
5
);
// Set trial's NFS working folder
// Set trial's NFS working folder
const
trialWorkingFolder
:
string
=
path
.
join
(
this
.
CONTAINER_MOUNT_PATH
,
'
nni
'
,
getExperimentId
(),
trialJobId
);
const
trialWorkingFolder
:
string
=
path
.
join
(
this
.
CONTAINER_MOUNT_PATH
,
'
nni
'
,
getExperimentId
(),
trialJobId
);
...
@@ -81,8 +87,8 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
...
@@ -81,8 +87,8 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
this
.
generateContainerPort
();
this
.
generateContainerPort
();
await
this
.
prepareRunScript
(
trialLocalTempFolder
,
trialJobId
,
trialWorkingFolder
,
form
);
await
this
.
prepareRunScript
(
trialLocalTempFolder
,
trialJobId
,
trialWorkingFolder
,
form
);
//upload
code files
//
wait
upload
of script files to finish
const
trialJobOutputUrl
:
string
=
await
this
.
upload
CodeFiles
(
trialJobId
,
trialLocalTempFolder
);
const
trialJobOutputUrl
:
string
=
await
this
.
upload
Folder
(
trialLocalTempFolder
,
`nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
);
let
initStatus
:
TrialJobStatus
=
'
WAITING
'
;
let
initStatus
:
TrialJobStatus
=
'
WAITING
'
;
if
(
!
trialJobOutputUrl
)
{
if
(
!
trialJobOutputUrl
)
{
initStatus
=
'
FAILED
'
;
initStatus
=
'
FAILED
'
;
...
@@ -151,6 +157,8 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
...
@@ -151,6 +157,8 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
// Validate to make sure codeDir doesn't have too many files
// Validate to make sure codeDir doesn't have too many files
try
{
try
{
await
validateCodeDir
(
this
.
fcTrialConfig
.
codeDir
);
await
validateCodeDir
(
this
.
fcTrialConfig
.
codeDir
);
//upload codeDir to storage
this
.
copyExpCodeDirPromise
=
this
.
uploadFolder
(
this
.
fcTrialConfig
.
codeDir
,
`nni/
${
getExperimentId
()}
/nni-code`
);
}
catch
(
error
)
{
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
this
.
log
.
error
(
error
);
...
@@ -171,41 +179,31 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
...
@@ -171,41 +179,31 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
}
}
/**
/**
* upload code files to nfs or azureStorage
* upload local folder to nfs or azureStorage
* @param trialJobId
* @param trialLocalTempFolder
* return: trialJobOutputUrl
*/
*/
private
async
upload
CodeFiles
(
trialJobId
:
string
,
trialLocalTempFolder
:
string
):
Promise
<
string
>
{
private
async
upload
Folder
(
srcDirectory
:
string
,
destDirectory
:
string
):
Promise
<
string
>
{
if
(
this
.
fcClusterConfig
===
undefined
)
{
if
(
this
.
fcClusterConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
}
}
if
(
this
.
fc
TrialConfig
===
undefined
)
{
assert
(
this
.
fc
ClusterConfig
.
storage
===
undefined
throw
new
Error
(
'
Kubeflow trial config is not initialized
'
);
||
this
.
fcClusterConfig
.
storage
===
'
azureStorage
'
}
||
this
.
fcClusterConfig
.
storage
===
'
nfs
'
);
let
trialJobOutputUrl
:
string
=
''
;
if
(
this
.
fcClusterConfig
.
storage
===
'
azureStorage
'
)
{
if
(
this
.
azureStorageClient
===
undefined
)
{
if
(
this
.
fcClusterConfig
.
storageType
===
'
azureStorage
'
)
{
throw
new
Error
(
'
azureStorageClient is not initialized
'
);
const
azureFrameworkControllerClusterConfig
:
FrameworkControllerClusterConfigAzure
=
<
FrameworkControllerClusterConfigAzure
>
this
.
fcClusterConfig
;
trialJobOutputUrl
=
await
this
.
uploadFilesToAzureStorage
(
trialJobId
,
trialLocalTempFolder
,
this
.
fcTrialConfig
.
codeDir
,
azureFrameworkControllerClusterConfig
.
uploadRetryCount
);
}
else
if
(
this
.
fcClusterConfig
.
storageType
===
'
nfs
'
)
{
const
nfsFrameworkControllerClusterConfig
:
FrameworkControllerClusterConfigNFS
=
<
FrameworkControllerClusterConfigNFS
>
this
.
fcClusterConfig
;
// Create work dir for current trial in NFS directory
await
cpp
.
exec
(
`mkdir -p
${
this
.
trialLocalNFSTempFolder
}
/nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
);
// Copy code files from local dir to NFS mounted dir
await
cpp
.
exec
(
`cp -r
${
trialLocalTempFolder
}
/*
${
this
.
trialLocalNFSTempFolder
}
/nni/
${
getExperimentId
()}
/
${
trialJobId
}
/.`
);
// Copy codeDir to NFS mounted dir
await
cpp
.
exec
(
`cp -r
${
this
.
fcTrialConfig
.
codeDir
}
/*
${
this
.
trialLocalNFSTempFolder
}
/nni/
${
getExperimentId
()}
/
${
trialJobId
}
/.`
);
const
nfsConfig
:
NFSConfig
=
nfsFrameworkControllerClusterConfig
.
nfs
;
trialJobOutputUrl
=
`nfs://
${
nfsConfig
.
server
}
:
${
path
.
join
(
nfsConfig
.
path
,
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
;
}
}
const
fcClusterConfigAzure
:
FrameworkControllerClusterConfigAzure
=
<
FrameworkControllerClusterConfigAzure
>
this
.
fcClusterConfig
;
return
Promise
.
resolve
(
trialJobOutputUrl
);
return
await
this
.
uploadFolderToAzureStorage
(
srcDirectory
,
destDirectory
,
fcClusterConfigAzure
.
uploadRetryCount
);
}
else
if
(
this
.
fcClusterConfig
.
storage
===
'
nfs
'
||
this
.
fcClusterConfig
.
storage
===
undefined
)
{
await
cpp
.
exec
(
`mkdir -p
${
this
.
trialLocalNFSTempFolder
}
/
${
destDirectory
}
`
);
await
cpp
.
exec
(
`cp -r
${
srcDirectory
}
/*
${
this
.
trialLocalNFSTempFolder
}
/
${
destDirectory
}
/.`
);
const
fcClusterConfigNFS
:
FrameworkControllerClusterConfigNFS
=
<
FrameworkControllerClusterConfigNFS
>
this
.
fcClusterConfig
;
const
nfsConfig
:
NFSConfig
=
fcClusterConfigNFS
.
nfs
;
return
`nfs://
${
nfsConfig
.
server
}
:
${
destDirectory
}
`
;
}
return
''
;
}
}
/**
/**
...
...
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts
View file @
d7456c16
...
@@ -74,14 +74,20 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -74,14 +74,20 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
const
restServer
:
KubeflowJobRestServer
=
component
.
get
(
KubeflowJobRestServer
);
const
restServer
:
KubeflowJobRestServer
=
component
.
get
(
KubeflowJobRestServer
);
this
.
kubernetesRestServerPort
=
restServer
.
clusterRestServerPort
;
this
.
kubernetesRestServerPort
=
restServer
.
clusterRestServerPort
;
}
}
// wait for the upload of code Dir to storage to finish
if
(
this
.
copyExpCodeDirPromise
!==
undefined
)
{
await
this
.
copyExpCodeDirPromise
;
}
const
trialJobId
:
string
=
uniqueString
(
5
);
const
trialJobId
:
string
=
uniqueString
(
5
);
const
trialWorkingFolder
:
string
=
path
.
join
(
this
.
CONTAINER_MOUNT_PATH
,
'
nni
'
,
getExperimentId
(),
trialJobId
);
const
trialWorkingFolder
:
string
=
path
.
join
(
this
.
CONTAINER_MOUNT_PATH
,
'
nni
'
,
getExperimentId
(),
trialJobId
);
const
kubeflowJobName
:
string
=
`nni-exp-
${
this
.
experimentId
}
-trial-
${
trialJobId
}
`
.
toLowerCase
();
const
kubeflowJobName
:
string
=
`nni-exp-
${
this
.
experimentId
}
-trial-
${
trialJobId
}
`
.
toLowerCase
();
const
trialLocalTempFolder
:
string
=
path
.
join
(
getExperimentRootDir
(),
'
trials-local
'
,
trialJobId
);
const
trialLocalTempFolder
:
string
=
path
.
join
(
getExperimentRootDir
(),
'
trials-local
'
,
trialJobId
);
//prepare the runscript
//prepare the runscript
await
this
.
prepareRunScript
(
trialLocalTempFolder
,
trialJobId
,
trialWorkingFolder
,
form
);
await
this
.
prepareRunScript
(
trialLocalTempFolder
,
trialJobId
,
trialWorkingFolder
,
form
);
//upload files to storage
//upload
script
files to storage
const
trialJobOutputUrl
:
string
=
await
this
.
upload
CodeFiles
(
trialJobId
,
trialLocalTempFolder
);
const
trialJobOutputUrl
:
string
=
await
this
.
upload
Folder
(
trialLocalTempFolder
,
`nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
);
let
initStatus
:
TrialJobStatus
=
'
WAITING
'
;
let
initStatus
:
TrialJobStatus
=
'
WAITING
'
;
if
(
!
trialJobOutputUrl
)
{
if
(
!
trialJobOutputUrl
)
{
initStatus
=
'
FAILED
'
;
initStatus
=
'
FAILED
'
;
...
@@ -152,6 +158,8 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -152,6 +158,8 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
// Validate to make sure codeDir doesn't have too many files
// Validate to make sure codeDir doesn't have too many files
try
{
try
{
await
validateCodeDir
(
this
.
kubeflowTrialConfig
.
codeDir
);
await
validateCodeDir
(
this
.
kubeflowTrialConfig
.
codeDir
);
//upload codeDir to storage
this
.
copyExpCodeDirPromise
=
this
.
uploadFolder
(
this
.
kubeflowTrialConfig
.
codeDir
,
`nni/
${
getExperimentId
()}
/nni-code`
);
}
catch
(
error
)
{
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
this
.
log
.
error
(
error
);
...
@@ -172,12 +180,9 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -172,12 +180,9 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
}
}
/**
/**
* upload code files to nfs or azureStorage
* upload local folder to nfs or azureStorage
* @param trialJobId
* @param trialLocalTempFolder
* return: trialJobOutputUrl
*/
*/
private
async
upload
CodeFiles
(
trialJobId
:
string
,
trialLocalTempFolder
:
string
):
Promise
<
string
>
{
private
async
upload
Folder
(
srcDirectory
:
string
,
destDirectory
:
string
):
Promise
<
string
>
{
if
(
this
.
kubeflowClusterConfig
===
undefined
)
{
if
(
this
.
kubeflowClusterConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
}
}
...
@@ -186,8 +191,6 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -186,8 +191,6 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
throw
new
Error
(
'
Kubeflow Trial config is not initialized
'
);
throw
new
Error
(
'
Kubeflow Trial config is not initialized
'
);
}
}
let
trialJobOutputUrl
:
string
=
''
;
assert
(
this
.
kubeflowClusterConfig
.
storage
===
undefined
assert
(
this
.
kubeflowClusterConfig
.
storage
===
undefined
||
this
.
kubeflowClusterConfig
.
storage
===
'
azureStorage
'
||
this
.
kubeflowClusterConfig
.
storage
===
'
azureStorage
'
||
this
.
kubeflowClusterConfig
.
storage
===
'
nfs
'
);
||
this
.
kubeflowClusterConfig
.
storage
===
'
nfs
'
);
...
@@ -197,20 +200,15 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -197,20 +200,15 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
throw
new
Error
(
'
azureStorageClient is not initialized
'
);
throw
new
Error
(
'
azureStorageClient is not initialized
'
);
}
}
const
azureKubeflowClusterConfig
:
KubeflowClusterConfigAzure
=
<
KubeflowClusterConfigAzure
>
this
.
kubeflowClusterConfig
;
const
azureKubeflowClusterConfig
:
KubeflowClusterConfigAzure
=
<
KubeflowClusterConfigAzure
>
this
.
kubeflowClusterConfig
;
trialJobOutputUrl
=
await
this
.
uploadF
iles
ToAzureStorage
(
trialJobId
,
trialLocalTempFolder
,
this
.
kubeflowTrialConfig
.
codeDir
,
azureKubeflowClusterConfig
.
uploadRetryCount
);
return
await
this
.
uploadF
older
ToAzureStorage
(
srcDirectory
,
destDirectory
,
azureKubeflowClusterConfig
.
uploadRetryCount
);
}
else
if
(
this
.
kubeflowClusterConfig
.
storage
===
'
nfs
'
||
this
.
kubeflowClusterConfig
.
storage
===
undefined
)
{
}
else
if
(
this
.
kubeflowClusterConfig
.
storage
===
'
nfs
'
||
this
.
kubeflowClusterConfig
.
storage
===
undefined
)
{
await
cpp
.
exec
(
`mkdir -p
${
this
.
trialLocalNFSTempFolder
}
/
${
destDirectory
}
`
);
await
cpp
.
exec
(
`cp -r
${
srcDirectory
}
/*
${
this
.
trialLocalNFSTempFolder
}
/
${
destDirectory
}
/.`
);
const
nfsKubeflowClusterConfig
:
KubeflowClusterConfigNFS
=
<
KubeflowClusterConfigNFS
>
this
.
kubeflowClusterConfig
;
const
nfsKubeflowClusterConfig
:
KubeflowClusterConfigNFS
=
<
KubeflowClusterConfigNFS
>
this
.
kubeflowClusterConfig
;
// Create work dir for current trial in NFS directory
await
cpp
.
exec
(
`mkdir -p
${
this
.
trialLocalNFSTempFolder
}
/nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
);
// Copy script files from local dir to NFS mounted dir
await
cpp
.
exec
(
`cp -r
${
trialLocalTempFolder
}
/*
${
this
.
trialLocalNFSTempFolder
}
/nni/
${
getExperimentId
()}
/
${
trialJobId
}
/.`
);
// Copy codeDir to NFS mounted dir
await
cpp
.
exec
(
`cp -r
${
this
.
kubeflowTrialConfig
.
codeDir
}
/*
${
this
.
trialLocalNFSTempFolder
}
/nni/
${
getExperimentId
()}
/
${
trialJobId
}
/.`
);
const
nfsConfig
:
NFSConfig
=
nfsKubeflowClusterConfig
.
nfs
;
const
nfsConfig
:
NFSConfig
=
nfsKubeflowClusterConfig
.
nfs
;
trialJobOutputUrl
=
`nfs://
${
nfsConfig
.
server
}
:
${
path
.
join
(
nfsConfig
.
path
,
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)
}
`
;
return
`nfs://
${
nfsConfig
.
server
}
:
${
destDirectory
}
`
;
}
}
return
''
;
return
Promise
.
resolve
(
trialJobOutputUrl
);
}
}
private
async
prepareRunScript
(
trialLocalTempFolder
:
string
,
trialJobId
:
string
,
trialWorkingFolder
:
string
,
private
async
prepareRunScript
(
trialLocalTempFolder
:
string
,
trialJobId
:
string
,
trialWorkingFolder
:
string
,
...
...
src/nni_manager/training_service/kubernetes/kubernetesData.ts
View file @
d7456c16
...
@@ -39,7 +39,7 @@ export class KubernetesTrialJobDetail implements TrialJobDetail {
...
@@ -39,7 +39,7 @@ export class KubernetesTrialJobDetail implements TrialJobDetail {
export
const
kubernetesScriptFormat
:
string
=
export
const
kubernetesScriptFormat
:
string
=
`#!/bin/bash
`#!/bin/bash
export NNI_PLATFORM={0}
export NNI_PLATFORM={0}
export NNI_SYS_DIR=
$PWD/nni/
{1}
export NNI_SYS_DIR={1}
export NNI_OUTPUT_DIR={2}
export NNI_OUTPUT_DIR={2}
export MULTI_PHASE=false
export MULTI_PHASE=false
export NNI_TRIAL_JOB_ID={3}
export NNI_TRIAL_JOB_ID={3}
...
@@ -49,7 +49,7 @@ export NNI_TRIAL_SEQ_ID={6}
...
@@ -49,7 +49,7 @@ export NNI_TRIAL_SEQ_ID={6}
{7}
{7}
mkdir -p $NNI_SYS_DIR
mkdir -p $NNI_SYS_DIR
mkdir -p $NNI_OUTPUT_DIR
mkdir -p $NNI_OUTPUT_DIR
cp -r
T
$NNI_CODE_DIR $NNI_SYS_DIR
cp -r $NNI_CODE_DIR
/.
$NNI_SYS_DIR
cd $NNI_SYS_DIR
cd $NNI_SYS_DIR
sh install_nni.sh
sh install_nni.sh
python3 -m nni_trial_tool.trial_keeper --trial_command '{8}' --nnimanager_ip {9} --nnimanager_port {10} \
python3 -m nni_trial_tool.trial_keeper --trial_command '{8}' --nnimanager_ip {9} --nnimanager_port {10} \
...
...
src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts
View file @
d7456c16
...
@@ -49,6 +49,8 @@ abstract class KubernetesTrainingService {
...
@@ -49,6 +49,8 @@ abstract class KubernetesTrainingService {
protected
kubernetesClusterConfig
?:
KubernetesClusterConfig
;
protected
kubernetesClusterConfig
?:
KubernetesClusterConfig
;
protected
versionCheck
:
boolean
=
true
;
protected
versionCheck
:
boolean
=
true
;
protected
logCollection
:
string
;
protected
logCollection
:
string
;
protected
copyExpCodeDirPromise
?:
Promise
<
string
>
;
protected
expContainerCodeFolder
:
string
;
constructor
()
{
constructor
()
{
this
.
log
=
getLogger
();
this
.
log
=
getLogger
();
...
@@ -57,6 +59,7 @@ abstract class KubernetesTrainingService {
...
@@ -57,6 +59,7 @@ abstract class KubernetesTrainingService {
this
.
trialLocalNFSTempFolder
=
path
.
join
(
getExperimentRootDir
(),
'
trials-nfs-tmp
'
);
this
.
trialLocalNFSTempFolder
=
path
.
join
(
getExperimentRootDir
(),
'
trials-nfs-tmp
'
);
this
.
experimentId
=
getExperimentId
();
this
.
experimentId
=
getExperimentId
();
this
.
CONTAINER_MOUNT_PATH
=
'
/tmp/mount
'
;
this
.
CONTAINER_MOUNT_PATH
=
'
/tmp/mount
'
;
this
.
expContainerCodeFolder
=
path
.
join
(
this
.
CONTAINER_MOUNT_PATH
,
'
nni
'
,
this
.
experimentId
,
'
nni-code
'
);
this
.
genericK8sClient
=
new
GeneralK8sClient
();
this
.
genericK8sClient
=
new
GeneralK8sClient
();
this
.
logCollection
=
'
none
'
;
this
.
logCollection
=
'
none
'
;
}
}
...
@@ -272,11 +275,11 @@ abstract class KubernetesTrainingService {
...
@@ -272,11 +275,11 @@ abstract class KubernetesTrainingService {
const
runScript
:
string
=
String
.
Format
(
const
runScript
:
string
=
String
.
Format
(
kubernetesScriptFormat
,
kubernetesScriptFormat
,
platform
,
platform
,
trial
JobId
,
trial
WorkingFolder
,
path
.
join
(
trialWorkingFolder
,
'
output
'
,
`
${
roleName
}
_output`
),
path
.
join
(
trialWorkingFolder
,
'
output
'
,
`
${
roleName
}
_output`
),
trialJobId
,
trialJobId
,
getExperimentId
(),
getExperimentId
(),
t
rialWorking
Folder
,
t
his
.
expContainerCode
Folder
,
trialSequenceId
,
trialSequenceId
,
nvidiaScript
,
nvidiaScript
,
command
,
command
,
...
@@ -330,50 +333,44 @@ abstract class KubernetesTrainingService {
...
@@ -330,50 +333,44 @@ abstract class KubernetesTrainingService {
return
registrySecretName
;
return
registrySecretName
;
}
}
protected
async
uploadFilesToAzureStorage
(
trialJobId
:
string
,
trialLocalTempFolder
:
string
,
codeDir
:
string
,
uploadRetryCount
:
number
|
undefined
):
Promise
<
string
>
{
/**
* upload local directory to azureStorage
* @param srcDirectory the source directory of local folder
* @param destDirectory the target directory in azure
* @param uploadRetryCount the retry time when upload failed
*/
protected
async
uploadFolderToAzureStorage
(
srcDirectory
:
string
,
destDirectory
:
string
,
uploadRetryCount
:
number
|
undefined
):
Promise
<
string
>
{
if
(
this
.
azureStorageClient
===
undefined
)
{
if
(
this
.
azureStorageClient
===
undefined
)
{
throw
new
Error
(
'
azureStorageClient is not initialized
'
);
throw
new
Error
(
'
azureStorageClient is not initialized
'
);
}
}
let
trialJobOutputUrl
:
string
=
''
;
let
retryCount
:
number
=
1
;
let
retryCount
:
number
=
1
;
if
(
uploadRetryCount
)
{
if
(
uploadRetryCount
)
{
retryCount
=
uploadRetryCount
;
retryCount
=
uploadRetryCount
;
}
}
let
resultUploadNNIScript
:
boolean
=
false
;
let
uploadSuccess
:
boolean
=
false
;
let
resultUploadCodeFile
:
boolean
=
false
;
let
folderUriInAzure
=
''
;
try
{
try
{
do
{
do
{
//upload local files, including scripts for running the trial and configuration (e.g., hyperparameters) for the trial, to azure storage
uploadSuccess
=
await
AzureStorageClientUtility
.
uploadDirectory
(
if
(
!
resultUploadNNIScript
)
{
this
.
azureStorageClient
,
resultUploadNNIScript
=
await
AzureStorageClientUtility
.
uploadDirectory
(
this
.
azureStorageClient
,
`
${
destDirectory
}
`
,
`nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
,
this
.
azureStorageShare
,
this
.
azureStorageShare
,
`
${
trialLocalTempFolder
}
`
);
`
${
srcDirectory
}
`
);
}
if
(
!
uploadSuccess
)
{
//upload code files to azure storage
if
(
!
resultUploadCodeFile
)
{
resultUploadCodeFile
=
await
AzureStorageClientUtility
.
uploadDirectory
(
this
.
azureStorageClient
,
`nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
,
this
.
azureStorageShare
,
`
${
codeDir
}
`
);
}
if
(
resultUploadNNIScript
&&
resultUploadCodeFile
)
{
trialJobOutputUrl
=
`https://
${
this
.
azureStorageAccountName
}
.file.core.windows.net/
${
this
.
azureStorageShare
}
`
+
`/
${
path
.
join
(
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
;
break
;
}
else
{
//wait for 5 seconds to re-upload files
//wait for 5 seconds to re-upload files
await
delay
(
5000
);
await
delay
(
5000
);
this
.
log
.
info
(
'
Upload failed, Retry: upload files to azure-storage
'
);
this
.
log
.
info
(
'
Upload failed, Retry: upload files to azure-storage
'
);
}
else
{
folderUriInAzure
=
`https://
${
this
.
azureStorageAccountName
}
.file.core.windows.net/
${
this
.
azureStorageShare
}
/
${
destDirectory
}
`
;
break
;
}
}
}
while
(
retryCount
--
>=
0
)
}
while
(
retryCount
--
>=
0
)
}
catch
(
error
)
{
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
this
.
log
.
error
(
error
);
//return an empty url when an error occurs
//return an empty url when an error occurs
return
Promise
.
resolve
(
""
);
return
Promise
.
resolve
(
''
);
}
if
(
!
trialJobOutputUrl
)
{
this
.
log
.
info
(
`Retry-count is used up, upload files to azureStorage for trial
${
trialJobId
}
failed!`
);
}
}
return
Promise
.
resolve
(
trialJobOutputUrl
);
return
Promise
.
resolve
(
folderUriInAzure
);
}
}
}
}
...
...
src/nni_manager/training_service/local/localTrainingService.ts
View file @
d7456c16
...
@@ -361,6 +361,9 @@ class LocalTrainingService implements TrainingService {
...
@@ -361,6 +361,9 @@ class LocalTrainingService implements TrainingService {
trialJobDetail
:
TrialJobDetail
,
trialJobDetail
:
TrialJobDetail
,
resource
:
{
gpuIndices
:
number
[]
},
resource
:
{
gpuIndices
:
number
[]
},
gpuNum
:
number
|
undefined
):
{
key
:
string
;
value
:
string
}[]
{
gpuNum
:
number
|
undefined
):
{
key
:
string
;
value
:
string
}[]
{
if
(
this
.
localTrialConfig
===
undefined
)
{
throw
new
Error
(
'
localTrialConfig is not initialized!
'
);
}
const
envVariables
:
{
key
:
string
;
value
:
string
}[]
=
[
const
envVariables
:
{
key
:
string
;
value
:
string
}[]
=
[
{
key
:
'
NNI_PLATFORM
'
,
value
:
'
local
'
},
{
key
:
'
NNI_PLATFORM
'
,
value
:
'
local
'
},
{
key
:
'
NNI_EXP_ID
'
,
value
:
this
.
experimentId
},
{
key
:
'
NNI_EXP_ID
'
,
value
:
this
.
experimentId
},
...
@@ -368,7 +371,8 @@ class LocalTrainingService implements TrainingService {
...
@@ -368,7 +371,8 @@ class LocalTrainingService implements TrainingService {
{
key
:
'
NNI_TRIAL_JOB_ID
'
,
value
:
trialJobDetail
.
id
},
{
key
:
'
NNI_TRIAL_JOB_ID
'
,
value
:
trialJobDetail
.
id
},
{
key
:
'
NNI_OUTPUT_DIR
'
,
value
:
trialJobDetail
.
workingDirectory
},
{
key
:
'
NNI_OUTPUT_DIR
'
,
value
:
trialJobDetail
.
workingDirectory
},
{
key
:
'
NNI_TRIAL_SEQ_ID
'
,
value
:
trialJobDetail
.
form
.
sequenceId
.
toString
()
},
{
key
:
'
NNI_TRIAL_SEQ_ID
'
,
value
:
trialJobDetail
.
form
.
sequenceId
.
toString
()
},
{
key
:
'
MULTI_PHASE
'
,
value
:
this
.
isMultiPhase
.
toString
()
}
{
key
:
'
MULTI_PHASE
'
,
value
:
this
.
isMultiPhase
.
toString
()
},
{
key
:
'
NNI_CODE_DIR
'
,
value
:
this
.
localTrialConfig
.
codeDir
}
];
];
if
(
gpuNum
!==
undefined
)
{
if
(
gpuNum
!==
undefined
)
{
envVariables
.
push
({
envVariables
.
push
({
...
@@ -473,12 +477,16 @@ class LocalTrainingService implements TrainingService {
...
@@ -473,12 +477,16 @@ class LocalTrainingService implements TrainingService {
private
getScript
(
localTrialConfig
:
TrialConfig
,
workingDirectory
:
string
):
string
[]
{
private
getScript
(
localTrialConfig
:
TrialConfig
,
workingDirectory
:
string
):
string
[]
{
const
script
:
string
[]
=
[];
const
script
:
string
[]
=
[];
if
(
process
.
platform
===
'
win32
'
)
{
if
(
process
.
platform
===
'
win32
'
)
{
script
.
push
(
`Copy-Item $env:NNI_CODE_DIR\\* -Destination $env:NNI_SYS_DIR -Recurse`
);
script
.
push
(
`cd $env:NNI_SYS_DIR`
);
script
.
push
(
script
.
push
(
`cmd.exe /c
${
localTrialConfig
.
command
}
2>"
${
path
.
join
(
workingDirectory
,
'
stderr
'
)}
"`
,
`cmd.exe /c
${
localTrialConfig
.
command
}
2>"
${
path
.
join
(
workingDirectory
,
'
stderr
'
)}
"`
,
`$NOW_DATE = [int64](([datetime]::UtcNow)-(get-date "1/1/1970")).TotalSeconds`
,
`$NOW_DATE = [int64](([datetime]::UtcNow)-(get-date "1/1/1970")).TotalSeconds`
,
`$NOW_DATE = "$NOW_DATE" + (Get-Date -Format fff).ToString()`
,
`$NOW_DATE = "$NOW_DATE" + (Get-Date -Format fff).ToString()`
,
`Write $LASTEXITCODE " " $NOW_DATE | Out-File "
${
path
.
join
(
workingDirectory
,
'
.nni
'
,
'
state
'
)}
" -NoNewline -encoding utf8`
);
`Write $LASTEXITCODE " " $NOW_DATE | Out-File "
${
path
.
join
(
workingDirectory
,
'
.nni
'
,
'
state
'
)}
" -NoNewline -encoding utf8`
);
}
else
{
}
else
{
script
.
push
(
`cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR`
);
script
.
push
(
`cd $NNI_SYS_DIR`
);
script
.
push
(
`eval
${
localTrialConfig
.
command
}
2>"
${
path
.
join
(
workingDirectory
,
'
stderr
'
)}
"`
);
script
.
push
(
`eval
${
localTrialConfig
.
command
}
2>"
${
path
.
join
(
workingDirectory
,
'
stderr
'
)}
"`
);
if
(
process
.
platform
===
'
darwin
'
)
{
if
(
process
.
platform
===
'
darwin
'
)
{
// https://superuser.com/questions/599072/how-to-get-bash-execution-time-in-milliseconds-under-mac-os-x
// https://superuser.com/questions/599072/how-to-get-bash-execution-time-in-milliseconds-under-mac-os-x
...
@@ -506,7 +514,6 @@ class LocalTrainingService implements TrainingService {
...
@@ -506,7 +514,6 @@ class LocalTrainingService implements TrainingService {
if
(
process
.
platform
!==
'
win32
'
)
{
if
(
process
.
platform
!==
'
win32
'
)
{
runScriptContent
.
push
(
'
#!/bin/bash
'
);
runScriptContent
.
push
(
'
#!/bin/bash
'
);
}
}
runScriptContent
.
push
(
`cd '
${
this
.
localTrialConfig
.
codeDir
}
'`
);
for
(
const
variable
of
variables
)
{
for
(
const
variable
of
variables
)
{
runScriptContent
.
push
(
setEnvironmentVariable
(
variable
));
runScriptContent
.
push
(
setEnvironmentVariable
(
variable
));
}
}
...
...
src/nni_manager/training_service/pai/paiK8S/paiK8SData.ts
View file @
d7456c16
...
@@ -31,7 +31,6 @@ fi`;
...
@@ -31,7 +31,6 @@ fi`;
export
const
PAI_K8S_TRIAL_COMMAND_FORMAT
:
string
=
export
const
PAI_K8S_TRIAL_COMMAND_FORMAT
:
string
=
`export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4} MULTI_PHASE={5} \
`export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4} MULTI_PHASE={5} \
&& ls $NNI_SYS_DIR \
&& NNI_CODE_DIR={6} && cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR && cd $NNI_SYS_DIR && sh install_nni.sh \
&& cd $NNI_SYS_DIR && sh install_nni.sh \
&& python3 -m nni_trial_tool.trial_keeper --trial_command '{7}' --nnimanager_ip '{8}' --nnimanager_port '{9}' \
&& python3 -m nni_trial_tool.trial_keeper --trial_command '{6}' --nnimanager_ip '{7}' --nnimanager_port '{8}' \
--nni_manager_version '{10}' --log_collection '{11}'`
;
--nni_manager_version '{9}' --log_collection '{10}'`
;
src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts
View file @
d7456c16
...
@@ -53,6 +53,7 @@ const yaml = require('js-yaml');
...
@@ -53,6 +53,7 @@ const yaml = require('js-yaml');
@
component
.
Singleton
@
component
.
Singleton
class
PAIK8STrainingService
extends
PAITrainingService
{
class
PAIK8STrainingService
extends
PAITrainingService
{
protected
paiTrialConfig
:
NNIPAIK8STrialConfig
|
undefined
;
protected
paiTrialConfig
:
NNIPAIK8STrialConfig
|
undefined
;
private
copyExpCodeDirPromise
?:
Promise
<
void
>
;
private
paiJobConfig
:
undefined
;
private
paiJobConfig
:
undefined
;
private
nniVersion
:
string
|
undefined
;
private
nniVersion
:
string
|
undefined
;
constructor
()
{
constructor
()
{
...
@@ -78,7 +79,7 @@ class PAIK8STrainingService extends PAITrainingService {
...
@@ -78,7 +79,7 @@ class PAIK8STrainingService extends PAITrainingService {
}
}
break
;
break
;
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
{
if
(
this
.
paiClusterConfig
===
undefined
)
{
if
(
this
.
paiClusterConfig
===
undefined
)
{
this
.
log
.
error
(
'
pai cluster config is not initialized
'
);
this
.
log
.
error
(
'
pai cluster config is not initialized
'
);
break
;
break
;
...
@@ -86,10 +87,15 @@ class PAIK8STrainingService extends PAITrainingService {
...
@@ -86,10 +87,15 @@ class PAIK8STrainingService extends PAITrainingService {
this
.
paiTrialConfig
=
<
NNIPAIK8STrialConfig
>
JSON
.
parse
(
value
);
this
.
paiTrialConfig
=
<
NNIPAIK8STrialConfig
>
JSON
.
parse
(
value
);
// Validate to make sure codeDir doesn't have too many files
// Validate to make sure codeDir doesn't have too many files
await
validateCodeDir
(
this
.
paiTrialConfig
.
codeDir
);
await
validateCodeDir
(
this
.
paiTrialConfig
.
codeDir
);
const
nniManagerNFSExpCodeDir
=
path
.
join
(
this
.
paiTrialConfig
.
nniManagerNFSMountPath
,
this
.
experimentId
,
'
nni-code
'
);
await
execMkdir
(
nniManagerNFSExpCodeDir
);
//Copy codeDir files to the experiment's nni-code folder under the NFS mount path
this
.
copyExpCodeDirPromise
=
execCopydir
(
this
.
paiTrialConfig
.
codeDir
,
nniManagerNFSExpCodeDir
);
if
(
this
.
paiTrialConfig
.
paiConfigPath
)
{
if
(
this
.
paiTrialConfig
.
paiConfigPath
)
{
this
.
paiJobConfig
=
yaml
.
safeLoad
(
fs
.
readFileSync
(
this
.
paiTrialConfig
.
paiConfigPath
,
'
utf8
'
));
this
.
paiJobConfig
=
yaml
.
safeLoad
(
fs
.
readFileSync
(
this
.
paiTrialConfig
.
paiConfigPath
,
'
utf8
'
));
}
}
break
;
break
;
}
case
TrialConfigMetadataKey
.
VERSION_CHECK
:
case
TrialConfigMetadataKey
.
VERSION_CHECK
:
this
.
versionCheck
=
(
value
===
'
true
'
||
value
===
'
True
'
);
this
.
versionCheck
=
(
value
===
'
true
'
||
value
===
'
True
'
);
this
.
nniVersion
=
this
.
versionCheck
?
await
getVersion
()
:
''
;
this
.
nniVersion
=
this
.
versionCheck
?
await
getVersion
()
:
''
;
...
@@ -152,6 +158,7 @@ class PAIK8STrainingService extends PAITrainingService {
...
@@ -152,6 +158,7 @@ class PAIK8STrainingService extends PAITrainingService {
if
(
this
.
paiTrialConfig
===
undefined
)
{
if
(
this
.
paiTrialConfig
===
undefined
)
{
throw
new
Error
(
'
trial config is not initialized
'
);
throw
new
Error
(
'
trial config is not initialized
'
);
}
}
// Experiment code directory as addressed from inside the container.
// The last path segment must be exactly 'nni-code' (no stray quote) so it matches
// the folder name used when codeDir is copied under the NFS mount path.
// NOTE(review): assumes containerNFSMountPath and nniManagerNFSMountPath refer to the same NFS share — confirm.
const containerNFSExpCodeDir = `${this.paiTrialConfig.containerNFSMountPath}/${this.experimentId}/nni-code`;
const
containerWorkingDir
:
string
=
`
${
this
.
paiTrialConfig
.
containerNFSMountPath
}
/
${
this
.
experimentId
}
/
${
trialJobDetail
.
id
}
`
;
const
containerWorkingDir
:
string
=
`
${
this
.
paiTrialConfig
.
containerNFSMountPath
}
/
${
this
.
experimentId
}
/
${
trialJobDetail
.
id
}
`
;
const
nniManagerIp
:
string
=
this
.
nniManagerIpConfig
?
this
.
nniManagerIpConfig
.
nniManagerIp
:
getIPV4Address
();
const
nniManagerIp
:
string
=
this
.
nniManagerIpConfig
?
this
.
nniManagerIpConfig
.
nniManagerIp
:
getIPV4Address
();
const
nniPaiTrialCommand
:
string
=
String
.
Format
(
const
nniPaiTrialCommand
:
string
=
String
.
Format
(
...
@@ -162,6 +169,7 @@ class PAIK8STrainingService extends PAITrainingService {
...
@@ -162,6 +169,7 @@ class PAIK8STrainingService extends PAITrainingService {
this
.
experimentId
,
this
.
experimentId
,
trialJobDetail
.
form
.
sequenceId
,
trialJobDetail
.
form
.
sequenceId
,
this
.
isMultiPhase
,
this
.
isMultiPhase
,
containerNFSExpCodeDir
,
command
,
command
,
nniManagerIp
,
nniManagerIp
,
this
.
paiRestServerPort
,
this
.
paiRestServerPort
,
...
@@ -264,15 +272,18 @@ class PAIK8STrainingService extends PAITrainingService {
...
@@ -264,15 +272,18 @@ class PAIK8STrainingService extends PAITrainingService {
throw
new
Error
(
'
paiJobRestServer is not initialized
'
);
throw
new
Error
(
'
paiJobRestServer is not initialized
'
);
}
}
// Make sure experiment code files are copied from local to NFS
if
(
this
.
copyExpCodeDirPromise
!==
undefined
)
{
await
this
.
copyExpCodeDirPromise
;
}
this
.
paiRestServerPort
=
this
.
paiJobRestServer
.
clusterRestServerPort
;
this
.
paiRestServerPort
=
this
.
paiJobRestServer
.
clusterRestServerPort
;
// Step 1. Prepare PAI job configuration
// Step 1. Prepare PAI job configuration
//create trial local working folder locally.
//create trial local working folder locally.
await
execMkdir
(
trialJobDetail
.
logPath
);
await
execMkdir
(
trialJobDetail
.
logPath
);
const
runScriptContent
:
string
=
CONTAINER_INSTALL_NNI_SHELL_FORMAT
;
// Write NNI installation file to local files
// Write NNI installation file to local files
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialJobDetail
.
logPath
,
'
install_nni.sh
'
),
runScriptContent
,
{
encoding
:
'
utf8
'
});
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialJobDetail
.
logPath
,
'
install_nni.sh
'
),
CONTAINER_INSTALL_NNI_SHELL_FORMAT
,
{
encoding
:
'
utf8
'
});
// Write file content ( parameter.cfg ) to local working folders
// Write file content ( parameter.cfg ) to local working folders
if
(
trialJobDetail
.
form
!==
undefined
)
{
if
(
trialJobDetail
.
form
!==
undefined
)
{
...
@@ -284,7 +295,7 @@ class PAIK8STrainingService extends PAITrainingService {
...
@@ -284,7 +295,7 @@ class PAIK8STrainingService extends PAITrainingService {
//Generate Job Configuration in yaml format
//Generate Job Configuration in yaml format
const
paiJobConfig
=
this
.
generateJobConfigInYamlFormat
(
trialJobDetail
);
const
paiJobConfig
=
this
.
generateJobConfigInYamlFormat
(
trialJobDetail
);
this
.
log
.
debug
(
paiJobConfig
);
this
.
log
.
debug
(
paiJobConfig
);
// Step
3
. Submit PAI job via Rest call
// Step
2
. Submit PAI job via Rest call
// Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API
// Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API
const
submitJobRequest
:
request
.
Options
=
{
const
submitJobRequest
:
request
.
Options
=
{
uri
:
`
${
this
.
protocol
}
://
${
this
.
paiClusterConfig
.
host
}
/rest-server/api/v2/jobs`
,
uri
:
`
${
this
.
protocol
}
://
${
this
.
paiClusterConfig
.
host
}
/rest-server/api/v2/jobs`
,
...
...
src/nni_manager/training_service/remote_machine/remoteMachineData.ts
View file @
d7456c16
...
@@ -179,13 +179,14 @@ export enum ScheduleResultType {
...
@@ -179,13 +179,14 @@ export enum ScheduleResultType {
export
const
REMOTEMACHINE_TRIAL_COMMAND_FORMAT
:
string
=
export
const
REMOTEMACHINE_TRIAL_COMMAND_FORMAT
:
string
=
`#!/bin/bash
`#!/bin/bash
export NNI_PLATFORM=remote NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} \
export NNI_PLATFORM=remote NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} \
NNI_TRIAL_SEQ_ID={4} export MULTI_PHASE={5}
NNI_TRIAL_SEQ_ID={4} MULTI_PHASE={5} NNI_CODE_DIR={6}
cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR
cd $NNI_SYS_DIR
cd $NNI_SYS_DIR
sh install_nni.sh
sh install_nni.sh
echo $$ >{
6
}
echo $$ >{
7
}
python3 -m nni_trial_tool.trial_keeper --trial_command '{
7
}' --nnimanager_ip '{
8
}' --nnimanager_port '{
9
}' \
python3 -m nni_trial_tool.trial_keeper --trial_command '{
8
}' --nnimanager_ip '{
9
}' --nnimanager_port '{
10
}' \
--nni_manager_version '{1
0
}' --log_collection '{1
1
}' 1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr
--nni_manager_version '{1
1
}' --log_collection '{1
2
}' 1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr
echo $?
\`
date +%s%3N
\`
>{1
2
}`
;
echo $?
\`
date +%s%3N
\`
>{1
3
}`
;
export
const
HOST_JOB_SHELL_FORMAT
:
string
=
export
const
HOST_JOB_SHELL_FORMAT
:
string
=
`#!/bin/bash
`#!/bin/bash
...
...
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
View file @
d7456c16
...
@@ -26,7 +26,7 @@ import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData';
...
@@ -26,7 +26,7 @@ import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData';
import
{
GPUSummary
}
from
'
../common/gpuData
'
;
import
{
GPUSummary
}
from
'
../common/gpuData
'
;
import
{
TrialConfig
}
from
'
../common/trialConfig
'
;
import
{
TrialConfig
}
from
'
../common/trialConfig
'
;
import
{
TrialConfigMetadataKey
}
from
'
../common/trialConfigMetadataKey
'
;
import
{
TrialConfigMetadataKey
}
from
'
../common/trialConfigMetadataKey
'
;
import
{
execCopydir
,
execMkdir
,
validateCodeDir
,
getGpuMetricsCollectorBashScriptContent
}
from
'
../common/util
'
;
import
{
execMkdir
,
validateCodeDir
,
getGpuMetricsCollectorBashScriptContent
}
from
'
../common/util
'
;
import
{
GPUScheduler
}
from
'
./gpuScheduler
'
;
import
{
GPUScheduler
}
from
'
./gpuScheduler
'
;
import
{
import
{
REMOTEMACHINE_TRIAL_COMMAND_FORMAT
,
RemoteMachineMeta
,
REMOTEMACHINE_TRIAL_COMMAND_FORMAT
,
RemoteMachineMeta
,
...
@@ -42,11 +42,13 @@ import { ShellExecutor } from 'training_service/remote_machine/shellExecutor';
...
@@ -42,11 +42,13 @@ import { ShellExecutor } from 'training_service/remote_machine/shellExecutor';
@
component
.
Singleton
@
component
.
Singleton
class
RemoteMachineTrainingService
implements
TrainingService
{
class
RemoteMachineTrainingService
implements
TrainingService
{
private
readonly
machineExecutorManagerMap
:
Map
<
RemoteMachineMeta
,
ExecutorManager
>
;
//machine executor map
private
readonly
machineExecutorManagerMap
:
Map
<
RemoteMachineMeta
,
ExecutorManager
>
;
//machine executor map
private
readonly
machineCopyExpCodeDirPromiseMap
:
Map
<
RemoteMachineMeta
,
Promise
<
void
>>
;
private
readonly
trialExecutorMap
:
Map
<
string
,
ShellExecutor
>
;
//trial executor map
private
readonly
trialExecutorMap
:
Map
<
string
,
ShellExecutor
>
;
//trial executor map
private
readonly
trialJobsMap
:
Map
<
string
,
RemoteMachineTrialJobDetail
>
;
private
readonly
trialJobsMap
:
Map
<
string
,
RemoteMachineTrialJobDetail
>
;
private
readonly
MAX_TRIAL_NUMBER_PER_EXECUTOR
:
number
=
5
;
// every executor has a max trial concurrency number
private
readonly
MAX_TRIAL_NUMBER_PER_EXECUTOR
:
number
=
5
;
// every executor has a max trial concurrency number
private
readonly
expRootDir
:
string
;
private
readonly
expRootDir
:
string
;
private
readonly
remoteExpRootDir
:
string
;
private
readonly
remoteExpRootDir
:
string
;
private
readonly
remoteExpCodeDir
:
string
;
private
trialConfig
:
TrialConfig
|
undefined
;
private
trialConfig
:
TrialConfig
|
undefined
;
private
gpuScheduler
?:
GPUScheduler
;
private
gpuScheduler
?:
GPUScheduler
;
private
readonly
jobQueue
:
string
[];
private
readonly
jobQueue
:
string
[];
...
@@ -68,9 +70,11 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -68,9 +70,11 @@ class RemoteMachineTrainingService implements TrainingService {
this
.
trialJobsMap
=
new
Map
<
string
,
RemoteMachineTrialJobDetail
>
();
this
.
trialJobsMap
=
new
Map
<
string
,
RemoteMachineTrialJobDetail
>
();
this
.
trialExecutorMap
=
new
Map
<
string
,
ShellExecutor
>
();
this
.
trialExecutorMap
=
new
Map
<
string
,
ShellExecutor
>
();
this
.
machineExecutorManagerMap
=
new
Map
<
RemoteMachineMeta
,
ExecutorManager
>
();
this
.
machineExecutorManagerMap
=
new
Map
<
RemoteMachineMeta
,
ExecutorManager
>
();
this
.
machineCopyExpCodeDirPromiseMap
=
new
Map
<
RemoteMachineMeta
,
Promise
<
void
>>
();
this
.
jobQueue
=
[];
this
.
jobQueue
=
[];
this
.
expRootDir
=
getExperimentRootDir
();
this
.
expRootDir
=
getExperimentRootDir
();
this
.
remoteExpRootDir
=
this
.
getRemoteExperimentRootDir
();
this
.
remoteExpRootDir
=
this
.
getRemoteExperimentRootDir
();
this
.
remoteExpCodeDir
=
unixPathJoin
(
this
.
remoteExpRootDir
,
'
nni-code
'
);
this
.
timer
=
timer
;
this
.
timer
=
timer
;
this
.
log
=
getLogger
();
this
.
log
=
getLogger
();
this
.
trialSequenceId
=
-
1
;
this
.
trialSequenceId
=
-
1
;
...
@@ -320,9 +324,20 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -320,9 +324,20 @@ class RemoteMachineTrainingService implements TrainingService {
throw
new
Error
(
`codeDir
${
remoteMachineTrailConfig
.
codeDir
}
is not a directory`
);
throw
new
Error
(
`codeDir
${
remoteMachineTrailConfig
.
codeDir
}
is not a directory`
);
}
}
// Validate to make sure codeDir doesn't have too many files
try
{
try
{
// Validate to make sure codeDir doesn't have too many files
await
validateCodeDir
(
remoteMachineTrailConfig
.
codeDir
);
await
validateCodeDir
(
remoteMachineTrailConfig
.
codeDir
);
// Copy codeDir to remote machine
for
(
const
[
rmMeta
,
executorManager
]
of
this
.
machineExecutorManagerMap
.
entries
())
{
const
executor
:
ShellExecutor
=
await
executorManager
.
getAvailableExecutor
();
if
(
executor
!==
undefined
)
{
this
.
machineCopyExpCodeDirPromiseMap
.
set
(
rmMeta
,
executor
.
copyDirectoryToRemote
(
remoteMachineTrailConfig
.
codeDir
,
this
.
remoteExpCodeDir
,
this
.
remoteOS
)
);
}
}
}
catch
(
error
)
{
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
this
.
log
.
error
(
error
);
...
@@ -480,6 +495,10 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -480,6 +495,10 @@ class RemoteMachineTrainingService implements TrainingService {
const
trialWorkingFolder
:
string
=
unixPathJoin
(
this
.
remoteExpRootDir
,
'
trials
'
,
trialJobId
);
const
trialWorkingFolder
:
string
=
unixPathJoin
(
this
.
remoteExpRootDir
,
'
trials
'
,
trialJobId
);
trialJobDetail
.
rmMeta
=
rmScheduleInfo
.
rmMeta
;
trialJobDetail
.
rmMeta
=
rmScheduleInfo
.
rmMeta
;
const
copyExpCodeDirPromise
=
this
.
machineCopyExpCodeDirPromiseMap
.
get
(
trialJobDetail
.
rmMeta
);
if
(
copyExpCodeDirPromise
!==
undefined
)
{
await
copyExpCodeDirPromise
;
}
await
this
.
allocateExecutorForTrial
(
trialJobDetail
);
await
this
.
allocateExecutorForTrial
(
trialJobDetail
);
await
this
.
launchTrialOnScheduledMachine
(
await
this
.
launchTrialOnScheduledMachine
(
...
@@ -554,6 +573,7 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -554,6 +573,7 @@ class RemoteMachineTrainingService implements TrainingService {
getExperimentId
(),
getExperimentId
(),
trialJobDetail
.
form
.
sequenceId
.
toString
(),
trialJobDetail
.
form
.
sequenceId
.
toString
(),
this
.
isMultiPhase
,
this
.
isMultiPhase
,
this
.
remoteExpCodeDir
,
unixPathJoin
(
trialWorkingFolder
,
'
.nni
'
,
'
jobpid
'
),
unixPathJoin
(
trialWorkingFolder
,
'
.nni
'
,
'
jobpid
'
),
command
,
command
,
nniManagerIp
,
nniManagerIp
,
...
@@ -565,12 +585,8 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -565,12 +585,8 @@ class RemoteMachineTrainingService implements TrainingService {
//create tmp trial working folder locally.
//create tmp trial working folder locally.
await
execMkdir
(
path
.
join
(
trialLocalTempFolder
,
'
.nni
'
));
await
execMkdir
(
path
.
join
(
trialLocalTempFolder
,
'
.nni
'
));
// Write install_nni.sh
//create tmp trial working folder locally.
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
install_nni.sh
'
),
CONTAINER_INSTALL_NNI_SHELL_FORMAT
,
{
encoding
:
'
utf8
'
});
await
execCopydir
(
this
.
trialConfig
.
codeDir
,
trialLocalTempFolder
);
const
installScriptContent
:
string
=
CONTAINER_INSTALL_NNI_SHELL_FORMAT
;
// Write NNI installation file to local tmp files
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
install_nni.sh
'
),
installScriptContent
,
{
encoding
:
'
utf8
'
});
// Write file content ( run.sh and parameter.cfg ) to local tmp files
// Write file content ( run.sh and parameter.cfg ) to local tmp files
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
run.sh
'
),
runScriptTrialContent
,
{
encoding
:
'
utf8
'
});
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
run.sh
'
),
runScriptTrialContent
,
{
encoding
:
'
utf8
'
});
await
this
.
writeParameterFile
(
trialJobId
,
form
.
hyperParameters
);
await
this
.
writeParameterFile
(
trialJobId
,
form
.
hyperParameters
);
...
...
src/nni_manager/training_service/remote_machine/shellExecutor.ts
View file @
d7456c16
...
@@ -183,13 +183,14 @@ class ShellExecutor {
...
@@ -183,13 +183,14 @@ class ShellExecutor {
* Copy files and directories in local directory recursively to remote directory
* Copy files and directories in local directory recursively to remote directory
* @param localDirectory local directory
* @param localDirectory local directory
* @param remoteDirectory remote directory
* @param remoteDirectory remote directory
* @param
sshClient SSH client
* @param
remoteOS the OS of remote machine
*/
*/
public
async
copyDirectoryToRemote
(
localDirectory
:
string
,
remoteDirectory
:
string
,
remoteOS
:
string
):
Promise
<
void
>
{
public
async
copyDirectoryToRemote
(
localDirectory
:
string
,
remoteDirectory
:
string
,
remoteOS
:
string
):
Promise
<
void
>
{
const
tmpSuffix
:
string
=
uniqueString
(
5
);
const
tmpSuffix
:
string
=
uniqueString
(
5
);
const
localTarPath
:
string
=
path
.
join
(
os
.
tmpdir
(),
`nni_tmp_local_
${
tmpSuffix
}
.tar.gz`
);
const
localTarPath
:
string
=
path
.
join
(
os
.
tmpdir
(),
`nni_tmp_local_
${
tmpSuffix
}
.tar.gz`
);
const
remoteTarPath
:
string
=
unixPathJoin
(
getRemoteTmpDir
(
remoteOS
),
`nni_tmp_remote_
${
tmpSuffix
}
.tar.gz`
);
const
remoteTarPath
:
string
=
unixPathJoin
(
getRemoteTmpDir
(
remoteOS
),
`nni_tmp_remote_
${
tmpSuffix
}
.tar.gz`
);
// Create remote directory
await
this
.
createFolder
(
remoteDirectory
);
// Compress files in local directory into a temporary tar.gz under the local OS temp directory
// Compress files in local directory into a temporary tar.gz under the local OS temp directory
await
tarAdd
(
localTarPath
,
localDirectory
);
await
tarAdd
(
localTarPath
,
localDirectory
);
// Copy the compressed file to remoteDirectory and delete it
// Copy the compressed file to remoteDirectory and delete it
...
...
test/nni_test/nnitest/run_tests.py
View file @
d7456c16
...
@@ -168,6 +168,7 @@ def launch_test(config_file, training_service, test_case_config):
...
@@ -168,6 +168,7 @@ def launch_test(config_file, training_service, test_case_config):
trial_stats
=
get_trial_stats
(
TRIAL_JOBS_URL
)
trial_stats
=
get_trial_stats
(
TRIAL_JOBS_URL
)
print
(
json
.
dumps
(
trial_stats
,
indent
=
4
),
flush
=
True
)
print
(
json
.
dumps
(
trial_stats
,
indent
=
4
),
flush
=
True
)
if
status
!=
'DONE'
or
trial_stats
[
'SUCCEEDED'
]
+
trial_stats
[
'EARLY_STOPPED'
]
<
max_trial_num
:
if
status
!=
'DONE'
or
trial_stats
[
'SUCCEEDED'
]
+
trial_stats
[
'EARLY_STOPPED'
]
<
max_trial_num
:
print_experiment_log
(
experiment_id
=
experiment_id
)
print_trial_job_log
(
training_service
,
TRIAL_JOBS_URL
)
print_trial_job_log
(
training_service
,
TRIAL_JOBS_URL
)
raise
AssertionError
(
'Failed to finish in maxExecDuration'
)
raise
AssertionError
(
'Failed to finish in maxExecDuration'
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment