Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
d7456c16
"...mlperf_bert_paddle.git" did not exist on "581b8d151de478bcfe18caf5d707c2fb1915fb6e"
Unverified
Commit
d7456c16
authored
May 19, 2020
by
SparkSnail
Committed by
GitHub
May 19, 2020
Browse files
Refactor code storage logic for trial (#2403)
parent
bd7edf36
Changes
11
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
149 additions
and
120 deletions
+149
-120
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts
...frameworkcontroller/frameworkcontrollerTrainingService.ts
+28
-30
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts
...ng_service/kubernetes/kubeflow/kubeflowTrainingService.ts
+17
-19
src/nni_manager/training_service/kubernetes/kubernetesData.ts
...nni_manager/training_service/kubernetes/kubernetesData.ts
+2
-2
src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts
.../training_service/kubernetes/kubernetesTrainingService.ts
+26
-29
src/nni_manager/training_service/local/localTrainingService.ts
...ni_manager/training_service/local/localTrainingService.ts
+23
-16
src/nni_manager/training_service/pai/paiK8S/paiK8SData.ts
src/nni_manager/training_service/pai/paiK8S/paiK8SData.ts
+3
-4
src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts
...ager/training_service/pai/paiK8S/paiK8STrainingService.ts
+16
-5
src/nni_manager/training_service/remote_machine/remoteMachineData.ts
...ager/training_service/remote_machine/remoteMachineData.ts
+6
-5
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
...ng_service/remote_machine/remoteMachineTrainingService.ts
+24
-8
src/nni_manager/training_service/remote_machine/shellExecutor.ts
..._manager/training_service/remote_machine/shellExecutor.ts
+3
-2
test/nni_test/nnitest/run_tests.py
test/nni_test/nnitest/run_tests.py
+1
-0
No files found.
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts
View file @
d7456c16
...
@@ -3,6 +3,7 @@
...
@@ -3,6 +3,7 @@
'
use strict
'
;
'
use strict
'
;
import
*
as
assert
from
'
assert
'
;
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
path
from
'
path
'
;
import
*
as
path
from
'
path
'
;
...
@@ -72,6 +73,11 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
...
@@ -72,6 +73,11 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
this
.
kubernetesRestServerPort
=
restServer
.
clusterRestServerPort
;
this
.
kubernetesRestServerPort
=
restServer
.
clusterRestServerPort
;
}
}
// wait upload of code Dir to finish
if
(
this
.
copyExpCodeDirPromise
!==
undefined
)
{
await
this
.
copyExpCodeDirPromise
;
}
const
trialJobId
:
string
=
uniqueString
(
5
);
const
trialJobId
:
string
=
uniqueString
(
5
);
// Set trial's NFS working folder
// Set trial's NFS working folder
const
trialWorkingFolder
:
string
=
path
.
join
(
this
.
CONTAINER_MOUNT_PATH
,
'
nni
'
,
getExperimentId
(),
trialJobId
);
const
trialWorkingFolder
:
string
=
path
.
join
(
this
.
CONTAINER_MOUNT_PATH
,
'
nni
'
,
getExperimentId
(),
trialJobId
);
...
@@ -81,8 +87,8 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
...
@@ -81,8 +87,8 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
this
.
generateContainerPort
();
this
.
generateContainerPort
();
await
this
.
prepareRunScript
(
trialLocalTempFolder
,
trialJobId
,
trialWorkingFolder
,
form
);
await
this
.
prepareRunScript
(
trialLocalTempFolder
,
trialJobId
,
trialWorkingFolder
,
form
);
//upload
code files
//
wait
upload
of script files to finish
const
trialJobOutputUrl
:
string
=
await
this
.
upload
CodeFiles
(
trialJobId
,
trialLocalTempFolder
);
const
trialJobOutputUrl
:
string
=
await
this
.
upload
Folder
(
trialLocalTempFolder
,
`nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
);
let
initStatus
:
TrialJobStatus
=
'
WAITING
'
;
let
initStatus
:
TrialJobStatus
=
'
WAITING
'
;
if
(
!
trialJobOutputUrl
)
{
if
(
!
trialJobOutputUrl
)
{
initStatus
=
'
FAILED
'
;
initStatus
=
'
FAILED
'
;
...
@@ -151,6 +157,8 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
...
@@ -151,6 +157,8 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
// Validate to make sure codeDir doesn't have too many files
// Validate to make sure codeDir doesn't have too many files
try
{
try
{
await
validateCodeDir
(
this
.
fcTrialConfig
.
codeDir
);
await
validateCodeDir
(
this
.
fcTrialConfig
.
codeDir
);
//upload codeDir to storage
this
.
copyExpCodeDirPromise
=
this
.
uploadFolder
(
this
.
fcTrialConfig
.
codeDir
,
`nni/
${
getExperimentId
()}
/nni-code`
);
}
catch
(
error
)
{
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
this
.
log
.
error
(
error
);
...
@@ -171,41 +179,31 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
...
@@ -171,41 +179,31 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
}
}
/**
/**
* upload code files to nfs or azureStroage
* upload local folder to nfs or azureStroage
* @param trialJobId
* @param trialLocalTempFolder
* return: trialJobOutputUrl
*/
*/
private
async
upload
CodeFiles
(
trialJobId
:
string
,
trialLocalTempFolder
:
string
):
Promise
<
string
>
{
private
async
upload
Folder
(
srcDirectory
:
string
,
destDirectory
:
string
):
Promise
<
string
>
{
if
(
this
.
fcClusterConfig
===
undefined
)
{
if
(
this
.
fcClusterConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
}
}
if
(
this
.
fcTrialConfig
===
undefined
)
{
assert
(
this
.
fcClusterConfig
.
storage
===
undefined
throw
new
Error
(
'
Kubeflow trial config is not initialized
'
);
||
this
.
fcClusterConfig
.
storage
===
'
azureStorage
'
}
||
this
.
fcClusterConfig
.
storage
===
'
nfs
'
);
let
trialJobOutputUrl
:
string
=
''
;
if
(
this
.
fcClusterConfig
.
storageType
===
'
azureStorage
'
)
{
if
(
this
.
fcClusterConfig
.
storage
===
'
azureStorage
'
)
{
const
azureFrameworkControllerClusterConfig
:
FrameworkControllerClusterConfigAzure
=
if
(
this
.
azureStorageClient
===
undefined
)
{
<
FrameworkControllerClusterConfigAzure
>
this
.
fcClusterConfig
;
throw
new
Error
(
'
azureStorageClient is not initialized
'
);
trialJobOutputUrl
=
await
this
.
uploadFilesToAzureStorage
(
trialJobId
,
trialLocalTempFolder
,
this
.
fcTrialConfig
.
codeDir
,
}
azureFrameworkControllerClusterConfig
.
uploadRetryCount
);
const
fcClusterConfigAzure
:
FrameworkControllerClusterConfigAzure
=
<
FrameworkControllerClusterConfigAzure
>
this
.
fcClusterConfig
;
}
else
if
(
this
.
fcClusterConfig
.
storageType
===
'
nfs
'
)
{
return
await
this
.
uploadFolderToAzureStorage
(
srcDirectory
,
destDirectory
,
fcClusterConfigAzure
.
uploadRetryCount
);
const
nfsFrameworkControllerClusterConfig
:
FrameworkControllerClusterConfigNFS
=
}
else
if
(
this
.
fcClusterConfig
.
storage
===
'
nfs
'
||
this
.
fcClusterConfig
.
storage
===
undefined
)
{
<
FrameworkControllerClusterConfigNFS
>
this
.
fcClusterConfig
;
await
cpp
.
exec
(
`mkdir -p
${
this
.
trialLocalNFSTempFolder
}
/
${
destDirectory
}
`
);
// Creat work dir for current trial in NFS directory
await
cpp
.
exec
(
`cp -r
${
srcDirectory
}
/*
${
this
.
trialLocalNFSTempFolder
}
/
${
destDirectory
}
/.`
);
await
cpp
.
exec
(
`mkdir -p
${
this
.
trialLocalNFSTempFolder
}
/nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
);
const
fcClusterConfigNFS
:
FrameworkControllerClusterConfigNFS
=
<
FrameworkControllerClusterConfigNFS
>
this
.
fcClusterConfig
;
// Copy code files from local dir to NFS mounted dir
const
nfsConfig
:
NFSConfig
=
fcClusterConfigNFS
.
nfs
;
await
cpp
.
exec
(
`cp -r
${
trialLocalTempFolder
}
/*
${
this
.
trialLocalNFSTempFolder
}
/nni/
${
getExperimentId
()}
/
${
trialJobId
}
/.`
);
return
`nfs://
${
nfsConfig
.
server
}
:
${
destDirectory
}
`
;
// Copy codeDir to NFS mounted dir
await
cpp
.
exec
(
`cp -r
${
this
.
fcTrialConfig
.
codeDir
}
/*
${
this
.
trialLocalNFSTempFolder
}
/nni/
${
getExperimentId
()}
/
${
trialJobId
}
/.`
);
const
nfsConfig
:
NFSConfig
=
nfsFrameworkControllerClusterConfig
.
nfs
;
trialJobOutputUrl
=
`nfs://
${
nfsConfig
.
server
}
:
${
path
.
join
(
nfsConfig
.
path
,
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
;
}
}
return
''
;
return
Promise
.
resolve
(
trialJobOutputUrl
);
}
}
/**
/**
...
...
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts
View file @
d7456c16
...
@@ -74,14 +74,20 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -74,14 +74,20 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
const
restServer
:
KubeflowJobRestServer
=
component
.
get
(
KubeflowJobRestServer
);
const
restServer
:
KubeflowJobRestServer
=
component
.
get
(
KubeflowJobRestServer
);
this
.
kubernetesRestServerPort
=
restServer
.
clusterRestServerPort
;
this
.
kubernetesRestServerPort
=
restServer
.
clusterRestServerPort
;
}
}
// upload code Dir to storage
if
(
this
.
copyExpCodeDirPromise
!==
undefined
)
{
await
this
.
copyExpCodeDirPromise
;
}
const
trialJobId
:
string
=
uniqueString
(
5
);
const
trialJobId
:
string
=
uniqueString
(
5
);
const
trialWorkingFolder
:
string
=
path
.
join
(
this
.
CONTAINER_MOUNT_PATH
,
'
nni
'
,
getExperimentId
(),
trialJobId
);
const
trialWorkingFolder
:
string
=
path
.
join
(
this
.
CONTAINER_MOUNT_PATH
,
'
nni
'
,
getExperimentId
(),
trialJobId
);
const
kubeflowJobName
:
string
=
`nni-exp-
${
this
.
experimentId
}
-trial-
${
trialJobId
}
`
.
toLowerCase
();
const
kubeflowJobName
:
string
=
`nni-exp-
${
this
.
experimentId
}
-trial-
${
trialJobId
}
`
.
toLowerCase
();
const
trialLocalTempFolder
:
string
=
path
.
join
(
getExperimentRootDir
(),
'
trials-local
'
,
trialJobId
);
const
trialLocalTempFolder
:
string
=
path
.
join
(
getExperimentRootDir
(),
'
trials-local
'
,
trialJobId
);
//prepare the runscript
//prepare the runscript
await
this
.
prepareRunScript
(
trialLocalTempFolder
,
trialJobId
,
trialWorkingFolder
,
form
);
await
this
.
prepareRunScript
(
trialLocalTempFolder
,
trialJobId
,
trialWorkingFolder
,
form
);
//upload files to sotrage
//upload
script
files to sotrage
const
trialJobOutputUrl
:
string
=
await
this
.
upload
CodeFiles
(
trialJobId
,
trialLocalTempFolder
);
const
trialJobOutputUrl
:
string
=
await
this
.
upload
Folder
(
trialLocalTempFolder
,
`nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
);
let
initStatus
:
TrialJobStatus
=
'
WAITING
'
;
let
initStatus
:
TrialJobStatus
=
'
WAITING
'
;
if
(
!
trialJobOutputUrl
)
{
if
(
!
trialJobOutputUrl
)
{
initStatus
=
'
FAILED
'
;
initStatus
=
'
FAILED
'
;
...
@@ -152,6 +158,8 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -152,6 +158,8 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
// Validate to make sure codeDir doesn't have too many files
// Validate to make sure codeDir doesn't have too many files
try
{
try
{
await
validateCodeDir
(
this
.
kubeflowTrialConfig
.
codeDir
);
await
validateCodeDir
(
this
.
kubeflowTrialConfig
.
codeDir
);
//upload codeDir to storage
this
.
copyExpCodeDirPromise
=
this
.
uploadFolder
(
this
.
kubeflowTrialConfig
.
codeDir
,
`nni/
${
getExperimentId
()}
/nni-code`
);
}
catch
(
error
)
{
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
this
.
log
.
error
(
error
);
...
@@ -172,12 +180,9 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -172,12 +180,9 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
}
}
/**
/**
* upload code files to nfs or azureStroage
* upload local folder to nfs or azureStroage
* @param trialJobId
* @param trialLocalTempFolder
* return: trialJobOutputUrl
*/
*/
private
async
upload
CodeFiles
(
trialJobId
:
string
,
trialLocalTempFolder
:
string
):
Promise
<
string
>
{
private
async
upload
Folder
(
srcDirectory
:
string
,
destDirectory
:
string
):
Promise
<
string
>
{
if
(
this
.
kubeflowClusterConfig
===
undefined
)
{
if
(
this
.
kubeflowClusterConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
}
}
...
@@ -186,8 +191,6 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -186,8 +191,6 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
throw
new
Error
(
'
Kubeflow Trial config is not initialized
'
);
throw
new
Error
(
'
Kubeflow Trial config is not initialized
'
);
}
}
let
trialJobOutputUrl
:
string
=
''
;
assert
(
this
.
kubeflowClusterConfig
.
storage
===
undefined
assert
(
this
.
kubeflowClusterConfig
.
storage
===
undefined
||
this
.
kubeflowClusterConfig
.
storage
===
'
azureStorage
'
||
this
.
kubeflowClusterConfig
.
storage
===
'
azureStorage
'
||
this
.
kubeflowClusterConfig
.
storage
===
'
nfs
'
);
||
this
.
kubeflowClusterConfig
.
storage
===
'
nfs
'
);
...
@@ -197,20 +200,15 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -197,20 +200,15 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
throw
new
Error
(
'
azureStorageClient is not initialized
'
);
throw
new
Error
(
'
azureStorageClient is not initialized
'
);
}
}
const
azureKubeflowClusterConfig
:
KubeflowClusterConfigAzure
=
<
KubeflowClusterConfigAzure
>
this
.
kubeflowClusterConfig
;
const
azureKubeflowClusterConfig
:
KubeflowClusterConfigAzure
=
<
KubeflowClusterConfigAzure
>
this
.
kubeflowClusterConfig
;
trialJobOutputUrl
=
await
this
.
uploadF
iles
ToAzureStorage
(
trialJobId
,
trialLocalTempFolder
,
this
.
kubeflowTrialConfig
.
codeDir
,
azureKubeflowClusterConfig
.
uploadRetryCount
);
return
await
this
.
uploadF
older
ToAzureStorage
(
srcDirectory
,
destDirectory
,
azureKubeflowClusterConfig
.
uploadRetryCount
);
}
else
if
(
this
.
kubeflowClusterConfig
.
storage
===
'
nfs
'
||
this
.
kubeflowClusterConfig
.
storage
===
undefined
)
{
}
else
if
(
this
.
kubeflowClusterConfig
.
storage
===
'
nfs
'
||
this
.
kubeflowClusterConfig
.
storage
===
undefined
)
{
await
cpp
.
exec
(
`mkdir -p
${
this
.
trialLocalNFSTempFolder
}
/
${
destDirectory
}
`
);
await
cpp
.
exec
(
`cp -r
${
srcDirectory
}
/*
${
this
.
trialLocalNFSTempFolder
}
/
${
destDirectory
}
/.`
);
const
nfsKubeflowClusterConfig
:
KubeflowClusterConfigNFS
=
<
KubeflowClusterConfigNFS
>
this
.
kubeflowClusterConfig
;
const
nfsKubeflowClusterConfig
:
KubeflowClusterConfigNFS
=
<
KubeflowClusterConfigNFS
>
this
.
kubeflowClusterConfig
;
// Creat work dir for current trial in NFS directory
await
cpp
.
exec
(
`mkdir -p
${
this
.
trialLocalNFSTempFolder
}
/nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
);
// Copy script files from local dir to NFS mounted dir
await
cpp
.
exec
(
`cp -r
${
trialLocalTempFolder
}
/*
${
this
.
trialLocalNFSTempFolder
}
/nni/
${
getExperimentId
()}
/
${
trialJobId
}
/.`
);
// Copy codeDir to NFS mounted dir
await
cpp
.
exec
(
`cp -r
${
this
.
kubeflowTrialConfig
.
codeDir
}
/*
${
this
.
trialLocalNFSTempFolder
}
/nni/
${
getExperimentId
()}
/
${
trialJobId
}
/.`
);
const
nfsConfig
:
NFSConfig
=
nfsKubeflowClusterConfig
.
nfs
;
const
nfsConfig
:
NFSConfig
=
nfsKubeflowClusterConfig
.
nfs
;
trialJobOutputUrl
=
`nfs://
${
nfsConfig
.
server
}
:
${
path
.
join
(
nfsConfig
.
path
,
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)
}
`
;
return
`nfs://
${
nfsConfig
.
server
}
:
${
destDirectory
}
`
;
}
}
return
''
;
return
Promise
.
resolve
(
trialJobOutputUrl
);
}
}
private
async
prepareRunScript
(
trialLocalTempFolder
:
string
,
trialJobId
:
string
,
trialWorkingFolder
:
string
,
private
async
prepareRunScript
(
trialLocalTempFolder
:
string
,
trialJobId
:
string
,
trialWorkingFolder
:
string
,
...
...
src/nni_manager/training_service/kubernetes/kubernetesData.ts
View file @
d7456c16
...
@@ -39,7 +39,7 @@ export class KubernetesTrialJobDetail implements TrialJobDetail {
...
@@ -39,7 +39,7 @@ export class KubernetesTrialJobDetail implements TrialJobDetail {
export
const
kubernetesScriptFormat
:
string
=
export
const
kubernetesScriptFormat
:
string
=
`#!/bin/bash
`#!/bin/bash
export NNI_PLATFORM={0}
export NNI_PLATFORM={0}
export NNI_SYS_DIR=
$PWD/nni/
{1}
export NNI_SYS_DIR={1}
export NNI_OUTPUT_DIR={2}
export NNI_OUTPUT_DIR={2}
export MULTI_PHASE=false
export MULTI_PHASE=false
export NNI_TRIAL_JOB_ID={3}
export NNI_TRIAL_JOB_ID={3}
...
@@ -49,7 +49,7 @@ export NNI_TRIAL_SEQ_ID={6}
...
@@ -49,7 +49,7 @@ export NNI_TRIAL_SEQ_ID={6}
{7}
{7}
mkdir -p $NNI_SYS_DIR
mkdir -p $NNI_SYS_DIR
mkdir -p $NNI_OUTPUT_DIR
mkdir -p $NNI_OUTPUT_DIR
cp -r
T
$NNI_CODE_DIR $NNI_SYS_DIR
cp -r $NNI_CODE_DIR
/.
$NNI_SYS_DIR
cd $NNI_SYS_DIR
cd $NNI_SYS_DIR
sh install_nni.sh
sh install_nni.sh
python3 -m nni_trial_tool.trial_keeper --trial_command '{8}' --nnimanager_ip {9} --nnimanager_port {10} \
python3 -m nni_trial_tool.trial_keeper --trial_command '{8}' --nnimanager_ip {9} --nnimanager_port {10} \
...
...
src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts
View file @
d7456c16
...
@@ -49,6 +49,8 @@ abstract class KubernetesTrainingService {
...
@@ -49,6 +49,8 @@ abstract class KubernetesTrainingService {
protected
kubernetesClusterConfig
?:
KubernetesClusterConfig
;
protected
kubernetesClusterConfig
?:
KubernetesClusterConfig
;
protected
versionCheck
:
boolean
=
true
;
protected
versionCheck
:
boolean
=
true
;
protected
logCollection
:
string
;
protected
logCollection
:
string
;
protected
copyExpCodeDirPromise
?:
Promise
<
string
>
;
protected
expContainerCodeFolder
:
string
;
constructor
()
{
constructor
()
{
this
.
log
=
getLogger
();
this
.
log
=
getLogger
();
...
@@ -57,6 +59,7 @@ abstract class KubernetesTrainingService {
...
@@ -57,6 +59,7 @@ abstract class KubernetesTrainingService {
this
.
trialLocalNFSTempFolder
=
path
.
join
(
getExperimentRootDir
(),
'
trials-nfs-tmp
'
);
this
.
trialLocalNFSTempFolder
=
path
.
join
(
getExperimentRootDir
(),
'
trials-nfs-tmp
'
);
this
.
experimentId
=
getExperimentId
();
this
.
experimentId
=
getExperimentId
();
this
.
CONTAINER_MOUNT_PATH
=
'
/tmp/mount
'
;
this
.
CONTAINER_MOUNT_PATH
=
'
/tmp/mount
'
;
this
.
expContainerCodeFolder
=
path
.
join
(
this
.
CONTAINER_MOUNT_PATH
,
'
nni
'
,
this
.
experimentId
,
'
nni-code
'
);
this
.
genericK8sClient
=
new
GeneralK8sClient
();
this
.
genericK8sClient
=
new
GeneralK8sClient
();
this
.
logCollection
=
'
none
'
;
this
.
logCollection
=
'
none
'
;
}
}
...
@@ -272,11 +275,11 @@ abstract class KubernetesTrainingService {
...
@@ -272,11 +275,11 @@ abstract class KubernetesTrainingService {
const
runScript
:
string
=
String
.
Format
(
const
runScript
:
string
=
String
.
Format
(
kubernetesScriptFormat
,
kubernetesScriptFormat
,
platform
,
platform
,
trial
JobId
,
trial
WorkingFolder
,
path
.
join
(
trialWorkingFolder
,
'
output
'
,
`
${
roleName
}
_output`
),
path
.
join
(
trialWorkingFolder
,
'
output
'
,
`
${
roleName
}
_output`
),
trialJobId
,
trialJobId
,
getExperimentId
(),
getExperimentId
(),
t
rialWorking
Folder
,
t
his
.
expContainerCode
Folder
,
trialSequenceId
,
trialSequenceId
,
nvidiaScript
,
nvidiaScript
,
command
,
command
,
...
@@ -329,51 +332,45 @@ abstract class KubernetesTrainingService {
...
@@ -329,51 +332,45 @@ abstract class KubernetesTrainingService {
);
);
return
registrySecretName
;
return
registrySecretName
;
}
}
protected
async
uploadFilesToAzureStorage
(
trialJobId
:
string
,
trialLocalTempFolder
:
string
,
codeDir
:
string
,
uploadRetryCount
:
number
|
undefined
):
Promise
<
string
>
{
/**
* upload local directory to azureStorage
* @param srcDirectory the source directory of local folder
* @param destDirectory the target directory in azure
* @param uploadRetryCount the retry time when upload failed
*/
protected
async
uploadFolderToAzureStorage
(
srcDirectory
:
string
,
destDirectory
:
string
,
uploadRetryCount
:
number
|
undefined
):
Promise
<
string
>
{
if
(
this
.
azureStorageClient
===
undefined
)
{
if
(
this
.
azureStorageClient
===
undefined
)
{
throw
new
Error
(
'
azureStorageClient is not initialized
'
);
throw
new
Error
(
'
azureStorageClient is not initialized
'
);
}
}
let
trialJobOutputUrl
:
string
=
''
;
let
retryCount
:
number
=
1
;
let
retryCount
:
number
=
1
;
if
(
uploadRetryCount
)
{
if
(
uploadRetryCount
)
{
retryCount
=
uploadRetryCount
;
retryCount
=
uploadRetryCount
;
}
}
let
resultUploadNNIScript
:
boolean
=
false
;
let
uploadSuccess
:
boolean
=
false
;
let
resultUploadCodeFile
:
boolean
=
false
;
let
folderUriInAzure
=
''
;
try
{
try
{
do
{
do
{
//upload local files, including scripts for running the trial and configuration (e.g., hyperparameters) for the trial, to azure storage
uploadSuccess
=
await
AzureStorageClientUtility
.
uploadDirectory
(
if
(
!
resultUploadNNIScript
)
{
this
.
azureStorageClient
,
resultUploadNNIScript
=
await
AzureStorageClientUtility
.
uploadDirectory
(
this
.
azureStorageClient
,
`
${
destDirectory
}
`
,
`nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
,
this
.
azureStorageShare
,
this
.
azureStorageShare
,
`
${
trialLocalTempFolder
}
`
);
`
${
srcDirectory
}
`
);
}
if
(
!
uploadSuccess
)
{
//upload code files to azure storage
if
(
!
resultUploadCodeFile
)
{
resultUploadCodeFile
=
await
AzureStorageClientUtility
.
uploadDirectory
(
this
.
azureStorageClient
,
`nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
,
this
.
azureStorageShare
,
`
${
codeDir
}
`
);
}
if
(
resultUploadNNIScript
&&
resultUploadCodeFile
)
{
trialJobOutputUrl
=
`https://
${
this
.
azureStorageAccountName
}
.file.core.windows.net/
${
this
.
azureStorageShare
}
`
+
`/
${
path
.
join
(
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
;
break
;
}
else
{
//wait for 5 seconds to re-upload files
//wait for 5 seconds to re-upload files
await
delay
(
5000
);
await
delay
(
5000
);
this
.
log
.
info
(
'
Upload failed, Retry: upload files to azure-storage
'
);
this
.
log
.
info
(
'
Upload failed, Retry: upload files to azure-storage
'
);
}
else
{
folderUriInAzure
=
`https://
${
this
.
azureStorageAccountName
}
.file.core.windows.net/
${
this
.
azureStorageShare
}
/
${
destDirectory
}
`
;
break
;
}
}
}
while
(
retryCount
--
>=
0
)
}
while
(
retryCount
--
>=
0
)
}
catch
(
error
)
{
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
this
.
log
.
error
(
error
);
//return a empty url when got error
//return a empty url when got error
return
Promise
.
resolve
(
""
);
return
Promise
.
resolve
(
''
);
}
if
(
!
trialJobOutputUrl
)
{
this
.
log
.
info
(
`Retry-count is used up, upload files to azureStorage for trial
${
trialJobId
}
failed!`
);
}
}
return
Promise
.
resolve
(
trialJobOutputUrl
);
return
Promise
.
resolve
(
folderUriInAzure
);
}
}
}
}
...
...
src/nni_manager/training_service/local/localTrainingService.ts
View file @
d7456c16
...
@@ -361,21 +361,25 @@ class LocalTrainingService implements TrainingService {
...
@@ -361,21 +361,25 @@ class LocalTrainingService implements TrainingService {
trialJobDetail
:
TrialJobDetail
,
trialJobDetail
:
TrialJobDetail
,
resource
:
{
gpuIndices
:
number
[]
},
resource
:
{
gpuIndices
:
number
[]
},
gpuNum
:
number
|
undefined
):
{
key
:
string
;
value
:
string
}[]
{
gpuNum
:
number
|
undefined
):
{
key
:
string
;
value
:
string
}[]
{
const
envVariables
:
{
key
:
string
;
value
:
string
}[]
=
[
if
(
this
.
localTrialConfig
===
undefined
)
{
{
key
:
'
NNI_PLATFORM
'
,
value
:
'
local
'
},
throw
new
Error
(
'
localTrialConfig is not initialized!
'
);
{
key
:
'
NNI_EXP_ID
'
,
value
:
this
.
experimentId
},
}
{
key
:
'
NNI_SYS_DIR
'
,
value
:
trialJobDetail
.
workingDirectory
},
const
envVariables
:
{
key
:
string
;
value
:
string
}[]
=
[
{
key
:
'
NNI_TRIAL_JOB_ID
'
,
value
:
trialJobDetail
.
id
},
{
key
:
'
NNI_PLATFORM
'
,
value
:
'
local
'
},
{
key
:
'
NNI_OUTPUT_DIR
'
,
value
:
trialJobDetail
.
workingDirectory
},
{
key
:
'
NNI_EXP_ID
'
,
value
:
this
.
experimentId
},
{
key
:
'
NNI_TRIAL_SEQ_ID
'
,
value
:
trialJobDetail
.
form
.
sequenceId
.
toString
()
},
{
key
:
'
NNI_SYS_DIR
'
,
value
:
trialJobDetail
.
workingDirectory
},
{
key
:
'
MULTI_PHASE
'
,
value
:
this
.
isMultiPhase
.
toString
()
}
{
key
:
'
NNI_TRIAL_JOB_ID
'
,
value
:
trialJobDetail
.
id
},
];
{
key
:
'
NNI_OUTPUT_DIR
'
,
value
:
trialJobDetail
.
workingDirectory
},
if
(
gpuNum
!==
undefined
)
{
{
key
:
'
NNI_TRIAL_SEQ_ID
'
,
value
:
trialJobDetail
.
form
.
sequenceId
.
toString
()
},
envVariables
.
push
({
{
key
:
'
MULTI_PHASE
'
,
value
:
this
.
isMultiPhase
.
toString
()
},
key
:
'
CUDA_VISIBLE_DEVICES
'
,
{
key
:
'
NNI_CODE_DIR
'
,
value
:
this
.
localTrialConfig
.
codeDir
}
value
:
this
.
gpuScheduler
===
undefined
?
'
-1
'
:
resource
.
gpuIndices
.
join
(
'
,
'
)
];
});
if
(
gpuNum
!==
undefined
)
{
}
envVariables
.
push
({
key
:
'
CUDA_VISIBLE_DEVICES
'
,
value
:
this
.
gpuScheduler
===
undefined
?
'
-1
'
:
resource
.
gpuIndices
.
join
(
'
,
'
)
});
}
return
envVariables
;
return
envVariables
;
}
}
...
@@ -473,12 +477,16 @@ class LocalTrainingService implements TrainingService {
...
@@ -473,12 +477,16 @@ class LocalTrainingService implements TrainingService {
private
getScript
(
localTrialConfig
:
TrialConfig
,
workingDirectory
:
string
):
string
[]
{
private
getScript
(
localTrialConfig
:
TrialConfig
,
workingDirectory
:
string
):
string
[]
{
const
script
:
string
[]
=
[];
const
script
:
string
[]
=
[];
if
(
process
.
platform
===
'
win32
'
)
{
if
(
process
.
platform
===
'
win32
'
)
{
script
.
push
(
`Copy-Item $env:NNI_CODE_DIR\\* -Destination $env:NNI_SYS_DIR -Recurse`
);
script
.
push
(
`cd $env:NNI_SYS_DIR`
);
script
.
push
(
script
.
push
(
`cmd.exe /c
${
localTrialConfig
.
command
}
2>"
${
path
.
join
(
workingDirectory
,
'
stderr
'
)}
"`
,
`cmd.exe /c
${
localTrialConfig
.
command
}
2>"
${
path
.
join
(
workingDirectory
,
'
stderr
'
)}
"`
,
`$NOW_DATE = [int64](([datetime]::UtcNow)-(get-date "1/1/1970")).TotalSeconds`
,
`$NOW_DATE = [int64](([datetime]::UtcNow)-(get-date "1/1/1970")).TotalSeconds`
,
`$NOW_DATE = "$NOW_DATE" + (Get-Date -Format fff).ToString()`
,
`$NOW_DATE = "$NOW_DATE" + (Get-Date -Format fff).ToString()`
,
`Write $LASTEXITCODE " " $NOW_DATE | Out-File "
${
path
.
join
(
workingDirectory
,
'
.nni
'
,
'
state
'
)}
" -NoNewline -encoding utf8`
);
`Write $LASTEXITCODE " " $NOW_DATE | Out-File "
${
path
.
join
(
workingDirectory
,
'
.nni
'
,
'
state
'
)}
" -NoNewline -encoding utf8`
);
}
else
{
}
else
{
script
.
push
(
`cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR`
);
script
.
push
(
`cd $NNI_SYS_DIR`
);
script
.
push
(
`eval
${
localTrialConfig
.
command
}
2>"
${
path
.
join
(
workingDirectory
,
'
stderr
'
)}
"`
);
script
.
push
(
`eval
${
localTrialConfig
.
command
}
2>"
${
path
.
join
(
workingDirectory
,
'
stderr
'
)}
"`
);
if
(
process
.
platform
===
'
darwin
'
)
{
if
(
process
.
platform
===
'
darwin
'
)
{
// https://superuser.com/questions/599072/how-to-get-bash-execution-time-in-milliseconds-under-mac-os-x
// https://superuser.com/questions/599072/how-to-get-bash-execution-time-in-milliseconds-under-mac-os-x
...
@@ -506,7 +514,6 @@ class LocalTrainingService implements TrainingService {
...
@@ -506,7 +514,6 @@ class LocalTrainingService implements TrainingService {
if
(
process
.
platform
!==
'
win32
'
)
{
if
(
process
.
platform
!==
'
win32
'
)
{
runScriptContent
.
push
(
'
#!/bin/bash
'
);
runScriptContent
.
push
(
'
#!/bin/bash
'
);
}
}
runScriptContent
.
push
(
`cd '
${
this
.
localTrialConfig
.
codeDir
}
'`
);
for
(
const
variable
of
variables
)
{
for
(
const
variable
of
variables
)
{
runScriptContent
.
push
(
setEnvironmentVariable
(
variable
));
runScriptContent
.
push
(
setEnvironmentVariable
(
variable
));
}
}
...
...
src/nni_manager/training_service/pai/paiK8S/paiK8SData.ts
View file @
d7456c16
...
@@ -31,7 +31,6 @@ fi`;
...
@@ -31,7 +31,6 @@ fi`;
export
const
PAI_K8S_TRIAL_COMMAND_FORMAT
:
string
=
export
const
PAI_K8S_TRIAL_COMMAND_FORMAT
:
string
=
`export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4} MULTI_PHASE={5} \
`export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4} MULTI_PHASE={5} \
&& ls $NNI_SYS_DIR \
&& NNI_CODE_DIR={6} && cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR && cd $NNI_SYS_DIR && sh install_nni.sh \
&& cd $NNI_SYS_DIR && sh install_nni.sh \
&& python3 -m nni_trial_tool.trial_keeper --trial_command '{7}' --nnimanager_ip '{8}' --nnimanager_port '{9}' \
&& python3 -m nni_trial_tool.trial_keeper --trial_command '{6}' --nnimanager_ip '{7}' --nnimanager_port '{8}' \
--nni_manager_version '{10}' --log_collection '{11}'`
;
--nni_manager_version '{9}' --log_collection '{10}'`
;
src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts
View file @
d7456c16
...
@@ -53,6 +53,7 @@ const yaml = require('js-yaml');
...
@@ -53,6 +53,7 @@ const yaml = require('js-yaml');
@
component
.
Singleton
@
component
.
Singleton
class
PAIK8STrainingService
extends
PAITrainingService
{
class
PAIK8STrainingService
extends
PAITrainingService
{
protected
paiTrialConfig
:
NNIPAIK8STrialConfig
|
undefined
;
protected
paiTrialConfig
:
NNIPAIK8STrialConfig
|
undefined
;
private
copyExpCodeDirPromise
?:
Promise
<
void
>
;
private
paiJobConfig
:
undefined
;
private
paiJobConfig
:
undefined
;
private
nniVersion
:
string
|
undefined
;
private
nniVersion
:
string
|
undefined
;
constructor
()
{
constructor
()
{
...
@@ -78,7 +79,7 @@ class PAIK8STrainingService extends PAITrainingService {
...
@@ -78,7 +79,7 @@ class PAIK8STrainingService extends PAITrainingService {
}
}
break
;
break
;
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
{
if
(
this
.
paiClusterConfig
===
undefined
)
{
if
(
this
.
paiClusterConfig
===
undefined
)
{
this
.
log
.
error
(
'
pai cluster config is not initialized
'
);
this
.
log
.
error
(
'
pai cluster config is not initialized
'
);
break
;
break
;
...
@@ -86,10 +87,15 @@ class PAIK8STrainingService extends PAITrainingService {
...
@@ -86,10 +87,15 @@ class PAIK8STrainingService extends PAITrainingService {
this
.
paiTrialConfig
=
<
NNIPAIK8STrialConfig
>
JSON
.
parse
(
value
);
this
.
paiTrialConfig
=
<
NNIPAIK8STrialConfig
>
JSON
.
parse
(
value
);
// Validate to make sure codeDir doesn't have too many files
// Validate to make sure codeDir doesn't have too many files
await
validateCodeDir
(
this
.
paiTrialConfig
.
codeDir
);
await
validateCodeDir
(
this
.
paiTrialConfig
.
codeDir
);
const
nniManagerNFSExpCodeDir
=
path
.
join
(
this
.
paiTrialConfig
.
nniManagerNFSMountPath
,
this
.
experimentId
,
'
nni-code
'
);
await
execMkdir
(
nniManagerNFSExpCodeDir
);
//Copy codeDir files to local working folder
this
.
copyExpCodeDirPromise
=
execCopydir
(
this
.
paiTrialConfig
.
codeDir
,
nniManagerNFSExpCodeDir
);
if
(
this
.
paiTrialConfig
.
paiConfigPath
)
{
if
(
this
.
paiTrialConfig
.
paiConfigPath
)
{
this
.
paiJobConfig
=
yaml
.
safeLoad
(
fs
.
readFileSync
(
this
.
paiTrialConfig
.
paiConfigPath
,
'
utf8
'
));
this
.
paiJobConfig
=
yaml
.
safeLoad
(
fs
.
readFileSync
(
this
.
paiTrialConfig
.
paiConfigPath
,
'
utf8
'
));
}
}
break
;
break
;
}
case
TrialConfigMetadataKey
.
VERSION_CHECK
:
case
TrialConfigMetadataKey
.
VERSION_CHECK
:
this
.
versionCheck
=
(
value
===
'
true
'
||
value
===
'
True
'
);
this
.
versionCheck
=
(
value
===
'
true
'
||
value
===
'
True
'
);
this
.
nniVersion
=
this
.
versionCheck
?
await
getVersion
()
:
''
;
this
.
nniVersion
=
this
.
versionCheck
?
await
getVersion
()
:
''
;
...
@@ -152,6 +158,7 @@ class PAIK8STrainingService extends PAITrainingService {
...
@@ -152,6 +158,7 @@ class PAIK8STrainingService extends PAITrainingService {
if
(
this
.
paiTrialConfig
===
undefined
)
{
if
(
this
.
paiTrialConfig
===
undefined
)
{
throw
new
Error
(
'
trial config is not initialized
'
);
throw
new
Error
(
'
trial config is not initialized
'
);
}
}
const
containerNFSExpCodeDir
=
`
${
this
.
paiTrialConfig
.
containerNFSMountPath
}
/
${
this
.
experimentId
}
/'nni-code`
;
const
containerWorkingDir
:
string
=
`
${
this
.
paiTrialConfig
.
containerNFSMountPath
}
/
${
this
.
experimentId
}
/
${
trialJobDetail
.
id
}
`
;
const
containerWorkingDir
:
string
=
`
${
this
.
paiTrialConfig
.
containerNFSMountPath
}
/
${
this
.
experimentId
}
/
${
trialJobDetail
.
id
}
`
;
const
nniManagerIp
:
string
=
this
.
nniManagerIpConfig
?
this
.
nniManagerIpConfig
.
nniManagerIp
:
getIPV4Address
();
const
nniManagerIp
:
string
=
this
.
nniManagerIpConfig
?
this
.
nniManagerIpConfig
.
nniManagerIp
:
getIPV4Address
();
const
nniPaiTrialCommand
:
string
=
String
.
Format
(
const
nniPaiTrialCommand
:
string
=
String
.
Format
(
...
@@ -162,6 +169,7 @@ class PAIK8STrainingService extends PAITrainingService {
...
@@ -162,6 +169,7 @@ class PAIK8STrainingService extends PAITrainingService {
this
.
experimentId
,
this
.
experimentId
,
trialJobDetail
.
form
.
sequenceId
,
trialJobDetail
.
form
.
sequenceId
,
this
.
isMultiPhase
,
this
.
isMultiPhase
,
containerNFSExpCodeDir
,
command
,
command
,
nniManagerIp
,
nniManagerIp
,
this
.
paiRestServerPort
,
this
.
paiRestServerPort
,
...
@@ -264,15 +272,18 @@ class PAIK8STrainingService extends PAITrainingService {
...
@@ -264,15 +272,18 @@ class PAIK8STrainingService extends PAITrainingService {
throw
new
Error
(
'
paiJobRestServer is not initialized
'
);
throw
new
Error
(
'
paiJobRestServer is not initialized
'
);
}
}
// Make sure experiment code files are copied from local to NFS
if
(
this
.
copyExpCodeDirPromise
!==
undefined
)
{
await
this
.
copyExpCodeDirPromise
;
}
this
.
paiRestServerPort
=
this
.
paiJobRestServer
.
clusterRestServerPort
;
this
.
paiRestServerPort
=
this
.
paiJobRestServer
.
clusterRestServerPort
;
// Step 1. Prepare PAI job configuration
// Step 1. Prepare PAI job configuration
//create trial local working folder locally.
//create trial local working folder locally.
await
execMkdir
(
trialJobDetail
.
logPath
);
await
execMkdir
(
trialJobDetail
.
logPath
);
const
runScriptContent
:
string
=
CONTAINER_INSTALL_NNI_SHELL_FORMAT
;
// Write NNI installation file to local files
// Write NNI installation file to local files
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialJobDetail
.
logPath
,
'
install_nni.sh
'
),
runScriptContent
,
{
encoding
:
'
utf8
'
});
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialJobDetail
.
logPath
,
'
install_nni.sh
'
),
CONTAINER_INSTALL_NNI_SHELL_FORMAT
,
{
encoding
:
'
utf8
'
});
// Write file content ( parameter.cfg ) to local working folders
// Write file content ( parameter.cfg ) to local working folders
if
(
trialJobDetail
.
form
!==
undefined
)
{
if
(
trialJobDetail
.
form
!==
undefined
)
{
...
@@ -284,7 +295,7 @@ class PAIK8STrainingService extends PAITrainingService {
...
@@ -284,7 +295,7 @@ class PAIK8STrainingService extends PAITrainingService {
//Generate Job Configuration in yaml format
//Generate Job Configuration in yaml format
const
paiJobConfig
=
this
.
generateJobConfigInYamlFormat
(
trialJobDetail
);
const
paiJobConfig
=
this
.
generateJobConfigInYamlFormat
(
trialJobDetail
);
this
.
log
.
debug
(
paiJobConfig
);
this
.
log
.
debug
(
paiJobConfig
);
// Step
3
. Submit PAI job via Rest call
// Step
2
. Submit PAI job via Rest call
// Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API
// Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API
const
submitJobRequest
:
request
.
Options
=
{
const
submitJobRequest
:
request
.
Options
=
{
uri
:
`
${
this
.
protocol
}
://
${
this
.
paiClusterConfig
.
host
}
/rest-server/api/v2/jobs`
,
uri
:
`
${
this
.
protocol
}
://
${
this
.
paiClusterConfig
.
host
}
/rest-server/api/v2/jobs`
,
...
...
src/nni_manager/training_service/remote_machine/remoteMachineData.ts
View file @
d7456c16
...
@@ -179,13 +179,14 @@ export enum ScheduleResultType {
...
@@ -179,13 +179,14 @@ export enum ScheduleResultType {
export
const
REMOTEMACHINE_TRIAL_COMMAND_FORMAT
:
string
=
export
const
REMOTEMACHINE_TRIAL_COMMAND_FORMAT
:
string
=
`#!/bin/bash
`#!/bin/bash
export NNI_PLATFORM=remote NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} \
export NNI_PLATFORM=remote NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} \
NNI_TRIAL_SEQ_ID={4} export MULTI_PHASE={5}
NNI_TRIAL_SEQ_ID={4} MULTI_PHASE={5} NNI_CODE_DIR={6}
cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR
cd $NNI_SYS_DIR
cd $NNI_SYS_DIR
sh install_nni.sh
sh install_nni.sh
echo $$ >{
6
}
echo $$ >{
7
}
python3 -m nni_trial_tool.trial_keeper --trial_command '{
7
}' --nnimanager_ip '{
8
}' --nnimanager_port '{
9
}' \
python3 -m nni_trial_tool.trial_keeper --trial_command '{
8
}' --nnimanager_ip '{
9
}' --nnimanager_port '{
10
}' \
--nni_manager_version '{1
0
}' --log_collection '{1
1
}' 1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr
--nni_manager_version '{1
1
}' --log_collection '{1
2
}' 1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr
echo $?
\`
date +%s%3N
\`
>{1
2
}`
;
echo $?
\`
date +%s%3N
\`
>{1
3
}`
;
export
const
HOST_JOB_SHELL_FORMAT
:
string
=
export
const
HOST_JOB_SHELL_FORMAT
:
string
=
`#!/bin/bash
`#!/bin/bash
...
...
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
View file @
d7456c16
...
@@ -26,7 +26,7 @@ import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData';
...
@@ -26,7 +26,7 @@ import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData';
import
{
GPUSummary
}
from
'
../common/gpuData
'
;
import
{
GPUSummary
}
from
'
../common/gpuData
'
;
import
{
TrialConfig
}
from
'
../common/trialConfig
'
;
import
{
TrialConfig
}
from
'
../common/trialConfig
'
;
import
{
TrialConfigMetadataKey
}
from
'
../common/trialConfigMetadataKey
'
;
import
{
TrialConfigMetadataKey
}
from
'
../common/trialConfigMetadataKey
'
;
import
{
execCopydir
,
execMkdir
,
validateCodeDir
,
getGpuMetricsCollectorBashScriptContent
}
from
'
../common/util
'
;
import
{
execMkdir
,
validateCodeDir
,
getGpuMetricsCollectorBashScriptContent
}
from
'
../common/util
'
;
import
{
GPUScheduler
}
from
'
./gpuScheduler
'
;
import
{
GPUScheduler
}
from
'
./gpuScheduler
'
;
import
{
import
{
REMOTEMACHINE_TRIAL_COMMAND_FORMAT
,
RemoteMachineMeta
,
REMOTEMACHINE_TRIAL_COMMAND_FORMAT
,
RemoteMachineMeta
,
...
@@ -42,11 +42,13 @@ import { ShellExecutor } from 'training_service/remote_machine/shellExecutor';
...
@@ -42,11 +42,13 @@ import { ShellExecutor } from 'training_service/remote_machine/shellExecutor';
@
component
.
Singleton
@
component
.
Singleton
class
RemoteMachineTrainingService
implements
TrainingService
{
class
RemoteMachineTrainingService
implements
TrainingService
{
private
readonly
machineExecutorManagerMap
:
Map
<
RemoteMachineMeta
,
ExecutorManager
>
;
//machine executor map
private
readonly
machineExecutorManagerMap
:
Map
<
RemoteMachineMeta
,
ExecutorManager
>
;
//machine executor map
private
readonly
machineCopyExpCodeDirPromiseMap
:
Map
<
RemoteMachineMeta
,
Promise
<
void
>>
;
private
readonly
trialExecutorMap
:
Map
<
string
,
ShellExecutor
>
;
//trial executor map
private
readonly
trialExecutorMap
:
Map
<
string
,
ShellExecutor
>
;
//trial executor map
private
readonly
trialJobsMap
:
Map
<
string
,
RemoteMachineTrialJobDetail
>
;
private
readonly
trialJobsMap
:
Map
<
string
,
RemoteMachineTrialJobDetail
>
;
private
readonly
MAX_TRIAL_NUMBER_PER_EXECUTOR
:
number
=
5
;
// every executor has a max trial concurrency number
private
readonly
MAX_TRIAL_NUMBER_PER_EXECUTOR
:
number
=
5
;
// every executor has a max trial concurrency number
private
readonly
expRootDir
:
string
;
private
readonly
expRootDir
:
string
;
private
readonly
remoteExpRootDir
:
string
;
private
readonly
remoteExpRootDir
:
string
;
private
readonly
remoteExpCodeDir
:
string
;
private
trialConfig
:
TrialConfig
|
undefined
;
private
trialConfig
:
TrialConfig
|
undefined
;
private
gpuScheduler
?:
GPUScheduler
;
private
gpuScheduler
?:
GPUScheduler
;
private
readonly
jobQueue
:
string
[];
private
readonly
jobQueue
:
string
[];
...
@@ -68,9 +70,11 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -68,9 +70,11 @@ class RemoteMachineTrainingService implements TrainingService {
this
.
trialJobsMap
=
new
Map
<
string
,
RemoteMachineTrialJobDetail
>
();
this
.
trialJobsMap
=
new
Map
<
string
,
RemoteMachineTrialJobDetail
>
();
this
.
trialExecutorMap
=
new
Map
<
string
,
ShellExecutor
>
();
this
.
trialExecutorMap
=
new
Map
<
string
,
ShellExecutor
>
();
this
.
machineExecutorManagerMap
=
new
Map
<
RemoteMachineMeta
,
ExecutorManager
>
();
this
.
machineExecutorManagerMap
=
new
Map
<
RemoteMachineMeta
,
ExecutorManager
>
();
this
.
machineCopyExpCodeDirPromiseMap
=
new
Map
<
RemoteMachineMeta
,
Promise
<
void
>>
();
this
.
jobQueue
=
[];
this
.
jobQueue
=
[];
this
.
expRootDir
=
getExperimentRootDir
();
this
.
expRootDir
=
getExperimentRootDir
();
this
.
remoteExpRootDir
=
this
.
getRemoteExperimentRootDir
();
this
.
remoteExpRootDir
=
this
.
getRemoteExperimentRootDir
();
this
.
remoteExpCodeDir
=
unixPathJoin
(
this
.
remoteExpRootDir
,
'
nni-code
'
);
this
.
timer
=
timer
;
this
.
timer
=
timer
;
this
.
log
=
getLogger
();
this
.
log
=
getLogger
();
this
.
trialSequenceId
=
-
1
;
this
.
trialSequenceId
=
-
1
;
...
@@ -320,9 +324,20 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -320,9 +324,20 @@ class RemoteMachineTrainingService implements TrainingService {
throw
new
Error
(
`codeDir
${
remoteMachineTrailConfig
.
codeDir
}
is not a directory`
);
throw
new
Error
(
`codeDir
${
remoteMachineTrailConfig
.
codeDir
}
is not a directory`
);
}
}
// Validate to make sure codeDir doesn't have too many files
try
{
try
{
// Validate to make sure codeDir doesn't have too many files
await
validateCodeDir
(
remoteMachineTrailConfig
.
codeDir
);
await
validateCodeDir
(
remoteMachineTrailConfig
.
codeDir
);
// Copy codeDir to remote machine
for
(
const
[
rmMeta
,
executorManager
]
of
this
.
machineExecutorManagerMap
.
entries
())
{
const
executor
:
ShellExecutor
=
await
executorManager
.
getAvailableExecutor
();
if
(
executor
!==
undefined
)
{
this
.
machineCopyExpCodeDirPromiseMap
.
set
(
rmMeta
,
executor
.
copyDirectoryToRemote
(
remoteMachineTrailConfig
.
codeDir
,
this
.
remoteExpCodeDir
,
this
.
remoteOS
)
);
}
}
}
catch
(
error
)
{
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
this
.
log
.
error
(
error
);
...
@@ -480,6 +495,10 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -480,6 +495,10 @@ class RemoteMachineTrainingService implements TrainingService {
const
trialWorkingFolder
:
string
=
unixPathJoin
(
this
.
remoteExpRootDir
,
'
trials
'
,
trialJobId
);
const
trialWorkingFolder
:
string
=
unixPathJoin
(
this
.
remoteExpRootDir
,
'
trials
'
,
trialJobId
);
trialJobDetail
.
rmMeta
=
rmScheduleInfo
.
rmMeta
;
trialJobDetail
.
rmMeta
=
rmScheduleInfo
.
rmMeta
;
const
copyExpCodeDirPromise
=
this
.
machineCopyExpCodeDirPromiseMap
.
get
(
trialJobDetail
.
rmMeta
);
if
(
copyExpCodeDirPromise
!==
undefined
)
{
await
copyExpCodeDirPromise
;
}
await
this
.
allocateExecutorForTrial
(
trialJobDetail
);
await
this
.
allocateExecutorForTrial
(
trialJobDetail
);
await
this
.
launchTrialOnScheduledMachine
(
await
this
.
launchTrialOnScheduledMachine
(
...
@@ -554,6 +573,7 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -554,6 +573,7 @@ class RemoteMachineTrainingService implements TrainingService {
getExperimentId
(),
getExperimentId
(),
trialJobDetail
.
form
.
sequenceId
.
toString
(),
trialJobDetail
.
form
.
sequenceId
.
toString
(),
this
.
isMultiPhase
,
this
.
isMultiPhase
,
this
.
remoteExpCodeDir
,
unixPathJoin
(
trialWorkingFolder
,
'
.nni
'
,
'
jobpid
'
),
unixPathJoin
(
trialWorkingFolder
,
'
.nni
'
,
'
jobpid
'
),
command
,
command
,
nniManagerIp
,
nniManagerIp
,
...
@@ -565,12 +585,8 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -565,12 +585,8 @@ class RemoteMachineTrainingService implements TrainingService {
//create tmp trial working folder locally.
//create tmp trial working folder locally.
await
execMkdir
(
path
.
join
(
trialLocalTempFolder
,
'
.nni
'
));
await
execMkdir
(
path
.
join
(
trialLocalTempFolder
,
'
.nni
'
));
// Write install_nni.sh
//create tmp trial working folder locally.
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
install_nni.sh
'
),
CONTAINER_INSTALL_NNI_SHELL_FORMAT
,
{
encoding
:
'
utf8
'
});
await
execCopydir
(
this
.
trialConfig
.
codeDir
,
trialLocalTempFolder
);
const
installScriptContent
:
string
=
CONTAINER_INSTALL_NNI_SHELL_FORMAT
;
// Write NNI installation file to local tmp files
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
install_nni.sh
'
),
installScriptContent
,
{
encoding
:
'
utf8
'
});
// Write file content ( run.sh and parameter.cfg ) to local tmp files
// Write file content ( run.sh and parameter.cfg ) to local tmp files
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
run.sh
'
),
runScriptTrialContent
,
{
encoding
:
'
utf8
'
});
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
run.sh
'
),
runScriptTrialContent
,
{
encoding
:
'
utf8
'
});
await
this
.
writeParameterFile
(
trialJobId
,
form
.
hyperParameters
);
await
this
.
writeParameterFile
(
trialJobId
,
form
.
hyperParameters
);
...
...
src/nni_manager/training_service/remote_machine/shellExecutor.ts
View file @
d7456c16
...
@@ -183,13 +183,14 @@ class ShellExecutor {
...
@@ -183,13 +183,14 @@ class ShellExecutor {
* Copy files and directories in local directory recursively to remote directory
* Copy files and directories in local directory recursively to remote directory
* @param localDirectory local directory
* @param localDirectory local directory
* @param remoteDirectory remote directory
* @param remoteDirectory remote directory
* @param
sshClient SSH client
* @param
remoteOS the OS of remote machine
*/
*/
public
async
copyDirectoryToRemote
(
localDirectory
:
string
,
remoteDirectory
:
string
,
remoteOS
:
string
):
Promise
<
void
>
{
public
async
copyDirectoryToRemote
(
localDirectory
:
string
,
remoteDirectory
:
string
,
remoteOS
:
string
):
Promise
<
void
>
{
const
tmpSuffix
:
string
=
uniqueString
(
5
);
const
tmpSuffix
:
string
=
uniqueString
(
5
);
const
localTarPath
:
string
=
path
.
join
(
os
.
tmpdir
(),
`nni_tmp_local_
${
tmpSuffix
}
.tar.gz`
);
const
localTarPath
:
string
=
path
.
join
(
os
.
tmpdir
(),
`nni_tmp_local_
${
tmpSuffix
}
.tar.gz`
);
const
remoteTarPath
:
string
=
unixPathJoin
(
getRemoteTmpDir
(
remoteOS
),
`nni_tmp_remote_
${
tmpSuffix
}
.tar.gz`
);
const
remoteTarPath
:
string
=
unixPathJoin
(
getRemoteTmpDir
(
remoteOS
),
`nni_tmp_remote_
${
tmpSuffix
}
.tar.gz`
);
// Create remote directory
await
this
.
createFolder
(
remoteDirectory
);
// Compress files in local directory to experiment root directory
// Compress files in local directory to experiment root directory
await
tarAdd
(
localTarPath
,
localDirectory
);
await
tarAdd
(
localTarPath
,
localDirectory
);
// Copy the compressed file to remoteDirectory and delete it
// Copy the compressed file to remoteDirectory and delete it
...
...
test/nni_test/nnitest/run_tests.py
View file @
d7456c16
...
@@ -168,6 +168,7 @@ def launch_test(config_file, training_service, test_case_config):
...
@@ -168,6 +168,7 @@ def launch_test(config_file, training_service, test_case_config):
trial_stats
=
get_trial_stats
(
TRIAL_JOBS_URL
)
trial_stats
=
get_trial_stats
(
TRIAL_JOBS_URL
)
print
(
json
.
dumps
(
trial_stats
,
indent
=
4
),
flush
=
True
)
print
(
json
.
dumps
(
trial_stats
,
indent
=
4
),
flush
=
True
)
if
status
!=
'DONE'
or
trial_stats
[
'SUCCEEDED'
]
+
trial_stats
[
'EARLY_STOPPED'
]
<
max_trial_num
:
if
status
!=
'DONE'
or
trial_stats
[
'SUCCEEDED'
]
+
trial_stats
[
'EARLY_STOPPED'
]
<
max_trial_num
:
print_experiment_log
(
experiment_id
=
experiment_id
)
print_trial_job_log
(
training_service
,
TRIAL_JOBS_URL
)
print_trial_job_log
(
training_service
,
TRIAL_JOBS_URL
)
raise
AssertionError
(
'Failed to finish in maxExecDuration'
)
raise
AssertionError
(
'Failed to finish in maxExecDuration'
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment