Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
d48ad027
Unverified
Commit
d48ad027
authored
Jun 20, 2019
by
SparkSnail
Committed by
GitHub
Jun 20, 2019
Browse files
Merge pull request #184 from microsoft/master
merge master
parents
9352cc88
22993e5d
Changes
187
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
998 additions
and
847 deletions
+998
-847
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts
...ng_service/kubernetes/kubeflow/kubeflowTrainingService.ts
+232
-219
src/nni_manager/training_service/kubernetes/kubernetesApiClient.ts
...anager/training_service/kubernetes/kubernetesApiClient.ts
+34
-25
src/nni_manager/training_service/kubernetes/kubernetesConfig.ts
...i_manager/training_service/kubernetes/kubernetesConfig.ts
+41
-35
src/nni_manager/training_service/kubernetes/kubernetesData.ts
...nni_manager/training_service/kubernetes/kubernetesData.ts
+4
-5
src/nni_manager/training_service/kubernetes/kubernetesJobInfoCollector.ts
...training_service/kubernetes/kubernetesJobInfoCollector.ts
+8
-9
src/nni_manager/training_service/kubernetes/kubernetesJobRestServer.ts
...er/training_service/kubernetes/kubernetesJobRestServer.ts
+9
-8
src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts
.../training_service/kubernetes/kubernetesTrainingService.ts
+155
-133
src/nni_manager/training_service/local/gpuScheduler.ts
src/nni_manager/training_service/local/gpuScheduler.ts
+28
-26
src/nni_manager/training_service/local/localTrainingService.ts
...ni_manager/training_service/local/localTrainingService.ts
+49
-45
src/nni_manager/training_service/pai/hdfsClientUtility.ts
src/nni_manager/training_service/pai/hdfsClientUtility.ts
+51
-39
src/nni_manager/training_service/pai/paiConfig.ts
src/nni_manager/training_service/pai/paiConfig.ts
+25
-13
src/nni_manager/training_service/pai/paiData.ts
src/nni_manager/training_service/pai/paiData.ts
+16
-11
src/nni_manager/training_service/pai/paiJobInfoCollector.ts
src/nni_manager/training_service/pai/paiJobInfoCollector.ts
+36
-28
src/nni_manager/training_service/pai/paiJobRestServer.ts
src/nni_manager/training_service/pai/paiJobRestServer.ts
+6
-5
src/nni_manager/training_service/pai/paiTrainingService.ts
src/nni_manager/training_service/pai/paiTrainingService.ts
+68
-53
src/nni_manager/training_service/pai/paiTrialConfig.ts
src/nni_manager/training_service/pai/paiTrialConfig.ts
+9
-5
src/nni_manager/training_service/remote_machine/gpuScheduler.ts
...i_manager/training_service/remote_machine/gpuScheduler.ts
+31
-23
src/nni_manager/training_service/remote_machine/remoteMachineData.ts
...ager/training_service/remote_machine/remoteMachineData.ts
+76
-68
src/nni_manager/training_service/remote_machine/remoteMachineJobRestServer.ts
...ning_service/remote_machine/remoteMachineJobRestServer.ts
+6
-5
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
...ng_service/remote_machine/remoteMachineTrainingService.ts
+114
-92
No files found.
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts
View file @
d48ad027
...
...
@@ -17,35 +17,34 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'
use strict
'
'
use strict
'
;
import
*
as
assert
from
'
assert
'
;
import
*
as
component
from
'
../../../common/component
'
;
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
path
from
'
path
'
;
import
*
as
component
from
'
../../../common/component
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../../common/containerJobData
'
;
import
{
getExperimentId
}
from
'
../../../common/experimentStartupInfo
'
;
import
{
TrialConfigMetadataKey
}
from
'
../../common/trialConfigMetadataKey
'
;
import
{
JobApplicationForm
,
TrialJobApplicationForm
,
TrialJobDetail
,
NNIManagerIpConfig
JobApplicationForm
,
NNIManagerIpConfig
,
TrialJobApplicationForm
,
TrialJobDetail
}
from
'
../../../common/trainingService
'
;
import
{
delay
,
generateParamFileName
,
getExperimentRootDir
,
uniqueString
}
from
'
../../../common/utils
'
;
import
{
KubeflowClusterConfigNFS
,
KubeflowClusterConfigAzure
,
KubeflowTrialConfigPytorch
,
KubeflowTrialConfigTensorflow
,
KubeflowClusterConfigFactory
,
KubeflowTrialConfigFactory
,
KubeflowTrialConfig
,
KubeflowClusterConfig
}
from
'
./kubeflowConfig
'
;
import
{
NFSConfig
}
from
'
../kubernetesConfig
'
import
{
KubernetesTrialJobDetail
}
from
'
../kubernetesData
'
;
import
{
KubeflowJobRestServer
}
from
'
./kubeflowJobRestServer
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../../common/containerJobData
'
;
import
{
TrialConfigMetadataKey
}
from
'
../../common/trialConfigMetadataKey
'
;
import
{
validateCodeDir
}
from
'
../../common/util
'
;
import
{
AzureStorageClientUtility
}
from
'
../azureStorageClientUtils
'
;
import
{
NFSConfig
}
from
'
../kubernetesConfig
'
;
import
{
KubernetesTrialJobDetail
}
from
'
../kubernetesData
'
;
import
{
KubernetesTrainingService
}
from
'
../kubernetesTrainingService
'
;
import
{
KubeflowOperatorClient
}
from
'
./kubeflowApiClient
'
;
import
{
KubernetesTrainingService
}
from
'
../kubernetesTrainingService
'
import
{
KubeflowClusterConfig
,
KubeflowClusterConfigAzure
,
KubeflowClusterConfigFactory
,
KubeflowClusterConfigNFS
,
KubeflowTrialConfig
,
KubeflowTrialConfigFactory
,
KubeflowTrialConfigPytorch
,
KubeflowTrialConfigTensorflow
}
from
'
./kubeflowConfig
'
;
import
{
KubeflowJobInfoCollector
}
from
'
./kubeflowJobInfoCollector
'
;
import
{
KubeflowJobRestServer
}
from
'
./kubeflowJobRestServer
'
;
// tslint:disable: no-unsafe-any no-any
/**
* Training Service implementation for Kubeflow
* Refer https://github.com/kubeflow/kubeflow for more info about Kubeflow
...
...
@@ -54,12 +53,12 @@ import { KubeflowJobInfoCollector } from './kubeflowJobInfoCollector';
class
KubeflowTrainingService
extends
KubernetesTrainingService
implements
KubernetesTrainingService
{
private
kubeflowClusterConfig
?:
KubeflowClusterConfig
;
private
kubeflowTrialConfig
?:
KubeflowTrialConfig
;
private
kubeflowJobInfoCollector
:
KubeflowJobInfoCollector
;
private
readonly
kubeflowJobInfoCollector
:
KubeflowJobInfoCollector
;
constructor
()
{
super
();
super
();
this
.
kubeflowJobInfoCollector
=
new
KubeflowJobInfoCollector
(
this
.
trialJobsMap
);
this
.
experimentId
=
getExperimentId
();
this
.
experimentId
=
getExperimentId
();
this
.
nextTrialSequenceId
=
-
1
;
this
.
log
.
info
(
'
Construct Kubeflow training service.
'
);
}
...
...
@@ -67,17 +66,17 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
public
async
run
():
Promise
<
void
>
{
this
.
log
.
info
(
'
Run Kubeflow training service.
'
);
this
.
kubernetesJobRestServer
=
component
.
get
(
KubeflowJobRestServer
);
if
(
!
this
.
kubernetesJobRestServer
)
{
if
(
this
.
kubernetesJobRestServer
===
undefined
)
{
throw
new
Error
(
'
kubernetesJobRestServer not initialized!
'
);
}
await
this
.
kubernetesJobRestServer
.
start
();
this
.
kubernetesJobRestServer
.
setEnableVersionCheck
=
this
.
versionCheck
;
this
.
log
.
info
(
`Kubeflow Training service rest server listening on:
${
this
.
kubernetesJobRestServer
.
endPoint
}
`
);
while
(
!
this
.
stopping
)
{
// collect metrics for Kubeflow jobs by interacting with Kubernetes API server
// collect metrics for Kubeflow jobs by interacting with Kubernetes API server
await
delay
(
3000
);
await
this
.
kubeflowJobInfoCollector
.
retrieveTrialStatus
(
this
.
kubernetesCRDClient
);
if
(
this
.
kubernetesJobRestServer
.
getErrorMessage
)
{
if
(
this
.
kubernetesJobRestServer
.
getErrorMessage
!==
undefined
)
{
throw
new
Error
(
this
.
kubernetesJobRestServer
.
getErrorMessage
);
this
.
stopping
=
true
;
}
...
...
@@ -86,17 +85,17 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
}
public
async
submitTrialJob
(
form
:
JobApplicationForm
):
Promise
<
TrialJobDetail
>
{
if
(
!
this
.
kubernetesCRDClient
)
{
if
(
this
.
kubernetesCRDClient
===
undefined
)
{
throw
new
Error
(
'
Kubeflow job operator client is undefined
'
);
}
if
(
!
this
.
kubernetesRestServerPort
)
{
if
(
this
.
kubernetesRestServerPort
===
undefined
)
{
const
restServer
:
KubeflowJobRestServer
=
component
.
get
(
KubeflowJobRestServer
);
this
.
kubernetesRestServerPort
=
restServer
.
clusterRestServerPort
;
}
const
trialJobId
:
string
=
uniqueString
(
5
);
const
trialWorkingFolder
:
string
=
path
.
join
(
this
.
CONTAINER_MOUNT_PATH
,
'
nni
'
,
getExperimentId
(),
trialJobId
);
const
kubeflowJobName
=
`nni-exp-
${
this
.
experimentId
}
-trial-
${
trialJobId
}
`
.
toLowerCase
();
const
kubeflowJobName
:
string
=
`nni-exp-
${
this
.
experimentId
}
-trial-
${
trialJobId
}
`
.
toLowerCase
();
const
curTrialSequenceId
:
number
=
this
.
generateSequenceId
();
const
trialLocalTempFolder
:
string
=
path
.
join
(
getExperimentRootDir
(),
'
trials-local
'
,
trialJobId
);
//prepare the runscript
...
...
@@ -113,226 +112,239 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
curTrialSequenceId
,
trialJobOutputUrl
);
// Generate kubeflow job resource config object
// Generate kubeflow job resource config object
const
kubeflowJobConfig
:
any
=
await
this
.
prepareKubeflowConfig
(
trialJobId
,
trialWorkingFolder
,
kubeflowJobName
);
// Create kubeflow job based on generated kubeflow job resource config
await
this
.
kubernetesCRDClient
.
createKubernetesJob
(
kubeflowJobConfig
);
// Set trial job detail until create Kubeflow job successfully
// Set trial job detail until create Kubeflow job successfully
this
.
trialJobsMap
.
set
(
trialJobId
,
trialJobDetail
);
return
Promise
.
resolve
(
trialJobDetail
);
}
// tslint:disable:no-redundant-jsdoc
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
switch
(
key
)
{
case
TrialConfigMetadataKey
.
NNI_MANAGER_IP
:
this
.
nniManagerIpConfig
=
<
NNIManagerIpConfig
>
JSON
.
parse
(
value
);
break
;
case
TrialConfigMetadataKey
.
KUBEFLOW_CLUSTER_CONFIG
:
const
kubeflowClusterJsonObject
:
object
=
JSON
.
parse
(
value
);
this
.
kubeflowClusterConfig
=
KubeflowClusterConfigFactory
.
generateKubeflowClusterConfig
(
kubeflowClusterJsonObject
);
if
(
this
.
kubeflowClusterConfig
.
storageType
===
'
azureStorage
'
)
{
const
azureKubeflowClusterConfig
:
KubeflowClusterConfigAzure
=
<
KubeflowClusterConfigAzure
>
this
.
kubeflowClusterConfig
;
this
.
azureStorageAccountName
=
azureKubeflowClusterConfig
.
azureStorage
.
accountName
;
this
.
azureStorageShare
=
azureKubeflowClusterConfig
.
azureStorage
.
azureShare
;
await
this
.
createAzureStorage
(
azureKubeflowClusterConfig
.
keyVault
.
vaultName
,
azureKubeflowClusterConfig
.
keyVault
.
name
,
azureKubeflowClusterConfig
.
azureStorage
.
accountName
,
azureKubeflowClusterConfig
.
azureStorage
.
azureShare
);
}
else
if
(
this
.
kubeflowClusterConfig
.
storageType
===
'
nfs
'
)
{
const
nfsKubeflowClusterConfig
:
KubeflowClusterConfigNFS
=
<
KubeflowClusterConfigNFS
>
this
.
kubeflowClusterConfig
;
await
this
.
createNFSStorage
(
nfsKubeflowClusterConfig
.
nfs
.
server
,
nfsKubeflowClusterConfig
.
nfs
.
path
);
}
this
.
kubernetesCRDClient
=
KubeflowOperatorClient
.
generateOperatorClient
(
this
.
kubeflowClusterConfig
.
operator
,
this
.
kubeflowClusterConfig
.
apiVersion
);
break
;
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
if
(
this
.
kubeflowClusterConfig
===
undefined
)
{
this
.
log
.
error
(
'
kubeflow cluster config is not initialized
'
);
return
Promise
.
reject
(
new
Error
(
'
kubeflow cluster config is not initialized
'
));
}
assert
(
this
.
kubeflowClusterConfig
!==
undefined
);
const
kubeflowTrialJsonObjsect
:
object
=
JSON
.
parse
(
value
);
this
.
kubeflowTrialConfig
=
KubeflowTrialConfigFactory
.
generateKubeflowTrialConfig
(
kubeflowTrialJsonObjsect
,
this
.
kubeflowClusterConfig
.
operator
);
// Validate to make sure codeDir doesn't have too many files
try
{
await
validateCodeDir
(
this
.
kubeflowTrialConfig
.
codeDir
);
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
return
Promise
.
reject
(
new
Error
(
error
));
}
break
;
case
TrialConfigMetadataKey
.
VERSION_CHECK
:
this
.
versionCheck
=
(
value
===
'
true
'
||
value
===
'
True
'
);
break
;
case
TrialConfigMetadataKey
.
LOG_COLLECTION
:
this
.
logCollection
=
value
;
break
;
default
:
}
return
Promise
.
resolve
();
}
/**
* upload code files to nfs or azureStroage
* @param trialJobId
* @param trialLocalTempFolder
* @param trialJobId
* @param trialLocalTempFolder
* return: trialJobOutputUrl
*/
private
async
uploadCodeFiles
(
trialJobId
:
string
,
trialLocalTempFolder
:
string
):
Promise
<
string
>
{
if
(
!
this
.
kubeflowClusterConfig
)
{
if
(
this
.
kubeflowClusterConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
}
let
trialJobOutputUrl
:
string
=
''
;
assert
(
!
this
.
kubeflowClusterConfig
.
storage
||
this
.
kubeflowClusterConfig
.
storage
===
'
azureStorage
'
assert
(
this
.
kubeflowClusterConfig
.
storage
===
undefined
||
this
.
kubeflowClusterConfig
.
storage
===
'
azureStorage
'
||
this
.
kubeflowClusterConfig
.
storage
===
'
nfs
'
);
if
(
this
.
kubeflowClusterConfig
.
storage
===
'
azureStorage
'
)
{
try
{
if
(
this
.
kubeflowClusterConfig
.
storage
===
'
azureStorage
'
)
{
if
(
this
.
azureStorageClient
===
undefined
)
{
throw
new
Error
(
'
azureStorageClient is not initialized
'
);
}
try
{
//upload local files to azure storage
await
AzureStorageClientUtility
.
uploadDirectory
(
this
.
azureStorageClient
,
`nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
,
this
.
azureStorageShare
,
`
${
trialLocalTempFolder
}
`
);
await
AzureStorageClientUtility
.
uploadDirectory
(
this
.
azureStorageClient
,
`nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
,
this
.
azureStorageShare
,
`
${
trialLocalTempFolder
}
`
);
trialJobOutputUrl
=
`https://
${
this
.
azureStorageAccountName
}
.file.core.windows.net/
${
this
.
azureStorageShare
}
/
${
path
.
join
(
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
}
catch
(
error
){
trialJobOutputUrl
=
`https://
${
this
.
azureStorageAccountName
}
.file.core.windows.net/
${
this
.
azureStorageShare
}
\
/
${
path
.
join
(
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
;
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
return
Promise
.
reject
(
error
);
}
}
else
if
(
this
.
kubeflowClusterConfig
.
storage
===
'
nfs
'
||
this
.
kubeflowClusterConfig
.
storage
===
undefined
)
{
le
t
nfsKubeflowClusterConfig
:
KubeflowClusterConfigNFS
=
<
KubeflowClusterConfigNFS
>
this
.
kubeflowClusterConfig
;
// Creat work dir for current trial in NFS directory
}
else
if
(
this
.
kubeflowClusterConfig
.
storage
===
'
nfs
'
||
this
.
kubeflowClusterConfig
.
storage
===
undefined
)
{
cons
t
nfsKubeflowClusterConfig
:
KubeflowClusterConfigNFS
=
<
KubeflowClusterConfigNFS
>
this
.
kubeflowClusterConfig
;
// Creat work dir for current trial in NFS directory
await
cpp
.
exec
(
`mkdir -p
${
this
.
trialLocalNFSTempFolder
}
/nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
);
// Copy code files from local dir to NFS mounted dir
await
cpp
.
exec
(
`cp -r
${
trialLocalTempFolder
}
/*
${
this
.
trialLocalNFSTempFolder
}
/nni/
${
getExperimentId
()}
/
${
trialJobId
}
/.`
);
const
nfsConfig
:
NFSConfig
=
nfsKubeflowClusterConfig
.
nfs
;
trialJobOutputUrl
=
`nfs://
${
nfsConfig
.
server
}
:
${
path
.
join
(
nfsConfig
.
path
,
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
trialJobOutputUrl
=
`nfs://
${
nfsConfig
.
server
}
:
${
path
.
join
(
nfsConfig
.
path
,
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
;
}
return
Promise
.
resolve
(
trialJobOutputUrl
);
}
private
async
prepareRunScript
(
trialLocalTempFolder
:
string
,
trialJobId
:
string
,
trialWorkingFolder
:
string
,
curTrialSequenceId
:
number
,
form
:
JobApplicationForm
):
Promise
<
void
>
{
if
(
!
this
.
kubeflowClusterConfig
)
{
private
async
prepareRunScript
(
trialLocalTempFolder
:
string
,
trialJobId
:
string
,
trialWorkingFolder
:
string
,
curTrialSequenceId
:
number
,
form
:
JobApplicationForm
):
Promise
<
void
>
{
if
(
this
.
kubeflowClusterConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
}
// initialize kubeflow trial config to specific type
let
kubeflowTrialConfig
;
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
let
kubeflowTrialConfig
:
any
;
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
kubeflowTrialConfig
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
}
else
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
pytorch-operator
'
){
}
else
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
pytorch-operator
'
)
{
kubeflowTrialConfig
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
}
else
{
throw
Error
(
`operator
${
this
.
kubeflowClusterConfig
.
operator
}
is invalid`
)
}
else
{
throw
Error
(
`operator
${
this
.
kubeflowClusterConfig
.
operator
}
is invalid`
)
;
}
//create tmp trial working folder locally.
await
cpp
.
exec
(
`mkdir -p
${
path
.
dirname
(
trialLocalTempFolder
)}
`
);
await
cpp
.
exec
(
`cp -r
${
kubeflowTrialConfig
.
codeDir
}
${
trialLocalTempFolder
}
`
);
const
runScriptContent
:
string
=
CONTAINER_INSTALL_NNI_SHELL_FORMAT
;
// Write NNI installation file to local tmp files
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
install_nni.sh
'
),
runScriptContent
,
{
encoding
:
'
utf8
'
});
// Create tmp trial working folder locally.
await
cpp
.
exec
(
`mkdir -p
${
trialLocalTempFolder
}
`
);
// Write worker file content run_worker.sh to local tmp folders
if
(
kubeflowTrialConfig
.
worker
)
{
const
workerRunScriptContent
:
string
=
await
this
.
generateRunScript
(
'
kubeflow
'
,
trialJobId
,
trialWorkingFolder
,
kubeflowTrialConfig
.
worker
.
command
,
curTrialSequenceId
.
toString
(),
'
worker
'
,
kubeflowTrialConfig
.
worker
.
gpuNum
);
//create tmp trial working folder locally.
await
cpp
.
exec
(
`mkdir -p
${
path
.
dirname
(
trialLocalTempFolder
)}
`
);
await
cpp
.
exec
(
`cp -r
${
kubeflowTrialConfig
.
codeDir
}
${
trialLocalTempFolder
}
`
);
const
runScriptContent
:
string
=
CONTAINER_INSTALL_NNI_SHELL_FORMAT
;
// Write NNI installation file to local tmp files
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
install_nni.sh
'
),
runScriptContent
,
{
encoding
:
'
utf8
'
});
// Create tmp trial working folder locally.
await
cpp
.
exec
(
`mkdir -p
${
trialLocalTempFolder
}
`
);
// Write worker file content run_worker.sh to local tmp folders
if
(
kubeflowTrialConfig
.
worker
!==
undefined
)
{
const
workerRunScriptContent
:
string
=
await
this
.
generateRunScript
(
'
kubeflow
'
,
trialJobId
,
trialWorkingFolder
,
kubeflowTrialConfig
.
worker
.
command
,
curTrialSequenceId
.
toString
(),
'
worker
'
,
kubeflowTrialConfig
.
worker
.
gpuNum
);
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
run_worker.sh
'
),
workerRunScriptContent
,
{
encoding
:
'
utf8
'
});
}
// Write parameter server file content run_ps.sh to local tmp folders
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
let
tensorflowTrialConfig
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
if
(
tensorflowTrialConfig
.
ps
){
const
psRunScriptContent
:
string
=
await
this
.
generateRunScript
(
'
kubeflow
'
,
trialJobId
,
trialWorkingFolder
,
tensorflowTrialConfig
.
ps
.
command
,
curTrialSequenceId
.
toString
(),
'
ps
'
,
tensorflowTrialConfig
.
ps
.
gpuNum
);
}
// Write parameter server file content run_ps.sh to local tmp folders
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
const
tensorflowTrialConfig
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
if
(
tensorflowTrialConfig
.
ps
!==
undefined
)
{
const
psRunScriptContent
:
string
=
await
this
.
generateRunScript
(
'
kubeflow
'
,
trialJobId
,
trialWorkingFolder
,
tensorflowTrialConfig
.
ps
.
command
,
curTrialSequenceId
.
toString
(),
'
ps
'
,
tensorflowTrialConfig
.
ps
.
gpuNum
);
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
run_ps.sh
'
),
psRunScriptContent
,
{
encoding
:
'
utf8
'
});
}
}
else
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
pytorch-operator
'
)
{
let
pytorchTrialConfig
:
KubeflowTrialConfigPytorch
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
if
(
pytorchTrialConfig
.
master
){
const
masterRunScriptContent
:
string
=
await
this
.
generateRunScript
(
'
kubeflow
'
,
trialJobId
,
trialWorkingFolder
,
pytorchTrialConfig
.
master
.
command
,
curTrialSequenceId
.
toString
(),
'
master
'
,
pytorchTrialConfig
.
master
.
gpuNum
);
}
else
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
pytorch-operator
'
)
{
const
pytorchTrialConfig
:
KubeflowTrialConfigPytorch
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
if
(
pytorchTrialConfig
.
master
!==
undefined
)
{
const
masterRunScriptContent
:
string
=
await
this
.
generateRunScript
(
'
kubeflow
'
,
trialJobId
,
trialWorkingFolder
,
pytorchTrialConfig
.
master
.
command
,
curTrialSequenceId
.
toString
(),
'
master
'
,
pytorchTrialConfig
.
master
.
gpuNum
);
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
run_master.sh
'
),
masterRunScriptContent
,
{
encoding
:
'
utf8
'
});
}
}
// Write file content ( parameter.cfg ) to local tmp folders
const
trialForm
:
TrialJobApplicationForm
=
(
<
TrialJobApplicationForm
>
form
)
if
(
trialForm
&&
trialForm
.
hyperParameters
)
{
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
generateParamFileName
(
trialForm
.
hyperParameters
)),
trialForm
.
hyperParameters
.
value
,
{
encoding
:
'
utf8
'
});
}
}
// Write file content ( parameter.cfg ) to local tmp folders
const
trialForm
:
TrialJobApplicationForm
=
(
<
TrialJobApplicationForm
>
form
)
;
if
(
trialForm
!==
undefined
&&
trialForm
.
hyperParameters
!==
undefined
)
{
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
generateParamFileName
(
trialForm
.
hyperParameters
)),
trialForm
.
hyperParameters
.
value
,
{
encoding
:
'
utf8
'
});
}
}
private
async
prepareKubeflowConfig
(
trialJobId
:
string
,
trialWorkingFolder
:
string
,
kubeflowJobName
:
string
):
Promise
<
any
>
{
if
(
!
this
.
kubeflowClusterConfig
)
{
if
(
this
.
kubeflowClusterConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
}
if
(
!
this
.
kubeflowTrialConfig
)
{
if
(
this
.
kubeflowTrialConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow trial config is not initialized
'
);
}
// initialize kubeflow trial config to specific type
let
kubeflowTrialConfig
;
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
let
kubeflowTrialConfig
:
any
;
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
kubeflowTrialConfig
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
}
else
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
pytorch-operator
'
){
}
else
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
pytorch-operator
'
)
{
kubeflowTrialConfig
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
}
else
{
throw
Error
(
`operator
${
this
.
kubeflowClusterConfig
.
operator
}
is invalid`
)
}
else
{
throw
Error
(
`operator
${
this
.
kubeflowClusterConfig
.
operator
}
is invalid`
)
;
}
const
workerPodResources
:
any
=
{};
if
(
kubeflowTrialConfig
.
worker
)
{
workerPodResources
.
requests
=
this
.
generatePodResource
(
kubeflowTrialConfig
.
worker
.
memoryMB
,
kubeflowTrialConfig
.
worker
.
cpuNum
,
kubeflowTrialConfig
.
worker
.
gpuNum
)
if
(
kubeflowTrialConfig
.
worker
!==
undefined
)
{
workerPodResources
.
requests
=
this
.
generatePodResource
(
kubeflowTrialConfig
.
worker
.
memoryMB
,
kubeflowTrialConfig
.
worker
.
cpuNum
,
kubeflowTrialConfig
.
worker
.
gpuNum
)
;
}
workerPodResources
.
limits
=
Object
.
assign
({},
workerPodResources
.
requests
)
;
le
t
nonWorkerResources
:
any
=
{};
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
le
t
tensorflowTrialConfig
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
if
(
tensorflowTrialConfig
.
ps
)
{
nonWorkerResources
.
requests
=
this
.
generatePodResource
(
tensorflowTrialConfig
.
ps
.
memoryMB
,
tensorflowTrialConfig
.
ps
.
cpuNum
,
tensorflowTrialConfig
.
ps
.
gpuNum
)
nonWorkerResources
.
limits
=
Object
.
assign
({},
nonWorkerResources
.
requests
);
workerPodResources
.
limits
=
{...
workerPodResources
.
requests
}
;
cons
t
nonWorkerResources
:
any
=
{};
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
cons
t
tensorflowTrialConfig
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
if
(
tensorflowTrialConfig
.
ps
!==
undefined
)
{
nonWorkerResources
.
requests
=
this
.
generatePodResource
(
tensorflowTrialConfig
.
ps
.
memoryMB
,
tensorflowTrialConfig
.
ps
.
cpuNum
,
tensorflowTrialConfig
.
ps
.
gpuNum
)
;
nonWorkerResources
.
limits
=
{...
nonWorkerResources
.
requests
};
}
}
else
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
pytorch-operator
'
){
let
pyTorchTrialConfig
:
KubeflowTrialConfigPytorch
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
nonWorkerResources
.
requests
=
this
.
generatePodResource
(
pyTorchTrialConfig
.
master
.
memoryMB
,
pyTorchTrialConfig
.
master
.
cpuNum
,
pyTorchTrialConfig
.
master
.
gpuNum
)
nonWorkerResources
.
limits
=
Object
.
assign
({},
nonWorkerResources
.
requests
);
}
}
else
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
pytorch-operator
'
)
{
const
pyTorchTrialConfig
:
KubeflowTrialConfigPytorch
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
nonWorkerResources
.
requests
=
this
.
generatePodResource
(
pyTorchTrialConfig
.
master
.
memoryMB
,
pyTorchTrialConfig
.
master
.
cpuNum
,
pyTorchTrialConfig
.
master
.
gpuNum
);
nonWorkerResources
.
limits
=
{...
nonWorkerResources
.
requests
};
}
// Generate kubeflow job resource config object
const
kubeflowJobConfig
:
any
=
this
.
generateKubeflowJobConfig
(
trialJobId
,
trialWorkingFolder
,
kubeflowJobName
,
workerPodResources
,
nonWorkerResources
);
// Generate kubeflow job resource config object
const
kubeflowJobConfig
:
any
=
this
.
generateKubeflowJobConfig
(
trialJobId
,
trialWorkingFolder
,
kubeflowJobName
,
workerPodResources
,
nonWorkerResources
);
return
Promise
.
resolve
(
kubeflowJobConfig
);
}
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
switch
(
key
)
{
case
TrialConfigMetadataKey
.
NNI_MANAGER_IP
:
this
.
nniManagerIpConfig
=
<
NNIManagerIpConfig
>
JSON
.
parse
(
value
);
break
;
case
TrialConfigMetadataKey
.
KUBEFLOW_CLUSTER_CONFIG
:
let
kubeflowClusterJsonObject
=
JSON
.
parse
(
value
);
this
.
kubeflowClusterConfig
=
KubeflowClusterConfigFactory
.
generateKubeflowClusterConfig
(
kubeflowClusterJsonObject
);
if
(
this
.
kubeflowClusterConfig
.
storageType
===
'
azureStorage
'
)
{
let
azureKubeflowClusterConfig
=
<
KubeflowClusterConfigAzure
>
this
.
kubeflowClusterConfig
;
this
.
azureStorageAccountName
=
azureKubeflowClusterConfig
.
azureStorage
.
accountName
;
this
.
azureStorageShare
=
azureKubeflowClusterConfig
.
azureStorage
.
azureShare
;
await
this
.
createAzureStorage
(
azureKubeflowClusterConfig
.
keyVault
.
vaultName
,
azureKubeflowClusterConfig
.
keyVault
.
name
,
azureKubeflowClusterConfig
.
azureStorage
.
accountName
,
azureKubeflowClusterConfig
.
azureStorage
.
azureShare
);
}
else
if
(
this
.
kubeflowClusterConfig
.
storageType
===
'
nfs
'
)
{
let
nfsKubeflowClusterConfig
=
<
KubeflowClusterConfigNFS
>
this
.
kubeflowClusterConfig
;
await
this
.
createNFSStorage
(
nfsKubeflowClusterConfig
.
nfs
.
server
,
nfsKubeflowClusterConfig
.
nfs
.
path
);
}
this
.
kubernetesCRDClient
=
KubeflowOperatorClient
.
generateOperatorClient
(
this
.
kubeflowClusterConfig
.
operator
,
this
.
kubeflowClusterConfig
.
apiVersion
);
break
;
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
if
(
!
this
.
kubeflowClusterConfig
){
this
.
log
.
error
(
'
kubeflow cluster config is not initialized
'
);
return
Promise
.
reject
(
new
Error
(
'
kubeflow cluster config is not initialized
'
));
}
assert
(
this
.
kubeflowClusterConfig
!==
undefined
)
let
kubeflowTrialJsonObjsect
=
JSON
.
parse
(
value
);
this
.
kubeflowTrialConfig
=
KubeflowTrialConfigFactory
.
generateKubeflowTrialConfig
(
kubeflowTrialJsonObjsect
,
this
.
kubeflowClusterConfig
.
operator
);
// Validate to make sure codeDir doesn't have too many files
try
{
await
validateCodeDir
(
this
.
kubeflowTrialConfig
.
codeDir
);
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
return
Promise
.
reject
(
new
Error
(
error
));
}
break
;
case
TrialConfigMetadataKey
.
VERSION_CHECK
:
this
.
versionCheck
=
(
value
===
'
true
'
||
value
===
'
True
'
);
break
;
case
TrialConfigMetadataKey
.
LOG_COLLECTION
:
this
.
logCollection
=
value
;
break
;
default
:
break
;
}
return
Promise
.
resolve
();
}
/**
...
...
@@ -343,49 +355,48 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
* @param workerPodResources worker pod template
* @param nonWorkerPodResources non-worker pod template, like ps or master
*/
private
generateKubeflowJobConfig
(
trialJobId
:
string
,
trialWorkingFolder
:
string
,
kubeflowJobName
:
string
,
workerPodResources
:
any
,
nonWorkerPodResources
?:
any
)
:
any
{
if
(
!
this
.
kubeflowClusterConfig
)
{
private
generateKubeflowJobConfig
(
trialJobId
:
string
,
trialWorkingFolder
:
string
,
kubeflowJobName
:
string
,
workerPodResources
:
any
,
nonWorkerPodResources
?:
any
)
:
any
{
if
(
this
.
kubeflowClusterConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
}
if
(
!
this
.
kubeflowTrialConfig
)
{
if
(
this
.
kubeflowTrialConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow trial config is not initialized
'
);
}
if
(
!
this
.
kubernetesCRDClient
)
{
if
(
this
.
kubernetesCRDClient
===
undefined
)
{
throw
new
Error
(
'
Kubeflow operator client is not initialized
'
);
}
const
replicaSpecsObj
:
any
=
{};
let
replicaSpecsObjMap
=
new
Map
<
string
,
object
>
();
if
(
this
.
kubeflowTrialConfig
.
operatorType
===
'
tf-operator
'
)
{
let
tensorflowTrialConfig
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
replicaSpecsObj
.
Worker
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
tensorflowTrialConfig
.
worker
.
replicas
,
tensorflowTrialConfig
.
worker
.
image
,
'
run_worker.sh
'
,
workerPodResources
);
if
(
tensorflowTrialConfig
.
ps
){
replicaSpecsObj
.
Ps
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
tensorflowTrialConfig
.
ps
.
replicas
,
tensorflowTrialConfig
.
ps
.
image
,
'
run_ps.sh
'
,
nonWorkerPodResources
);
const
replicaSpecsObjMap
:
Map
<
string
,
object
>
=
new
Map
<
string
,
object
>
();
if
(
this
.
kubeflowTrialConfig
.
operatorType
===
'
tf-operator
'
)
{
const
tensorflowTrialConfig
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
replicaSpecsObj
.
Worker
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
tensorflowTrialConfig
.
worker
.
replicas
,
tensorflowTrialConfig
.
worker
.
image
,
'
run_worker.sh
'
,
workerPodResources
);
if
(
tensorflowTrialConfig
.
ps
!==
undefined
)
{
replicaSpecsObj
.
Ps
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
tensorflowTrialConfig
.
ps
.
replicas
,
tensorflowTrialConfig
.
ps
.
image
,
'
run_ps.sh
'
,
nonWorkerPodResources
);
}
replicaSpecsObjMap
.
set
(
this
.
kubernetesCRDClient
.
jobKind
,
{
'
tfReplicaSpecs
'
:
replicaSpecsObj
})
}
else
if
(
this
.
kubeflowTrialConfig
.
operatorType
===
'
pytorch-operator
'
)
{
let
pytorchTrialConfig
:
KubeflowTrialConfigPytorch
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
if
(
pytorchTrialConfig
.
worker
)
{
replicaSpecsObj
.
Worker
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
pytorchTrialConfig
.
worker
.
replicas
,
pytorchTrialConfig
.
worker
.
image
,
'
run_worker.sh
'
,
workerPodResources
);
replicaSpecsObjMap
.
set
(
this
.
kubernetesCRDClient
.
jobKind
,
{
tfReplicaSpecs
:
replicaSpecsObj
});
}
else
if
(
this
.
kubeflowTrialConfig
.
operatorType
===
'
pytorch-operator
'
)
{
const
pytorchTrialConfig
:
KubeflowTrialConfigPytorch
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
if
(
pytorchTrialConfig
.
worker
!==
undefined
)
{
replicaSpecsObj
.
Worker
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
pytorchTrialConfig
.
worker
.
replicas
,
pytorchTrialConfig
.
worker
.
image
,
'
run_worker.sh
'
,
workerPodResources
);
}
replicaSpecsObj
.
Master
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
pytorchTrialConfig
.
master
.
replicas
,
pytorchTrialConfig
.
master
.
image
,
'
run_master.sh
'
,
nonWorkerPodResources
);
replicaSpecsObjMap
.
set
(
this
.
kubernetesCRDClient
.
jobKind
,
{
'
pytorchReplicaSpecs
'
:
replicaSpecsObj
})
replicaSpecsObj
.
Master
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
pytorchTrialConfig
.
master
.
replicas
,
pytorchTrialConfig
.
master
.
image
,
'
run_master.sh
'
,
nonWorkerPodResources
);
replicaSpecsObjMap
.
set
(
this
.
kubernetesCRDClient
.
jobKind
,
{
pytorchReplicaSpecs
:
replicaSpecsObj
})
;
}
return
{
apiVersion
:
`kubeflow.org/
${
this
.
kubernetesCRDClient
.
apiVersion
}
`
,
kind
:
this
.
kubernetesCRDClient
.
jobKind
,
metadata
:
{
metadata
:
{
name
:
kubeflowJobName
,
namespace
:
'
default
'
,
labels
:
{
...
...
@@ -395,7 +406,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
}
},
spec
:
replicaSpecsObjMap
.
get
(
this
.
kubernetesCRDClient
.
jobKind
)
};
};
}
/**
...
...
@@ -406,21 +417,22 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
* @param runScriptFile script file name
* @param podResources pod resource config section
*/
private
generateReplicaConfig
(
trialWorkingFolder
:
string
,
replicaNumber
:
number
,
replicaImage
:
string
,
runScriptFile
:
string
,
podResources
:
any
):
any
{
if
(
!
this
.
kubeflowClusterConfig
)
{
private
generateReplicaConfig
(
trialWorkingFolder
:
string
,
replicaNumber
:
number
,
replicaImage
:
string
,
runScriptFile
:
string
,
podResources
:
any
):
any
{
if
(
this
.
kubeflowClusterConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
}
if
(
!
this
.
kubeflowTrialConfig
)
{
if
(
this
.
kubeflowTrialConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow trial config is not initialized
'
);
}
if
(
!
this
.
kubernetesCRDClient
)
{
if
(
this
.
kubernetesCRDClient
===
undefined
)
{
throw
new
Error
(
'
Kubeflow operator client is not initialized
'
);
}
le
t
volumeSpecMap
=
new
Map
<
string
,
object
>
();
if
(
this
.
kubeflowClusterConfig
.
storageType
===
'
azureStorage
'
){
cons
t
volumeSpecMap
:
Map
<
string
,
object
>
=
new
Map
<
string
,
object
>
();
if
(
this
.
kubeflowClusterConfig
.
storageType
===
'
azureStorage
'
)
{
volumeSpecMap
.
set
(
'
nniVolumes
'
,
[
{
name
:
'
nni-vol
'
,
...
...
@@ -429,9 +441,9 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
shareName
:
`
${
this
.
azureStorageShare
}
`
,
readonly
:
false
}
}])
}
else
{
le
t
nfsKubeflowClusterConfig
:
KubeflowClusterConfigNFS
=
<
KubeflowClusterConfigNFS
>
this
.
kubeflowClusterConfig
;
}])
;
}
else
{
cons
t
nfsKubeflowClusterConfig
:
KubeflowClusterConfigNFS
=
<
KubeflowClusterConfigNFS
>
this
.
kubeflowClusterConfig
;
volumeSpecMap
.
set
(
'
nniVolumes
'
,
[
{
name
:
'
nni-vol
'
,
...
...
@@ -439,13 +451,14 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
server
:
`
${
nfsKubeflowClusterConfig
.
nfs
.
server
}
`
,
path
:
`
${
nfsKubeflowClusterConfig
.
nfs
.
path
}
`
}
}])
}])
;
}
return
{
replicas
:
replicaNumber
,
template
:
{
metadata
:
{
// tslint:disable-next-line:no-null-keyword
creationTimestamp
:
null
},
spec
:
{
...
...
@@ -455,7 +468,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
// TODO: change the name based on operator's type
name
:
this
.
kubernetesCRDClient
.
containerName
,
image
:
replicaImage
,
args
:
[
"
sh
"
,
`
${
path
.
join
(
trialWorkingFolder
,
runScriptFile
)}
`
],
args
:
[
'
sh
'
,
`
${
path
.
join
(
trialWorkingFolder
,
runScriptFile
)}
`
],
volumeMounts
:
[
{
name
:
'
nni-vol
'
,
...
...
@@ -470,5 +483,5 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
};
}
}
export
{
KubeflowTrainingService
}
// tslint:enable: no-unsafe-any no-any
export
{
KubeflowTrainingService
}
;
src/nni_manager/training_service/kubernetes/kubernetesApiClient.ts
View file @
d48ad027
...
...
@@ -19,44 +19,46 @@
'
use strict
'
;
import
*
as
os
from
'
os
'
import
*
as
path
from
'
path
'
;
import
{
Client1_10
,
config
}
from
'
kubernetes-client
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
var
K8SClient
=
require
(
'
kubernetes-client
'
).
Client
;
var
K8SConfig
=
require
(
'
kubernetes-client
'
).
config
;
/**
* Generict Kubernetes client, target version >= 1.9
*/
// tslint:disable: no-any no-unsafe-any
class
GeneralK8sClient
{
protected
readonly
client
:
any
;
protected
readonly
log
:
Logger
=
getLogger
();
constructor
()
{
this
.
client
=
new
K8S
Client
({
config
:
K8SC
onfig
.
fromKubeconfig
(),
version
:
'
1.9
'
});
this
.
client
=
new
Client
1_10
({
config
:
c
onfig
.
fromKubeconfig
(),
version
:
'
1.9
'
});
this
.
client
.
loadSpec
();
}
public
async
createSecret
(
secretManifest
:
any
):
Promise
<
boolean
>
{
let
result
:
Promise
<
boolean
>
;
const
response
:
any
=
await
this
.
client
.
api
.
v1
.
namespaces
(
'
default
'
).
secrets
.
post
({
body
:
secretManifest
});
if
(
response
.
statusCode
&&
(
response
.
statusCode
>=
200
&&
response
.
statusCode
<=
299
))
{
let
result
:
Promise
<
boolean
>
;
const
response
:
any
=
await
this
.
client
.
api
.
v1
.
namespaces
(
'
default
'
).
secrets
.
post
({
body
:
secretManifest
});
if
(
response
.
statusCode
&&
(
response
.
statusCode
>=
200
&&
response
.
statusCode
<=
299
))
{
result
=
Promise
.
resolve
(
true
);
}
else
{
result
=
Promise
.
reject
(
`Create secrets failed, statusCode is
${
response
.
statusCode
}
`
);
}
return
result
;
}
}
/**
* Kubernetes CRD client
*/
abstract
class
KubernetesCRDClient
{
protected
readonly
client
:
any
;
protected
readonly
log
:
Logger
=
getLogger
();
protected
crdSchema
:
any
;
constructor
()
{
this
.
client
=
new
K8S
Client
({
config
:
K8SC
onfig
.
fromKubeconfig
()
});
this
.
client
=
new
Client
1_10
({
config
:
c
onfig
.
fromKubeconfig
()
});
this
.
client
.
loadSpec
();
}
...
...
@@ -65,8 +67,8 @@ abstract class KubernetesCRDClient {
public
abstract
get
containerName
():
string
;
public
get
jobKind
():
string
{
if
(
this
.
crdSchema
&&
this
.
crdSchema
.
spec
if
(
this
.
crdSchema
&&
this
.
crdSchema
.
spec
&&
this
.
crdSchema
.
spec
.
names
&&
this
.
crdSchema
.
spec
.
names
.
kind
)
{
return
this
.
crdSchema
.
spec
.
names
.
kind
;
...
...
@@ -76,55 +78,62 @@ abstract class KubernetesCRDClient {
}
public
get
apiVersion
():
string
{
if
(
this
.
crdSchema
&&
this
.
crdSchema
.
spec
if
(
this
.
crdSchema
&&
this
.
crdSchema
.
spec
&&
this
.
crdSchema
.
spec
.
version
)
{
return
this
.
crdSchema
.
spec
.
version
;
}
else
{
throw
new
Error
(
'
KubeflowOperatorClient: get apiVersion failed, version is undefined in crd schema!
'
);
}
}
public
async
createKubernetesJob
(
jobManifest
:
any
):
Promise
<
boolean
>
{
let
result
:
Promise
<
boolean
>
;
const
response
:
any
=
await
this
.
operator
.
post
({
body
:
jobManifest
});
if
(
response
.
statusCode
&&
(
response
.
statusCode
>=
200
&&
response
.
statusCode
<=
299
))
{
if
(
response
.
statusCode
&&
(
response
.
statusCode
>=
200
&&
response
.
statusCode
<=
299
))
{
result
=
Promise
.
resolve
(
true
);
}
else
{
result
=
Promise
.
reject
(
`Create kubernetes job failed, statusCode is
${
response
.
statusCode
}
`
);
}
return
result
;
}
//TODO : replace any
public
async
getKubernetesJob
(
kubeflowJobName
:
string
):
Promise
<
any
>
{
let
result
:
Promise
<
any
>
;
const
response
:
any
=
await
this
.
operator
(
kubeflowJobName
).
get
();
if
(
response
.
statusCode
&&
(
response
.
statusCode
>=
200
&&
response
.
statusCode
<=
299
))
{
const
response
:
any
=
await
this
.
operator
(
kubeflowJobName
)
.
get
();
if
(
response
.
statusCode
&&
(
response
.
statusCode
>=
200
&&
response
.
statusCode
<=
299
))
{
result
=
Promise
.
resolve
(
response
.
body
);
}
else
{
result
=
Promise
.
reject
(
`KubeflowOperatorClient get tfjobs failed, statusCode is
${
response
.
statusCode
}
`
);
}
return
result
;
}
public
async
deleteKubernetesJob
(
labels
:
Map
<
string
,
string
>
):
Promise
<
boolean
>
{
let
result
:
Promise
<
boolean
>
;
// construct match query from labels for deleting tfjob
const
matchQuery
:
string
=
Array
.
from
(
labels
.
keys
()).
map
(
labelKey
=>
`
${
labelKey
}
=
${
labels
.
get
(
labelKey
)}
`
).
join
(
'
,
'
);
const
matchQuery
:
string
=
Array
.
from
(
labels
.
keys
())
.
map
((
labelKey
:
string
)
=>
`
${
labelKey
}
=
${
labels
.
get
(
labelKey
)}
`
)
.
join
(
'
,
'
);
try
{
const
deleteResult
:
any
=
await
this
.
operator
().
delete
({
const
deleteResult
:
any
=
await
this
.
operator
()
.
delete
({
qs
:
{
labelSelector
:
matchQuery
,
propagationPolicy
:
"
Background
"
}
propagationPolicy
:
'
Background
'
}
});
if
(
deleteResult
.
statusCode
&&
deleteResult
.
statusCode
>=
200
&&
deleteResult
.
statusCode
<=
299
)
{
if
(
deleteResult
.
statusCode
&&
deleteResult
.
statusCode
>=
200
&&
deleteResult
.
statusCode
<=
299
)
{
result
=
Promise
.
resolve
(
true
);
}
else
{
result
=
Promise
.
reject
(
`KubeflowOperatorClient, delete labels
${
matchQuery
}
get wrong statusCode
${
deleteResult
.
statusCode
}
`
);
result
=
Promise
.
reject
(
`KubeflowOperatorClient, delete labels
${
matchQuery
}
get wrong statusCode
${
deleteResult
.
statusCode
}
`
);
}
}
catch
(
err
)
{
}
catch
(
err
)
{
result
=
Promise
.
reject
(
err
);
}
...
...
src/nni_manager/training_service/kubernetes/kubernetesConfig.ts
View file @
d48ad027
...
...
@@ -22,16 +22,17 @@
export
type
KubernetesStorageKind
=
'
nfs
'
|
'
azureStorage
'
;
import
{
MethodNotImplementedError
}
from
'
../../common/errors
'
;
// tslint:disable: completed-docs function-name
export
abstract
class
KubernetesClusterConfig
{
public
readonly
storage
?:
KubernetesStorageKind
;
public
readonly
apiVersion
:
string
;
constructor
(
apiVersion
:
string
,
storage
?:
KubernetesStorageKind
)
{
this
.
storage
=
storage
;
this
.
apiVersion
=
apiVersion
;
}
public
get
storageType
():
KubernetesStorageKind
{
public
get
storageType
():
KubernetesStorageKind
{
throw
new
MethodNotImplementedError
();
}
}
...
...
@@ -48,7 +49,7 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig {
public
readonly
nfs
:
NFSConfig
;
constructor
(
apiVersion
:
string
,
apiVersion
:
string
,
nfs
:
NFSConfig
,
storage
?:
KubernetesStorageKind
)
{
...
...
@@ -56,12 +57,13 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig {
this
.
nfs
=
nfs
;
}
public
get
storageType
():
KubernetesStorageKind
{
public
get
storageType
():
KubernetesStorageKind
{
return
'
nfs
'
;
}
public
static
getInstance
(
jsonObject
:
object
):
KubernetesClusterConfigNFS
{
let
kubernetesClusterConfigObjectNFS
=
<
KubernetesClusterConfigNFS
>
jsonObject
;
const
kubernetesClusterConfigObjectNFS
:
KubernetesClusterConfigNFS
=
<
KubernetesClusterConfigNFS
>
jsonObject
;
return
new
KubernetesClusterConfigNFS
(
kubernetesClusterConfigObjectNFS
.
apiVersion
,
kubernetesClusterConfigObjectNFS
.
nfs
,
...
...
@@ -71,13 +73,13 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig {
}
export
class
KubernetesClusterConfigAzure
extends
KubernetesClusterConfig
{
public
readonly
keyVault
:
k
eyVaultConfig
;
public
readonly
keyVault
:
K
eyVaultConfig
;
public
readonly
azureStorage
:
AzureStorage
;
constructor
(
apiVersion
:
string
,
keyVault
:
k
eyVaultConfig
,
azureStorage
:
AzureStorage
,
apiVersion
:
string
,
keyVault
:
K
eyVaultConfig
,
azureStorage
:
AzureStorage
,
storage
?:
KubernetesStorageKind
)
{
super
(
apiVersion
,
storage
);
...
...
@@ -85,12 +87,13 @@ export class KubernetesClusterConfigAzure extends KubernetesClusterConfig {
this
.
azureStorage
=
azureStorage
;
}
public
get
storageType
():
KubernetesStorageKind
{
public
get
storageType
():
KubernetesStorageKind
{
return
'
azureStorage
'
;
}
public
static
getInstance
(
jsonObject
:
object
):
KubernetesClusterConfigAzure
{
let
kubernetesClusterConfigObjectAzure
=
<
KubernetesClusterConfigAzure
>
jsonObject
;
const
kubernetesClusterConfigObjectAzure
:
KubernetesClusterConfigAzure
=
<
KubernetesClusterConfigAzure
>
jsonObject
;
return
new
KubernetesClusterConfigAzure
(
kubernetesClusterConfigObjectAzure
.
apiVersion
,
kubernetesClusterConfigObjectAzure
.
keyVault
,
...
...
@@ -100,17 +103,20 @@ export class KubernetesClusterConfigAzure extends KubernetesClusterConfig {
}
}
// tslint:disable-next-line:no-unnecessary-class
export
class
KubernetesClusterConfigFactory
{
public
static
generateKubernetesClusterConfig
(
jsonObject
:
object
):
KubernetesClusterConfig
{
let
s
torageConfig
=
<
StorageConfig
>
jsonObject
;
switch
(
storageConfig
.
storage
)
{
const
storageConfig
:
S
torageConfig
=
<
StorageConfig
>
jsonObject
;
switch
(
storageConfig
.
storage
)
{
case
'
azureStorage
'
:
return
KubernetesClusterConfigAzure
.
getInstance
(
jsonObject
);
case
'
nfs
'
||
undefined
:
case
'
nfs
'
:
case
undefined
:
return
KubernetesClusterConfigNFS
.
getInstance
(
jsonObject
);
default
:
throw
new
Error
(
`Invalid json object
${
jsonObject
}
`
);
}
throw
new
Error
(
`Invalid json object
${
jsonObject
}
`
);
}
}
...
...
@@ -118,9 +124,9 @@ export class KubernetesClusterConfigFactory {
* NFS configuration to store Kubeflow job related files
*/
export
class
NFSConfig
{
/
**
IP Adress of NFS server
*/
/
/
IP Adress of NFS server
public
readonly
server
:
string
;
/
**
exported NFS path on NFS server
*/
/
/
exported NFS path on NFS server
public
readonly
path
:
string
;
constructor
(
server
:
string
,
path
:
string
)
{
...
...
@@ -133,13 +139,13 @@ export class NFSConfig {
* KeyVault configuration to store the key of Azure Storage Service
* Refer https://docs.microsoft.com/en-us/azure/key-vault/key-vault-manage-with-cli2
*/
export
class
k
eyVaultConfig
{
/
**
The vault-name to specify vault
*/
export
class
K
eyVaultConfig
{
/
/
The vault-name to specify vault
public
readonly
vaultName
:
string
;
/
**
The name to specify private key
*/
/
/
The name to specify private key
public
readonly
name
:
string
;
constructor
(
vaultName
:
string
,
name
:
string
){
constructor
(
vaultName
:
string
,
name
:
string
)
{
this
.
vaultName
=
vaultName
;
this
.
name
=
name
;
}
...
...
@@ -149,12 +155,12 @@ export class keyVaultConfig {
* Azure Storage Service
*/
export
class
AzureStorage
{
/
**
The azure share to storage files
*/
/
/
The azure share to storage files
public
readonly
azureShare
:
string
;
/
**
The account name of sotrage service
*/
/
/
The account name of sotrage service
public
readonly
accountName
:
string
;
constructor
(
azureShare
:
string
,
accountName
:
string
){
constructor
(
azureShare
:
string
,
accountName
:
string
)
{
this
.
azureShare
=
azureShare
;
this
.
accountName
=
accountName
;
}
...
...
@@ -164,23 +170,23 @@ export class AzureStorage {
* Trial job configuration for Kubernetes
*/
export
class
KubernetesTrialConfigTemplate
{
/
**
CPU number
*/
/
/
CPU number
public
readonly
cpuNum
:
number
;
/
**
Memory
*/
/
/
Memory
public
readonly
memoryMB
:
number
;
/
**
Docker image
*/
/
/
Docker image
public
readonly
image
:
string
;
/
**
Trail command
*/
/
/
Trail command
public
readonly
command
:
string
;
/
**
Required GPU number for trial job. The number should be in [0,100]
*/
/
/
Required GPU number for trial job. The number should be in [0,100]
public
readonly
gpuNum
:
number
;
constructor
(
command
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
)
{
constructor
(
command
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
)
{
this
.
command
=
command
;
this
.
gpuNum
=
gpuNum
;
this
.
cpuNum
=
cpuNum
;
...
...
@@ -195,4 +201,4 @@ export class KubernetesTrialConfig {
constructor
(
codeDir
:
string
)
{
this
.
codeDir
=
codeDir
;
}
}
\ No newline at end of file
}
src/nni_manager/training_service/kubernetes/kubernetesData.ts
View file @
d48ad027
...
...
@@ -24,7 +24,6 @@ import { JobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../commo
/**
* KubeflowTrialJobDetail
*/
// tslint:disable-next-line:max-classes-per-file
export
class
KubernetesTrialJobDetail
implements
TrialJobDetail
{
public
id
:
string
;
public
status
:
TrialJobStatus
;
...
...
@@ -40,7 +39,7 @@ export class KubernetesTrialJobDetail implements TrialJobDetail {
public
queryJobFailedCount
:
number
;
constructor
(
id
:
string
,
status
:
TrialJobStatus
,
submitTime
:
number
,
workingDirectory
:
string
,
form
:
JobApplicationForm
,
workingDirectory
:
string
,
form
:
JobApplicationForm
,
kubernetesJobName
:
string
,
sequenceId
:
number
,
url
:
string
)
{
this
.
id
=
id
;
this
.
status
=
status
;
...
...
@@ -55,7 +54,7 @@ export class KubernetesTrialJobDetail implements TrialJobDetail {
}
}
export
const
K
ubernetesScriptFormat
=
export
const
k
ubernetesScriptFormat
:
string
=
`#!/bin/bash
export NNI_PLATFORM={0}
export NNI_SYS_DIR=$PWD/nni/{1}
...
...
@@ -71,5 +70,5 @@ mkdir -p $NNI_OUTPUT_DIR
cp -rT $NNI_CODE_DIR $NNI_SYS_DIR
cd $NNI_SYS_DIR
sh install_nni.sh
python3 -m nni_trial_tool.trial_keeper --trial_command '{8}' --nnimanager_ip {9} --nnimanager_port {10}
--nni_manager_version '{11}' --log_collection '{12}'`
+
`
1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr`
python3 -m nni_trial_tool.trial_keeper --trial_command '{8}' --nnimanager_ip {9} --nnimanager_port {10}
\
--nni_manager_version '{11}' --log_collection '{12}'
1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr`
;
src/nni_manager/training_service/kubernetes/kubernetesJobInfoCollector.ts
View file @
d48ad027
...
...
@@ -20,11 +20,10 @@
'
use strict
'
;
import
*
as
assert
from
'
assert
'
;
import
{
MethodNotImplementedError
,
NNIError
,
NNIErrorNames
}
from
'
../../common/errors
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
NNIError
,
NNIErrorNames
}
from
'
../../common/errors
'
;
import
{
TrialJobStatus
}
from
'
../../common/trainingService
'
;
import
{
KubernetesCRDClient
}
from
'
./kubernetesApiClient
'
;
import
{
MethodNotImplementedError
}
from
'
../../common/errors
'
;
import
{
KubernetesTrialJobDetail
}
from
'
./kubernetesData
'
;
/**
...
...
@@ -43,22 +42,22 @@ export class KubernetesJobInfoCollector {
public
async
retrieveTrialStatus
(
kubernetesCRDClient
:
KubernetesCRDClient
|
undefined
)
:
Promise
<
void
>
{
assert
(
kubernetesCRDClient
!==
undefined
);
const
updateKubernetesTrialJobs
:
Promise
<
void
>
[]
=
[];
for
(
le
t
[
trialJobId
,
kubernetesTrialJob
]
of
this
.
trialJobsMap
)
{
if
(
!
kubernetesTrialJob
)
{
for
(
cons
t
[
trialJobId
,
kubernetesTrialJob
]
of
this
.
trialJobsMap
)
{
if
(
kubernetesTrialJob
===
undefined
)
{
throw
new
NNIError
(
NNIErrorNames
.
NOT_FOUND
,
`trial job id
${
trialJobId
}
not found`
);
}
// Since Kubeflow needs some delay to schedule jobs, we provide 20 seconds buffer time to check kubeflow job's status
if
(
Date
.
now
()
-
kubernetesTrialJob
.
submitTime
<
20
*
1000
)
{
if
(
Date
.
now
()
-
kubernetesTrialJob
.
submitTime
<
20
*
1000
)
{
return
Promise
.
resolve
();
}
updateKubernetesTrialJobs
.
push
(
this
.
retrieveSingleTrialJobInfo
(
kubernetesCRDClient
,
kubernetesTrialJob
))
updateKubernetesTrialJobs
.
push
(
this
.
retrieveSingleTrialJobInfo
(
kubernetesCRDClient
,
kubernetesTrialJob
))
;
}
await
Promise
.
all
(
updateKubernetesTrialJobs
);
}
protected
async
retrieveSingleTrialJobInfo
(
kubernetesCRDClient
:
KubernetesCRDClient
|
undefined
,
kubernetesTrialJob
:
KubernetesTrialJobDetail
)
:
Promise
<
void
>
{
protected
async
retrieveSingleTrialJobInfo
(
kubernetesCRDClient
:
KubernetesCRDClient
|
undefined
,
kubernetesTrialJob
:
KubernetesTrialJobDetail
)
:
Promise
<
void
>
{
throw
new
MethodNotImplementedError
();
}
}
\ No newline at end of file
}
src/nni_manager/training_service/kubernetes/kubernetesJobRestServer.ts
View file @
d48ad027
...
...
@@ -19,19 +19,19 @@
'
use strict
'
;
import
*
as
component
from
'
../../common/component
'
;
import
{
Inject
}
from
'
typescript-ioc
'
;
import
*
as
component
from
'
../../common/component
'
;
import
{
ClusterJobRestServer
}
from
'
../common/clusterJobRestServer
'
;
import
{
KubernetesTrainingService
}
from
'
./kubernetesTrainingService
'
;
import
{
ClusterJobRestServer
}
from
'
../common/clusterJobRestServer
'
/**
* Kubeflow Training service Rest server, provides rest API to support kubeflow job metrics update
*
*
*/
@
component
.
Singleton
export
class
KubernetesJobRestServer
extends
ClusterJobRestServer
{
export
class
KubernetesJobRestServer
extends
ClusterJobRestServer
{
@
Inject
private
kubernetesTrainingService
?
:
KubernetesTrainingService
;
private
readonly
kubernetesTrainingService
?
:
KubernetesTrainingService
;
/**
* constructor to provide NNIRestServer's own rest property, e.g. port
...
...
@@ -41,8 +41,9 @@ export class KubernetesJobRestServer extends ClusterJobRestServer{
this
.
kubernetesTrainingService
=
kubernetesTrainingService
;
}
// tslint:disable-next-line:no-any
protected
handleTrialMetrics
(
jobId
:
string
,
metrics
:
any
[])
:
void
{
if
(
!
this
.
kubernetesTrainingService
)
{
if
(
this
.
kubernetesTrainingService
===
undefined
)
{
throw
Error
(
'
kubernetesTrainingService not initialized!
'
);
}
// Split metrics array into single metric, then emit
...
...
@@ -53,5 +54,5 @@ export class KubernetesJobRestServer extends ClusterJobRestServer{
data
:
singleMetric
});
}
}
}
\ No newline at end of file
}
}
src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts
View file @
d48ad027
...
...
@@ -17,35 +17,36 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'
use strict
'
'
use strict
'
;
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
path
from
'
path
'
;
import
*
as
azureStorage
from
'
azure-storage
'
;
import
{
EventEmitter
}
from
'
events
'
;
import
{
Base64
}
from
'
js-base64
'
;
import
{
String
}
from
'
typescript-string-operations
'
;
import
{
getExperimentId
,
getInitTrialSequenceId
}
from
'
../../common/experimentStartupInfo
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
getExperimentRootDir
,
uniqueString
,
getJobCancelStatus
,
getIPV4Address
,
getVersion
}
from
'
../../common/utils
'
;
import
{
TrialJobDetail
,
TrialJobMetric
,
NNIManagerIpConfig
NNIManagerIpConfig
,
TrialJobDetail
,
TrialJobMetric
}
from
'
../../common/trainingService
'
;
import
{
KubernetesTrialJobDetail
,
KubernetesScriptFormat
}
from
'
./kubernetesData
'
;
import
{
KubernetesClusterConfig
}
from
'
./kubernetesConfig
'
;
import
{
GeneralK8sClient
,
KubernetesCRDClient
}
from
'
./kubernetesApiClient
'
;
import
{
getExperimentRootDir
,
getIPV4Address
,
getJobCancelStatus
,
getVersion
,
uniqueString
}
from
'
../../common/utils
'
;
import
{
AzureStorageClientUtility
}
from
'
./azureStorageClientUtils
'
;
import
{
GeneralK8sClient
,
KubernetesCRDClient
}
from
'
./kubernetesApiClient
'
;
import
{
KubernetesClusterConfig
}
from
'
./kubernetesConfig
'
;
import
{
kubernetesScriptFormat
,
KubernetesTrialJobDetail
}
from
'
./kubernetesData
'
;
import
{
KubernetesJobRestServer
}
from
'
./kubernetesJobRestServer
'
;
import
{
String
}
from
'
typescript-string-operations
'
;
import
*
as
azureStorage
from
'
azure-storage
'
;
var
azure
=
require
(
'
azure-storage
'
);
var
base64
=
require
(
'
js-base64
'
).
Base64
;
/**
* Training Service implementation for Kubernetes
*/
abstract
class
KubernetesTrainingService
{
protected
readonly
NNI_KUBERNETES_TRIAL_LABEL
:
string
=
'
nni-kubernetes-trial
'
;
protected
readonly
log
!
:
Logger
;
protected
readonly
metricsEmitter
:
EventEmitter
;
protected
readonly
trialJobsMap
:
Map
<
string
,
KubernetesTrialJobDetail
>
;
/
**
experiment root dir in NFS
*/
/
/
experiment root dir in NFS
protected
readonly
trialLocalNFSTempFolder
:
string
;
protected
stopping
:
boolean
=
false
;
protected
experimentId
!
:
string
;
...
...
@@ -63,35 +64,36 @@ abstract class KubernetesTrainingService {
protected
kubernetesClusterConfig
?:
KubernetesClusterConfig
;
protected
versionCheck
:
boolean
=
true
;
protected
logCollection
:
string
;
constructor
()
{
this
.
log
=
getLogger
();
this
.
metricsEmitter
=
new
EventEmitter
();
this
.
trialJobsMap
=
new
Map
<
string
,
KubernetesTrialJobDetail
>
();
this
.
trialLocalNFSTempFolder
=
path
.
join
(
getExperimentRootDir
(),
'
trials-nfs-tmp
'
);
this
.
experimentId
=
getExperimentId
();
this
.
experimentId
=
getExperimentId
();
this
.
nextTrialSequenceId
=
-
1
;
this
.
CONTAINER_MOUNT_PATH
=
'
/tmp/mount
'
;
this
.
genericK8sClient
=
new
GeneralK8sClient
();
this
.
logCollection
=
'
none
'
;
}
public
generatePodResource
(
memory
:
number
,
cpuNum
:
number
,
gpuNum
:
number
)
{
// tslint:disable:no-any
public
generatePodResource
(
memory
:
number
,
cpuNum
:
number
,
gpuNum
:
number
):
any
{
return
{
'
memory
'
:
`
${
memory
}
Mi`
,
'
cpu
'
:
`
${
cpuNum
}
`
,
memory
:
`
${
memory
}
Mi`
,
cpu
:
`
${
cpuNum
}
`
,
'
nvidia.com/gpu
'
:
`
${
gpuNum
}
`
}
}
}
;
}
// tslint:enable:no-any
public
async
listTrialJobs
():
Promise
<
TrialJobDetail
[]
>
{
const
jobs
:
TrialJobDetail
[]
=
[];
for
(
const
[
key
,
value
]
of
this
.
trialJobsMap
)
{
for
(
const
[
key
,
value
]
of
this
.
trialJobsMap
)
{
if
(
value
.
form
.
jobType
===
'
TRIAL
'
)
{
jobs
.
push
(
await
this
.
getTrialJob
(
key
));
}
}
;
}
return
Promise
.
resolve
(
jobs
);
}
...
...
@@ -100,21 +102,21 @@ abstract class KubernetesTrainingService {
const
kubernetesTrialJob
:
TrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
if
(
!
kubernetesTrialJob
)
{
return
Promise
.
reject
(
`trial job
${
trialJobId
}
not found`
)
}
if
(
kubernetesTrialJob
===
undefined
)
{
return
Promise
.
reject
(
`trial job
${
trialJobId
}
not found`
)
;
}
return
Promise
.
resolve
(
kubernetesTrialJob
);
}
public
addTrialJobMetricListener
(
listener
:
(
metric
:
TrialJobMetric
)
=>
void
)
{
public
addTrialJobMetricListener
(
listener
:
(
metric
:
TrialJobMetric
)
=>
void
)
:
void
{
this
.
metricsEmitter
.
on
(
'
metric
'
,
listener
);
}
public
removeTrialJobMetricListener
(
listener
:
(
metric
:
TrialJobMetric
)
=>
void
)
{
public
removeTrialJobMetricListener
(
listener
:
(
metric
:
TrialJobMetric
)
=>
void
)
:
void
{
this
.
metricsEmitter
.
off
(
'
metric
'
,
listener
);
}
public
get
isMultiPhaseJobSupported
():
boolean
{
return
false
;
}
...
...
@@ -127,6 +129,96 @@ abstract class KubernetesTrainingService {
return
this
.
metricsEmitter
;
}
public
async
cancelTrialJob
(
trialJobId
:
string
,
isEarlyStopped
:
boolean
=
false
):
Promise
<
void
>
{
const
trialJobDetail
:
KubernetesTrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
if
(
trialJobDetail
===
undefined
)
{
const
errorMessage
:
string
=
`CancelTrialJob: trial job id
${
trialJobId
}
not found`
;
this
.
log
.
error
(
errorMessage
);
return
Promise
.
reject
(
errorMessage
);
}
if
(
this
.
kubernetesCRDClient
===
undefined
)
{
const
errorMessage
:
string
=
`CancelTrialJob: trial job id
${
trialJobId
}
failed because operatorClient is undefined`
;
this
.
log
.
error
(
errorMessage
);
return
Promise
.
reject
(
errorMessage
);
}
try
{
await
this
.
kubernetesCRDClient
.
deleteKubernetesJob
(
new
Map
(
[
[
'
app
'
,
this
.
NNI_KUBERNETES_TRIAL_LABEL
],
[
'
expId
'
,
getExperimentId
()],
[
'
trialId
'
,
trialJobId
]
]
));
}
catch
(
err
)
{
const
errorMessage
:
string
=
`Delete trial
${
trialJobId
}
failed:
${
err
}
`
;
this
.
log
.
error
(
errorMessage
);
return
Promise
.
reject
(
errorMessage
);
}
trialJobDetail
.
endTime
=
Date
.
now
();
trialJobDetail
.
status
=
getJobCancelStatus
(
isEarlyStopped
);
return
Promise
.
resolve
();
}
public
async
cleanUp
():
Promise
<
void
>
{
this
.
stopping
=
true
;
// First, cancel all running kubernetes jobs
for
(
const
[
trialJobId
,
kubernetesTrialJob
]
of
this
.
trialJobsMap
)
{
if
([
'
RUNNING
'
,
'
WAITING
'
,
'
UNKNOWN
'
].
includes
(
kubernetesTrialJob
.
status
))
{
try
{
await
this
.
cancelTrialJob
(
trialJobId
);
}
catch
(
error
)
{
// DONT throw error during cleanup
}
kubernetesTrialJob
.
status
=
'
SYS_CANCELED
'
;
}
}
// Delete all kubernetes jobs whose expId label is current experiment id
try
{
if
(
this
.
kubernetesCRDClient
!==
undefined
)
{
await
this
.
kubernetesCRDClient
.
deleteKubernetesJob
(
new
Map
(
[
[
'
app
'
,
this
.
NNI_KUBERNETES_TRIAL_LABEL
],
[
'
expId
'
,
getExperimentId
()]
]
));
}
}
catch
(
error
)
{
this
.
log
.
error
(
`Delete kubernetes job with label: app=
${
this
.
NNI_KUBERNETES_TRIAL_LABEL
}
,\
expId=
${
getExperimentId
()}
failed, error is
${
error
}
`
);
}
// Unmount NFS
try
{
await
cpp
.
exec
(
`sudo umount
${
this
.
trialLocalNFSTempFolder
}
`
);
}
catch
(
error
)
{
this
.
log
.
error
(
`Unmount
${
this
.
trialLocalNFSTempFolder
}
failed, error is
${
error
}
`
);
}
// Stop kubernetes rest server
if
(
this
.
kubernetesJobRestServer
===
undefined
)
{
throw
new
Error
(
'
kubernetesJobRestServer not initialized!
'
);
}
try
{
await
this
.
kubernetesJobRestServer
.
stop
();
this
.
log
.
info
(
'
Kubernetes Training service rest server stopped successfully.
'
);
}
catch
(
error
)
{
// tslint:disable-next-line: no-unsafe-any
this
.
log
.
error
(
`Kubernetes Training service rest server stopped failed, error:
${
error
.
message
}
`
);
return
Promise
.
reject
(
error
);
}
return
Promise
.
resolve
();
}
protected
generateSequenceId
():
number
{
if
(
this
.
nextTrialSequenceId
===
-
1
)
{
this
.
nextTrialSequenceId
=
getInitTrialSequenceId
();
...
...
@@ -135,25 +227,31 @@ abstract class KubernetesTrainingService {
return
this
.
nextTrialSequenceId
++
;
}
// tslint:disable: no-unsafe-any no-any
protected
async
createAzureStorage
(
vaultName
:
string
,
valutKeyName
:
string
,
accountName
:
string
,
azureShare
:
string
):
Promise
<
void
>
{
try
{
const
result
=
await
cpp
.
exec
(
`az keyvault secret show --name
${
valutKeyName
}
--vault-name
${
vaultName
}
`
);
if
(
result
.
stderr
)
{
const
result
:
any
=
await
cpp
.
exec
(
`az keyvault secret show --name
${
valutKeyName
}
--vault-name
${
vaultName
}
`
);
if
(
result
.
stderr
)
{
const
errorMessage
:
string
=
result
.
stderr
;
this
.
log
.
error
(
errorMessage
);
return
Promise
.
reject
(
errorMessage
);
}
const
storageAccountKey
=
JSON
.
parse
(
result
.
stdout
).
value
;
const
storageAccountKey
:
any
=
JSON
.
parse
(
result
.
stdout
).
value
;
if
(
this
.
azureStorageAccountName
===
undefined
)
{
throw
new
Error
(
'
azureStorageAccountName not initialized!
'
);
}
//create storage client
this
.
azureStorageClient
=
azure
.
createFileService
(
this
.
azureStorageAccountName
,
storageAccountKey
);
this
.
azureStorageClient
=
azure
Storage
.
createFileService
(
this
.
azureStorageAccountName
,
storageAccountKey
);
await
AzureStorageClientUtility
.
createShare
(
this
.
azureStorageClient
,
this
.
azureStorageShare
);
//create sotrage secret
this
.
azureStorageSecretName
=
'
nni-secret-
'
+
uniqueString
(
8
).
toLowerCase
();
this
.
azureStorageSecretName
=
String
.
Format
(
'
nni-secret-{0}
'
,
uniqueString
(
8
)
.
toLowerCase
());
await
this
.
genericK8sClient
.
createSecret
(
{
apiVersion
:
'
v1
'
,
kind
:
'
Secret
'
,
metadata
:
{
metadata
:
{
name
:
this
.
azureStorageSecretName
,
namespace
:
'
default
'
,
labels
:
{
...
...
@@ -163,38 +261,42 @@ abstract class KubernetesTrainingService {
},
type
:
'
Opaque
'
,
data
:
{
azurestorageaccountname
:
b
ase64
.
encode
(
this
.
azureStorageAccountName
),
azurestorageaccountkey
:
b
ase64
.
encode
(
storageAccountKey
)
azurestorageaccountname
:
B
ase64
.
encode
(
this
.
azureStorageAccountName
),
azurestorageaccountkey
:
B
ase64
.
encode
(
storageAccountKey
)
}
}
);
}
catch
(
error
)
{
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
return
Promise
.
reject
(
error
);
}
return
Promise
.
resolve
();
}
/**
// tslint:enable: no-unsafe-any no-any
/**
* Genereate run script for different roles(like worker or ps)
* @param trialJobId trial job id
* @param trialWorkingFolder working folder
* @param command
* @param command
command
* @param trialSequenceId sequence id
*/
protected
async
generateRunScript
(
platform
:
string
,
trialJobId
:
string
,
trialWorkingFolder
:
string
,
command
:
string
,
trialSequenceId
:
string
,
roleName
:
string
,
gpuNum
:
number
):
Promise
<
string
>
{
let
nvidia
_s
cript
:
string
=
''
;
protected
async
generateRunScript
(
platform
:
string
,
trialJobId
:
string
,
trialWorkingFolder
:
string
,
command
:
string
,
trialSequenceId
:
string
,
roleName
:
string
,
gpuNum
:
number
):
Promise
<
string
>
{
let
nvidia
S
cript
:
string
=
''
;
// Nvidia devcie plugin for K8S has a known issue that requesting zero GPUs allocates all GPUs
// Refer https://github.com/NVIDIA/k8s-device-plugin/issues/61
// So we have to explicitly set CUDA_VISIBLE_DEVICES to empty if user sets gpuNum to 0 in NNI config file
if
(
gpuNum
===
0
)
{
nvidia
_s
cript
=
`export CUDA_VISIBLE_DEVICES='0'`
;
if
(
gpuNum
===
0
)
{
nvidia
S
cript
=
`export CUDA_VISIBLE_DEVICES='0'`
;
}
const
nniManagerIp
=
this
.
nniManagerIpConfig
?
this
.
nniManagerIpConfig
.
nniManagerIp
:
getIPV4Address
();
const
version
=
this
.
versionCheck
?
await
getVersion
():
''
;
// tslint:disable-next-line: strict-boolean-expressions
const
nniManagerIp
:
string
=
this
.
nniManagerIpConfig
?
this
.
nniManagerIpConfig
.
nniManagerIp
:
getIPV4Address
();
const
version
:
string
=
this
.
versionCheck
?
await
getVersion
()
:
''
;
const
runScript
:
string
=
String
.
Format
(
K
ubernetesScriptFormat
,
k
ubernetesScriptFormat
,
platform
,
trialJobId
,
path
.
join
(
trialWorkingFolder
,
'
output
'
,
`
${
roleName
}
_output`
),
...
...
@@ -202,108 +304,28 @@ abstract class KubernetesTrainingService {
getExperimentId
(),
trialWorkingFolder
,
trialSequenceId
,
nvidia
_s
cript
,
nvidia
S
cript
,
command
,
nniManagerIp
,
this
.
kubernetesRestServerPort
,
version
,
this
.
logCollection
);
return
Promise
.
resolve
(
runScript
);
}
protected
async
createNFSStorage
(
nfsServer
:
string
,
nfsPath
:
string
):
Promise
<
void
>
{
await
cpp
.
exec
(
`mkdir -p
${
this
.
trialLocalNFSTempFolder
}
`
);
try
{
await
cpp
.
exec
(
`sudo mount
${
nfsServer
}
:
${
nfsPath
}
${
this
.
trialLocalNFSTempFolder
}
`
);
}
catch
(
error
)
{
}
catch
(
error
)
{
const
mountError
:
string
=
`Mount NFS
${
nfsServer
}
:
${
nfsPath
}
to
${
this
.
trialLocalNFSTempFolder
}
failed, error is
${
error
}
`
;
this
.
log
.
error
(
mountError
);
return
Promise
.
reject
(
mountError
);
}
return
Promise
.
resolve
();
}
public
async
cancelTrialJob
(
trialJobId
:
string
,
isEarlyStopped
:
boolean
=
false
):
Promise
<
void
>
{
const
trialJobDetail
:
KubernetesTrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
if
(
!
trialJobDetail
)
{
const
errorMessage
:
string
=
`CancelTrialJob: trial job id
${
trialJobId
}
not found`
;
this
.
log
.
error
(
errorMessage
);
return
Promise
.
reject
(
errorMessage
);
}
if
(
!
this
.
kubernetesCRDClient
)
{
const
errorMessage
:
string
=
`CancelTrialJob: trial job id
${
trialJobId
}
failed because operatorClient is undefined`
;
this
.
log
.
error
(
errorMessage
);
return
Promise
.
reject
(
errorMessage
);
}
try
{
await
this
.
kubernetesCRDClient
.
deleteKubernetesJob
(
new
Map
(
[
[
'
app
'
,
this
.
NNI_KUBERNETES_TRIAL_LABEL
],
[
'
expId
'
,
getExperimentId
()],
[
'
trialId
'
,
trialJobId
]
]
));
}
catch
(
err
)
{
const
errorMessage
:
string
=
`Delete trial
${
trialJobId
}
failed:
${
err
}
`
;
this
.
log
.
error
(
errorMessage
);
return
Promise
.
reject
(
errorMessage
);
}
trialJobDetail
.
endTime
=
Date
.
now
();
trialJobDetail
.
status
=
getJobCancelStatus
(
isEarlyStopped
);
return
Promise
.
resolve
();
}
public
async
cleanUp
():
Promise
<
void
>
{
this
.
stopping
=
true
;
// First, cancel all running kubernetes jobs
for
(
let
[
trialJobId
,
kubernetesTrialJob
]
of
this
.
trialJobsMap
)
{
if
([
'
RUNNING
'
,
'
WAITING
'
,
'
UNKNOWN
'
].
includes
(
kubernetesTrialJob
.
status
))
{
try
{
await
this
.
cancelTrialJob
(
trialJobId
);
}
catch
(
error
)
{}
// DONT throw error during cleanup
kubernetesTrialJob
.
status
=
'
SYS_CANCELED
'
;
}
}
// Delete all kubernetes jobs whose expId label is current experiment id
try
{
if
(
this
.
kubernetesCRDClient
)
{
await
this
.
kubernetesCRDClient
.
deleteKubernetesJob
(
new
Map
(
[
[
'
app
'
,
this
.
NNI_KUBERNETES_TRIAL_LABEL
],
[
'
expId
'
,
getExperimentId
()]
]
));
}
}
catch
(
error
)
{
this
.
log
.
error
(
`Delete kubernetes job with label: app=
${
this
.
NNI_KUBERNETES_TRIAL_LABEL
}
,expId=
${
getExperimentId
()}
failed, error is
${
error
}
`
);
}
// Unmount NFS
try
{
await
cpp
.
exec
(
`sudo umount
${
this
.
trialLocalNFSTempFolder
}
`
);
}
catch
(
error
)
{
this
.
log
.
error
(
`Unmount
${
this
.
trialLocalNFSTempFolder
}
failed, error is
${
error
}
`
);
}
// Stop kubernetes rest server
if
(
!
this
.
kubernetesJobRestServer
)
{
throw
new
Error
(
'
kubernetesJobRestServer not initialized!
'
);
}
try
{
await
this
.
kubernetesJobRestServer
.
stop
();
this
.
log
.
info
(
'
Kubernetes Training service rest server stopped successfully.
'
);
}
catch
(
error
)
{
this
.
log
.
error
(
`Kubernetes Training service rest server stopped failed, error:
${
error
.
message
}
`
);
Promise
.
reject
(
error
);
return
Promise
.
reject
(
mountError
);
}
return
Promise
.
resolve
();
}
}
export
{
KubernetesTrainingService
}
export
{
KubernetesTrainingService
};
src/nni_manager/training_service/local/gpuScheduler.ts
View file @
d48ad027
...
...
@@ -25,10 +25,10 @@ import * as fs from 'fs';
import
*
as
os
from
'
os
'
;
import
*
as
path
from
'
path
'
;
import
{
String
}
from
'
typescript-string-operations
'
;
import
{
execMkdir
,
getScriptName
,
getgpuMetricsCollectorScriptContent
,
execScript
,
execTail
,
execRemove
,
execKill
}
from
'
../common/util
'
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
delay
}
from
'
../../common/utils
'
;
import
{
GPUInfo
,
GPUSummary
}
from
'
../common/gpuData
'
;
import
{
execKill
,
execMkdir
,
execRemove
,
execTail
,
getgpuMetricsCollectorScriptContent
,
getScriptName
,
runScript
}
from
'
../common/util
'
;
/**
* GPUScheduler for local training service
...
...
@@ -37,8 +37,8 @@ class GPUScheduler {
private
gpuSummary
!
:
GPUSummary
;
private
stopping
:
boolean
;
private
log
:
Logger
;
private
gpuMetricCollectorScriptFolder
:
string
;
private
readonly
log
:
Logger
;
private
readonly
gpuMetricCollectorScriptFolder
:
string
;
constructor
()
{
this
.
stopping
=
false
;
...
...
@@ -58,28 +58,15 @@ class GPUScheduler {
}
}
/**
* Generate gpu metric collector shell script in local machine,
* used to run in remote machine, and will be deleted after uploaded from local.
*/
private
async
runGpuMetricsCollectorScript
():
Promise
<
void
>
{
await
execMkdir
(
this
.
gpuMetricCollectorScriptFolder
);
//generate gpu_metrics_collector script
let
gpuMetricsCollectorScriptPath
:
string
=
path
.
join
(
this
.
gpuMetricCollectorScriptFolder
,
getScriptName
(
'
gpu_metrics_collector
'
));
const
gpuMetricsCollectorScriptContent
:
string
=
getgpuMetricsCollectorScriptContent
(
this
.
gpuMetricCollectorScriptFolder
);
await
fs
.
promises
.
writeFile
(
gpuMetricsCollectorScriptPath
,
gpuMetricsCollectorScriptContent
,
{
encoding
:
'
utf8
'
});
execScript
(
gpuMetricsCollectorScriptPath
)
}
public
getAvailableGPUIndices
(
useActiveGpu
:
boolean
,
occupiedGpuIndexNumMap
:
Map
<
number
,
number
>
):
number
[]
{
if
(
this
.
gpuSummary
!==
undefined
)
{
if
(
process
.
platform
===
'
win32
'
||
useActiveGpu
)
{
if
(
process
.
platform
===
'
win32
'
||
useActiveGpu
)
{
return
this
.
gpuSummary
.
gpuInfos
.
map
((
info
:
GPUInfo
)
=>
info
.
index
);
}
else
{
return
this
.
gpuSummary
.
gpuInfos
.
filter
((
info
:
GPUInfo
)
=>
occupiedGpuIndexNumMap
.
get
(
info
.
index
)
=
==
undefined
&&
info
.
activeProcessNum
===
0
||
occupiedGpuIndexNumMap
.
get
(
info
.
index
)
!==
undefined
)
.
map
((
info
:
GPUInfo
)
=>
info
.
index
);
}
else
{
return
this
.
gpuSummary
.
gpuInfos
.
filter
((
info
:
GPUInfo
)
=>
occupiedGpuIndexNumMap
.
get
(
info
.
index
)
===
undefined
&&
info
.
activeProcessNum
===
0
||
occupiedGpuIndexNumMap
.
get
(
info
.
index
)
!
==
undefined
)
.
map
((
info
:
GPUInfo
)
=>
info
.
index
);
}
}
...
...
@@ -105,17 +92,32 @@ class GPUScheduler {
}
}
/**
* Generate gpu metric collector shell script in local machine,
* used to run in remote machine, and will be deleted after uploaded from local.
*/
private
async
runGpuMetricsCollectorScript
():
Promise
<
void
>
{
await
execMkdir
(
this
.
gpuMetricCollectorScriptFolder
);
//generate gpu_metrics_collector script
const
gpuMetricsCollectorScriptPath
:
string
=
path
.
join
(
this
.
gpuMetricCollectorScriptFolder
,
getScriptName
(
'
gpu_metrics_collector
'
));
const
gpuMetricsCollectorScriptContent
:
string
=
getgpuMetricsCollectorScriptContent
(
this
.
gpuMetricCollectorScriptFolder
);
await
fs
.
promises
.
writeFile
(
gpuMetricsCollectorScriptPath
,
gpuMetricsCollectorScriptContent
,
{
encoding
:
'
utf8
'
});
runScript
(
gpuMetricsCollectorScriptPath
);
}
// tslint:disable:non-literal-fs-path
private
async
updateGPUSummary
():
Promise
<
void
>
{
le
t
gpuMetricPath
=
path
.
join
(
this
.
gpuMetricCollectorScriptFolder
,
'
gpu_metrics
'
);
cons
t
gpuMetricPath
:
string
=
path
.
join
(
this
.
gpuMetricCollectorScriptFolder
,
'
gpu_metrics
'
);
if
(
fs
.
existsSync
(
gpuMetricPath
))
{
const
cmdresult
:
cpp
.
childProcessPromise
.
Result
=
await
execTail
(
gpuMetricPath
);
if
(
cmdresult
&&
cmdresult
.
stdout
)
{
if
(
cmdresult
!==
undefined
&&
cmdresult
.
stdout
!==
undefined
)
{
this
.
gpuSummary
=
<
GPUSummary
>
JSON
.
parse
(
cmdresult
.
stdout
);
}
else
{
this
.
log
.
error
(
'
Could not get gpu metrics information!
'
);
}
}
else
{
this
.
log
.
warning
(
'
gpu_metrics file does not exist!
'
)
}
else
{
this
.
log
.
warning
(
'
gpu_metrics file does not exist!
'
)
;
}
}
}
...
...
src/nni_manager/training_service/local/localTrainingService.ts
View file @
d48ad027
...
...
@@ -24,6 +24,7 @@ import { EventEmitter } from 'events';
import
*
as
fs
from
'
fs
'
;
import
*
as
path
from
'
path
'
;
import
*
as
ts
from
'
tail-stream
'
;
import
*
as
tkill
from
'
tree-kill
'
;
import
{
NNIError
,
NNIErrorNames
}
from
'
../../common/errors
'
;
import
{
getInitTrialSequenceId
}
from
'
../../common/experimentStartupInfo
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
...
...
@@ -31,14 +32,14 @@ import {
HostJobApplicationForm
,
HyperParameters
,
JobApplicationForm
,
TrainingService
,
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobMetric
,
TrialJobStatus
}
from
'
../../common/trainingService
'
;
import
{
delay
,
generateParamFileName
,
getExperimentRootDir
,
getJobCancelStatus
,
uniqueString
,
isAlive
,
getNewLine
}
from
'
../../common/utils
'
;
import
{
execMkdir
,
getScriptName
,
execScript
,
setEnvironmentVariable
,
execNewFile
}
from
'
../common/util
'
import
{
delay
,
generateParamFileName
,
getExperimentRootDir
,
getJobCancelStatus
,
getNewLine
,
isAlive
,
uniqueString
}
from
'
../../common/utils
'
;
import
{
TrialConfig
}
from
'
../common/trialConfig
'
;
import
{
TrialConfigMetadataKey
}
from
'
../common/trialConfigMetadataKey
'
;
import
{
execMkdir
,
execNewFile
,
getScriptName
,
runScript
,
setEnvironmentVariable
}
from
'
../common/util
'
;
import
{
GPUScheduler
}
from
'
./gpuScheduler
'
;
const
tkill
=
require
(
'
tree-kill
'
);
/**
* Decode a command
* @param Buffer binary incoming data
...
...
@@ -46,7 +47,7 @@ const tkill = require('tree-kill');
* success: true if the buffer contains at least one complete command; otherwise false
* remain: remaining data after the first command
*/
// tslint:disable
-next-line:
informative-docs
// tslint:disable
:newline-per-chained-call
informative-docs
function
decodeCommand
(
data
:
Buffer
):
[
boolean
,
string
,
string
,
Buffer
]
{
if
(
data
.
length
<
8
)
{
return
[
false
,
''
,
''
,
data
];
...
...
@@ -61,6 +62,7 @@ function decodeCommand(data: Buffer): [boolean, string, string, Buffer] {
return
[
true
,
commandType
,
content
,
remain
];
}
// tslint:enable:newline-per-chained-call informative-docs
/**
* LocalTrialJobDetail
...
...
@@ -117,21 +119,21 @@ class LocalConfig {
* Local machine training service
*/
class
LocalTrainingService
implements
TrainingService
{
private
eventEmitter
:
EventEmitter
;
private
jobMap
:
Map
<
string
,
LocalTrialJobDetail
>
;
private
jobQueue
:
string
[];
private
readonly
eventEmitter
:
EventEmitter
;
private
readonly
jobMap
:
Map
<
string
,
LocalTrialJobDetail
>
;
private
readonly
jobQueue
:
string
[];
private
initialized
:
boolean
;
private
stopping
:
boolean
;
private
rootDir
!
:
string
;
private
trialSequenceId
:
number
;
private
gpuScheduler
!
:
GPUScheduler
;
private
occupiedGpuIndexNumMap
:
Map
<
number
,
number
>
;
private
readonly
occupiedGpuIndexNumMap
:
Map
<
number
,
number
>
;
private
designatedGpuIndices
!
:
Set
<
number
>
;
private
log
:
Logger
;
private
readonly
log
:
Logger
;
private
localTrailConfig
?:
TrialConfig
;
private
localConfig
?:
LocalConfig
;
private
isMultiPhase
:
boolean
;
private
jobStreamMap
:
Map
<
string
,
ts
.
Stream
>
;
private
readonly
jobStreamMap
:
Map
<
string
,
ts
.
Stream
>
;
private
maxTrialNumPerGpu
:
number
;
private
useActiveGpu
:
boolean
;
...
...
@@ -182,7 +184,7 @@ class LocalTrainingService implements TrainingService {
return
this
.
getHostJob
(
trialJobId
);
}
if
(
trialJob
.
status
===
'
RUNNING
'
)
{
le
t
alive
:
boolean
=
await
isAlive
(
trialJob
.
pid
);
cons
t
alive
:
boolean
=
await
isAlive
(
trialJob
.
pid
);
if
(
!
alive
)
{
trialJob
.
endTime
=
Date
.
now
();
this
.
setTrialJobStatus
(
trialJob
,
'
FAILED
'
);
...
...
@@ -276,7 +278,7 @@ class LocalTrainingService implements TrainingService {
return
Promise
.
resolve
();
}
if
(
trialJob
.
form
.
jobType
===
'
TRIAL
'
)
{
await
tkill
(
trialJob
.
pid
,
'
SIGKILL
'
);
tkill
(
trialJob
.
pid
,
'
SIGKILL
'
);
}
else
if
(
trialJob
.
form
.
jobType
===
'
HOST
'
)
{
await
cpp
.
exec
(
`pkill -9 -P
${
trialJob
.
pid
}
`
);
}
else
{
...
...
@@ -290,7 +292,8 @@ class LocalTrainingService implements TrainingService {
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
if
(
!
this
.
initialized
)
{
this
.
rootDir
=
getExperimentRootDir
();
if
(
!
fs
.
existsSync
(
this
.
rootDir
)){
// tslint:disable-next-line:non-literal-fs-path
if
(
!
fs
.
existsSync
(
this
.
rootDir
))
{
await
cpp
.
exec
(
`powershell.exe mkdir
${
this
.
rootDir
}
`
);
}
this
.
initialized
=
true
;
...
...
@@ -299,7 +302,7 @@ class LocalTrainingService implements TrainingService {
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
this
.
localTrailConfig
=
<
TrialConfig
>
JSON
.
parse
(
value
);
// Parse trial config failed, throw Error
if
(
!
this
.
localTrailConfig
)
{
if
(
this
.
localTrailConfig
===
undefined
)
{
throw
new
Error
(
'
trial config parsed failed
'
);
}
this
.
log
.
info
(
`required GPU number is
${
this
.
localTrailConfig
.
gpuNum
}
`
);
...
...
@@ -336,10 +339,10 @@ class LocalTrainingService implements TrainingService {
switch
(
key
)
{
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
let
getResult
:
Promise
<
string
>
;
if
(
!
this
.
localTrailConfig
)
{
if
(
this
.
localTrailConfig
===
undefined
)
{
getResult
=
Promise
.
reject
(
new
NNIError
(
NNIErrorNames
.
NOT_FOUND
,
`
${
key
}
is never set yet`
));
}
else
{
getResult
=
Promise
.
resolve
(
!
this
.
localTrailConfig
?
''
:
JSON
.
stringify
(
this
.
localTrailConfig
));
getResult
=
Promise
.
resolve
(
JSON
.
stringify
(
this
.
localTrailConfig
));
}
return
getResult
;
...
...
@@ -366,7 +369,7 @@ class LocalTrainingService implements TrainingService {
if
([
'
SUCCEEDED
'
,
'
FAILED
'
,
'
USER_CANCELED
'
,
'
SYS_CANCELED
'
,
'
EARLY_STOPPED
'
].
includes
(
trialJob
.
status
))
{
if
(
this
.
jobStreamMap
.
has
(
trialJob
.
id
))
{
const
stream
:
ts
.
Stream
|
undefined
=
this
.
jobStreamMap
.
get
(
trialJob
.
id
);
if
(
!
stream
)
{
if
(
stream
===
undefined
)
{
throw
new
Error
(
`Could not find stream in trial
${
trialJob
.
id
}
`
);
}
stream
.
destroy
();
...
...
@@ -376,13 +379,13 @@ class LocalTrainingService implements TrainingService {
if
(
trialJob
.
gpuIndices
!==
undefined
&&
trialJob
.
gpuIndices
.
length
>
0
&&
this
.
gpuScheduler
!==
undefined
)
{
if
(
oldStatus
===
'
RUNNING
'
&&
trialJob
.
status
!==
'
RUNNING
'
)
{
for
(
const
index
of
trialJob
.
gpuIndices
)
{
le
t
num
:
number
|
undefined
=
this
.
occupiedGpuIndexNumMap
.
get
(
index
);
if
(
num
===
undefined
)
{
cons
t
num
:
number
|
undefined
=
this
.
occupiedGpuIndexNumMap
.
get
(
index
);
if
(
num
===
undefined
)
{
throw
new
Error
(
`gpu resource schedule error`
);
}
else
if
(
num
===
1
)
{
}
else
if
(
num
===
1
)
{
this
.
occupiedGpuIndexNumMap
.
delete
(
index
);
}
else
{
this
.
occupiedGpuIndexNumMap
.
set
(
index
,
num
-
1
)
this
.
occupiedGpuIndexNumMap
.
set
(
index
,
num
-
1
)
;
}
}
}
...
...
@@ -424,10 +427,10 @@ class LocalTrainingService implements TrainingService {
}
let
selectedGPUIndices
:
number
[]
=
[];
le
t
availableGpuIndices
:
number
[]
=
this
.
gpuScheduler
.
getAvailableGPUIndices
(
this
.
useActiveGpu
,
this
.
occupiedGpuIndexNumMap
);
for
(
le
t
index
of
availableGpuIndices
)
{
le
t
num
:
number
|
undefined
=
this
.
occupiedGpuIndexNumMap
.
get
(
index
);
if
(
num
===
undefined
||
num
<
this
.
maxTrialNumPerGpu
)
{
cons
t
availableGpuIndices
:
number
[]
=
this
.
gpuScheduler
.
getAvailableGPUIndices
(
this
.
useActiveGpu
,
this
.
occupiedGpuIndexNumMap
);
for
(
cons
t
index
of
availableGpuIndices
)
{
cons
t
num
:
number
|
undefined
=
this
.
occupiedGpuIndexNumMap
.
get
(
index
);
if
(
num
===
undefined
||
num
<
this
.
maxTrialNumPerGpu
)
{
selectedGPUIndices
.
push
(
index
);
}
}
...
...
@@ -461,11 +464,11 @@ class LocalTrainingService implements TrainingService {
private
occupyResource
(
resource
:
{
gpuIndices
:
number
[]}):
void
{
if
(
this
.
gpuScheduler
!==
undefined
)
{
for
(
const
index
of
resource
.
gpuIndices
)
{
le
t
num
:
number
|
undefined
=
this
.
occupiedGpuIndexNumMap
.
get
(
index
);
if
(
num
===
undefined
)
{
this
.
occupiedGpuIndexNumMap
.
set
(
index
,
1
)
cons
t
num
:
number
|
undefined
=
this
.
occupiedGpuIndexNumMap
.
get
(
index
);
if
(
num
===
undefined
)
{
this
.
occupiedGpuIndexNumMap
.
set
(
index
,
1
)
;
}
else
{
this
.
occupiedGpuIndexNumMap
.
set
(
index
,
num
+
1
)
this
.
occupiedGpuIndexNumMap
.
set
(
index
,
num
+
1
)
;
}
}
}
...
...
@@ -498,20 +501,20 @@ class LocalTrainingService implements TrainingService {
}
}
private
getScript
(
localTrailConfig
:
TrialConfig
,
workingDirectory
:
string
):
string
[]{
le
t
script
:
string
[]
=
[];
if
(
process
.
platform
===
"
win32
"
)
{
private
getScript
(
localTrailConfig
:
TrialConfig
,
workingDirectory
:
string
):
string
[]
{
cons
t
script
:
string
[]
=
[];
if
(
process
.
platform
===
'
win32
'
)
{
script
.
push
(
`cmd /c
${
localTrailConfig
.
command
}
2>
${
path
.
join
(
workingDirectory
,
'
stderr
'
)}
`
,
`$NOW_DATE = [int64](([datetime]::UtcNow)-(get-date "1/1/1970")).TotalSeconds`
,
`$NOW_DATE = "$NOW_DATE" + "000"`
,
`Write $LASTEXITCODE " " $NOW_DATE | Out-File
${
path
.
join
(
workingDirectory
,
'
.nni
'
,
'
state
'
)}
-NoNewline -encoding utf8`
);
}
else
{
}
else
{
script
.
push
(
`eval
${
localTrailConfig
.
command
}
2>
${
path
.
join
(
workingDirectory
,
'
stderr
'
)}
`
,
`echo $?
\`
date +%s000
\`
>
${
path
.
join
(
workingDirectory
,
'
.nni
'
,
'
state
'
)}
`
);
}
return
script
;
}
...
...
@@ -519,28 +522,29 @@ class LocalTrainingService implements TrainingService {
const
trialJobDetail
:
LocalTrialJobDetail
=
<
LocalTrialJobDetail
>
this
.
jobMap
.
get
(
trialJobId
);
const
variables
:
{
key
:
string
;
value
:
string
}[]
=
this
.
getEnvironmentVariables
(
trialJobDetail
,
resource
);
if
(
!
this
.
localTrailConfig
)
{
if
(
this
.
localTrailConfig
===
undefined
)
{
throw
new
Error
(
'
trial config is not initialized
'
);
}
const
runScript
Lines
:
string
[]
=
[];
if
(
process
.
platform
!==
"
win32
"
)
{
runScript
Lines
.
push
(
'
#!/bin/bash
'
);
const
runScript
Content
:
string
[]
=
[];
if
(
process
.
platform
!==
'
win32
'
)
{
runScript
Content
.
push
(
'
#!/bin/bash
'
);
}
runScript
Lines
.
push
(
`cd
${
this
.
localTrailConfig
.
codeDir
}
`
);
runScript
Content
.
push
(
`cd
${
this
.
localTrailConfig
.
codeDir
}
`
);
for
(
const
variable
of
variables
)
{
runScript
Lines
.
push
(
setEnvironmentVariable
(
variable
));
runScript
Content
.
push
(
setEnvironmentVariable
(
variable
));
}
const
scripts
:
string
[]
=
this
.
getScript
(
this
.
localTrailConfig
,
trialJobDetail
.
workingDirectory
);
scripts
.
forEach
(
script
=>
{
runScript
Lines
.
push
(
script
);
scripts
.
forEach
(
(
script
:
string
)
=>
{
runScript
Content
.
push
(
script
);
});
await
execMkdir
(
trialJobDetail
.
workingDirectory
);
await
execMkdir
(
path
.
join
(
trialJobDetail
.
workingDirectory
,
'
.nni
'
));
await
execNewFile
(
path
.
join
(
trialJobDetail
.
workingDirectory
,
'
.nni
'
,
'
metrics
'
));
const
scriptName
:
string
=
getScriptName
(
'
run
'
);
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialJobDetail
.
workingDirectory
,
scriptName
),
runScriptLines
.
join
(
getNewLine
()),
{
encoding
:
'
utf8
'
,
mode
:
0o777
});
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialJobDetail
.
workingDirectory
,
scriptName
),
runScriptContent
.
join
(
getNewLine
()),
{
encoding
:
'
utf8
'
,
mode
:
0o777
});
await
this
.
writeParameterFile
(
trialJobDetail
.
workingDirectory
,
(
<
TrialJobApplicationForm
>
trialJobDetail
.
form
).
hyperParameters
);
const
trialJobProcess
:
cp
.
ChildProcess
=
exec
Script
(
path
.
join
(
trialJobDetail
.
workingDirectory
,
scriptName
));
const
trialJobProcess
:
cp
.
ChildProcess
=
run
Script
(
path
.
join
(
trialJobDetail
.
workingDirectory
,
scriptName
));
this
.
setTrialJobStatus
(
trialJobDetail
,
'
RUNNING
'
);
trialJobDetail
.
startTime
=
Date
.
now
();
trialJobDetail
.
pid
=
trialJobProcess
.
pid
;
...
...
src/nni_manager/training_service/pai/hdfsClientUtility.ts
View file @
d48ad027
...
...
@@ -17,12 +17,12 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
import
*
as
path
from
'
path
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
path
from
'
path
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
getExperimentId
}
from
'
../../common/experimentStartupInfo
'
;
import
{
getLogger
}
from
'
../../common/log
'
;
import
{
unixPathJoin
}
from
'
../../common/utils
'
import
{
unixPathJoin
}
from
'
../../common/utils
'
;
/**
* HDFS client utility, including copy file/directory
...
...
@@ -33,6 +33,7 @@ export namespace HDFSClientUtility {
* @param hdfsUserName HDFS user name
*/
function
hdfsExpRootDir
(
hdfsUserName
:
string
):
string
{
// tslint:disable-next-line:prefer-template
return
'
/
'
+
unixPathJoin
(
hdfsUserName
,
'
nni
'
,
'
experiments
'
,
getExperimentId
());
}
...
...
@@ -50,63 +51,70 @@ export namespace HDFSClientUtility {
* @param trialId NNI trial ID
*/
export
function
getHdfsTrialWorkDir
(
hdfsUserName
:
string
,
trialId
:
string
):
string
{
le
t
root
=
hdfsExpRootDir
(
hdfsUserName
)
console
.
log
(
root
)
cons
t
root
:
string
=
hdfsExpRootDir
(
hdfsUserName
)
;
return
unixPathJoin
(
root
,
'
trials
'
,
trialId
);
}
/**
* Copy a local file to hdfs directory
*
*
* @param localFilePath local file path(source)
* @param hdfsFilePath hdfs file path(target)
* @param hdfsClient hdfs client
*/
// tslint:disable: no-unsafe-any non-literal-fs-path no-any
export
async
function
copyFileToHdfs
(
localFilePath
:
string
,
hdfsFilePath
:
string
,
hdfsClient
:
any
)
:
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
// tslint:disable-next-line:non-literal-fs-path
fs
.
exists
(
localFilePath
,
(
exists
:
boolean
)
=>
{
// Detect if local file exist
if
(
exists
)
{
var
localFileStream
=
fs
.
createReadStream
(
localFilePath
);
var
hdfsFileStream
=
hdfsClient
.
createWriteStream
(
hdfsFilePath
);
const
localFileStream
:
fs
.
ReadStream
=
fs
.
createReadStream
(
localFilePath
);
const
hdfsFileStream
:
any
=
hdfsClient
.
createWriteStream
(
hdfsFilePath
);
localFileStream
.
pipe
(
hdfsFileStream
);
hdfsFileStream
.
on
(
'
finish
'
,
function
onFinish
()
{
hdfsFileStream
.
on
(
'
finish
'
,
()
=>
{
deferred
.
resolve
();
});
hdfsFileStream
.
on
(
'
error
'
,
(
err
:
any
)
=>
{
getLogger
().
error
(
`HDFSCientUtility:copyFileToHdfs, copy file failed, err is
${
err
.
message
}
`
);
getLogger
()
.
error
(
`HDFSCientUtility:copyFileToHdfs, copy file failed, err is
${
err
.
message
}
`
);
deferred
.
reject
(
err
);
});
}
else
{
getLogger
().
error
(
`HDFSCientUtility:copyFileToHdfs,
${
localFilePath
}
doesn't exist locally`
);
getLogger
()
.
error
(
`HDFSCientUtility:copyFileToHdfs,
${
localFilePath
}
doesn't exist locally`
);
deferred
.
reject
(
'
file not exist!
'
);
}
});
return
deferred
.
promise
;
}
/**
* Recursively copy local directory to hdfs directory
*
*
* @param localDirectory local directory
* @param hdfsDirectory HDFS directory
* @param hdfsClient HDFS client
*/
export
async
function
copyDirectoryToHdfs
(
localDirectory
:
string
,
hdfsDirectory
:
string
,
hdfsClient
:
any
)
:
Promise
<
void
>
{
export
async
function
copyDirectoryToHdfs
(
localDirectory
:
string
,
hdfsDirectory
:
string
,
hdfsClient
:
any
)
:
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
// TODO: fs.readdirSync doesn't support ~($HOME)
const
fileNameArray
:
string
[]
=
fs
.
readdirSync
(
localDirectory
);
for
(
var
fileName
of
fileNameArray
){
for
(
const
fileName
of
fileNameArray
)
{
const
fullFilePath
:
string
=
path
.
join
(
localDirectory
,
fileName
);
try
{
if
(
fs
.
lstatSync
(
fullFilePath
).
isFile
())
{
// tslint:disable-next-line:non-literal-fs-path
if
(
fs
.
lstatSync
(
fullFilePath
)
.
isFile
())
{
await
copyFileToHdfs
(
fullFilePath
,
path
.
join
(
hdfsDirectory
,
fileName
),
hdfsClient
);
}
else
{
// If filePath is a directory, recursively copy it to remote directory
await
copyDirectoryToHdfs
(
fullFilePath
,
path
.
join
(
hdfsDirectory
,
fileName
),
hdfsClient
);
}
}
catch
(
error
)
{
}
catch
(
error
)
{
deferred
.
reject
(
error
);
}
}
...
...
@@ -118,20 +126,20 @@ export namespace HDFSClientUtility {
/**
* Read content from HDFS file
*
*
* @param hdfsPath HDFS file path
* @param hdfsClient HDFS client
*/
export
async
function
readFileFromHDFS
(
hdfsPath
:
string
,
hdfsClient
:
any
)
:
Promise
<
Buffer
>
{
export
async
function
readFileFromHDFS
(
hdfsPath
:
string
,
hdfsClient
:
any
)
:
Promise
<
Buffer
>
{
const
deferred
:
Deferred
<
Buffer
>
=
new
Deferred
<
Buffer
>
();
let
buffer
:
Buffer
=
Buffer
.
alloc
(
0
);
const
exist
:
boolean
=
await
pathExists
(
hdfsPath
,
hdfsClient
);
if
(
!
exist
)
{
if
(
!
exist
)
{
deferred
.
reject
(
`
${
hdfsPath
}
doesn't exists`
);
}
const
remoteFileStream
=
hdfsClient
.
createReadStream
(
hdfsPath
);
const
remoteFileStream
:
any
=
hdfsClient
.
createReadStream
(
hdfsPath
);
remoteFileStream
.
on
(
'
error
'
,
(
err
:
any
)
=>
{
// Reject with the error
deferred
.
reject
(
err
);
...
...
@@ -141,8 +149,8 @@ export namespace HDFSClientUtility {
// Concat the data chunk to buffer
buffer
=
Buffer
.
concat
([
buffer
,
chunk
]);
});
remoteFileStream
.
on
(
'
finish
'
,
function
onFinish
()
{
remoteFileStream
.
on
(
'
finish
'
,
()
=>
{
// Upload is done, resolve
deferred
.
resolve
(
buffer
);
});
...
...
@@ -152,36 +160,38 @@ export namespace HDFSClientUtility {
/**
* Check if an HDFS path already exists
*
*
* @param hdfsPath target path need to check in HDFS
* @param hdfsClient HDFS client
*/
export
async
function
pathExists
(
hdfsPath
:
string
,
hdfsClient
:
any
)
:
Promise
<
boolean
>
{
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
hdfsClient
.
exists
(
hdfsPath
,
(
exist
:
boolean
)
=>
{
hdfsClient
.
exists
(
hdfsPath
,
(
exist
:
boolean
)
=>
{
deferred
.
resolve
(
exist
);
});
let
timeoutId
:
NodeJS
.
Timer
let
timeoutId
:
NodeJS
.
Timer
;
const
delayTimeout
:
Promise
<
boolean
>
=
new
Promise
<
boolean
>
((
resolve
:
Function
,
reject
:
Function
)
:
void
=>
{
// Set timeout and reject the promise once reach timeout (5 seconds)
timeoutId
=
setTimeout
(()
=>
deferred
.
reject
(
`Check HDFS path
${
hdfsPath
}
exists timeout`
),
5000
);
// Set timeout and reject the promise once reach timeout (5 seconds)
timeoutId
=
setTimeout
(()
=>
{
reject
(
`Check HDFS path
${
hdfsPath
}
exists timeout`
)
;
}
,
5000
);
});
return
Promise
.
race
([
deferred
.
promise
,
delayTimeout
]).
finally
(()
=>
clearTimeout
(
timeoutId
));
return
Promise
.
race
([
deferred
.
promise
,
delayTimeout
])
.
finally
(()
=>
{
clearTimeout
(
timeoutId
);
});
}
/**
* Mkdir in HDFS, use default permission 755
*
*
* @param hdfsPath the path in HDFS. It could be either file or directory
* @param hdfsClient
* @param hdfsClient
HDFS client
*/
export
function
mkdir
(
hdfsPath
:
string
,
hdfsClient
:
any
)
:
Promise
<
boolean
>
{
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
hdfsClient
.
mkdir
(
hdfsPath
,
(
err
:
any
)
=>
{
if
(
!
err
)
{
hdfsClient
.
mkdir
(
hdfsPath
,
(
err
:
any
)
=>
{
if
(
!
err
)
{
deferred
.
resolve
(
true
);
}
else
{
deferred
.
reject
(
err
.
message
);
...
...
@@ -193,19 +203,19 @@ export namespace HDFSClientUtility {
/**
* Read directory contents
*
*
* @param hdfsPath the path in HDFS. It could be either file or directory
* @param hdfsClient
* @param hdfsClient
HDFS client
*/
export
async
function
readdir
(
hdfsPath
:
string
,
hdfsClient
:
any
)
:
Promise
<
string
[]
>
{
const
deferred
:
Deferred
<
string
[]
>
=
new
Deferred
<
string
[]
>
();
const
exist
:
boolean
=
await
pathExists
(
hdfsPath
,
hdfsClient
);
if
(
!
exist
)
{
if
(
!
exist
)
{
deferred
.
reject
(
`
${
hdfsPath
}
doesn't exists`
);
}
hdfsClient
.
readdir
(
hdfsPath
,
(
err
:
any
,
files
:
any
[]
)
=>
{
if
(
err
)
{
hdfsClient
.
readdir
(
hdfsPath
,
(
err
:
any
,
files
:
any
[])
=>
{
if
(
err
)
{
deferred
.
reject
(
err
);
}
...
...
@@ -218,18 +228,20 @@ export namespace HDFSClientUtility {
/**
* Delete HDFS path
* @param hdfsPath the path in HDFS. It could be either file or directory
* @param hdfsClient
* @param hdfsClient
HDFS client
* @param recursive Mark if need to delete recursively
*/
export
function
deletePath
(
hdfsPath
:
string
,
hdfsClient
:
any
,
recursive
:
boolean
=
true
)
:
Promise
<
boolean
>
{
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
hdfsClient
.
unlink
(
hdfsPath
,
recursive
,
(
err
:
any
)
=>
{
if
(
!
err
)
{
hdfsClient
.
unlink
(
hdfsPath
,
recursive
,
(
err
:
any
)
=>
{
if
(
!
err
)
{
deferred
.
resolve
(
true
);
}
else
{
deferred
.
reject
(
err
.
message
);
}
});
return
deferred
.
promise
;
}
// tslint:enable: no-unsafe-any non-literal-fs-path no-any
}
src/nni_manager/training_service/pai/paiConfig.ts
View file @
d48ad027
...
...
@@ -19,8 +19,11 @@
'
use strict
'
;
import
{
TrialConfig
}
from
'
../common/trialConfig
'
import
{
TrialConfig
}
from
'
../common/trialConfig
'
;
/**
* Task role for PAI
*/
export
class
PAITaskRole
{
// Name for the task role
public
readonly
name
:
string
;
...
...
@@ -36,7 +39,7 @@ export class PAITaskRole {
public
readonly
command
:
string
;
//Shared memory for one task in the task role
public
readonly
shmMB
?:
number
;
/**
* Constructor
* @param name Name for the task role
...
...
@@ -46,18 +49,22 @@ export class PAITaskRole {
* @param gpuNumber GPU number for one task in the task role, no less than 0
* @param command Executable command for tasks in the task role, can not be empty
*/
constructor
(
name
:
string
,
taskNumber
:
number
,
cpuNumber
:
number
,
memoryMB
:
number
,
gpuNumber
:
number
,
command
:
string
,
shmMB
?:
number
)
{
constructor
(
name
:
string
,
taskNumber
:
number
,
cpuNumber
:
number
,
memoryMB
:
number
,
gpuNumber
:
number
,
command
:
string
,
shmMB
?:
number
)
{
this
.
name
=
name
;
this
.
taskNumber
=
taskNumber
;
this
.
cpuNumber
=
cpuNumber
;
this
.
memoryMB
=
memoryMB
;
this
.
gpuNumber
=
gpuNumber
;
this
.
command
=
command
;
this
.
command
=
command
;
this
.
shmMB
=
shmMB
;
}
}
export
class
PAIJobConfig
{
/**
* Trial job configuration submitted to PAI
*/
export
class
PAIJobConfig
{
// Name for the job, need to be unique
public
readonly
jobName
:
string
;
// URL pointing to the Docker image for all tasks in the job
...
...
@@ -83,8 +90,8 @@ export class PAIJobConfig{
* @param outputDir Output directory on HDFS
* @param taskRoles List of taskRole, one task role at least
*/
constructor
(
jobName
:
string
,
image
:
string
,
dataDir
:
string
,
outputDir
:
string
,
codeDir
:
string
,
taskRoles
:
PAITaskRole
[],
virtualCluster
:
string
)
{
constructor
(
jobName
:
string
,
image
:
string
,
dataDir
:
string
,
outputDir
:
string
,
codeDir
:
string
,
taskRoles
:
PAITaskRole
[],
virtualCluster
:
string
)
{
this
.
jobName
=
jobName
;
this
.
image
=
image
;
this
.
dataDir
=
dataDir
;
...
...
@@ -95,6 +102,9 @@ export class PAIJobConfig{
}
}
/**
* PAI cluster configuration
*/
export
class
PAIClusterConfig
{
public
readonly
userName
:
string
;
public
readonly
passWord
:
string
;
...
...
@@ -106,18 +116,21 @@ export class PAIClusterConfig {
* @param passWord password of PAI Cluster
* @param host Host IP of PAI Cluster
*/
constructor
(
userName
:
string
,
passWord
:
string
,
host
:
string
){
constructor
(
userName
:
string
,
passWord
:
string
,
host
:
string
)
{
this
.
userName
=
userName
;
this
.
passWord
=
passWord
;
this
.
host
=
host
;
}
}
export
class
NNIPAITrialConfig
extends
TrialConfig
{
/**
* PAI trial configuration
*/
export
class
NNIPAITrialConfig
extends
TrialConfig
{
public
readonly
cpuNum
:
number
;
public
readonly
memoryMB
:
number
;
public
readonly
image
:
string
;
public
readonly
dataDir
:
string
;
public
readonly
dataDir
:
string
;
public
outputDir
:
string
;
//The virtual cluster job runs on. If omitted, the job will run on default virtual cluster
...
...
@@ -125,8 +138,8 @@ export class NNIPAITrialConfig extends TrialConfig{
//Shared memory for one task in the task role
public
shmMB
?:
number
;
constructor
(
command
:
string
,
codeDir
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
,
dataDir
:
string
,
outputDir
:
string
,
virtualCluster
?:
string
,
shmMB
?:
number
)
{
constructor
(
command
:
string
,
codeDir
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
,
dataDir
:
string
,
outputDir
:
string
,
virtualCluster
?:
string
,
shmMB
?:
number
)
{
super
(
command
,
codeDir
,
gpuNum
);
this
.
cpuNum
=
cpuNum
;
this
.
memoryMB
=
memoryMB
;
...
...
@@ -137,4 +150,3 @@ export class NNIPAITrialConfig extends TrialConfig{
this
.
shmMB
=
shmMB
;
}
}
src/nni_manager/training_service/pai/paiData.ts
View file @
d48ad027
...
...
@@ -19,8 +19,11 @@
'
use strict
'
;
import
{
JobApplicationForm
,
TrialJobDetail
,
TrialJobStatus
}
from
'
common/trainingService
'
;
import
{
JobApplicationForm
,
TrialJobDetail
,
TrialJobStatus
}
from
'
../../
common/trainingService
'
;
/**
* PAI trial job detail
*/
export
class
PAITrialJobDetail
implements
TrialJobDetail
{
public
id
:
string
;
public
status
:
TrialJobStatus
;
...
...
@@ -36,8 +39,8 @@ export class PAITrialJobDetail implements TrialJobDetail {
public
hdfsLogPath
:
string
;
public
isEarlyStopped
?:
boolean
;
constructor
(
id
:
string
,
status
:
TrialJobStatus
,
paiJobName
:
string
,
submitTime
:
number
,
workingDirectory
:
string
,
form
:
JobApplicationForm
,
sequenceId
:
number
,
hdfsLogPath
:
string
)
{
constructor
(
id
:
string
,
status
:
TrialJobStatus
,
paiJobName
:
string
,
submitTime
:
number
,
workingDirectory
:
string
,
form
:
JobApplicationForm
,
sequenceId
:
number
,
hdfsLogPath
:
string
)
{
this
.
id
=
id
;
this
.
status
=
status
;
this
.
paiJobName
=
paiJobName
;
...
...
@@ -50,7 +53,7 @@ export class PAITrialJobDetail implements TrialJobDetail {
}
}
export
const
PAI_INSTALL_NNI_SHELL_FORMAT
:
string
=
export
const
PAI_INSTALL_NNI_SHELL_FORMAT
:
string
=
`#!/bin/bash
if python3 -c 'import nni' > /dev/null 2>&1; then
# nni module is already installed, skip
...
...
@@ -61,13 +64,15 @@ else
fi`
;
export
const
PAI_TRIAL_COMMAND_FORMAT
:
string
=
`export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4}
&& cd $NNI_SYS_DIR && sh install_nni.sh
&& python3 -m nni_trial_tool.trial_keeper --trial_command '{5}' --nnimanager_ip '{6}' --nnimanager_port '{7}'
--pai_hdfs_output_dir '{8}' --pai_hdfs_host '{9}' --pai_user_name {10} --nni_hdfs_exp_dir '{11}' --webhdfs_path '/webhdfs/api/v1' --nni_manager_version '{12}' --log_collection '{13}'`
;
`export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4} \
&& cd $NNI_SYS_DIR && sh install_nni.sh \
&& python3 -m nni_trial_tool.trial_keeper --trial_command '{5}' --nnimanager_ip '{6}' --nnimanager_port '{7}' \
--pai_hdfs_output_dir '{8}' --pai_hdfs_host '{9}' --pai_user_name {10} --nni_hdfs_exp_dir '{11}' --webhdfs_path '/webhdfs/api/v1' \
--nni_manager_version '{12}' --log_collection '{13}'`
;
export
const
PAI_OUTPUT_DIR_FORMAT
:
string
=
export
const
PAI_OUTPUT_DIR_FORMAT
:
string
=
`hdfs://{0}:9000/`
;
export
const
PAI_LOG_PATH_FORMAT
:
string
=
`http://{0}/webhdfs/explorer.html#{1}`
// tslint:disable:no-http-string
export
const
PAI_LOG_PATH_FORMAT
:
string
=
`http://{0}/webhdfs/explorer.html#{1}`
;
src/nni_manager/training_service/pai/paiJobInfoCollector.ts
View file @
d48ad027
...
...
@@ -19,13 +19,14 @@
'
use strict
'
;
// tslint:disable-next-line:no-implicit-dependencies
import
*
as
request
from
'
request
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
NNIError
,
NNIErrorNames
}
from
'
../../common/errors
'
;
import
{
PAITrialJobDetail
}
from
'
./paiData
'
;
import
{
PAIClusterConfig
}
from
'
./paiConfig
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
TrialJobStatus
}
from
'
../../common/trainingService
'
;
import
{
PAIClusterConfig
}
from
'
./paiConfig
'
;
import
{
PAITrialJobDetail
}
from
'
./paiData
'
;
/**
* Collects PAI job info from the PAI cluster and updates PAI job status locally
...
...
@@ -43,60 +44,65 @@ export class PAIJobInfoCollector {
}
public
async
retrieveTrialStatus
(
paiToken
?
:
string
,
paiClusterConfig
?:
PAIClusterConfig
)
:
Promise
<
void
>
{
if
(
!
paiClusterConfig
||
!
paiToken
)
{
return
Promise
.
resolve
();
if
(
paiClusterConfig
===
undefined
||
paiToken
===
undefined
)
{
return
Promise
.
resolve
();
}
const
updatePaiTrialJobs
:
Promise
<
void
>
[]
=
[];
for
(
le
t
[
trialJobId
,
paiTrialJob
]
of
this
.
trialJobsMap
)
{
if
(
!
paiTrialJob
)
{
for
(
cons
t
[
trialJobId
,
paiTrialJob
]
of
this
.
trialJobsMap
)
{
if
(
paiTrialJob
===
undefined
)
{
throw
new
NNIError
(
NNIErrorNames
.
NOT_FOUND
,
`trial job id
${
trialJobId
}
not found`
);
}
updatePaiTrialJobs
.
push
(
this
.
getSinglePAITrialJobInfo
(
paiTrialJob
,
paiToken
,
paiClusterConfig
))
updatePaiTrialJobs
.
push
(
this
.
getSinglePAITrialJobInfo
(
paiTrialJob
,
paiToken
,
paiClusterConfig
))
;
}
await
Promise
.
all
(
updatePaiTrialJobs
);
}
private
getSinglePAITrialJobInfo
(
paiTrialJob
:
PAITrialJobDetail
,
paiToken
:
string
,
paiClusterConfig
:
PAIClusterConfig
)
:
Promise
<
void
>
{
private
getSinglePAITrialJobInfo
(
paiTrialJob
:
PAITrialJobDetail
,
paiToken
:
string
,
paiClusterConfig
:
PAIClusterConfig
)
:
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
if
(
!
this
.
statusesNeedToCheck
.
includes
(
paiTrialJob
.
status
))
{
deferred
.
resolve
();
return
deferred
.
promise
;
}
// Rest call to get PAI job info and update status
// Refer to https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more details about the PAI REST API
const
getJobInfoRequest
:
request
.
Options
=
{
// tslint:disable-next-line:no-http-string
uri
:
`http://
${
paiClusterConfig
.
host
}
/rest-server/api/v1/user/
${
paiClusterConfig
.
userName
}
/jobs/
${
paiTrialJob
.
paiJobName
}
`
,
method
:
'
GET
'
,
json
:
true
,
headers
:
{
"
Content-Type
"
:
"
application/json
"
,
"
Authorization
"
:
'
Bearer
'
+
paiToken
'
Content-Type
'
:
'
application/json
'
,
Authorization
:
`
Bearer
${
paiToken
}
`
}
};
//TODO : pass in request timeout param?
// tslint:disable: no-unsafe-any no-any cyclomatic-complexity
//TODO : pass in request timeout param?
request
(
getJobInfoRequest
,
(
error
:
Error
,
response
:
request
.
Response
,
body
:
any
)
=>
{
if
(
error
||
response
.
statusCode
>=
500
)
{
if
(
(
error
!==
undefined
&&
error
!==
null
)
||
response
.
statusCode
>=
500
)
{
this
.
log
.
error
(
`PAI Training service: get job info for trial
${
paiTrialJob
.
id
}
from PAI Cluster failed!`
);
// Queried PAI job info failed, set job status to UNKNOWN
if
(
paiTrialJob
.
status
===
'
WAITING
'
||
paiTrialJob
.
status
===
'
RUNNING
'
)
{
if
(
paiTrialJob
.
status
===
'
WAITING
'
||
paiTrialJob
.
status
===
'
RUNNING
'
)
{
paiTrialJob
.
status
=
'
UNKNOWN
'
;
}
}
else
{
if
(
response
.
body
.
jobStatus
&&
response
.
body
.
jobStatus
.
state
)
{
switch
(
response
.
body
.
jobStatus
.
state
)
{
case
'
WAITING
'
:
if
(
response
.
body
.
jobStatus
&&
response
.
body
.
jobStatus
.
state
)
{
switch
(
response
.
body
.
jobStatus
.
state
)
{
case
'
WAITING
'
:
paiTrialJob
.
status
=
'
WAITING
'
;
break
;
case
'
RUNNING
'
:
paiTrialJob
.
status
=
'
RUNNING
'
;
if
(
!
paiTrialJob
.
startTime
)
{
if
(
paiTrialJob
.
startTime
===
undefined
)
{
paiTrialJob
.
startTime
=
response
.
body
.
jobStatus
.
appLaunchedTime
;
}
if
(
!
paiTrialJob
.
url
)
{
paiTrialJob
.
url
=
response
.
body
.
jobStatus
.
appTrackingUrl
;
if
(
paiTrialJob
.
url
===
undefined
)
{
paiTrialJob
.
url
=
response
.
body
.
jobStatus
.
appTrackingUrl
;
}
break
;
case
'
SUCCEEDED
'
:
...
...
@@ -104,30 +110,31 @@ export class PAIJobInfoCollector {
break
;
case
'
STOPPED
'
:
if
(
paiTrialJob
.
isEarlyStopped
!==
undefined
)
{
paiTrialJob
.
status
=
paiTrialJob
.
isEarlyStopped
===
true
?
paiTrialJob
.
status
=
paiTrialJob
.
isEarlyStopped
===
true
?
'
EARLY_STOPPED
'
:
'
USER_CANCELED
'
;
}
else
{
// if paiTrialJob's isEarlyStopped is undefined, that means we didn't stop it via cancellation, so mark it as SYS_CANCELED by PAI
/* if paiTrialJob's isEarlyStopped is undefined, that means we didn't stop it via cancellation,
* so mark it as SYS_CANCELED by PAI
*/
paiTrialJob
.
status
=
'
SYS_CANCELED
'
;
}
break
;
case
'
FAILED
'
:
paiTrialJob
.
status
=
'
FAILED
'
;
paiTrialJob
.
status
=
'
FAILED
'
;
break
;
default
:
paiTrialJob
.
status
=
'
UNKNOWN
'
;
break
;
}
// For final job statuses, update startTime, endTime and url
if
(
this
.
finalStatuses
.
includes
(
paiTrialJob
.
status
))
{
if
(
!
paiTrialJob
.
startTime
)
{
if
(
this
.
finalStatuses
.
includes
(
paiTrialJob
.
status
))
{
if
(
paiTrialJob
.
startTime
===
undefined
)
{
paiTrialJob
.
startTime
=
response
.
body
.
jobStatus
.
appLaunchedTime
;
}
if
(
!
paiTrialJob
.
endTime
)
{
if
(
paiTrialJob
.
endTime
===
undefined
)
{
paiTrialJob
.
endTime
=
response
.
body
.
jobStatus
.
completedTime
;
}
// Set pai trial job's url to WebHDFS output path
if
(
paiTrialJob
.
hdfsLogPath
)
{
if
(
paiTrialJob
.
hdfsLogPath
!==
undefined
)
{
paiTrialJob
.
url
+=
`,
${
paiTrialJob
.
hdfsLogPath
}
`
;
}
}
...
...
@@ -138,4 +145,5 @@ export class PAIJobInfoCollector {
return
deferred
.
promise
;
}
// tslint:enable: no-unsafe-any no-any
}
src/nni_manager/training_service/pai/paiJobRestServer.ts
View file @
d48ad027
...
...
@@ -19,17 +19,17 @@
'
use strict
'
;
import
*
as
component
from
'
../../common/component
'
;
import
{
Inject
}
from
'
typescript-ioc
'
;
import
*
as
component
from
'
../../common/component
'
;
import
{
ClusterJobRestServer
}
from
'
../common/clusterJobRestServer
'
;
import
{
PAITrainingService
}
from
'
./paiTrainingService
'
;
import
{
ClusterJobRestServer
}
from
'
../common/clusterJobRestServer
'
/**
* PAI Training service Rest server, provides rest API to support pai job metrics update
*
*
*/
@
component
.
Singleton
export
class
PAIJobRestServer
extends
ClusterJobRestServer
{
export
class
PAIJobRestServer
extends
ClusterJobRestServer
{
@
Inject
private
readonly
paiTrainingService
:
PAITrainingService
;
...
...
@@ -41,6 +41,7 @@ export class PAIJobRestServer extends ClusterJobRestServer{
this
.
paiTrainingService
=
component
.
get
(
PAITrainingService
);
}
// tslint:disable-next-line:no-any
protected
handleTrialMetrics
(
jobId
:
string
,
metrics
:
any
[])
:
void
{
// Split the metrics array into single metrics, then emit each one
// Warning: If the metrics are not split into single ones, the behavior will be UNKNOWN
...
...
@@ -51,4 +52,4 @@ export class PAIJobRestServer extends ClusterJobRestServer{
});
}
}
}
\ No newline at end of file
}
src/nni_manager/training_service/pai/paiTrainingService.ts
View file @
d48ad027
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
...
...
@@ -23,6 +22,7 @@
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
path
from
'
path
'
;
// tslint:disable-next-line:no-implicit-dependencies
import
*
as
request
from
'
request
'
;
import
*
as
component
from
'
../../common/component
'
;
...
...
@@ -37,18 +37,17 @@ import {
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobMetric
}
from
'
../../common/trainingService
'
;
import
{
delay
,
generateParamFileName
,
getExperimentRootDir
,
getIPV4Address
,
getVersion
,
uniqueString
}
from
'
../../common/utils
'
;
getExperimentRootDir
,
getIPV4Address
,
getVersion
,
uniqueString
,
unixPathJoin
}
from
'
../../common/utils
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../common/containerJobData
'
;
import
{
TrialConfigMetadataKey
}
from
'
../common/trialConfigMetadataKey
'
;
import
{
validateCodeDir
,
execMkdir
}
from
'
../common/util
'
;
import
{
unixPathJoin
}
from
'
../../common/utils
'
import
{
execMkdir
,
validateCodeDir
}
from
'
../common/util
'
;
import
{
HDFSClientUtility
}
from
'
./hdfsClientUtility
'
;
import
{
NNIPAITrialConfig
,
PAIClusterConfig
,
PAIJobConfig
,
PAITaskRole
}
from
'
./paiConfig
'
;
import
{
PAI_LOG_PATH_FORMAT
,
PAI_OUTPUT_DIR_FORMAT
,
PAI_TRIAL_COMMAND_FORMAT
,
PAITrialJobDetail
}
from
'
./paiData
'
;
import
{
PAIJobInfoCollector
}
from
'
./paiJobInfoCollector
'
;
import
{
PAIJobRestServer
}
from
'
./paiJobRestServer
'
;
const
WebHDFS
=
require
(
'
webhdfs
'
)
;
import
*
as
WebHDFS
from
'
webhdfs
'
;
/**
* Training Service implementation for OpenPAI (Open Platform for AI)
...
...
@@ -62,13 +61,14 @@ class PAITrainingService implements TrainingService {
private
readonly
expRootDir
:
string
;
private
paiTrialConfig
:
NNIPAITrialConfig
|
undefined
;
private
paiClusterConfig
?:
PAIClusterConfig
;
private
jobQueue
:
string
[];
private
readonly
jobQueue
:
string
[];
private
stopping
:
boolean
=
false
;
// tslint:disable-next-line:no-any
private
hdfsClient
:
any
;
private
paiToken
?
:
string
;
private
paiTokenUpdateTime
?:
number
;
private
paiTokenUpdateInterval
:
number
;
private
experimentId
!
:
string
;
private
readonly
paiTokenUpdateInterval
:
number
;
private
readonly
experimentId
!
:
string
;
private
readonly
paiJobCollector
:
PAIJobInfoCollector
;
private
readonly
hdfsDirPattern
:
string
;
private
hdfsBaseDir
:
string
|
undefined
;
...
...
@@ -121,13 +121,13 @@ class PAITrainingService implements TrainingService {
}
public
async
getTrialJob
(
trialJobId
:
string
):
Promise
<
TrialJobDetail
>
{
if
(
!
this
.
paiClusterConfig
)
{
if
(
this
.
paiClusterConfig
===
undefined
)
{
throw
new
Error
(
'
PAI Cluster config is not initialized
'
);
}
const
paiTrialJob
:
PAITrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
if
(
!
paiTrialJob
)
{
if
(
paiTrialJob
===
undefined
)
{
return
Promise
.
reject
(
`trial job
${
trialJobId
}
not found`
);
}
...
...
@@ -144,7 +144,7 @@ class PAITrainingService implements TrainingService {
public
async
submitTrialJob
(
form
:
JobApplicationForm
):
Promise
<
TrialJobDetail
>
{
const
deferred
:
Deferred
<
PAITrialJobDetail
>
=
new
Deferred
<
PAITrialJobDetail
>
();
if
(
!
this
.
hdfsBaseDir
)
{
if
(
this
.
hdfsBaseDir
===
undefined
)
{
throw
new
Error
(
'
hdfsBaseDir is not initialized
'
);
}
...
...
@@ -187,24 +187,26 @@ class PAITrainingService implements TrainingService {
return
false
;
}
// tslint:disable:no-http-string
public
cancelTrialJob
(
trialJobId
:
string
,
isEarlyStopped
:
boolean
=
false
):
Promise
<
void
>
{
const
trialJobDetail
:
PAITrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
if
(
!
trialJobDetail
)
{
if
(
trialJobDetail
===
undefined
)
{
this
.
log
.
error
(
`cancelTrialJob: trial job id
${
trialJobId
}
not found`
);
return
Promise
.
reject
();
}
if
(
!
this
.
paiClusterConfig
)
{
if
(
this
.
paiClusterConfig
===
undefined
)
{
throw
new
Error
(
'
PAI Cluster config is not initialized
'
);
}
if
(
!
this
.
paiToken
)
{
if
(
this
.
paiToken
===
undefined
)
{
throw
new
Error
(
'
PAI token is not initialized
'
);
}
const
stopJobRequest
:
request
.
Options
=
{
uri
:
`http://
${
this
.
paiClusterConfig
.
host
}
/rest-server/api/v1/user/
${
this
.
paiClusterConfig
.
userName
}
/jobs/
${
trialJobDetail
.
paiJobName
}
/executionType`
,
uri
:
`http://
${
this
.
paiClusterConfig
.
host
}
/rest-server/api/v1/user/
${
this
.
paiClusterConfig
.
userName
}
\
/jobs/
${
trialJobDetail
.
paiJobName
}
/executionType`
,
method
:
'
PUT
'
,
json
:
true
,
body
:
{
value
:
'
STOP
'
},
...
...
@@ -217,10 +219,12 @@ class PAITrainingService implements TrainingService {
// Set trialjobDetail's early stopped field, to mark the job's cancellation source
trialJobDetail
.
isEarlyStopped
=
isEarlyStopped
;
// tslint:disable-next-line:no-any
request
(
stopJobRequest
,
(
error
:
Error
,
response
:
request
.
Response
,
body
:
any
)
=>
{
if
(
error
||
response
.
statusCode
>=
400
)
{
if
(
(
error
!==
undefined
&&
error
!==
null
)
||
response
.
statusCode
>=
400
)
{
this
.
log
.
error
(
`PAI Training service: stop trial
${
trialJobId
}
to PAI Cluster failed!`
);
deferred
.
reject
(
error
?
error
.
message
:
`Stop trial failed, http code:
${
response
.
statusCode
}
`
);
deferred
.
reject
((
error
!==
undefined
&&
error
!==
null
)
?
error
.
message
:
`Stop trial failed, http code:
${
response
.
statusCode
}
`
);
}
else
{
deferred
.
resolve
();
}
...
...
@@ -229,6 +233,7 @@ class PAITrainingService implements TrainingService {
return
deferred
.
promise
;
}
// tslint:disable: no-unsafe-any no-any
// tslint:disable-next-line:max-func-body-length
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
...
...
@@ -256,47 +261,47 @@ class PAITrainingService implements TrainingService {
break
;
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
if
(
!
this
.
paiClusterConfig
)
{
if
(
this
.
paiClusterConfig
===
undefined
)
{
this
.
log
.
error
(
'
pai cluster config is not initialized
'
);
deferred
.
reject
(
new
Error
(
'
pai cluster config is not initialized
'
));
break
;
}
this
.
paiTrialConfig
=
<
NNIPAITrialConfig
>
JSON
.
parse
(
value
);
//paiTrialConfig.outputDir could be null if it is not set in nnictl
if
(
this
.
paiTrialConfig
.
outputDir
===
undefined
||
this
.
paiTrialConfig
.
outputDir
===
null
){
if
(
this
.
paiTrialConfig
.
outputDir
===
undefined
||
this
.
paiTrialConfig
.
outputDir
===
null
)
{
this
.
paiTrialConfig
.
outputDir
=
String
.
Format
(
PAI_OUTPUT_DIR_FORMAT
,
this
.
paiClusterConfig
.
host
).
replace
(
/
\r\n
|
\n
|
\r
/gm
,
''
);
)
.
replace
(
/
\r\n
|
\n
|
\r
/gm
,
''
);
}
// Validate to make sure codeDir doesn't have too many files
try
{
await
validateCodeDir
(
this
.
paiTrialConfig
.
codeDir
);
}
catch
(
error
)
{
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
deferred
.
reject
(
new
Error
(
error
));
break
;
}
const
hdfsDirContent
=
this
.
paiTrialConfig
.
outputDir
.
match
(
this
.
hdfsDirPattern
);
const
hdfsDirContent
:
any
=
this
.
paiTrialConfig
.
outputDir
.
match
(
this
.
hdfsDirPattern
);
if
(
hdfsDirContent
===
null
)
{
throw
new
Error
(
'
Trial outputDir format Error
'
);
}
const
groups
=
hdfsDirContent
.
groups
;
const
groups
:
any
=
hdfsDirContent
.
groups
;
if
(
groups
===
undefined
)
{
throw
new
Error
(
'
Trial outputDir format Error
'
);
}
this
.
hdfsOutputHost
=
groups
[
'
host
'
];
this
.
hdfsOutputHost
=
groups
.
host
;
//TODO: choose to use /${username} as baseDir
this
.
hdfsBaseDir
=
groups
[
'
baseDir
'
]
;
if
(
this
.
hdfsBaseDir
===
undefined
)
{
this
.
hdfsBaseDir
=
groups
.
baseDir
;
if
(
this
.
hdfsBaseDir
===
undefined
)
{
this
.
hdfsBaseDir
=
'
/
'
;
}
let
dataOutputHdfsClient
;
let
dataOutputHdfsClient
:
any
;
if
(
this
.
paiClusterConfig
.
host
===
this
.
hdfsOutputHost
&&
this
.
hdfsClient
)
{
dataOutputHdfsClient
=
this
.
hdfsClient
;
}
else
{
...
...
@@ -338,6 +343,7 @@ class PAITrainingService implements TrainingService {
return
deferred
.
promise
;
}
// tslint:enable: no-unsafe-any
public
getClusterMetadata
(
key
:
string
):
Promise
<
string
>
{
const
deferred
:
Deferred
<
string
>
=
new
Deferred
<
string
>
();
...
...
@@ -358,6 +364,7 @@ class PAITrainingService implements TrainingService {
deferred
.
resolve
();
this
.
log
.
info
(
'
PAI Training service rest server stopped successfully.
'
);
}
catch
(
error
)
{
// tslint:disable-next-line: no-unsafe-any
this
.
log
.
error
(
`PAI Training service rest server stopped failed, error:
${
error
.
message
}
`
);
deferred
.
reject
(
error
);
}
...
...
@@ -374,35 +381,35 @@ class PAITrainingService implements TrainingService {
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
const
trialJobDetail
:
PAITrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
if
(
!
trialJobDetail
)
{
if
(
trialJobDetail
===
undefined
)
{
throw
new
Error
(
`Failed to find PAITrialJobDetail for job
${
trialJobId
}
`
);
}
if
(
!
this
.
paiClusterConfig
)
{
if
(
this
.
paiClusterConfig
===
undefined
)
{
throw
new
Error
(
'
PAI Cluster config is not initialized
'
);
}
if
(
!
this
.
paiTrialConfig
)
{
if
(
this
.
paiTrialConfig
===
undefined
)
{
throw
new
Error
(
'
trial config is not initialized
'
);
}
if
(
!
this
.
paiToken
)
{
if
(
this
.
paiToken
===
undefined
)
{
throw
new
Error
(
'
PAI token is not initialized
'
);
}
if
(
!
this
.
hdfsBaseDir
)
{
if
(
this
.
hdfsBaseDir
===
undefined
)
{
throw
new
Error
(
'
hdfsBaseDir is not initialized
'
);
}
if
(
!
this
.
hdfsOutputHost
)
{
if
(
this
.
hdfsOutputHost
===
undefined
)
{
throw
new
Error
(
'
hdfsOutputHost is not initialized
'
);
}
if
(
!
this
.
paiRestServerPort
)
{
if
(
this
.
paiRestServerPort
===
undefined
)
{
const
restServer
:
PAIJobRestServer
=
component
.
get
(
PAIJobRestServer
);
this
.
paiRestServerPort
=
restServer
.
clusterRestServerPort
;
}
// Make sure experiment code files are copied from local to HDFS
if
(
this
.
copyExpCodeDirPromise
)
{
if
(
this
.
copyExpCodeDirPromise
!==
undefined
)
{
await
this
.
copyExpCodeDirPromise
;
}
...
...
@@ -420,13 +427,14 @@ class PAITrainingService implements TrainingService {
// Write file content ( parameter.cfg ) to local tmp folders
const
trialForm
:
TrialJobApplicationForm
=
(
<
TrialJobApplicationForm
>
trialJobDetail
.
form
);
if
(
trialForm
)
{
if
(
trialForm
!==
undefined
)
{
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
generateParamFileName
(
trialForm
.
hyperParameters
)),
trialForm
.
hyperParameters
.
value
,
{
encoding
:
'
utf8
'
}
);
}
// tslint:disable-next-line: strict-boolean-expressions
const
nniManagerIp
:
string
=
this
.
nniManagerIpConfig
?
this
.
nniManagerIpConfig
.
nniManagerIp
:
getIPV4Address
();
const
version
:
string
=
this
.
versionCheck
?
await
getVersion
()
:
''
;
const
nniPaiTrialCommand
:
string
=
String
.
Format
(
...
...
@@ -446,8 +454,10 @@ class PAITrainingService implements TrainingService {
HDFSClientUtility
.
getHdfsExpCodeDir
(
this
.
paiClusterConfig
.
userName
),
version
,
this
.
logCollection
).
replace
(
/
\r\n
|
\n
|
\r
/gm
,
''
);
)
.
replace
(
/
\r\n
|
\n
|
\r
/gm
,
''
);
// tslint:disable-next-line:no-console
console
.
log
(
`nniPAItrial command is
${
nniPaiTrialCommand
.
trim
()}
`
);
const
paiTaskRoles
:
PAITaskRole
[]
=
[
new
PAITaskRole
(
...
...
@@ -489,7 +499,10 @@ class PAITrainingService implements TrainingService {
await
HDFSClientUtility
.
copyDirectoryToHdfs
(
trialLocalTempFolder
,
hdfsCodeDir
,
this
.
hdfsClient
);
}
catch
(
error
)
{
this
.
log
.
error
(
`PAI Training service: copy
${
this
.
paiTrialConfig
.
codeDir
}
to HDFS
${
hdfsCodeDir
}
failed, error is
${
error
}
`
);
throw
new
Error
(
error
.
message
);
trialJobDetail
.
status
=
'
FAILED
'
;
deferred
.
resolve
(
true
);
return
deferred
.
promise
;
}
// Step 3. Submit PAI job via Rest call
...
...
@@ -504,13 +517,14 @@ class PAITrainingService implements TrainingService {
Authorization
:
`Bearer
${
this
.
paiToken
}
`
}
};
// tslint:disable:no-any no-unsafe-any
request
(
submitJobRequest
,
(
error
:
Error
,
response
:
request
.
Response
,
body
:
any
)
=>
{
if
(
error
||
response
.
statusCode
>=
400
)
{
const
errorMessage
:
string
=
error
?
error
.
message
:
if
(
(
error
!==
undefined
&&
error
!==
null
)
||
response
.
statusCode
>=
400
)
{
const
errorMessage
:
string
=
(
error
!==
undefined
&&
error
!==
null
)
?
error
.
message
:
`Submit trial
${
trialJobId
}
failed, http code:
${
response
.
statusCode
}
, http body:
${
response
.
body
}
`
;
this
.
log
.
error
(
errorMessage
);
trialJobDetail
.
status
=
'
FAILED
'
;
deferred
.
re
ject
(
new
Error
(
errorMessage
)
);
deferred
.
re
solve
(
true
);
}
else
{
trialJobDetail
.
submitTime
=
Date
.
now
();
deferred
.
resolve
(
true
);
...
...
@@ -530,18 +544,18 @@ class PAITrainingService implements TrainingService {
private
async
statusCheckingLoop
():
Promise
<
void
>
{
while
(
!
this
.
stopping
)
{
try
{
try
{
await
this
.
updatePaiToken
();
}
catch
(
error
){
}
catch
(
error
)
{
this
.
log
.
error
(
`
${
error
}
`
);
//only throw error when initializing paiToken for the first time
if
(
!
this
.
paiToken
)
{
if
(
this
.
paiToken
===
undefined
)
{
throw
new
Error
(
error
);
}
}
await
this
.
paiJobCollector
.
retrieveTrialStatus
(
this
.
paiToken
,
this
.
paiClusterConfig
);
const
restServer
:
PAIJobRestServer
=
component
.
get
(
PAIJobRestServer
);
if
(
restServer
.
getErrorMessage
)
{
if
(
restServer
.
getErrorMessage
!==
undefined
)
{
throw
new
Error
(
restServer
.
getErrorMessage
);
}
await
delay
(
3000
);
...
...
@@ -572,17 +586,17 @@ class PAITrainingService implements TrainingService {
const
currentTime
:
number
=
new
Date
().
getTime
();
//If pai token initialized and not reach the interval time, do not update
if
(
this
.
paiTokenUpdateTime
&&
(
currentTime
-
this
.
paiTokenUpdateTime
)
<
this
.
paiTokenUpdateInterval
){
if
(
this
.
paiTokenUpdateTime
!==
undefined
&&
(
currentTime
-
this
.
paiTokenUpdateTime
)
<
this
.
paiTokenUpdateInterval
)
{
return
Promise
.
resolve
();
}
if
(
!
this
.
paiClusterConfig
)
{
if
(
this
.
paiClusterConfig
===
undefined
)
{
const
paiClusterConfigError
:
string
=
`pai cluster config not initialized!`
;
this
.
log
.
error
(
`
${
paiClusterConfigError
}
`
);
throw
Error
(
`
${
paiClusterConfigError
}
`
);
}
const
authentication
_r
eq
:
request
.
Options
=
{
const
authentication
R
eq
:
request
.
Options
=
{
uri
:
`http://
${
this
.
paiClusterConfig
.
host
}
/rest-server/api/v1/token`
,
method
:
'
POST
'
,
json
:
true
,
...
...
@@ -592,12 +606,12 @@ class PAITrainingService implements TrainingService {
}
};
request
(
authentication
_r
eq
,
(
error
:
Error
,
response
:
request
.
Response
,
body
:
any
)
=>
{
if
(
error
)
{
request
(
authentication
R
eq
,
(
error
:
Error
,
response
:
request
.
Response
,
body
:
any
)
=>
{
if
(
error
!==
undefined
&&
error
!==
null
)
{
this
.
log
.
error
(
`Get PAI token failed:
${
error
.
message
}
`
);
deferred
.
reject
(
new
Error
(
`Get PAI token failed:
${
error
.
message
}
`
));
}
else
{
if
(
response
.
statusCode
!==
200
){
if
(
response
.
statusCode
!==
200
)
{
this
.
log
.
error
(
`Get PAI token failed: get PAI Rest return code
${
response
.
statusCode
}
`
);
deferred
.
reject
(
new
Error
(
`Get PAI token failed:
${
response
.
body
}
, please check paiConfig username or password`
));
}
...
...
@@ -616,8 +630,9 @@ class PAITrainingService implements TrainingService {
});
return
Promise
.
race
([
timeoutDelay
,
deferred
.
promise
])
.
finally
(()
=>
clearTimeout
(
timeoutId
));
.
finally
(()
=>
{
clearTimeout
(
timeoutId
)
;
}
);
}
// tslint:enable:no-any no-unsafe-any no-http-string
}
export
{
PAITrainingService
};
src/nni_manager/training_service/pai/paiTrialConfig.ts
View file @
d48ad027
...
...
@@ -19,16 +19,20 @@
'
use strict
'
;
import
{
TrialConfig
}
from
'
../common/trialConfig
'
import
{
TrialConfig
}
from
'
../common/trialConfig
'
;
export
class
PAITrialConfig
extends
TrialConfig
{
/**
* PAI configuration to run trials
*/
export
class
PAITrialConfig
extends
TrialConfig
{
public
readonly
cpuNum
:
number
;
public
readonly
memoryMB
:
number
;
public
readonly
image
:
string
;
public
readonly
dataDir
:
string
;
public
readonly
dataDir
:
string
;
public
readonly
outputDir
:
string
;
constructor
(
command
:
string
,
codeDir
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
,
dataDir
:
string
,
outputDir
:
string
)
{
constructor
(
command
:
string
,
codeDir
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
,
dataDir
:
string
,
outputDir
:
string
)
{
super
(
command
,
codeDir
,
gpuNum
);
this
.
cpuNum
=
cpuNum
;
this
.
memoryMB
=
memoryMB
;
...
...
@@ -36,4 +40,4 @@ export class PAITrialConfig extends TrialConfig{
this
.
dataDir
=
dataDir
;
this
.
outputDir
=
outputDir
;
}
}
\ No newline at end of file
}
src/nni_manager/training_service/remote_machine/gpuScheduler.ts
View file @
d48ad027
...
...
@@ -21,10 +21,12 @@
import
*
as
assert
from
'
assert
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
TrialJobDetail
}
from
'
../../common/trainingService
'
;
import
{
randomSelect
}
from
'
../../common/utils
'
;
import
{
GPUInfo
}
from
'
../common/gpuData
'
;
import
{
RemoteMachineTrialJobDetail
,
parseGpuIndices
,
RemoteMachineMeta
,
RemoteMachineScheduleResult
,
ScheduleResultType
,
SSHClientManager
}
from
'
./remoteMachineData
'
;
import
{
TrialJobDetail
}
from
'
common/trainingService
'
;
import
{
parseGpuIndices
,
RemoteMachineMeta
,
RemoteMachineScheduleResult
,
RemoteMachineTrialJobDetail
,
ScheduleResultType
,
SSHClientManager
}
from
'
./remoteMachineData
'
;
/**
* A simple GPU scheduler implementation
...
...
@@ -32,7 +34,7 @@ import { TrialJobDetail } from 'common/trainingService';
export
class
GPUScheduler
{
private
readonly
machineSSHClientMap
:
Map
<
RemoteMachineMeta
,
SSHClientManager
>
;
private
log
:
Logger
=
getLogger
();
private
readonly
log
:
Logger
=
getLogger
();
/**
* Constructor
...
...
@@ -89,21 +91,21 @@ export class GPUScheduler {
* remove the job's gpu reservation
*/
public
removeGpuReservation
(
trialJobId
:
string
,
trialJobMap
:
Map
<
string
,
RemoteMachineTrialJobDetail
>
):
void
{
le
t
trialJobDetail
:
RemoteMachineTrialJobDetail
|
undefined
=
trialJobMap
.
get
(
trialJobId
);
if
(
trialJobDetail
===
undefined
)
{
cons
t
trialJobDetail
:
RemoteMachineTrialJobDetail
|
undefined
=
trialJobMap
.
get
(
trialJobId
);
if
(
trialJobDetail
===
undefined
)
{
throw
new
Error
(
`could not get trialJobDetail by id
${
trialJobId
}
`
);
}
if
(
trialJobDetail
.
rmMeta
!==
undefined
&&
trialJobDetail
.
rmMeta
.
occupiedGpuIndexMap
!==
undefined
&&
trialJobDetail
.
gpuIndices
!==
undefined
&&
}
if
(
trialJobDetail
.
rmMeta
!==
undefined
&&
trialJobDetail
.
rmMeta
.
occupiedGpuIndexMap
!==
undefined
&&
trialJobDetail
.
gpuIndices
!==
undefined
&&
trialJobDetail
.
gpuIndices
.
length
>
0
)
{
for
(
const
gpuInfo
of
trialJobDetail
.
gpuIndices
)
{
le
t
num
:
number
|
undefined
=
trialJobDetail
.
rmMeta
.
occupiedGpuIndexMap
.
get
(
gpuInfo
.
index
);
if
(
num
!==
undefined
)
{
if
(
num
===
1
)
{
cons
t
num
:
number
|
undefined
=
trialJobDetail
.
rmMeta
.
occupiedGpuIndexMap
.
get
(
gpuInfo
.
index
);
if
(
num
!==
undefined
)
{
if
(
num
===
1
)
{
trialJobDetail
.
rmMeta
.
occupiedGpuIndexMap
.
delete
(
gpuInfo
.
index
);
}
else
{
trialJobDetail
.
rmMeta
.
occupiedGpuIndexMap
.
set
(
gpuInfo
.
index
,
num
-
1
)
trialJobDetail
.
rmMeta
.
occupiedGpuIndexMap
.
set
(
gpuInfo
.
index
,
num
-
1
)
;
}
}
}
...
...
@@ -116,7 +118,6 @@ export class GPUScheduler {
const
totalResourceMap
:
Map
<
RemoteMachineMeta
,
GPUInfo
[]
>
=
this
.
gpuResourceDetection
();
const
qualifiedRMs
:
RemoteMachineMeta
[]
=
[];
totalResourceMap
.
forEach
((
gpuInfos
:
GPUInfo
[],
rmMeta
:
RemoteMachineMeta
)
=>
{
if
(
gpuInfos
!==
undefined
&&
gpuInfos
.
length
>=
requiredGPUNum
)
{
qualifiedRMs
.
push
(
rmMeta
);
}
...
...
@@ -154,6 +155,7 @@ export class GPUScheduler {
}
}
this
.
log
.
debug
(
`designated gpu indices:
${
designatedGpuIndices
}
`
);
// tslint:disable: strict-boolean-expressions
rmMeta
.
gpuSummary
.
gpuInfos
.
forEach
((
gpuInfo
:
GPUInfo
)
=>
{
// if the GPU has active process, OR be reserved by a job,
// or index not in gpuIndices configuration in machineList,
...
...
@@ -161,10 +163,10 @@ export class GPUScheduler {
// We should NOT allocate this GPU
// if users set useActiveGpu, use the gpu whether there is another activeProcess
if
(
designatedGpuIndices
===
undefined
||
designatedGpuIndices
.
has
(
gpuInfo
.
index
))
{
if
(
rmMeta
.
occupiedGpuIndexMap
!==
undefined
)
{
le
t
num
=
rmMeta
.
occupiedGpuIndexMap
.
get
(
gpuInfo
.
index
);
le
t
maxTrialNumPerGpu
:
number
=
rmMeta
.
maxTrialNumPerGpu
?
rmMeta
.
maxTrialNumPerGpu
:
1
;
if
((
num
===
undefined
&&
(
!
rmMeta
.
useActiveGpu
&&
gpuInfo
.
activeProcessNum
===
0
||
rmMeta
.
useActiveGpu
))
||
if
(
rmMeta
.
occupiedGpuIndexMap
!==
undefined
)
{
cons
t
num
:
number
|
undefined
=
rmMeta
.
occupiedGpuIndexMap
.
get
(
gpuInfo
.
index
);
cons
t
maxTrialNumPerGpu
:
number
=
rmMeta
.
maxTrialNumPerGpu
?
rmMeta
.
maxTrialNumPerGpu
:
1
;
if
((
num
===
undefined
&&
(
!
rmMeta
.
useActiveGpu
&&
gpuInfo
.
activeProcessNum
===
0
||
rmMeta
.
useActiveGpu
))
||
(
num
!==
undefined
&&
num
<
maxTrialNumPerGpu
))
{
availableGPUs
.
push
(
gpuInfo
);
}
...
...
@@ -179,6 +181,7 @@ export class GPUScheduler {
return
totalResourceMap
;
}
// tslint:enable: strict-boolean-expressions
private
selectMachine
(
rmMetas
:
RemoteMachineMeta
[]):
RemoteMachineMeta
{
assert
(
rmMetas
!==
undefined
&&
rmMetas
.
length
>
0
);
...
...
@@ -196,23 +199,28 @@ export class GPUScheduler {
assert
(
gpuInfos
.
length
>=
requiredGPUNum
);
const
allocatedGPUs
:
GPUInfo
[]
=
this
.
selectGPUsForTrial
(
gpuInfos
,
requiredGPUNum
);
allocatedGPUs
.
forEach
((
gpuInfo
:
GPUInfo
)
=>
{
if
(
rmMeta
.
occupiedGpuIndexMap
!==
undefined
)
{
let
num
=
rmMeta
.
occupiedGpuIndexMap
.
get
(
gpuInfo
.
index
);
if
(
num
===
undefined
)
{
if
(
rmMeta
.
occupiedGpuIndexMap
!==
undefined
)
{
let
num
:
number
|
undefined
=
rmMeta
.
occupiedGpuIndexMap
.
get
(
gpuInfo
.
index
);
if
(
num
===
undefined
)
{
num
=
0
;
}
rmMeta
.
occupiedGpuIndexMap
.
set
(
gpuInfo
.
index
,
num
+
1
);
}
else
{
}
else
{
throw
new
Error
(
`Machine
${
rmMeta
.
ip
}
occupiedGpuIndexMap initialize error!`
);
}
});
trialJobDetail
.
gpuIndices
=
allocatedGPUs
;
trialJobDetail
.
rmMeta
=
rmMeta
;
return
{
resultType
:
ScheduleResultType
.
SUCCEED
,
scheduleInfo
:
{
rmMeta
:
rmMeta
,
cuda_visible_device
:
allocatedGPUs
.
map
((
gpuInfo
:
GPUInfo
)
=>
{
return
gpuInfo
.
index
;
}).
join
(
'
,
'
)
cuda_visible_device
:
allocatedGPUs
.
map
((
gpuInfo
:
GPUInfo
)
=>
{
return
gpuInfo
.
index
;
})
.
join
(
'
,
'
)
}
};
}
...
...
src/nni_manager/training_service/remote_machine/remoteMachineData.ts
View file @
d48ad027
...
...
@@ -23,7 +23,7 @@ import * as fs from 'fs';
import
{
Client
,
ConnectConfig
}
from
'
ssh2
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
JobApplicationForm
,
TrialJobDetail
,
TrialJobStatus
}
from
'
../../common/trainingService
'
;
import
{
GPU
Summary
,
GPUInfo
}
from
'
../common/gpuData
'
;
import
{
GPU
Info
,
GPUSummary
}
from
'
../common/gpuData
'
;
/**
* Metadata of remote machine for configuration and statuc query
...
...
@@ -73,7 +73,6 @@ export class RemoteCommandResult {
/**
* RemoteMachineTrialJobDetail
*/
// tslint:disable-next-line:max-classes-per-file
export
class
RemoteMachineTrialJobDetail
implements
TrialJobDetail
{
public
id
:
string
;
public
status
:
TrialJobStatus
;
...
...
@@ -98,7 +97,7 @@ export class RemoteMachineTrialJobDetail implements TrialJobDetail {
this
.
form
=
form
;
this
.
sequenceId
=
sequenceId
;
this
.
tags
=
[];
this
.
gpuIndices
=
[]
this
.
gpuIndices
=
[]
;
}
}
...
...
@@ -112,7 +111,7 @@ export class SSHClient {
this
.
sshClient
=
sshClient
;
this
.
usedConnectionNumber
=
usedConnectionNumber
;
}
public
get
getSSHClientInstance
():
Client
{
return
this
.
sshClient
;
}
...
...
@@ -121,17 +120,20 @@ export class SSHClient {
return
this
.
usedConnectionNumber
;
}
public
addUsedConnectionNumber
()
{
public
addUsedConnectionNumber
()
:
void
{
this
.
usedConnectionNumber
+=
1
;
}
public
minusUsedConnectionNumber
()
{
public
minusUsedConnectionNumber
()
:
void
{
this
.
usedConnectionNumber
-=
1
;
}
}
/**
* The remote machine ssh client manager
*/
export
class
SSHClientManager
{
private
sshClientArray
:
SSHClient
[];
private
readonly
sshClientArray
:
SSHClient
[];
private
readonly
maxTrialNumberPerConnection
:
number
;
private
readonly
rmMeta
:
RemoteMachineMeta
;
constructor
(
sshClientArray
:
SSHClient
[],
maxTrialNumberPerConnection
:
number
,
rmMeta
:
RemoteMachineMeta
)
{
...
...
@@ -140,122 +142,128 @@ export class SSHClientManager {
this
.
maxTrialNumberPerConnection
=
maxTrialNumberPerConnection
;
}
/**
* Create a new ssh connection client and initialize it
*/
private
initNewSSHClient
():
Promise
<
Client
>
{
const
deferred
:
Deferred
<
Client
>
=
new
Deferred
<
Client
>
();
const
conn
:
Client
=
new
Client
();
let
connectConfig
:
ConnectConfig
=
{
host
:
this
.
rmMeta
.
ip
,
port
:
this
.
rmMeta
.
port
,
username
:
this
.
rmMeta
.
username
};
if
(
this
.
rmMeta
.
passwd
)
{
connectConfig
.
password
=
this
.
rmMeta
.
passwd
;
}
else
if
(
this
.
rmMeta
.
sshKeyPath
)
{
if
(
!
fs
.
existsSync
(
this
.
rmMeta
.
sshKeyPath
))
{
//SSh key path is not a valid file, reject
deferred
.
reject
(
new
Error
(
`
${
this
.
rmMeta
.
sshKeyPath
}
does not exist.`
));
}
const
privateKey
:
string
=
fs
.
readFileSync
(
this
.
rmMeta
.
sshKeyPath
,
'
utf8
'
);
connectConfig
.
privateKey
=
privateKey
;
connectConfig
.
passphrase
=
this
.
rmMeta
.
passphrase
;
}
else
{
deferred
.
reject
(
new
Error
(
`No valid passwd or sshKeyPath is configed.`
));
}
conn
.
on
(
'
ready
'
,
()
=>
{
this
.
addNewSSHClient
(
conn
);
deferred
.
resolve
(
conn
);
}).
on
(
'
error
'
,
(
err
:
Error
)
=>
{
// SSH connection error, reject with error message
deferred
.
reject
(
new
Error
(
err
.
message
));
}).
connect
(
connectConfig
);
return
deferred
.
promise
;
}
/**
* find a available ssh client in ssh array, if no ssh client available, return undefined
*/
public
async
getAvailableSSHClient
():
Promise
<
Client
>
{
const
deferred
:
Deferred
<
Client
>
=
new
Deferred
<
Client
>
();
for
(
const
index
in
this
.
sshClientArray
)
{
le
t
connectionNumber
:
number
=
this
.
sshClientArray
[
index
].
getUsedConnectionNumber
;
if
(
connectionNumber
<
this
.
maxTrialNumberPerConnection
)
{
for
(
const
index
of
this
.
sshClientArray
.
keys
()
)
{
cons
t
connectionNumber
:
number
=
this
.
sshClientArray
[
index
].
getUsedConnectionNumber
;
if
(
connectionNumber
<
this
.
maxTrialNumberPerConnection
)
{
this
.
sshClientArray
[
index
].
addUsedConnectionNumber
();
deferred
.
resolve
(
this
.
sshClientArray
[
index
].
getSSHClientInstance
);
return
deferred
.
promise
;
}
};
}
//init a new ssh client if could not get an available one
return
await
this
.
initNewSSHClient
();
return
this
.
initNewSSHClient
();
}
/**
* add a new ssh client to sshClientArray
* @param sshClient
* @param sshClient
SSH Client
*/
public
addNewSSHClient
(
client
:
Client
)
{
public
addNewSSHClient
(
client
:
Client
)
:
void
{
this
.
sshClientArray
.
push
(
new
SSHClient
(
client
,
1
));
}
/**
* first ssh cli
l
ent instance is used for gpu collector and host job
* first ssh client instance is used for gpu collector and host job
*/
public
getFirstSSHClient
()
{
public
getFirstSSHClient
()
:
Client
{
return
this
.
sshClientArray
[
0
].
getSSHClientInstance
;
}
/**
* close all of ssh client
*/
public
closeAllSSHClient
()
{
for
(
le
t
sshClient
of
this
.
sshClientArray
)
{
public
closeAllSSHClient
()
:
void
{
for
(
cons
t
sshClient
of
this
.
sshClientArray
)
{
sshClient
.
getSSHClientInstance
.
end
();
}
}
/**
* retrieve resource, minus a number for given ssh client
* @param client
* @param client
SSH Client
*/
public
releaseConnection
(
client
:
Client
|
undefined
)
{
if
(
!
client
)
{
public
releaseConnection
(
client
:
Client
|
undefined
)
:
void
{
if
(
client
===
undefined
)
{
throw
new
Error
(
`could not release a undefined ssh client`
);
}
for
(
le
t
index
in
this
.
sshClientArray
)
{
if
(
this
.
sshClientArray
[
index
].
getSSHClientInstance
===
client
)
{
for
(
cons
t
index
of
this
.
sshClientArray
.
keys
()
)
{
if
(
this
.
sshClientArray
[
index
].
getSSHClientInstance
===
client
)
{
this
.
sshClientArray
[
index
].
minusUsedConnectionNumber
();
break
;
}
}
}
}
/**
* Create a new ssh connection client and initialize it
*/
// tslint:disable:non-literal-fs-path
private
initNewSSHClient
():
Promise
<
Client
>
{
const
deferred
:
Deferred
<
Client
>
=
new
Deferred
<
Client
>
();
const
conn
:
Client
=
new
Client
();
const
connectConfig
:
ConnectConfig
=
{
host
:
this
.
rmMeta
.
ip
,
port
:
this
.
rmMeta
.
port
,
username
:
this
.
rmMeta
.
username
};
if
(
this
.
rmMeta
.
passwd
!==
undefined
)
{
connectConfig
.
password
=
this
.
rmMeta
.
passwd
;
}
else
if
(
this
.
rmMeta
.
sshKeyPath
!==
undefined
)
{
if
(
!
fs
.
existsSync
(
this
.
rmMeta
.
sshKeyPath
))
{
//SSh key path is not a valid file, reject
deferred
.
reject
(
new
Error
(
`
${
this
.
rmMeta
.
sshKeyPath
}
does not exist.`
));
}
const
privateKey
:
string
=
fs
.
readFileSync
(
this
.
rmMeta
.
sshKeyPath
,
'
utf8
'
);
connectConfig
.
privateKey
=
privateKey
;
connectConfig
.
passphrase
=
this
.
rmMeta
.
passphrase
;
}
else
{
deferred
.
reject
(
new
Error
(
`No valid passwd or sshKeyPath is configed.`
));
}
conn
.
on
(
'
ready
'
,
()
=>
{
this
.
addNewSSHClient
(
conn
);
deferred
.
resolve
(
conn
);
})
.
on
(
'
error
'
,
(
err
:
Error
)
=>
{
// SSH connection error, reject with error message
deferred
.
reject
(
new
Error
(
err
.
message
));
})
.
connect
(
connectConfig
);
return
deferred
.
promise
;
}
}
export
type
RemoteMachineScheduleResult
=
{
scheduleInfo
:
RemoteMachineScheduleInfo
|
undefined
;
resultType
:
ScheduleResultType
};
export
type
RemoteMachineScheduleInfo
=
{
rmMeta
:
RemoteMachineMeta
;
cuda_visible_device
:
string
};
export
enum
ScheduleResultType
{
/
*
Schedule succeeded
*/
/
/
Schedule succeeded
SUCCEED
,
/
*
Temporarily, no enough available GPU right now
*/
/
/
Temporarily, no enough available GPU right now
TMP_NO_AVAILABLE_GPU
,
/
*
Cannot match requirement even if all GPU are a
*/
/
/
Cannot match requirement even if all GPU are a
REQUIRE_EXCEED_TOTAL
}
export
const
REMOTEMACHINE_TRIAL_COMMAND_FORMAT
:
string
=
`#!/bin/bash
export NNI_PLATFORM=remote NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4} export MULTI_PHASE={5}
export NNI_PLATFORM=remote NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} \
NNI_TRIAL_SEQ_ID={4} export MULTI_PHASE={5}
cd $NNI_SYS_DIR
sh install_nni.sh
echo $$ >{6}
python3 -m nni_trial_tool.trial_keeper --trial_command '{7}' --nnimanager_ip '{8}' --nnimanager_port '{9}' --nni_manager_version '{10}' --log_collection '{11}' 1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr
python3 -m nni_trial_tool.trial_keeper --trial_command '{7}' --nnimanager_ip '{8}' --nnimanager_port '{9}' \
--nni_manager_version '{10}' --log_collection '{11}' 1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr
echo $?
\`
date +%s%3N
\`
>{12}`
;
export
const
HOST_JOB_SHELL_FORMAT
:
string
=
...
...
src/nni_manager/training_service/remote_machine/remoteMachineJobRestServer.ts
View file @
d48ad027
...
...
@@ -19,17 +19,17 @@
'
use strict
'
;
import
*
as
component
from
'
../../common/component
'
;
import
{
Inject
}
from
'
typescript-ioc
'
;
import
*
as
component
from
'
../../common/component
'
;
import
{
ClusterJobRestServer
}
from
'
../common/clusterJobRestServer
'
;
import
{
RemoteMachineTrainingService
}
from
'
./remoteMachineTrainingService
'
;
import
{
ClusterJobRestServer
}
from
'
../common/clusterJobRestServer
'
/**
* RemoteMachine Training service Rest server, provides rest RemoteMachine to support remotemachine job metrics update
*
*
*/
@
component
.
Singleton
export
class
RemoteMachineJobRestServer
extends
ClusterJobRestServer
{
export
class
RemoteMachineJobRestServer
extends
ClusterJobRestServer
{
@
Inject
private
readonly
remoteMachineTrainingService
:
RemoteMachineTrainingService
;
...
...
@@ -41,6 +41,7 @@ export class RemoteMachineJobRestServer extends ClusterJobRestServer{
this
.
remoteMachineTrainingService
=
component
.
get
(
RemoteMachineTrainingService
);
}
// tslint:disable-next-line:no-any
protected
handleTrialMetrics
(
jobId
:
string
,
metrics
:
any
[])
:
void
{
// Split metrics array into single metric, then emit
// Warning: If not split metrics into single ones, the behavior will be UNKNOWNls
...
...
@@ -51,4 +52,4 @@ export class RemoteMachineJobRestServer extends ClusterJobRestServer{
});
}
}
}
\ No newline at end of file
}
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
View file @
d48ad027
...
...
@@ -34,42 +34,45 @@ import { getExperimentId, getInitTrialSequenceId } from '../../common/experiment
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
ObservableTimer
}
from
'
../../common/observableTimer
'
;
import
{
HostJobApplicationForm
,
HyperParameters
,
JobApplicationForm
,
TrainingService
,
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobMetric
,
NNIManagerIpConfig
HostJobApplicationForm
,
HyperParameters
,
JobApplicationForm
,
NNIManagerIpConfig
,
TrainingService
,
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobMetric
}
from
'
../../common/trainingService
'
;
import
{
delay
,
generateParamFileName
,
getExperimentRootDir
,
uniqueString
,
getJobCancelStatus
,
getRemoteTmpDir
,
getIPV4Address
,
getVersion
,
unixPathJoin
}
from
'
../../common/utils
'
;
import
{
GPUSummary
}
from
'
../common/gpuData
'
;
import
{
delay
,
generateParamFileName
,
getExperimentRootDir
,
getIPV4Address
,
getJobCancelStatus
,
getRemoteTmpDir
,
getVersion
,
uniqueString
,
unixPathJoin
}
from
'
../../common/utils
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../common/containerJobData
'
;
import
{
GPU_INFO_COLLECTOR_FORMAT_LINUX
,
GPUSummary
}
from
'
../common/gpuData
'
;
import
{
TrialConfig
}
from
'
../common/trialConfig
'
;
import
{
TrialConfigMetadataKey
}
from
'
../common/trialConfigMetadataKey
'
;
import
{
execCopydir
,
execMkdir
,
execRemove
,
validateCodeDir
}
from
'
../common/util
'
;
import
{
GPUScheduler
}
from
'
./gpuScheduler
'
;
import
{
HOST_JOB_SHELL_FORMAT
,
RemoteCommandResult
,
RemoteMachineMeta
,
RemoteMachineScheduleInfo
,
RemoteMachineScheduleResult
,
SSHClient
,
SSHClientManager
,
RemoteMachineTrialJobDetail
,
ScheduleResultType
,
REMOTEMACHINE_TRIAL_COMMAND_FORMAT
HOST_JOB_SHELL_FORMAT
,
RemoteCommandResult
,
REMOTEMACHINE_TRIAL_COMMAND_FORMAT
,
RemoteMachineMeta
,
RemoteMachineScheduleInfo
,
RemoteMachineScheduleResult
,
RemoteMachineTrialJobDetail
,
ScheduleResultType
,
SSHClient
,
SSHClientManager
}
from
'
./remoteMachineData
'
;
import
{
GPU_INFO_COLLECTOR_FORMAT_LINUX
}
from
'
../common/gpuData
'
;
import
{
SSHClientUtility
}
from
'
./sshClientUtility
'
;
import
{
validateCodeDir
,
execRemove
,
execMkdir
,
execCopydir
}
from
'
../common/util
'
;
import
{
RemoteMachineJobRestServer
}
from
'
./remoteMachineJobRestServer
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../common/containerJobData
'
;
import
{
SSHClientUtility
}
from
'
./sshClientUtility
'
;
/**
* Training Service implementation for Remote Machine (Linux)
*/
@
component
.
Singleton
class
RemoteMachineTrainingService
implements
TrainingService
{
private
machineSSHClientMap
:
Map
<
RemoteMachineMeta
,
SSHClientManager
>
;
//machine ssh client map
private
trialSSHClientMap
:
Map
<
string
,
Client
>
;
//trial ssh client map
private
trialJobsMap
:
Map
<
string
,
RemoteMachineTrialJobDetail
>
;
private
readonly
MAX_TRIAL_NUMBER_PER_SSHCONNECTION
:
number
=
5
// every ssh client has a max trial concurrency number
private
expRootDir
:
string
;
private
remoteExpRootDir
:
string
;
private
readonly
machineSSHClientMap
:
Map
<
RemoteMachineMeta
,
SSHClientManager
>
;
//machine ssh client map
private
readonly
trialSSHClientMap
:
Map
<
string
,
Client
>
;
//trial ssh client map
private
readonly
trialJobsMap
:
Map
<
string
,
RemoteMachineTrialJobDetail
>
;
private
readonly
MAX_TRIAL_NUMBER_PER_SSHCONNECTION
:
number
=
5
;
// every ssh client has a max trial concurrency number
private
readonly
expRootDir
:
string
;
private
readonly
remoteExpRootDir
:
string
;
private
trialConfig
:
TrialConfig
|
undefined
;
private
gpuScheduler
:
GPUScheduler
;
private
jobQueue
:
string
[];
private
timer
:
ObservableTimer
;
private
readonly
gpuScheduler
:
GPUScheduler
;
private
readonly
jobQueue
:
string
[];
private
readonly
timer
:
ObservableTimer
;
private
stopping
:
boolean
=
false
;
private
metricsEmitter
:
EventEmitter
;
private
log
:
Logger
;
private
readonly
metricsEmitter
:
EventEmitter
;
private
readonly
log
:
Logger
;
private
isMultiPhase
:
boolean
=
false
;
private
trialSequenceId
:
number
;
private
remoteRestServerPort
?:
number
;
...
...
@@ -117,7 +120,7 @@ class RemoteMachineTrainingService implements TrainingService {
break
;
}
}
if
(
restServer
.
getErrorMessage
)
{
if
(
restServer
.
getErrorMessage
!==
undefined
)
{
throw
new
Error
(
restServer
.
getErrorMessage
);
this
.
stopping
=
true
;
}
...
...
@@ -125,36 +128,37 @@ class RemoteMachineTrainingService implements TrainingService {
}
this
.
log
.
info
(
'
Remote machine training service exit.
'
);
}
/**
* give trial a ssh connection
* @param trial
* @param trial
remote machine trial job detail
*/
public
async
allocateSSHClientForTrial
(
trial
:
RemoteMachineTrialJobDetail
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
if
(
!
trial
.
rmMeta
)
{
if
(
trial
.
rmMeta
===
undefined
)
{
throw
new
Error
(
`rmMeta not set in trial
${
trial
.
id
}
`
);
}
le
t
sshClientManager
:
SSHClientManager
|
undefined
=
this
.
machineSSHClientMap
.
get
(
trial
.
rmMeta
);
if
(
!
sshClientManager
)
{
cons
t
sshClientManager
:
SSHClientManager
|
undefined
=
this
.
machineSSHClientMap
.
get
(
trial
.
rmMeta
);
if
(
sshClientManager
===
undefined
)
{
throw
new
Error
(
`remoteSSHClient not initialized`
);
}
le
t
sshClient
:
Client
=
await
sshClientManager
.
getAvailableSSHClient
();
cons
t
sshClient
:
Client
=
await
sshClientManager
.
getAvailableSSHClient
();
this
.
trialSSHClientMap
.
set
(
trial
.
id
,
sshClient
);
deferred
.
resolve
();
return
deferred
.
promise
;
}
/**
* If a trial is finished, release the connection resource
* @param trial
* @param trial
remote machine trial job detail
*/
public
releaseTrialSSHClient
(
trial
:
RemoteMachineTrialJobDetail
):
void
{
if
(
!
trial
.
rmMeta
)
{
if
(
trial
.
rmMeta
===
undefined
)
{
throw
new
Error
(
`rmMeta not set in trial
${
trial
.
id
}
`
);
}
le
t
sshClientManager
:
SSHClientManager
|
undefined
=
this
.
machineSSHClientMap
.
get
(
trial
.
rmMeta
);
if
(
!
sshClientManager
)
{
cons
t
sshClientManager
:
SSHClientManager
|
undefined
=
this
.
machineSSHClientMap
.
get
(
trial
.
rmMeta
);
if
(
sshClientManager
===
undefined
)
{
throw
new
Error
(
`sshClientManager not initialized`
);
}
sshClientManager
.
releaseConnection
(
this
.
trialSSHClientMap
.
get
(
trial
.
id
));
...
...
@@ -167,11 +171,11 @@ class RemoteMachineTrainingService implements TrainingService {
const
jobs
:
TrialJobDetail
[]
=
[];
const
deferred
:
Deferred
<
TrialJobDetail
[]
>
=
new
Deferred
<
TrialJobDetail
[]
>
();
for
(
const
[
key
,
value
]
of
this
.
trialJobsMap
)
{
for
(
const
[
key
,
value
]
of
this
.
trialJobsMap
)
{
if
(
value
.
form
.
jobType
===
'
TRIAL
'
)
{
jobs
.
push
(
await
this
.
getTrialJob
(
key
));
}
}
;
}
deferred
.
resolve
(
jobs
);
return
deferred
.
promise
;
...
...
@@ -183,7 +187,7 @@ class RemoteMachineTrainingService implements TrainingService {
*/
public
async
getTrialJob
(
trialJobId
:
string
):
Promise
<
TrialJobDetail
>
{
const
trialJob
:
RemoteMachineTrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
if
(
!
trialJob
)
{
if
(
trialJob
===
undefined
)
{
throw
new
NNIError
(
NNIErrorNames
.
NOT_FOUND
,
`trial job id
${
trialJobId
}
not found`
);
}
//TO DO: add another job status, and design new job status change logic
...
...
@@ -193,7 +197,7 @@ class RemoteMachineTrainingService implements TrainingService {
throw
new
Error
(
`rmMeta not set for submitted job
${
trialJobId
}
`
);
}
const
sshClient
:
Client
|
undefined
=
this
.
trialSSHClientMap
.
get
(
trialJob
.
id
);
if
(
!
sshClient
)
{
if
(
sshClient
===
undefined
)
{
throw
new
Error
(
`Invalid job id:
${
trialJobId
}
, cannot find ssh client`
);
}
...
...
@@ -223,8 +227,9 @@ class RemoteMachineTrainingService implements TrainingService {
* Submit trial job
* @param form trial job description form
*/
// tslint:disable-next-line:informative-docs
public
async
submitTrialJob
(
form
:
JobApplicationForm
):
Promise
<
TrialJobDetail
>
{
if
(
!
this
.
trialConfig
)
{
if
(
this
.
trialConfig
===
undefined
)
{
throw
new
Error
(
'
trial config is not initialized
'
);
}
...
...
@@ -275,17 +280,6 @@ class RemoteMachineTrainingService implements TrainingService {
return
trialJobDetail
;
}
/**
* remove gpu reversion when job is not running
*/
private
updateGpuReservation
()
{
for
(
const
[
key
,
value
]
of
this
.
trialJobsMap
)
{
if
(
!
[
'
WAITING
'
,
'
RUNNING
'
].
includes
(
value
.
status
))
{
this
.
gpuScheduler
.
removeGpuReservation
(
key
,
this
.
trialJobsMap
);
}
};
}
/**
* Is multiphase job supported in current training service
...
...
@@ -298,10 +292,11 @@ class RemoteMachineTrainingService implements TrainingService {
* Cancel trial job
* @param trialJobId ID of trial job
*/
// tslint:disable:informative-docs no-unsafe-any
public
async
cancelTrialJob
(
trialJobId
:
string
,
isEarlyStopped
:
boolean
=
false
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
trialJob
:
RemoteMachineTrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
if
(
!
trialJob
)
{
if
(
trialJob
===
undefined
)
{
deferred
.
reject
();
throw
new
Error
(
`trial job id
${
trialJobId
}
not found`
);
}
...
...
@@ -316,7 +311,7 @@ class RemoteMachineTrainingService implements TrainingService {
if
(
trialJob
.
rmMeta
!==
undefined
)
{
// If the trial job is already scheduled, check its status and kill the trial process in remote machine
const
sshClient
:
Client
|
undefined
=
this
.
trialSSHClientMap
.
get
(
trialJob
.
id
);
if
(
!
sshClient
)
{
if
(
sshClient
===
undefined
)
{
deferred
.
reject
();
throw
new
Error
(
`Invalid job id
${
trialJobId
}
, cannot find ssh client`
);
}
...
...
@@ -358,20 +353,23 @@ class RemoteMachineTrainingService implements TrainingService {
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
const
remoteMachineTrailConfig
:
TrialConfig
=
<
TrialConfig
>
JSON
.
parse
(
value
);
// Parse trial config failed, throw Error
if
(
!
remoteMachineTrailConfig
)
{
if
(
remoteMachineTrailConfig
===
undefined
)
{
throw
new
Error
(
'
trial config parsed failed
'
);
}
// codeDir is not a valid directory, throw Error
if
(
!
fs
.
lstatSync
(
remoteMachineTrailConfig
.
codeDir
).
isDirectory
())
{
// tslint:disable-next-line:non-literal-fs-path
if
(
!
fs
.
lstatSync
(
remoteMachineTrailConfig
.
codeDir
)
.
isDirectory
())
{
throw
new
Error
(
`codeDir
${
remoteMachineTrailConfig
.
codeDir
}
is not a directory`
);
}
// Validate to make sure codeDir doesn't have too many files
try
{
await
validateCodeDir
(
remoteMachineTrailConfig
.
codeDir
);
}
catch
(
error
)
{
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
return
Promise
.
reject
(
new
Error
(
error
));
return
Promise
.
reject
(
new
Error
(
error
));
}
this
.
trialConfig
=
remoteMachineTrailConfig
;
...
...
@@ -400,60 +398,73 @@ class RemoteMachineTrainingService implements TrainingService {
return
deferred
.
promise
;
}
/**
* cleanup() has a time out of 10s to clean remote connections
* cleanup() has a time out of 10s to clean remote connections
*/
public
async
cleanUp
():
Promise
<
void
>
{
this
.
log
.
info
(
'
Stopping remote machine training service...
'
);
this
.
stopping
=
true
;
await
Promise
.
race
([
delay
(
10000
),
this
.
cleanupConnections
()]);
}
/**
* remove gpu reversion when job is not running
*/
private
updateGpuReservation
():
void
{
for
(
const
[
key
,
value
]
of
this
.
trialJobsMap
)
{
if
(
!
[
'
WAITING
'
,
'
RUNNING
'
].
includes
(
value
.
status
))
{
this
.
gpuScheduler
.
removeGpuReservation
(
key
,
this
.
trialJobsMap
);
}
}
}
/**
* stop gpu_metric_collector process in remote machine and remove unused scripts
*/
private
async
cleanupConnections
():
Promise
<
void
>
{
try
{
try
{
for
(
const
[
rmMeta
,
sshClientManager
]
of
this
.
machineSSHClientMap
.
entries
())
{
le
t
jobpidPath
:
string
=
unixPathJoin
(
this
.
getRemoteScriptsPath
(
rmMeta
.
username
),
'
pid
'
);
le
t
client
:
Client
|
undefined
=
sshClientManager
.
getFirstSSHClient
();
if
(
client
)
{
cons
t
jobpidPath
:
string
=
unixPathJoin
(
this
.
getRemoteScriptsPath
(
rmMeta
.
username
),
'
pid
'
);
cons
t
client
:
Client
|
undefined
=
sshClientManager
.
getFirstSSHClient
();
if
(
client
!==
undefined
)
{
await
SSHClientUtility
.
remoteExeCommand
(
`pkill -P
\`
cat
${
jobpidPath
}
\`
`
,
client
);
await
SSHClientUtility
.
remoteExeCommand
(
`rm -rf
${
this
.
getRemoteScriptsPath
(
rmMeta
.
username
)}
`
,
client
);
}
sshClientManager
.
closeAllSSHClient
();
}
}
catch
(
error
)
{
}
catch
(
error
)
{
//ignore error, this function is called to cleanup remote connections when experiment is stopping
this
.
log
.
error
(
`Cleanup connection exception, error is
${
error
.
message
}
`
);
}
return
Promise
.
resolve
();
}
}
/**
* Generate gpu metric collector directory to store temp gpu metric collector script files
*/
private
getLocalGpuMetricCollectorDir
():
string
{
let
userName
:
string
=
path
.
basename
(
os
.
homedir
());
//get current user name of os
const
userName
:
string
=
path
.
basename
(
os
.
homedir
());
//get current user name of os
return
path
.
join
(
os
.
tmpdir
(),
userName
,
'
nni
'
,
'
scripts
'
);
}
/**
* Generate gpu metric collector shell script in local machine,
* used to run in remote machine, and will be deleted after uploaded from local.
* Generate gpu metric collector shell script in local machine,
* used to run in remote machine, and will be deleted after uploaded from local.
*/
private
async
generateGpuMetricsCollectorScript
(
userName
:
string
):
Promise
<
void
>
{
le
t
gpuMetricCollectorScriptFolder
:
string
=
this
.
getLocalGpuMetricCollectorDir
();
cons
t
gpuMetricCollectorScriptFolder
:
string
=
this
.
getLocalGpuMetricCollectorDir
();
await
execMkdir
(
path
.
join
(
gpuMetricCollectorScriptFolder
,
userName
));
//generate gpu_metrics_collector.sh
let
gpuMetricsCollectorScriptPath
:
string
=
path
.
join
(
gpuMetricCollectorScriptFolder
,
userName
,
'
gpu_metrics_collector.sh
'
);
const
remoteGPUScriptsDir
:
string
=
this
.
getRemoteScriptsPath
(
userName
);
// This directory is used to store gpu_metrics and pid created by script
const
gpuMetricsCollectorScriptPath
:
string
=
path
.
join
(
gpuMetricCollectorScriptFolder
,
userName
,
'
gpu_metrics_collector.sh
'
);
// This directory is used to store gpu_metrics and pid created by script
const
remoteGPUScriptsDir
:
string
=
this
.
getRemoteScriptsPath
(
userName
);
const
gpuMetricsCollectorScriptContent
:
string
=
String
.
Format
(
GPU_INFO_COLLECTOR_FORMAT_LINUX
,
remoteGPUScriptsDir
,
unixPathJoin
(
remoteGPUScriptsDir
,
'
pid
'
)
,
GPU_INFO_COLLECTOR_FORMAT_LINUX
,
remoteGPUScriptsDir
,
unixPathJoin
(
remoteGPUScriptsDir
,
'
pid
'
)
);
await
fs
.
promises
.
writeFile
(
gpuMetricsCollectorScriptPath
,
gpuMetricsCollectorScriptContent
,
{
encoding
:
'
utf8
'
});
}
...
...
@@ -467,39 +478,44 @@ class RemoteMachineTrainingService implements TrainingService {
rmMetaList
.
forEach
(
async
(
rmMeta
:
RemoteMachineMeta
)
=>
{
rmMeta
.
occupiedGpuIndexMap
=
new
Map
<
number
,
number
>
();
le
t
sshClientManager
:
SSHClientManager
=
new
SSHClientManager
([],
this
.
MAX_TRIAL_NUMBER_PER_SSHCONNECTION
,
rmMeta
);
le
t
sshClient
:
Client
=
await
sshClientManager
.
getAvailableSSHClient
();
cons
t
sshClientManager
:
SSHClientManager
=
new
SSHClientManager
([],
this
.
MAX_TRIAL_NUMBER_PER_SSHCONNECTION
,
rmMeta
);
cons
t
sshClient
:
Client
=
await
sshClientManager
.
getAvailableSSHClient
();
this
.
machineSSHClientMap
.
set
(
rmMeta
,
sshClientManager
);
await
this
.
initRemoteMachineOnConnected
(
rmMeta
,
sshClient
);
if
(
++
connectedRMNum
===
rmMetaList
.
length
)
{
deferred
.
resolve
();
}
});
return
deferred
.
promise
;
}
private
async
initRemoteMachineOnConnected
(
rmMeta
:
RemoteMachineMeta
,
conn
:
Client
):
Promise
<
void
>
{
// Create root working directory after ssh connection is ready
await
this
.
generateGpuMetricsCollectorScript
(
rmMeta
.
username
);
//generate gpu script in local machine first, will copy to remote machine later
// generate gpu script in local machine first, will copy to remote machine later
await
this
.
generateGpuMetricsCollectorScript
(
rmMeta
.
username
);
const
nniRootDir
:
string
=
unixPathJoin
(
getRemoteTmpDir
(
this
.
remoteOS
),
'
nni
'
);
await
SSHClientUtility
.
remoteExeCommand
(
`mkdir -p
${
this
.
remoteExpRootDir
}
`
,
conn
);
// Copy NNI scripts to remote expeirment working directory
const
localGpuScriptCollectorDir
:
string
=
this
.
getLocalGpuMetricCollectorDir
();
const
remoteGpuScriptCollectorDir
:
string
=
this
.
getRemoteScriptsPath
(
rmMeta
.
username
);
//the directory to store temp scripts in remote machine
// the directory to store temp scripts in remote machine
const
remoteGpuScriptCollectorDir
:
string
=
this
.
getRemoteScriptsPath
(
rmMeta
.
username
);
await
SSHClientUtility
.
remoteExeCommand
(
`mkdir -p
${
remoteGpuScriptCollectorDir
}
`
,
conn
);
await
SSHClientUtility
.
remoteExeCommand
(
`chmod 777
${
nniRootDir
}
${
nniRootDir
}
/*
${
nniRootDir
}
/scripts/*`
,
conn
);
//copy gpu_metrics_collector.sh to remote
await
SSHClientUtility
.
copyFileToRemote
(
path
.
join
(
localGpuScriptCollectorDir
,
rmMeta
.
username
,
'
gpu_metrics_collector.sh
'
),
unixPathJoin
(
remoteGpuScriptCollectorDir
,
'
gpu_metrics_collector.sh
'
),
conn
);
await
SSHClientUtility
.
copyFileToRemote
(
path
.
join
(
localGpuScriptCollectorDir
,
rmMeta
.
username
,
'
gpu_metrics_collector.sh
'
),
unixPathJoin
(
remoteGpuScriptCollectorDir
,
'
gpu_metrics_collector.sh
'
),
conn
);
//Begin to execute gpu_metrics_collection scripts
// tslint:disable-next-line: no-floating-promises
SSHClientUtility
.
remoteExeCommand
(
`bash
${
unixPathJoin
(
remoteGpuScriptCollectorDir
,
'
gpu_metrics_collector.sh
'
)}
`
,
conn
);
this
.
timer
.
subscribe
(
async
(
tick
:
number
)
=>
{
const
cmdresult
:
RemoteCommandResult
=
await
SSHClientUtility
.
remoteExeCommand
(
`tail -n 1
${
unixPathJoin
(
remoteGpuScriptCollectorDir
,
'
gpu_metrics
'
)}
`
,
conn
);
if
(
cmdresult
&&
cmdresult
.
stdout
)
{
if
(
cmdresult
!==
undefined
&&
cmdresult
.
stdout
!==
undefined
)
{
rmMeta
.
gpuSummary
=
<
GPUSummary
>
JSON
.
parse
(
cmdresult
.
stdout
);
}
}
...
...
@@ -509,7 +525,7 @@ class RemoteMachineTrainingService implements TrainingService {
private
async
prepareTrialJob
(
trialJobId
:
string
):
Promise
<
boolean
>
{
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
if
(
!
this
.
trialConfig
)
{
if
(
this
.
trialConfig
===
undefined
)
{
throw
new
Error
(
'
trial config is not initialized
'
);
}
const
trialJobDetail
:
RemoteMachineTrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
...
...
@@ -519,6 +535,7 @@ class RemoteMachineTrainingService implements TrainingService {
// If job is not WATIING, Don't prepare and resolve true immediately
if
(
trialJobDetail
.
status
!==
'
WAITING
'
)
{
deferred
.
resolve
(
true
);
return
deferred
.
promise
;
}
// get an ssh client from scheduler
...
...
@@ -557,7 +574,7 @@ class RemoteMachineTrainingService implements TrainingService {
private
async
launchTrialOnScheduledMachine
(
trialJobId
:
string
,
trialWorkingFolder
:
string
,
form
:
TrialJobApplicationForm
,
rmScheduleInfo
:
RemoteMachineScheduleInfo
):
Promise
<
void
>
{
if
(
!
this
.
trialConfig
)
{
if
(
this
.
trialConfig
===
undefined
)
{
throw
new
Error
(
'
trial config is not initialized
'
);
}
const
cuda_visible_device
:
string
=
rmScheduleInfo
.
cuda_visible_device
;
...
...
@@ -584,18 +601,19 @@ class RemoteMachineTrainingService implements TrainingService {
let
command
:
string
;
// Set CUDA_VISIBLE_DEVICES environment variable based on cuda_visible_device
// If no valid cuda_visible_device is defined, set CUDA_VISIBLE_DEVICES to empty string to hide GPU device
if
(
typeof
cuda_visible_device
===
'
string
'
&&
cuda_visible_device
.
length
>
0
)
{
if
(
typeof
cuda_visible_device
===
'
string
'
&&
cuda_visible_device
.
length
>
0
)
{
command
=
`CUDA_VISIBLE_DEVICES=
${
cuda_visible_device
}
${
this
.
trialConfig
.
command
}
`
;
}
else
{
command
=
`CUDA_VISIBLE_DEVICES=" "
${
this
.
trialConfig
.
command
}
`
;
}
const
nniManagerIp
=
this
.
nniManagerIpConfig
?
this
.
nniManagerIpConfig
.
nniManagerIp
:
getIPV4Address
();
if
(
!
this
.
remoteRestServerPort
)
{
// tslint:disable-next-line: strict-boolean-expressions
const
nniManagerIp
:
string
=
this
.
nniManagerIpConfig
?
this
.
nniManagerIpConfig
.
nniManagerIp
:
getIPV4Address
();
if
(
this
.
remoteRestServerPort
===
undefined
)
{
const
restServer
:
RemoteMachineJobRestServer
=
component
.
get
(
RemoteMachineJobRestServer
);
this
.
remoteRestServerPort
=
restServer
.
clusterRestServerPort
;
}
const
version
=
this
.
versionCheck
?
await
getVersion
():
''
;
const
version
:
string
=
this
.
versionCheck
?
await
getVersion
()
:
''
;
const
runScriptTrialContent
:
string
=
String
.
Format
(
REMOTEMACHINE_TRIAL_COMMAND_FORMAT
,
trialWorkingFolder
,
...
...
@@ -611,7 +629,7 @@ class RemoteMachineTrainingService implements TrainingService {
version
,
this
.
logCollection
,
unixPathJoin
(
trialWorkingFolder
,
'
.nni
'
,
'
code
'
)
)
)
;
//create tmp trial working folder locally.
await
execMkdir
(
path
.
join
(
trialLocalTempFolder
,
'
.nni
'
));
...
...
@@ -627,6 +645,7 @@ class RemoteMachineTrainingService implements TrainingService {
// Copy files in codeDir to remote working directory
await
SSHClientUtility
.
copyDirectoryToRemote
(
trialLocalTempFolder
,
trialWorkingFolder
,
sshClient
,
this
.
remoteOS
);
// Execute command in remote machine
// tslint:disable-next-line: no-floating-promises
SSHClientUtility
.
remoteExeCommand
(
`bash
${
unixPathJoin
(
trialWorkingFolder
,
'
run.sh
'
)}
`
,
sshClient
);
}
...
...
@@ -636,7 +655,7 @@ class RemoteMachineTrainingService implements TrainingService {
if
(
sshClientManager
===
undefined
)
{
throw
new
Error
(
'
sshClient not found.
'
);
}
le
t
sshClient
:
Client
=
sshClientManager
.
getFirstSSHClient
();
cons
t
sshClient
:
Client
=
sshClientManager
.
getFirstSSHClient
();
const
jobId
:
string
=
uniqueString
(
5
);
const
localDir
:
string
=
path
.
join
(
this
.
expRootDir
,
'
hostjobs-local
'
,
jobId
);
const
remoteDir
:
string
=
this
.
getHostJobRemoteDir
(
jobId
);
...
...
@@ -648,6 +667,7 @@ class RemoteMachineTrainingService implements TrainingService {
await
fs
.
promises
.
writeFile
(
path
.
join
(
localDir
,
'
run.sh
'
),
runScriptContent
,
{
encoding
:
'
utf8
'
});
await
SSHClientUtility
.
copyFileToRemote
(
path
.
join
(
localDir
,
'
run.sh
'
),
unixPathJoin
(
remoteDir
,
'
run.sh
'
),
sshClient
);
// tslint:disable-next-line: no-floating-promises
SSHClientUtility
.
remoteExeCommand
(
`bash
${
unixPathJoin
(
remoteDir
,
'
run.sh
'
)}
`
,
sshClient
);
const
jobDetail
:
RemoteMachineTrialJobDetail
=
new
RemoteMachineTrialJobDetail
(
...
...
@@ -680,8 +700,9 @@ class RemoteMachineTrainingService implements TrainingService {
if
(
killResult
!==
0
)
{
const
trailReturnCode
:
string
=
await
SSHClientUtility
.
getRemoteFileContent
(
trialReturnCodeFilePath
,
sshClient
);
this
.
log
.
debug
(
`trailjob
${
trialJob
.
id
}
return code:
${
trailReturnCode
}
`
);
const
match
:
RegExpMatchArray
|
null
=
trailReturnCode
.
trim
().
match
(
/^
(\d
+
)\s
+
(\d
+
)
$/
);
if
(
match
)
{
const
match
:
RegExpMatchArray
|
null
=
trailReturnCode
.
trim
()
.
match
(
/^
(\d
+
)\s
+
(\d
+
)
$/
);
if
(
match
!==
null
)
{
const
{
1
:
code
,
2
:
timestamp
}
=
match
;
// Update trial job's status based on result code
if
(
parseInt
(
code
,
10
)
===
0
)
{
...
...
@@ -709,6 +730,7 @@ class RemoteMachineTrainingService implements TrainingService {
deferred
.
resolve
(
trialJob
);
}
}
return
deferred
.
promise
;
}
...
...
@@ -720,7 +742,7 @@ class RemoteMachineTrainingService implements TrainingService {
return
unixPathJoin
(
this
.
remoteExpRootDir
,
'
hostjobs
'
,
jobId
);
}
private
getRemoteExperimentRootDir
():
string
{
private
getRemoteExperimentRootDir
():
string
{
return
unixPathJoin
(
getRemoteTmpDir
(
this
.
remoteOS
),
'
nni
'
,
'
experiments
'
,
getExperimentId
());
}
...
...
Prev
1
2
3
4
5
6
7
8
9
10
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment