Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
1328f412
Unverified
Commit
1328f412
authored
Dec 10, 2019
by
chicm-ms
Committed by
GitHub
Dec 10, 2019
Browse files
Fix eslint errors (#1836)
* update eslint rules * auto fix eslint * manually fix eslint (#1833)
parent
8c07cf41
Changes
42
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
155 additions
and
162 deletions
+155
-162
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowConfig.ts
...er/training_service/kubernetes/kubeflow/kubeflowConfig.ts
+1
-1
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowJobInfoCollector.ts
...g_service/kubernetes/kubeflow/kubeflowJobInfoCollector.ts
+2
-2
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts
...ng_service/kubernetes/kubeflow/kubeflowTrainingService.ts
+15
-17
src/nni_manager/training_service/kubernetes/kubernetesApiClient.ts
...anager/training_service/kubernetes/kubernetesApiClient.ts
+5
-4
src/nni_manager/training_service/kubernetes/kubernetesConfig.ts
...i_manager/training_service/kubernetes/kubernetesConfig.ts
+11
-11
src/nni_manager/training_service/kubernetes/kubernetesJobInfoCollector.ts
...training_service/kubernetes/kubernetesJobInfoCollector.ts
+4
-4
src/nni_manager/training_service/kubernetes/kubernetesJobRestServer.ts
...er/training_service/kubernetes/kubernetesJobRestServer.ts
+1
-1
src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts
.../training_service/kubernetes/kubernetesTrainingService.ts
+8
-9
src/nni_manager/training_service/local/gpuScheduler.ts
src/nni_manager/training_service/local/gpuScheduler.ts
+0
-2
src/nni_manager/training_service/local/localTrainingService.ts
...ni_manager/training_service/local/localTrainingService.ts
+3
-2
src/nni_manager/training_service/pai/hdfsClientUtility.ts
src/nni_manager/training_service/pai/hdfsClientUtility.ts
+42
-42
src/nni_manager/training_service/pai/paiConfig.ts
src/nni_manager/training_service/pai/paiConfig.ts
+11
-11
src/nni_manager/training_service/pai/paiData.ts
src/nni_manager/training_service/pai/paiData.ts
+1
-1
src/nni_manager/training_service/pai/paiJobInfoCollector.ts
src/nni_manager/training_service/pai/paiJobInfoCollector.ts
+7
-8
src/nni_manager/training_service/pai/paiJobRestServer.ts
src/nni_manager/training_service/pai/paiJobRestServer.ts
+2
-2
src/nni_manager/training_service/pai/paiTrainingService.ts
src/nni_manager/training_service/pai/paiTrainingService.ts
+18
-20
src/nni_manager/training_service/pai/paiTrialConfig.ts
src/nni_manager/training_service/pai/paiTrialConfig.ts
+1
-1
src/nni_manager/training_service/remote_machine/gpuScheduler.ts
...i_manager/training_service/remote_machine/gpuScheduler.ts
+11
-12
src/nni_manager/training_service/remote_machine/remoteMachineData.ts
...ager/training_service/remote_machine/remoteMachineData.ts
+10
-10
src/nni_manager/training_service/remote_machine/remoteMachineJobRestServer.ts
...ning_service/remote_machine/remoteMachineJobRestServer.ts
+2
-2
No files found.
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowConfig.ts
View file @
1328f412
...
@@ -118,7 +118,7 @@ export class KubeflowTrialConfig extends KubernetesTrialConfig {
...
@@ -118,7 +118,7 @@ export class KubeflowTrialConfig extends KubernetesTrialConfig {
export
class
KubeflowTrialConfigTemplate
extends
KubernetesTrialConfigTemplate
{
export
class
KubeflowTrialConfigTemplate
extends
KubernetesTrialConfigTemplate
{
public
readonly
replicas
:
number
;
public
readonly
replicas
:
number
;
constructor
(
replicas
:
number
,
command
:
string
,
gpuNum
:
number
,
constructor
(
replicas
:
number
,
command
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
,
privateRegistryAuthPath
?:
string
)
{
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
,
privateRegistryAuthPath
?:
string
)
{
super
(
command
,
gpuNum
,
cpuNum
,
memoryMB
,
image
,
privateRegistryAuthPath
);
super
(
command
,
gpuNum
,
cpuNum
,
memoryMB
,
image
,
privateRegistryAuthPath
);
this
.
replicas
=
replicas
;
this
.
replicas
=
replicas
;
...
...
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowJobInfoCollector.ts
View file @
1328f412
...
@@ -17,7 +17,7 @@ export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector {
...
@@ -17,7 +17,7 @@ export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector {
}
}
protected
async
retrieveSingleTrialJobInfo
(
kubernetesCRDClient
:
KubernetesCRDClient
|
undefined
,
protected
async
retrieveSingleTrialJobInfo
(
kubernetesCRDClient
:
KubernetesCRDClient
|
undefined
,
kubernetesTrialJob
:
KubernetesTrialJobDetail
)
:
Promise
<
void
>
{
kubernetesTrialJob
:
KubernetesTrialJobDetail
):
Promise
<
void
>
{
if
(
!
this
.
statusesNeedToCheck
.
includes
(
kubernetesTrialJob
.
status
))
{
if
(
!
this
.
statusesNeedToCheck
.
includes
(
kubernetesTrialJob
.
status
))
{
return
Promise
.
resolve
();
return
Promise
.
resolve
();
}
}
...
@@ -40,7 +40,7 @@ export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector {
...
@@ -40,7 +40,7 @@ export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector {
if
(
kubernetesJobInfo
.
status
&&
kubernetesJobInfo
.
status
.
conditions
)
{
if
(
kubernetesJobInfo
.
status
&&
kubernetesJobInfo
.
status
.
conditions
)
{
const
latestCondition
:
any
=
kubernetesJobInfo
.
status
.
conditions
[
kubernetesJobInfo
.
status
.
conditions
.
length
-
1
];
const
latestCondition
:
any
=
kubernetesJobInfo
.
status
.
conditions
[
kubernetesJobInfo
.
status
.
conditions
.
length
-
1
];
const
tfJobType
:
KubeflowJobStatus
=
<
KubeflowJobStatus
>
latestCondition
.
type
;
const
tfJobType
:
KubeflowJobStatus
=
<
KubeflowJobStatus
>
latestCondition
.
type
;
switch
(
tfJobType
)
{
switch
(
tfJobType
)
{
case
'
Created
'
:
case
'
Created
'
:
kubernetesTrialJob
.
status
=
'
WAITING
'
;
kubernetesTrialJob
.
status
=
'
WAITING
'
;
...
...
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts
View file @
1328f412
...
@@ -17,7 +17,6 @@ import { delay, generateParamFileName, getExperimentRootDir, uniqueString } from
...
@@ -17,7 +17,6 @@ import { delay, generateParamFileName, getExperimentRootDir, uniqueString } from
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../../common/containerJobData
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../../common/containerJobData
'
;
import
{
TrialConfigMetadataKey
}
from
'
../../common/trialConfigMetadataKey
'
;
import
{
TrialConfigMetadataKey
}
from
'
../../common/trialConfigMetadataKey
'
;
import
{
validateCodeDir
}
from
'
../../common/util
'
;
import
{
validateCodeDir
}
from
'
../../common/util
'
;
import
{
AzureStorageClientUtility
}
from
'
../azureStorageClientUtils
'
;
import
{
NFSConfig
}
from
'
../kubernetesConfig
'
;
import
{
NFSConfig
}
from
'
../kubernetesConfig
'
;
import
{
KubernetesTrialJobDetail
}
from
'
../kubernetesData
'
;
import
{
KubernetesTrialJobDetail
}
from
'
../kubernetesData
'
;
import
{
KubernetesTrainingService
}
from
'
../kubernetesTrainingService
'
;
import
{
KubernetesTrainingService
}
from
'
../kubernetesTrainingService
'
;
...
@@ -116,7 +115,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -116,7 +115,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
this
.
nniManagerIpConfig
=
<
NNIManagerIpConfig
>
JSON
.
parse
(
value
);
this
.
nniManagerIpConfig
=
<
NNIManagerIpConfig
>
JSON
.
parse
(
value
);
break
;
break
;
case
TrialConfigMetadataKey
.
KUBEFLOW_CLUSTER_CONFIG
:
case
TrialConfigMetadataKey
.
KUBEFLOW_CLUSTER_CONFIG
:
{
const
kubeflowClusterJsonObject
:
object
=
JSON
.
parse
(
value
);
const
kubeflowClusterJsonObject
:
object
=
JSON
.
parse
(
value
);
this
.
kubeflowClusterConfig
=
KubeflowClusterConfigFactory
.
generateKubeflowClusterConfig
(
kubeflowClusterJsonObject
);
this
.
kubeflowClusterConfig
=
KubeflowClusterConfigFactory
.
generateKubeflowClusterConfig
(
kubeflowClusterJsonObject
);
if
(
this
.
kubeflowClusterConfig
.
storageType
===
'
azureStorage
'
)
{
if
(
this
.
kubeflowClusterConfig
.
storageType
===
'
azureStorage
'
)
{
...
@@ -125,9 +124,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -125,9 +124,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
this
.
azureStorageShare
=
azureKubeflowClusterConfig
.
azureStorage
.
azureShare
;
this
.
azureStorageShare
=
azureKubeflowClusterConfig
.
azureStorage
.
azureShare
;
await
this
.
createAzureStorage
(
await
this
.
createAzureStorage
(
azureKubeflowClusterConfig
.
keyVault
.
vaultName
,
azureKubeflowClusterConfig
.
keyVault
.
vaultName
,
azureKubeflowClusterConfig
.
keyVault
.
name
,
azureKubeflowClusterConfig
.
keyVault
.
name
azureKubeflowClusterConfig
.
azureStorage
.
accountName
,
azureKubeflowClusterConfig
.
azureStorage
.
azureShare
);
);
}
else
if
(
this
.
kubeflowClusterConfig
.
storageType
===
'
nfs
'
)
{
}
else
if
(
this
.
kubeflowClusterConfig
.
storageType
===
'
nfs
'
)
{
const
nfsKubeflowClusterConfig
:
KubeflowClusterConfigNFS
=
<
KubeflowClusterConfigNFS
>
this
.
kubeflowClusterConfig
;
const
nfsKubeflowClusterConfig
:
KubeflowClusterConfigNFS
=
<
KubeflowClusterConfigNFS
>
this
.
kubeflowClusterConfig
;
...
@@ -139,8 +136,8 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -139,8 +136,8 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
this
.
kubernetesCRDClient
=
KubeflowOperatorClientFactory
.
createClient
(
this
.
kubernetesCRDClient
=
KubeflowOperatorClientFactory
.
createClient
(
this
.
kubeflowClusterConfig
.
operator
,
this
.
kubeflowClusterConfig
.
apiVersion
);
this
.
kubeflowClusterConfig
.
operator
,
this
.
kubeflowClusterConfig
.
apiVersion
);
break
;
break
;
}
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
{
if
(
this
.
kubeflowClusterConfig
===
undefined
)
{
if
(
this
.
kubeflowClusterConfig
===
undefined
)
{
this
.
log
.
error
(
'
kubeflow cluster config is not initialized
'
);
this
.
log
.
error
(
'
kubeflow cluster config is not initialized
'
);
...
@@ -163,6 +160,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -163,6 +160,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
return
Promise
.
reject
(
new
Error
(
error
));
return
Promise
.
reject
(
new
Error
(
error
));
}
}
break
;
break
;
}
case
TrialConfigMetadataKey
.
VERSION_CHECK
:
case
TrialConfigMetadataKey
.
VERSION_CHECK
:
this
.
versionCheck
=
(
value
===
'
true
'
||
value
===
'
True
'
);
this
.
versionCheck
=
(
value
===
'
true
'
||
value
===
'
True
'
);
break
;
break
;
...
@@ -235,7 +233,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -235,7 +233,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
//create tmp trial working folder locally.
//create tmp trial working folder locally.
await
cpp
.
exec
(
`mkdir -p
${
trialLocalTempFolder
}
`
);
await
cpp
.
exec
(
`mkdir -p
${
trialLocalTempFolder
}
`
);
const
runScriptContent
:
string
=
CONTAINER_INSTALL_NNI_SHELL_FORMAT
;
const
runScriptContent
:
string
=
CONTAINER_INSTALL_NNI_SHELL_FORMAT
;
// Write NNI installation file to local tmp files
// Write NNI installation file to local tmp files
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
install_nni.sh
'
),
runScriptContent
,
{
encoding
:
'
utf8
'
});
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
install_nni.sh
'
),
runScriptContent
,
{
encoding
:
'
utf8
'
});
...
@@ -293,14 +291,14 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -293,14 +291,14 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
throw
Error
(
`operator
${
this
.
kubeflowClusterConfig
.
operator
}
is invalid`
);
throw
Error
(
`operator
${
this
.
kubeflowClusterConfig
.
operator
}
is invalid`
);
}
}
const
workerPodResources
:
any
=
{};
const
workerPodResources
:
any
=
{};
if
(
kubeflowTrialConfig
.
worker
!==
undefined
)
{
if
(
kubeflowTrialConfig
.
worker
!==
undefined
)
{
workerPodResources
.
requests
=
this
.
generatePodResource
(
kubeflowTrialConfig
.
worker
.
memoryMB
,
kubeflowTrialConfig
.
worker
.
cpuNum
,
workerPodResources
.
requests
=
this
.
generatePodResource
(
kubeflowTrialConfig
.
worker
.
memoryMB
,
kubeflowTrialConfig
.
worker
.
cpuNum
,
kubeflowTrialConfig
.
worker
.
gpuNum
);
kubeflowTrialConfig
.
worker
.
gpuNum
);
}
}
workerPodResources
.
limits
=
{...
workerPodResources
.
requests
};
workerPodResources
.
limits
=
{...
workerPodResources
.
requests
};
const
nonWorkerResources
:
any
=
{};
const
nonWorkerResources
:
any
=
{};
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
const
tensorflowTrialConfig
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
const
tensorflowTrialConfig
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
if
(
tensorflowTrialConfig
.
ps
!==
undefined
)
{
if
(
tensorflowTrialConfig
.
ps
!==
undefined
)
{
...
@@ -330,8 +328,8 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -330,8 +328,8 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
* @param workerPodResources worker pod template
* @param workerPodResources worker pod template
* @param nonWorkerPodResources non-worker pod template, like ps or master
* @param nonWorkerPodResources non-worker pod template, like ps or master
*/
*/
private
async
generateKubeflowJobConfig
(
trialJobId
:
string
,
trialWorkingFolder
:
string
,
kubeflowJobName
:
string
,
workerPodResources
:
any
,
private
async
generateKubeflowJobConfig
(
trialJobId
:
string
,
trialWorkingFolder
:
string
,
kubeflowJobName
:
string
,
workerPodResources
:
any
,
nonWorkerPodResources
?:
any
)
:
Promise
<
any
>
{
nonWorkerPodResources
?:
any
):
Promise
<
any
>
{
if
(
this
.
kubeflowClusterConfig
===
undefined
)
{
if
(
this
.
kubeflowClusterConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
}
}
...
@@ -348,11 +346,11 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -348,11 +346,11 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
const
replicaSpecsObjMap
:
Map
<
string
,
object
>
=
new
Map
<
string
,
object
>
();
const
replicaSpecsObjMap
:
Map
<
string
,
object
>
=
new
Map
<
string
,
object
>
();
if
(
this
.
kubeflowTrialConfig
.
operatorType
===
'
tf-operator
'
)
{
if
(
this
.
kubeflowTrialConfig
.
operatorType
===
'
tf-operator
'
)
{
const
tensorflowTrialConfig
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
const
tensorflowTrialConfig
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
le
t
privateRegistrySecretName
=
await
this
.
createRegistrySecret
(
tensorflowTrialConfig
.
worker
.
privateRegistryAuthPath
);
cons
t
privateRegistrySecretName
=
await
this
.
createRegistrySecret
(
tensorflowTrialConfig
.
worker
.
privateRegistryAuthPath
);
replicaSpecsObj
.
Worker
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
tensorflowTrialConfig
.
worker
.
replicas
,
replicaSpecsObj
.
Worker
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
tensorflowTrialConfig
.
worker
.
replicas
,
tensorflowTrialConfig
.
worker
.
image
,
'
run_worker.sh
'
,
workerPodResources
,
privateRegistrySecretName
);
tensorflowTrialConfig
.
worker
.
image
,
'
run_worker.sh
'
,
workerPodResources
,
privateRegistrySecretName
);
if
(
tensorflowTrialConfig
.
ps
!==
undefined
)
{
if
(
tensorflowTrialConfig
.
ps
!==
undefined
)
{
le
t
privateRegistrySecretName
:
string
|
undefined
=
await
this
.
createRegistrySecret
(
tensorflowTrialConfig
.
ps
.
privateRegistryAuthPath
);
cons
t
privateRegistrySecretName
:
string
|
undefined
=
await
this
.
createRegistrySecret
(
tensorflowTrialConfig
.
ps
.
privateRegistryAuthPath
);
replicaSpecsObj
.
Ps
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
tensorflowTrialConfig
.
ps
.
replicas
,
replicaSpecsObj
.
Ps
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
tensorflowTrialConfig
.
ps
.
replicas
,
tensorflowTrialConfig
.
ps
.
image
,
'
run_ps.sh
'
,
nonWorkerPodResources
,
privateRegistrySecretName
);
tensorflowTrialConfig
.
ps
.
image
,
'
run_ps.sh
'
,
nonWorkerPodResources
,
privateRegistrySecretName
);
}
}
...
@@ -360,11 +358,11 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -360,11 +358,11 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
}
else
if
(
this
.
kubeflowTrialConfig
.
operatorType
===
'
pytorch-operator
'
)
{
}
else
if
(
this
.
kubeflowTrialConfig
.
operatorType
===
'
pytorch-operator
'
)
{
const
pytorchTrialConfig
:
KubeflowTrialConfigPytorch
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
const
pytorchTrialConfig
:
KubeflowTrialConfigPytorch
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
if
(
pytorchTrialConfig
.
worker
!==
undefined
)
{
if
(
pytorchTrialConfig
.
worker
!==
undefined
)
{
le
t
privateRegistrySecretName
:
string
|
undefined
=
await
this
.
createRegistrySecret
(
pytorchTrialConfig
.
worker
.
privateRegistryAuthPath
);
cons
t
privateRegistrySecretName
:
string
|
undefined
=
await
this
.
createRegistrySecret
(
pytorchTrialConfig
.
worker
.
privateRegistryAuthPath
);
replicaSpecsObj
.
Worker
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
pytorchTrialConfig
.
worker
.
replicas
,
replicaSpecsObj
.
Worker
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
pytorchTrialConfig
.
worker
.
replicas
,
pytorchTrialConfig
.
worker
.
image
,
'
run_worker.sh
'
,
workerPodResources
,
privateRegistrySecretName
);
pytorchTrialConfig
.
worker
.
image
,
'
run_worker.sh
'
,
workerPodResources
,
privateRegistrySecretName
);
}
}
le
t
privateRegistrySecretName
:
string
|
undefined
=
await
this
.
createRegistrySecret
(
pytorchTrialConfig
.
master
.
privateRegistryAuthPath
);
cons
t
privateRegistrySecretName
:
string
|
undefined
=
await
this
.
createRegistrySecret
(
pytorchTrialConfig
.
master
.
privateRegistryAuthPath
);
replicaSpecsObj
.
Master
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
pytorchTrialConfig
.
master
.
replicas
,
replicaSpecsObj
.
Master
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
pytorchTrialConfig
.
master
.
replicas
,
pytorchTrialConfig
.
master
.
image
,
'
run_master.sh
'
,
nonWorkerPodResources
,
privateRegistrySecretName
);
pytorchTrialConfig
.
master
.
image
,
'
run_master.sh
'
,
nonWorkerPodResources
,
privateRegistrySecretName
);
...
@@ -448,7 +446,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -448,7 +446,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
resources
:
podResources
resources
:
podResources
}
}
]);
]);
le
t
spec
:
any
=
{
cons
t
spec
:
any
=
{
containers
:
containersSpecMap
.
get
(
'
containers
'
),
containers
:
containersSpecMap
.
get
(
'
containers
'
),
restartPolicy
:
'
ExitCode
'
,
restartPolicy
:
'
ExitCode
'
,
volumes
:
volumeSpecMap
.
get
(
'
nniVolumes
'
)
volumes
:
volumeSpecMap
.
get
(
'
nniVolumes
'
)
...
...
src/nni_manager/training_service/kubernetes/kubernetesApiClient.ts
View file @
1328f412
...
@@ -3,6 +3,7 @@
...
@@ -3,6 +3,7 @@
'
use strict
'
;
'
use strict
'
;
// eslint-disable-next-line @typescript-eslint/camelcase
import
{
Client1_10
,
config
}
from
'
kubernetes-client
'
;
import
{
Client1_10
,
config
}
from
'
kubernetes-client
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
...
@@ -21,7 +22,7 @@ class GeneralK8sClient {
...
@@ -21,7 +22,7 @@ class GeneralK8sClient {
public
async
createSecret
(
secretManifest
:
any
):
Promise
<
boolean
>
{
public
async
createSecret
(
secretManifest
:
any
):
Promise
<
boolean
>
{
let
result
:
Promise
<
boolean
>
;
let
result
:
Promise
<
boolean
>
;
const
response
:
any
=
await
this
.
client
.
api
.
v1
.
namespaces
(
'
default
'
).
secrets
const
response
:
any
=
await
this
.
client
.
api
.
v1
.
namespaces
(
'
default
'
).
secrets
.
post
({
body
:
secretManifest
});
.
post
({
body
:
secretManifest
});
if
(
response
.
statusCode
&&
(
response
.
statusCode
>=
200
&&
response
.
statusCode
<=
299
))
{
if
(
response
.
statusCode
&&
(
response
.
statusCode
>=
200
&&
response
.
statusCode
<=
299
))
{
result
=
Promise
.
resolve
(
true
);
result
=
Promise
.
resolve
(
true
);
...
@@ -73,7 +74,7 @@ abstract class KubernetesCRDClient {
...
@@ -73,7 +74,7 @@ abstract class KubernetesCRDClient {
public
async
createKubernetesJob
(
jobManifest
:
any
):
Promise
<
boolean
>
{
public
async
createKubernetesJob
(
jobManifest
:
any
):
Promise
<
boolean
>
{
let
result
:
Promise
<
boolean
>
;
let
result
:
Promise
<
boolean
>
;
const
response
:
any
=
await
this
.
operator
.
post
({
body
:
jobManifest
});
const
response
:
any
=
await
this
.
operator
.
post
({
body
:
jobManifest
});
if
(
response
.
statusCode
&&
(
response
.
statusCode
>=
200
&&
response
.
statusCode
<=
299
))
{
if
(
response
.
statusCode
&&
(
response
.
statusCode
>=
200
&&
response
.
statusCode
<=
299
))
{
result
=
Promise
.
resolve
(
true
);
result
=
Promise
.
resolve
(
true
);
}
else
{
}
else
{
...
@@ -86,7 +87,7 @@ abstract class KubernetesCRDClient {
...
@@ -86,7 +87,7 @@ abstract class KubernetesCRDClient {
//TODO : replace any
//TODO : replace any
public
async
getKubernetesJob
(
kubeflowJobName
:
string
):
Promise
<
any
>
{
public
async
getKubernetesJob
(
kubeflowJobName
:
string
):
Promise
<
any
>
{
let
result
:
Promise
<
any
>
;
let
result
:
Promise
<
any
>
;
const
response
:
any
=
await
this
.
operator
(
kubeflowJobName
)
const
response
:
any
=
await
this
.
operator
(
kubeflowJobName
)
.
get
();
.
get
();
if
(
response
.
statusCode
&&
(
response
.
statusCode
>=
200
&&
response
.
statusCode
<=
299
))
{
if
(
response
.
statusCode
&&
(
response
.
statusCode
>=
200
&&
response
.
statusCode
<=
299
))
{
result
=
Promise
.
resolve
(
response
.
body
);
result
=
Promise
.
resolve
(
response
.
body
);
...
@@ -104,7 +105,7 @@ abstract class KubernetesCRDClient {
...
@@ -104,7 +105,7 @@ abstract class KubernetesCRDClient {
.
map
((
labelKey
:
string
)
=>
`
${
labelKey
}
=
${
labels
.
get
(
labelKey
)}
`
)
.
map
((
labelKey
:
string
)
=>
`
${
labelKey
}
=
${
labels
.
get
(
labelKey
)}
`
)
.
join
(
'
,
'
);
.
join
(
'
,
'
);
try
{
try
{
const
deleteResult
:
any
=
await
this
.
operator
()
const
deleteResult
:
any
=
await
this
.
operator
()
.
delete
({
.
delete
({
qs
:
{
qs
:
{
labelSelector
:
matchQuery
,
labelSelector
:
matchQuery
,
...
...
src/nni_manager/training_service/kubernetes/kubernetesConfig.ts
View file @
1328f412
...
@@ -113,11 +113,11 @@ export class KubernetesClusterConfigFactory {
...
@@ -113,11 +113,11 @@ export class KubernetesClusterConfigFactory {
*/
*/
export
class
NFSConfig
{
export
class
NFSConfig
{
// IP Adress of NFS server
// IP Adress of NFS server
public
readonly
server
:
string
;
public
readonly
server
:
string
;
// exported NFS path on NFS server
// exported NFS path on NFS server
public
readonly
path
:
string
;
public
readonly
path
:
string
;
constructor
(
server
:
string
,
path
:
string
)
{
constructor
(
server
:
string
,
path
:
string
)
{
this
.
server
=
server
;
this
.
server
=
server
;
this
.
path
=
path
;
this
.
path
=
path
;
}
}
...
@@ -129,11 +129,11 @@ export class NFSConfig {
...
@@ -129,11 +129,11 @@ export class NFSConfig {
*/
*/
export
class
KeyVaultConfig
{
export
class
KeyVaultConfig
{
// The vault-name to specify vault
// The vault-name to specify vault
public
readonly
vaultName
:
string
;
public
readonly
vaultName
:
string
;
// The name to specify private key
// The name to specify private key
public
readonly
name
:
string
;
public
readonly
name
:
string
;
constructor
(
vaultName
:
string
,
name
:
string
)
{
constructor
(
vaultName
:
string
,
name
:
string
)
{
this
.
vaultName
=
vaultName
;
this
.
vaultName
=
vaultName
;
this
.
name
=
name
;
this
.
name
=
name
;
}
}
...
@@ -144,11 +144,11 @@ export class KeyVaultConfig {
...
@@ -144,11 +144,11 @@ export class KeyVaultConfig {
*/
*/
export
class
AzureStorage
{
export
class
AzureStorage
{
// The azure share to storage files
// The azure share to storage files
public
readonly
azureShare
:
string
;
public
readonly
azureShare
:
string
;
// The account name of sotrage service
// The account name of sotrage service
public
readonly
accountName
:
string
;
public
readonly
accountName
:
string
;
constructor
(
azureShare
:
string
,
accountName
:
string
)
{
constructor
(
azureShare
:
string
,
accountName
:
string
)
{
this
.
azureShare
=
azureShare
;
this
.
azureShare
=
azureShare
;
this
.
accountName
=
accountName
;
this
.
accountName
=
accountName
;
}
}
...
@@ -171,12 +171,12 @@ export class KubernetesTrialConfigTemplate {
...
@@ -171,12 +171,12 @@ export class KubernetesTrialConfigTemplate {
public
readonly
privateRegistryAuthPath
?:
string
;
public
readonly
privateRegistryAuthPath
?:
string
;
// Trail command
// Trail command
public
readonly
command
:
string
;
public
readonly
command
:
string
;
// Required GPU number for trial job. The number should be in [0,100]
// Required GPU number for trial job. The number should be in [0,100]
public
readonly
gpuNum
:
number
;
public
readonly
gpuNum
:
number
;
constructor
(
command
:
string
,
gpuNum
:
number
,
constructor
(
command
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
,
privateRegistryAuthPath
?:
string
)
{
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
,
privateRegistryAuthPath
?:
string
)
{
this
.
command
=
command
;
this
.
command
=
command
;
this
.
gpuNum
=
gpuNum
;
this
.
gpuNum
=
gpuNum
;
...
...
src/nni_manager/training_service/kubernetes/kubernetesJobInfoCollector.ts
View file @
1328f412
...
@@ -14,7 +14,7 @@ import { KubernetesTrialJobDetail } from './kubernetesData';
...
@@ -14,7 +14,7 @@ import { KubernetesTrialJobDetail } from './kubernetesData';
* Collector Kubeflow jobs info from Kubernetes cluster, and update kubeflow job status locally
* Collector Kubeflow jobs info from Kubernetes cluster, and update kubeflow job status locally
*/
*/
export
class
KubernetesJobInfoCollector
{
export
class
KubernetesJobInfoCollector
{
protected
readonly
trialJobsMap
:
Map
<
string
,
KubernetesTrialJobDetail
>
;
protected
readonly
trialJobsMap
:
Map
<
string
,
KubernetesTrialJobDetail
>
;
protected
readonly
log
:
Logger
=
getLogger
();
protected
readonly
log
:
Logger
=
getLogger
();
protected
readonly
statusesNeedToCheck
:
TrialJobStatus
[];
protected
readonly
statusesNeedToCheck
:
TrialJobStatus
[];
...
@@ -23,9 +23,9 @@ export class KubernetesJobInfoCollector {
...
@@ -23,9 +23,9 @@ export class KubernetesJobInfoCollector {
this
.
statusesNeedToCheck
=
[
'
RUNNING
'
,
'
WAITING
'
];
this
.
statusesNeedToCheck
=
[
'
RUNNING
'
,
'
WAITING
'
];
}
}
public
async
retrieveTrialStatus
(
kubernetesCRDClient
:
KubernetesCRDClient
|
undefined
)
:
Promise
<
void
>
{
public
async
retrieveTrialStatus
(
kubernetesCRDClient
:
KubernetesCRDClient
|
undefined
):
Promise
<
void
>
{
assert
(
kubernetesCRDClient
!==
undefined
);
assert
(
kubernetesCRDClient
!==
undefined
);
const
updateKubernetesTrialJobs
:
Promise
<
void
>
[]
=
[];
const
updateKubernetesTrialJobs
:
Promise
<
void
>
[]
=
[];
for
(
const
[
trialJobId
,
kubernetesTrialJob
]
of
this
.
trialJobsMap
)
{
for
(
const
[
trialJobId
,
kubernetesTrialJob
]
of
this
.
trialJobsMap
)
{
if
(
kubernetesTrialJob
===
undefined
)
{
if
(
kubernetesTrialJob
===
undefined
)
{
throw
new
NNIError
(
NNIErrorNames
.
NOT_FOUND
,
`trial job id
${
trialJobId
}
not found`
);
throw
new
NNIError
(
NNIErrorNames
.
NOT_FOUND
,
`trial job id
${
trialJobId
}
not found`
);
...
@@ -41,7 +41,7 @@ export class KubernetesJobInfoCollector {
...
@@ -41,7 +41,7 @@ export class KubernetesJobInfoCollector {
}
}
protected
async
retrieveSingleTrialJobInfo
(
kubernetesCRDClient
:
KubernetesCRDClient
|
undefined
,
protected
async
retrieveSingleTrialJobInfo
(
kubernetesCRDClient
:
KubernetesCRDClient
|
undefined
,
kubernetesTrialJob
:
KubernetesTrialJobDetail
)
:
Promise
<
void
>
{
kubernetesTrialJob
:
KubernetesTrialJobDetail
):
Promise
<
void
>
{
throw
new
MethodNotImplementedError
();
throw
new
MethodNotImplementedError
();
}
}
}
}
src/nni_manager/training_service/kubernetes/kubernetesJobRestServer.ts
View file @
1328f412
...
@@ -26,7 +26,7 @@ export class KubernetesJobRestServer extends ClusterJobRestServer {
...
@@ -26,7 +26,7 @@ export class KubernetesJobRestServer extends ClusterJobRestServer {
}
}
// tslint:disable-next-line:no-any
// tslint:disable-next-line:no-any
protected
handleTrialMetrics
(
jobId
:
string
,
metrics
:
any
[])
:
void
{
protected
handleTrialMetrics
(
jobId
:
string
,
metrics
:
any
[]):
void
{
if
(
this
.
kubernetesTrainingService
===
undefined
)
{
if
(
this
.
kubernetesTrainingService
===
undefined
)
{
throw
Error
(
'
kubernetesTrainingService not initialized!
'
);
throw
Error
(
'
kubernetesTrainingService not initialized!
'
);
}
}
...
...
src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts
View file @
1328f412
...
@@ -22,8 +22,7 @@ import { KubernetesClusterConfig } from './kubernetesConfig';
...
@@ -22,8 +22,7 @@ import { KubernetesClusterConfig } from './kubernetesConfig';
import
{
kubernetesScriptFormat
,
KubernetesTrialJobDetail
}
from
'
./kubernetesData
'
;
import
{
kubernetesScriptFormat
,
KubernetesTrialJobDetail
}
from
'
./kubernetesData
'
;
import
{
KubernetesJobRestServer
}
from
'
./kubernetesJobRestServer
'
;
import
{
KubernetesJobRestServer
}
from
'
./kubernetesJobRestServer
'
;
var
yaml
=
require
(
'
js-yaml
'
);
const
fs
=
require
(
'
fs
'
);
var
fs
=
require
(
'
fs
'
);
/**
/**
* Training Service implementation for Kubernetes
* Training Service implementation for Kubernetes
...
@@ -36,7 +35,7 @@ abstract class KubernetesTrainingService {
...
@@ -36,7 +35,7 @@ abstract class KubernetesTrainingService {
// experiment root dir in NFS
// experiment root dir in NFS
protected
readonly
trialLocalNFSTempFolder
:
string
;
protected
readonly
trialLocalNFSTempFolder
:
string
;
protected
stopping
:
boolean
=
false
;
protected
stopping
:
boolean
=
false
;
protected
experimentId
!
:
string
;
protected
experimentId
!
:
string
;
protected
kubernetesRestServerPort
?:
number
;
protected
kubernetesRestServerPort
?:
number
;
protected
readonly
CONTAINER_MOUNT_PATH
:
string
;
protected
readonly
CONTAINER_MOUNT_PATH
:
string
;
protected
azureStorageClient
?:
azureStorage
.
FileService
;
protected
azureStorageClient
?:
azureStorage
.
FileService
;
...
@@ -113,12 +112,12 @@ abstract class KubernetesTrainingService {
...
@@ -113,12 +112,12 @@ abstract class KubernetesTrainingService {
return
Promise
.
resolve
(
''
);
return
Promise
.
resolve
(
''
);
}
}
public
get
MetricsEmitter
()
:
EventEmitter
{
public
get
MetricsEmitter
():
EventEmitter
{
return
this
.
metricsEmitter
;
return
this
.
metricsEmitter
;
}
}
public
async
cancelTrialJob
(
trialJobId
:
string
,
isEarlyStopped
:
boolean
=
false
):
Promise
<
void
>
{
public
async
cancelTrialJob
(
trialJobId
:
string
,
isEarlyStopped
:
boolean
=
false
):
Promise
<
void
>
{
const
trialJobDetail
:
KubernetesTrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
const
trialJobDetail
:
KubernetesTrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
if
(
trialJobDetail
===
undefined
)
{
if
(
trialJobDetail
===
undefined
)
{
const
errorMessage
:
string
=
`CancelTrialJob: trial job id
${
trialJobId
}
not found`
;
const
errorMessage
:
string
=
`CancelTrialJob: trial job id
${
trialJobId
}
not found`
;
this
.
log
.
error
(
errorMessage
);
this
.
log
.
error
(
errorMessage
);
...
@@ -208,7 +207,7 @@ abstract class KubernetesTrainingService {
...
@@ -208,7 +207,7 @@ abstract class KubernetesTrainingService {
}
}
// tslint:disable: no-unsafe-any no-any
// tslint:disable: no-unsafe-any no-any
protected
async
createAzureStorage
(
vaultName
:
string
,
valutKeyName
:
string
,
accountName
:
string
,
azureShare
:
string
):
Promise
<
void
>
{
protected
async
createAzureStorage
(
vaultName
:
string
,
valutKeyName
:
string
):
Promise
<
void
>
{
try
{
try
{
const
result
:
any
=
await
cpp
.
exec
(
`az keyvault secret show --name
${
valutKeyName
}
--vault-name
${
vaultName
}
`
);
const
result
:
any
=
await
cpp
.
exec
(
`az keyvault secret show --name
${
valutKeyName
}
--vault-name
${
vaultName
}
`
);
if
(
result
.
stderr
)
{
if
(
result
.
stderr
)
{
...
@@ -312,8 +311,8 @@ abstract class KubernetesTrainingService {
...
@@ -312,8 +311,8 @@ abstract class KubernetesTrainingService {
if
(
filePath
===
undefined
||
filePath
===
''
)
{
if
(
filePath
===
undefined
||
filePath
===
''
)
{
return
undefined
;
return
undefined
;
}
}
le
t
body
=
fs
.
readFileSync
(
filePath
).
toString
(
'
base64
'
);
cons
t
body
=
fs
.
readFileSync
(
filePath
).
toString
(
'
base64
'
);
le
t
registrySecretName
=
String
.
Format
(
'
nni-secret-{0}
'
,
uniqueString
(
8
)
cons
t
registrySecretName
=
String
.
Format
(
'
nni-secret-{0}
'
,
uniqueString
(
8
)
.
toLowerCase
());
.
toLowerCase
());
await
this
.
genericK8sClient
.
createSecret
(
await
this
.
genericK8sClient
.
createSecret
(
{
{
...
@@ -336,7 +335,7 @@ abstract class KubernetesTrainingService {
...
@@ -336,7 +335,7 @@ abstract class KubernetesTrainingService {
return
registrySecretName
;
return
registrySecretName
;
}
}
protected
async
uploadFilesToAzureStorage
(
trialJobId
:
string
,
trialLocalTempFolder
:
S
tring
,
codeDir
:
S
tring
,
uploadRetryCount
:
number
|
undefined
):
Promise
<
string
>
{
protected
async
uploadFilesToAzureStorage
(
trialJobId
:
string
,
trialLocalTempFolder
:
s
tring
,
codeDir
:
s
tring
,
uploadRetryCount
:
number
|
undefined
):
Promise
<
string
>
{
if
(
this
.
azureStorageClient
===
undefined
)
{
if
(
this
.
azureStorageClient
===
undefined
)
{
throw
new
Error
(
'
azureStorageClient is not initialized
'
);
throw
new
Error
(
'
azureStorageClient is not initialized
'
);
}
}
...
...
src/nni_manager/training_service/local/gpuScheduler.ts
View file @
1328f412
...
@@ -4,11 +4,9 @@
...
@@ -4,11 +4,9 @@
'
use strict
'
;
'
use strict
'
;
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
cp
from
'
child_process
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
os
from
'
os
'
;
import
*
as
os
from
'
os
'
;
import
*
as
path
from
'
path
'
;
import
*
as
path
from
'
path
'
;
import
{
String
}
from
'
typescript-string-operations
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
delay
}
from
'
../../common/utils
'
;
import
{
delay
}
from
'
../../common/utils
'
;
import
{
GPUInfo
,
GPUSummary
}
from
'
../common/gpuData
'
;
import
{
GPUInfo
,
GPUSummary
}
from
'
../common/gpuData
'
;
...
...
src/nni_manager/training_service/local/localTrainingService.ts
View file @
1328f412
...
@@ -107,7 +107,7 @@ class LocalTrainingService implements TrainingService {
...
@@ -107,7 +107,7 @@ class LocalTrainingService implements TrainingService {
private
initialized
:
boolean
;
private
initialized
:
boolean
;
private
stopping
:
boolean
;
private
stopping
:
boolean
;
private
rootDir
!
:
string
;
private
rootDir
!
:
string
;
private
readonly
experimentId
!
:
string
;
private
readonly
experimentId
!
:
string
;
private
gpuScheduler
!
:
GPUScheduler
;
private
gpuScheduler
!
:
GPUScheduler
;
private
readonly
occupiedGpuIndexNumMap
:
Map
<
number
,
number
>
;
private
readonly
occupiedGpuIndexNumMap
:
Map
<
number
,
number
>
;
private
designatedGpuIndices
!
:
Set
<
number
>
;
private
designatedGpuIndices
!
:
Set
<
number
>
;
...
@@ -299,7 +299,7 @@ class LocalTrainingService implements TrainingService {
...
@@ -299,7 +299,7 @@ class LocalTrainingService implements TrainingService {
public
getClusterMetadata
(
key
:
string
):
Promise
<
string
>
{
public
getClusterMetadata
(
key
:
string
):
Promise
<
string
>
{
switch
(
key
)
{
switch
(
key
)
{
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
{
let
getResult
:
Promise
<
string
>
;
let
getResult
:
Promise
<
string
>
;
if
(
this
.
localTrialConfig
===
undefined
)
{
if
(
this
.
localTrialConfig
===
undefined
)
{
getResult
=
Promise
.
reject
(
new
NNIError
(
NNIErrorNames
.
NOT_FOUND
,
`
${
key
}
is never set yet`
));
getResult
=
Promise
.
reject
(
new
NNIError
(
NNIErrorNames
.
NOT_FOUND
,
`
${
key
}
is never set yet`
));
...
@@ -308,6 +308,7 @@ class LocalTrainingService implements TrainingService {
...
@@ -308,6 +308,7 @@ class LocalTrainingService implements TrainingService {
}
}
return
getResult
;
return
getResult
;
}
default
:
default
:
return
Promise
.
reject
(
new
NNIError
(
NNIErrorNames
.
NOT_FOUND
,
'
Key not found
'
));
return
Promise
.
reject
(
new
NNIError
(
NNIErrorNames
.
NOT_FOUND
,
'
Key not found
'
));
}
}
...
...
src/nni_manager/training_service/pai/hdfsClientUtility.ts
View file @
1328f412
...
@@ -48,10 +48,10 @@ export namespace HDFSClientUtility {
...
@@ -48,10 +48,10 @@ export namespace HDFSClientUtility {
* @param hdfsClient hdfs client
* @param hdfsClient hdfs client
*/
*/
// tslint:disable: no-unsafe-any non-literal-fs-path no-any
// tslint:disable: no-unsafe-any non-literal-fs-path no-any
export
async
function
copyFileToHdfs
(
localFilePath
:
string
,
hdfsFilePath
:
string
,
hdfsClient
:
any
)
:
Promise
<
void
>
{
export
async
function
copyFileToHdfs
(
localFilePath
:
string
,
hdfsFilePath
:
string
,
hdfsClient
:
any
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
// tslint:disable-next-line:non-literal-fs-path
// tslint:disable-next-line:non-literal-fs-path
fs
.
exists
(
localFilePath
,
(
exists
:
boolean
)
=>
{
fs
.
exists
(
localFilePath
,
(
exists
:
boolean
)
=>
{
// Detect if local file exist
// Detect if local file exist
if
(
exists
)
{
if
(
exists
)
{
const
localFileStream
:
fs
.
ReadStream
=
fs
.
createReadStream
(
localFilePath
);
const
localFileStream
:
fs
.
ReadStream
=
fs
.
createReadStream
(
localFilePath
);
...
@@ -60,7 +60,7 @@ export namespace HDFSClientUtility {
...
@@ -60,7 +60,7 @@ export namespace HDFSClientUtility {
hdfsFileStream
.
on
(
'
finish
'
,
()
=>
{
hdfsFileStream
.
on
(
'
finish
'
,
()
=>
{
deferred
.
resolve
();
deferred
.
resolve
();
});
});
hdfsFileStream
.
on
(
'
error
'
,
(
err
:
any
)
=>
{
hdfsFileStream
.
on
(
'
error
'
,
(
err
:
any
)
=>
{
getLogger
()
getLogger
()
.
error
(
`HDFSCientUtility:copyFileToHdfs, copy file failed, err is
${
err
.
message
}
`
);
.
error
(
`HDFSCientUtility:copyFileToHdfs, copy file failed, err is
${
err
.
message
}
`
);
deferred
.
reject
(
err
);
deferred
.
reject
(
err
);
...
@@ -82,7 +82,7 @@ export namespace HDFSClientUtility {
...
@@ -82,7 +82,7 @@ export namespace HDFSClientUtility {
* @param hdfsDirectory HDFS directory
* @param hdfsDirectory HDFS directory
* @param hdfsClient HDFS client
* @param hdfsClient HDFS client
*/
*/
export
async
function
copyDirectoryToHdfs
(
localDirectory
:
string
,
hdfsDirectory
:
string
,
hdfsClient
:
any
)
:
Promise
<
void
>
{
export
async
function
copyDirectoryToHdfs
(
localDirectory
:
string
,
hdfsDirectory
:
string
,
hdfsClient
:
any
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
// TODO: fs.readdirSync doesn't support ~($HOME)
// TODO: fs.readdirSync doesn't support ~($HOME)
const
fileNameArray
:
string
[]
=
fs
.
readdirSync
(
localDirectory
);
const
fileNameArray
:
string
[]
=
fs
.
readdirSync
(
localDirectory
);
...
@@ -108,28 +108,51 @@ export namespace HDFSClientUtility {
...
@@ -108,28 +108,51 @@ export namespace HDFSClientUtility {
return
deferred
.
promise
;
return
deferred
.
promise
;
}
}
/**
* Check if an HDFS path already exists
*
* @param hdfsPath target path need to check in HDFS
* @param hdfsClient HDFS client
*/
export
async
function
pathExists
(
hdfsPath
:
string
,
hdfsClient
:
any
):
Promise
<
boolean
>
{
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
hdfsClient
.
exists
(
hdfsPath
,
(
exist
:
boolean
)
=>
{
deferred
.
resolve
(
exist
);
});
let
timeoutId
:
NodeJS
.
Timer
;
const
delayTimeout
:
Promise
<
boolean
>
=
new
Promise
<
boolean
>
((
resolve
:
Function
,
reject
:
Function
):
void
=>
{
// Set timeout and reject the promise once reach timeout (5 seconds)
timeoutId
=
setTimeout
(()
=>
{
reject
(
`Check HDFS path
${
hdfsPath
}
exists timeout`
);
},
5000
);
});
return
Promise
.
race
([
deferred
.
promise
,
delayTimeout
])
.
finally
(()
=>
{
clearTimeout
(
timeoutId
);
});
}
/**
/**
* Read content from HDFS file
* Read content from HDFS file
*
*
* @param hdfsPath HDFS file path
* @param hdfsPath HDFS file path
* @param hdfsClient HDFS client
* @param hdfsClient HDFS client
*/
*/
export
async
function
readFileFromHDFS
(
hdfsPath
:
string
,
hdfsClient
:
any
)
:
Promise
<
Buffer
>
{
export
async
function
readFileFromHDFS
(
hdfsPath
:
string
,
hdfsClient
:
any
):
Promise
<
Buffer
>
{
const
deferred
:
Deferred
<
Buffer
>
=
new
Deferred
<
Buffer
>
();
const
deferred
:
Deferred
<
Buffer
>
=
new
Deferred
<
Buffer
>
();
let
buffer
:
Buffer
=
Buffer
.
alloc
(
0
);
let
buffer
:
Buffer
=
Buffer
.
alloc
(
0
);
const
exist
:
boolean
=
await
pathExists
(
hdfsPath
,
hdfsClient
);
const
exist
:
boolean
=
await
pathExists
(
hdfsPath
,
hdfsClient
);
if
(
!
exist
)
{
if
(
!
exist
)
{
deferred
.
reject
(
`
${
hdfsPath
}
doesn't exists`
);
deferred
.
reject
(
`
${
hdfsPath
}
doesn't exists`
);
}
}
const
remoteFileStream
:
any
=
hdfsClient
.
createReadStream
(
hdfsPath
);
const
remoteFileStream
:
any
=
hdfsClient
.
createReadStream
(
hdfsPath
);
remoteFileStream
.
on
(
'
error
'
,
(
err
:
any
)
=>
{
remoteFileStream
.
on
(
'
error
'
,
(
err
:
any
)
=>
{
// Reject with the error
// Reject with the error
deferred
.
reject
(
err
);
deferred
.
reject
(
err
);
});
});
remoteFileStream
.
on
(
'
data
'
,
(
chunk
:
any
)
=>
{
remoteFileStream
.
on
(
'
data
'
,
(
chunk
:
any
)
=>
{
// Concat the data chunk to buffer
// Concat the data chunk to buffer
buffer
=
Buffer
.
concat
([
buffer
,
chunk
]);
buffer
=
Buffer
.
concat
([
buffer
,
chunk
]);
});
});
...
@@ -142,39 +165,16 @@ export namespace HDFSClientUtility {
...
@@ -142,39 +165,16 @@ export namespace HDFSClientUtility {
return
deferred
.
promise
;
return
deferred
.
promise
;
}
}
/**
* Check if an HDFS path already exists
*
* @param hdfsPath target path need to check in HDFS
* @param hdfsClient HDFS client
*/
export
async
function
pathExists
(
hdfsPath
:
string
,
hdfsClient
:
any
)
:
Promise
<
boolean
>
{
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
hdfsClient
.
exists
(
hdfsPath
,
(
exist
:
boolean
)
=>
{
deferred
.
resolve
(
exist
);
});
let
timeoutId
:
NodeJS
.
Timer
;
const
delayTimeout
:
Promise
<
boolean
>
=
new
Promise
<
boolean
>
((
resolve
:
Function
,
reject
:
Function
)
:
void
=>
{
// Set timeout and reject the promise once reach timeout (5 seconds)
timeoutId
=
setTimeout
(()
=>
{
reject
(
`Check HDFS path
${
hdfsPath
}
exists timeout`
);
},
5000
);
});
return
Promise
.
race
([
deferred
.
promise
,
delayTimeout
])
.
finally
(()
=>
{
clearTimeout
(
timeoutId
);
});
}
/**
/**
* Mkdir in HDFS, use default permission 755
* Mkdir in HDFS, use default permission 755
*
*
* @param hdfsPath the path in HDFS. It could be either file or directory
* @param hdfsPath the path in HDFS. It could be either file or directory
* @param hdfsClient HDFS client
* @param hdfsClient HDFS client
*/
*/
export
function
mkdir
(
hdfsPath
:
string
,
hdfsClient
:
any
)
:
Promise
<
boolean
>
{
export
function
mkdir
(
hdfsPath
:
string
,
hdfsClient
:
any
):
Promise
<
boolean
>
{
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
hdfsClient
.
mkdir
(
hdfsPath
,
(
err
:
any
)
=>
{
hdfsClient
.
mkdir
(
hdfsPath
,
(
err
:
any
)
=>
{
if
(
!
err
)
{
if
(
!
err
)
{
deferred
.
resolve
(
true
);
deferred
.
resolve
(
true
);
}
else
{
}
else
{
...
@@ -191,14 +191,14 @@ export namespace HDFSClientUtility {
...
@@ -191,14 +191,14 @@ export namespace HDFSClientUtility {
* @param hdfsPath the path in HDFS. It could be either file or directory
* @param hdfsPath the path in HDFS. It could be either file or directory
* @param hdfsClient HDFS client
* @param hdfsClient HDFS client
*/
*/
export
async
function
readdir
(
hdfsPath
:
string
,
hdfsClient
:
any
)
:
Promise
<
string
[]
>
{
export
async
function
readdir
(
hdfsPath
:
string
,
hdfsClient
:
any
):
Promise
<
string
[]
>
{
const
deferred
:
Deferred
<
string
[]
>
=
new
Deferred
<
string
[]
>
();
const
deferred
:
Deferred
<
string
[]
>
=
new
Deferred
<
string
[]
>
();
const
exist
:
boolean
=
await
pathExists
(
hdfsPath
,
hdfsClient
);
const
exist
:
boolean
=
await
pathExists
(
hdfsPath
,
hdfsClient
);
if
(
!
exist
)
{
if
(
!
exist
)
{
deferred
.
reject
(
`
${
hdfsPath
}
doesn't exists`
);
deferred
.
reject
(
`
${
hdfsPath
}
doesn't exists`
);
}
}
hdfsClient
.
readdir
(
hdfsPath
,
(
err
:
any
,
files
:
any
[])
=>
{
hdfsClient
.
readdir
(
hdfsPath
,
(
err
:
any
,
files
:
any
[])
=>
{
if
(
err
)
{
if
(
err
)
{
deferred
.
reject
(
err
);
deferred
.
reject
(
err
);
}
}
...
@@ -215,9 +215,9 @@ export namespace HDFSClientUtility {
...
@@ -215,9 +215,9 @@ export namespace HDFSClientUtility {
* @param hdfsClient HDFS client
* @param hdfsClient HDFS client
* @param recursive Mark if need to delete recursively
* @param recursive Mark if need to delete recursively
*/
*/
export
function
deletePath
(
hdfsPath
:
string
,
hdfsClient
:
any
,
recursive
:
boolean
=
true
)
:
Promise
<
boolean
>
{
export
function
deletePath
(
hdfsPath
:
string
,
hdfsClient
:
any
,
recursive
:
boolean
=
true
):
Promise
<
boolean
>
{
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
hdfsClient
.
unlink
(
hdfsPath
,
recursive
,
(
err
:
any
)
=>
{
hdfsClient
.
unlink
(
hdfsPath
,
recursive
,
(
err
:
any
)
=>
{
if
(
!
err
)
{
if
(
!
err
)
{
deferred
.
resolve
(
true
);
deferred
.
resolve
(
true
);
}
else
{
}
else
{
...
...
src/nni_manager/training_service/pai/paiConfig.ts
View file @
1328f412
...
@@ -24,7 +24,7 @@ export class PAITaskRole {
...
@@ -24,7 +24,7 @@ export class PAITaskRole {
//Shared memory for one task in the task role
//Shared memory for one task in the task role
public
readonly
shmMB
?:
number
;
public
readonly
shmMB
?:
number
;
//portList to specify the port used in container
//portList to specify the port used in container
public
portList
?:
p
ortListMetaData
[];
public
portList
?:
P
ortListMetaData
[];
/**
/**
* Constructor
* Constructor
...
@@ -35,8 +35,8 @@ export class PAITaskRole {
...
@@ -35,8 +35,8 @@ export class PAITaskRole {
* @param gpuNumber GPU number for one task in the task role, no less than 0
* @param gpuNumber GPU number for one task in the task role, no less than 0
* @param command Executable command for tasks in the task role, can not be empty
* @param command Executable command for tasks in the task role, can not be empty
*/
*/
constructor
(
name
:
string
,
taskNumber
:
number
,
cpuNumber
:
number
,
memoryMB
:
number
,
gpuNumber
:
number
,
constructor
(
name
:
string
,
taskNumber
:
number
,
cpuNumber
:
number
,
memoryMB
:
number
,
gpuNumber
:
number
,
command
:
string
,
shmMB
?:
number
,
portList
?:
p
ortListMetaData
[])
{
command
:
string
,
shmMB
?:
number
,
portList
?:
P
ortListMetaData
[])
{
this
.
name
=
name
;
this
.
name
=
name
;
this
.
taskNumber
=
taskNumber
;
this
.
taskNumber
=
taskNumber
;
this
.
cpuNumber
=
cpuNumber
;
this
.
cpuNumber
=
cpuNumber
;
...
@@ -75,8 +75,8 @@ export class PAIJobConfig {
...
@@ -75,8 +75,8 @@ export class PAIJobConfig {
* @param outputDir Output directory on HDFS
* @param outputDir Output directory on HDFS
* @param taskRoles List of taskRole, one task role at least
* @param taskRoles List of taskRole, one task role at least
*/
*/
constructor
(
jobName
:
string
,
image
:
string
,
codeDir
:
string
,
constructor
(
jobName
:
string
,
image
:
string
,
codeDir
:
string
,
taskRoles
:
PAITaskRole
[],
virtualCluster
:
string
,
authFile
?:
string
)
{
taskRoles
:
PAITaskRole
[],
virtualCluster
:
string
,
authFile
?:
string
)
{
this
.
jobName
=
jobName
;
this
.
jobName
=
jobName
;
this
.
image
=
image
;
this
.
image
=
image
;
this
.
codeDir
=
codeDir
;
this
.
codeDir
=
codeDir
;
...
@@ -102,7 +102,7 @@ export class PAIClusterConfig {
...
@@ -102,7 +102,7 @@ export class PAIClusterConfig {
* @param host Host IP of PAI Cluster
* @param host Host IP of PAI Cluster
* @param token PAI token of PAI Cluster
* @param token PAI token of PAI Cluster
*/
*/
constructor
(
userName
:
string
,
host
:
string
,
passWord
?:
string
,
token
?:
string
)
{
constructor
(
userName
:
string
,
host
:
string
,
passWord
?:
string
,
token
?:
string
)
{
this
.
userName
=
userName
;
this
.
userName
=
userName
;
this
.
passWord
=
passWord
;
this
.
passWord
=
passWord
;
this
.
host
=
host
;
this
.
host
=
host
;
...
@@ -113,8 +113,8 @@ export class PAIClusterConfig {
...
@@ -113,8 +113,8 @@ export class PAIClusterConfig {
/**
/**
* portList data structure used in PAI taskRole
* portList data structure used in PAI taskRole
*/
*/
export
class
p
ortListMetaData
{
export
class
P
ortListMetaData
{
public
readonly
label
:
string
=
''
;
public
readonly
label
:
string
=
''
;
public
readonly
beginAt
:
number
=
0
;
public
readonly
beginAt
:
number
=
0
;
public
readonly
portNumber
:
number
=
0
;
public
readonly
portNumber
:
number
=
0
;
}
}
...
@@ -135,10 +135,10 @@ export class NNIPAITrialConfig extends TrialConfig {
...
@@ -135,10 +135,10 @@ export class NNIPAITrialConfig extends TrialConfig {
//authentication file used for private Docker registry
//authentication file used for private Docker registry
public
authFile
?:
string
;
public
authFile
?:
string
;
//portList to specify the port used in container
//portList to specify the port used in container
public
portList
?:
p
ortListMetaData
[];
public
portList
?:
P
ortListMetaData
[];
constructor
(
command
:
string
,
codeDir
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
constructor
(
command
:
string
,
codeDir
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
,
virtualCluster
?:
string
,
shmMB
?:
number
,
authFile
?:
string
,
portList
?:
p
ortListMetaData
[])
{
image
:
string
,
virtualCluster
?:
string
,
shmMB
?:
number
,
authFile
?:
string
,
portList
?:
P
ortListMetaData
[])
{
super
(
command
,
codeDir
,
gpuNum
);
super
(
command
,
codeDir
,
gpuNum
);
this
.
cpuNum
=
cpuNum
;
this
.
cpuNum
=
cpuNum
;
this
.
memoryMB
=
memoryMB
;
this
.
memoryMB
=
memoryMB
;
...
...
src/nni_manager/training_service/pai/paiData.ts
View file @
1328f412
...
@@ -22,7 +22,7 @@ export class PAITrialJobDetail implements TrialJobDetail {
...
@@ -22,7 +22,7 @@ export class PAITrialJobDetail implements TrialJobDetail {
public
hdfsLogPath
:
string
;
public
hdfsLogPath
:
string
;
public
isEarlyStopped
?:
boolean
;
public
isEarlyStopped
?:
boolean
;
constructor
(
id
:
string
,
status
:
TrialJobStatus
,
paiJobName
:
string
,
constructor
(
id
:
string
,
status
:
TrialJobStatus
,
paiJobName
:
string
,
submitTime
:
number
,
workingDirectory
:
string
,
form
:
TrialJobApplicationForm
,
hdfsLogPath
:
string
)
{
submitTime
:
number
,
workingDirectory
:
string
,
form
:
TrialJobApplicationForm
,
hdfsLogPath
:
string
)
{
this
.
id
=
id
;
this
.
id
=
id
;
this
.
status
=
status
;
this
.
status
=
status
;
...
...
src/nni_manager/training_service/pai/paiJobInfoCollector.ts
View file @
1328f412
...
@@ -16,10 +16,10 @@ import { PAITrialJobDetail } from './paiData';
...
@@ -16,10 +16,10 @@ import { PAITrialJobDetail } from './paiData';
* Collector PAI jobs info from PAI cluster, and update pai job status locally
* Collector PAI jobs info from PAI cluster, and update pai job status locally
*/
*/
export
class
PAIJobInfoCollector
{
export
class
PAIJobInfoCollector
{
private
readonly
trialJobsMap
:
Map
<
string
,
PAITrialJobDetail
>
;
private
readonly
trialJobsMap
:
Map
<
string
,
PAITrialJobDetail
>
;
private
readonly
log
:
Logger
=
getLogger
();
private
readonly
log
:
Logger
=
getLogger
();
private
readonly
statusesNeedToCheck
:
TrialJobStatus
[];
private
readonly
statusesNeedToCheck
:
TrialJobStatus
[];
private
readonly
finalStatuses
:
TrialJobStatus
[];
private
readonly
finalStatuses
:
TrialJobStatus
[];
constructor
(
jobMap
:
Map
<
string
,
PAITrialJobDetail
>
)
{
constructor
(
jobMap
:
Map
<
string
,
PAITrialJobDetail
>
)
{
this
.
trialJobsMap
=
jobMap
;
this
.
trialJobsMap
=
jobMap
;
...
@@ -27,12 +27,12 @@ export class PAIJobInfoCollector {
...
@@ -27,12 +27,12 @@ export class PAIJobInfoCollector {
this
.
finalStatuses
=
[
'
SUCCEEDED
'
,
'
FAILED
'
,
'
USER_CANCELED
'
,
'
SYS_CANCELED
'
,
'
EARLY_STOPPED
'
];
this
.
finalStatuses
=
[
'
SUCCEEDED
'
,
'
FAILED
'
,
'
USER_CANCELED
'
,
'
SYS_CANCELED
'
,
'
EARLY_STOPPED
'
];
}
}
public
async
retrieveTrialStatus
(
paiToken
?
:
string
,
paiClusterConfig
?:
PAIClusterConfig
)
:
Promise
<
void
>
{
public
async
retrieveTrialStatus
(
paiToken
?
:
string
,
paiClusterConfig
?:
PAIClusterConfig
):
Promise
<
void
>
{
if
(
paiClusterConfig
===
undefined
||
paiToken
===
undefined
)
{
if
(
paiClusterConfig
===
undefined
||
paiToken
===
undefined
)
{
return
Promise
.
resolve
();
return
Promise
.
resolve
();
}
}
const
updatePaiTrialJobs
:
Promise
<
void
>
[]
=
[];
const
updatePaiTrialJobs
:
Promise
<
void
>
[]
=
[];
for
(
const
[
trialJobId
,
paiTrialJob
]
of
this
.
trialJobsMap
)
{
for
(
const
[
trialJobId
,
paiTrialJob
]
of
this
.
trialJobsMap
)
{
if
(
paiTrialJob
===
undefined
)
{
if
(
paiTrialJob
===
undefined
)
{
throw
new
NNIError
(
NNIErrorNames
.
NOT_FOUND
,
`trial job id
${
trialJobId
}
not found`
);
throw
new
NNIError
(
NNIErrorNames
.
NOT_FOUND
,
`trial job id
${
trialJobId
}
not found`
);
...
@@ -43,9 +43,8 @@ export class PAIJobInfoCollector {
...
@@ -43,9 +43,8 @@ export class PAIJobInfoCollector {
await
Promise
.
all
(
updatePaiTrialJobs
);
await
Promise
.
all
(
updatePaiTrialJobs
);
}
}
private
getSinglePAITrialJobInfo
(
paiTrialJob
:
PAITrialJobDetail
,
paiToken
:
string
,
paiClusterConfig
:
PAIClusterConfig
)
private
getSinglePAITrialJobInfo
(
paiTrialJob
:
PAITrialJobDetail
,
paiToken
:
string
,
paiClusterConfig
:
PAIClusterConfig
):
Promise
<
void
>
{
:
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
if
(
!
this
.
statusesNeedToCheck
.
includes
(
paiTrialJob
.
status
))
{
if
(
!
this
.
statusesNeedToCheck
.
includes
(
paiTrialJob
.
status
))
{
deferred
.
resolve
();
deferred
.
resolve
();
...
...
src/nni_manager/training_service/pai/paiJobRestServer.ts
View file @
1328f412
...
@@ -24,7 +24,7 @@ export class PAIJobRestServer extends ClusterJobRestServer {
...
@@ -24,7 +24,7 @@ export class PAIJobRestServer extends ClusterJobRestServer {
private
parameterFileMetaList
:
ParameterFileMeta
[]
=
[];
private
parameterFileMetaList
:
ParameterFileMeta
[]
=
[];
@
Inject
@
Inject
private
readonly
paiTrainingService
:
PAITrainingService
;
private
readonly
paiTrainingService
:
PAITrainingService
;
/**
/**
* constructor to provide NNIRestServer's own rest property, e.g. port
* constructor to provide NNIRestServer's own rest property, e.g. port
...
@@ -35,7 +35,7 @@ export class PAIJobRestServer extends ClusterJobRestServer {
...
@@ -35,7 +35,7 @@ export class PAIJobRestServer extends ClusterJobRestServer {
}
}
// tslint:disable-next-line:no-any
// tslint:disable-next-line:no-any
protected
handleTrialMetrics
(
jobId
:
string
,
metrics
:
any
[])
:
void
{
protected
handleTrialMetrics
(
jobId
:
string
,
metrics
:
any
[]):
void
{
// Split metrics array into single metric, then emit
// Split metrics array into single metric, then emit
// Warning: If not split metrics into single ones, the behavior will be UNKNOWN
// Warning: If not split metrics into single ones, the behavior will be UNKNOWN
for
(
const
singleMetric
of
metrics
)
{
for
(
const
singleMetric
of
metrics
)
{
...
...
src/nni_manager/training_service/pai/paiTrainingService.ts
View file @
1328f412
...
@@ -3,7 +3,6 @@
...
@@ -3,7 +3,6 @@
'
use strict
'
;
'
use strict
'
;
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
path
from
'
path
'
;
import
*
as
path
from
'
path
'
;
// tslint:disable-next-line:no-implicit-dependencies
// tslint:disable-next-line:no-implicit-dependencies
...
@@ -13,7 +12,6 @@ import * as component from '../../common/component';
...
@@ -13,7 +12,6 @@ import * as component from '../../common/component';
import
{
EventEmitter
}
from
'
events
'
;
import
{
EventEmitter
}
from
'
events
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
String
}
from
'
typescript-string-operations
'
;
import
{
String
}
from
'
typescript-string-operations
'
;
import
{
MethodNotImplementedError
}
from
'
../../common/errors
'
;
import
{
getExperimentId
}
from
'
../../common/experimentStartupInfo
'
;
import
{
getExperimentId
}
from
'
../../common/experimentStartupInfo
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
import
{
...
@@ -52,8 +50,8 @@ class PAITrainingService implements TrainingService {
...
@@ -52,8 +50,8 @@ class PAITrainingService implements TrainingService {
private
paiToken
?
:
string
;
private
paiToken
?
:
string
;
private
paiTokenUpdateTime
?:
number
;
private
paiTokenUpdateTime
?:
number
;
private
readonly
paiTokenUpdateInterval
:
number
;
private
readonly
paiTokenUpdateInterval
:
number
;
private
readonly
experimentId
!
:
string
;
private
readonly
experimentId
!
:
string
;
private
readonly
paiJobCollector
:
PAIJobInfoCollector
;
private
readonly
paiJobCollector
:
PAIJobInfoCollector
;
private
paiRestServerPort
?:
number
;
private
paiRestServerPort
?:
number
;
private
nniManagerIpConfig
?:
NNIManagerIpConfig
;
private
nniManagerIpConfig
?:
NNIManagerIpConfig
;
private
copyExpCodeDirPromise
?:
Promise
<
void
>
;
private
copyExpCodeDirPromise
?:
Promise
<
void
>
;
...
@@ -126,7 +124,7 @@ class PAITrainingService implements TrainingService {
...
@@ -126,7 +124,7 @@ class PAITrainingService implements TrainingService {
if
(
this
.
paiClusterConfig
===
undefined
)
{
if
(
this
.
paiClusterConfig
===
undefined
)
{
throw
new
Error
(
`paiClusterConfig not initialized!`
);
throw
new
Error
(
`paiClusterConfig not initialized!`
);
}
}
const
deferred
:
Deferred
<
PAITrialJobDetail
>
=
new
Deferred
<
PAITrialJobDetail
>
();
const
deferred
:
Deferred
<
PAITrialJobDetail
>
=
new
Deferred
<
PAITrialJobDetail
>
();
this
.
log
.
info
(
`submitTrialJob: form:
${
JSON
.
stringify
(
form
)}
`
);
this
.
log
.
info
(
`submitTrialJob: form:
${
JSON
.
stringify
(
form
)}
`
);
...
@@ -137,7 +135,7 @@ class PAITrainingService implements TrainingService {
...
@@ -137,7 +135,7 @@ class PAITrainingService implements TrainingService {
const
hdfsCodeDir
:
string
=
HDFSClientUtility
.
getHdfsTrialWorkDir
(
this
.
paiClusterConfig
.
userName
,
trialJobId
);
const
hdfsCodeDir
:
string
=
HDFSClientUtility
.
getHdfsTrialWorkDir
(
this
.
paiClusterConfig
.
userName
,
trialJobId
);
const
hdfsOutputDir
:
string
=
unixPathJoin
(
hdfsCodeDir
,
'
nnioutput
'
);
const
hdfsOutputDir
:
string
=
unixPathJoin
(
hdfsCodeDir
,
'
nnioutput
'
);
const
hdfsLogPath
:
string
=
String
.
Format
(
const
hdfsLogPath
:
string
=
String
.
Format
(
PAI_LOG_PATH_FORMAT
,
PAI_LOG_PATH_FORMAT
,
this
.
paiClusterConfig
.
host
,
this
.
paiClusterConfig
.
host
,
hdfsOutputDir
hdfsOutputDir
...
@@ -175,8 +173,8 @@ class PAITrainingService implements TrainingService {
...
@@ -175,8 +173,8 @@ class PAITrainingService implements TrainingService {
// tslint:disable:no-http-string
// tslint:disable:no-http-string
public
cancelTrialJob
(
trialJobId
:
string
,
isEarlyStopped
:
boolean
=
false
):
Promise
<
void
>
{
public
cancelTrialJob
(
trialJobId
:
string
,
isEarlyStopped
:
boolean
=
false
):
Promise
<
void
>
{
const
trialJobDetail
:
PAITrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
const
trialJobDetail
:
PAITrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
if
(
trialJobDetail
===
undefined
)
{
if
(
trialJobDetail
===
undefined
)
{
this
.
log
.
error
(
`cancelTrialJob: trial job id
${
trialJobId
}
not found`
);
this
.
log
.
error
(
`cancelTrialJob: trial job id
${
trialJobId
}
not found`
);
...
@@ -222,7 +220,7 @@ class PAITrainingService implements TrainingService {
...
@@ -222,7 +220,7 @@ class PAITrainingService implements TrainingService {
// tslint:disable: no-unsafe-any no-any
// tslint:disable: no-unsafe-any no-any
// tslint:disable-next-line:max-func-body-length
// tslint:disable-next-line:max-func-body-length
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
switch
(
key
)
{
switch
(
key
)
{
case
TrialConfigMetadataKey
.
NNI_MANAGER_IP
:
case
TrialConfigMetadataKey
.
NNI_MANAGER_IP
:
...
@@ -303,7 +301,7 @@ class PAITrainingService implements TrainingService {
...
@@ -303,7 +301,7 @@ class PAITrainingService implements TrainingService {
// tslint:enable: no-unsafe-any
// tslint:enable: no-unsafe-any
public
getClusterMetadata
(
key
:
string
):
Promise
<
string
>
{
public
getClusterMetadata
(
key
:
string
):
Promise
<
string
>
{
const
deferred
:
Deferred
<
string
>
=
new
Deferred
<
string
>
();
const
deferred
:
Deferred
<
string
>
=
new
Deferred
<
string
>
();
deferred
.
resolve
();
deferred
.
resolve
();
...
@@ -314,7 +312,7 @@ class PAITrainingService implements TrainingService {
...
@@ -314,7 +312,7 @@ class PAITrainingService implements TrainingService {
this
.
log
.
info
(
'
Stopping PAI training service...
'
);
this
.
log
.
info
(
'
Stopping PAI training service...
'
);
this
.
stopping
=
true
;
this
.
stopping
=
true
;
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
restServer
:
PAIJobRestServer
=
component
.
get
(
PAIJobRestServer
);
const
restServer
:
PAIJobRestServer
=
component
.
get
(
PAIJobRestServer
);
try
{
try
{
await
restServer
.
stop
();
await
restServer
.
stop
();
...
@@ -329,13 +327,13 @@ class PAITrainingService implements TrainingService {
...
@@ -329,13 +327,13 @@ class PAITrainingService implements TrainingService {
return
deferred
.
promise
;
return
deferred
.
promise
;
}
}
public
get
MetricsEmitter
()
:
EventEmitter
{
public
get
MetricsEmitter
():
EventEmitter
{
return
this
.
metricsEmitter
;
return
this
.
metricsEmitter
;
}
}
// tslint:disable-next-line:max-func-body-length
// tslint:disable-next-line:max-func-body-length
private
async
submitTrialJobToPAI
(
trialJobId
:
string
):
Promise
<
boolean
>
{
private
async
submitTrialJobToPAI
(
trialJobId
:
string
):
Promise
<
boolean
>
{
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
const
trialJobDetail
:
PAITrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
const
trialJobDetail
:
PAITrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
if
(
trialJobDetail
===
undefined
)
{
if
(
trialJobDetail
===
undefined
)
{
...
@@ -372,7 +370,7 @@ class PAITrainingService implements TrainingService {
...
@@ -372,7 +370,7 @@ class PAITrainingService implements TrainingService {
//create tmp trial working folder locally.
//create tmp trial working folder locally.
await
execMkdir
(
trialLocalTempFolder
);
await
execMkdir
(
trialLocalTempFolder
);
const
runScriptContent
:
string
=
CONTAINER_INSTALL_NNI_SHELL_FORMAT
;
const
runScriptContent
:
string
=
CONTAINER_INSTALL_NNI_SHELL_FORMAT
;
// Write NNI installation file to local tmp files
// Write NNI installation file to local tmp files
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
install_nni.sh
'
),
runScriptContent
,
{
encoding
:
'
utf8
'
});
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
install_nni.sh
'
),
runScriptContent
,
{
encoding
:
'
utf8
'
});
...
@@ -388,7 +386,7 @@ class PAITrainingService implements TrainingService {
...
@@ -388,7 +386,7 @@ class PAITrainingService implements TrainingService {
// tslint:disable-next-line: strict-boolean-expressions
// tslint:disable-next-line: strict-boolean-expressions
const
nniManagerIp
:
string
=
this
.
nniManagerIpConfig
?
this
.
nniManagerIpConfig
.
nniManagerIp
:
getIPV4Address
();
const
nniManagerIp
:
string
=
this
.
nniManagerIpConfig
?
this
.
nniManagerIpConfig
.
nniManagerIp
:
getIPV4Address
();
const
version
:
string
=
this
.
versionCheck
?
await
getVersion
()
:
''
;
const
version
:
string
=
this
.
versionCheck
?
await
getVersion
()
:
''
;
const
nniPaiTrialCommand
:
string
=
String
.
Format
(
const
nniPaiTrialCommand
:
string
=
String
.
Format
(
PAI_TRIAL_COMMAND_FORMAT
,
PAI_TRIAL_COMMAND_FORMAT
,
// PAI will copy job's codeDir into /root directory
// PAI will copy job's codeDir into /root directory
`$PWD/
${
trialJobId
}
`
,
`$PWD/
${
trialJobId
}
`
,
...
@@ -411,7 +409,7 @@ class PAITrainingService implements TrainingService {
...
@@ -411,7 +409,7 @@ class PAITrainingService implements TrainingService {
// tslint:disable-next-line:no-console
// tslint:disable-next-line:no-console
this
.
log
.
info
(
`nniPAItrial command is
${
nniPaiTrialCommand
.
trim
()}
`
);
this
.
log
.
info
(
`nniPAItrial command is
${
nniPaiTrialCommand
.
trim
()}
`
);
const
paiTaskRoles
:
PAITaskRole
[]
=
[
const
paiTaskRoles
:
PAITaskRole
[]
=
[
new
PAITaskRole
(
new
PAITaskRole
(
`nni_trail_
${
trialJobId
}
`
,
`nni_trail_
${
trialJobId
}
`
,
// Task role number
// Task role number
...
@@ -431,7 +429,7 @@ class PAITrainingService implements TrainingService {
...
@@ -431,7 +429,7 @@ class PAITrainingService implements TrainingService {
)
)
];
];
const
paiJobConfig
:
PAIJobConfig
=
new
PAIJobConfig
(
const
paiJobConfig
:
PAIJobConfig
=
new
PAIJobConfig
(
// Job name
// Job name
trialJobDetail
.
paiJobName
,
trialJobDetail
.
paiJobName
,
// Docker image
// Docker image
...
@@ -472,7 +470,7 @@ class PAITrainingService implements TrainingService {
...
@@ -472,7 +470,7 @@ class PAITrainingService implements TrainingService {
// tslint:disable:no-any no-unsafe-any
// tslint:disable:no-any no-unsafe-any
request
(
submitJobRequest
,
(
error
:
Error
,
response
:
request
.
Response
,
body
:
any
)
=>
{
request
(
submitJobRequest
,
(
error
:
Error
,
response
:
request
.
Response
,
body
:
any
)
=>
{
if
((
error
!==
undefined
&&
error
!==
null
)
||
response
.
statusCode
>=
400
)
{
if
((
error
!==
undefined
&&
error
!==
null
)
||
response
.
statusCode
>=
400
)
{
const
errorMessage
:
string
=
(
error
!==
undefined
&&
error
!==
null
)
?
error
.
message
:
const
errorMessage
:
string
=
(
error
!==
undefined
&&
error
!==
null
)
?
error
.
message
:
`Submit trial
${
trialJobId
}
failed, http code:
${
response
.
statusCode
}
, http body:
${
response
.
body
.
message
}
`
;
`Submit trial
${
trialJobId
}
failed, http code:
${
response
.
statusCode
}
, http body:
${
response
.
body
.
message
}
`
;
trialJobDetail
.
status
=
'
FAILED
'
;
trialJobDetail
.
status
=
'
FAILED
'
;
deferred
.
resolve
(
true
);
deferred
.
resolve
(
true
);
...
@@ -527,7 +525,7 @@ class PAITrainingService implements TrainingService {
...
@@ -527,7 +525,7 @@ class PAITrainingService implements TrainingService {
* Update pai token by the interval time or initialize the pai token
* Update pai token by the interval time or initialize the pai token
*/
*/
private
async
updatePaiToken
():
Promise
<
void
>
{
private
async
updatePaiToken
():
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
currentTime
:
number
=
new
Date
().
getTime
();
const
currentTime
:
number
=
new
Date
().
getTime
();
//If pai token initialized and not reach the interval time, do not update
//If pai token initialized and not reach the interval time, do not update
...
@@ -603,7 +601,7 @@ class PAITrainingService implements TrainingService {
...
@@ -603,7 +601,7 @@ class PAITrainingService implements TrainingService {
}
}
private
postParameterFileMeta
(
parameterFileMeta
:
ParameterFileMeta
):
Promise
<
void
>
{
private
postParameterFileMeta
(
parameterFileMeta
:
ParameterFileMeta
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
restServer
:
PAIJobRestServer
=
component
.
get
(
PAIJobRestServer
);
const
restServer
:
PAIJobRestServer
=
component
.
get
(
PAIJobRestServer
);
const
req
:
request
.
Options
=
{
const
req
:
request
.
Options
=
{
uri
:
`
${
restServer
.
endPoint
}${
restServer
.
apiRootUrl
}
/parameter-file-meta`
,
uri
:
`
${
restServer
.
endPoint
}${
restServer
.
apiRootUrl
}
/parameter-file-meta`
,
...
...
src/nni_manager/training_service/pai/paiTrialConfig.ts
View file @
1328f412
...
@@ -15,7 +15,7 @@ export class PAITrialConfig extends TrialConfig {
...
@@ -15,7 +15,7 @@ export class PAITrialConfig extends TrialConfig {
public
readonly
dataDir
:
string
;
public
readonly
dataDir
:
string
;
public
readonly
outputDir
:
string
;
public
readonly
outputDir
:
string
;
constructor
(
command
:
string
,
codeDir
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
constructor
(
command
:
string
,
codeDir
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
,
dataDir
:
string
,
outputDir
:
string
)
{
image
:
string
,
dataDir
:
string
,
outputDir
:
string
)
{
super
(
command
,
codeDir
,
gpuNum
);
super
(
command
,
codeDir
,
gpuNum
);
this
.
cpuNum
=
cpuNum
;
this
.
cpuNum
=
cpuNum
;
...
...
src/nni_manager/training_service/remote_machine/gpuScheduler.ts
View file @
1328f412
...
@@ -5,7 +5,6 @@
...
@@ -5,7 +5,6 @@
import
*
as
assert
from
'
assert
'
;
import
*
as
assert
from
'
assert
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
TrialJobDetail
}
from
'
../../common/trainingService
'
;
import
{
randomSelect
}
from
'
../../common/utils
'
;
import
{
randomSelect
}
from
'
../../common/utils
'
;
import
{
GPUInfo
}
from
'
../common/gpuData
'
;
import
{
GPUInfo
}
from
'
../common/gpuData
'
;
import
{
import
{
...
@@ -19,7 +18,7 @@ type SCHEDULE_POLICY_NAME = 'random' | 'round-robin';
...
@@ -19,7 +18,7 @@ type SCHEDULE_POLICY_NAME = 'random' | 'round-robin';
*/
*/
export
class
GPUScheduler
{
export
class
GPUScheduler
{
private
readonly
machineSSHClientMap
:
Map
<
RemoteMachineMeta
,
SSHClientManager
>
;
private
readonly
machineSSHClientMap
:
Map
<
RemoteMachineMeta
,
SSHClientManager
>
;
private
readonly
log
:
Logger
=
getLogger
();
private
readonly
log
:
Logger
=
getLogger
();
private
readonly
policyName
:
SCHEDULE_POLICY_NAME
=
'
round-robin
'
;
private
readonly
policyName
:
SCHEDULE_POLICY_NAME
=
'
round-robin
'
;
private
roundRobinIndex
:
number
=
0
;
private
roundRobinIndex
:
number
=
0
;
...
@@ -29,7 +28,7 @@ export class GPUScheduler {
...
@@ -29,7 +28,7 @@ export class GPUScheduler {
* Constructor
* Constructor
* @param machineSSHClientMap map from remote machine to sshClient
* @param machineSSHClientMap map from remote machine to sshClient
*/
*/
constructor
(
machineSSHClientMap
:
Map
<
RemoteMachineMeta
,
SSHClientManager
>
)
{
constructor
(
machineSSHClientMap
:
Map
<
RemoteMachineMeta
,
SSHClientManager
>
)
{
assert
(
machineSSHClientMap
.
size
>
0
);
assert
(
machineSSHClientMap
.
size
>
0
);
this
.
machineSSHClientMap
=
machineSSHClientMap
;
this
.
machineSSHClientMap
=
machineSSHClientMap
;
this
.
configuredRMs
=
Array
.
from
(
machineSSHClientMap
.
keys
());
this
.
configuredRMs
=
Array
.
from
(
machineSSHClientMap
.
keys
());
...
@@ -39,7 +38,7 @@ export class GPUScheduler {
...
@@ -39,7 +38,7 @@ export class GPUScheduler {
* Schedule a machine according to the constraints (requiredGPUNum)
* Schedule a machine according to the constraints (requiredGPUNum)
* @param requiredGPUNum required GPU number
* @param requiredGPUNum required GPU number
*/
*/
public
scheduleMachine
(
requiredGPUNum
:
number
|
undefined
,
trialJobDetail
:
RemoteMachineTrialJobDetail
)
:
RemoteMachineScheduleResult
{
public
scheduleMachine
(
requiredGPUNum
:
number
|
undefined
,
trialJobDetail
:
RemoteMachineTrialJobDetail
):
RemoteMachineScheduleResult
{
if
(
requiredGPUNum
===
undefined
)
{
if
(
requiredGPUNum
===
undefined
)
{
requiredGPUNum
=
0
;
requiredGPUNum
=
0
;
}
}
...
@@ -48,7 +47,7 @@ export class GPUScheduler {
...
@@ -48,7 +47,7 @@ export class GPUScheduler {
assert
(
allRMs
.
length
>
0
);
assert
(
allRMs
.
length
>
0
);
// Step 1: Check if required GPU number not exceeds the total GPU number in all machines
// Step 1: Check if required GPU number not exceeds the total GPU number in all machines
const
eligibleRM
:
RemoteMachineMeta
[]
=
allRMs
.
filter
((
rmMeta
:
RemoteMachineMeta
)
=>
const
eligibleRM
:
RemoteMachineMeta
[]
=
allRMs
.
filter
((
rmMeta
:
RemoteMachineMeta
)
=>
rmMeta
.
gpuSummary
===
undefined
||
requiredGPUNum
===
0
||
(
requiredGPUNum
!==
undefined
&&
rmMeta
.
gpuSummary
.
gpuCount
>=
requiredGPUNum
));
rmMeta
.
gpuSummary
===
undefined
||
requiredGPUNum
===
0
||
(
requiredGPUNum
!==
undefined
&&
rmMeta
.
gpuSummary
.
gpuCount
>=
requiredGPUNum
));
if
(
eligibleRM
.
length
===
0
)
{
if
(
eligibleRM
.
length
===
0
)
{
// If the required gpu number exceeds the upper limit of all machine's GPU number
// If the required gpu number exceeds the upper limit of all machine's GPU number
...
@@ -134,8 +133,8 @@ export class GPUScheduler {
...
@@ -134,8 +133,8 @@ export class GPUScheduler {
* @param availableGPUMap available GPU resource filled by this detection
* @param availableGPUMap available GPU resource filled by this detection
* @returns Available GPU number on this remote machine
* @returns Available GPU number on this remote machine
*/
*/
private
gpuResourceDetection
()
:
Map
<
RemoteMachineMeta
,
GPUInfo
[]
>
{
private
gpuResourceDetection
():
Map
<
RemoteMachineMeta
,
GPUInfo
[]
>
{
const
totalResourceMap
:
Map
<
RemoteMachineMeta
,
GPUInfo
[]
>
=
new
Map
<
RemoteMachineMeta
,
GPUInfo
[]
>
();
const
totalResourceMap
:
Map
<
RemoteMachineMeta
,
GPUInfo
[]
>
=
new
Map
<
RemoteMachineMeta
,
GPUInfo
[]
>
();
this
.
machineSSHClientMap
.
forEach
((
sshClientManager
:
SSHClientManager
,
rmMeta
:
RemoteMachineMeta
)
=>
{
this
.
machineSSHClientMap
.
forEach
((
sshClientManager
:
SSHClientManager
,
rmMeta
:
RemoteMachineMeta
)
=>
{
// Assgin totoal GPU count as init available GPU number
// Assgin totoal GPU count as init available GPU number
if
(
rmMeta
.
gpuSummary
!==
undefined
)
{
if
(
rmMeta
.
gpuSummary
!==
undefined
)
{
...
@@ -224,7 +223,7 @@ export class GPUScheduler {
...
@@ -224,7 +223,7 @@ export class GPUScheduler {
resultType
:
ScheduleResultType
.
SUCCEED
,
resultType
:
ScheduleResultType
.
SUCCEED
,
scheduleInfo
:
{
scheduleInfo
:
{
rmMeta
:
rmMeta
,
rmMeta
:
rmMeta
,
cuda
_v
isible
_d
evice
:
allocatedGPUs
cuda
V
isible
D
evice
:
allocatedGPUs
.
map
((
gpuInfo
:
GPUInfo
)
=>
{
.
map
((
gpuInfo
:
GPUInfo
)
=>
{
return
gpuInfo
.
index
;
return
gpuInfo
.
index
;
})
})
...
...
src/nni_manager/training_service/remote_machine/remoteMachineData.ts
View file @
1328f412
...
@@ -13,13 +13,13 @@ import { GPUInfo, GPUSummary } from '../common/gpuData';
...
@@ -13,13 +13,13 @@ import { GPUInfo, GPUSummary } from '../common/gpuData';
* Metadata of remote machine for configuration and statuc query
* Metadata of remote machine for configuration and statuc query
*/
*/
export
class
RemoteMachineMeta
{
export
class
RemoteMachineMeta
{
public
readonly
ip
:
string
=
''
;
public
readonly
ip
:
string
=
''
;
public
readonly
port
:
number
=
22
;
public
readonly
port
:
number
=
22
;
public
readonly
username
:
string
=
''
;
public
readonly
username
:
string
=
''
;
public
readonly
passwd
:
string
=
''
;
public
readonly
passwd
:
string
=
''
;
public
readonly
sshKeyPath
?:
string
;
public
readonly
sshKeyPath
?:
string
;
public
readonly
passphrase
?:
string
;
public
readonly
passphrase
?:
string
;
public
gpuSummary
:
GPUSummary
|
undefined
;
public
gpuSummary
:
GPUSummary
|
undefined
;
public
readonly
gpuIndices
?:
string
;
public
readonly
gpuIndices
?:
string
;
public
readonly
maxTrialNumPerGpu
?:
number
;
public
readonly
maxTrialNumPerGpu
?:
number
;
//TODO: initialize varialbe in constructor
//TODO: initialize varialbe in constructor
...
@@ -43,11 +43,11 @@ export function parseGpuIndices(gpuIndices?: string): Set<number> | undefined {
...
@@ -43,11 +43,11 @@ export function parseGpuIndices(gpuIndices?: string): Set<number> | undefined {
* The execution result for command executed on remote machine
* The execution result for command executed on remote machine
*/
*/
export
class
RemoteCommandResult
{
export
class
RemoteCommandResult
{
public
readonly
stdout
:
string
;
public
readonly
stdout
:
string
;
public
readonly
stderr
:
string
;
public
readonly
stderr
:
string
;
public
readonly
exitCode
:
number
;
public
readonly
exitCode
:
number
;
constructor
(
stdout
:
string
,
stderr
:
string
,
exitCode
:
number
)
{
constructor
(
stdout
:
string
,
stderr
:
string
,
exitCode
:
number
)
{
this
.
stdout
=
stdout
;
this
.
stdout
=
stdout
;
this
.
stderr
=
stderr
;
this
.
stderr
=
stderr
;
this
.
exitCode
=
exitCode
;
this
.
exitCode
=
exitCode
;
...
@@ -225,9 +225,9 @@ export class SSHClientManager {
...
@@ -225,9 +225,9 @@ export class SSHClientManager {
}
}
}
}
export
type
RemoteMachineScheduleResult
=
{
scheduleInfo
:
RemoteMachineScheduleInfo
|
undefined
;
resultType
:
ScheduleResultType
};
export
type
RemoteMachineScheduleResult
=
{
scheduleInfo
:
RemoteMachineScheduleInfo
|
undefined
;
resultType
:
ScheduleResultType
};
export
type
RemoteMachineScheduleInfo
=
{
rmMeta
:
RemoteMachineMeta
;
cuda
_v
isible
_d
evice
:
string
};
export
type
RemoteMachineScheduleInfo
=
{
rmMeta
:
RemoteMachineMeta
;
cuda
V
isible
D
evice
:
string
};
export
enum
ScheduleResultType
{
export
enum
ScheduleResultType
{
// Schedule succeeded
// Schedule succeeded
...
...
src/nni_manager/training_service/remote_machine/remoteMachineJobRestServer.ts
View file @
1328f412
...
@@ -15,7 +15,7 @@ import { RemoteMachineTrainingService } from './remoteMachineTrainingService';
...
@@ -15,7 +15,7 @@ import { RemoteMachineTrainingService } from './remoteMachineTrainingService';
@
component
.
Singleton
@
component
.
Singleton
export
class
RemoteMachineJobRestServer
extends
ClusterJobRestServer
{
export
class
RemoteMachineJobRestServer
extends
ClusterJobRestServer
{
@
Inject
@
Inject
private
readonly
remoteMachineTrainingService
:
RemoteMachineTrainingService
;
private
readonly
remoteMachineTrainingService
:
RemoteMachineTrainingService
;
/**
/**
* constructor to provide NNIRestServer's own rest property, e.g. port
* constructor to provide NNIRestServer's own rest property, e.g. port
...
@@ -26,7 +26,7 @@ export class RemoteMachineJobRestServer extends ClusterJobRestServer {
...
@@ -26,7 +26,7 @@ export class RemoteMachineJobRestServer extends ClusterJobRestServer {
}
}
// tslint:disable-next-line:no-any
// tslint:disable-next-line:no-any
protected
handleTrialMetrics
(
jobId
:
string
,
metrics
:
any
[])
:
void
{
protected
handleTrialMetrics
(
jobId
:
string
,
metrics
:
any
[]):
void
{
// Split metrics array into single metric, then emit
// Split metrics array into single metric, then emit
// Warning: If not split metrics into single ones, the behavior will be UNKNOWNls
// Warning: If not split metrics into single ones, the behavior will be UNKNOWNls
for
(
const
singleMetric
of
metrics
)
{
for
(
const
singleMetric
of
metrics
)
{
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment