Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
ba8dccd6
Commit
ba8dccd6
authored
Jun 23, 2019
by
suiguoxin
Browse files
Merge branch 'master' of
https://github.com/microsoft/nni
parents
56a1575b
150ee83a
Changes
208
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
708 additions
and
573 deletions
+708
-573
src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts
.../training_service/kubernetes/kubernetesTrainingService.ts
+155
-133
src/nni_manager/training_service/local/gpuScheduler.ts
src/nni_manager/training_service/local/gpuScheduler.ts
+28
-26
src/nni_manager/training_service/local/localTrainingService.ts
...ni_manager/training_service/local/localTrainingService.ts
+51
-47
src/nni_manager/training_service/pai/hdfsClientUtility.ts
src/nni_manager/training_service/pai/hdfsClientUtility.ts
+51
-39
src/nni_manager/training_service/pai/paiConfig.ts
src/nni_manager/training_service/pai/paiConfig.ts
+25
-13
src/nni_manager/training_service/pai/paiData.ts
src/nni_manager/training_service/pai/paiData.ts
+16
-11
src/nni_manager/training_service/pai/paiJobInfoCollector.ts
src/nni_manager/training_service/pai/paiJobInfoCollector.ts
+36
-28
src/nni_manager/training_service/pai/paiJobRestServer.ts
src/nni_manager/training_service/pai/paiJobRestServer.ts
+6
-5
src/nni_manager/training_service/pai/paiTrainingService.ts
src/nni_manager/training_service/pai/paiTrainingService.ts
+68
-53
src/nni_manager/training_service/pai/paiTrialConfig.ts
src/nni_manager/training_service/pai/paiTrialConfig.ts
+9
-5
src/nni_manager/training_service/remote_machine/gpuScheduler.ts
...i_manager/training_service/remote_machine/gpuScheduler.ts
+31
-23
src/nni_manager/training_service/remote_machine/remoteMachineData.ts
...ager/training_service/remote_machine/remoteMachineData.ts
+76
-68
src/nni_manager/training_service/remote_machine/remoteMachineJobRestServer.ts
...ning_service/remote_machine/remoteMachineJobRestServer.ts
+6
-5
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
...ng_service/remote_machine/remoteMachineTrainingService.ts
+114
-92
src/nni_manager/training_service/remote_machine/sshClientUtility.ts
...nager/training_service/remote_machine/sshClientUtility.ts
+23
-15
src/nni_manager/training_service/test/hdfsClientUtility.test.ts
...i_manager/training_service/test/hdfsClientUtility.test.ts
+4
-4
src/nni_manager/training_service/test/kubeflowTrainingService.test.ts
...ger/training_service/test/kubeflowTrainingService.test.ts
+2
-2
src/nni_manager/training_service/test/localTrainingService.test.ts
...anager/training_service/test/localTrainingService.test.ts
+2
-2
src/nni_manager/training_service/test/paiTrainingService.test.ts
..._manager/training_service/test/paiTrainingService.test.ts
+1
-1
src/nni_manager/tslint.json
src/nni_manager/tslint.json
+4
-1
No files found.
src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts
View file @
ba8dccd6
...
@@ -17,35 +17,36 @@
...
@@ -17,35 +17,36 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
*/
'
use strict
'
'
use strict
'
;
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
path
from
'
path
'
;
import
*
as
path
from
'
path
'
;
import
*
as
azureStorage
from
'
azure-storage
'
;
import
{
EventEmitter
}
from
'
events
'
;
import
{
EventEmitter
}
from
'
events
'
;
import
{
Base64
}
from
'
js-base64
'
;
import
{
String
}
from
'
typescript-string-operations
'
;
import
{
getExperimentId
,
getInitTrialSequenceId
}
from
'
../../common/experimentStartupInfo
'
;
import
{
getExperimentId
,
getInitTrialSequenceId
}
from
'
../../common/experimentStartupInfo
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
getExperimentRootDir
,
uniqueString
,
getJobCancelStatus
,
getIPV4Address
,
getVersion
}
from
'
../../common/utils
'
;
import
{
import
{
TrialJobDetail
,
TrialJobMetric
,
NNIManagerIpConfig
NNIManagerIpConfig
,
TrialJobDetail
,
TrialJobMetric
}
from
'
../../common/trainingService
'
;
}
from
'
../../common/trainingService
'
;
import
{
KubernetesTrialJobDetail
,
KubernetesScriptFormat
}
from
'
./kubernetesData
'
;
import
{
getExperimentRootDir
,
getIPV4Address
,
getJobCancelStatus
,
getVersion
,
uniqueString
}
from
'
../../common/utils
'
;
import
{
KubernetesClusterConfig
}
from
'
./kubernetesConfig
'
;
import
{
GeneralK8sClient
,
KubernetesCRDClient
}
from
'
./kubernetesApiClient
'
;
import
{
AzureStorageClientUtility
}
from
'
./azureStorageClientUtils
'
;
import
{
AzureStorageClientUtility
}
from
'
./azureStorageClientUtils
'
;
import
{
GeneralK8sClient
,
KubernetesCRDClient
}
from
'
./kubernetesApiClient
'
;
import
{
KubernetesClusterConfig
}
from
'
./kubernetesConfig
'
;
import
{
kubernetesScriptFormat
,
KubernetesTrialJobDetail
}
from
'
./kubernetesData
'
;
import
{
KubernetesJobRestServer
}
from
'
./kubernetesJobRestServer
'
;
import
{
KubernetesJobRestServer
}
from
'
./kubernetesJobRestServer
'
;
import
{
String
}
from
'
typescript-string-operations
'
;
import
*
as
azureStorage
from
'
azure-storage
'
;
var
azure
=
require
(
'
azure-storage
'
);
var
base64
=
require
(
'
js-base64
'
).
Base64
;
/**
* Training Service implementation for Kubernetes
*/
abstract
class
KubernetesTrainingService
{
abstract
class
KubernetesTrainingService
{
protected
readonly
NNI_KUBERNETES_TRIAL_LABEL
:
string
=
'
nni-kubernetes-trial
'
;
protected
readonly
NNI_KUBERNETES_TRIAL_LABEL
:
string
=
'
nni-kubernetes-trial
'
;
protected
readonly
log
!
:
Logger
;
protected
readonly
log
!
:
Logger
;
protected
readonly
metricsEmitter
:
EventEmitter
;
protected
readonly
metricsEmitter
:
EventEmitter
;
protected
readonly
trialJobsMap
:
Map
<
string
,
KubernetesTrialJobDetail
>
;
protected
readonly
trialJobsMap
:
Map
<
string
,
KubernetesTrialJobDetail
>
;
/
**
experiment root dir in NFS
*/
/
/
experiment root dir in NFS
protected
readonly
trialLocalNFSTempFolder
:
string
;
protected
readonly
trialLocalNFSTempFolder
:
string
;
protected
stopping
:
boolean
=
false
;
protected
stopping
:
boolean
=
false
;
protected
experimentId
!
:
string
;
protected
experimentId
!
:
string
;
...
@@ -63,35 +64,36 @@ abstract class KubernetesTrainingService {
...
@@ -63,35 +64,36 @@ abstract class KubernetesTrainingService {
protected
kubernetesClusterConfig
?:
KubernetesClusterConfig
;
protected
kubernetesClusterConfig
?:
KubernetesClusterConfig
;
protected
versionCheck
:
boolean
=
true
;
protected
versionCheck
:
boolean
=
true
;
protected
logCollection
:
string
;
protected
logCollection
:
string
;
constructor
()
{
constructor
()
{
this
.
log
=
getLogger
();
this
.
log
=
getLogger
();
this
.
metricsEmitter
=
new
EventEmitter
();
this
.
metricsEmitter
=
new
EventEmitter
();
this
.
trialJobsMap
=
new
Map
<
string
,
KubernetesTrialJobDetail
>
();
this
.
trialJobsMap
=
new
Map
<
string
,
KubernetesTrialJobDetail
>
();
this
.
trialLocalNFSTempFolder
=
path
.
join
(
getExperimentRootDir
(),
'
trials-nfs-tmp
'
);
this
.
trialLocalNFSTempFolder
=
path
.
join
(
getExperimentRootDir
(),
'
trials-nfs-tmp
'
);
this
.
experimentId
=
getExperimentId
();
this
.
experimentId
=
getExperimentId
();
this
.
nextTrialSequenceId
=
-
1
;
this
.
nextTrialSequenceId
=
-
1
;
this
.
CONTAINER_MOUNT_PATH
=
'
/tmp/mount
'
;
this
.
CONTAINER_MOUNT_PATH
=
'
/tmp/mount
'
;
this
.
genericK8sClient
=
new
GeneralK8sClient
();
this
.
genericK8sClient
=
new
GeneralK8sClient
();
this
.
logCollection
=
'
none
'
;
this
.
logCollection
=
'
none
'
;
}
}
public
generatePodResource
(
memory
:
number
,
cpuNum
:
number
,
gpuNum
:
number
)
{
// tslint:disable:no-any
public
generatePodResource
(
memory
:
number
,
cpuNum
:
number
,
gpuNum
:
number
):
any
{
return
{
return
{
'
memory
'
:
`
${
memory
}
Mi`
,
memory
:
`
${
memory
}
Mi`
,
'
cpu
'
:
`
${
cpuNum
}
`
,
cpu
:
`
${
cpuNum
}
`
,
'
nvidia.com/gpu
'
:
`
${
gpuNum
}
`
'
nvidia.com/gpu
'
:
`
${
gpuNum
}
`
}
}
;
}
}
// tslint:enable:no-any
public
async
listTrialJobs
():
Promise
<
TrialJobDetail
[]
>
{
public
async
listTrialJobs
():
Promise
<
TrialJobDetail
[]
>
{
const
jobs
:
TrialJobDetail
[]
=
[];
const
jobs
:
TrialJobDetail
[]
=
[];
for
(
const
[
key
,
value
]
of
this
.
trialJobsMap
)
{
for
(
const
[
key
,
value
]
of
this
.
trialJobsMap
)
{
if
(
value
.
form
.
jobType
===
'
TRIAL
'
)
{
if
(
value
.
form
.
jobType
===
'
TRIAL
'
)
{
jobs
.
push
(
await
this
.
getTrialJob
(
key
));
jobs
.
push
(
await
this
.
getTrialJob
(
key
));
}
}
}
;
}
return
Promise
.
resolve
(
jobs
);
return
Promise
.
resolve
(
jobs
);
}
}
...
@@ -100,21 +102,21 @@ abstract class KubernetesTrainingService {
...
@@ -100,21 +102,21 @@ abstract class KubernetesTrainingService {
const
kubernetesTrialJob
:
TrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
const
kubernetesTrialJob
:
TrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
if
(
!
kubernetesTrialJob
)
{
if
(
kubernetesTrialJob
===
undefined
)
{
return
Promise
.
reject
(
`trial job
${
trialJobId
}
not found`
)
return
Promise
.
reject
(
`trial job
${
trialJobId
}
not found`
)
;
}
}
return
Promise
.
resolve
(
kubernetesTrialJob
);
return
Promise
.
resolve
(
kubernetesTrialJob
);
}
}
public
addTrialJobMetricListener
(
listener
:
(
metric
:
TrialJobMetric
)
=>
void
)
{
public
addTrialJobMetricListener
(
listener
:
(
metric
:
TrialJobMetric
)
=>
void
)
:
void
{
this
.
metricsEmitter
.
on
(
'
metric
'
,
listener
);
this
.
metricsEmitter
.
on
(
'
metric
'
,
listener
);
}
}
public
removeTrialJobMetricListener
(
listener
:
(
metric
:
TrialJobMetric
)
=>
void
)
{
public
removeTrialJobMetricListener
(
listener
:
(
metric
:
TrialJobMetric
)
=>
void
)
:
void
{
this
.
metricsEmitter
.
off
(
'
metric
'
,
listener
);
this
.
metricsEmitter
.
off
(
'
metric
'
,
listener
);
}
}
public
get
isMultiPhaseJobSupported
():
boolean
{
public
get
isMultiPhaseJobSupported
():
boolean
{
return
false
;
return
false
;
}
}
...
@@ -127,6 +129,96 @@ abstract class KubernetesTrainingService {
...
@@ -127,6 +129,96 @@ abstract class KubernetesTrainingService {
return
this
.
metricsEmitter
;
return
this
.
metricsEmitter
;
}
}
public
async
cancelTrialJob
(
trialJobId
:
string
,
isEarlyStopped
:
boolean
=
false
):
Promise
<
void
>
{
const
trialJobDetail
:
KubernetesTrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
if
(
trialJobDetail
===
undefined
)
{
const
errorMessage
:
string
=
`CancelTrialJob: trial job id
${
trialJobId
}
not found`
;
this
.
log
.
error
(
errorMessage
);
return
Promise
.
reject
(
errorMessage
);
}
if
(
this
.
kubernetesCRDClient
===
undefined
)
{
const
errorMessage
:
string
=
`CancelTrialJob: trial job id
${
trialJobId
}
failed because operatorClient is undefined`
;
this
.
log
.
error
(
errorMessage
);
return
Promise
.
reject
(
errorMessage
);
}
try
{
await
this
.
kubernetesCRDClient
.
deleteKubernetesJob
(
new
Map
(
[
[
'
app
'
,
this
.
NNI_KUBERNETES_TRIAL_LABEL
],
[
'
expId
'
,
getExperimentId
()],
[
'
trialId
'
,
trialJobId
]
]
));
}
catch
(
err
)
{
const
errorMessage
:
string
=
`Delete trial
${
trialJobId
}
failed:
${
err
}
`
;
this
.
log
.
error
(
errorMessage
);
return
Promise
.
reject
(
errorMessage
);
}
trialJobDetail
.
endTime
=
Date
.
now
();
trialJobDetail
.
status
=
getJobCancelStatus
(
isEarlyStopped
);
return
Promise
.
resolve
();
}
public
async
cleanUp
():
Promise
<
void
>
{
this
.
stopping
=
true
;
// First, cancel all running kubernetes jobs
for
(
const
[
trialJobId
,
kubernetesTrialJob
]
of
this
.
trialJobsMap
)
{
if
([
'
RUNNING
'
,
'
WAITING
'
,
'
UNKNOWN
'
].
includes
(
kubernetesTrialJob
.
status
))
{
try
{
await
this
.
cancelTrialJob
(
trialJobId
);
}
catch
(
error
)
{
// DONT throw error during cleanup
}
kubernetesTrialJob
.
status
=
'
SYS_CANCELED
'
;
}
}
// Delete all kubernetes jobs whose expId label is current experiment id
try
{
if
(
this
.
kubernetesCRDClient
!==
undefined
)
{
await
this
.
kubernetesCRDClient
.
deleteKubernetesJob
(
new
Map
(
[
[
'
app
'
,
this
.
NNI_KUBERNETES_TRIAL_LABEL
],
[
'
expId
'
,
getExperimentId
()]
]
));
}
}
catch
(
error
)
{
this
.
log
.
error
(
`Delete kubernetes job with label: app=
${
this
.
NNI_KUBERNETES_TRIAL_LABEL
}
,\
expId=
${
getExperimentId
()}
failed, error is
${
error
}
`
);
}
// Unmount NFS
try
{
await
cpp
.
exec
(
`sudo umount
${
this
.
trialLocalNFSTempFolder
}
`
);
}
catch
(
error
)
{
this
.
log
.
error
(
`Unmount
${
this
.
trialLocalNFSTempFolder
}
failed, error is
${
error
}
`
);
}
// Stop kubernetes rest server
if
(
this
.
kubernetesJobRestServer
===
undefined
)
{
throw
new
Error
(
'
kubernetesJobRestServer not initialized!
'
);
}
try
{
await
this
.
kubernetesJobRestServer
.
stop
();
this
.
log
.
info
(
'
Kubernetes Training service rest server stopped successfully.
'
);
}
catch
(
error
)
{
// tslint:disable-next-line: no-unsafe-any
this
.
log
.
error
(
`Kubernetes Training service rest server stopped failed, error:
${
error
.
message
}
`
);
return
Promise
.
reject
(
error
);
}
return
Promise
.
resolve
();
}
protected
generateSequenceId
():
number
{
protected
generateSequenceId
():
number
{
if
(
this
.
nextTrialSequenceId
===
-
1
)
{
if
(
this
.
nextTrialSequenceId
===
-
1
)
{
this
.
nextTrialSequenceId
=
getInitTrialSequenceId
();
this
.
nextTrialSequenceId
=
getInitTrialSequenceId
();
...
@@ -135,25 +227,31 @@ abstract class KubernetesTrainingService {
...
@@ -135,25 +227,31 @@ abstract class KubernetesTrainingService {
return
this
.
nextTrialSequenceId
++
;
return
this
.
nextTrialSequenceId
++
;
}
}
// tslint:disable: no-unsafe-any no-any
protected
async
createAzureStorage
(
vaultName
:
string
,
valutKeyName
:
string
,
accountName
:
string
,
azureShare
:
string
):
Promise
<
void
>
{
protected
async
createAzureStorage
(
vaultName
:
string
,
valutKeyName
:
string
,
accountName
:
string
,
azureShare
:
string
):
Promise
<
void
>
{
try
{
try
{
const
result
=
await
cpp
.
exec
(
`az keyvault secret show --name
${
valutKeyName
}
--vault-name
${
vaultName
}
`
);
const
result
:
any
=
await
cpp
.
exec
(
`az keyvault secret show --name
${
valutKeyName
}
--vault-name
${
vaultName
}
`
);
if
(
result
.
stderr
)
{
if
(
result
.
stderr
)
{
const
errorMessage
:
string
=
result
.
stderr
;
const
errorMessage
:
string
=
result
.
stderr
;
this
.
log
.
error
(
errorMessage
);
this
.
log
.
error
(
errorMessage
);
return
Promise
.
reject
(
errorMessage
);
return
Promise
.
reject
(
errorMessage
);
}
}
const
storageAccountKey
=
JSON
.
parse
(
result
.
stdout
).
value
;
const
storageAccountKey
:
any
=
JSON
.
parse
(
result
.
stdout
).
value
;
if
(
this
.
azureStorageAccountName
===
undefined
)
{
throw
new
Error
(
'
azureStorageAccountName not initialized!
'
);
}
//create storage client
//create storage client
this
.
azureStorageClient
=
azure
.
createFileService
(
this
.
azureStorageAccountName
,
storageAccountKey
);
this
.
azureStorageClient
=
azure
Storage
.
createFileService
(
this
.
azureStorageAccountName
,
storageAccountKey
);
await
AzureStorageClientUtility
.
createShare
(
this
.
azureStorageClient
,
this
.
azureStorageShare
);
await
AzureStorageClientUtility
.
createShare
(
this
.
azureStorageClient
,
this
.
azureStorageShare
);
//create sotrage secret
//create sotrage secret
this
.
azureStorageSecretName
=
'
nni-secret-
'
+
uniqueString
(
8
).
toLowerCase
();
this
.
azureStorageSecretName
=
String
.
Format
(
'
nni-secret-{0}
'
,
uniqueString
(
8
)
.
toLowerCase
());
await
this
.
genericK8sClient
.
createSecret
(
await
this
.
genericK8sClient
.
createSecret
(
{
{
apiVersion
:
'
v1
'
,
apiVersion
:
'
v1
'
,
kind
:
'
Secret
'
,
kind
:
'
Secret
'
,
metadata
:
{
metadata
:
{
name
:
this
.
azureStorageSecretName
,
name
:
this
.
azureStorageSecretName
,
namespace
:
'
default
'
,
namespace
:
'
default
'
,
labels
:
{
labels
:
{
...
@@ -163,38 +261,42 @@ abstract class KubernetesTrainingService {
...
@@ -163,38 +261,42 @@ abstract class KubernetesTrainingService {
},
},
type
:
'
Opaque
'
,
type
:
'
Opaque
'
,
data
:
{
data
:
{
azurestorageaccountname
:
b
ase64
.
encode
(
this
.
azureStorageAccountName
),
azurestorageaccountname
:
B
ase64
.
encode
(
this
.
azureStorageAccountName
),
azurestorageaccountkey
:
b
ase64
.
encode
(
storageAccountKey
)
azurestorageaccountkey
:
B
ase64
.
encode
(
storageAccountKey
)
}
}
}
}
);
);
}
catch
(
error
)
{
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
this
.
log
.
error
(
error
);
return
Promise
.
reject
(
error
);
return
Promise
.
reject
(
error
);
}
}
return
Promise
.
resolve
();
return
Promise
.
resolve
();
}
}
// tslint:enable: no-unsafe-any no-any
/**
/**
* Genereate run script for different roles(like worker or ps)
* Genereate run script for different roles(like worker or ps)
* @param trialJobId trial job id
* @param trialJobId trial job id
* @param trialWorkingFolder working folder
* @param trialWorkingFolder working folder
* @param command
* @param command
command
* @param trialSequenceId sequence id
* @param trialSequenceId sequence id
*/
*/
protected
async
generateRunScript
(
platform
:
string
,
trialJobId
:
string
,
trialWorkingFolder
:
string
,
protected
async
generateRunScript
(
platform
:
string
,
trialJobId
:
string
,
trialWorkingFolder
:
string
,
command
:
string
,
trialSequenceId
:
string
,
roleName
:
string
,
gpuNum
:
number
):
Promise
<
string
>
{
command
:
string
,
trialSequenceId
:
string
,
roleName
:
string
,
gpuNum
:
number
):
Promise
<
string
>
{
let
nvidia
_s
cript
:
string
=
''
;
let
nvidia
S
cript
:
string
=
''
;
// Nvidia devcie plugin for K8S has a known issue that requesting zero GPUs allocates all GPUs
// Nvidia devcie plugin for K8S has a known issue that requesting zero GPUs allocates all GPUs
// Refer https://github.com/NVIDIA/k8s-device-plugin/issues/61
// Refer https://github.com/NVIDIA/k8s-device-plugin/issues/61
// So we have to explicitly set CUDA_VISIBLE_DEVICES to empty if user sets gpuNum to 0 in NNI config file
// So we have to explicitly set CUDA_VISIBLE_DEVICES to empty if user sets gpuNum to 0 in NNI config file
if
(
gpuNum
===
0
)
{
if
(
gpuNum
===
0
)
{
nvidia
_s
cript
=
`export CUDA_VISIBLE_DEVICES='0'`
;
nvidia
S
cript
=
`export CUDA_VISIBLE_DEVICES='0'`
;
}
}
const
nniManagerIp
=
this
.
nniManagerIpConfig
?
this
.
nniManagerIpConfig
.
nniManagerIp
:
getIPV4Address
();
// tslint:disable-next-line: strict-boolean-expressions
const
version
=
this
.
versionCheck
?
await
getVersion
():
''
;
const
nniManagerIp
:
string
=
this
.
nniManagerIpConfig
?
this
.
nniManagerIpConfig
.
nniManagerIp
:
getIPV4Address
();
const
version
:
string
=
this
.
versionCheck
?
await
getVersion
()
:
''
;
const
runScript
:
string
=
String
.
Format
(
const
runScript
:
string
=
String
.
Format
(
K
ubernetesScriptFormat
,
k
ubernetesScriptFormat
,
platform
,
platform
,
trialJobId
,
trialJobId
,
path
.
join
(
trialWorkingFolder
,
'
output
'
,
`
${
roleName
}
_output`
),
path
.
join
(
trialWorkingFolder
,
'
output
'
,
`
${
roleName
}
_output`
),
...
@@ -202,108 +304,28 @@ abstract class KubernetesTrainingService {
...
@@ -202,108 +304,28 @@ abstract class KubernetesTrainingService {
getExperimentId
(),
getExperimentId
(),
trialWorkingFolder
,
trialWorkingFolder
,
trialSequenceId
,
trialSequenceId
,
nvidia
_s
cript
,
nvidia
S
cript
,
command
,
command
,
nniManagerIp
,
nniManagerIp
,
this
.
kubernetesRestServerPort
,
this
.
kubernetesRestServerPort
,
version
,
version
,
this
.
logCollection
this
.
logCollection
);
);
return
Promise
.
resolve
(
runScript
);
return
Promise
.
resolve
(
runScript
);
}
}
protected
async
createNFSStorage
(
nfsServer
:
string
,
nfsPath
:
string
):
Promise
<
void
>
{
protected
async
createNFSStorage
(
nfsServer
:
string
,
nfsPath
:
string
):
Promise
<
void
>
{
await
cpp
.
exec
(
`mkdir -p
${
this
.
trialLocalNFSTempFolder
}
`
);
await
cpp
.
exec
(
`mkdir -p
${
this
.
trialLocalNFSTempFolder
}
`
);
try
{
try
{
await
cpp
.
exec
(
`sudo mount
${
nfsServer
}
:
${
nfsPath
}
${
this
.
trialLocalNFSTempFolder
}
`
);
await
cpp
.
exec
(
`sudo mount
${
nfsServer
}
:
${
nfsPath
}
${
this
.
trialLocalNFSTempFolder
}
`
);
}
catch
(
error
)
{
}
catch
(
error
)
{
const
mountError
:
string
=
`Mount NFS
${
nfsServer
}
:
${
nfsPath
}
to
${
this
.
trialLocalNFSTempFolder
}
failed, error is
${
error
}
`
;
const
mountError
:
string
=
`Mount NFS
${
nfsServer
}
:
${
nfsPath
}
to
${
this
.
trialLocalNFSTempFolder
}
failed, error is
${
error
}
`
;
this
.
log
.
error
(
mountError
);
this
.
log
.
error
(
mountError
);
return
Promise
.
reject
(
mountError
);
}
return
Promise
.
resolve
();
}
public
async
cancelTrialJob
(
trialJobId
:
string
,
isEarlyStopped
:
boolean
=
false
):
Promise
<
void
>
{
return
Promise
.
reject
(
mountError
);
const
trialJobDetail
:
KubernetesTrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
if
(
!
trialJobDetail
)
{
const
errorMessage
:
string
=
`CancelTrialJob: trial job id
${
trialJobId
}
not found`
;
this
.
log
.
error
(
errorMessage
);
return
Promise
.
reject
(
errorMessage
);
}
if
(
!
this
.
kubernetesCRDClient
)
{
const
errorMessage
:
string
=
`CancelTrialJob: trial job id
${
trialJobId
}
failed because operatorClient is undefined`
;
this
.
log
.
error
(
errorMessage
);
return
Promise
.
reject
(
errorMessage
);
}
try
{
await
this
.
kubernetesCRDClient
.
deleteKubernetesJob
(
new
Map
(
[
[
'
app
'
,
this
.
NNI_KUBERNETES_TRIAL_LABEL
],
[
'
expId
'
,
getExperimentId
()],
[
'
trialId
'
,
trialJobId
]
]
));
}
catch
(
err
)
{
const
errorMessage
:
string
=
`Delete trial
${
trialJobId
}
failed:
${
err
}
`
;
this
.
log
.
error
(
errorMessage
);
return
Promise
.
reject
(
errorMessage
);
}
trialJobDetail
.
endTime
=
Date
.
now
();
trialJobDetail
.
status
=
getJobCancelStatus
(
isEarlyStopped
);
return
Promise
.
resolve
();
}
public
async
cleanUp
():
Promise
<
void
>
{
this
.
stopping
=
true
;
// First, cancel all running kubernetes jobs
for
(
let
[
trialJobId
,
kubernetesTrialJob
]
of
this
.
trialJobsMap
)
{
if
([
'
RUNNING
'
,
'
WAITING
'
,
'
UNKNOWN
'
].
includes
(
kubernetesTrialJob
.
status
))
{
try
{
await
this
.
cancelTrialJob
(
trialJobId
);
}
catch
(
error
)
{}
// DONT throw error during cleanup
kubernetesTrialJob
.
status
=
'
SYS_CANCELED
'
;
}
}
// Delete all kubernetes jobs whose expId label is current experiment id
try
{
if
(
this
.
kubernetesCRDClient
)
{
await
this
.
kubernetesCRDClient
.
deleteKubernetesJob
(
new
Map
(
[
[
'
app
'
,
this
.
NNI_KUBERNETES_TRIAL_LABEL
],
[
'
expId
'
,
getExperimentId
()]
]
));
}
}
catch
(
error
)
{
this
.
log
.
error
(
`Delete kubernetes job with label: app=
${
this
.
NNI_KUBERNETES_TRIAL_LABEL
}
,expId=
${
getExperimentId
()}
failed, error is
${
error
}
`
);
}
// Unmount NFS
try
{
await
cpp
.
exec
(
`sudo umount
${
this
.
trialLocalNFSTempFolder
}
`
);
}
catch
(
error
)
{
this
.
log
.
error
(
`Unmount
${
this
.
trialLocalNFSTempFolder
}
failed, error is
${
error
}
`
);
}
// Stop kubernetes rest server
if
(
!
this
.
kubernetesJobRestServer
)
{
throw
new
Error
(
'
kubernetesJobRestServer not initialized!
'
);
}
try
{
await
this
.
kubernetesJobRestServer
.
stop
();
this
.
log
.
info
(
'
Kubernetes Training service rest server stopped successfully.
'
);
}
catch
(
error
)
{
this
.
log
.
error
(
`Kubernetes Training service rest server stopped failed, error:
${
error
.
message
}
`
);
Promise
.
reject
(
error
);
}
}
return
Promise
.
resolve
();
return
Promise
.
resolve
();
}
}
}
}
export
{
KubernetesTrainingService
};
export
{
KubernetesTrainingService
}
src/nni_manager/training_service/local/gpuScheduler.ts
View file @
ba8dccd6
...
@@ -25,10 +25,10 @@ import * as fs from 'fs';
...
@@ -25,10 +25,10 @@ import * as fs from 'fs';
import
*
as
os
from
'
os
'
;
import
*
as
os
from
'
os
'
;
import
*
as
path
from
'
path
'
;
import
*
as
path
from
'
path
'
;
import
{
String
}
from
'
typescript-string-operations
'
;
import
{
String
}
from
'
typescript-string-operations
'
;
import
{
execMkdir
,
getScriptName
,
getgpuMetricsCollectorScriptContent
,
execScript
,
execTail
,
execRemove
,
execKill
}
from
'
../common/util
'
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
delay
}
from
'
../../common/utils
'
;
import
{
delay
}
from
'
../../common/utils
'
;
import
{
GPUInfo
,
GPUSummary
}
from
'
../common/gpuData
'
;
import
{
GPUInfo
,
GPUSummary
}
from
'
../common/gpuData
'
;
import
{
execKill
,
execMkdir
,
execRemove
,
execTail
,
getgpuMetricsCollectorScriptContent
,
getScriptName
,
runScript
}
from
'
../common/util
'
;
/**
/**
* GPUScheduler for local training service
* GPUScheduler for local training service
...
@@ -37,8 +37,8 @@ class GPUScheduler {
...
@@ -37,8 +37,8 @@ class GPUScheduler {
private
gpuSummary
!
:
GPUSummary
;
private
gpuSummary
!
:
GPUSummary
;
private
stopping
:
boolean
;
private
stopping
:
boolean
;
private
log
:
Logger
;
private
readonly
log
:
Logger
;
private
gpuMetricCollectorScriptFolder
:
string
;
private
readonly
gpuMetricCollectorScriptFolder
:
string
;
constructor
()
{
constructor
()
{
this
.
stopping
=
false
;
this
.
stopping
=
false
;
...
@@ -58,28 +58,15 @@ class GPUScheduler {
...
@@ -58,28 +58,15 @@ class GPUScheduler {
}
}
}
}
/**
* Generate gpu metric collector shell script in local machine,
* used to run in remote machine, and will be deleted after uploaded from local.
*/
private
async
runGpuMetricsCollectorScript
():
Promise
<
void
>
{
await
execMkdir
(
this
.
gpuMetricCollectorScriptFolder
);
//generate gpu_metrics_collector script
let
gpuMetricsCollectorScriptPath
:
string
=
path
.
join
(
this
.
gpuMetricCollectorScriptFolder
,
getScriptName
(
'
gpu_metrics_collector
'
));
const
gpuMetricsCollectorScriptContent
:
string
=
getgpuMetricsCollectorScriptContent
(
this
.
gpuMetricCollectorScriptFolder
);
await
fs
.
promises
.
writeFile
(
gpuMetricsCollectorScriptPath
,
gpuMetricsCollectorScriptContent
,
{
encoding
:
'
utf8
'
});
execScript
(
gpuMetricsCollectorScriptPath
)
}
public
getAvailableGPUIndices
(
useActiveGpu
:
boolean
,
occupiedGpuIndexNumMap
:
Map
<
number
,
number
>
):
number
[]
{
public
getAvailableGPUIndices
(
useActiveGpu
:
boolean
,
occupiedGpuIndexNumMap
:
Map
<
number
,
number
>
):
number
[]
{
if
(
this
.
gpuSummary
!==
undefined
)
{
if
(
this
.
gpuSummary
!==
undefined
)
{
if
(
process
.
platform
===
'
win32
'
||
useActiveGpu
)
{
if
(
process
.
platform
===
'
win32
'
||
useActiveGpu
)
{
return
this
.
gpuSummary
.
gpuInfos
.
map
((
info
:
GPUInfo
)
=>
info
.
index
);
return
this
.
gpuSummary
.
gpuInfos
.
map
((
info
:
GPUInfo
)
=>
info
.
index
);
}
}
else
{
else
{
return
this
.
gpuSummary
.
gpuInfos
.
filter
((
info
:
GPUInfo
)
=>
return
this
.
gpuSummary
.
gpuInfos
.
filter
((
info
:
GPUInfo
)
=>
occupiedGpuIndexNumMap
.
get
(
info
.
index
)
===
undefined
&&
info
.
activeProcessNum
===
0
||
occupiedGpuIndexNumMap
.
get
(
info
.
index
)
=
==
undefined
&&
info
.
activeProcessNum
===
0
||
occupiedGpuIndexNumMap
.
get
(
info
.
index
)
!
==
undefined
)
occupiedGpuIndexNumMap
.
get
(
info
.
index
)
!==
undefined
)
.
map
((
info
:
GPUInfo
)
=>
info
.
index
);
.
map
((
info
:
GPUInfo
)
=>
info
.
index
);
}
}
}
}
...
@@ -105,17 +92,32 @@ class GPUScheduler {
...
@@ -105,17 +92,32 @@ class GPUScheduler {
}
}
}
}
/**
* Generate gpu metric collector shell script in local machine,
* used to run in remote machine, and will be deleted after uploaded from local.
*/
private
async
runGpuMetricsCollectorScript
():
Promise
<
void
>
{
await
execMkdir
(
this
.
gpuMetricCollectorScriptFolder
);
//generate gpu_metrics_collector script
const
gpuMetricsCollectorScriptPath
:
string
=
path
.
join
(
this
.
gpuMetricCollectorScriptFolder
,
getScriptName
(
'
gpu_metrics_collector
'
));
const
gpuMetricsCollectorScriptContent
:
string
=
getgpuMetricsCollectorScriptContent
(
this
.
gpuMetricCollectorScriptFolder
);
await
fs
.
promises
.
writeFile
(
gpuMetricsCollectorScriptPath
,
gpuMetricsCollectorScriptContent
,
{
encoding
:
'
utf8
'
});
runScript
(
gpuMetricsCollectorScriptPath
);
}
// tslint:disable:non-literal-fs-path
private
async
updateGPUSummary
():
Promise
<
void
>
{
private
async
updateGPUSummary
():
Promise
<
void
>
{
le
t
gpuMetricPath
=
path
.
join
(
this
.
gpuMetricCollectorScriptFolder
,
'
gpu_metrics
'
);
cons
t
gpuMetricPath
:
string
=
path
.
join
(
this
.
gpuMetricCollectorScriptFolder
,
'
gpu_metrics
'
);
if
(
fs
.
existsSync
(
gpuMetricPath
))
{
if
(
fs
.
existsSync
(
gpuMetricPath
))
{
const
cmdresult
:
cpp
.
childProcessPromise
.
Result
=
await
execTail
(
gpuMetricPath
);
const
cmdresult
:
cpp
.
childProcessPromise
.
Result
=
await
execTail
(
gpuMetricPath
);
if
(
cmdresult
&&
cmdresult
.
stdout
)
{
if
(
cmdresult
!==
undefined
&&
cmdresult
.
stdout
!==
undefined
)
{
this
.
gpuSummary
=
<
GPUSummary
>
JSON
.
parse
(
cmdresult
.
stdout
);
this
.
gpuSummary
=
<
GPUSummary
>
JSON
.
parse
(
cmdresult
.
stdout
);
}
else
{
}
else
{
this
.
log
.
error
(
'
Could not get gpu metrics information!
'
);
this
.
log
.
error
(
'
Could not get gpu metrics information!
'
);
}
}
}
else
{
}
else
{
this
.
log
.
warning
(
'
gpu_metrics file does not exist!
'
)
this
.
log
.
warning
(
'
gpu_metrics file does not exist!
'
)
;
}
}
}
}
}
}
...
...
src/nni_manager/training_service/local/localTrainingService.ts
View file @
ba8dccd6
...
@@ -24,6 +24,7 @@ import { EventEmitter } from 'events';
...
@@ -24,6 +24,7 @@ import { EventEmitter } from 'events';
import
*
as
fs
from
'
fs
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
path
from
'
path
'
;
import
*
as
path
from
'
path
'
;
import
*
as
ts
from
'
tail-stream
'
;
import
*
as
ts
from
'
tail-stream
'
;
import
*
as
tkill
from
'
tree-kill
'
;
import
{
NNIError
,
NNIErrorNames
}
from
'
../../common/errors
'
;
import
{
NNIError
,
NNIErrorNames
}
from
'
../../common/errors
'
;
import
{
getInitTrialSequenceId
}
from
'
../../common/experimentStartupInfo
'
;
import
{
getInitTrialSequenceId
}
from
'
../../common/experimentStartupInfo
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
...
@@ -31,14 +32,14 @@ import {
...
@@ -31,14 +32,14 @@ import {
HostJobApplicationForm
,
HyperParameters
,
JobApplicationForm
,
TrainingService
,
TrialJobApplicationForm
,
HostJobApplicationForm
,
HyperParameters
,
JobApplicationForm
,
TrainingService
,
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobMetric
,
TrialJobStatus
TrialJobDetail
,
TrialJobMetric
,
TrialJobStatus
}
from
'
../../common/trainingService
'
;
}
from
'
../../common/trainingService
'
;
import
{
delay
,
generateParamFileName
,
getExperimentRootDir
,
getJobCancelStatus
,
uniqueString
,
isAlive
,
getNewLine
}
from
'
../../common/utils
'
;
import
{
import
{
execMkdir
,
getScriptName
,
execScript
,
setEnvironmentVariable
,
execNewFile
}
from
'
../common/util
'
delay
,
generateParamFileName
,
getExperimentRootDir
,
getJobCancelStatus
,
getNewLine
,
isAlive
,
uniqueString
}
from
'
../../common/utils
'
;
import
{
TrialConfig
}
from
'
../common/trialConfig
'
;
import
{
TrialConfig
}
from
'
../common/trialConfig
'
;
import
{
TrialConfigMetadataKey
}
from
'
../common/trialConfigMetadataKey
'
;
import
{
TrialConfigMetadataKey
}
from
'
../common/trialConfigMetadataKey
'
;
import
{
execMkdir
,
execNewFile
,
getScriptName
,
runScript
,
setEnvironmentVariable
}
from
'
../common/util
'
;
import
{
GPUScheduler
}
from
'
./gpuScheduler
'
;
import
{
GPUScheduler
}
from
'
./gpuScheduler
'
;
const
tkill
=
require
(
'
tree-kill
'
);
/**
/**
* Decode a command
* Decode a command
* @param Buffer binary incoming data
* @param Buffer binary incoming data
...
@@ -46,7 +47,7 @@ const tkill = require('tree-kill');
...
@@ -46,7 +47,7 @@ const tkill = require('tree-kill');
* success: true if the buffer contains at least one complete command; otherwise false
* success: true if the buffer contains at least one complete command; otherwise false
* remain: remaining data after the first command
* remain: remaining data after the first command
*/
*/
// tslint:disable
-next-line:
informative-docs
// tslint:disable
:newline-per-chained-call
informative-docs
function
decodeCommand
(
data
:
Buffer
):
[
boolean
,
string
,
string
,
Buffer
]
{
function
decodeCommand
(
data
:
Buffer
):
[
boolean
,
string
,
string
,
Buffer
]
{
if
(
data
.
length
<
8
)
{
if
(
data
.
length
<
8
)
{
return
[
false
,
''
,
''
,
data
];
return
[
false
,
''
,
''
,
data
];
...
@@ -61,6 +62,7 @@ function decodeCommand(data: Buffer): [boolean, string, string, Buffer] {
...
@@ -61,6 +62,7 @@ function decodeCommand(data: Buffer): [boolean, string, string, Buffer] {
return
[
true
,
commandType
,
content
,
remain
];
return
[
true
,
commandType
,
content
,
remain
];
}
}
// tslint:enable:newline-per-chained-call informative-docs
/**
/**
* LocalTrialJobDetail
* LocalTrialJobDetail
...
@@ -117,21 +119,21 @@ class LocalConfig {
...
@@ -117,21 +119,21 @@ class LocalConfig {
* Local machine training service
* Local machine training service
*/
*/
class
LocalTrainingService
implements
TrainingService
{
class
LocalTrainingService
implements
TrainingService
{
private
eventEmitter
:
EventEmitter
;
private
readonly
eventEmitter
:
EventEmitter
;
private
jobMap
:
Map
<
string
,
LocalTrialJobDetail
>
;
private
readonly
jobMap
:
Map
<
string
,
LocalTrialJobDetail
>
;
private
jobQueue
:
string
[];
private
readonly
jobQueue
:
string
[];
private
initialized
:
boolean
;
private
initialized
:
boolean
;
private
stopping
:
boolean
;
private
stopping
:
boolean
;
private
rootDir
!
:
string
;
private
rootDir
!
:
string
;
private
trialSequenceId
:
number
;
private
trialSequenceId
:
number
;
private
gpuScheduler
!
:
GPUScheduler
;
private
gpuScheduler
!
:
GPUScheduler
;
private
occupiedGpuIndexNumMap
:
Map
<
number
,
number
>
;
private
readonly
occupiedGpuIndexNumMap
:
Map
<
number
,
number
>
;
private
designatedGpuIndices
!
:
Set
<
number
>
;
private
designatedGpuIndices
!
:
Set
<
number
>
;
private
log
:
Logger
;
private
readonly
log
:
Logger
;
private
localTrailConfig
?:
TrialConfig
;
private
localTrailConfig
?:
TrialConfig
;
private
localConfig
?:
LocalConfig
;
private
localConfig
?:
LocalConfig
;
private
isMultiPhase
:
boolean
;
private
isMultiPhase
:
boolean
;
private
jobStreamMap
:
Map
<
string
,
ts
.
Stream
>
;
private
readonly
jobStreamMap
:
Map
<
string
,
ts
.
Stream
>
;
private
maxTrialNumPerGpu
:
number
;
private
maxTrialNumPerGpu
:
number
;
private
useActiveGpu
:
boolean
;
private
useActiveGpu
:
boolean
;
...
@@ -182,7 +184,7 @@ class LocalTrainingService implements TrainingService {
...
@@ -182,7 +184,7 @@ class LocalTrainingService implements TrainingService {
return
this
.
getHostJob
(
trialJobId
);
return
this
.
getHostJob
(
trialJobId
);
}
}
if
(
trialJob
.
status
===
'
RUNNING
'
)
{
if
(
trialJob
.
status
===
'
RUNNING
'
)
{
le
t
alive
:
boolean
=
await
isAlive
(
trialJob
.
pid
);
cons
t
alive
:
boolean
=
await
isAlive
(
trialJob
.
pid
);
if
(
!
alive
)
{
if
(
!
alive
)
{
trialJob
.
endTime
=
Date
.
now
();
trialJob
.
endTime
=
Date
.
now
();
this
.
setTrialJobStatus
(
trialJob
,
'
FAILED
'
);
this
.
setTrialJobStatus
(
trialJob
,
'
FAILED
'
);
...
@@ -276,7 +278,7 @@ class LocalTrainingService implements TrainingService {
...
@@ -276,7 +278,7 @@ class LocalTrainingService implements TrainingService {
return
Promise
.
resolve
();
return
Promise
.
resolve
();
}
}
if
(
trialJob
.
form
.
jobType
===
'
TRIAL
'
)
{
if
(
trialJob
.
form
.
jobType
===
'
TRIAL
'
)
{
await
tkill
(
trialJob
.
pid
,
'
SIGKILL
'
);
tkill
(
trialJob
.
pid
,
'
SIGKILL
'
);
}
else
if
(
trialJob
.
form
.
jobType
===
'
HOST
'
)
{
}
else
if
(
trialJob
.
form
.
jobType
===
'
HOST
'
)
{
await
cpp
.
exec
(
`pkill -9 -P
${
trialJob
.
pid
}
`
);
await
cpp
.
exec
(
`pkill -9 -P
${
trialJob
.
pid
}
`
);
}
else
{
}
else
{
...
@@ -290,7 +292,8 @@ class LocalTrainingService implements TrainingService {
...
@@ -290,7 +292,8 @@ class LocalTrainingService implements TrainingService {
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
if
(
!
this
.
initialized
)
{
if
(
!
this
.
initialized
)
{
this
.
rootDir
=
getExperimentRootDir
();
this
.
rootDir
=
getExperimentRootDir
();
if
(
!
fs
.
existsSync
(
this
.
rootDir
)){
// tslint:disable-next-line:non-literal-fs-path
if
(
!
fs
.
existsSync
(
this
.
rootDir
))
{
await
cpp
.
exec
(
`powershell.exe mkdir
${
this
.
rootDir
}
`
);
await
cpp
.
exec
(
`powershell.exe mkdir
${
this
.
rootDir
}
`
);
}
}
this
.
initialized
=
true
;
this
.
initialized
=
true
;
...
@@ -299,7 +302,7 @@ class LocalTrainingService implements TrainingService {
...
@@ -299,7 +302,7 @@ class LocalTrainingService implements TrainingService {
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
this
.
localTrailConfig
=
<
TrialConfig
>
JSON
.
parse
(
value
);
this
.
localTrailConfig
=
<
TrialConfig
>
JSON
.
parse
(
value
);
// Parse trial config failed, throw Error
// Parse trial config failed, throw Error
if
(
!
this
.
localTrailConfig
)
{
if
(
this
.
localTrailConfig
===
undefined
)
{
throw
new
Error
(
'
trial config parsed failed
'
);
throw
new
Error
(
'
trial config parsed failed
'
);
}
}
this
.
log
.
info
(
`required GPU number is
${
this
.
localTrailConfig
.
gpuNum
}
`
);
this
.
log
.
info
(
`required GPU number is
${
this
.
localTrailConfig
.
gpuNum
}
`
);
...
@@ -336,10 +339,10 @@ class LocalTrainingService implements TrainingService {
...
@@ -336,10 +339,10 @@ class LocalTrainingService implements TrainingService {
switch
(
key
)
{
switch
(
key
)
{
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
let
getResult
:
Promise
<
string
>
;
let
getResult
:
Promise
<
string
>
;
if
(
!
this
.
localTrailConfig
)
{
if
(
this
.
localTrailConfig
===
undefined
)
{
getResult
=
Promise
.
reject
(
new
NNIError
(
NNIErrorNames
.
NOT_FOUND
,
`
${
key
}
is never set yet`
));
getResult
=
Promise
.
reject
(
new
NNIError
(
NNIErrorNames
.
NOT_FOUND
,
`
${
key
}
is never set yet`
));
}
else
{
}
else
{
getResult
=
Promise
.
resolve
(
!
this
.
localTrailConfig
?
''
:
JSON
.
stringify
(
this
.
localTrailConfig
));
getResult
=
Promise
.
resolve
(
JSON
.
stringify
(
this
.
localTrailConfig
));
}
}
return
getResult
;
return
getResult
;
...
@@ -366,7 +369,7 @@ class LocalTrainingService implements TrainingService {
...
@@ -366,7 +369,7 @@ class LocalTrainingService implements TrainingService {
if
([
'
SUCCEEDED
'
,
'
FAILED
'
,
'
USER_CANCELED
'
,
'
SYS_CANCELED
'
,
'
EARLY_STOPPED
'
].
includes
(
trialJob
.
status
))
{
if
([
'
SUCCEEDED
'
,
'
FAILED
'
,
'
USER_CANCELED
'
,
'
SYS_CANCELED
'
,
'
EARLY_STOPPED
'
].
includes
(
trialJob
.
status
))
{
if
(
this
.
jobStreamMap
.
has
(
trialJob
.
id
))
{
if
(
this
.
jobStreamMap
.
has
(
trialJob
.
id
))
{
const
stream
:
ts
.
Stream
|
undefined
=
this
.
jobStreamMap
.
get
(
trialJob
.
id
);
const
stream
:
ts
.
Stream
|
undefined
=
this
.
jobStreamMap
.
get
(
trialJob
.
id
);
if
(
!
stream
)
{
if
(
stream
===
undefined
)
{
throw
new
Error
(
`Could not find stream in trial
${
trialJob
.
id
}
`
);
throw
new
Error
(
`Could not find stream in trial
${
trialJob
.
id
}
`
);
}
}
stream
.
destroy
();
stream
.
destroy
();
...
@@ -376,13 +379,13 @@ class LocalTrainingService implements TrainingService {
...
@@ -376,13 +379,13 @@ class LocalTrainingService implements TrainingService {
if
(
trialJob
.
gpuIndices
!==
undefined
&&
trialJob
.
gpuIndices
.
length
>
0
&&
this
.
gpuScheduler
!==
undefined
)
{
if
(
trialJob
.
gpuIndices
!==
undefined
&&
trialJob
.
gpuIndices
.
length
>
0
&&
this
.
gpuScheduler
!==
undefined
)
{
if
(
oldStatus
===
'
RUNNING
'
&&
trialJob
.
status
!==
'
RUNNING
'
)
{
if
(
oldStatus
===
'
RUNNING
'
&&
trialJob
.
status
!==
'
RUNNING
'
)
{
for
(
const
index
of
trialJob
.
gpuIndices
)
{
for
(
const
index
of
trialJob
.
gpuIndices
)
{
le
t
num
:
number
|
undefined
=
this
.
occupiedGpuIndexNumMap
.
get
(
index
);
cons
t
num
:
number
|
undefined
=
this
.
occupiedGpuIndexNumMap
.
get
(
index
);
if
(
num
===
undefined
)
{
if
(
num
===
undefined
)
{
throw
new
Error
(
`gpu resource schedule error`
);
throw
new
Error
(
`gpu resource schedule error`
);
}
else
if
(
num
===
1
)
{
}
else
if
(
num
===
1
)
{
this
.
occupiedGpuIndexNumMap
.
delete
(
index
);
this
.
occupiedGpuIndexNumMap
.
delete
(
index
);
}
else
{
}
else
{
this
.
occupiedGpuIndexNumMap
.
set
(
index
,
num
-
1
)
this
.
occupiedGpuIndexNumMap
.
set
(
index
,
num
-
1
)
;
}
}
}
}
}
}
...
@@ -424,10 +427,10 @@ class LocalTrainingService implements TrainingService {
...
@@ -424,10 +427,10 @@ class LocalTrainingService implements TrainingService {
}
}
let
selectedGPUIndices
:
number
[]
=
[];
let
selectedGPUIndices
:
number
[]
=
[];
le
t
availableGpuIndices
:
number
[]
=
this
.
gpuScheduler
.
getAvailableGPUIndices
(
this
.
useActiveGpu
,
this
.
occupiedGpuIndexNumMap
);
cons
t
availableGpuIndices
:
number
[]
=
this
.
gpuScheduler
.
getAvailableGPUIndices
(
this
.
useActiveGpu
,
this
.
occupiedGpuIndexNumMap
);
for
(
le
t
index
of
availableGpuIndices
)
{
for
(
cons
t
index
of
availableGpuIndices
)
{
le
t
num
:
number
|
undefined
=
this
.
occupiedGpuIndexNumMap
.
get
(
index
);
cons
t
num
:
number
|
undefined
=
this
.
occupiedGpuIndexNumMap
.
get
(
index
);
if
(
num
===
undefined
||
num
<
this
.
maxTrialNumPerGpu
)
{
if
(
num
===
undefined
||
num
<
this
.
maxTrialNumPerGpu
)
{
selectedGPUIndices
.
push
(
index
);
selectedGPUIndices
.
push
(
index
);
}
}
}
}
...
@@ -461,11 +464,11 @@ class LocalTrainingService implements TrainingService {
...
@@ -461,11 +464,11 @@ class LocalTrainingService implements TrainingService {
private
occupyResource
(
resource
:
{
gpuIndices
:
number
[]}):
void
{
private
occupyResource
(
resource
:
{
gpuIndices
:
number
[]}):
void
{
if
(
this
.
gpuScheduler
!==
undefined
)
{
if
(
this
.
gpuScheduler
!==
undefined
)
{
for
(
const
index
of
resource
.
gpuIndices
)
{
for
(
const
index
of
resource
.
gpuIndices
)
{
le
t
num
:
number
|
undefined
=
this
.
occupiedGpuIndexNumMap
.
get
(
index
);
cons
t
num
:
number
|
undefined
=
this
.
occupiedGpuIndexNumMap
.
get
(
index
);
if
(
num
===
undefined
)
{
if
(
num
===
undefined
)
{
this
.
occupiedGpuIndexNumMap
.
set
(
index
,
1
)
this
.
occupiedGpuIndexNumMap
.
set
(
index
,
1
)
;
}
else
{
}
else
{
this
.
occupiedGpuIndexNumMap
.
set
(
index
,
num
+
1
)
this
.
occupiedGpuIndexNumMap
.
set
(
index
,
num
+
1
)
;
}
}
}
}
}
}
...
@@ -498,20 +501,20 @@ class LocalTrainingService implements TrainingService {
...
@@ -498,20 +501,20 @@ class LocalTrainingService implements TrainingService {
}
}
}
}
private
getScript
(
localTrailConfig
:
TrialConfig
,
workingDirectory
:
string
):
string
[]{
private
getScript
(
localTrailConfig
:
TrialConfig
,
workingDirectory
:
string
):
string
[]
{
le
t
script
:
string
[]
=
[];
cons
t
script
:
string
[]
=
[];
if
(
process
.
platform
===
"
win32
"
)
{
if
(
process
.
platform
===
'
win32
'
)
{
script
.
push
(
script
.
push
(
`cmd /c
${
localTrailConfig
.
command
}
2>
${
path
.
join
(
workingDirectory
,
'
stderr
'
)}
`
,
`cmd /c
${
localTrailConfig
.
command
}
2>
${
path
.
join
(
workingDirectory
,
'
stderr
'
)}
`
,
`$NOW_DATE = [int64](([datetime]::UtcNow)-(get-date "1/1/1970")).TotalSeconds`
,
`$NOW_DATE = [int64](([datetime]::UtcNow)-(get-date "1/1/1970")).TotalSeconds`
,
`$NOW_DATE = "$NOW_DATE" +
"000"
`
,
`$NOW_DATE = "$NOW_DATE" +
(Get-Date -Format fff).ToString()
`
,
`Write $LASTEXITCODE " " $NOW_DATE | Out-File
${
path
.
join
(
workingDirectory
,
'
.nni
'
,
'
state
'
)}
-NoNewline -encoding utf8`
);
`Write $LASTEXITCODE " " $NOW_DATE | Out-File
${
path
.
join
(
workingDirectory
,
'
.nni
'
,
'
state
'
)}
-NoNewline -encoding utf8`
);
}
}
else
{
else
{
script
.
push
(
script
.
push
(
`eval
${
localTrailConfig
.
command
}
2>
${
path
.
join
(
workingDirectory
,
'
stderr
'
)}
`
,
`eval
${
localTrailConfig
.
command
}
2>
${
path
.
join
(
workingDirectory
,
'
stderr
'
)}
`
,
`echo $?
\`
date +%s
000
\`
>
${
path
.
join
(
workingDirectory
,
'
.nni
'
,
'
state
'
)}
`
);
`echo $?
\`
date +%s
%3N
\`
>
${
path
.
join
(
workingDirectory
,
'
.nni
'
,
'
state
'
)}
`
);
}
}
return
script
;
return
script
;
}
}
...
@@ -519,28 +522,29 @@ class LocalTrainingService implements TrainingService {
...
@@ -519,28 +522,29 @@ class LocalTrainingService implements TrainingService {
const
trialJobDetail
:
LocalTrialJobDetail
=
<
LocalTrialJobDetail
>
this
.
jobMap
.
get
(
trialJobId
);
const
trialJobDetail
:
LocalTrialJobDetail
=
<
LocalTrialJobDetail
>
this
.
jobMap
.
get
(
trialJobId
);
const
variables
:
{
key
:
string
;
value
:
string
}[]
=
this
.
getEnvironmentVariables
(
trialJobDetail
,
resource
);
const
variables
:
{
key
:
string
;
value
:
string
}[]
=
this
.
getEnvironmentVariables
(
trialJobDetail
,
resource
);
if
(
!
this
.
localTrailConfig
)
{
if
(
this
.
localTrailConfig
===
undefined
)
{
throw
new
Error
(
'
trial config is not initialized
'
);
throw
new
Error
(
'
trial config is not initialized
'
);
}
}
const
runScript
Lines
:
string
[]
=
[];
const
runScript
Content
:
string
[]
=
[];
if
(
process
.
platform
!==
"
win32
"
)
{
if
(
process
.
platform
!==
'
win32
'
)
{
runScript
Lines
.
push
(
'
#!/bin/bash
'
);
runScript
Content
.
push
(
'
#!/bin/bash
'
);
}
}
runScript
Lines
.
push
(
`cd
${
this
.
localTrailConfig
.
codeDir
}
`
);
runScript
Content
.
push
(
`cd
${
this
.
localTrailConfig
.
codeDir
}
`
);
for
(
const
variable
of
variables
)
{
for
(
const
variable
of
variables
)
{
runScript
Lines
.
push
(
setEnvironmentVariable
(
variable
));
runScript
Content
.
push
(
setEnvironmentVariable
(
variable
));
}
}
const
scripts
:
string
[]
=
this
.
getScript
(
this
.
localTrailConfig
,
trialJobDetail
.
workingDirectory
);
const
scripts
:
string
[]
=
this
.
getScript
(
this
.
localTrailConfig
,
trialJobDetail
.
workingDirectory
);
scripts
.
forEach
(
script
=>
{
scripts
.
forEach
(
(
script
:
string
)
=>
{
runScript
Lines
.
push
(
script
);
runScript
Content
.
push
(
script
);
});
});
await
execMkdir
(
trialJobDetail
.
workingDirectory
);
await
execMkdir
(
trialJobDetail
.
workingDirectory
);
await
execMkdir
(
path
.
join
(
trialJobDetail
.
workingDirectory
,
'
.nni
'
));
await
execMkdir
(
path
.
join
(
trialJobDetail
.
workingDirectory
,
'
.nni
'
));
await
execNewFile
(
path
.
join
(
trialJobDetail
.
workingDirectory
,
'
.nni
'
,
'
metrics
'
));
await
execNewFile
(
path
.
join
(
trialJobDetail
.
workingDirectory
,
'
.nni
'
,
'
metrics
'
));
const
scriptName
:
string
=
getScriptName
(
'
run
'
);
const
scriptName
:
string
=
getScriptName
(
'
run
'
);
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialJobDetail
.
workingDirectory
,
scriptName
),
runScriptLines
.
join
(
getNewLine
()),
{
encoding
:
'
utf8
'
,
mode
:
0o777
});
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialJobDetail
.
workingDirectory
,
scriptName
),
runScriptContent
.
join
(
getNewLine
()),
{
encoding
:
'
utf8
'
,
mode
:
0o777
});
await
this
.
writeParameterFile
(
trialJobDetail
.
workingDirectory
,
(
<
TrialJobApplicationForm
>
trialJobDetail
.
form
).
hyperParameters
);
await
this
.
writeParameterFile
(
trialJobDetail
.
workingDirectory
,
(
<
TrialJobApplicationForm
>
trialJobDetail
.
form
).
hyperParameters
);
const
trialJobProcess
:
cp
.
ChildProcess
=
exec
Script
(
path
.
join
(
trialJobDetail
.
workingDirectory
,
scriptName
));
const
trialJobProcess
:
cp
.
ChildProcess
=
run
Script
(
path
.
join
(
trialJobDetail
.
workingDirectory
,
scriptName
));
this
.
setTrialJobStatus
(
trialJobDetail
,
'
RUNNING
'
);
this
.
setTrialJobStatus
(
trialJobDetail
,
'
RUNNING
'
);
trialJobDetail
.
startTime
=
Date
.
now
();
trialJobDetail
.
startTime
=
Date
.
now
();
trialJobDetail
.
pid
=
trialJobProcess
.
pid
;
trialJobDetail
.
pid
=
trialJobProcess
.
pid
;
...
...
src/nni_manager/training_service/pai/hdfsClientUtility.ts
View file @
ba8dccd6
...
@@ -17,12 +17,12 @@
...
@@ -17,12 +17,12 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
*/
import
*
as
path
from
'
path
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
path
from
'
path
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
getExperimentId
}
from
'
../../common/experimentStartupInfo
'
;
import
{
getExperimentId
}
from
'
../../common/experimentStartupInfo
'
;
import
{
getLogger
}
from
'
../../common/log
'
;
import
{
getLogger
}
from
'
../../common/log
'
;
import
{
unixPathJoin
}
from
'
../../common/utils
'
import
{
unixPathJoin
}
from
'
../../common/utils
'
;
/**
/**
* HDFS client utility, including copy file/directory
* HDFS client utility, including copy file/directory
...
@@ -33,6 +33,7 @@ export namespace HDFSClientUtility {
...
@@ -33,6 +33,7 @@ export namespace HDFSClientUtility {
* @param hdfsUserName HDFS user name
* @param hdfsUserName HDFS user name
*/
*/
function
hdfsExpRootDir
(
hdfsUserName
:
string
):
string
{
function
hdfsExpRootDir
(
hdfsUserName
:
string
):
string
{
// tslint:disable-next-line:prefer-template
return
'
/
'
+
unixPathJoin
(
hdfsUserName
,
'
nni
'
,
'
experiments
'
,
getExperimentId
());
return
'
/
'
+
unixPathJoin
(
hdfsUserName
,
'
nni
'
,
'
experiments
'
,
getExperimentId
());
}
}
...
@@ -50,63 +51,70 @@ export namespace HDFSClientUtility {
...
@@ -50,63 +51,70 @@ export namespace HDFSClientUtility {
* @param trialId NNI trial ID
* @param trialId NNI trial ID
*/
*/
export
function
getHdfsTrialWorkDir
(
hdfsUserName
:
string
,
trialId
:
string
):
string
{
export
function
getHdfsTrialWorkDir
(
hdfsUserName
:
string
,
trialId
:
string
):
string
{
le
t
root
=
hdfsExpRootDir
(
hdfsUserName
)
cons
t
root
:
string
=
hdfsExpRootDir
(
hdfsUserName
)
;
console
.
log
(
root
)
return
unixPathJoin
(
root
,
'
trials
'
,
trialId
);
return
unixPathJoin
(
root
,
'
trials
'
,
trialId
);
}
}
/**
/**
* Copy a local file to hdfs directory
* Copy a local file to hdfs directory
*
*
* @param localFilePath local file path(source)
* @param localFilePath local file path(source)
* @param hdfsFilePath hdfs file path(target)
* @param hdfsFilePath hdfs file path(target)
* @param hdfsClient hdfs client
* @param hdfsClient hdfs client
*/
*/
// tslint:disable: no-unsafe-any non-literal-fs-path no-any
export
async
function
copyFileToHdfs
(
localFilePath
:
string
,
hdfsFilePath
:
string
,
hdfsClient
:
any
)
:
Promise
<
void
>
{
export
async
function
copyFileToHdfs
(
localFilePath
:
string
,
hdfsFilePath
:
string
,
hdfsClient
:
any
)
:
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
// tslint:disable-next-line:non-literal-fs-path
fs
.
exists
(
localFilePath
,
(
exists
:
boolean
)
=>
{
fs
.
exists
(
localFilePath
,
(
exists
:
boolean
)
=>
{
// Detect if local file exist
// Detect if local file exist
if
(
exists
)
{
if
(
exists
)
{
var
localFileStream
=
fs
.
createReadStream
(
localFilePath
);
const
localFileStream
:
fs
.
ReadStream
=
fs
.
createReadStream
(
localFilePath
);
var
hdfsFileStream
=
hdfsClient
.
createWriteStream
(
hdfsFilePath
);
const
hdfsFileStream
:
any
=
hdfsClient
.
createWriteStream
(
hdfsFilePath
);
localFileStream
.
pipe
(
hdfsFileStream
);
localFileStream
.
pipe
(
hdfsFileStream
);
hdfsFileStream
.
on
(
'
finish
'
,
function
onFinish
()
{
hdfsFileStream
.
on
(
'
finish
'
,
()
=>
{
deferred
.
resolve
();
deferred
.
resolve
();
});
});
hdfsFileStream
.
on
(
'
error
'
,
(
err
:
any
)
=>
{
hdfsFileStream
.
on
(
'
error
'
,
(
err
:
any
)
=>
{
getLogger
().
error
(
`HDFSCientUtility:copyFileToHdfs, copy file failed, err is
${
err
.
message
}
`
);
getLogger
()
.
error
(
`HDFSCientUtility:copyFileToHdfs, copy file failed, err is
${
err
.
message
}
`
);
deferred
.
reject
(
err
);
deferred
.
reject
(
err
);
});
});
}
else
{
}
else
{
getLogger
().
error
(
`HDFSCientUtility:copyFileToHdfs,
${
localFilePath
}
doesn't exist locally`
);
getLogger
()
.
error
(
`HDFSCientUtility:copyFileToHdfs,
${
localFilePath
}
doesn't exist locally`
);
deferred
.
reject
(
'
file not exist!
'
);
deferred
.
reject
(
'
file not exist!
'
);
}
}
});
});
return
deferred
.
promise
;
return
deferred
.
promise
;
}
}
/**
/**
* Recursively copy local directory to hdfs directory
* Recursively copy local directory to hdfs directory
*
*
* @param localDirectory local directory
* @param localDirectory local directory
* @param hdfsDirectory HDFS directory
* @param hdfsDirectory HDFS directory
* @param hdfsClient HDFS client
* @param hdfsClient HDFS client
*/
*/
export
async
function
copyDirectoryToHdfs
(
localDirectory
:
string
,
hdfsDirectory
:
string
,
hdfsClient
:
any
)
:
Promise
<
void
>
{
export
async
function
copyDirectoryToHdfs
(
localDirectory
:
string
,
hdfsDirectory
:
string
,
hdfsClient
:
any
)
:
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
// TODO: fs.readdirSync doesn't support ~($HOME)
// TODO: fs.readdirSync doesn't support ~($HOME)
const
fileNameArray
:
string
[]
=
fs
.
readdirSync
(
localDirectory
);
const
fileNameArray
:
string
[]
=
fs
.
readdirSync
(
localDirectory
);
for
(
var
fileName
of
fileNameArray
){
for
(
const
fileName
of
fileNameArray
)
{
const
fullFilePath
:
string
=
path
.
join
(
localDirectory
,
fileName
);
const
fullFilePath
:
string
=
path
.
join
(
localDirectory
,
fileName
);
try
{
try
{
if
(
fs
.
lstatSync
(
fullFilePath
).
isFile
())
{
// tslint:disable-next-line:non-literal-fs-path
if
(
fs
.
lstatSync
(
fullFilePath
)
.
isFile
())
{
await
copyFileToHdfs
(
fullFilePath
,
path
.
join
(
hdfsDirectory
,
fileName
),
hdfsClient
);
await
copyFileToHdfs
(
fullFilePath
,
path
.
join
(
hdfsDirectory
,
fileName
),
hdfsClient
);
}
else
{
}
else
{
// If filePath is a directory, recuisively copy it to remote directory
// If filePath is a directory, recuisively copy it to remote directory
await
copyDirectoryToHdfs
(
fullFilePath
,
path
.
join
(
hdfsDirectory
,
fileName
),
hdfsClient
);
await
copyDirectoryToHdfs
(
fullFilePath
,
path
.
join
(
hdfsDirectory
,
fileName
),
hdfsClient
);
}
}
}
catch
(
error
)
{
}
catch
(
error
)
{
deferred
.
reject
(
error
);
deferred
.
reject
(
error
);
}
}
}
}
...
@@ -118,20 +126,20 @@ export namespace HDFSClientUtility {
...
@@ -118,20 +126,20 @@ export namespace HDFSClientUtility {
/**
/**
* Read content from HDFS file
* Read content from HDFS file
*
*
* @param hdfsPath HDFS file path
* @param hdfsPath HDFS file path
* @param hdfsClient HDFS client
* @param hdfsClient HDFS client
*/
*/
export
async
function
readFileFromHDFS
(
hdfsPath
:
string
,
hdfsClient
:
any
)
:
Promise
<
Buffer
>
{
export
async
function
readFileFromHDFS
(
hdfsPath
:
string
,
hdfsClient
:
any
)
:
Promise
<
Buffer
>
{
const
deferred
:
Deferred
<
Buffer
>
=
new
Deferred
<
Buffer
>
();
const
deferred
:
Deferred
<
Buffer
>
=
new
Deferred
<
Buffer
>
();
let
buffer
:
Buffer
=
Buffer
.
alloc
(
0
);
let
buffer
:
Buffer
=
Buffer
.
alloc
(
0
);
const
exist
:
boolean
=
await
pathExists
(
hdfsPath
,
hdfsClient
);
const
exist
:
boolean
=
await
pathExists
(
hdfsPath
,
hdfsClient
);
if
(
!
exist
)
{
if
(
!
exist
)
{
deferred
.
reject
(
`
${
hdfsPath
}
doesn't exists`
);
deferred
.
reject
(
`
${
hdfsPath
}
doesn't exists`
);
}
}
const
remoteFileStream
=
hdfsClient
.
createReadStream
(
hdfsPath
);
const
remoteFileStream
:
any
=
hdfsClient
.
createReadStream
(
hdfsPath
);
remoteFileStream
.
on
(
'
error
'
,
(
err
:
any
)
=>
{
remoteFileStream
.
on
(
'
error
'
,
(
err
:
any
)
=>
{
// Reject with the error
// Reject with the error
deferred
.
reject
(
err
);
deferred
.
reject
(
err
);
...
@@ -141,8 +149,8 @@ export namespace HDFSClientUtility {
...
@@ -141,8 +149,8 @@ export namespace HDFSClientUtility {
// Concat the data chunk to buffer
// Concat the data chunk to buffer
buffer
=
Buffer
.
concat
([
buffer
,
chunk
]);
buffer
=
Buffer
.
concat
([
buffer
,
chunk
]);
});
});
remoteFileStream
.
on
(
'
finish
'
,
function
onFinish
()
{
remoteFileStream
.
on
(
'
finish
'
,
()
=>
{
// Upload is done, resolve
// Upload is done, resolve
deferred
.
resolve
(
buffer
);
deferred
.
resolve
(
buffer
);
});
});
...
@@ -152,36 +160,38 @@ export namespace HDFSClientUtility {
...
@@ -152,36 +160,38 @@ export namespace HDFSClientUtility {
/**
/**
* Check if an HDFS path already exists
* Check if an HDFS path already exists
*
*
* @param hdfsPath target path need to check in HDFS
* @param hdfsPath target path need to check in HDFS
* @param hdfsClient HDFS client
* @param hdfsClient HDFS client
*/
*/
export
async
function
pathExists
(
hdfsPath
:
string
,
hdfsClient
:
any
)
:
Promise
<
boolean
>
{
export
async
function
pathExists
(
hdfsPath
:
string
,
hdfsClient
:
any
)
:
Promise
<
boolean
>
{
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
hdfsClient
.
exists
(
hdfsPath
,
(
exist
:
boolean
)
=>
{
hdfsClient
.
exists
(
hdfsPath
,
(
exist
:
boolean
)
=>
{
deferred
.
resolve
(
exist
);
deferred
.
resolve
(
exist
);
});
});
let
timeoutId
:
NodeJS
.
Timer
let
timeoutId
:
NodeJS
.
Timer
;
const
delayTimeout
:
Promise
<
boolean
>
=
new
Promise
<
boolean
>
((
resolve
:
Function
,
reject
:
Function
)
:
void
=>
{
const
delayTimeout
:
Promise
<
boolean
>
=
new
Promise
<
boolean
>
((
resolve
:
Function
,
reject
:
Function
)
:
void
=>
{
// Set timeout and reject the promise once reach timeout (5 seconds)
// Set timeout and reject the promise once reach timeout (5 seconds)
timeoutId
=
setTimeout
(()
=>
deferred
.
reject
(
`Check HDFS path
${
hdfsPath
}
exists timeout`
),
5000
);
timeoutId
=
setTimeout
(()
=>
{
reject
(
`Check HDFS path
${
hdfsPath
}
exists timeout`
)
;
}
,
5000
);
});
});
return
Promise
.
race
([
deferred
.
promise
,
delayTimeout
]).
finally
(()
=>
clearTimeout
(
timeoutId
));
return
Promise
.
race
([
deferred
.
promise
,
delayTimeout
])
.
finally
(()
=>
{
clearTimeout
(
timeoutId
);
});
}
}
/**
/**
* Mkdir in HDFS, use default permission 755
* Mkdir in HDFS, use default permission 755
*
*
* @param hdfsPath the path in HDFS. It could be either file or directory
* @param hdfsPath the path in HDFS. It could be either file or directory
* @param hdfsClient
* @param hdfsClient
HDFS client
*/
*/
export
function
mkdir
(
hdfsPath
:
string
,
hdfsClient
:
any
)
:
Promise
<
boolean
>
{
export
function
mkdir
(
hdfsPath
:
string
,
hdfsClient
:
any
)
:
Promise
<
boolean
>
{
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
hdfsClient
.
mkdir
(
hdfsPath
,
(
err
:
any
)
=>
{
hdfsClient
.
mkdir
(
hdfsPath
,
(
err
:
any
)
=>
{
if
(
!
err
)
{
if
(
!
err
)
{
deferred
.
resolve
(
true
);
deferred
.
resolve
(
true
);
}
else
{
}
else
{
deferred
.
reject
(
err
.
message
);
deferred
.
reject
(
err
.
message
);
...
@@ -193,19 +203,19 @@ export namespace HDFSClientUtility {
...
@@ -193,19 +203,19 @@ export namespace HDFSClientUtility {
/**
/**
* Read directory contents
* Read directory contents
*
*
* @param hdfsPath the path in HDFS. It could be either file or directory
* @param hdfsPath the path in HDFS. It could be either file or directory
* @param hdfsClient
* @param hdfsClient
HDFS client
*/
*/
export
async
function
readdir
(
hdfsPath
:
string
,
hdfsClient
:
any
)
:
Promise
<
string
[]
>
{
export
async
function
readdir
(
hdfsPath
:
string
,
hdfsClient
:
any
)
:
Promise
<
string
[]
>
{
const
deferred
:
Deferred
<
string
[]
>
=
new
Deferred
<
string
[]
>
();
const
deferred
:
Deferred
<
string
[]
>
=
new
Deferred
<
string
[]
>
();
const
exist
:
boolean
=
await
pathExists
(
hdfsPath
,
hdfsClient
);
const
exist
:
boolean
=
await
pathExists
(
hdfsPath
,
hdfsClient
);
if
(
!
exist
)
{
if
(
!
exist
)
{
deferred
.
reject
(
`
${
hdfsPath
}
doesn't exists`
);
deferred
.
reject
(
`
${
hdfsPath
}
doesn't exists`
);
}
}
hdfsClient
.
readdir
(
hdfsPath
,
(
err
:
any
,
files
:
any
[]
)
=>
{
hdfsClient
.
readdir
(
hdfsPath
,
(
err
:
any
,
files
:
any
[])
=>
{
if
(
err
)
{
if
(
err
)
{
deferred
.
reject
(
err
);
deferred
.
reject
(
err
);
}
}
...
@@ -218,18 +228,20 @@ export namespace HDFSClientUtility {
...
@@ -218,18 +228,20 @@ export namespace HDFSClientUtility {
/**
/**
* Delete HDFS path
* Delete HDFS path
* @param hdfsPath the path in HDFS. It could be either file or directory
* @param hdfsPath the path in HDFS. It could be either file or directory
* @param hdfsClient
* @param hdfsClient
HDFS client
* @param recursive Mark if need to delete recursively
* @param recursive Mark if need to delete recursively
*/
*/
export
function
deletePath
(
hdfsPath
:
string
,
hdfsClient
:
any
,
recursive
:
boolean
=
true
)
:
Promise
<
boolean
>
{
export
function
deletePath
(
hdfsPath
:
string
,
hdfsClient
:
any
,
recursive
:
boolean
=
true
)
:
Promise
<
boolean
>
{
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
hdfsClient
.
unlink
(
hdfsPath
,
recursive
,
(
err
:
any
)
=>
{
hdfsClient
.
unlink
(
hdfsPath
,
recursive
,
(
err
:
any
)
=>
{
if
(
!
err
)
{
if
(
!
err
)
{
deferred
.
resolve
(
true
);
deferred
.
resolve
(
true
);
}
else
{
}
else
{
deferred
.
reject
(
err
.
message
);
deferred
.
reject
(
err
.
message
);
}
}
});
});
return
deferred
.
promise
;
return
deferred
.
promise
;
}
}
// tslint:enable: no-unsafe-any non-literal-fs-path no-any
}
}
src/nni_manager/training_service/pai/paiConfig.ts
View file @
ba8dccd6
...
@@ -19,8 +19,11 @@
...
@@ -19,8 +19,11 @@
'
use strict
'
;
'
use strict
'
;
import
{
TrialConfig
}
from
'
../common/trialConfig
'
import
{
TrialConfig
}
from
'
../common/trialConfig
'
;
/**
* Task role for PAI
*/
export
class
PAITaskRole
{
export
class
PAITaskRole
{
// Name for the task role
// Name for the task role
public
readonly
name
:
string
;
public
readonly
name
:
string
;
...
@@ -36,7 +39,7 @@ export class PAITaskRole {
...
@@ -36,7 +39,7 @@ export class PAITaskRole {
public
readonly
command
:
string
;
public
readonly
command
:
string
;
//Shared memory for one task in the task role
//Shared memory for one task in the task role
public
readonly
shmMB
?:
number
;
public
readonly
shmMB
?:
number
;
/**
/**
* Constructor
* Constructor
* @param name Name for the task role
* @param name Name for the task role
...
@@ -46,18 +49,22 @@ export class PAITaskRole {
...
@@ -46,18 +49,22 @@ export class PAITaskRole {
* @param gpuNumber GPU number for one task in the task role, no less than 0
* @param gpuNumber GPU number for one task in the task role, no less than 0
* @param command Executable command for tasks in the task role, can not be empty
* @param command Executable command for tasks in the task role, can not be empty
*/
*/
constructor
(
name
:
string
,
taskNumber
:
number
,
cpuNumber
:
number
,
memoryMB
:
number
,
gpuNumber
:
number
,
command
:
string
,
shmMB
?:
number
)
{
constructor
(
name
:
string
,
taskNumber
:
number
,
cpuNumber
:
number
,
memoryMB
:
number
,
gpuNumber
:
number
,
command
:
string
,
shmMB
?:
number
)
{
this
.
name
=
name
;
this
.
name
=
name
;
this
.
taskNumber
=
taskNumber
;
this
.
taskNumber
=
taskNumber
;
this
.
cpuNumber
=
cpuNumber
;
this
.
cpuNumber
=
cpuNumber
;
this
.
memoryMB
=
memoryMB
;
this
.
memoryMB
=
memoryMB
;
this
.
gpuNumber
=
gpuNumber
;
this
.
gpuNumber
=
gpuNumber
;
this
.
command
=
command
;
this
.
command
=
command
;
this
.
shmMB
=
shmMB
;
this
.
shmMB
=
shmMB
;
}
}
}
}
export
class
PAIJobConfig
{
/**
* Trial job configuration submitted to PAI
*/
export
class
PAIJobConfig
{
// Name for the job, need to be unique
// Name for the job, need to be unique
public
readonly
jobName
:
string
;
public
readonly
jobName
:
string
;
// URL pointing to the Docker image for all tasks in the job
// URL pointing to the Docker image for all tasks in the job
...
@@ -83,8 +90,8 @@ export class PAIJobConfig{
...
@@ -83,8 +90,8 @@ export class PAIJobConfig{
* @param outputDir Output directory on HDFS
* @param outputDir Output directory on HDFS
* @param taskRoles List of taskRole, one task role at least
* @param taskRoles List of taskRole, one task role at least
*/
*/
constructor
(
jobName
:
string
,
image
:
string
,
dataDir
:
string
,
outputDir
:
string
,
codeDir
:
string
,
constructor
(
jobName
:
string
,
image
:
string
,
dataDir
:
string
,
outputDir
:
string
,
codeDir
:
string
,
taskRoles
:
PAITaskRole
[],
virtualCluster
:
string
)
{
taskRoles
:
PAITaskRole
[],
virtualCluster
:
string
)
{
this
.
jobName
=
jobName
;
this
.
jobName
=
jobName
;
this
.
image
=
image
;
this
.
image
=
image
;
this
.
dataDir
=
dataDir
;
this
.
dataDir
=
dataDir
;
...
@@ -95,6 +102,9 @@ export class PAIJobConfig{
...
@@ -95,6 +102,9 @@ export class PAIJobConfig{
}
}
}
}
/**
* PAI cluster configuration
*/
export
class
PAIClusterConfig
{
export
class
PAIClusterConfig
{
public
readonly
userName
:
string
;
public
readonly
userName
:
string
;
public
readonly
passWord
:
string
;
public
readonly
passWord
:
string
;
...
@@ -106,18 +116,21 @@ export class PAIClusterConfig {
...
@@ -106,18 +116,21 @@ export class PAIClusterConfig {
* @param passWord password of PAI Cluster
* @param passWord password of PAI Cluster
* @param host Host IP of PAI Cluster
* @param host Host IP of PAI Cluster
*/
*/
constructor
(
userName
:
string
,
passWord
:
string
,
host
:
string
){
constructor
(
userName
:
string
,
passWord
:
string
,
host
:
string
)
{
this
.
userName
=
userName
;
this
.
userName
=
userName
;
this
.
passWord
=
passWord
;
this
.
passWord
=
passWord
;
this
.
host
=
host
;
this
.
host
=
host
;
}
}
}
}
export
class
NNIPAITrialConfig
extends
TrialConfig
{
/**
* PAI trial configuration
*/
export
class
NNIPAITrialConfig
extends
TrialConfig
{
public
readonly
cpuNum
:
number
;
public
readonly
cpuNum
:
number
;
public
readonly
memoryMB
:
number
;
public
readonly
memoryMB
:
number
;
public
readonly
image
:
string
;
public
readonly
image
:
string
;
public
readonly
dataDir
:
string
;
public
readonly
dataDir
:
string
;
public
outputDir
:
string
;
public
outputDir
:
string
;
//The virtual cluster job runs on. If omitted, the job will run on default virtual cluster
//The virtual cluster job runs on. If omitted, the job will run on default virtual cluster
...
@@ -125,8 +138,8 @@ export class NNIPAITrialConfig extends TrialConfig{
...
@@ -125,8 +138,8 @@ export class NNIPAITrialConfig extends TrialConfig{
//Shared memory for one task in the task role
//Shared memory for one task in the task role
public
shmMB
?:
number
;
public
shmMB
?:
number
;
constructor
(
command
:
string
,
codeDir
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
constructor
(
command
:
string
,
codeDir
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
,
dataDir
:
string
,
outputDir
:
string
,
virtualCluster
?:
string
,
shmMB
?:
number
)
{
image
:
string
,
dataDir
:
string
,
outputDir
:
string
,
virtualCluster
?:
string
,
shmMB
?:
number
)
{
super
(
command
,
codeDir
,
gpuNum
);
super
(
command
,
codeDir
,
gpuNum
);
this
.
cpuNum
=
cpuNum
;
this
.
cpuNum
=
cpuNum
;
this
.
memoryMB
=
memoryMB
;
this
.
memoryMB
=
memoryMB
;
...
@@ -137,4 +150,3 @@ export class NNIPAITrialConfig extends TrialConfig{
...
@@ -137,4 +150,3 @@ export class NNIPAITrialConfig extends TrialConfig{
this
.
shmMB
=
shmMB
;
this
.
shmMB
=
shmMB
;
}
}
}
}
src/nni_manager/training_service/pai/paiData.ts
View file @
ba8dccd6
...
@@ -19,8 +19,11 @@
...
@@ -19,8 +19,11 @@
'
use strict
'
;
'
use strict
'
;
import
{
JobApplicationForm
,
TrialJobDetail
,
TrialJobStatus
}
from
'
common/trainingService
'
;
import
{
JobApplicationForm
,
TrialJobDetail
,
TrialJobStatus
}
from
'
../../
common/trainingService
'
;
/**
* PAI trial job detail
*/
export
class
PAITrialJobDetail
implements
TrialJobDetail
{
export
class
PAITrialJobDetail
implements
TrialJobDetail
{
public
id
:
string
;
public
id
:
string
;
public
status
:
TrialJobStatus
;
public
status
:
TrialJobStatus
;
...
@@ -36,8 +39,8 @@ export class PAITrialJobDetail implements TrialJobDetail {
...
@@ -36,8 +39,8 @@ export class PAITrialJobDetail implements TrialJobDetail {
public
hdfsLogPath
:
string
;
public
hdfsLogPath
:
string
;
public
isEarlyStopped
?:
boolean
;
public
isEarlyStopped
?:
boolean
;
constructor
(
id
:
string
,
status
:
TrialJobStatus
,
paiJobName
:
string
,
constructor
(
id
:
string
,
status
:
TrialJobStatus
,
paiJobName
:
string
,
submitTime
:
number
,
workingDirectory
:
string
,
form
:
JobApplicationForm
,
sequenceId
:
number
,
hdfsLogPath
:
string
)
{
submitTime
:
number
,
workingDirectory
:
string
,
form
:
JobApplicationForm
,
sequenceId
:
number
,
hdfsLogPath
:
string
)
{
this
.
id
=
id
;
this
.
id
=
id
;
this
.
status
=
status
;
this
.
status
=
status
;
this
.
paiJobName
=
paiJobName
;
this
.
paiJobName
=
paiJobName
;
...
@@ -50,7 +53,7 @@ export class PAITrialJobDetail implements TrialJobDetail {
...
@@ -50,7 +53,7 @@ export class PAITrialJobDetail implements TrialJobDetail {
}
}
}
}
export
const
PAI_INSTALL_NNI_SHELL_FORMAT
:
string
=
export
const
PAI_INSTALL_NNI_SHELL_FORMAT
:
string
=
`#!/bin/bash
`#!/bin/bash
if python3 -c 'import nni' > /dev/null 2>&1; then
if python3 -c 'import nni' > /dev/null 2>&1; then
# nni module is already installed, skip
# nni module is already installed, skip
...
@@ -61,13 +64,15 @@ else
...
@@ -61,13 +64,15 @@ else
fi`
;
fi`
;
export
const
PAI_TRIAL_COMMAND_FORMAT
:
string
=
export
const
PAI_TRIAL_COMMAND_FORMAT
:
string
=
`export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4}
`export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4} \
&& cd $NNI_SYS_DIR && sh install_nni.sh
&& cd $NNI_SYS_DIR && sh install_nni.sh \
&& python3 -m nni_trial_tool.trial_keeper --trial_command '{5}' --nnimanager_ip '{6}' --nnimanager_port '{7}'
&& python3 -m nni_trial_tool.trial_keeper --trial_command '{5}' --nnimanager_ip '{6}' --nnimanager_port '{7}' \
--pai_hdfs_output_dir '{8}' --pai_hdfs_host '{9}' --pai_user_name {10} --nni_hdfs_exp_dir '{11}' --webhdfs_path '/webhdfs/api/v1' --nni_manager_version '{12}' --log_collection '{13}'`
;
--pai_hdfs_output_dir '{8}' --pai_hdfs_host '{9}' --pai_user_name {10} --nni_hdfs_exp_dir '{11}' --webhdfs_path '/webhdfs/api/v1' \
--nni_manager_version '{12}' --log_collection '{13}'`
;
export
const
PAI_OUTPUT_DIR_FORMAT
:
string
=
export
const
PAI_OUTPUT_DIR_FORMAT
:
string
=
`hdfs://{0}:9000/`
;
`hdfs://{0}:9000/`
;
export
const
PAI_LOG_PATH_FORMAT
:
string
=
// tslint:disable:no-http-string
`http://{0}/webhdfs/explorer.html#{1}`
export
const
PAI_LOG_PATH_FORMAT
:
string
=
`http://{0}/webhdfs/explorer.html#{1}`
;
src/nni_manager/training_service/pai/paiJobInfoCollector.ts
View file @
ba8dccd6
...
@@ -19,13 +19,14 @@
...
@@ -19,13 +19,14 @@
'
use strict
'
;
'
use strict
'
;
// tslint:disable-next-line:no-implicit-dependencies
import
*
as
request
from
'
request
'
;
import
*
as
request
from
'
request
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
NNIError
,
NNIErrorNames
}
from
'
../../common/errors
'
;
import
{
NNIError
,
NNIErrorNames
}
from
'
../../common/errors
'
;
import
{
PAITrialJobDetail
}
from
'
./paiData
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
PAIClusterConfig
}
from
'
./paiConfig
'
;
import
{
TrialJobStatus
}
from
'
../../common/trainingService
'
;
import
{
TrialJobStatus
}
from
'
../../common/trainingService
'
;
import
{
PAIClusterConfig
}
from
'
./paiConfig
'
;
import
{
PAITrialJobDetail
}
from
'
./paiData
'
;
/**
/**
* Collector PAI jobs info from PAI cluster, and update pai job status locally
* Collector PAI jobs info from PAI cluster, and update pai job status locally
...
@@ -43,60 +44,65 @@ export class PAIJobInfoCollector {
...
@@ -43,60 +44,65 @@ export class PAIJobInfoCollector {
}
}
public
async
retrieveTrialStatus
(
paiToken
?
:
string
,
paiClusterConfig
?:
PAIClusterConfig
)
:
Promise
<
void
>
{
public
async
retrieveTrialStatus
(
paiToken
?
:
string
,
paiClusterConfig
?:
PAIClusterConfig
)
:
Promise
<
void
>
{
if
(
!
paiClusterConfig
||
!
paiToken
)
{
if
(
paiClusterConfig
===
undefined
||
paiToken
===
undefined
)
{
return
Promise
.
resolve
();
return
Promise
.
resolve
();
}
}
const
updatePaiTrialJobs
:
Promise
<
void
>
[]
=
[];
const
updatePaiTrialJobs
:
Promise
<
void
>
[]
=
[];
for
(
le
t
[
trialJobId
,
paiTrialJob
]
of
this
.
trialJobsMap
)
{
for
(
cons
t
[
trialJobId
,
paiTrialJob
]
of
this
.
trialJobsMap
)
{
if
(
!
paiTrialJob
)
{
if
(
paiTrialJob
===
undefined
)
{
throw
new
NNIError
(
NNIErrorNames
.
NOT_FOUND
,
`trial job id
${
trialJobId
}
not found`
);
throw
new
NNIError
(
NNIErrorNames
.
NOT_FOUND
,
`trial job id
${
trialJobId
}
not found`
);
}
}
updatePaiTrialJobs
.
push
(
this
.
getSinglePAITrialJobInfo
(
paiTrialJob
,
paiToken
,
paiClusterConfig
))
updatePaiTrialJobs
.
push
(
this
.
getSinglePAITrialJobInfo
(
paiTrialJob
,
paiToken
,
paiClusterConfig
))
;
}
}
await
Promise
.
all
(
updatePaiTrialJobs
);
await
Promise
.
all
(
updatePaiTrialJobs
);
}
}
private
getSinglePAITrialJobInfo
(
paiTrialJob
:
PAITrialJobDetail
,
paiToken
:
string
,
paiClusterConfig
:
PAIClusterConfig
)
:
Promise
<
void
>
{
private
getSinglePAITrialJobInfo
(
paiTrialJob
:
PAITrialJobDetail
,
paiToken
:
string
,
paiClusterConfig
:
PAIClusterConfig
)
:
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
if
(
!
this
.
statusesNeedToCheck
.
includes
(
paiTrialJob
.
status
))
{
if
(
!
this
.
statusesNeedToCheck
.
includes
(
paiTrialJob
.
status
))
{
deferred
.
resolve
();
deferred
.
resolve
();
return
deferred
.
promise
;
return
deferred
.
promise
;
}
}
// Rest call to get PAI job info and update status
// Rest call to get PAI job info and update status
// Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API
// Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API
const
getJobInfoRequest
:
request
.
Options
=
{
const
getJobInfoRequest
:
request
.
Options
=
{
// tslint:disable-next-line:no-http-string
uri
:
`http://
${
paiClusterConfig
.
host
}
/rest-server/api/v1/user/
${
paiClusterConfig
.
userName
}
/jobs/
${
paiTrialJob
.
paiJobName
}
`
,
uri
:
`http://
${
paiClusterConfig
.
host
}
/rest-server/api/v1/user/
${
paiClusterConfig
.
userName
}
/jobs/
${
paiTrialJob
.
paiJobName
}
`
,
method
:
'
GET
'
,
method
:
'
GET
'
,
json
:
true
,
json
:
true
,
headers
:
{
headers
:
{
"
Content-Type
"
:
"
application/json
"
,
'
Content-Type
'
:
'
application/json
'
,
"
Authorization
"
:
'
Bearer
'
+
paiToken
Authorization
:
`
Bearer
${
paiToken
}
`
}
}
};
};
//TODO : pass in request timeout param?
// tslint:disable: no-unsafe-any no-any cyclomatic-complexity
//TODO : pass in request timeout param?
request
(
getJobInfoRequest
,
(
error
:
Error
,
response
:
request
.
Response
,
body
:
any
)
=>
{
request
(
getJobInfoRequest
,
(
error
:
Error
,
response
:
request
.
Response
,
body
:
any
)
=>
{
if
(
error
||
response
.
statusCode
>=
500
)
{
if
(
(
error
!==
undefined
&&
error
!==
null
)
||
response
.
statusCode
>=
500
)
{
this
.
log
.
error
(
`PAI Training service: get job info for trial
${
paiTrialJob
.
id
}
from PAI Cluster failed!`
);
this
.
log
.
error
(
`PAI Training service: get job info for trial
${
paiTrialJob
.
id
}
from PAI Cluster failed!`
);
// Queried PAI job info failed, set job status to UNKNOWN
// Queried PAI job info failed, set job status to UNKNOWN
if
(
paiTrialJob
.
status
===
'
WAITING
'
||
paiTrialJob
.
status
===
'
RUNNING
'
)
{
if
(
paiTrialJob
.
status
===
'
WAITING
'
||
paiTrialJob
.
status
===
'
RUNNING
'
)
{
paiTrialJob
.
status
=
'
UNKNOWN
'
;
paiTrialJob
.
status
=
'
UNKNOWN
'
;
}
}
}
else
{
}
else
{
if
(
response
.
body
.
jobStatus
&&
response
.
body
.
jobStatus
.
state
)
{
if
(
response
.
body
.
jobStatus
&&
response
.
body
.
jobStatus
.
state
)
{
switch
(
response
.
body
.
jobStatus
.
state
)
{
switch
(
response
.
body
.
jobStatus
.
state
)
{
case
'
WAITING
'
:
case
'
WAITING
'
:
paiTrialJob
.
status
=
'
WAITING
'
;
paiTrialJob
.
status
=
'
WAITING
'
;
break
;
break
;
case
'
RUNNING
'
:
case
'
RUNNING
'
:
paiTrialJob
.
status
=
'
RUNNING
'
;
paiTrialJob
.
status
=
'
RUNNING
'
;
if
(
!
paiTrialJob
.
startTime
)
{
if
(
paiTrialJob
.
startTime
===
undefined
)
{
paiTrialJob
.
startTime
=
response
.
body
.
jobStatus
.
appLaunchedTime
;
paiTrialJob
.
startTime
=
response
.
body
.
jobStatus
.
appLaunchedTime
;
}
}
if
(
!
paiTrialJob
.
url
)
{
if
(
paiTrialJob
.
url
===
undefined
)
{
paiTrialJob
.
url
=
response
.
body
.
jobStatus
.
appTrackingUrl
;
paiTrialJob
.
url
=
response
.
body
.
jobStatus
.
appTrackingUrl
;
}
}
break
;
break
;
case
'
SUCCEEDED
'
:
case
'
SUCCEEDED
'
:
...
@@ -104,30 +110,31 @@ export class PAIJobInfoCollector {
...
@@ -104,30 +110,31 @@ export class PAIJobInfoCollector {
break
;
break
;
case
'
STOPPED
'
:
case
'
STOPPED
'
:
if
(
paiTrialJob
.
isEarlyStopped
!==
undefined
)
{
if
(
paiTrialJob
.
isEarlyStopped
!==
undefined
)
{
paiTrialJob
.
status
=
paiTrialJob
.
isEarlyStopped
===
true
?
paiTrialJob
.
status
=
paiTrialJob
.
isEarlyStopped
===
true
?
'
EARLY_STOPPED
'
:
'
USER_CANCELED
'
;
'
EARLY_STOPPED
'
:
'
USER_CANCELED
'
;
}
else
{
}
else
{
// if paiTrialJob's isEarlyStopped is undefined, that mean we didn't stop it via cancellation, mark it as SYS_CANCELLED by PAI
/* if paiTrialJob's isEarlyStopped is undefined, that mean we didn't stop it via cancellation,
* mark it as SYS_CANCELLED by PAI
*/
paiTrialJob
.
status
=
'
SYS_CANCELED
'
;
paiTrialJob
.
status
=
'
SYS_CANCELED
'
;
}
}
break
;
break
;
case
'
FAILED
'
:
case
'
FAILED
'
:
paiTrialJob
.
status
=
'
FAILED
'
;
paiTrialJob
.
status
=
'
FAILED
'
;
break
;
break
;
default
:
default
:
paiTrialJob
.
status
=
'
UNKNOWN
'
;
paiTrialJob
.
status
=
'
UNKNOWN
'
;
break
;
}
}
// For final job statues, update startTime, endTime and url
// For final job statues, update startTime, endTime and url
if
(
this
.
finalStatuses
.
includes
(
paiTrialJob
.
status
))
{
if
(
this
.
finalStatuses
.
includes
(
paiTrialJob
.
status
))
{
if
(
!
paiTrialJob
.
startTime
)
{
if
(
paiTrialJob
.
startTime
===
undefined
)
{
paiTrialJob
.
startTime
=
response
.
body
.
jobStatus
.
appLaunchedTime
;
paiTrialJob
.
startTime
=
response
.
body
.
jobStatus
.
appLaunchedTime
;
}
}
if
(
!
paiTrialJob
.
endTime
)
{
if
(
paiTrialJob
.
endTime
===
undefined
)
{
paiTrialJob
.
endTime
=
response
.
body
.
jobStatus
.
completedTime
;
paiTrialJob
.
endTime
=
response
.
body
.
jobStatus
.
completedTime
;
}
}
// Set pai trial job's url to WebHDFS output path
// Set pai trial job's url to WebHDFS output path
if
(
paiTrialJob
.
hdfsLogPath
)
{
if
(
paiTrialJob
.
hdfsLogPath
!==
undefined
)
{
paiTrialJob
.
url
+=
`,
${
paiTrialJob
.
hdfsLogPath
}
`
;
paiTrialJob
.
url
+=
`,
${
paiTrialJob
.
hdfsLogPath
}
`
;
}
}
}
}
...
@@ -138,4 +145,5 @@ export class PAIJobInfoCollector {
...
@@ -138,4 +145,5 @@ export class PAIJobInfoCollector {
return
deferred
.
promise
;
return
deferred
.
promise
;
}
}
// tslint:enable: no-unsafe-any no-any
}
}
src/nni_manager/training_service/pai/paiJobRestServer.ts
View file @
ba8dccd6
...
@@ -19,17 +19,17 @@
...
@@ -19,17 +19,17 @@
'
use strict
'
;
'
use strict
'
;
import
*
as
component
from
'
../../common/component
'
;
import
{
Inject
}
from
'
typescript-ioc
'
;
import
{
Inject
}
from
'
typescript-ioc
'
;
import
*
as
component
from
'
../../common/component
'
;
import
{
ClusterJobRestServer
}
from
'
../common/clusterJobRestServer
'
;
import
{
PAITrainingService
}
from
'
./paiTrainingService
'
;
import
{
PAITrainingService
}
from
'
./paiTrainingService
'
;
import
{
ClusterJobRestServer
}
from
'
../common/clusterJobRestServer
'
/**
/**
* PAI Training service Rest server, provides rest API to support pai job metrics update
* PAI Training service Rest server, provides rest API to support pai job metrics update
*
*
*/
*/
@
component
.
Singleton
@
component
.
Singleton
export
class
PAIJobRestServer
extends
ClusterJobRestServer
{
export
class
PAIJobRestServer
extends
ClusterJobRestServer
{
@
Inject
@
Inject
private
readonly
paiTrainingService
:
PAITrainingService
;
private
readonly
paiTrainingService
:
PAITrainingService
;
...
@@ -41,6 +41,7 @@ export class PAIJobRestServer extends ClusterJobRestServer{
...
@@ -41,6 +41,7 @@ export class PAIJobRestServer extends ClusterJobRestServer{
this
.
paiTrainingService
=
component
.
get
(
PAITrainingService
);
this
.
paiTrainingService
=
component
.
get
(
PAITrainingService
);
}
}
// tslint:disable-next-line:no-any
protected
handleTrialMetrics
(
jobId
:
string
,
metrics
:
any
[])
:
void
{
protected
handleTrialMetrics
(
jobId
:
string
,
metrics
:
any
[])
:
void
{
// Split metrics array into single metric, then emit
// Split metrics array into single metric, then emit
// Warning: If not split metrics into single ones, the behavior will be UNKNOWN
// Warning: If not split metrics into single ones, the behavior will be UNKNOWN
...
@@ -51,4 +52,4 @@ export class PAIJobRestServer extends ClusterJobRestServer{
...
@@ -51,4 +52,4 @@ export class PAIJobRestServer extends ClusterJobRestServer{
});
});
}
}
}
}
}
}
\ No newline at end of file
src/nni_manager/training_service/pai/paiTrainingService.ts
View file @
ba8dccd6
/**
/**
* Copyright (c) Microsoft Corporation
* Copyright (c) Microsoft Corporation
* All rights reserved.
* All rights reserved.
...
@@ -23,6 +22,7 @@
...
@@ -23,6 +22,7 @@
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
path
from
'
path
'
;
import
*
as
path
from
'
path
'
;
// tslint:disable-next-line:no-implicit-dependencies
import
*
as
request
from
'
request
'
;
import
*
as
request
from
'
request
'
;
import
*
as
component
from
'
../../common/component
'
;
import
*
as
component
from
'
../../common/component
'
;
...
@@ -37,18 +37,17 @@ import {
...
@@ -37,18 +37,17 @@ import {
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobMetric
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobMetric
}
from
'
../../common/trainingService
'
;
}
from
'
../../common/trainingService
'
;
import
{
delay
,
generateParamFileName
,
import
{
delay
,
generateParamFileName
,
getExperimentRootDir
,
getIPV4Address
,
getVersion
,
uniqueString
}
from
'
../../common/utils
'
;
getExperimentRootDir
,
getIPV4Address
,
getVersion
,
uniqueString
,
unixPathJoin
}
from
'
../../common/utils
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../common/containerJobData
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../common/containerJobData
'
;
import
{
TrialConfigMetadataKey
}
from
'
../common/trialConfigMetadataKey
'
;
import
{
TrialConfigMetadataKey
}
from
'
../common/trialConfigMetadataKey
'
;
import
{
validateCodeDir
,
execMkdir
}
from
'
../common/util
'
;
import
{
execMkdir
,
validateCodeDir
}
from
'
../common/util
'
;
import
{
unixPathJoin
}
from
'
../../common/utils
'
import
{
HDFSClientUtility
}
from
'
./hdfsClientUtility
'
;
import
{
HDFSClientUtility
}
from
'
./hdfsClientUtility
'
;
import
{
NNIPAITrialConfig
,
PAIClusterConfig
,
PAIJobConfig
,
PAITaskRole
}
from
'
./paiConfig
'
;
import
{
NNIPAITrialConfig
,
PAIClusterConfig
,
PAIJobConfig
,
PAITaskRole
}
from
'
./paiConfig
'
;
import
{
PAI_LOG_PATH_FORMAT
,
PAI_OUTPUT_DIR_FORMAT
,
PAI_TRIAL_COMMAND_FORMAT
,
PAITrialJobDetail
}
from
'
./paiData
'
;
import
{
PAI_LOG_PATH_FORMAT
,
PAI_OUTPUT_DIR_FORMAT
,
PAI_TRIAL_COMMAND_FORMAT
,
PAITrialJobDetail
}
from
'
./paiData
'
;
import
{
PAIJobInfoCollector
}
from
'
./paiJobInfoCollector
'
;
import
{
PAIJobInfoCollector
}
from
'
./paiJobInfoCollector
'
;
import
{
PAIJobRestServer
}
from
'
./paiJobRestServer
'
;
import
{
PAIJobRestServer
}
from
'
./paiJobRestServer
'
;
const
WebHDFS
=
require
(
'
webhdfs
'
)
;
import
*
as
WebHDFS
from
'
webhdfs
'
;
/**
/**
* Training Service implementation for OpenPAI (Open Platform for AI)
* Training Service implementation for OpenPAI (Open Platform for AI)
...
@@ -62,13 +61,14 @@ class PAITrainingService implements TrainingService {
...
@@ -62,13 +61,14 @@ class PAITrainingService implements TrainingService {
private
readonly
expRootDir
:
string
;
private
readonly
expRootDir
:
string
;
private
paiTrialConfig
:
NNIPAITrialConfig
|
undefined
;
private
paiTrialConfig
:
NNIPAITrialConfig
|
undefined
;
private
paiClusterConfig
?:
PAIClusterConfig
;
private
paiClusterConfig
?:
PAIClusterConfig
;
private
jobQueue
:
string
[];
private
readonly
jobQueue
:
string
[];
private
stopping
:
boolean
=
false
;
private
stopping
:
boolean
=
false
;
// tslint:disable-next-line:no-any
private
hdfsClient
:
any
;
private
hdfsClient
:
any
;
private
paiToken
?
:
string
;
private
paiToken
?
:
string
;
private
paiTokenUpdateTime
?:
number
;
private
paiTokenUpdateTime
?:
number
;
private
paiTokenUpdateInterval
:
number
;
private
readonly
paiTokenUpdateInterval
:
number
;
private
experimentId
!
:
string
;
private
readonly
experimentId
!
:
string
;
private
readonly
paiJobCollector
:
PAIJobInfoCollector
;
private
readonly
paiJobCollector
:
PAIJobInfoCollector
;
private
readonly
hdfsDirPattern
:
string
;
private
readonly
hdfsDirPattern
:
string
;
private
hdfsBaseDir
:
string
|
undefined
;
private
hdfsBaseDir
:
string
|
undefined
;
...
@@ -121,13 +121,13 @@ class PAITrainingService implements TrainingService {
...
@@ -121,13 +121,13 @@ class PAITrainingService implements TrainingService {
}
}
public
async
getTrialJob
(
trialJobId
:
string
):
Promise
<
TrialJobDetail
>
{
public
async
getTrialJob
(
trialJobId
:
string
):
Promise
<
TrialJobDetail
>
{
if
(
!
this
.
paiClusterConfig
)
{
if
(
this
.
paiClusterConfig
===
undefined
)
{
throw
new
Error
(
'
PAI Cluster config is not initialized
'
);
throw
new
Error
(
'
PAI Cluster config is not initialized
'
);
}
}
const
paiTrialJob
:
PAITrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
const
paiTrialJob
:
PAITrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
if
(
!
paiTrialJob
)
{
if
(
paiTrialJob
===
undefined
)
{
return
Promise
.
reject
(
`trial job
${
trialJobId
}
not found`
);
return
Promise
.
reject
(
`trial job
${
trialJobId
}
not found`
);
}
}
...
@@ -144,7 +144,7 @@ class PAITrainingService implements TrainingService {
...
@@ -144,7 +144,7 @@ class PAITrainingService implements TrainingService {
public
async
submitTrialJob
(
form
:
JobApplicationForm
):
Promise
<
TrialJobDetail
>
{
public
async
submitTrialJob
(
form
:
JobApplicationForm
):
Promise
<
TrialJobDetail
>
{
const
deferred
:
Deferred
<
PAITrialJobDetail
>
=
new
Deferred
<
PAITrialJobDetail
>
();
const
deferred
:
Deferred
<
PAITrialJobDetail
>
=
new
Deferred
<
PAITrialJobDetail
>
();
if
(
!
this
.
hdfsBaseDir
)
{
if
(
this
.
hdfsBaseDir
===
undefined
)
{
throw
new
Error
(
'
hdfsBaseDir is not initialized
'
);
throw
new
Error
(
'
hdfsBaseDir is not initialized
'
);
}
}
...
@@ -187,24 +187,26 @@ class PAITrainingService implements TrainingService {
...
@@ -187,24 +187,26 @@ class PAITrainingService implements TrainingService {
return
false
;
return
false
;
}
}
// tslint:disable:no-http-string
public
cancelTrialJob
(
trialJobId
:
string
,
isEarlyStopped
:
boolean
=
false
):
Promise
<
void
>
{
public
cancelTrialJob
(
trialJobId
:
string
,
isEarlyStopped
:
boolean
=
false
):
Promise
<
void
>
{
const
trialJobDetail
:
PAITrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
const
trialJobDetail
:
PAITrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
if
(
!
trialJobDetail
)
{
if
(
trialJobDetail
===
undefined
)
{
this
.
log
.
error
(
`cancelTrialJob: trial job id
${
trialJobId
}
not found`
);
this
.
log
.
error
(
`cancelTrialJob: trial job id
${
trialJobId
}
not found`
);
return
Promise
.
reject
();
return
Promise
.
reject
();
}
}
if
(
!
this
.
paiClusterConfig
)
{
if
(
this
.
paiClusterConfig
===
undefined
)
{
throw
new
Error
(
'
PAI Cluster config is not initialized
'
);
throw
new
Error
(
'
PAI Cluster config is not initialized
'
);
}
}
if
(
!
this
.
paiToken
)
{
if
(
this
.
paiToken
===
undefined
)
{
throw
new
Error
(
'
PAI token is not initialized
'
);
throw
new
Error
(
'
PAI token is not initialized
'
);
}
}
const
stopJobRequest
:
request
.
Options
=
{
const
stopJobRequest
:
request
.
Options
=
{
uri
:
`http://
${
this
.
paiClusterConfig
.
host
}
/rest-server/api/v1/user/
${
this
.
paiClusterConfig
.
userName
}
/jobs/
${
trialJobDetail
.
paiJobName
}
/executionType`
,
uri
:
`http://
${
this
.
paiClusterConfig
.
host
}
/rest-server/api/v1/user/
${
this
.
paiClusterConfig
.
userName
}
\
/jobs/
${
trialJobDetail
.
paiJobName
}
/executionType`
,
method
:
'
PUT
'
,
method
:
'
PUT
'
,
json
:
true
,
json
:
true
,
body
:
{
value
:
'
STOP
'
},
body
:
{
value
:
'
STOP
'
},
...
@@ -217,10 +219,12 @@ class PAITrainingService implements TrainingService {
...
@@ -217,10 +219,12 @@ class PAITrainingService implements TrainingService {
// Set trialjobDetail's early stopped field, to mark the job's cancellation source
// Set trialjobDetail's early stopped field, to mark the job's cancellation source
trialJobDetail
.
isEarlyStopped
=
isEarlyStopped
;
trialJobDetail
.
isEarlyStopped
=
isEarlyStopped
;
// tslint:disable-next-line:no-any
request
(
stopJobRequest
,
(
error
:
Error
,
response
:
request
.
Response
,
body
:
any
)
=>
{
request
(
stopJobRequest
,
(
error
:
Error
,
response
:
request
.
Response
,
body
:
any
)
=>
{
if
(
error
||
response
.
statusCode
>=
400
)
{
if
(
(
error
!==
undefined
&&
error
!==
null
)
||
response
.
statusCode
>=
400
)
{
this
.
log
.
error
(
`PAI Training service: stop trial
${
trialJobId
}
to PAI Cluster failed!`
);
this
.
log
.
error
(
`PAI Training service: stop trial
${
trialJobId
}
to PAI Cluster failed!`
);
deferred
.
reject
(
error
?
error
.
message
:
`Stop trial failed, http code:
${
response
.
statusCode
}
`
);
deferred
.
reject
((
error
!==
undefined
&&
error
!==
null
)
?
error
.
message
:
`Stop trial failed, http code:
${
response
.
statusCode
}
`
);
}
else
{
}
else
{
deferred
.
resolve
();
deferred
.
resolve
();
}
}
...
@@ -229,6 +233,7 @@ class PAITrainingService implements TrainingService {
...
@@ -229,6 +233,7 @@ class PAITrainingService implements TrainingService {
return
deferred
.
promise
;
return
deferred
.
promise
;
}
}
// tslint:disable: no-unsafe-any no-any
// tslint:disable-next-line:max-func-body-length
// tslint:disable-next-line:max-func-body-length
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
...
@@ -256,47 +261,47 @@ class PAITrainingService implements TrainingService {
...
@@ -256,47 +261,47 @@ class PAITrainingService implements TrainingService {
break
;
break
;
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
if
(
!
this
.
paiClusterConfig
)
{
if
(
this
.
paiClusterConfig
===
undefined
)
{
this
.
log
.
error
(
'
pai cluster config is not initialized
'
);
this
.
log
.
error
(
'
pai cluster config is not initialized
'
);
deferred
.
reject
(
new
Error
(
'
pai cluster config is not initialized
'
));
deferred
.
reject
(
new
Error
(
'
pai cluster config is not initialized
'
));
break
;
break
;
}
}
this
.
paiTrialConfig
=
<
NNIPAITrialConfig
>
JSON
.
parse
(
value
);
this
.
paiTrialConfig
=
<
NNIPAITrialConfig
>
JSON
.
parse
(
value
);
//paiTrialConfig.outputDir could be null if it is not set in nnictl
//paiTrialConfig.outputDir could be null if it is not set in nnictl
if
(
this
.
paiTrialConfig
.
outputDir
===
undefined
||
this
.
paiTrialConfig
.
outputDir
===
null
){
if
(
this
.
paiTrialConfig
.
outputDir
===
undefined
||
this
.
paiTrialConfig
.
outputDir
===
null
)
{
this
.
paiTrialConfig
.
outputDir
=
String
.
Format
(
this
.
paiTrialConfig
.
outputDir
=
String
.
Format
(
PAI_OUTPUT_DIR_FORMAT
,
PAI_OUTPUT_DIR_FORMAT
,
this
.
paiClusterConfig
.
host
this
.
paiClusterConfig
.
host
).
replace
(
/
\r\n
|
\n
|
\r
/gm
,
''
);
)
.
replace
(
/
\r\n
|
\n
|
\r
/gm
,
''
);
}
}
// Validate to make sure codeDir doesn't have too many files
// Validate to make sure codeDir doesn't have too many files
try
{
try
{
await
validateCodeDir
(
this
.
paiTrialConfig
.
codeDir
);
await
validateCodeDir
(
this
.
paiTrialConfig
.
codeDir
);
}
catch
(
error
)
{
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
this
.
log
.
error
(
error
);
deferred
.
reject
(
new
Error
(
error
));
deferred
.
reject
(
new
Error
(
error
));
break
;
break
;
}
}
const
hdfsDirContent
=
this
.
paiTrialConfig
.
outputDir
.
match
(
this
.
hdfsDirPattern
);
const
hdfsDirContent
:
any
=
this
.
paiTrialConfig
.
outputDir
.
match
(
this
.
hdfsDirPattern
);
if
(
hdfsDirContent
===
null
)
{
if
(
hdfsDirContent
===
null
)
{
throw
new
Error
(
'
Trial outputDir format Error
'
);
throw
new
Error
(
'
Trial outputDir format Error
'
);
}
}
const
groups
=
hdfsDirContent
.
groups
;
const
groups
:
any
=
hdfsDirContent
.
groups
;
if
(
groups
===
undefined
)
{
if
(
groups
===
undefined
)
{
throw
new
Error
(
'
Trial outputDir format Error
'
);
throw
new
Error
(
'
Trial outputDir format Error
'
);
}
}
this
.
hdfsOutputHost
=
groups
.
host
;
this
.
hdfsOutputHost
=
groups
[
'
host
'
];
//TODO: choose to use /${username} as baseDir
//TODO: choose to use /${username} as baseDir
this
.
hdfsBaseDir
=
groups
[
'
baseDir
'
]
;
this
.
hdfsBaseDir
=
groups
.
baseDir
;
if
(
this
.
hdfsBaseDir
===
undefined
)
{
if
(
this
.
hdfsBaseDir
===
undefined
)
{
this
.
hdfsBaseDir
=
'
/
'
;
this
.
hdfsBaseDir
=
'
/
'
;
}
}
let
dataOutputHdfsClient
;
let
dataOutputHdfsClient
:
any
;
if
(
this
.
paiClusterConfig
.
host
===
this
.
hdfsOutputHost
&&
this
.
hdfsClient
)
{
if
(
this
.
paiClusterConfig
.
host
===
this
.
hdfsOutputHost
&&
this
.
hdfsClient
)
{
dataOutputHdfsClient
=
this
.
hdfsClient
;
dataOutputHdfsClient
=
this
.
hdfsClient
;
}
else
{
}
else
{
...
@@ -338,6 +343,7 @@ class PAITrainingService implements TrainingService {
...
@@ -338,6 +343,7 @@ class PAITrainingService implements TrainingService {
return
deferred
.
promise
;
return
deferred
.
promise
;
}
}
// tslint:enable: no-unsafe-any
public
getClusterMetadata
(
key
:
string
):
Promise
<
string
>
{
public
getClusterMetadata
(
key
:
string
):
Promise
<
string
>
{
const
deferred
:
Deferred
<
string
>
=
new
Deferred
<
string
>
();
const
deferred
:
Deferred
<
string
>
=
new
Deferred
<
string
>
();
...
@@ -358,6 +364,7 @@ class PAITrainingService implements TrainingService {
...
@@ -358,6 +364,7 @@ class PAITrainingService implements TrainingService {
deferred
.
resolve
();
deferred
.
resolve
();
this
.
log
.
info
(
'
PAI Training service rest server stopped successfully.
'
);
this
.
log
.
info
(
'
PAI Training service rest server stopped successfully.
'
);
}
catch
(
error
)
{
}
catch
(
error
)
{
// tslint:disable-next-line: no-unsafe-any
this
.
log
.
error
(
`PAI Training service rest server stopped failed, error:
${
error
.
message
}
`
);
this
.
log
.
error
(
`PAI Training service rest server stopped failed, error:
${
error
.
message
}
`
);
deferred
.
reject
(
error
);
deferred
.
reject
(
error
);
}
}
...
@@ -374,35 +381,35 @@ class PAITrainingService implements TrainingService {
...
@@ -374,35 +381,35 @@ class PAITrainingService implements TrainingService {
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
const
trialJobDetail
:
PAITrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
const
trialJobDetail
:
PAITrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
if
(
!
trialJobDetail
)
{
if
(
trialJobDetail
===
undefined
)
{
throw
new
Error
(
`Failed to find PAITrialJobDetail for job
${
trialJobId
}
`
);
throw
new
Error
(
`Failed to find PAITrialJobDetail for job
${
trialJobId
}
`
);
}
}
if
(
!
this
.
paiClusterConfig
)
{
if
(
this
.
paiClusterConfig
===
undefined
)
{
throw
new
Error
(
'
PAI Cluster config is not initialized
'
);
throw
new
Error
(
'
PAI Cluster config is not initialized
'
);
}
}
if
(
!
this
.
paiTrialConfig
)
{
if
(
this
.
paiTrialConfig
===
undefined
)
{
throw
new
Error
(
'
trial config is not initialized
'
);
throw
new
Error
(
'
trial config is not initialized
'
);
}
}
if
(
!
this
.
paiToken
)
{
if
(
this
.
paiToken
===
undefined
)
{
throw
new
Error
(
'
PAI token is not initialized
'
);
throw
new
Error
(
'
PAI token is not initialized
'
);
}
}
if
(
!
this
.
hdfsBaseDir
)
{
if
(
this
.
hdfsBaseDir
===
undefined
)
{
throw
new
Error
(
'
hdfsBaseDir is not initialized
'
);
throw
new
Error
(
'
hdfsBaseDir is not initialized
'
);
}
}
if
(
!
this
.
hdfsOutputHost
)
{
if
(
this
.
hdfsOutputHost
===
undefined
)
{
throw
new
Error
(
'
hdfsOutputHost is not initialized
'
);
throw
new
Error
(
'
hdfsOutputHost is not initialized
'
);
}
}
if
(
!
this
.
paiRestServerPort
)
{
if
(
this
.
paiRestServerPort
===
undefined
)
{
const
restServer
:
PAIJobRestServer
=
component
.
get
(
PAIJobRestServer
);
const
restServer
:
PAIJobRestServer
=
component
.
get
(
PAIJobRestServer
);
this
.
paiRestServerPort
=
restServer
.
clusterRestServerPort
;
this
.
paiRestServerPort
=
restServer
.
clusterRestServerPort
;
}
}
// Make sure experiment code files is copied from local to HDFS
// Make sure experiment code files is copied from local to HDFS
if
(
this
.
copyExpCodeDirPromise
)
{
if
(
this
.
copyExpCodeDirPromise
!==
undefined
)
{
await
this
.
copyExpCodeDirPromise
;
await
this
.
copyExpCodeDirPromise
;
}
}
...
@@ -420,13 +427,14 @@ class PAITrainingService implements TrainingService {
...
@@ -420,13 +427,14 @@ class PAITrainingService implements TrainingService {
// Write file content ( parameter.cfg ) to local tmp folders
// Write file content ( parameter.cfg ) to local tmp folders
const
trialForm
:
TrialJobApplicationForm
=
(
<
TrialJobApplicationForm
>
trialJobDetail
.
form
);
const
trialForm
:
TrialJobApplicationForm
=
(
<
TrialJobApplicationForm
>
trialJobDetail
.
form
);
if
(
trialForm
)
{
if
(
trialForm
!==
undefined
)
{
await
fs
.
promises
.
writeFile
(
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
generateParamFileName
(
trialForm
.
hyperParameters
)),
path
.
join
(
trialLocalTempFolder
,
generateParamFileName
(
trialForm
.
hyperParameters
)),
trialForm
.
hyperParameters
.
value
,
{
encoding
:
'
utf8
'
}
trialForm
.
hyperParameters
.
value
,
{
encoding
:
'
utf8
'
}
);
);
}
}
// tslint:disable-next-line: strict-boolean-expressions
const
nniManagerIp
:
string
=
this
.
nniManagerIpConfig
?
this
.
nniManagerIpConfig
.
nniManagerIp
:
getIPV4Address
();
const
nniManagerIp
:
string
=
this
.
nniManagerIpConfig
?
this
.
nniManagerIpConfig
.
nniManagerIp
:
getIPV4Address
();
const
version
:
string
=
this
.
versionCheck
?
await
getVersion
()
:
''
;
const
version
:
string
=
this
.
versionCheck
?
await
getVersion
()
:
''
;
const
nniPaiTrialCommand
:
string
=
String
.
Format
(
const
nniPaiTrialCommand
:
string
=
String
.
Format
(
...
@@ -446,8 +454,10 @@ class PAITrainingService implements TrainingService {
...
@@ -446,8 +454,10 @@ class PAITrainingService implements TrainingService {
HDFSClientUtility
.
getHdfsExpCodeDir
(
this
.
paiClusterConfig
.
userName
),
HDFSClientUtility
.
getHdfsExpCodeDir
(
this
.
paiClusterConfig
.
userName
),
version
,
version
,
this
.
logCollection
this
.
logCollection
).
replace
(
/
\r\n
|
\n
|
\r
/gm
,
''
);
)
.
replace
(
/
\r\n
|
\n
|
\r
/gm
,
''
);
// tslint:disable-next-line:no-console
console
.
log
(
`nniPAItrial command is
${
nniPaiTrialCommand
.
trim
()}
`
);
console
.
log
(
`nniPAItrial command is
${
nniPaiTrialCommand
.
trim
()}
`
);
const
paiTaskRoles
:
PAITaskRole
[]
=
[
const
paiTaskRoles
:
PAITaskRole
[]
=
[
new
PAITaskRole
(
new
PAITaskRole
(
...
@@ -489,7 +499,10 @@ class PAITrainingService implements TrainingService {
...
@@ -489,7 +499,10 @@ class PAITrainingService implements TrainingService {
await
HDFSClientUtility
.
copyDirectoryToHdfs
(
trialLocalTempFolder
,
hdfsCodeDir
,
this
.
hdfsClient
);
await
HDFSClientUtility
.
copyDirectoryToHdfs
(
trialLocalTempFolder
,
hdfsCodeDir
,
this
.
hdfsClient
);
}
catch
(
error
)
{
}
catch
(
error
)
{
this
.
log
.
error
(
`PAI Training service: copy
${
this
.
paiTrialConfig
.
codeDir
}
to HDFS
${
hdfsCodeDir
}
failed, error is
${
error
}
`
);
this
.
log
.
error
(
`PAI Training service: copy
${
this
.
paiTrialConfig
.
codeDir
}
to HDFS
${
hdfsCodeDir
}
failed, error is
${
error
}
`
);
throw
new
Error
(
error
.
message
);
trialJobDetail
.
status
=
'
FAILED
'
;
deferred
.
resolve
(
true
);
return
deferred
.
promise
;
}
}
// Step 3. Submit PAI job via Rest call
// Step 3. Submit PAI job via Rest call
...
@@ -504,13 +517,14 @@ class PAITrainingService implements TrainingService {
...
@@ -504,13 +517,14 @@ class PAITrainingService implements TrainingService {
Authorization
:
`Bearer
${
this
.
paiToken
}
`
Authorization
:
`Bearer
${
this
.
paiToken
}
`
}
}
};
};
// tslint:disable:no-any no-unsafe-any
request
(
submitJobRequest
,
(
error
:
Error
,
response
:
request
.
Response
,
body
:
any
)
=>
{
request
(
submitJobRequest
,
(
error
:
Error
,
response
:
request
.
Response
,
body
:
any
)
=>
{
if
(
error
||
response
.
statusCode
>=
400
)
{
if
(
(
error
!==
undefined
&&
error
!==
null
)
||
response
.
statusCode
>=
400
)
{
const
errorMessage
:
string
=
error
?
error
.
message
:
const
errorMessage
:
string
=
(
error
!==
undefined
&&
error
!==
null
)
?
error
.
message
:
`Submit trial
${
trialJobId
}
failed, http code:
${
response
.
statusCode
}
, http body:
${
response
.
body
}
`
;
`Submit trial
${
trialJobId
}
failed, http code:
${
response
.
statusCode
}
, http body:
${
response
.
body
}
`
;
this
.
log
.
error
(
errorMessage
);
this
.
log
.
error
(
errorMessage
);
trialJobDetail
.
status
=
'
FAILED
'
;
trialJobDetail
.
status
=
'
FAILED
'
;
deferred
.
re
ject
(
new
Error
(
errorMessage
)
);
deferred
.
re
solve
(
true
);
}
else
{
}
else
{
trialJobDetail
.
submitTime
=
Date
.
now
();
trialJobDetail
.
submitTime
=
Date
.
now
();
deferred
.
resolve
(
true
);
deferred
.
resolve
(
true
);
...
@@ -530,18 +544,18 @@ class PAITrainingService implements TrainingService {
...
@@ -530,18 +544,18 @@ class PAITrainingService implements TrainingService {
private
async
statusCheckingLoop
():
Promise
<
void
>
{
private
async
statusCheckingLoop
():
Promise
<
void
>
{
while
(
!
this
.
stopping
)
{
while
(
!
this
.
stopping
)
{
try
{
try
{
await
this
.
updatePaiToken
();
await
this
.
updatePaiToken
();
}
catch
(
error
){
}
catch
(
error
)
{
this
.
log
.
error
(
`
${
error
}
`
);
this
.
log
.
error
(
`
${
error
}
`
);
//only throw error when initlize paiToken first time
//only throw error when initlize paiToken first time
if
(
!
this
.
paiToken
)
{
if
(
this
.
paiToken
===
undefined
)
{
throw
new
Error
(
error
);
throw
new
Error
(
error
);
}
}
}
}
await
this
.
paiJobCollector
.
retrieveTrialStatus
(
this
.
paiToken
,
this
.
paiClusterConfig
);
await
this
.
paiJobCollector
.
retrieveTrialStatus
(
this
.
paiToken
,
this
.
paiClusterConfig
);
const
restServer
:
PAIJobRestServer
=
component
.
get
(
PAIJobRestServer
);
const
restServer
:
PAIJobRestServer
=
component
.
get
(
PAIJobRestServer
);
if
(
restServer
.
getErrorMessage
)
{
if
(
restServer
.
getErrorMessage
!==
undefined
)
{
throw
new
Error
(
restServer
.
getErrorMessage
);
throw
new
Error
(
restServer
.
getErrorMessage
);
}
}
await
delay
(
3000
);
await
delay
(
3000
);
...
@@ -572,17 +586,17 @@ class PAITrainingService implements TrainingService {
...
@@ -572,17 +586,17 @@ class PAITrainingService implements TrainingService {
const
currentTime
:
number
=
new
Date
().
getTime
();
const
currentTime
:
number
=
new
Date
().
getTime
();
//If pai token initialized and not reach the interval time, do not update
//If pai token initialized and not reach the interval time, do not update
if
(
this
.
paiTokenUpdateTime
&&
(
currentTime
-
this
.
paiTokenUpdateTime
)
<
this
.
paiTokenUpdateInterval
){
if
(
this
.
paiTokenUpdateTime
!==
undefined
&&
(
currentTime
-
this
.
paiTokenUpdateTime
)
<
this
.
paiTokenUpdateInterval
)
{
return
Promise
.
resolve
();
return
Promise
.
resolve
();
}
}
if
(
!
this
.
paiClusterConfig
)
{
if
(
this
.
paiClusterConfig
===
undefined
)
{
const
paiClusterConfigError
:
string
=
`pai cluster config not initialized!`
;
const
paiClusterConfigError
:
string
=
`pai cluster config not initialized!`
;
this
.
log
.
error
(
`
${
paiClusterConfigError
}
`
);
this
.
log
.
error
(
`
${
paiClusterConfigError
}
`
);
throw
Error
(
`
${
paiClusterConfigError
}
`
);
throw
Error
(
`
${
paiClusterConfigError
}
`
);
}
}
const
authentication
_r
eq
:
request
.
Options
=
{
const
authentication
R
eq
:
request
.
Options
=
{
uri
:
`http://
${
this
.
paiClusterConfig
.
host
}
/rest-server/api/v1/token`
,
uri
:
`http://
${
this
.
paiClusterConfig
.
host
}
/rest-server/api/v1/token`
,
method
:
'
POST
'
,
method
:
'
POST
'
,
json
:
true
,
json
:
true
,
...
@@ -592,12 +606,12 @@ class PAITrainingService implements TrainingService {
...
@@ -592,12 +606,12 @@ class PAITrainingService implements TrainingService {
}
}
};
};
request
(
authentication
_r
eq
,
(
error
:
Error
,
response
:
request
.
Response
,
body
:
any
)
=>
{
request
(
authentication
R
eq
,
(
error
:
Error
,
response
:
request
.
Response
,
body
:
any
)
=>
{
if
(
error
)
{
if
(
error
!==
undefined
&&
error
!==
null
)
{
this
.
log
.
error
(
`Get PAI token failed:
${
error
.
message
}
`
);
this
.
log
.
error
(
`Get PAI token failed:
${
error
.
message
}
`
);
deferred
.
reject
(
new
Error
(
`Get PAI token failed:
${
error
.
message
}
`
));
deferred
.
reject
(
new
Error
(
`Get PAI token failed:
${
error
.
message
}
`
));
}
else
{
}
else
{
if
(
response
.
statusCode
!==
200
){
if
(
response
.
statusCode
!==
200
)
{
this
.
log
.
error
(
`Get PAI token failed: get PAI Rest return code
${
response
.
statusCode
}
`
);
this
.
log
.
error
(
`Get PAI token failed: get PAI Rest return code
${
response
.
statusCode
}
`
);
deferred
.
reject
(
new
Error
(
`Get PAI token failed:
${
response
.
body
}
, please check paiConfig username or password`
));
deferred
.
reject
(
new
Error
(
`Get PAI token failed:
${
response
.
body
}
, please check paiConfig username or password`
));
}
}
...
@@ -616,8 +630,9 @@ class PAITrainingService implements TrainingService {
...
@@ -616,8 +630,9 @@ class PAITrainingService implements TrainingService {
});
});
return
Promise
.
race
([
timeoutDelay
,
deferred
.
promise
])
return
Promise
.
race
([
timeoutDelay
,
deferred
.
promise
])
.
finally
(()
=>
clearTimeout
(
timeoutId
));
.
finally
(()
=>
{
clearTimeout
(
timeoutId
)
;
}
);
}
}
// tslint:enable:no-any no-unsafe-any no-http-string
}
}
export
{
PAITrainingService
};
export
{
PAITrainingService
};
src/nni_manager/training_service/pai/paiTrialConfig.ts
View file @
ba8dccd6
...
@@ -19,16 +19,20 @@
...
@@ -19,16 +19,20 @@
'
use strict
'
;
'
use strict
'
;
import
{
TrialConfig
}
from
'
../common/trialConfig
'
import
{
TrialConfig
}
from
'
../common/trialConfig
'
;
export
class
PAITrialConfig
extends
TrialConfig
{
/**
* PAI configuration to run trials
*/
export
class
PAITrialConfig
extends
TrialConfig
{
public
readonly
cpuNum
:
number
;
public
readonly
cpuNum
:
number
;
public
readonly
memoryMB
:
number
;
public
readonly
memoryMB
:
number
;
public
readonly
image
:
string
;
public
readonly
image
:
string
;
public
readonly
dataDir
:
string
;
public
readonly
dataDir
:
string
;
public
readonly
outputDir
:
string
;
public
readonly
outputDir
:
string
;
constructor
(
command
:
string
,
codeDir
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
,
dataDir
:
string
,
outputDir
:
string
)
{
constructor
(
command
:
string
,
codeDir
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
,
dataDir
:
string
,
outputDir
:
string
)
{
super
(
command
,
codeDir
,
gpuNum
);
super
(
command
,
codeDir
,
gpuNum
);
this
.
cpuNum
=
cpuNum
;
this
.
cpuNum
=
cpuNum
;
this
.
memoryMB
=
memoryMB
;
this
.
memoryMB
=
memoryMB
;
...
@@ -36,4 +40,4 @@ export class PAITrialConfig extends TrialConfig{
...
@@ -36,4 +40,4 @@ export class PAITrialConfig extends TrialConfig{
this
.
dataDir
=
dataDir
;
this
.
dataDir
=
dataDir
;
this
.
outputDir
=
outputDir
;
this
.
outputDir
=
outputDir
;
}
}
}
}
\ No newline at end of file
src/nni_manager/training_service/remote_machine/gpuScheduler.ts
View file @
ba8dccd6
...
@@ -21,10 +21,12 @@
...
@@ -21,10 +21,12 @@
import
*
as
assert
from
'
assert
'
;
import
*
as
assert
from
'
assert
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
TrialJobDetail
}
from
'
../../common/trainingService
'
;
import
{
randomSelect
}
from
'
../../common/utils
'
;
import
{
randomSelect
}
from
'
../../common/utils
'
;
import
{
GPUInfo
}
from
'
../common/gpuData
'
;
import
{
GPUInfo
}
from
'
../common/gpuData
'
;
import
{
RemoteMachineTrialJobDetail
,
parseGpuIndices
,
RemoteMachineMeta
,
RemoteMachineScheduleResult
,
ScheduleResultType
,
SSHClientManager
}
from
'
./remoteMachineData
'
;
import
{
import
{
TrialJobDetail
}
from
'
common/trainingService
'
;
parseGpuIndices
,
RemoteMachineMeta
,
RemoteMachineScheduleResult
,
RemoteMachineTrialJobDetail
,
ScheduleResultType
,
SSHClientManager
}
from
'
./remoteMachineData
'
;
/**
/**
* A simple GPU scheduler implementation
* A simple GPU scheduler implementation
...
@@ -32,7 +34,7 @@ import { TrialJobDetail } from 'common/trainingService';
...
@@ -32,7 +34,7 @@ import { TrialJobDetail } from 'common/trainingService';
export
class
GPUScheduler
{
export
class
GPUScheduler
{
private
readonly
machineSSHClientMap
:
Map
<
RemoteMachineMeta
,
SSHClientManager
>
;
private
readonly
machineSSHClientMap
:
Map
<
RemoteMachineMeta
,
SSHClientManager
>
;
private
log
:
Logger
=
getLogger
();
private
readonly
log
:
Logger
=
getLogger
();
/**
/**
* Constructor
* Constructor
...
@@ -89,21 +91,21 @@ export class GPUScheduler {
...
@@ -89,21 +91,21 @@ export class GPUScheduler {
* remove the job's gpu reversion
* remove the job's gpu reversion
*/
*/
public
removeGpuReservation
(
trialJobId
:
string
,
trialJobMap
:
Map
<
string
,
RemoteMachineTrialJobDetail
>
):
void
{
public
removeGpuReservation
(
trialJobId
:
string
,
trialJobMap
:
Map
<
string
,
RemoteMachineTrialJobDetail
>
):
void
{
le
t
trialJobDetail
:
RemoteMachineTrialJobDetail
|
undefined
=
trialJobMap
.
get
(
trialJobId
);
cons
t
trialJobDetail
:
RemoteMachineTrialJobDetail
|
undefined
=
trialJobMap
.
get
(
trialJobId
);
if
(
trialJobDetail
===
undefined
)
{
if
(
trialJobDetail
===
undefined
)
{
throw
new
Error
(
`could not get trialJobDetail by id
${
trialJobId
}
`
);
throw
new
Error
(
`could not get trialJobDetail by id
${
trialJobId
}
`
);
}
}
if
(
trialJobDetail
.
rmMeta
!==
undefined
&&
if
(
trialJobDetail
.
rmMeta
!==
undefined
&&
trialJobDetail
.
rmMeta
.
occupiedGpuIndexMap
!==
undefined
&&
trialJobDetail
.
rmMeta
.
occupiedGpuIndexMap
!==
undefined
&&
trialJobDetail
.
gpuIndices
!==
undefined
&&
trialJobDetail
.
gpuIndices
!==
undefined
&&
trialJobDetail
.
gpuIndices
.
length
>
0
)
{
trialJobDetail
.
gpuIndices
.
length
>
0
)
{
for
(
const
gpuInfo
of
trialJobDetail
.
gpuIndices
)
{
for
(
const
gpuInfo
of
trialJobDetail
.
gpuIndices
)
{
le
t
num
:
number
|
undefined
=
trialJobDetail
.
rmMeta
.
occupiedGpuIndexMap
.
get
(
gpuInfo
.
index
);
cons
t
num
:
number
|
undefined
=
trialJobDetail
.
rmMeta
.
occupiedGpuIndexMap
.
get
(
gpuInfo
.
index
);
if
(
num
!==
undefined
)
{
if
(
num
!==
undefined
)
{
if
(
num
===
1
)
{
if
(
num
===
1
)
{
trialJobDetail
.
rmMeta
.
occupiedGpuIndexMap
.
delete
(
gpuInfo
.
index
);
trialJobDetail
.
rmMeta
.
occupiedGpuIndexMap
.
delete
(
gpuInfo
.
index
);
}
else
{
}
else
{
trialJobDetail
.
rmMeta
.
occupiedGpuIndexMap
.
set
(
gpuInfo
.
index
,
num
-
1
)
trialJobDetail
.
rmMeta
.
occupiedGpuIndexMap
.
set
(
gpuInfo
.
index
,
num
-
1
)
;
}
}
}
}
}
}
...
@@ -116,7 +118,6 @@ export class GPUScheduler {
...
@@ -116,7 +118,6 @@ export class GPUScheduler {
const
totalResourceMap
:
Map
<
RemoteMachineMeta
,
GPUInfo
[]
>
=
this
.
gpuResourceDetection
();
const
totalResourceMap
:
Map
<
RemoteMachineMeta
,
GPUInfo
[]
>
=
this
.
gpuResourceDetection
();
const
qualifiedRMs
:
RemoteMachineMeta
[]
=
[];
const
qualifiedRMs
:
RemoteMachineMeta
[]
=
[];
totalResourceMap
.
forEach
((
gpuInfos
:
GPUInfo
[],
rmMeta
:
RemoteMachineMeta
)
=>
{
totalResourceMap
.
forEach
((
gpuInfos
:
GPUInfo
[],
rmMeta
:
RemoteMachineMeta
)
=>
{
if
(
gpuInfos
!==
undefined
&&
gpuInfos
.
length
>=
requiredGPUNum
)
{
if
(
gpuInfos
!==
undefined
&&
gpuInfos
.
length
>=
requiredGPUNum
)
{
qualifiedRMs
.
push
(
rmMeta
);
qualifiedRMs
.
push
(
rmMeta
);
}
}
...
@@ -154,6 +155,7 @@ export class GPUScheduler {
...
@@ -154,6 +155,7 @@ export class GPUScheduler {
}
}
}
}
this
.
log
.
debug
(
`designated gpu indices:
${
designatedGpuIndices
}
`
);
this
.
log
.
debug
(
`designated gpu indices:
${
designatedGpuIndices
}
`
);
// tslint:disable: strict-boolean-expressions
rmMeta
.
gpuSummary
.
gpuInfos
.
forEach
((
gpuInfo
:
GPUInfo
)
=>
{
rmMeta
.
gpuSummary
.
gpuInfos
.
forEach
((
gpuInfo
:
GPUInfo
)
=>
{
// if the GPU has active process, OR be reserved by a job,
// if the GPU has active process, OR be reserved by a job,
// or index not in gpuIndices configuration in machineList,
// or index not in gpuIndices configuration in machineList,
...
@@ -161,10 +163,10 @@ export class GPUScheduler {
...
@@ -161,10 +163,10 @@ export class GPUScheduler {
// We should NOT allocate this GPU
// We should NOT allocate this GPU
// if users set useActiveGpu, use the gpu whether there is another activeProcess
// if users set useActiveGpu, use the gpu whether there is another activeProcess
if
(
designatedGpuIndices
===
undefined
||
designatedGpuIndices
.
has
(
gpuInfo
.
index
))
{
if
(
designatedGpuIndices
===
undefined
||
designatedGpuIndices
.
has
(
gpuInfo
.
index
))
{
if
(
rmMeta
.
occupiedGpuIndexMap
!==
undefined
)
{
if
(
rmMeta
.
occupiedGpuIndexMap
!==
undefined
)
{
le
t
num
=
rmMeta
.
occupiedGpuIndexMap
.
get
(
gpuInfo
.
index
);
cons
t
num
:
number
|
undefined
=
rmMeta
.
occupiedGpuIndexMap
.
get
(
gpuInfo
.
index
);
le
t
maxTrialNumPerGpu
:
number
=
rmMeta
.
maxTrialNumPerGpu
?
rmMeta
.
maxTrialNumPerGpu
:
1
;
cons
t
maxTrialNumPerGpu
:
number
=
rmMeta
.
maxTrialNumPerGpu
?
rmMeta
.
maxTrialNumPerGpu
:
1
;
if
((
num
===
undefined
&&
(
!
rmMeta
.
useActiveGpu
&&
gpuInfo
.
activeProcessNum
===
0
||
rmMeta
.
useActiveGpu
))
||
if
((
num
===
undefined
&&
(
!
rmMeta
.
useActiveGpu
&&
gpuInfo
.
activeProcessNum
===
0
||
rmMeta
.
useActiveGpu
))
||
(
num
!==
undefined
&&
num
<
maxTrialNumPerGpu
))
{
(
num
!==
undefined
&&
num
<
maxTrialNumPerGpu
))
{
availableGPUs
.
push
(
gpuInfo
);
availableGPUs
.
push
(
gpuInfo
);
}
}
...
@@ -179,6 +181,7 @@ export class GPUScheduler {
...
@@ -179,6 +181,7 @@ export class GPUScheduler {
return
totalResourceMap
;
return
totalResourceMap
;
}
}
// tslint:enable: strict-boolean-expressions
private
selectMachine
(
rmMetas
:
RemoteMachineMeta
[]):
RemoteMachineMeta
{
private
selectMachine
(
rmMetas
:
RemoteMachineMeta
[]):
RemoteMachineMeta
{
assert
(
rmMetas
!==
undefined
&&
rmMetas
.
length
>
0
);
assert
(
rmMetas
!==
undefined
&&
rmMetas
.
length
>
0
);
...
@@ -196,23 +199,28 @@ export class GPUScheduler {
...
@@ -196,23 +199,28 @@ export class GPUScheduler {
assert
(
gpuInfos
.
length
>=
requiredGPUNum
);
assert
(
gpuInfos
.
length
>=
requiredGPUNum
);
const
allocatedGPUs
:
GPUInfo
[]
=
this
.
selectGPUsForTrial
(
gpuInfos
,
requiredGPUNum
);
const
allocatedGPUs
:
GPUInfo
[]
=
this
.
selectGPUsForTrial
(
gpuInfos
,
requiredGPUNum
);
allocatedGPUs
.
forEach
((
gpuInfo
:
GPUInfo
)
=>
{
allocatedGPUs
.
forEach
((
gpuInfo
:
GPUInfo
)
=>
{
if
(
rmMeta
.
occupiedGpuIndexMap
!==
undefined
)
{
if
(
rmMeta
.
occupiedGpuIndexMap
!==
undefined
)
{
let
num
=
rmMeta
.
occupiedGpuIndexMap
.
get
(
gpuInfo
.
index
);
let
num
:
number
|
undefined
=
rmMeta
.
occupiedGpuIndexMap
.
get
(
gpuInfo
.
index
);
if
(
num
===
undefined
)
{
if
(
num
===
undefined
)
{
num
=
0
;
num
=
0
;
}
}
rmMeta
.
occupiedGpuIndexMap
.
set
(
gpuInfo
.
index
,
num
+
1
);
rmMeta
.
occupiedGpuIndexMap
.
set
(
gpuInfo
.
index
,
num
+
1
);
}
else
{
}
else
{
throw
new
Error
(
`Machine
${
rmMeta
.
ip
}
occupiedGpuIndexMap initialize error!`
);
throw
new
Error
(
`Machine
${
rmMeta
.
ip
}
occupiedGpuIndexMap initialize error!`
);
}
}
});
});
trialJobDetail
.
gpuIndices
=
allocatedGPUs
;
trialJobDetail
.
gpuIndices
=
allocatedGPUs
;
trialJobDetail
.
rmMeta
=
rmMeta
;
trialJobDetail
.
rmMeta
=
rmMeta
;
return
{
return
{
resultType
:
ScheduleResultType
.
SUCCEED
,
resultType
:
ScheduleResultType
.
SUCCEED
,
scheduleInfo
:
{
scheduleInfo
:
{
rmMeta
:
rmMeta
,
rmMeta
:
rmMeta
,
cuda_visible_device
:
allocatedGPUs
.
map
((
gpuInfo
:
GPUInfo
)
=>
{
return
gpuInfo
.
index
;
}).
join
(
'
,
'
)
cuda_visible_device
:
allocatedGPUs
.
map
((
gpuInfo
:
GPUInfo
)
=>
{
return
gpuInfo
.
index
;
})
.
join
(
'
,
'
)
}
}
};
};
}
}
...
...
src/nni_manager/training_service/remote_machine/remoteMachineData.ts
View file @
ba8dccd6
...
@@ -23,7 +23,7 @@ import * as fs from 'fs';
...
@@ -23,7 +23,7 @@ import * as fs from 'fs';
import
{
Client
,
ConnectConfig
}
from
'
ssh2
'
;
import
{
Client
,
ConnectConfig
}
from
'
ssh2
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
JobApplicationForm
,
TrialJobDetail
,
TrialJobStatus
}
from
'
../../common/trainingService
'
;
import
{
JobApplicationForm
,
TrialJobDetail
,
TrialJobStatus
}
from
'
../../common/trainingService
'
;
import
{
GPU
Summary
,
GPUInfo
}
from
'
../common/gpuData
'
;
import
{
GPU
Info
,
GPUSummary
}
from
'
../common/gpuData
'
;
/**
/**
* Metadata of remote machine for configuration and statuc query
* Metadata of remote machine for configuration and statuc query
...
@@ -73,7 +73,6 @@ export class RemoteCommandResult {
...
@@ -73,7 +73,6 @@ export class RemoteCommandResult {
/**
/**
* RemoteMachineTrialJobDetail
* RemoteMachineTrialJobDetail
*/
*/
// tslint:disable-next-line:max-classes-per-file
export
class
RemoteMachineTrialJobDetail
implements
TrialJobDetail
{
export
class
RemoteMachineTrialJobDetail
implements
TrialJobDetail
{
public
id
:
string
;
public
id
:
string
;
public
status
:
TrialJobStatus
;
public
status
:
TrialJobStatus
;
...
@@ -98,7 +97,7 @@ export class RemoteMachineTrialJobDetail implements TrialJobDetail {
...
@@ -98,7 +97,7 @@ export class RemoteMachineTrialJobDetail implements TrialJobDetail {
this
.
form
=
form
;
this
.
form
=
form
;
this
.
sequenceId
=
sequenceId
;
this
.
sequenceId
=
sequenceId
;
this
.
tags
=
[];
this
.
tags
=
[];
this
.
gpuIndices
=
[]
this
.
gpuIndices
=
[]
;
}
}
}
}
...
@@ -112,7 +111,7 @@ export class SSHClient {
...
@@ -112,7 +111,7 @@ export class SSHClient {
this
.
sshClient
=
sshClient
;
this
.
sshClient
=
sshClient
;
this
.
usedConnectionNumber
=
usedConnectionNumber
;
this
.
usedConnectionNumber
=
usedConnectionNumber
;
}
}
public
get
getSSHClientInstance
():
Client
{
public
get
getSSHClientInstance
():
Client
{
return
this
.
sshClient
;
return
this
.
sshClient
;
}
}
...
@@ -121,17 +120,20 @@ export class SSHClient {
...
@@ -121,17 +120,20 @@ export class SSHClient {
return
this
.
usedConnectionNumber
;
return
this
.
usedConnectionNumber
;
}
}
public
addUsedConnectionNumber
()
{
public
addUsedConnectionNumber
()
:
void
{
this
.
usedConnectionNumber
+=
1
;
this
.
usedConnectionNumber
+=
1
;
}
}
public
minusUsedConnectionNumber
()
{
public
minusUsedConnectionNumber
()
:
void
{
this
.
usedConnectionNumber
-=
1
;
this
.
usedConnectionNumber
-=
1
;
}
}
}
}
/**
* The remote machine ssh client manager
*/
export
class
SSHClientManager
{
export
class
SSHClientManager
{
private
sshClientArray
:
SSHClient
[];
private
readonly
sshClientArray
:
SSHClient
[];
private
readonly
maxTrialNumberPerConnection
:
number
;
private
readonly
maxTrialNumberPerConnection
:
number
;
private
readonly
rmMeta
:
RemoteMachineMeta
;
private
readonly
rmMeta
:
RemoteMachineMeta
;
constructor
(
sshClientArray
:
SSHClient
[],
maxTrialNumberPerConnection
:
number
,
rmMeta
:
RemoteMachineMeta
)
{
constructor
(
sshClientArray
:
SSHClient
[],
maxTrialNumberPerConnection
:
number
,
rmMeta
:
RemoteMachineMeta
)
{
...
@@ -140,122 +142,128 @@ export class SSHClientManager {
...
@@ -140,122 +142,128 @@ export class SSHClientManager {
this
.
maxTrialNumberPerConnection
=
maxTrialNumberPerConnection
;
this
.
maxTrialNumberPerConnection
=
maxTrialNumberPerConnection
;
}
}
/**
* Create a new ssh connection client and initialize it
*/
private
initNewSSHClient
():
Promise
<
Client
>
{
const
deferred
:
Deferred
<
Client
>
=
new
Deferred
<
Client
>
();
const
conn
:
Client
=
new
Client
();
let
connectConfig
:
ConnectConfig
=
{
host
:
this
.
rmMeta
.
ip
,
port
:
this
.
rmMeta
.
port
,
username
:
this
.
rmMeta
.
username
};
if
(
this
.
rmMeta
.
passwd
)
{
connectConfig
.
password
=
this
.
rmMeta
.
passwd
;
}
else
if
(
this
.
rmMeta
.
sshKeyPath
)
{
if
(
!
fs
.
existsSync
(
this
.
rmMeta
.
sshKeyPath
))
{
//SSh key path is not a valid file, reject
deferred
.
reject
(
new
Error
(
`
${
this
.
rmMeta
.
sshKeyPath
}
does not exist.`
));
}
const
privateKey
:
string
=
fs
.
readFileSync
(
this
.
rmMeta
.
sshKeyPath
,
'
utf8
'
);
connectConfig
.
privateKey
=
privateKey
;
connectConfig
.
passphrase
=
this
.
rmMeta
.
passphrase
;
}
else
{
deferred
.
reject
(
new
Error
(
`No valid passwd or sshKeyPath is configed.`
));
}
conn
.
on
(
'
ready
'
,
()
=>
{
this
.
addNewSSHClient
(
conn
);
deferred
.
resolve
(
conn
);
}).
on
(
'
error
'
,
(
err
:
Error
)
=>
{
// SSH connection error, reject with error message
deferred
.
reject
(
new
Error
(
err
.
message
));
}).
connect
(
connectConfig
);
return
deferred
.
promise
;
}
/**
/**
* find a available ssh client in ssh array, if no ssh client available, return undefined
* find a available ssh client in ssh array, if no ssh client available, return undefined
*/
*/
public
async
getAvailableSSHClient
():
Promise
<
Client
>
{
public
async
getAvailableSSHClient
():
Promise
<
Client
>
{
const
deferred
:
Deferred
<
Client
>
=
new
Deferred
<
Client
>
();
const
deferred
:
Deferred
<
Client
>
=
new
Deferred
<
Client
>
();
for
(
const
index
in
this
.
sshClientArray
)
{
for
(
const
index
of
this
.
sshClientArray
.
keys
()
)
{
le
t
connectionNumber
:
number
=
this
.
sshClientArray
[
index
].
getUsedConnectionNumber
;
cons
t
connectionNumber
:
number
=
this
.
sshClientArray
[
index
].
getUsedConnectionNumber
;
if
(
connectionNumber
<
this
.
maxTrialNumberPerConnection
)
{
if
(
connectionNumber
<
this
.
maxTrialNumberPerConnection
)
{
this
.
sshClientArray
[
index
].
addUsedConnectionNumber
();
this
.
sshClientArray
[
index
].
addUsedConnectionNumber
();
deferred
.
resolve
(
this
.
sshClientArray
[
index
].
getSSHClientInstance
);
deferred
.
resolve
(
this
.
sshClientArray
[
index
].
getSSHClientInstance
);
return
deferred
.
promise
;
return
deferred
.
promise
;
}
}
};
}
//init a new ssh client if could not get an available one
//init a new ssh client if could not get an available one
return
await
this
.
initNewSSHClient
();
return
this
.
initNewSSHClient
();
}
}
/**
/**
* add a new ssh client to sshClientArray
* add a new ssh client to sshClientArray
* @param sshClient
* @param sshClient
SSH Client
*/
*/
public
addNewSSHClient
(
client
:
Client
)
{
public
addNewSSHClient
(
client
:
Client
)
:
void
{
this
.
sshClientArray
.
push
(
new
SSHClient
(
client
,
1
));
this
.
sshClientArray
.
push
(
new
SSHClient
(
client
,
1
));
}
}
/**
/**
* first ssh cli
l
ent instance is used for gpu collector and host job
* first ssh client instance is used for gpu collector and host job
*/
*/
public
getFirstSSHClient
()
{
public
getFirstSSHClient
()
:
Client
{
return
this
.
sshClientArray
[
0
].
getSSHClientInstance
;
return
this
.
sshClientArray
[
0
].
getSSHClientInstance
;
}
}
/**
/**
* close all of ssh client
* close all of ssh client
*/
*/
public
closeAllSSHClient
()
{
public
closeAllSSHClient
()
:
void
{
for
(
le
t
sshClient
of
this
.
sshClientArray
)
{
for
(
cons
t
sshClient
of
this
.
sshClientArray
)
{
sshClient
.
getSSHClientInstance
.
end
();
sshClient
.
getSSHClientInstance
.
end
();
}
}
}
}
/**
/**
* retrieve resource, minus a number for given ssh client
* retrieve resource, minus a number for given ssh client
* @param client
* @param client
SSH Client
*/
*/
public
releaseConnection
(
client
:
Client
|
undefined
)
{
public
releaseConnection
(
client
:
Client
|
undefined
)
:
void
{
if
(
!
client
)
{
if
(
client
===
undefined
)
{
throw
new
Error
(
`could not release a undefined ssh client`
);
throw
new
Error
(
`could not release a undefined ssh client`
);
}
}
for
(
le
t
index
in
this
.
sshClientArray
)
{
for
(
cons
t
index
of
this
.
sshClientArray
.
keys
()
)
{
if
(
this
.
sshClientArray
[
index
].
getSSHClientInstance
===
client
)
{
if
(
this
.
sshClientArray
[
index
].
getSSHClientInstance
===
client
)
{
this
.
sshClientArray
[
index
].
minusUsedConnectionNumber
();
this
.
sshClientArray
[
index
].
minusUsedConnectionNumber
();
break
;
break
;
}
}
}
}
}
}
}
/**
* Create a new ssh connection client and initialize it
*/
// tslint:disable:non-literal-fs-path
private
initNewSSHClient
():
Promise
<
Client
>
{
const
deferred
:
Deferred
<
Client
>
=
new
Deferred
<
Client
>
();
const
conn
:
Client
=
new
Client
();
const
connectConfig
:
ConnectConfig
=
{
host
:
this
.
rmMeta
.
ip
,
port
:
this
.
rmMeta
.
port
,
username
:
this
.
rmMeta
.
username
};
if
(
this
.
rmMeta
.
passwd
!==
undefined
)
{
connectConfig
.
password
=
this
.
rmMeta
.
passwd
;
}
else
if
(
this
.
rmMeta
.
sshKeyPath
!==
undefined
)
{
if
(
!
fs
.
existsSync
(
this
.
rmMeta
.
sshKeyPath
))
{
//SSh key path is not a valid file, reject
deferred
.
reject
(
new
Error
(
`
${
this
.
rmMeta
.
sshKeyPath
}
does not exist.`
));
}
const
privateKey
:
string
=
fs
.
readFileSync
(
this
.
rmMeta
.
sshKeyPath
,
'
utf8
'
);
connectConfig
.
privateKey
=
privateKey
;
connectConfig
.
passphrase
=
this
.
rmMeta
.
passphrase
;
}
else
{
deferred
.
reject
(
new
Error
(
`No valid passwd or sshKeyPath is configed.`
));
}
conn
.
on
(
'
ready
'
,
()
=>
{
this
.
addNewSSHClient
(
conn
);
deferred
.
resolve
(
conn
);
})
.
on
(
'
error
'
,
(
err
:
Error
)
=>
{
// SSH connection error, reject with error message
deferred
.
reject
(
new
Error
(
err
.
message
));
})
.
connect
(
connectConfig
);
return
deferred
.
promise
;
}
}
export
type
RemoteMachineScheduleResult
=
{
scheduleInfo
:
RemoteMachineScheduleInfo
|
undefined
;
resultType
:
ScheduleResultType
};
export
type
RemoteMachineScheduleResult
=
{
scheduleInfo
:
RemoteMachineScheduleInfo
|
undefined
;
resultType
:
ScheduleResultType
};
export
type
RemoteMachineScheduleInfo
=
{
rmMeta
:
RemoteMachineMeta
;
cuda_visible_device
:
string
};
export
type
RemoteMachineScheduleInfo
=
{
rmMeta
:
RemoteMachineMeta
;
cuda_visible_device
:
string
};
export
enum
ScheduleResultType
{
export
enum
ScheduleResultType
{
/
*
Schedule succeeded
*/
/
/
Schedule succeeded
SUCCEED
,
SUCCEED
,
/
*
Temporarily, no enough available GPU right now
*/
/
/
Temporarily, no enough available GPU right now
TMP_NO_AVAILABLE_GPU
,
TMP_NO_AVAILABLE_GPU
,
/
*
Cannot match requirement even if all GPU are a
*/
/
/
Cannot match requirement even if all GPU are a
REQUIRE_EXCEED_TOTAL
REQUIRE_EXCEED_TOTAL
}
}
export
const
REMOTEMACHINE_TRIAL_COMMAND_FORMAT
:
string
=
export
const
REMOTEMACHINE_TRIAL_COMMAND_FORMAT
:
string
=
`#!/bin/bash
`#!/bin/bash
export NNI_PLATFORM=remote NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4} export MULTI_PHASE={5}
export NNI_PLATFORM=remote NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} \
NNI_TRIAL_SEQ_ID={4} export MULTI_PHASE={5}
cd $NNI_SYS_DIR
cd $NNI_SYS_DIR
sh install_nni.sh
sh install_nni.sh
echo $$ >{6}
echo $$ >{6}
python3 -m nni_trial_tool.trial_keeper --trial_command '{7}' --nnimanager_ip '{8}' --nnimanager_port '{9}' --nni_manager_version '{10}' --log_collection '{11}' 1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr
python3 -m nni_trial_tool.trial_keeper --trial_command '{7}' --nnimanager_ip '{8}' --nnimanager_port '{9}' \
--nni_manager_version '{10}' --log_collection '{11}' 1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr
echo $?
\`
date +%s%3N
\`
>{12}`
;
echo $?
\`
date +%s%3N
\`
>{12}`
;
export
const
HOST_JOB_SHELL_FORMAT
:
string
=
export
const
HOST_JOB_SHELL_FORMAT
:
string
=
...
...
src/nni_manager/training_service/remote_machine/remoteMachineJobRestServer.ts
View file @
ba8dccd6
...
@@ -19,17 +19,17 @@
...
@@ -19,17 +19,17 @@
'
use strict
'
;
'
use strict
'
;
import
*
as
component
from
'
../../common/component
'
;
import
{
Inject
}
from
'
typescript-ioc
'
;
import
{
Inject
}
from
'
typescript-ioc
'
;
import
*
as
component
from
'
../../common/component
'
;
import
{
ClusterJobRestServer
}
from
'
../common/clusterJobRestServer
'
;
import
{
RemoteMachineTrainingService
}
from
'
./remoteMachineTrainingService
'
;
import
{
RemoteMachineTrainingService
}
from
'
./remoteMachineTrainingService
'
;
import
{
ClusterJobRestServer
}
from
'
../common/clusterJobRestServer
'
/**
/**
* RemoteMachine Training service Rest server, provides rest RemoteMachine to support remotemachine job metrics update
* RemoteMachine Training service Rest server, provides rest RemoteMachine to support remotemachine job metrics update
*
*
*/
*/
@
component
.
Singleton
@
component
.
Singleton
export
class
RemoteMachineJobRestServer
extends
ClusterJobRestServer
{
export
class
RemoteMachineJobRestServer
extends
ClusterJobRestServer
{
@
Inject
@
Inject
private
readonly
remoteMachineTrainingService
:
RemoteMachineTrainingService
;
private
readonly
remoteMachineTrainingService
:
RemoteMachineTrainingService
;
...
@@ -41,6 +41,7 @@ export class RemoteMachineJobRestServer extends ClusterJobRestServer{
...
@@ -41,6 +41,7 @@ export class RemoteMachineJobRestServer extends ClusterJobRestServer{
this
.
remoteMachineTrainingService
=
component
.
get
(
RemoteMachineTrainingService
);
this
.
remoteMachineTrainingService
=
component
.
get
(
RemoteMachineTrainingService
);
}
}
// tslint:disable-next-line:no-any
protected
handleTrialMetrics
(
jobId
:
string
,
metrics
:
any
[])
:
void
{
protected
handleTrialMetrics
(
jobId
:
string
,
metrics
:
any
[])
:
void
{
// Split metrics array into single metric, then emit
// Split metrics array into single metric, then emit
// Warning: If not split metrics into single ones, the behavior will be UNKNOWNls
// Warning: If not split metrics into single ones, the behavior will be UNKNOWNls
...
@@ -51,4 +52,4 @@ export class RemoteMachineJobRestServer extends ClusterJobRestServer{
...
@@ -51,4 +52,4 @@ export class RemoteMachineJobRestServer extends ClusterJobRestServer{
});
});
}
}
}
}
}
}
\ No newline at end of file
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
View file @
ba8dccd6
...
@@ -34,42 +34,45 @@ import { getExperimentId, getInitTrialSequenceId } from '../../common/experiment
...
@@ -34,42 +34,45 @@ import { getExperimentId, getInitTrialSequenceId } from '../../common/experiment
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
ObservableTimer
}
from
'
../../common/observableTimer
'
;
import
{
ObservableTimer
}
from
'
../../common/observableTimer
'
;
import
{
import
{
HostJobApplicationForm
,
HyperParameters
,
JobApplicationForm
,
TrainingService
,
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobMetric
,
NNIManagerIpConfig
HostJobApplicationForm
,
HyperParameters
,
JobApplicationForm
,
NNIManagerIpConfig
,
TrainingService
,
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobMetric
}
from
'
../../common/trainingService
'
;
}
from
'
../../common/trainingService
'
;
import
{
delay
,
generateParamFileName
,
getExperimentRootDir
,
uniqueString
,
getJobCancelStatus
,
getRemoteTmpDir
,
getIPV4Address
,
getVersion
,
unixPathJoin
}
from
'
../../common/utils
'
;
import
{
import
{
GPUSummary
}
from
'
../common/gpuData
'
;
delay
,
generateParamFileName
,
getExperimentRootDir
,
getIPV4Address
,
getJobCancelStatus
,
getRemoteTmpDir
,
getVersion
,
uniqueString
,
unixPathJoin
}
from
'
../../common/utils
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../common/containerJobData
'
;
import
{
GPU_INFO_COLLECTOR_FORMAT_LINUX
,
GPUSummary
}
from
'
../common/gpuData
'
;
import
{
TrialConfig
}
from
'
../common/trialConfig
'
;
import
{
TrialConfig
}
from
'
../common/trialConfig
'
;
import
{
TrialConfigMetadataKey
}
from
'
../common/trialConfigMetadataKey
'
;
import
{
TrialConfigMetadataKey
}
from
'
../common/trialConfigMetadataKey
'
;
import
{
execCopydir
,
execMkdir
,
execRemove
,
validateCodeDir
}
from
'
../common/util
'
;
import
{
GPUScheduler
}
from
'
./gpuScheduler
'
;
import
{
GPUScheduler
}
from
'
./gpuScheduler
'
;
import
{
import
{
HOST_JOB_SHELL_FORMAT
,
RemoteCommandResult
,
RemoteMachineMeta
,
HOST_JOB_SHELL_FORMAT
,
RemoteCommandResult
,
REMOTEMACHINE_TRIAL_COMMAND_FORMAT
,
RemoteMachineMeta
,
RemoteMachineScheduleInfo
,
RemoteMachineScheduleResult
,
SSHClient
,
SSHClientManager
,
RemoteMachineScheduleInfo
,
RemoteMachineScheduleResult
,
RemoteMachineTrialJobDetail
,
RemoteMachineTrialJobDetail
,
ScheduleResultType
,
REMOTEMACHINE_TRIAL_COMMAND_FORMAT
ScheduleResultType
,
SSHClient
,
SSHClientManager
}
from
'
./remoteMachineData
'
;
}
from
'
./remoteMachineData
'
;
import
{
GPU_INFO_COLLECTOR_FORMAT_LINUX
}
from
'
../common/gpuData
'
;
import
{
SSHClientUtility
}
from
'
./sshClientUtility
'
;
import
{
validateCodeDir
,
execRemove
,
execMkdir
,
execCopydir
}
from
'
../common/util
'
;
import
{
RemoteMachineJobRestServer
}
from
'
./remoteMachineJobRestServer
'
;
import
{
RemoteMachineJobRestServer
}
from
'
./remoteMachineJobRestServer
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../common/containerJobData
'
;
import
{
SSHClientUtility
}
from
'
./sshClientUtility
'
;
/**
/**
* Training Service implementation for Remote Machine (Linux)
* Training Service implementation for Remote Machine (Linux)
*/
*/
@
component
.
Singleton
@
component
.
Singleton
class
RemoteMachineTrainingService
implements
TrainingService
{
class
RemoteMachineTrainingService
implements
TrainingService
{
private
machineSSHClientMap
:
Map
<
RemoteMachineMeta
,
SSHClientManager
>
;
//machine ssh client map
private
readonly
machineSSHClientMap
:
Map
<
RemoteMachineMeta
,
SSHClientManager
>
;
//machine ssh client map
private
trialSSHClientMap
:
Map
<
string
,
Client
>
;
//trial ssh client map
private
readonly
trialSSHClientMap
:
Map
<
string
,
Client
>
;
//trial ssh client map
private
trialJobsMap
:
Map
<
string
,
RemoteMachineTrialJobDetail
>
;
private
readonly
trialJobsMap
:
Map
<
string
,
RemoteMachineTrialJobDetail
>
;
private
readonly
MAX_TRIAL_NUMBER_PER_SSHCONNECTION
:
number
=
5
// every ssh client has a max trial concurrency number
private
readonly
MAX_TRIAL_NUMBER_PER_SSHCONNECTION
:
number
=
5
;
// every ssh client has a max trial concurrency number
private
expRootDir
:
string
;
private
readonly
expRootDir
:
string
;
private
remoteExpRootDir
:
string
;
private
readonly
remoteExpRootDir
:
string
;
private
trialConfig
:
TrialConfig
|
undefined
;
private
trialConfig
:
TrialConfig
|
undefined
;
private
gpuScheduler
:
GPUScheduler
;
private
readonly
gpuScheduler
:
GPUScheduler
;
private
jobQueue
:
string
[];
private
readonly
jobQueue
:
string
[];
private
timer
:
ObservableTimer
;
private
readonly
timer
:
ObservableTimer
;
private
stopping
:
boolean
=
false
;
private
stopping
:
boolean
=
false
;
private
metricsEmitter
:
EventEmitter
;
private
readonly
metricsEmitter
:
EventEmitter
;
private
log
:
Logger
;
private
readonly
log
:
Logger
;
private
isMultiPhase
:
boolean
=
false
;
private
isMultiPhase
:
boolean
=
false
;
private
trialSequenceId
:
number
;
private
trialSequenceId
:
number
;
private
remoteRestServerPort
?:
number
;
private
remoteRestServerPort
?:
number
;
...
@@ -117,7 +120,7 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -117,7 +120,7 @@ class RemoteMachineTrainingService implements TrainingService {
break
;
break
;
}
}
}
}
if
(
restServer
.
getErrorMessage
)
{
if
(
restServer
.
getErrorMessage
!==
undefined
)
{
throw
new
Error
(
restServer
.
getErrorMessage
);
throw
new
Error
(
restServer
.
getErrorMessage
);
this
.
stopping
=
true
;
this
.
stopping
=
true
;
}
}
...
@@ -125,36 +128,37 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -125,36 +128,37 @@ class RemoteMachineTrainingService implements TrainingService {
}
}
this
.
log
.
info
(
'
Remote machine training service exit.
'
);
this
.
log
.
info
(
'
Remote machine training service exit.
'
);
}
}
/**
/**
* give trial a ssh connection
* give trial a ssh connection
* @param trial
* @param trial
remote machine trial job detail
*/
*/
public
async
allocateSSHClientForTrial
(
trial
:
RemoteMachineTrialJobDetail
):
Promise
<
void
>
{
public
async
allocateSSHClientForTrial
(
trial
:
RemoteMachineTrialJobDetail
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
if
(
!
trial
.
rmMeta
)
{
if
(
trial
.
rmMeta
===
undefined
)
{
throw
new
Error
(
`rmMeta not set in trial
${
trial
.
id
}
`
);
throw
new
Error
(
`rmMeta not set in trial
${
trial
.
id
}
`
);
}
}
le
t
sshClientManager
:
SSHClientManager
|
undefined
=
this
.
machineSSHClientMap
.
get
(
trial
.
rmMeta
);
cons
t
sshClientManager
:
SSHClientManager
|
undefined
=
this
.
machineSSHClientMap
.
get
(
trial
.
rmMeta
);
if
(
!
sshClientManager
)
{
if
(
sshClientManager
===
undefined
)
{
throw
new
Error
(
`remoteSSHClient not initialized`
);
throw
new
Error
(
`remoteSSHClient not initialized`
);
}
}
le
t
sshClient
:
Client
=
await
sshClientManager
.
getAvailableSSHClient
();
cons
t
sshClient
:
Client
=
await
sshClientManager
.
getAvailableSSHClient
();
this
.
trialSSHClientMap
.
set
(
trial
.
id
,
sshClient
);
this
.
trialSSHClientMap
.
set
(
trial
.
id
,
sshClient
);
deferred
.
resolve
();
deferred
.
resolve
();
return
deferred
.
promise
;
return
deferred
.
promise
;
}
}
/**
/**
* If a trial is finished, release the connection resource
* If a trial is finished, release the connection resource
* @param trial
* @param trial
remote machine trial job detail
*/
*/
public
releaseTrialSSHClient
(
trial
:
RemoteMachineTrialJobDetail
):
void
{
public
releaseTrialSSHClient
(
trial
:
RemoteMachineTrialJobDetail
):
void
{
if
(
!
trial
.
rmMeta
)
{
if
(
trial
.
rmMeta
===
undefined
)
{
throw
new
Error
(
`rmMeta not set in trial
${
trial
.
id
}
`
);
throw
new
Error
(
`rmMeta not set in trial
${
trial
.
id
}
`
);
}
}
le
t
sshClientManager
:
SSHClientManager
|
undefined
=
this
.
machineSSHClientMap
.
get
(
trial
.
rmMeta
);
cons
t
sshClientManager
:
SSHClientManager
|
undefined
=
this
.
machineSSHClientMap
.
get
(
trial
.
rmMeta
);
if
(
!
sshClientManager
)
{
if
(
sshClientManager
===
undefined
)
{
throw
new
Error
(
`sshClientManager not initialized`
);
throw
new
Error
(
`sshClientManager not initialized`
);
}
}
sshClientManager
.
releaseConnection
(
this
.
trialSSHClientMap
.
get
(
trial
.
id
));
sshClientManager
.
releaseConnection
(
this
.
trialSSHClientMap
.
get
(
trial
.
id
));
...
@@ -167,11 +171,11 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -167,11 +171,11 @@ class RemoteMachineTrainingService implements TrainingService {
const
jobs
:
TrialJobDetail
[]
=
[];
const
jobs
:
TrialJobDetail
[]
=
[];
const
deferred
:
Deferred
<
TrialJobDetail
[]
>
=
new
Deferred
<
TrialJobDetail
[]
>
();
const
deferred
:
Deferred
<
TrialJobDetail
[]
>
=
new
Deferred
<
TrialJobDetail
[]
>
();
for
(
const
[
key
,
value
]
of
this
.
trialJobsMap
)
{
for
(
const
[
key
,
value
]
of
this
.
trialJobsMap
)
{
if
(
value
.
form
.
jobType
===
'
TRIAL
'
)
{
if
(
value
.
form
.
jobType
===
'
TRIAL
'
)
{
jobs
.
push
(
await
this
.
getTrialJob
(
key
));
jobs
.
push
(
await
this
.
getTrialJob
(
key
));
}
}
}
;
}
deferred
.
resolve
(
jobs
);
deferred
.
resolve
(
jobs
);
return
deferred
.
promise
;
return
deferred
.
promise
;
...
@@ -183,7 +187,7 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -183,7 +187,7 @@ class RemoteMachineTrainingService implements TrainingService {
*/
*/
public
async
getTrialJob
(
trialJobId
:
string
):
Promise
<
TrialJobDetail
>
{
public
async
getTrialJob
(
trialJobId
:
string
):
Promise
<
TrialJobDetail
>
{
const
trialJob
:
RemoteMachineTrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
const
trialJob
:
RemoteMachineTrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
if
(
!
trialJob
)
{
if
(
trialJob
===
undefined
)
{
throw
new
NNIError
(
NNIErrorNames
.
NOT_FOUND
,
`trial job id
${
trialJobId
}
not found`
);
throw
new
NNIError
(
NNIErrorNames
.
NOT_FOUND
,
`trial job id
${
trialJobId
}
not found`
);
}
}
//TO DO: add another job status, and design new job status change logic
//TO DO: add another job status, and design new job status change logic
...
@@ -193,7 +197,7 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -193,7 +197,7 @@ class RemoteMachineTrainingService implements TrainingService {
throw
new
Error
(
`rmMeta not set for submitted job
${
trialJobId
}
`
);
throw
new
Error
(
`rmMeta not set for submitted job
${
trialJobId
}
`
);
}
}
const
sshClient
:
Client
|
undefined
=
this
.
trialSSHClientMap
.
get
(
trialJob
.
id
);
const
sshClient
:
Client
|
undefined
=
this
.
trialSSHClientMap
.
get
(
trialJob
.
id
);
if
(
!
sshClient
)
{
if
(
sshClient
===
undefined
)
{
throw
new
Error
(
`Invalid job id:
${
trialJobId
}
, cannot find ssh client`
);
throw
new
Error
(
`Invalid job id:
${
trialJobId
}
, cannot find ssh client`
);
}
}
...
@@ -223,8 +227,9 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -223,8 +227,9 @@ class RemoteMachineTrainingService implements TrainingService {
* Submit trial job
* Submit trial job
* @param form trial job description form
* @param form trial job description form
*/
*/
// tslint:disable-next-line:informative-docs
public
async
submitTrialJob
(
form
:
JobApplicationForm
):
Promise
<
TrialJobDetail
>
{
public
async
submitTrialJob
(
form
:
JobApplicationForm
):
Promise
<
TrialJobDetail
>
{
if
(
!
this
.
trialConfig
)
{
if
(
this
.
trialConfig
===
undefined
)
{
throw
new
Error
(
'
trial config is not initialized
'
);
throw
new
Error
(
'
trial config is not initialized
'
);
}
}
...
@@ -275,17 +280,6 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -275,17 +280,6 @@ class RemoteMachineTrainingService implements TrainingService {
return
trialJobDetail
;
return
trialJobDetail
;
}
}
/**
* remove gpu reversion when job is not running
*/
private
updateGpuReservation
()
{
for
(
const
[
key
,
value
]
of
this
.
trialJobsMap
)
{
if
(
!
[
'
WAITING
'
,
'
RUNNING
'
].
includes
(
value
.
status
))
{
this
.
gpuScheduler
.
removeGpuReservation
(
key
,
this
.
trialJobsMap
);
}
};
}
/**
/**
* Is multiphase job supported in current training service
* Is multiphase job supported in current training service
...
@@ -298,10 +292,11 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -298,10 +292,11 @@ class RemoteMachineTrainingService implements TrainingService {
* Cancel trial job
* Cancel trial job
* @param trialJobId ID of trial job
* @param trialJobId ID of trial job
*/
*/
// tslint:disable:informative-docs no-unsafe-any
public
async
cancelTrialJob
(
trialJobId
:
string
,
isEarlyStopped
:
boolean
=
false
):
Promise
<
void
>
{
public
async
cancelTrialJob
(
trialJobId
:
string
,
isEarlyStopped
:
boolean
=
false
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
trialJob
:
RemoteMachineTrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
const
trialJob
:
RemoteMachineTrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
if
(
!
trialJob
)
{
if
(
trialJob
===
undefined
)
{
deferred
.
reject
();
deferred
.
reject
();
throw
new
Error
(
`trial job id
${
trialJobId
}
not found`
);
throw
new
Error
(
`trial job id
${
trialJobId
}
not found`
);
}
}
...
@@ -316,7 +311,7 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -316,7 +311,7 @@ class RemoteMachineTrainingService implements TrainingService {
if
(
trialJob
.
rmMeta
!==
undefined
)
{
if
(
trialJob
.
rmMeta
!==
undefined
)
{
// If the trial job is already scheduled, check its status and kill the trial process in remote machine
// If the trial job is already scheduled, check its status and kill the trial process in remote machine
const
sshClient
:
Client
|
undefined
=
this
.
trialSSHClientMap
.
get
(
trialJob
.
id
);
const
sshClient
:
Client
|
undefined
=
this
.
trialSSHClientMap
.
get
(
trialJob
.
id
);
if
(
!
sshClient
)
{
if
(
sshClient
===
undefined
)
{
deferred
.
reject
();
deferred
.
reject
();
throw
new
Error
(
`Invalid job id
${
trialJobId
}
, cannot find ssh client`
);
throw
new
Error
(
`Invalid job id
${
trialJobId
}
, cannot find ssh client`
);
}
}
...
@@ -358,20 +353,23 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -358,20 +353,23 @@ class RemoteMachineTrainingService implements TrainingService {
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
const
remoteMachineTrailConfig
:
TrialConfig
=
<
TrialConfig
>
JSON
.
parse
(
value
);
const
remoteMachineTrailConfig
:
TrialConfig
=
<
TrialConfig
>
JSON
.
parse
(
value
);
// Parse trial config failed, throw Error
// Parse trial config failed, throw Error
if
(
!
remoteMachineTrailConfig
)
{
if
(
remoteMachineTrailConfig
===
undefined
)
{
throw
new
Error
(
'
trial config parsed failed
'
);
throw
new
Error
(
'
trial config parsed failed
'
);
}
}
// codeDir is not a valid directory, throw Error
// codeDir is not a valid directory, throw Error
if
(
!
fs
.
lstatSync
(
remoteMachineTrailConfig
.
codeDir
).
isDirectory
())
{
// tslint:disable-next-line:non-literal-fs-path
if
(
!
fs
.
lstatSync
(
remoteMachineTrailConfig
.
codeDir
)
.
isDirectory
())
{
throw
new
Error
(
`codeDir
${
remoteMachineTrailConfig
.
codeDir
}
is not a directory`
);
throw
new
Error
(
`codeDir
${
remoteMachineTrailConfig
.
codeDir
}
is not a directory`
);
}
}
// Validate to make sure codeDir doesn't have too many files
// Validate to make sure codeDir doesn't have too many files
try
{
try
{
await
validateCodeDir
(
remoteMachineTrailConfig
.
codeDir
);
await
validateCodeDir
(
remoteMachineTrailConfig
.
codeDir
);
}
catch
(
error
)
{
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
this
.
log
.
error
(
error
);
return
Promise
.
reject
(
new
Error
(
error
));
return
Promise
.
reject
(
new
Error
(
error
));
}
}
this
.
trialConfig
=
remoteMachineTrailConfig
;
this
.
trialConfig
=
remoteMachineTrailConfig
;
...
@@ -400,60 +398,73 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -400,60 +398,73 @@ class RemoteMachineTrainingService implements TrainingService {
return
deferred
.
promise
;
return
deferred
.
promise
;
}
}
/**
/**
* cleanup() has a time out of 10s to clean remote connections
* cleanup() has a time out of 10s to clean remote connections
*/
*/
public
async
cleanUp
():
Promise
<
void
>
{
public
async
cleanUp
():
Promise
<
void
>
{
this
.
log
.
info
(
'
Stopping remote machine training service...
'
);
this
.
log
.
info
(
'
Stopping remote machine training service...
'
);
this
.
stopping
=
true
;
this
.
stopping
=
true
;
await
Promise
.
race
([
delay
(
10000
),
this
.
cleanupConnections
()]);
await
Promise
.
race
([
delay
(
10000
),
this
.
cleanupConnections
()]);
}
}
/**
* remove gpu reversion when job is not running
*/
private
updateGpuReservation
():
void
{
for
(
const
[
key
,
value
]
of
this
.
trialJobsMap
)
{
if
(
!
[
'
WAITING
'
,
'
RUNNING
'
].
includes
(
value
.
status
))
{
this
.
gpuScheduler
.
removeGpuReservation
(
key
,
this
.
trialJobsMap
);
}
}
}
/**
/**
* stop gpu_metric_collector process in remote machine and remove unused scripts
* stop gpu_metric_collector process in remote machine and remove unused scripts
*/
*/
private
async
cleanupConnections
():
Promise
<
void
>
{
private
async
cleanupConnections
():
Promise
<
void
>
{
try
{
try
{
for
(
const
[
rmMeta
,
sshClientManager
]
of
this
.
machineSSHClientMap
.
entries
())
{
for
(
const
[
rmMeta
,
sshClientManager
]
of
this
.
machineSSHClientMap
.
entries
())
{
le
t
jobpidPath
:
string
=
unixPathJoin
(
this
.
getRemoteScriptsPath
(
rmMeta
.
username
),
'
pid
'
);
cons
t
jobpidPath
:
string
=
unixPathJoin
(
this
.
getRemoteScriptsPath
(
rmMeta
.
username
),
'
pid
'
);
le
t
client
:
Client
|
undefined
=
sshClientManager
.
getFirstSSHClient
();
cons
t
client
:
Client
|
undefined
=
sshClientManager
.
getFirstSSHClient
();
if
(
client
)
{
if
(
client
!==
undefined
)
{
await
SSHClientUtility
.
remoteExeCommand
(
`pkill -P
\`
cat
${
jobpidPath
}
\`
`
,
client
);
await
SSHClientUtility
.
remoteExeCommand
(
`pkill -P
\`
cat
${
jobpidPath
}
\`
`
,
client
);
await
SSHClientUtility
.
remoteExeCommand
(
`rm -rf
${
this
.
getRemoteScriptsPath
(
rmMeta
.
username
)}
`
,
client
);
await
SSHClientUtility
.
remoteExeCommand
(
`rm -rf
${
this
.
getRemoteScriptsPath
(
rmMeta
.
username
)}
`
,
client
);
}
}
sshClientManager
.
closeAllSSHClient
();
sshClientManager
.
closeAllSSHClient
();
}
}
}
catch
(
error
)
{
}
catch
(
error
)
{
//ignore error, this function is called to cleanup remote connections when experiment is stopping
//ignore error, this function is called to cleanup remote connections when experiment is stopping
this
.
log
.
error
(
`Cleanup connection exception, error is
${
error
.
message
}
`
);
this
.
log
.
error
(
`Cleanup connection exception, error is
${
error
.
message
}
`
);
}
}
return
Promise
.
resolve
();
return
Promise
.
resolve
();
}
}
/**
/**
* Generate gpu metric collector directory to store temp gpu metric collector script files
* Generate gpu metric collector directory to store temp gpu metric collector script files
*/
*/
private
getLocalGpuMetricCollectorDir
():
string
{
private
getLocalGpuMetricCollectorDir
():
string
{
let
userName
:
string
=
path
.
basename
(
os
.
homedir
());
//get current user name of os
const
userName
:
string
=
path
.
basename
(
os
.
homedir
());
//get current user name of os
return
path
.
join
(
os
.
tmpdir
(),
userName
,
'
nni
'
,
'
scripts
'
);
return
path
.
join
(
os
.
tmpdir
(),
userName
,
'
nni
'
,
'
scripts
'
);
}
}
/**
/**
* Generate gpu metric collector shell script in local machine,
* Generate gpu metric collector shell script in local machine,
* used to run in remote machine, and will be deleted after uploaded from local.
* used to run in remote machine, and will be deleted after uploaded from local.
*/
*/
private
async
generateGpuMetricsCollectorScript
(
userName
:
string
):
Promise
<
void
>
{
private
async
generateGpuMetricsCollectorScript
(
userName
:
string
):
Promise
<
void
>
{
le
t
gpuMetricCollectorScriptFolder
:
string
=
this
.
getLocalGpuMetricCollectorDir
();
cons
t
gpuMetricCollectorScriptFolder
:
string
=
this
.
getLocalGpuMetricCollectorDir
();
await
execMkdir
(
path
.
join
(
gpuMetricCollectorScriptFolder
,
userName
));
await
execMkdir
(
path
.
join
(
gpuMetricCollectorScriptFolder
,
userName
));
//generate gpu_metrics_collector.sh
//generate gpu_metrics_collector.sh
let
gpuMetricsCollectorScriptPath
:
string
=
path
.
join
(
gpuMetricCollectorScriptFolder
,
userName
,
'
gpu_metrics_collector.sh
'
);
const
gpuMetricsCollectorScriptPath
:
string
=
path
.
join
(
gpuMetricCollectorScriptFolder
,
userName
,
'
gpu_metrics_collector.sh
'
);
const
remoteGPUScriptsDir
:
string
=
this
.
getRemoteScriptsPath
(
userName
);
// This directory is used to store gpu_metrics and pid created by script
// This directory is used to store gpu_metrics and pid created by script
const
remoteGPUScriptsDir
:
string
=
this
.
getRemoteScriptsPath
(
userName
);
const
gpuMetricsCollectorScriptContent
:
string
=
String
.
Format
(
const
gpuMetricsCollectorScriptContent
:
string
=
String
.
Format
(
GPU_INFO_COLLECTOR_FORMAT_LINUX
,
GPU_INFO_COLLECTOR_FORMAT_LINUX
,
remoteGPUScriptsDir
,
remoteGPUScriptsDir
,
unixPathJoin
(
remoteGPUScriptsDir
,
'
pid
'
)
,
unixPathJoin
(
remoteGPUScriptsDir
,
'
pid
'
)
);
);
await
fs
.
promises
.
writeFile
(
gpuMetricsCollectorScriptPath
,
gpuMetricsCollectorScriptContent
,
{
encoding
:
'
utf8
'
});
await
fs
.
promises
.
writeFile
(
gpuMetricsCollectorScriptPath
,
gpuMetricsCollectorScriptContent
,
{
encoding
:
'
utf8
'
});
}
}
...
@@ -467,39 +478,44 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -467,39 +478,44 @@ class RemoteMachineTrainingService implements TrainingService {
rmMetaList
.
forEach
(
async
(
rmMeta
:
RemoteMachineMeta
)
=>
{
rmMetaList
.
forEach
(
async
(
rmMeta
:
RemoteMachineMeta
)
=>
{
rmMeta
.
occupiedGpuIndexMap
=
new
Map
<
number
,
number
>
();
rmMeta
.
occupiedGpuIndexMap
=
new
Map
<
number
,
number
>
();
le
t
sshClientManager
:
SSHClientManager
=
new
SSHClientManager
([],
this
.
MAX_TRIAL_NUMBER_PER_SSHCONNECTION
,
rmMeta
);
cons
t
sshClientManager
:
SSHClientManager
=
new
SSHClientManager
([],
this
.
MAX_TRIAL_NUMBER_PER_SSHCONNECTION
,
rmMeta
);
le
t
sshClient
:
Client
=
await
sshClientManager
.
getAvailableSSHClient
();
cons
t
sshClient
:
Client
=
await
sshClientManager
.
getAvailableSSHClient
();
this
.
machineSSHClientMap
.
set
(
rmMeta
,
sshClientManager
);
this
.
machineSSHClientMap
.
set
(
rmMeta
,
sshClientManager
);
await
this
.
initRemoteMachineOnConnected
(
rmMeta
,
sshClient
);
await
this
.
initRemoteMachineOnConnected
(
rmMeta
,
sshClient
);
if
(
++
connectedRMNum
===
rmMetaList
.
length
)
{
if
(
++
connectedRMNum
===
rmMetaList
.
length
)
{
deferred
.
resolve
();
deferred
.
resolve
();
}
}
});
});
return
deferred
.
promise
;
return
deferred
.
promise
;
}
}
private
async
initRemoteMachineOnConnected
(
rmMeta
:
RemoteMachineMeta
,
conn
:
Client
):
Promise
<
void
>
{
private
async
initRemoteMachineOnConnected
(
rmMeta
:
RemoteMachineMeta
,
conn
:
Client
):
Promise
<
void
>
{
// Create root working directory after ssh connection is ready
// Create root working directory after ssh connection is ready
await
this
.
generateGpuMetricsCollectorScript
(
rmMeta
.
username
);
//generate gpu script in local machine first, will copy to remote machine later
// generate gpu script in local machine first, will copy to remote machine later
await
this
.
generateGpuMetricsCollectorScript
(
rmMeta
.
username
);
const
nniRootDir
:
string
=
unixPathJoin
(
getRemoteTmpDir
(
this
.
remoteOS
),
'
nni
'
);
const
nniRootDir
:
string
=
unixPathJoin
(
getRemoteTmpDir
(
this
.
remoteOS
),
'
nni
'
);
await
SSHClientUtility
.
remoteExeCommand
(
`mkdir -p
${
this
.
remoteExpRootDir
}
`
,
conn
);
await
SSHClientUtility
.
remoteExeCommand
(
`mkdir -p
${
this
.
remoteExpRootDir
}
`
,
conn
);
// Copy NNI scripts to remote expeirment working directory
// Copy NNI scripts to remote expeirment working directory
const
localGpuScriptCollectorDir
:
string
=
this
.
getLocalGpuMetricCollectorDir
();
const
localGpuScriptCollectorDir
:
string
=
this
.
getLocalGpuMetricCollectorDir
();
const
remoteGpuScriptCollectorDir
:
string
=
this
.
getRemoteScriptsPath
(
rmMeta
.
username
);
//the directory to store temp scripts in remote machine
// the directory to store temp scripts in remote machine
const
remoteGpuScriptCollectorDir
:
string
=
this
.
getRemoteScriptsPath
(
rmMeta
.
username
);
await
SSHClientUtility
.
remoteExeCommand
(
`mkdir -p
${
remoteGpuScriptCollectorDir
}
`
,
conn
);
await
SSHClientUtility
.
remoteExeCommand
(
`mkdir -p
${
remoteGpuScriptCollectorDir
}
`
,
conn
);
await
SSHClientUtility
.
remoteExeCommand
(
`chmod 777
${
nniRootDir
}
${
nniRootDir
}
/*
${
nniRootDir
}
/scripts/*`
,
conn
);
await
SSHClientUtility
.
remoteExeCommand
(
`chmod 777
${
nniRootDir
}
${
nniRootDir
}
/*
${
nniRootDir
}
/scripts/*`
,
conn
);
//copy gpu_metrics_collector.sh to remote
//copy gpu_metrics_collector.sh to remote
await
SSHClientUtility
.
copyFileToRemote
(
path
.
join
(
localGpuScriptCollectorDir
,
rmMeta
.
username
,
'
gpu_metrics_collector.sh
'
),
unixPathJoin
(
remoteGpuScriptCollectorDir
,
'
gpu_metrics_collector.sh
'
),
conn
);
await
SSHClientUtility
.
copyFileToRemote
(
path
.
join
(
localGpuScriptCollectorDir
,
rmMeta
.
username
,
'
gpu_metrics_collector.sh
'
),
unixPathJoin
(
remoteGpuScriptCollectorDir
,
'
gpu_metrics_collector.sh
'
),
conn
);
//Begin to execute gpu_metrics_collection scripts
//Begin to execute gpu_metrics_collection scripts
// tslint:disable-next-line: no-floating-promises
SSHClientUtility
.
remoteExeCommand
(
`bash
${
unixPathJoin
(
remoteGpuScriptCollectorDir
,
'
gpu_metrics_collector.sh
'
)}
`
,
conn
);
SSHClientUtility
.
remoteExeCommand
(
`bash
${
unixPathJoin
(
remoteGpuScriptCollectorDir
,
'
gpu_metrics_collector.sh
'
)}
`
,
conn
);
this
.
timer
.
subscribe
(
this
.
timer
.
subscribe
(
async
(
tick
:
number
)
=>
{
async
(
tick
:
number
)
=>
{
const
cmdresult
:
RemoteCommandResult
=
await
SSHClientUtility
.
remoteExeCommand
(
const
cmdresult
:
RemoteCommandResult
=
await
SSHClientUtility
.
remoteExeCommand
(
`tail -n 1
${
unixPathJoin
(
remoteGpuScriptCollectorDir
,
'
gpu_metrics
'
)}
`
,
conn
);
`tail -n 1
${
unixPathJoin
(
remoteGpuScriptCollectorDir
,
'
gpu_metrics
'
)}
`
,
conn
);
if
(
cmdresult
&&
cmdresult
.
stdout
)
{
if
(
cmdresult
!==
undefined
&&
cmdresult
.
stdout
!==
undefined
)
{
rmMeta
.
gpuSummary
=
<
GPUSummary
>
JSON
.
parse
(
cmdresult
.
stdout
);
rmMeta
.
gpuSummary
=
<
GPUSummary
>
JSON
.
parse
(
cmdresult
.
stdout
);
}
}
}
}
...
@@ -509,7 +525,7 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -509,7 +525,7 @@ class RemoteMachineTrainingService implements TrainingService {
private
async
prepareTrialJob
(
trialJobId
:
string
):
Promise
<
boolean
>
{
private
async
prepareTrialJob
(
trialJobId
:
string
):
Promise
<
boolean
>
{
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
if
(
!
this
.
trialConfig
)
{
if
(
this
.
trialConfig
===
undefined
)
{
throw
new
Error
(
'
trial config is not initialized
'
);
throw
new
Error
(
'
trial config is not initialized
'
);
}
}
const
trialJobDetail
:
RemoteMachineTrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
const
trialJobDetail
:
RemoteMachineTrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
...
@@ -519,6 +535,7 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -519,6 +535,7 @@ class RemoteMachineTrainingService implements TrainingService {
// If job is not WATIING, Don't prepare and resolve true immediately
// If job is not WATIING, Don't prepare and resolve true immediately
if
(
trialJobDetail
.
status
!==
'
WAITING
'
)
{
if
(
trialJobDetail
.
status
!==
'
WAITING
'
)
{
deferred
.
resolve
(
true
);
deferred
.
resolve
(
true
);
return
deferred
.
promise
;
return
deferred
.
promise
;
}
}
// get an ssh client from scheduler
// get an ssh client from scheduler
...
@@ -557,7 +574,7 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -557,7 +574,7 @@ class RemoteMachineTrainingService implements TrainingService {
private
async
launchTrialOnScheduledMachine
(
trialJobId
:
string
,
trialWorkingFolder
:
string
,
form
:
TrialJobApplicationForm
,
private
async
launchTrialOnScheduledMachine
(
trialJobId
:
string
,
trialWorkingFolder
:
string
,
form
:
TrialJobApplicationForm
,
rmScheduleInfo
:
RemoteMachineScheduleInfo
):
Promise
<
void
>
{
rmScheduleInfo
:
RemoteMachineScheduleInfo
):
Promise
<
void
>
{
if
(
!
this
.
trialConfig
)
{
if
(
this
.
trialConfig
===
undefined
)
{
throw
new
Error
(
'
trial config is not initialized
'
);
throw
new
Error
(
'
trial config is not initialized
'
);
}
}
const
cuda_visible_device
:
string
=
rmScheduleInfo
.
cuda_visible_device
;
const
cuda_visible_device
:
string
=
rmScheduleInfo
.
cuda_visible_device
;
...
@@ -584,18 +601,19 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -584,18 +601,19 @@ class RemoteMachineTrainingService implements TrainingService {
let
command
:
string
;
let
command
:
string
;
// Set CUDA_VISIBLE_DEVICES environment variable based on cuda_visible_device
// Set CUDA_VISIBLE_DEVICES environment variable based on cuda_visible_device
// If no valid cuda_visible_device is defined, set CUDA_VISIBLE_DEVICES to empty string to hide GPU device
// If no valid cuda_visible_device is defined, set CUDA_VISIBLE_DEVICES to empty string to hide GPU device
if
(
typeof
cuda_visible_device
===
'
string
'
&&
cuda_visible_device
.
length
>
0
)
{
if
(
typeof
cuda_visible_device
===
'
string
'
&&
cuda_visible_device
.
length
>
0
)
{
command
=
`CUDA_VISIBLE_DEVICES=
${
cuda_visible_device
}
${
this
.
trialConfig
.
command
}
`
;
command
=
`CUDA_VISIBLE_DEVICES=
${
cuda_visible_device
}
${
this
.
trialConfig
.
command
}
`
;
}
else
{
}
else
{
command
=
`CUDA_VISIBLE_DEVICES=" "
${
this
.
trialConfig
.
command
}
`
;
command
=
`CUDA_VISIBLE_DEVICES=" "
${
this
.
trialConfig
.
command
}
`
;
}
}
const
nniManagerIp
=
this
.
nniManagerIpConfig
?
this
.
nniManagerIpConfig
.
nniManagerIp
:
getIPV4Address
();
// tslint:disable-next-line: strict-boolean-expressions
if
(
!
this
.
remoteRestServerPort
)
{
const
nniManagerIp
:
string
=
this
.
nniManagerIpConfig
?
this
.
nniManagerIpConfig
.
nniManagerIp
:
getIPV4Address
();
if
(
this
.
remoteRestServerPort
===
undefined
)
{
const
restServer
:
RemoteMachineJobRestServer
=
component
.
get
(
RemoteMachineJobRestServer
);
const
restServer
:
RemoteMachineJobRestServer
=
component
.
get
(
RemoteMachineJobRestServer
);
this
.
remoteRestServerPort
=
restServer
.
clusterRestServerPort
;
this
.
remoteRestServerPort
=
restServer
.
clusterRestServerPort
;
}
}
const
version
=
this
.
versionCheck
?
await
getVersion
():
''
;
const
version
:
string
=
this
.
versionCheck
?
await
getVersion
()
:
''
;
const
runScriptTrialContent
:
string
=
String
.
Format
(
const
runScriptTrialContent
:
string
=
String
.
Format
(
REMOTEMACHINE_TRIAL_COMMAND_FORMAT
,
REMOTEMACHINE_TRIAL_COMMAND_FORMAT
,
trialWorkingFolder
,
trialWorkingFolder
,
...
@@ -611,7 +629,7 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -611,7 +629,7 @@ class RemoteMachineTrainingService implements TrainingService {
version
,
version
,
this
.
logCollection
,
this
.
logCollection
,
unixPathJoin
(
trialWorkingFolder
,
'
.nni
'
,
'
code
'
)
unixPathJoin
(
trialWorkingFolder
,
'
.nni
'
,
'
code
'
)
)
)
;
//create tmp trial working folder locally.
//create tmp trial working folder locally.
await
execMkdir
(
path
.
join
(
trialLocalTempFolder
,
'
.nni
'
));
await
execMkdir
(
path
.
join
(
trialLocalTempFolder
,
'
.nni
'
));
...
@@ -627,6 +645,7 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -627,6 +645,7 @@ class RemoteMachineTrainingService implements TrainingService {
// Copy files in codeDir to remote working directory
// Copy files in codeDir to remote working directory
await
SSHClientUtility
.
copyDirectoryToRemote
(
trialLocalTempFolder
,
trialWorkingFolder
,
sshClient
,
this
.
remoteOS
);
await
SSHClientUtility
.
copyDirectoryToRemote
(
trialLocalTempFolder
,
trialWorkingFolder
,
sshClient
,
this
.
remoteOS
);
// Execute command in remote machine
// Execute command in remote machine
// tslint:disable-next-line: no-floating-promises
SSHClientUtility
.
remoteExeCommand
(
`bash
${
unixPathJoin
(
trialWorkingFolder
,
'
run.sh
'
)}
`
,
sshClient
);
SSHClientUtility
.
remoteExeCommand
(
`bash
${
unixPathJoin
(
trialWorkingFolder
,
'
run.sh
'
)}
`
,
sshClient
);
}
}
...
@@ -636,7 +655,7 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -636,7 +655,7 @@ class RemoteMachineTrainingService implements TrainingService {
if
(
sshClientManager
===
undefined
)
{
if
(
sshClientManager
===
undefined
)
{
throw
new
Error
(
'
sshClient not found.
'
);
throw
new
Error
(
'
sshClient not found.
'
);
}
}
le
t
sshClient
:
Client
=
sshClientManager
.
getFirstSSHClient
();
cons
t
sshClient
:
Client
=
sshClientManager
.
getFirstSSHClient
();
const
jobId
:
string
=
uniqueString
(
5
);
const
jobId
:
string
=
uniqueString
(
5
);
const
localDir
:
string
=
path
.
join
(
this
.
expRootDir
,
'
hostjobs-local
'
,
jobId
);
const
localDir
:
string
=
path
.
join
(
this
.
expRootDir
,
'
hostjobs-local
'
,
jobId
);
const
remoteDir
:
string
=
this
.
getHostJobRemoteDir
(
jobId
);
const
remoteDir
:
string
=
this
.
getHostJobRemoteDir
(
jobId
);
...
@@ -648,6 +667,7 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -648,6 +667,7 @@ class RemoteMachineTrainingService implements TrainingService {
await
fs
.
promises
.
writeFile
(
path
.
join
(
localDir
,
'
run.sh
'
),
runScriptContent
,
{
encoding
:
'
utf8
'
});
await
fs
.
promises
.
writeFile
(
path
.
join
(
localDir
,
'
run.sh
'
),
runScriptContent
,
{
encoding
:
'
utf8
'
});
await
SSHClientUtility
.
copyFileToRemote
(
await
SSHClientUtility
.
copyFileToRemote
(
path
.
join
(
localDir
,
'
run.sh
'
),
unixPathJoin
(
remoteDir
,
'
run.sh
'
),
sshClient
);
path
.
join
(
localDir
,
'
run.sh
'
),
unixPathJoin
(
remoteDir
,
'
run.sh
'
),
sshClient
);
// tslint:disable-next-line: no-floating-promises
SSHClientUtility
.
remoteExeCommand
(
`bash
${
unixPathJoin
(
remoteDir
,
'
run.sh
'
)}
`
,
sshClient
);
SSHClientUtility
.
remoteExeCommand
(
`bash
${
unixPathJoin
(
remoteDir
,
'
run.sh
'
)}
`
,
sshClient
);
const
jobDetail
:
RemoteMachineTrialJobDetail
=
new
RemoteMachineTrialJobDetail
(
const
jobDetail
:
RemoteMachineTrialJobDetail
=
new
RemoteMachineTrialJobDetail
(
...
@@ -680,8 +700,9 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -680,8 +700,9 @@ class RemoteMachineTrainingService implements TrainingService {
if
(
killResult
!==
0
)
{
if
(
killResult
!==
0
)
{
const
trailReturnCode
:
string
=
await
SSHClientUtility
.
getRemoteFileContent
(
trialReturnCodeFilePath
,
sshClient
);
const
trailReturnCode
:
string
=
await
SSHClientUtility
.
getRemoteFileContent
(
trialReturnCodeFilePath
,
sshClient
);
this
.
log
.
debug
(
`trailjob
${
trialJob
.
id
}
return code:
${
trailReturnCode
}
`
);
this
.
log
.
debug
(
`trailjob
${
trialJob
.
id
}
return code:
${
trailReturnCode
}
`
);
const
match
:
RegExpMatchArray
|
null
=
trailReturnCode
.
trim
().
match
(
/^
(\d
+
)\s
+
(\d
+
)
$/
);
const
match
:
RegExpMatchArray
|
null
=
trailReturnCode
.
trim
()
if
(
match
)
{
.
match
(
/^
(\d
+
)\s
+
(\d
+
)
$/
);
if
(
match
!==
null
)
{
const
{
1
:
code
,
2
:
timestamp
}
=
match
;
const
{
1
:
code
,
2
:
timestamp
}
=
match
;
// Update trial job's status based on result code
// Update trial job's status based on result code
if
(
parseInt
(
code
,
10
)
===
0
)
{
if
(
parseInt
(
code
,
10
)
===
0
)
{
...
@@ -709,6 +730,7 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -709,6 +730,7 @@ class RemoteMachineTrainingService implements TrainingService {
deferred
.
resolve
(
trialJob
);
deferred
.
resolve
(
trialJob
);
}
}
}
}
return
deferred
.
promise
;
return
deferred
.
promise
;
}
}
...
@@ -720,7 +742,7 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -720,7 +742,7 @@ class RemoteMachineTrainingService implements TrainingService {
return
unixPathJoin
(
this
.
remoteExpRootDir
,
'
hostjobs
'
,
jobId
);
return
unixPathJoin
(
this
.
remoteExpRootDir
,
'
hostjobs
'
,
jobId
);
}
}
private
getRemoteExperimentRootDir
():
string
{
private
getRemoteExperimentRootDir
():
string
{
return
unixPathJoin
(
getRemoteTmpDir
(
this
.
remoteOS
),
'
nni
'
,
'
experiments
'
,
getExperimentId
());
return
unixPathJoin
(
getRemoteTmpDir
(
this
.
remoteOS
),
'
nni
'
,
'
experiments
'
,
getExperimentId
());
}
}
...
...
src/nni_manager/training_service/remote_machine/sshClientUtility.ts
View file @
ba8dccd6
...
@@ -21,16 +21,16 @@
...
@@ -21,16 +21,16 @@
import
*
as
assert
from
'
assert
'
;
import
*
as
assert
from
'
assert
'
;
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
path
from
'
path
'
;
import
*
as
os
from
'
os
'
;
import
*
as
os
from
'
os
'
;
import
*
as
path
from
'
path
'
;
import
{
Client
,
ClientChannel
,
SFTPWrapper
}
from
'
ssh2
'
;
import
{
Client
,
ClientChannel
,
SFTPWrapper
}
from
'
ssh2
'
;
import
*
as
stream
from
'
stream
'
;
import
*
as
stream
from
'
stream
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
NNIError
,
NNIErrorNames
}
from
'
../../common/errors
'
;
import
{
NNIError
,
NNIErrorNames
}
from
'
../../common/errors
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
uniqueString
,
getRemoteTmpDir
,
unixPathJoin
}
from
'
../../common/utils
'
;
import
{
getRemoteTmpDir
,
uniqueString
,
unixPathJoin
}
from
'
../../common/utils
'
;
import
{
RemoteCommandResult
}
from
'
./remoteMachineData
'
;
import
{
execRemove
,
tarAdd
}
from
'
../common/util
'
;
import
{
execRemove
,
tarAdd
}
from
'
../common/util
'
;
import
{
RemoteCommandResult
}
from
'
./remoteMachineData
'
;
/**
/**
*
*
...
@@ -44,7 +44,8 @@ export namespace SSHClientUtility {
...
@@ -44,7 +44,8 @@ export namespace SSHClientUtility {
* @param remoteDirectory remote directory
* @param remoteDirectory remote directory
* @param sshClient SSH client
* @param sshClient SSH client
*/
*/
export
async
function
copyDirectoryToRemote
(
localDirectory
:
string
,
remoteDirectory
:
string
,
sshClient
:
Client
,
remoteOS
:
string
)
:
Promise
<
void
>
{
export
async
function
copyDirectoryToRemote
(
localDirectory
:
string
,
remoteDirectory
:
string
,
sshClient
:
Client
,
remoteOS
:
string
)
:
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
tmpTarName
:
string
=
`
${
uniqueString
(
10
)}
.tar.gz`
;
const
tmpTarName
:
string
=
`
${
uniqueString
(
10
)}
.tar.gz`
;
const
localTarPath
:
string
=
path
.
join
(
os
.
tmpdir
(),
tmpTarName
);
const
localTarPath
:
string
=
path
.
join
(
os
.
tmpdir
(),
tmpTarName
);
...
@@ -75,7 +76,7 @@ export namespace SSHClientUtility {
...
@@ -75,7 +76,7 @@ export namespace SSHClientUtility {
assert
(
sshClient
!==
undefined
);
assert
(
sshClient
!==
undefined
);
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
sshClient
.
sftp
((
err
:
Error
,
sftp
:
SFTPWrapper
)
=>
{
sshClient
.
sftp
((
err
:
Error
,
sftp
:
SFTPWrapper
)
=>
{
if
(
err
)
{
if
(
err
!==
undefined
&&
err
!==
null
)
{
log
.
error
(
`copyFileToRemote:
${
err
.
message
}
,
${
localFilePath
}
,
${
remoteFilePath
}
`
);
log
.
error
(
`copyFileToRemote:
${
err
.
message
}
,
${
localFilePath
}
,
${
remoteFilePath
}
`
);
deferred
.
reject
(
err
);
deferred
.
reject
(
err
);
...
@@ -84,7 +85,7 @@ export namespace SSHClientUtility {
...
@@ -84,7 +85,7 @@ export namespace SSHClientUtility {
assert
(
sftp
!==
undefined
);
assert
(
sftp
!==
undefined
);
sftp
.
fastPut
(
localFilePath
,
remoteFilePath
,
(
fastPutErr
:
Error
)
=>
{
sftp
.
fastPut
(
localFilePath
,
remoteFilePath
,
(
fastPutErr
:
Error
)
=>
{
sftp
.
end
();
sftp
.
end
();
if
(
fastPutErr
)
{
if
(
fastPutErr
!==
undefined
&&
fastPutErr
!==
null
)
{
deferred
.
reject
(
fastPutErr
);
deferred
.
reject
(
fastPutErr
);
}
else
{
}
else
{
deferred
.
resolve
(
true
);
deferred
.
resolve
(
true
);
...
@@ -100,6 +101,7 @@ export namespace SSHClientUtility {
...
@@ -100,6 +101,7 @@ export namespace SSHClientUtility {
* @param command the command to execute remotely
* @param command the command to execute remotely
* @param client SSH Client
* @param client SSH Client
*/
*/
// tslint:disable:no-unsafe-any no-any
export
function
remoteExeCommand
(
command
:
string
,
client
:
Client
):
Promise
<
RemoteCommandResult
>
{
export
function
remoteExeCommand
(
command
:
string
,
client
:
Client
):
Promise
<
RemoteCommandResult
>
{
const
log
:
Logger
=
getLogger
();
const
log
:
Logger
=
getLogger
();
log
.
debug
(
`remoteExeCommand: command: [
${
command
}
]`
);
log
.
debug
(
`remoteExeCommand: command: [
${
command
}
]`
);
...
@@ -109,7 +111,7 @@ export namespace SSHClientUtility {
...
@@ -109,7 +111,7 @@ export namespace SSHClientUtility {
let
exitCode
:
number
;
let
exitCode
:
number
;
client
.
exec
(
command
,
(
err
:
Error
,
channel
:
ClientChannel
)
=>
{
client
.
exec
(
command
,
(
err
:
Error
,
channel
:
ClientChannel
)
=>
{
if
(
err
)
{
if
(
err
!==
undefined
&&
err
!==
null
)
{
log
.
error
(
`remoteExeCommand:
${
err
.
message
}
`
);
log
.
error
(
`remoteExeCommand:
${
err
.
message
}
`
);
deferred
.
reject
(
err
);
deferred
.
reject
(
err
);
...
@@ -117,13 +119,14 @@ export namespace SSHClientUtility {
...
@@ -117,13 +119,14 @@ export namespace SSHClientUtility {
}
}
channel
.
on
(
'
data
'
,
(
data
:
any
,
dataStderr
:
any
)
=>
{
channel
.
on
(
'
data
'
,
(
data
:
any
,
dataStderr
:
any
)
=>
{
if
(
dataStderr
)
{
if
(
dataStderr
!==
undefined
&&
dataStderr
!==
null
)
{
stderr
+=
data
.
toString
();
stderr
+=
data
.
toString
();
}
else
{
}
else
{
stdout
+=
data
.
toString
();
stdout
+=
data
.
toString
();
}
}
}).
on
(
'
exit
'
,
(
code
,
signal
)
=>
{
})
exitCode
=
code
as
number
;
.
on
(
'
exit
'
,
(
code
:
any
,
signal
:
any
)
=>
{
exitCode
=
<
number
>
code
;
deferred
.
resolve
({
deferred
.
resolve
({
stdout
:
stdout
,
stdout
:
stdout
,
stderr
:
stderr
,
stderr
:
stderr
,
...
@@ -138,8 +141,9 @@ export namespace SSHClientUtility {
...
@@ -138,8 +141,9 @@ export namespace SSHClientUtility {
export
function
getRemoteFileContent
(
filePath
:
string
,
sshClient
:
Client
):
Promise
<
string
>
{
export
function
getRemoteFileContent
(
filePath
:
string
,
sshClient
:
Client
):
Promise
<
string
>
{
const
deferred
:
Deferred
<
string
>
=
new
Deferred
<
string
>
();
const
deferred
:
Deferred
<
string
>
=
new
Deferred
<
string
>
();
sshClient
.
sftp
((
err
:
Error
,
sftp
:
SFTPWrapper
)
=>
{
sshClient
.
sftp
((
err
:
Error
,
sftp
:
SFTPWrapper
)
=>
{
if
(
err
)
{
if
(
err
!==
undefined
&&
err
!==
null
)
{
getLogger
().
error
(
`getRemoteFileContent:
${
err
.
message
}
`
);
getLogger
()
.
error
(
`getRemoteFileContent:
${
err
.
message
}
`
);
deferred
.
reject
(
new
Error
(
`SFTP error:
${
err
.
message
}
`
));
deferred
.
reject
(
new
Error
(
`SFTP error:
${
err
.
message
}
`
));
return
;
return
;
...
@@ -150,16 +154,19 @@ export namespace SSHClientUtility {
...
@@ -150,16 +154,19 @@ export namespace SSHClientUtility {
let
dataBuffer
:
string
=
''
;
let
dataBuffer
:
string
=
''
;
sftpStream
.
on
(
'
data
'
,
(
data
:
Buffer
|
string
)
=>
{
sftpStream
.
on
(
'
data
'
,
(
data
:
Buffer
|
string
)
=>
{
dataBuffer
+=
data
;
dataBuffer
+=
data
;
}).
on
(
'
error
'
,
(
streamErr
:
Error
)
=>
{
})
.
on
(
'
error
'
,
(
streamErr
:
Error
)
=>
{
sftp
.
end
();
sftp
.
end
();
deferred
.
reject
(
new
NNIError
(
NNIErrorNames
.
NOT_FOUND
,
streamErr
.
message
));
deferred
.
reject
(
new
NNIError
(
NNIErrorNames
.
NOT_FOUND
,
streamErr
.
message
));
}).
on
(
'
end
'
,
()
=>
{
})
.
on
(
'
end
'
,
()
=>
{
// sftp connection need to be released manually once operation is done
// sftp connection need to be released manually once operation is done
sftp
.
end
();
sftp
.
end
();
deferred
.
resolve
(
dataBuffer
);
deferred
.
resolve
(
dataBuffer
);
});
});
}
catch
(
error
)
{
}
catch
(
error
)
{
getLogger
().
error
(
`getRemoteFileContent:
${
error
.
message
}
`
);
getLogger
()
.
error
(
`getRemoteFileContent:
${
error
.
message
}
`
);
sftp
.
end
();
sftp
.
end
();
deferred
.
reject
(
new
Error
(
`SFTP error:
${
error
.
message
}
`
));
deferred
.
reject
(
new
Error
(
`SFTP error:
${
error
.
message
}
`
));
}
}
...
@@ -167,4 +174,5 @@ export namespace SSHClientUtility {
...
@@ -167,4 +174,5 @@ export namespace SSHClientUtility {
return
deferred
.
promise
;
return
deferred
.
promise
;
}
}
// tslint:enable:no-unsafe-any no-any
}
}
src/nni_manager/training_service/test/hdfsClientUtility.test.ts
View file @
ba8dccd6
...
@@ -37,7 +37,7 @@ describe('WebHDFS', function () {
...
@@ -37,7 +37,7 @@ describe('WebHDFS', function () {
{
{
"user": "user1",
"user": "user1",
"port": 50070,
"port": 50070,
"host": "10.0.0.0"
"host": "10.0.0.0"
}
}
*/
*/
let
skip
:
boolean
=
false
;
let
skip
:
boolean
=
false
;
...
@@ -45,7 +45,7 @@ describe('WebHDFS', function () {
...
@@ -45,7 +45,7 @@ describe('WebHDFS', function () {
let
hdfsClient
:
any
;
let
hdfsClient
:
any
;
try
{
try
{
testHDFSInfo
=
JSON
.
parse
(
fs
.
readFileSync
(
'
../../.vscode/hdfsInfo.json
'
,
'
utf8
'
));
testHDFSInfo
=
JSON
.
parse
(
fs
.
readFileSync
(
'
../../.vscode/hdfsInfo.json
'
,
'
utf8
'
));
console
.
log
(
testHDFSInfo
);
console
.
log
(
testHDFSInfo
);
hdfsClient
=
WebHDFS
.
createClient
({
hdfsClient
=
WebHDFS
.
createClient
({
user
:
testHDFSInfo
.
user
,
user
:
testHDFSInfo
.
user
,
port
:
testHDFSInfo
.
port
,
port
:
testHDFSInfo
.
port
,
...
@@ -120,7 +120,7 @@ describe('WebHDFS', function () {
...
@@ -120,7 +120,7 @@ describe('WebHDFS', function () {
chai
.
expect
(
actualFileData
).
to
.
be
.
equals
(
testFileData
);
chai
.
expect
(
actualFileData
).
to
.
be
.
equals
(
testFileData
);
const
testHDFSDirPath
:
string
=
path
.
join
(
'
/nni_unittest_
'
+
uniqueString
(
6
)
+
'
_dir
'
);
const
testHDFSDirPath
:
string
=
path
.
join
(
'
/nni_unittest_
'
+
uniqueString
(
6
)
+
'
_dir
'
);
await
HDFSClientUtility
.
copyDirectoryToHdfs
(
tmpLocalDirectoryPath
,
testHDFSDirPath
,
hdfsClient
);
await
HDFSClientUtility
.
copyDirectoryToHdfs
(
tmpLocalDirectoryPath
,
testHDFSDirPath
,
hdfsClient
);
const
files
:
any
[]
=
await
HDFSClientUtility
.
readdir
(
testHDFSDirPath
,
hdfsClient
);
const
files
:
any
[]
=
await
HDFSClientUtility
.
readdir
(
testHDFSDirPath
,
hdfsClient
);
...
@@ -133,7 +133,7 @@ describe('WebHDFS', function () {
...
@@ -133,7 +133,7 @@ describe('WebHDFS', function () {
// Cleanup
// Cleanup
rmdir
(
tmpLocalDirectoryPath
);
rmdir
(
tmpLocalDirectoryPath
);
let
deleteRestult
:
boolean
=
await
HDFSClientUtility
.
deletePath
(
testHDFSFilePath
,
hdfsClient
);
let
deleteRestult
:
boolean
=
await
HDFSClientUtility
.
deletePath
(
testHDFSFilePath
,
hdfsClient
);
chai
.
expect
(
deleteRestult
).
to
.
be
.
equals
(
true
);
chai
.
expect
(
deleteRestult
).
to
.
be
.
equals
(
true
);
...
...
src/nni_manager/training_service/test/kubeflowTrainingService.test.ts
View file @
ba8dccd6
...
@@ -63,7 +63,7 @@ describe('Unit Test for KubeflowTrainingService', () => {
...
@@ -63,7 +63,7 @@ describe('Unit Test for KubeflowTrainingService', () => {
if
(
skip
)
{
if
(
skip
)
{
return
;
return
;
}
}
kubeflowTrainingService
=
component
.
get
(
KubeflowTrainingService
);
kubeflowTrainingService
=
component
.
get
(
KubeflowTrainingService
);
});
});
afterEach
(()
=>
{
afterEach
(()
=>
{
...
@@ -78,6 +78,6 @@ describe('Unit Test for KubeflowTrainingService', () => {
...
@@ -78,6 +78,6 @@ describe('Unit Test for KubeflowTrainingService', () => {
return
;
return
;
}
}
await
kubeflowTrainingService
.
setClusterMetadata
(
TrialConfigMetadataKey
.
KUBEFLOW_CLUSTER_CONFIG
,
testKubeflowConfig
),
await
kubeflowTrainingService
.
setClusterMetadata
(
TrialConfigMetadataKey
.
KUBEFLOW_CLUSTER_CONFIG
,
testKubeflowConfig
),
await
kubeflowTrainingService
.
setClusterMetadata
(
TrialConfigMetadataKey
.
TRIAL_CONFIG
,
testKubeflowTrialConfig
);
await
kubeflowTrainingService
.
setClusterMetadata
(
TrialConfigMetadataKey
.
TRIAL_CONFIG
,
testKubeflowTrialConfig
);
});
});
});
});
\ No newline at end of file
src/nni_manager/training_service/test/localTrainingService.test.ts
View file @
ba8dccd6
...
@@ -63,7 +63,7 @@ describe('Unit Test for LocalTrainingService', () => {
...
@@ -63,7 +63,7 @@ describe('Unit Test for LocalTrainingService', () => {
//trial jobs should be empty, since there are no submitted jobs
//trial jobs should be empty, since there are no submitted jobs
chai
.
expect
(
await
localTrainingService
.
listTrialJobs
()).
to
.
be
.
empty
;
chai
.
expect
(
await
localTrainingService
.
listTrialJobs
()).
to
.
be
.
empty
;
});
});
it
(
'
setClusterMetadata and getClusterMetadata
'
,
async
()
=>
{
it
(
'
setClusterMetadata and getClusterMetadata
'
,
async
()
=>
{
await
localTrainingService
.
setClusterMetadata
(
TrialConfigMetadataKey
.
TRIAL_CONFIG
,
trialConfig
);
await
localTrainingService
.
setClusterMetadata
(
TrialConfigMetadataKey
.
TRIAL_CONFIG
,
trialConfig
);
localTrainingService
.
getClusterMetadata
(
TrialConfigMetadataKey
.
TRIAL_CONFIG
).
then
((
data
)
=>
{
localTrainingService
.
getClusterMetadata
(
TrialConfigMetadataKey
.
TRIAL_CONFIG
).
then
((
data
)
=>
{
...
@@ -87,7 +87,7 @@ describe('Unit Test for LocalTrainingService', () => {
...
@@ -87,7 +87,7 @@ describe('Unit Test for LocalTrainingService', () => {
await
localTrainingService
.
cancelTrialJob
(
jobDetail
.
id
);
await
localTrainingService
.
cancelTrialJob
(
jobDetail
.
id
);
chai
.
expect
(
jobDetail
.
status
).
to
.
be
.
equals
(
'
USER_CANCELED
'
);
chai
.
expect
(
jobDetail
.
status
).
to
.
be
.
equals
(
'
USER_CANCELED
'
);
}).
timeout
(
20000
);
}).
timeout
(
20000
);
it
(
'
Read metrics, Add listener, and remove listener
'
,
async
()
=>
{
it
(
'
Read metrics, Add listener, and remove listener
'
,
async
()
=>
{
// set meta data
// set meta data
const
trialConfig
:
string
=
`{\"command\":\"python3 mockedTrial.py\", \"codeDir\":\"
${
localCodeDir
}
\",\"gpuNum\":0}`
const
trialConfig
:
string
=
`{\"command\":\"python3 mockedTrial.py\", \"codeDir\":\"
${
localCodeDir
}
\",\"gpuNum\":0}`
...
...
src/nni_manager/training_service/test/paiTrainingService.test.ts
View file @
ba8dccd6
...
@@ -89,7 +89,7 @@ describe('Unit Test for PAITrainingService', () => {
...
@@ -89,7 +89,7 @@ describe('Unit Test for PAITrainingService', () => {
chai
.
expect
(
trialDetail
.
status
).
to
.
be
.
equals
(
'
WAITING
'
);
chai
.
expect
(
trialDetail
.
status
).
to
.
be
.
equals
(
'
WAITING
'
);
}
catch
(
error
)
{
}
catch
(
error
)
{
console
.
log
(
'
Submit job failed:
'
+
error
);
console
.
log
(
'
Submit job failed:
'
+
error
);
chai
.
assert
(
error
)
chai
.
assert
(
error
)
}
}
});
});
});
});
\ No newline at end of file
src/nni_manager/tslint.json
View file @
ba8dccd6
...
@@ -9,7 +9,10 @@
...
@@ -9,7 +9,10 @@
"no-increment-decrement"
:
false
,
"no-increment-decrement"
:
false
,
"promise-function-async"
:
false
,
"promise-function-async"
:
false
,
"no-console"
:
[
true
,
"log"
],
"no-console"
:
[
true
,
"log"
],
"no-multiline-string"
:
false
"no-multiline-string"
:
false
,
"no-suspicious-comment"
:
false
,
"no-backbone-get-set-outside-model"
:
false
,
"max-classes-per-file"
:
false
},
},
"rulesDirectory"
:
[],
"rulesDirectory"
:
[],
"linterOptions"
:
{
"linterOptions"
:
{
...
...
Prev
1
…
3
4
5
6
7
8
9
10
11
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment