Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
125ec21f
Unverified
Commit
125ec21f
authored
Aug 15, 2022
by
Shudong Yang
Committed by
GitHub
Aug 15, 2022
Browse files
Fix reusable k8s training service bug (#5045)
parent
9e8a0bf0
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
30 additions
and
12 deletions
+30
-12
ts/nni_manager/training_service/reusable/environments/kubernetes/frameworkcontrollerEnvironmentService.ts
...ments/kubernetes/frameworkcontrollerEnvironmentService.ts
+10
-3
ts/nni_manager/training_service/reusable/environments/kubernetes/kubeflowEnvironmentService.ts
...ble/environments/kubernetes/kubeflowEnvironmentService.ts
+15
-6
ts/nni_manager/training_service/reusable/environments/kubernetes/kubernetesEnvironmentService.ts
...e/environments/kubernetes/kubernetesEnvironmentService.ts
+5
-3
No files found.
ts/nni_manager/training_service/reusable/environments/kubernetes/frameworkcontrollerEnvironmentService.ts
View file @
125ec21f
...
@@ -3,6 +3,7 @@
...
@@ -3,6 +3,7 @@
'
use strict
'
;
'
use strict
'
;
import
cpp
from
'
child-process-promise
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
path
from
'
path
'
;
import
*
as
path
from
'
path
'
;
import
*
as
component
from
'
../../../../common/component
'
;
import
*
as
component
from
'
../../../../common/component
'
;
...
@@ -81,7 +82,7 @@ export class FrameworkControllerEnvironmentService extends KubernetesEnvironment
...
@@ -81,7 +82,7 @@ export class FrameworkControllerEnvironmentService extends KubernetesEnvironment
const
frameworkcontrollerJobName
:
string
=
`nniexp
${
this
.
experimentId
}
env
${
environment
.
id
}
`
.
toLowerCase
();
const
frameworkcontrollerJobName
:
string
=
`nniexp
${
this
.
experimentId
}
env
${
environment
.
id
}
`
.
toLowerCase
();
const
command
=
this
.
generateCommandScript
(
this
.
config
.
taskRoles
,
environment
.
command
);
const
command
=
this
.
generateCommandScript
(
this
.
config
.
taskRoles
,
environment
.
command
);
await
fs
.
promises
.
writeFile
(
path
.
join
(
this
.
environmentLocalTempFolder
,
"
run.sh
"
),
command
,
{
encoding
:
'
utf8
'
});
await
fs
.
promises
.
writeFile
(
path
.
join
(
this
.
environmentLocalTempFolder
,
`
${
environment
.
id
}
_
run.sh
`
),
command
,
{
encoding
:
'
utf8
'
});
// upload script files to storage
// upload script files to storage
const
trialJobOutputUrl
:
string
=
await
this
.
uploadFolder
(
this
.
environmentLocalTempFolder
,
`nni/
${
this
.
experimentId
}
`
);
const
trialJobOutputUrl
:
string
=
await
this
.
uploadFolder
(
this
.
environmentLocalTempFolder
,
`nni/
${
this
.
experimentId
}
`
);
...
@@ -106,7 +107,13 @@ export class FrameworkControllerEnvironmentService extends KubernetesEnvironment
...
@@ -106,7 +107,13 @@ export class FrameworkControllerEnvironmentService extends KubernetesEnvironment
}
}
return
await
this
.
uploadFolderToAzureStorage
(
srcDirectory
,
destDirectory
,
2
);
return
await
this
.
uploadFolderToAzureStorage
(
srcDirectory
,
destDirectory
,
2
);
}
else
{
}
else
{
// do not need to upload files to nfs server, temp folder already mounted to nfs
try
{
// copy envs and run.sh from environments-temp to nfs-root(mounted)
await
cpp
.
exec
(
`mkdir -p
${
this
.
nfsRootDir
}
/
${
destDirectory
}
`
);
await
cpp
.
exec
(
`cp -r
${
srcDirectory
}
/*
${
this
.
nfsRootDir
}
/
${
destDirectory
}
`
);
}
catch
(
uploadError
)
{
return
Promise
.
reject
(
uploadError
);
}
return
`nfs://
${
this
.
config
.
storage
.
server
}
:
${
destDirectory
}
`
;
return
`nfs://
${
this
.
config
.
storage
.
server
}
:
${
destDirectory
}
`
;
}
}
}
}
...
@@ -174,7 +181,7 @@ export class FrameworkControllerEnvironmentService extends KubernetesEnvironment
...
@@ -174,7 +181,7 @@ export class FrameworkControllerEnvironmentService extends KubernetesEnvironment
const
taskRole
:
any
=
this
.
generateTaskRoleConfig
(
const
taskRole
:
any
=
this
.
generateTaskRoleConfig
(
trialWorkingFolder
,
trialWorkingFolder
,
this
.
config
.
taskRoles
[
index
].
dockerImage
,
this
.
config
.
taskRoles
[
index
].
dockerImage
,
`run.sh`
,
`
${
envId
}
_
run.sh`
,
podResources
[
index
],
podResources
[
index
],
containerPort
,
containerPort
,
await
this
.
createRegistrySecret
(
this
.
config
.
taskRoles
[
index
].
privateRegistryAuthPath
)
await
this
.
createRegistrySecret
(
this
.
config
.
taskRoles
[
index
].
privateRegistryAuthPath
)
...
...
ts/nni_manager/training_service/reusable/environments/kubernetes/kubeflowEnvironmentService.ts
View file @
125ec21f
// Copyright (c) Microsoft Corporation.
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
// Licensed under the MIT license.
import
cpp
from
'
child-process-promise
'
;
import
fs
from
'
fs
'
;
import
fs
from
'
fs
'
;
import
path
from
'
path
'
;
import
path
from
'
path
'
;
import
*
as
component
from
'
common/component
'
;
import
*
as
component
from
'
common/component
'
;
...
@@ -85,7 +86,7 @@ export class KubeflowEnvironmentService extends KubernetesEnvironmentService {
...
@@ -85,7 +86,7 @@ export class KubeflowEnvironmentService extends KubernetesEnvironmentService {
const
kubeflowJobName
:
string
=
`nniexp
${
this
.
experimentId
}
env
${
environment
.
id
}
`
.
toLowerCase
();
const
kubeflowJobName
:
string
=
`nniexp
${
this
.
experimentId
}
env
${
environment
.
id
}
`
.
toLowerCase
();
await
fs
.
promises
.
writeFile
(
path
.
join
(
this
.
environmentLocalTempFolder
,
"
run.sh
"
),
environment
.
command
,
{
encoding
:
'
utf8
'
});
await
fs
.
promises
.
writeFile
(
path
.
join
(
this
.
environmentLocalTempFolder
,
`
${
environment
.
id
}
_
run.sh
`
),
environment
.
command
,
{
encoding
:
'
utf8
'
});
// upload script files to storage
// upload script files to storage
const
trialJobOutputUrl
:
string
=
await
this
.
uploadFolder
(
this
.
environmentLocalTempFolder
,
`nni/
${
this
.
experimentId
}
`
);
const
trialJobOutputUrl
:
string
=
await
this
.
uploadFolder
(
this
.
environmentLocalTempFolder
,
`nni/
${
this
.
experimentId
}
`
);
...
@@ -106,7 +107,13 @@ export class KubeflowEnvironmentService extends KubernetesEnvironmentService {
...
@@ -106,7 +107,13 @@ export class KubeflowEnvironmentService extends KubernetesEnvironmentService {
}
}
return
await
this
.
uploadFolderToAzureStorage
(
srcDirectory
,
destDirectory
,
2
);
return
await
this
.
uploadFolderToAzureStorage
(
srcDirectory
,
destDirectory
,
2
);
}
else
{
}
else
{
// do not need to upload files to nfs server, temp folder already mounted to nfs
try
{
// copy envs and run.sh from environments-temp to nfs-root(mounted)
await
cpp
.
exec
(
`mkdir -p
${
this
.
nfsRootDir
}
/
${
destDirectory
}
`
);
await
cpp
.
exec
(
`cp -r
${
srcDirectory
}
/*
${
this
.
nfsRootDir
}
/
${
destDirectory
}
`
);
}
catch
(
uploadError
)
{
return
Promise
.
reject
(
uploadError
);
}
return
`nfs://
${
this
.
config
.
storage
.
server
}
:
${
destDirectory
}
`
;
return
`nfs://
${
this
.
config
.
storage
.
server
}
:
${
destDirectory
}
`
;
}
}
}
}
...
@@ -159,24 +166,26 @@ export class KubeflowEnvironmentService extends KubernetesEnvironmentService {
...
@@ -159,24 +166,26 @@ export class KubeflowEnvironmentService extends KubernetesEnvironmentService {
if
(
this
.
config
.
worker
)
{
if
(
this
.
config
.
worker
)
{
const
privateRegistrySecretName
=
await
this
.
createRegistrySecret
(
this
.
config
.
worker
.
privateRegistryAuthPath
);
const
privateRegistrySecretName
=
await
this
.
createRegistrySecret
(
this
.
config
.
worker
.
privateRegistryAuthPath
);
replicaSpecsObj
.
Worker
=
this
.
generateReplicaConfig
(
this
.
config
.
worker
.
replicas
,
replicaSpecsObj
.
Worker
=
this
.
generateReplicaConfig
(
this
.
config
.
worker
.
replicas
,
this
.
config
.
worker
.
dockerImage
,
'
run.sh
'
,
workerPodResources
,
privateRegistrySecretName
);
this
.
config
.
worker
.
dockerImage
,
`
${
envId
}
_run.sh`
,
workerPodResources
,
privateRegistrySecretName
);
}
}
if
(
this
.
config
.
ps
!==
undefined
)
{
if
(
this
.
config
.
ps
!==
undefined
)
{
const
privateRegistrySecretName
:
string
|
undefined
=
await
this
.
createRegistrySecret
(
this
.
config
.
ps
.
privateRegistryAuthPath
);
const
privateRegistrySecretName
:
string
|
undefined
=
await
this
.
createRegistrySecret
(
this
.
config
.
ps
.
privateRegistryAuthPath
);
replicaSpecsObj
.
Ps
=
this
.
generateReplicaConfig
(
this
.
config
.
ps
.
replicas
,
replicaSpecsObj
.
Ps
=
this
.
generateReplicaConfig
(
this
.
config
.
ps
.
replicas
,
this
.
config
.
ps
.
dockerImage
,
'
run.sh
'
,
nonWorkerPodResources
,
privateRegistrySecretName
);
this
.
config
.
ps
.
dockerImage
,
`
${
envId
}
_run.sh`
,
nonWorkerPodResources
,
privateRegistrySecretName
);
}
}
replicaSpecsObjMap
.
set
(
this
.
kubernetesCRDClient
.
jobKind
,
{
tfReplicaSpecs
:
replicaSpecsObj
});
replicaSpecsObjMap
.
set
(
this
.
kubernetesCRDClient
.
jobKind
,
{
tfReplicaSpecs
:
replicaSpecsObj
});
}
else
if
(
this
.
config
.
operator
===
'
pytorch-operator
'
)
{
}
else
if
(
this
.
config
.
operator
===
'
pytorch-operator
'
)
{
if
(
this
.
config
.
worker
!==
undefined
)
{
if
(
this
.
config
.
worker
!==
undefined
)
{
const
privateRegistrySecretName
:
string
|
undefined
=
await
this
.
createRegistrySecret
(
this
.
config
.
worker
.
privateRegistryAuthPath
);
const
privateRegistrySecretName
:
string
|
undefined
=
await
this
.
createRegistrySecret
(
this
.
config
.
worker
.
privateRegistryAuthPath
);
replicaSpecsObj
.
Worker
=
this
.
generateReplicaConfig
(
this
.
config
.
worker
.
replicas
,
replicaSpecsObj
.
Worker
=
this
.
generateReplicaConfig
(
this
.
config
.
worker
.
replicas
,
this
.
config
.
worker
.
dockerImage
,
'
run.sh
'
,
workerPodResources
,
privateRegistrySecretName
);
this
.
config
.
worker
.
dockerImage
,
`
${
envId
}
_
run.sh
`
,
workerPodResources
,
privateRegistrySecretName
);
}
}
if
(
this
.
config
.
master
!==
undefined
)
{
if
(
this
.
config
.
master
!==
undefined
)
{
const
privateRegistrySecretName
:
string
|
undefined
=
await
this
.
createRegistrySecret
(
this
.
config
.
master
.
privateRegistryAuthPath
);
const
privateRegistrySecretName
:
string
|
undefined
=
await
this
.
createRegistrySecret
(
this
.
config
.
master
.
privateRegistryAuthPath
);
replicaSpecsObj
.
Master
=
this
.
generateReplicaConfig
(
this
.
config
.
master
.
replicas
,
replicaSpecsObj
.
Master
=
this
.
generateReplicaConfig
(
this
.
config
.
master
.
replicas
,
this
.
config
.
master
.
dockerImage
,
'
run.sh
'
,
nonWorkerPodResources
,
privateRegistrySecretName
);
this
.
config
.
master
.
dockerImage
,
`
${
envId
}
_
run.sh
`
,
nonWorkerPodResources
,
privateRegistrySecretName
);
}
}
replicaSpecsObjMap
.
set
(
this
.
kubernetesCRDClient
.
jobKind
,
{
pytorchReplicaSpecs
:
replicaSpecsObj
});
replicaSpecsObjMap
.
set
(
this
.
kubernetesCRDClient
.
jobKind
,
{
pytorchReplicaSpecs
:
replicaSpecsObj
});
...
...
ts/nni_manager/training_service/reusable/environments/kubernetes/kubernetesEnvironmentService.ts
View file @
125ec21f
...
@@ -32,6 +32,7 @@ export class KubernetesEnvironmentService extends EnvironmentService {
...
@@ -32,6 +32,7 @@ export class KubernetesEnvironmentService extends EnvironmentService {
protected
CONTAINER_MOUNT_PATH
:
string
;
protected
CONTAINER_MOUNT_PATH
:
string
;
protected
log
:
Logger
=
getLogger
(
'
KubernetesEnvironmentService
'
);
protected
log
:
Logger
=
getLogger
(
'
KubernetesEnvironmentService
'
);
protected
environmentWorkingFolder
:
string
;
protected
environmentWorkingFolder
:
string
;
protected
nfsRootDir
:
string
;
constructor
(
_config
:
any
,
info
:
ExperimentStartupInfo
)
{
constructor
(
_config
:
any
,
info
:
ExperimentStartupInfo
)
{
super
();
super
();
...
@@ -39,6 +40,7 @@ export class KubernetesEnvironmentService extends EnvironmentService {
...
@@ -39,6 +40,7 @@ export class KubernetesEnvironmentService extends EnvironmentService {
this
.
genericK8sClient
=
new
GeneralK8sClient
();
this
.
genericK8sClient
=
new
GeneralK8sClient
();
this
.
experimentRootDir
=
info
.
logDir
;
this
.
experimentRootDir
=
info
.
logDir
;
this
.
environmentLocalTempFolder
=
path
.
join
(
this
.
experimentRootDir
,
"
environment-temp
"
);
this
.
environmentLocalTempFolder
=
path
.
join
(
this
.
experimentRootDir
,
"
environment-temp
"
);
this
.
nfsRootDir
=
path
.
join
(
this
.
experimentRootDir
,
"
nfs-root
"
);
this
.
experimentId
=
info
.
experimentId
;
this
.
experimentId
=
info
.
experimentId
;
this
.
environmentWorkingFolder
=
path
.
join
(
this
.
CONTAINER_MOUNT_PATH
,
'
nni
'
,
this
.
experimentId
);
this
.
environmentWorkingFolder
=
path
.
join
(
this
.
CONTAINER_MOUNT_PATH
,
'
nni
'
,
this
.
experimentId
);
}
}
...
@@ -147,11 +149,11 @@ export class KubernetesEnvironmentService extends EnvironmentService {
...
@@ -147,11 +149,11 @@ export class KubernetesEnvironmentService extends EnvironmentService {
}
}
protected
async
createNFSStorage
(
nfsServer
:
string
,
nfsPath
:
string
):
Promise
<
void
>
{
protected
async
createNFSStorage
(
nfsServer
:
string
,
nfsPath
:
string
):
Promise
<
void
>
{
await
cpp
.
exec
(
`mkdir -p
${
this
.
environmentLocalTempFolder
}
`
);
await
cpp
.
exec
(
`mkdir -p
${
this
.
nfsRootDir
}
`
);
try
{
try
{
await
cpp
.
exec
(
`sudo mount
${
nfsServer
}
:
${
nfsPath
}
${
this
.
environmentLocalTempFolder
}
`
);
await
cpp
.
exec
(
`sudo mount
${
nfsServer
}
:
${
nfsPath
}
${
this
.
nfsRootDir
}
`
);
}
catch
(
error
)
{
}
catch
(
error
)
{
const
mountError
:
string
=
`Mount NFS
${
nfsServer
}
:
${
nfsPath
}
to
${
this
.
environmentLocalTempFolder
}
failed, error is
${
error
}
`
;
const
mountError
:
string
=
`Mount NFS
${
nfsServer
}
:
${
nfsPath
}
to
${
this
.
nfsRootDir
}
failed, error is
${
error
}
`
;
this
.
log
.
error
(
mountError
);
this
.
log
.
error
(
mountError
);
return
Promise
.
reject
(
mountError
);
return
Promise
.
reject
(
mountError
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment