Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
125ec21f
"tilelang/git@developer.sourcefind.cn:OpenDAS/tilelang.git" did not exist on "0921328dc5557478c3d20481916e648a415d4258"
Unverified
Commit
125ec21f
authored
Aug 15, 2022
by
Shudong Yang
Committed by
GitHub
Aug 15, 2022
Browse files
Fix reusable k8s training service bug (#5045)
parent
9e8a0bf0
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
30 additions
and
12 deletions
+30
-12
ts/nni_manager/training_service/reusable/environments/kubernetes/frameworkcontrollerEnvironmentService.ts
...ments/kubernetes/frameworkcontrollerEnvironmentService.ts
+10
-3
ts/nni_manager/training_service/reusable/environments/kubernetes/kubeflowEnvironmentService.ts
...ble/environments/kubernetes/kubeflowEnvironmentService.ts
+15
-6
ts/nni_manager/training_service/reusable/environments/kubernetes/kubernetesEnvironmentService.ts
...e/environments/kubernetes/kubernetesEnvironmentService.ts
+5
-3
No files found.
ts/nni_manager/training_service/reusable/environments/kubernetes/frameworkcontrollerEnvironmentService.ts
View file @
125ec21f
...
@@ -3,6 +3,7 @@
...
@@ -3,6 +3,7 @@
'
use strict
'
;
'
use strict
'
;
import
cpp
from
'
child-process-promise
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
path
from
'
path
'
;
import
*
as
path
from
'
path
'
;
import
*
as
component
from
'
../../../../common/component
'
;
import
*
as
component
from
'
../../../../common/component
'
;
...
@@ -81,7 +82,7 @@ export class FrameworkControllerEnvironmentService extends KubernetesEnvironment
...
@@ -81,7 +82,7 @@ export class FrameworkControllerEnvironmentService extends KubernetesEnvironment
const
frameworkcontrollerJobName
:
string
=
`nniexp
${
this
.
experimentId
}
env
${
environment
.
id
}
`
.
toLowerCase
();
const
frameworkcontrollerJobName
:
string
=
`nniexp
${
this
.
experimentId
}
env
${
environment
.
id
}
`
.
toLowerCase
();
const
command
=
this
.
generateCommandScript
(
this
.
config
.
taskRoles
,
environment
.
command
);
const
command
=
this
.
generateCommandScript
(
this
.
config
.
taskRoles
,
environment
.
command
);
await
fs
.
promises
.
writeFile
(
path
.
join
(
this
.
environmentLocalTempFolder
,
"
run.sh
"
),
command
,
{
encoding
:
'
utf8
'
});
await
fs
.
promises
.
writeFile
(
path
.
join
(
this
.
environmentLocalTempFolder
,
`
${
environment
.
id
}
_
run.sh
`
),
command
,
{
encoding
:
'
utf8
'
});
//upload script files to sotrage
//upload script files to sotrage
const
trialJobOutputUrl
:
string
=
await
this
.
uploadFolder
(
this
.
environmentLocalTempFolder
,
`nni/
${
this
.
experimentId
}
`
);
const
trialJobOutputUrl
:
string
=
await
this
.
uploadFolder
(
this
.
environmentLocalTempFolder
,
`nni/
${
this
.
experimentId
}
`
);
...
@@ -106,7 +107,13 @@ export class FrameworkControllerEnvironmentService extends KubernetesEnvironment
...
@@ -106,7 +107,13 @@ export class FrameworkControllerEnvironmentService extends KubernetesEnvironment
}
}
return
await
this
.
uploadFolderToAzureStorage
(
srcDirectory
,
destDirectory
,
2
);
return
await
this
.
uploadFolderToAzureStorage
(
srcDirectory
,
destDirectory
,
2
);
}
else
{
}
else
{
// do not need to upload files to nfs server, temp folder already mounted to nfs
try
{
// copy envs and run.sh from environments-temp to nfs-root(mounted)
await
cpp
.
exec
(
`mkdir -p
${
this
.
nfsRootDir
}
/
${
destDirectory
}
`
);
await
cpp
.
exec
(
`cp -r
${
srcDirectory
}
/*
${
this
.
nfsRootDir
}
/
${
destDirectory
}
`
);
}
catch
(
uploadError
)
{
return
Promise
.
reject
(
uploadError
);
}
return
`nfs://
${
this
.
config
.
storage
.
server
}
:
${
destDirectory
}
`
;
return
`nfs://
${
this
.
config
.
storage
.
server
}
:
${
destDirectory
}
`
;
}
}
}
}
...
@@ -174,7 +181,7 @@ export class FrameworkControllerEnvironmentService extends KubernetesEnvironment
...
@@ -174,7 +181,7 @@ export class FrameworkControllerEnvironmentService extends KubernetesEnvironment
const
taskRole
:
any
=
this
.
generateTaskRoleConfig
(
const
taskRole
:
any
=
this
.
generateTaskRoleConfig
(
trialWorkingFolder
,
trialWorkingFolder
,
this
.
config
.
taskRoles
[
index
].
dockerImage
,
this
.
config
.
taskRoles
[
index
].
dockerImage
,
`run.sh`
,
`
${
envId
}
_
run.sh`
,
podResources
[
index
],
podResources
[
index
],
containerPort
,
containerPort
,
await
this
.
createRegistrySecret
(
this
.
config
.
taskRoles
[
index
].
privateRegistryAuthPath
)
await
this
.
createRegistrySecret
(
this
.
config
.
taskRoles
[
index
].
privateRegistryAuthPath
)
...
...
ts/nni_manager/training_service/reusable/environments/kubernetes/kubeflowEnvironmentService.ts
View file @
125ec21f
// Copyright (c) Microsoft Corporation.
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
// Licensed under the MIT license.
import
cpp
from
'
child-process-promise
'
;
import
fs
from
'
fs
'
;
import
fs
from
'
fs
'
;
import
path
from
'
path
'
;
import
path
from
'
path
'
;
import
*
as
component
from
'
common/component
'
;
import
*
as
component
from
'
common/component
'
;
...
@@ -85,7 +86,7 @@ export class KubeflowEnvironmentService extends KubernetesEnvironmentService {
...
@@ -85,7 +86,7 @@ export class KubeflowEnvironmentService extends KubernetesEnvironmentService {
const
kubeflowJobName
:
string
=
`nniexp
${
this
.
experimentId
}
env
${
environment
.
id
}
`
.
toLowerCase
();
const
kubeflowJobName
:
string
=
`nniexp
${
this
.
experimentId
}
env
${
environment
.
id
}
`
.
toLowerCase
();
await
fs
.
promises
.
writeFile
(
path
.
join
(
this
.
environmentLocalTempFolder
,
"
run.sh
"
),
environment
.
command
,
{
encoding
:
'
utf8
'
});
await
fs
.
promises
.
writeFile
(
path
.
join
(
this
.
environmentLocalTempFolder
,
`
${
environment
.
id
}
_
run.sh
`
),
environment
.
command
,
{
encoding
:
'
utf8
'
});
//upload script files to sotrage
//upload script files to sotrage
const
trialJobOutputUrl
:
string
=
await
this
.
uploadFolder
(
this
.
environmentLocalTempFolder
,
`nni/
${
this
.
experimentId
}
`
);
const
trialJobOutputUrl
:
string
=
await
this
.
uploadFolder
(
this
.
environmentLocalTempFolder
,
`nni/
${
this
.
experimentId
}
`
);
...
@@ -106,7 +107,13 @@ export class KubeflowEnvironmentService extends KubernetesEnvironmentService {
...
@@ -106,7 +107,13 @@ export class KubeflowEnvironmentService extends KubernetesEnvironmentService {
}
}
return
await
this
.
uploadFolderToAzureStorage
(
srcDirectory
,
destDirectory
,
2
);
return
await
this
.
uploadFolderToAzureStorage
(
srcDirectory
,
destDirectory
,
2
);
}
else
{
}
else
{
// do not need to upload files to nfs server, temp folder already mounted to nfs
try
{
// copy envs and run.sh from environments-temp to nfs-root(mounted)
await
cpp
.
exec
(
`mkdir -p
${
this
.
nfsRootDir
}
/
${
destDirectory
}
`
);
await
cpp
.
exec
(
`cp -r
${
srcDirectory
}
/*
${
this
.
nfsRootDir
}
/
${
destDirectory
}
`
);
}
catch
(
uploadError
)
{
return
Promise
.
reject
(
uploadError
);
}
return
`nfs://
${
this
.
config
.
storage
.
server
}
:
${
destDirectory
}
`
;
return
`nfs://
${
this
.
config
.
storage
.
server
}
:
${
destDirectory
}
`
;
}
}
}
}
...
@@ -159,24 +166,26 @@ export class KubeflowEnvironmentService extends KubernetesEnvironmentService {
...
@@ -159,24 +166,26 @@ export class KubeflowEnvironmentService extends KubernetesEnvironmentService {
if
(
this
.
config
.
worker
)
{
if
(
this
.
config
.
worker
)
{
const
privateRegistrySecretName
=
await
this
.
createRegistrySecret
(
this
.
config
.
worker
.
privateRegistryAuthPath
);
const
privateRegistrySecretName
=
await
this
.
createRegistrySecret
(
this
.
config
.
worker
.
privateRegistryAuthPath
);
replicaSpecsObj
.
Worker
=
this
.
generateReplicaConfig
(
this
.
config
.
worker
.
replicas
,
replicaSpecsObj
.
Worker
=
this
.
generateReplicaConfig
(
this
.
config
.
worker
.
replicas
,
this
.
config
.
worker
.
dockerImage
,
'
run.sh
'
,
workerPodResources
,
privateRegistrySecretName
);
this
.
config
.
worker
.
dockerImage
,
`
${
envId
}
_run.sh`
,
workerPodResources
,
privateRegistrySecretName
);
}
}
if
(
this
.
config
.
ps
!==
undefined
)
{
if
(
this
.
config
.
ps
!==
undefined
)
{
const
privateRegistrySecretName
:
string
|
undefined
=
await
this
.
createRegistrySecret
(
this
.
config
.
ps
.
privateRegistryAuthPath
);
const
privateRegistrySecretName
:
string
|
undefined
=
await
this
.
createRegistrySecret
(
this
.
config
.
ps
.
privateRegistryAuthPath
);
replicaSpecsObj
.
Ps
=
this
.
generateReplicaConfig
(
this
.
config
.
ps
.
replicas
,
replicaSpecsObj
.
Ps
=
this
.
generateReplicaConfig
(
this
.
config
.
ps
.
replicas
,
this
.
config
.
ps
.
dockerImage
,
'
run.sh
'
,
nonWorkerPodResources
,
privateRegistrySecretName
);
this
.
config
.
ps
.
dockerImage
,
`
${
envId
}
_run.sh`
,
nonWorkerPodResources
,
privateRegistrySecretName
);
}
}
replicaSpecsObjMap
.
set
(
this
.
kubernetesCRDClient
.
jobKind
,
{
tfReplicaSpecs
:
replicaSpecsObj
});
replicaSpecsObjMap
.
set
(
this
.
kubernetesCRDClient
.
jobKind
,
{
tfReplicaSpecs
:
replicaSpecsObj
});
}
else
if
(
this
.
config
.
operator
===
'
pytorch-operator
'
)
{
}
else
if
(
this
.
config
.
operator
===
'
pytorch-operator
'
)
{
if
(
this
.
config
.
worker
!==
undefined
)
{
if
(
this
.
config
.
worker
!==
undefined
)
{
const
privateRegistrySecretName
:
string
|
undefined
=
await
this
.
createRegistrySecret
(
this
.
config
.
worker
.
privateRegistryAuthPath
);
const
privateRegistrySecretName
:
string
|
undefined
=
await
this
.
createRegistrySecret
(
this
.
config
.
worker
.
privateRegistryAuthPath
);
replicaSpecsObj
.
Worker
=
this
.
generateReplicaConfig
(
this
.
config
.
worker
.
replicas
,
replicaSpecsObj
.
Worker
=
this
.
generateReplicaConfig
(
this
.
config
.
worker
.
replicas
,
this
.
config
.
worker
.
dockerImage
,
'
run.sh
'
,
workerPodResources
,
privateRegistrySecretName
);
this
.
config
.
worker
.
dockerImage
,
`
${
envId
}
_
run.sh
`
,
workerPodResources
,
privateRegistrySecretName
);
}
}
if
(
this
.
config
.
master
!==
undefined
)
{
if
(
this
.
config
.
master
!==
undefined
)
{
const
privateRegistrySecretName
:
string
|
undefined
=
await
this
.
createRegistrySecret
(
this
.
config
.
master
.
privateRegistryAuthPath
);
const
privateRegistrySecretName
:
string
|
undefined
=
await
this
.
createRegistrySecret
(
this
.
config
.
master
.
privateRegistryAuthPath
);
replicaSpecsObj
.
Master
=
this
.
generateReplicaConfig
(
this
.
config
.
master
.
replicas
,
replicaSpecsObj
.
Master
=
this
.
generateReplicaConfig
(
this
.
config
.
master
.
replicas
,
this
.
config
.
master
.
dockerImage
,
'
run.sh
'
,
nonWorkerPodResources
,
privateRegistrySecretName
);
this
.
config
.
master
.
dockerImage
,
`
${
envId
}
_
run.sh
`
,
nonWorkerPodResources
,
privateRegistrySecretName
);
}
}
replicaSpecsObjMap
.
set
(
this
.
kubernetesCRDClient
.
jobKind
,
{
pytorchReplicaSpecs
:
replicaSpecsObj
});
replicaSpecsObjMap
.
set
(
this
.
kubernetesCRDClient
.
jobKind
,
{
pytorchReplicaSpecs
:
replicaSpecsObj
});
...
...
ts/nni_manager/training_service/reusable/environments/kubernetes/kubernetesEnvironmentService.ts
View file @
125ec21f
...
@@ -32,6 +32,7 @@ export class KubernetesEnvironmentService extends EnvironmentService {
...
@@ -32,6 +32,7 @@ export class KubernetesEnvironmentService extends EnvironmentService {
protected
CONTAINER_MOUNT_PATH
:
string
;
protected
CONTAINER_MOUNT_PATH
:
string
;
protected
log
:
Logger
=
getLogger
(
'
KubernetesEnvironmentService
'
);
protected
log
:
Logger
=
getLogger
(
'
KubernetesEnvironmentService
'
);
protected
environmentWorkingFolder
:
string
;
protected
environmentWorkingFolder
:
string
;
protected
nfsRootDir
:
string
;
constructor
(
_config
:
any
,
info
:
ExperimentStartupInfo
)
{
constructor
(
_config
:
any
,
info
:
ExperimentStartupInfo
)
{
super
();
super
();
...
@@ -39,6 +40,7 @@ export class KubernetesEnvironmentService extends EnvironmentService {
...
@@ -39,6 +40,7 @@ export class KubernetesEnvironmentService extends EnvironmentService {
this
.
genericK8sClient
=
new
GeneralK8sClient
();
this
.
genericK8sClient
=
new
GeneralK8sClient
();
this
.
experimentRootDir
=
info
.
logDir
;
this
.
experimentRootDir
=
info
.
logDir
;
this
.
environmentLocalTempFolder
=
path
.
join
(
this
.
experimentRootDir
,
"
environment-temp
"
);
this
.
environmentLocalTempFolder
=
path
.
join
(
this
.
experimentRootDir
,
"
environment-temp
"
);
this
.
nfsRootDir
=
path
.
join
(
this
.
experimentRootDir
,
"
nfs-root
"
);
this
.
experimentId
=
info
.
experimentId
;
this
.
experimentId
=
info
.
experimentId
;
this
.
environmentWorkingFolder
=
path
.
join
(
this
.
CONTAINER_MOUNT_PATH
,
'
nni
'
,
this
.
experimentId
);
this
.
environmentWorkingFolder
=
path
.
join
(
this
.
CONTAINER_MOUNT_PATH
,
'
nni
'
,
this
.
experimentId
);
}
}
...
@@ -147,11 +149,11 @@ export class KubernetesEnvironmentService extends EnvironmentService {
...
@@ -147,11 +149,11 @@ export class KubernetesEnvironmentService extends EnvironmentService {
}
}
protected
async
createNFSStorage
(
nfsServer
:
string
,
nfsPath
:
string
):
Promise
<
void
>
{
protected
async
createNFSStorage
(
nfsServer
:
string
,
nfsPath
:
string
):
Promise
<
void
>
{
await
cpp
.
exec
(
`mkdir -p
${
this
.
environmentLocalTempFolde
r
}
`
);
await
cpp
.
exec
(
`mkdir -p
${
this
.
nfsRootDi
r
}
`
);
try
{
try
{
await
cpp
.
exec
(
`sudo mount
${
nfsServer
}
:
${
nfsPath
}
${
this
.
environmentLocalTempFolde
r
}
`
);
await
cpp
.
exec
(
`sudo mount
${
nfsServer
}
:
${
nfsPath
}
${
this
.
nfsRootDi
r
}
`
);
}
catch
(
error
)
{
}
catch
(
error
)
{
const
mountError
:
string
=
`Mount NFS
${
nfsServer
}
:
${
nfsPath
}
to
${
this
.
environmentLocalTempFolde
r
}
failed, error is
${
error
}
`
;
const
mountError
:
string
=
`Mount NFS
${
nfsServer
}
:
${
nfsPath
}
to
${
this
.
nfsRootDi
r
}
failed, error is
${
error
}
`
;
this
.
log
.
error
(
mountError
);
this
.
log
.
error
(
mountError
);
return
Promise
.
reject
(
mountError
);
return
Promise
.
reject
(
mountError
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment