Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
76f39903
Unverified
Commit
76f39903
authored
Apr 07, 2021
by
J-shang
Committed by
GitHub
Apr 07, 2021
Browse files
sharedstorage support remote umount and fix bug (#3456)
parent
200a1086
Changes
6
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
91 additions
and
11 deletions
+91
-11
ts/nni_manager/training_service/remote_machine/extends/linuxCommands.ts
.../training_service/remote_machine/extends/linuxCommands.ts
+6
-0
ts/nni_manager/training_service/reusable/environments/remoteEnvironmentService.ts
...service/reusable/environments/remoteEnvironmentService.ts
+18
-6
ts/nni_manager/training_service/reusable/sharedStorage.ts
ts/nni_manager/training_service/reusable/sharedStorage.ts
+2
-0
ts/nni_manager/training_service/reusable/shared_storages/azureblobStorageService.ts
...rvice/reusable/shared_storages/azureblobStorageService.ts
+30
-3
ts/nni_manager/training_service/reusable/shared_storages/nfsStorageService.ts
...ing_service/reusable/shared_storages/nfsStorageService.ts
+30
-2
ts/nni_manager/training_service/reusable/trialDispatcher.ts
ts/nni_manager/training_service/reusable/trialDispatcher.ts
+5
-0
No files found.
ts/nni_manager/training_service/remote_machine/extends/linuxCommands.ts
View file @
76f39903
...
...
@@ -124,6 +124,12 @@ class LinuxCommands extends OsCommands {
command
=
`bash '
${
script
}
'`
;
}
else
{
script
=
script
.
replace
(
/"/g
,
'
\\
"
'
);
const
result
=
script
.
match
(
/
[^\\]\\\\
"/g
);
if
(
result
)
{
result
.
forEach
((
res
)
=>
{
script
=
script
.
replace
(
res
,
res
.
replace
(
/"$/g
,
'
\\
"
'
));
})
}
command
=
`bash -c "
${
script
}
"`
;
}
return
command
;
...
...
ts/nni_manager/training_service/reusable/environments/remoteEnvironmentService.ts
View file @
76f39903
...
...
@@ -181,7 +181,7 @@ export class RemoteEnvironmentService extends EnvironmentService {
}
else
{
environment
.
setStatus
(
'
FAILED
'
);
}
this
.
releaseEnvironmentResource
(
environment
);
await
this
.
releaseEnvironmentResource
(
environment
);
}
}
}
...
...
@@ -194,7 +194,16 @@ export class RemoteEnvironmentService extends EnvironmentService {
* If a environment is finished, release the connection resource
* @param environment remote machine environment job detail
*/
private
releaseEnvironmentResource
(
environment
:
EnvironmentInformation
):
void
{
private
async
releaseEnvironmentResource
(
environment
:
EnvironmentInformation
):
Promise
<
void
>
{
if
(
environment
.
useSharedStorage
)
{
const
executor
=
await
this
.
getExecutor
(
environment
.
id
);
const
remoteUmountCommand
=
component
.
get
<
SharedStorageService
>
(
SharedStorageService
).
remoteUmountCommand
;
const
result
=
await
executor
.
executeScript
(
remoteUmountCommand
,
false
,
false
);
if
(
result
.
exitCode
!==
0
)
{
this
.
log
.
error
(
`Umount shared storage on remote machine failed.\n ERROR:
${
result
.
stderr
}
`
);
}
}
const
executorManager
=
this
.
environmentExecutorManagerMap
.
get
(
environment
.
id
);
if
(
executorManager
===
undefined
)
{
throw
new
Error
(
`ExecutorManager is not assigned for environment
${
environment
.
id
}
`
);
...
...
@@ -251,8 +260,11 @@ export class RemoteEnvironmentService extends EnvironmentService {
const
executor
=
await
this
.
getExecutor
(
environment
.
id
);
if
(
environment
.
useSharedStorage
)
{
this
.
remoteExperimentRootDir
=
component
.
get
<
SharedStorageService
>
(
SharedStorageService
).
remoteWorkingRoot
;
const
remoteMountCommand
=
component
.
get
<
SharedStorageService
>
(
SharedStorageService
).
remoteMountCommand
;
await
executor
.
executeScript
(
remoteMountCommand
,
false
,
false
);
const
remoteMountCommand
=
component
.
get
<
SharedStorageService
>
(
SharedStorageService
).
remoteMountCommand
.
replace
(
/echo -e /g
,
`echo `
).
replace
(
/echo /g
,
`echo -e `
);
const
result
=
await
executor
.
executeScript
(
remoteMountCommand
,
false
,
false
);
if
(
result
.
exitCode
!==
0
)
{
throw
new
Error
(
`Mount shared storage on remote machine failed.\n ERROR:
${
result
.
stderr
}
`
);
}
}
else
{
this
.
remoteExperimentRootDir
=
executor
.
getRemoteExperimentRootDir
(
getExperimentId
());
}
...
...
@@ -304,14 +316,14 @@ export class RemoteEnvironmentService extends EnvironmentService {
if
(
environment
.
status
===
'
UNKNOWN
'
)
{
environment
.
status
=
'
USER_CANCELED
'
;
this
.
releaseEnvironmentResource
(
environment
);
await
this
.
releaseEnvironmentResource
(
environment
);
return
}
const
jobpidPath
:
string
=
`
${
environment
.
runnerWorkingFolder
}
/pid`
;
try
{
await
executor
.
killChildProcesses
(
jobpidPath
);
this
.
releaseEnvironmentResource
(
environment
);
await
this
.
releaseEnvironmentResource
(
environment
);
}
catch
(
error
)
{
this
.
log
.
error
(
`stopEnvironment:
${
error
}
`
);
}
...
...
ts/nni_manager/training_service/reusable/sharedStorage.ts
View file @
76f39903
...
...
@@ -20,6 +20,8 @@ export abstract class SharedStorageService {
public
abstract
get
storageService
():
StorageService
;
public
abstract
get
localMountCommand
():
string
;
public
abstract
get
remoteMountCommand
():
string
;
public
abstract
get
remoteUmountCommand
():
string
;
public
abstract
get
localWorkingRoot
():
string
;
public
abstract
get
remoteWorkingRoot
():
string
;
public
abstract
cleanUp
():
Promise
<
void
>
;
}
ts/nni_manager/training_service/reusable/shared_storages/azureblobStorageService.ts
View file @
76f39903
...
...
@@ -79,6 +79,7 @@ export class AzureBlobSharedStorageService extends SharedStorageService {
private
log
:
Logger
;
private
internalStorageService
:
MountedStorageService
;
private
experimentId
:
string
;
private
localMounted
?:
string
;
private
storageType
?:
SharedStorageType
;
private
storageAccountName
?:
string
;
...
...
@@ -113,10 +114,10 @@ export class AzureBlobSharedStorageService extends SharedStorageService {
this
.
log
.
error
(
errorMessage
);
return
Promise
.
reject
(
errorMessage
);
}
if
(
azureblobConfig
.
localMounted
===
'
nnimount
'
)
{
this
.
localMounted
=
azureblobConfig
.
localMounted
;
if
(
this
.
localMounted
===
'
nnimount
'
)
{
await
this
.
helpLocalMount
();
}
else
if
(
azureblobConfig
.
localMounted
===
'
nomount
'
)
{
}
else
if
(
this
.
localMounted
===
'
nomount
'
)
{
const
errorMessage
=
`
${
this
.
storageType
}
Shared Storage:
${
this
.
storageType
}
not Support 'nomount' yet.`
;
this
.
log
.
error
(
errorMessage
);
return
Promise
.
reject
(
errorMessage
);
...
...
@@ -154,6 +155,15 @@ export class AzureBlobSharedStorageService extends SharedStorageService {
}
}
public
get
remoteUmountCommand
():
string
{
if
(
this
.
remoteMountPoint
)
{
return
`sudo umount -l
${
this
.
remoteMountPoint
}
`
;
}
else
{
this
.
log
.
error
(
`
${
this
.
storageType
}
Shared Storage: remoteMountPoint is not initialized.`
);
return
''
;
}
}
private
getCommand
(
mountPoint
:
string
):
string
{
const
install
=
`rm -f nni_install_fuseblob.sh && touch nni_install_fuseblob.sh && echo "
${
INSTALL_BLOBFUSE
.
replace
(
/
\$
/g
,
`\
\$
`
).
replace
(
/
\n
/g
,
`\\n`
).
replace
(
/"/g
,
`\\"`
)}
" >> nni_install_fuseblob.sh && bash nni_install_fuseblob.sh`
;
const
prepare
=
`sudo mkdir /mnt/resource/nniblobfusetmp -p && rm -f nni_fuse_connection.cfg && touch nni_fuse_connection.cfg && echo "accountName
${
this
.
storageAccountName
}
\\naccountKey
${
this
.
storageAccountKey
}
\\ncontainerName
${
this
.
containerName
}
" >> nni_fuse_connection.cfg`
;
...
...
@@ -206,4 +216,21 @@ export class AzureBlobSharedStorageService extends SharedStorageService {
return
Promise
.
reject
(
errorMessage
);
}
}
public
async
cleanUp
():
Promise
<
void
>
{
if
(
this
.
localMounted
!==
'
nnimount
'
)
{
return
Promise
.
resolve
();
}
try
{
const
result
=
await
cpp
.
exec
(
`sudo umount -l
${
this
.
localMountPoint
}
`
);
if
(
result
.
stderr
)
{
throw
new
Error
(
result
.
stderr
);
}
}
catch
(
error
)
{
const
errorMessage
:
string
=
`
${
this
.
storageType
}
Shared Storage: Umount
${
this
.
localMountPoint
}
failed, error is
${
error
}
`
;
this
.
log
.
error
(
errorMessage
);
return
Promise
.
reject
(
errorMessage
);
}
return
Promise
.
resolve
();
}
}
ts/nni_manager/training_service/reusable/shared_storages/nfsStorageService.ts
View file @
76f39903
...
...
@@ -59,6 +59,7 @@ export class NFSSharedStorageService extends SharedStorageService {
private
log
:
Logger
;
private
internalStorageService
:
MountedStorageService
;
private
experimentId
:
string
;
private
localMounted
?:
string
;
private
storageType
?:
SharedStorageType
;
private
nfsServer
?:
string
;
...
...
@@ -83,9 +84,10 @@ export class NFSSharedStorageService extends SharedStorageService {
this
.
storageType
=
nfsConfig
.
storageType
;
this
.
nfsServer
=
nfsConfig
.
nfsServer
;
this
.
exportedDirectory
=
nfsConfig
.
exportedDirectory
;
if
(
nfsConfig
.
localMounted
===
'
nnimount
'
)
{
this
.
localMounted
=
nfsConfig
.
localMounted
;
if
(
this
.
localMounted
===
'
nnimount
'
)
{
await
this
.
helpLocalMount
();
}
else
if
(
nfsConfig
.
localMounted
===
'
nomount
'
)
{
}
else
if
(
this
.
localMounted
===
'
nomount
'
)
{
const
errorMessage
=
`
${
this
.
storageType
}
Shared Storage:
${
this
.
storageType
}
not Support 'nomount'.`
;
this
.
log
.
error
(
errorMessage
);
return
Promise
.
reject
(
errorMessage
);
...
...
@@ -122,6 +124,15 @@ export class NFSSharedStorageService extends SharedStorageService {
}
}
public
get
remoteUmountCommand
():
string
{
if
(
this
.
remoteMountPoint
)
{
return
`sudo umount -f -l
${
this
.
remoteMountPoint
}
`
;
}
else
{
this
.
log
.
error
(
`
${
this
.
storageType
}
Shared Storage: remoteMountPoint is not initialized.`
);
return
''
;
}
}
private
getCommand
(
mountPoint
:
string
):
string
{
const
install
=
`rm -f nni_install_nfsclient.sh && touch nni_install_nfsclient.sh && echo "
${
INSTALL_NFS_CLIENT
.
replace
(
/
\$
/g
,
`\
\$
`
).
replace
(
/
\n
/g
,
`\\n`
).
replace
(
/"/g
,
`\\"`
)}
" >> nni_install_nfsclient.sh && bash nni_install_nfsclient.sh`
;
const
mount
=
`mkdir -p
${
mountPoint
}
&& sudo mount
${
this
.
nfsServer
}
:
${
this
.
exportedDirectory
}
${
mountPoint
}
`
;
...
...
@@ -157,4 +168,21 @@ export class NFSSharedStorageService extends SharedStorageService {
return
Promise
.
resolve
();
}
public
async
cleanUp
():
Promise
<
void
>
{
if
(
this
.
localMounted
!==
'
nnimount
'
)
{
return
Promise
.
resolve
();
}
try
{
const
result
=
await
cpp
.
exec
(
`sudo umount -f -l
${
this
.
localMountPoint
}
`
);
if
(
result
.
stderr
)
{
throw
new
Error
(
result
.
stderr
);
}
}
catch
(
error
)
{
const
errorMessage
:
string
=
`
${
this
.
storageType
}
Shared Storage: Umount
${
this
.
localMountPoint
}
failed, error is
${
error
}
`
;
this
.
log
.
error
(
errorMessage
);
return
Promise
.
reject
(
errorMessage
);
}
return
Promise
.
resolve
();
}
}
ts/nni_manager/training_service/reusable/trialDispatcher.ts
View file @
76f39903
...
...
@@ -348,6 +348,11 @@ class TrialDispatcher implements TrainingService {
for
(
const
commandChannel
of
this
.
commandChannelSet
)
{
await
commandChannel
.
stop
();
}
if
(
this
.
useSharedStorage
)
{
this
.
log
.
info
(
`stopping shared storage...`
)
await
component
.
get
<
SharedStorageService
>
(
SharedStorageService
).
cleanUp
();
this
.
log
.
info
(
`shared storage stopped.`
)
}
}
private
async
environmentMaintenanceLoop
():
Promise
<
void
>
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment