Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
1338c512
Unverified
Commit
1338c512
authored
Feb 25, 2021
by
J-shang
Committed by
GitHub
Feb 25, 2021
Browse files
support shared storage for reusable mode (#3354)
parent
715b1899
Changes
23
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
222 additions
and
4 deletions
+222
-4
ts/nni_manager/training_service/reusable/shared_storages/nfsStorageService.ts
...ing_service/reusable/shared_storages/nfsStorageService.ts
+160
-0
ts/nni_manager/training_service/reusable/test/utEnvironmentService.ts
...er/training_service/reusable/test/utEnvironmentService.ts
+5
-0
ts/nni_manager/training_service/reusable/trialDispatcher.ts
ts/nni_manager/training_service/reusable/trialDispatcher.ts
+57
-4
No files found.
ts/nni_manager/training_service/reusable/shared_storages/nfsStorageService.ts
0 → 100644
View file @
1338c512
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
'
use strict
'
;
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
path
from
'
path
'
;
import
{
SharedStorageService
,
SharedStorageConfig
,
SharedStorageType
,
LocalMountedType
}
from
'
../sharedStorage
'
import
{
MountedStorageService
}
from
'
../storages/mountedStorageService
'
;
import
{
TrialConfigMetadataKey
}
from
'
../../common/trialConfigMetadataKey
'
;
import
{
getLogger
,
Logger
}
from
'
../../../common/log
'
;
import
{
getExperimentId
}
from
'
../../../common/experimentStartupInfo
'
;
/**
 * Bash script that makes sure an NFS client is available on the machine.
 *
 * - Exits with 0 immediately when `nfsstat` is already on the PATH.
 * - Otherwise installs `nfs-common` through apt-get, or `nfs-utils` through
 *   yum or dnf, depending on which package manager is found first.
 * - Prints "Unknown package management." and exits with 1 when none of the
 *   three package managers is found.
 *
 * The text is embedded into a shell command by
 * NFSSharedStorageService.getCommand, which escapes `$`, newlines and `"`
 * before echoing it into a temporary script file.
 */
const
INSTALL_NFS_CLIENT
=
`
#!/bin/bash
if [ -n "$(command -v nfsstat)" ]
then
exit 0
fi
if [ -n "$(command -v apt-get)" ]
then
sudo apt-get update
sudo apt-get install -y nfs-common
elif [ -n "$(command -v yum)" ]
then
sudo yum install -y nfs-utils
elif [ -n "$(command -v dnf)" ]
then
sudo dnf install -y nfs-utils
else
echo "Unknown package management."
exit 1
fi
`
/**
 * Plain value object describing an NFS shared-storage configuration.
 *
 * Every constructor argument is stored unchanged on the public property of
 * the same name.
 */
class NFSSharedStorageConfig implements SharedStorageConfig {
    /**
     * @param storageType       shared storage type tag
     * @param localMountPoint   mount point used on the local machine
     * @param remoteMountPoint  mount point used on the remote machine
     * @param nfsServer         NFS server the export lives on
     * @param exportedDirectory directory exported by the NFS server
     * @param localMounted      how the local mount is handled
     */
    constructor(
        public storageType: SharedStorageType,
        public localMountPoint: string,
        public remoteMountPoint: string,
        public nfsServer: string,
        public exportedDirectory: string,
        public localMounted: LocalMountedType
    ) {}
}
/**
 * Shared storage service backed by an NFS export.
 *
 * The service is configured through `config()` with a JSON-encoded
 * NFSSharedStorageConfig. It can mount the export on the local machine and
 * exposes the shell commands needed to mount it locally or remotely. File
 * operations are delegated to an internal MountedStorageService.
 */
export class NFSSharedStorageService extends SharedStorageService {
    private log: Logger;
    private internalStorageService: MountedStorageService;
    private experimentId: string;

    // Populated by config(); undefined until the shared storage config arrives.
    private storageType?: SharedStorageType;
    private nfsServer?: string;
    private exportedDirectory?: string;
    private localMountPoint?: string;
    private remoteMountPoint?: string;

    constructor() {
        super();
        this.log = getLogger();
        this.internalStorageService = new MountedStorageService();
        this.experimentId = getExperimentId();
    }

    /**
     * Applies a configuration entry. Only
     * `TrialConfigMetadataKey.SHARED_STORAGE_CONFIG` is handled; any other key
     * is ignored.
     *
     * @param key   metadata key of the configuration entry
     * @param value JSON text of an NFSSharedStorageConfig
     * @throws Error when `localMounted` is 'nomount', or when the automatic
     *         local mount ('nnimount') fails
     */
    public async config(key: string, value: string): Promise<void> {
        if (key !== TrialConfigMetadataKey.SHARED_STORAGE_CONFIG) {
            return;
        }

        const nfsConfig = JSON.parse(value) as NFSSharedStorageConfig;
        this.localMountPoint = nfsConfig.localMountPoint;
        this.remoteMountPoint = nfsConfig.remoteMountPoint;
        this.storageType = nfsConfig.storageType;
        this.nfsServer = nfsConfig.nfsServer;
        this.exportedDirectory = nfsConfig.exportedDirectory;

        if (nfsConfig.localMounted === 'nnimount') {
            // NNI performs the local mount itself.
            await this.helpLocalMount();
        } else if (nfsConfig.localMounted === 'nomount') {
            const errorMessage = `${this.storageType} Shared Storage: ${this.storageType} not Support 'nomount'.`;
            this.log.error(errorMessage);
            // Reject with a real Error so callers get a stack trace and `.message`.
            throw new Error(errorMessage);
        }
        // Any other localMounted value falls through: no mount is attempted here.

        const localMountPoint = nfsConfig.localMountPoint;
        this.internalStorageService.initialize(
            localMountPoint,
            path.join(localMountPoint, 'nni', this.experimentId)
        );
    }

    /** Always true for this service. */
    public get canLocalMounted(): boolean {
        return true;
    }

    /** The MountedStorageService created in the constructor. */
    public get storageService(): MountedStorageService {
        return this.internalStorageService;
    }

    /**
     * Shell command that installs the NFS client and mounts the export at the
     * local mount point. Logs an error and returns '' when the local mount
     * point has not been configured.
     */
    public get localMountCommand(): string {
        if (this.localMountPoint) {
            return this.getCommand(this.localMountPoint);
        }
        this.log.error(`${this.storageType} Shared Storage: localMountPoint is not initialized.`);
        return '';
    }

    /**
     * Shell command that installs the NFS client and mounts the export at the
     * remote mount point. Logs an error and returns '' when the remote mount
     * point has not been configured.
     */
    public get remoteMountCommand(): string {
        if (this.remoteMountPoint) {
            return this.getCommand(this.remoteMountPoint);
        }
        this.log.error(`${this.storageType} Shared Storage: remoteMountPoint is not initialized.`);
        return '';
    }

    /**
     * Builds the "install client, mount, clean up" shell command.
     *
     * @param mountPoint directory the NFS export is mounted on
     * @returns the three steps joined with `&&`
     */
    private getCommand(mountPoint: string): string {
        // The installer script is passed through a double-quoted `echo`
        // argument, so `$`, newlines and `"` are backslash-escaped first.
        const escapedScript = INSTALL_NFS_CLIENT
            .replace(/\$/g, '\\$')
            .replace(/\n/g, '\\n')
            .replace(/"/g, '\\"');
        const install = `rm -f nni_install_nfsclient.sh && touch nni_install_nfsclient.sh && echo "${escapedScript}" >> nni_install_nfsclient.sh && bash nni_install_nfsclient.sh`;
        const mount = `mkdir -p ${mountPoint} && sudo mount ${this.nfsServer}:${this.exportedDirectory} ${mountPoint}`;
        const clean = `rm -f nni_install_nfsclient.sh`;
        return `${install} && ${mount} && ${clean}`;
    }

    /** `<localMountPoint>/nni/<experimentId>`. */
    public get localWorkingRoot(): string {
        return `${this.localMountPoint}/nni/${this.experimentId}`;
    }

    /** `<remoteMountPoint>/nni/<experimentId>`. */
    public get remoteWorkingRoot(): string {
        return `${this.remoteMountPoint}/nni/${this.experimentId}`;
    }

    /**
     * Mounts the NFS export on the local machine by executing
     * `localMountCommand`.
     *
     * @throws Error on Windows (auto mount is not supported there) or when the
     *         mount command fails
     */
    private async helpLocalMount(): Promise<void> {
        if (process.platform === 'win32') {
            const errorMessage = `${this.storageType} Shared Storage: NNI not support auto mount ${this.storageType} under Windows yet.`;
            this.log.error(errorMessage);
            throw new Error(errorMessage);
        }

        try {
            const result = await cpp.exec(this.localMountCommand);
            // NOTE(review): any stderr output is treated as a failure, even if
            // the command itself succeeded — confirm this is intended.
            if (result.stderr) {
                throw new Error(result.stderr);
            }
        } catch (error) {
            const errorMessage = `${this.storageType} Shared Storage: Mount ${this.nfsServer}:${this.exportedDirectory} to ${this.localMountPoint} failed, error is ${error}`;
            this.log.error(errorMessage);
            throw new Error(errorMessage);
        }
    }
}
ts/nni_manager/training_service/reusable/test/utEnvironmentService.ts
View file @
1338c512
...
...
@@ -17,6 +17,11 @@ export class UtEnvironmentService extends EnvironmentService {
// storage service is tested by integration testing.
return
false
;
}
/**
 * The unit-test environment service never uses shared storage.
 */
public get useSharedStorage(): boolean {
    return false;
}
/**
 * Interval of the environment maintenance loop for the unit-test service.
 * NOTE(review): the unit is not visible here (presumably milliseconds) — confirm against the caller.
 */
public get environmentMaintenceLoopInterval(): number {
    return 1;
}
...
...
ts/nni_manager/training_service/reusable/trialDispatcher.ts
View file @
1338c512
...
...
@@ -7,6 +7,7 @@ import { EventEmitter } from 'events';
import
*
as
fs
from
'
fs
'
;
import
*
as
path
from
'
path
'
;
import
{
Writable
}
from
'
stream
'
;
import
{
Container
,
Scope
}
from
'
typescript-ioc
'
;
import
{
String
}
from
'
typescript-string-operations
'
;
import
*
as
component
from
'
../../common/component
'
;
import
{
NNIError
,
NNIErrorNames
,
MethodNotImplementedError
}
from
'
../../common/errors
'
;
...
...
@@ -26,6 +27,9 @@ import { EnvironmentServiceFactory } from './environments/environmentServiceFact
import
{
GpuScheduler
}
from
'
./gpuScheduler
'
;
import
{
MountedStorageService
}
from
'
./storages/mountedStorageService
'
;
import
{
StorageService
}
from
'
./storageService
'
;
import
{
SharedStorageService
,
SharedStorageConfig
}
from
'
./sharedStorage
'
;
import
{
NFSSharedStorageService
}
from
'
./shared_storages/nfsStorageService
'
import
{
AzureBlobSharedStorageService
}
from
'
./shared_storages/azureblobStorageService
'
import
{
TrialDetail
}
from
'
./trial
'
;
...
...
@@ -74,6 +78,10 @@ class TrialDispatcher implements TrainingService {
private
isLoggedNoMoreEnvironment
:
boolean
=
false
;
private
isLoggedNoGpuAvailable
:
boolean
=
false
;
// marks whether shared storage is in use
private
useSharedStorage
:
boolean
=
false
;
private
fileCopyCompleted
:
boolean
=
false
;
constructor
()
{
this
.
log
=
getLogger
();
this
.
trials
=
new
Map
<
string
,
TrialDetail
>
();
...
...
@@ -195,7 +203,14 @@ class TrialDispatcher implements TrainingService {
this
.
log
.
info
(
`TrialDispatcher: copying code and settings.`
);
let
storageService
:
StorageService
;
if
(
environmentService
.
hasStorageService
)
{
if
(
this
.
useSharedStorage
)
{
if
(
this
.
fileCopyCompleted
)
{
this
.
log
.
debug
(
`TrialDispatcher: file already copy to shared storage.`
);
continue
;
}
this
.
log
.
debug
(
`TrialDispatcher: use shared storage service.`
);
storageService
=
component
.
get
<
SharedStorageService
>
(
SharedStorageService
).
storageService
;
}
else
if
(
environmentService
.
hasStorageService
)
{
this
.
log
.
debug
(
`TrialDispatcher: use existing storage service.`
);
storageService
=
component
.
get
<
StorageService
>
(
StorageService
);
}
else
{
...
...
@@ -223,6 +238,10 @@ class TrialDispatcher implements TrainingService {
}
await
storageService
.
copyDirectory
(
trialToolsPath
,
envDir
,
true
);
}
if
(
this
.
useSharedStorage
)
{
this
.
fileCopyCompleted
=
true
;
}
}
// start channel
this
.
commandEmitter
.
on
(
"
command
"
,
(
command
:
Command
):
void
=>
{
...
...
@@ -260,7 +279,6 @@ class TrialDispatcher implements TrainingService {
break
;
case
TrialConfigMetadataKey
.
VERSION_CHECK
:
this
.
enableVersionCheck
=
(
value
===
'
true
'
||
value
===
'
True
'
);
break
;
case
TrialConfigMetadataKey
.
LOG_COLLECTION
:
this
.
logCollection
=
value
;
...
...
@@ -289,7 +307,16 @@ class TrialDispatcher implements TrainingService {
this
.
commandChannelSet
.
add
(
environmentService
.
getCommandChannel
);
this
.
environmentServiceList
.
push
(
environmentService
);
}
break
;
}
case
TrialConfigMetadataKey
.
SHARED_STORAGE_CONFIG
:
if
(
this
.
useSharedStorage
===
false
)
{
await
this
.
initializeSharedStorage
(
key
,
value
);
}
else
{
const
errorMessage
=
`Already has set shared storage.`
;
this
.
log
.
error
(
errorMessage
);
}
break
;
}
for
(
const
environmentService
of
this
.
environmentServiceList
)
{
await
environmentService
.
config
(
key
,
value
);
...
...
@@ -618,7 +645,7 @@ class TrialDispatcher implements TrainingService {
}
}
// Schedule an environment platform for an environment
private
selectEnvironmentService
():
EnvironmentService
|
undefined
{
const
validEnvironmentServiceList
=
[];
...
...
@@ -633,7 +660,7 @@ class TrialDispatcher implements TrainingService {
// Random scheduler
return
randomSelect
(
validEnvironmentServiceList
);
}
private
async
prefetchEnvironments
():
Promise
<
void
>
{
for
(
const
environmentService
of
this
.
environmentServiceList
)
{
const
number
=
environmentService
.
prefetchedEnvironmentCount
;
...
...
@@ -658,6 +685,8 @@ class TrialDispatcher implements TrainingService {
environment
.
command
=
`mkdir -p envs/
${
envId
}
&& cd envs/
${
envId
}
&&
${
environment
.
command
}
`
;
environment
.
useSharedStorage
=
this
.
useSharedStorage
;
await
environmentService
.
startEnvironment
(
environment
);
this
.
environments
.
set
(
environment
.
id
,
environment
);
...
...
@@ -881,6 +910,30 @@ class TrialDispatcher implements TrainingService {
}
this
.
shouldUpdateTrials
=
true
;
}
/**
 * Binds the SharedStorageService implementation that matches the configured
 * storage type, configures it, and then marks shared storage as in use.
 *
 * @param key   metadata key forwarded to the shared storage service
 * @param value JSON text of a SharedStorageConfig; its `storageType` selects
 *              the implementation ('NFS' or 'AzureBlob')
 * @throws Error when the storage type is not supported; failures from the
 *         service's own `config()` propagate unchanged
 */
private async initializeSharedStorage(key: string, value: string): Promise<void> {
    const storageType = (JSON.parse(value) as SharedStorageConfig).storageType;

    switch (storageType) {
        case 'NFS':
            Container.bind(SharedStorageService)
                .to(NFSSharedStorageService)
                .scope(Scope.Singleton);
            break;
        case 'AzureBlob':
            Container.bind(SharedStorageService)
                .to(AzureBlobSharedStorageService)
                .scope(Scope.Singleton);
            break;
        default: {
            const errorMessage = `Shared storage type '${storageType}' not support.`;
            this.log.error(errorMessage);
            // Reject with a real Error so callers get a stack trace and `.message`.
            throw new Error(errorMessage);
        }
    }

    await component.get<SharedStorageService>(SharedStorageService).config(key, value);
    // Only flag shared storage as active once the service configured successfully.
    this.useSharedStorage = true;
}
}
export
{
TrialDispatcher
};
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment