Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
063d6b74
Unverified
Commit
063d6b74
authored
Apr 26, 2021
by
SparkSnail
Committed by
GitHub
Apr 26, 2021
Browse files
Merge pull request #3580 from microsoft/v2.2
[do not Squash!] Merge V2.2 back to master
parents
08986c6b
e1295888
Changes
86
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
178 additions
and
140 deletions
+178
-140
ts/nni_manager/core/nnimanager.ts
ts/nni_manager/core/nnimanager.ts
+51
-13
ts/nni_manager/rest_server/restValidationSchemas.ts
ts/nni_manager/rest_server/restValidationSchemas.ts
+5
-0
ts/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts
...frameworkcontroller/frameworkcontrollerTrainingService.ts
+5
-3
ts/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts
...ng_service/kubernetes/kubeflow/kubeflowTrainingService.ts
+2
-1
ts/nni_manager/training_service/kubernetes/kubernetesJobRestServer.ts
...er/training_service/kubernetes/kubernetesJobRestServer.ts
+0
-1
ts/nni_manager/training_service/remote_machine/extends/linuxCommands.ts
.../training_service/remote_machine/extends/linuxCommands.ts
+4
-0
ts/nni_manager/training_service/remote_machine/extends/windowsCommands.ts
...raining_service/remote_machine/extends/windowsCommands.ts
+4
-0
ts/nni_manager/training_service/remote_machine/osCommands.ts
ts/nni_manager/training_service/remote_machine/osCommands.ts
+1
-0
ts/nni_manager/training_service/remote_machine/shellExecutor.ts
..._manager/training_service/remote_machine/shellExecutor.ts
+10
-0
ts/nni_manager/training_service/reusable/environment.ts
ts/nni_manager/training_service/reusable/environment.ts
+4
-0
ts/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts
...ng_service/reusable/environments/amlEnvironmentService.ts
+16
-39
ts/nni_manager/training_service/reusable/environments/environmentServiceFactory.ts
...ervice/reusable/environments/environmentServiceFactory.ts
+1
-1
ts/nni_manager/training_service/reusable/environments/remoteEnvironmentService.ts
...service/reusable/environments/remoteEnvironmentService.ts
+16
-15
ts/nni_manager/training_service/reusable/trialDispatcher.ts
ts/nni_manager/training_service/reusable/trialDispatcher.ts
+3
-2
ts/webui/src/App.tsx
ts/webui/src/App.tsx
+1
-1
ts/webui/src/components/modals/ExperimentSummaryPanel.tsx
ts/webui/src/components/modals/ExperimentSummaryPanel.tsx
+1
-1
ts/webui/src/components/overview/count/EditExperimentParam.tsx
...bui/src/components/overview/count/EditExperimentParam.tsx
+13
-10
ts/webui/src/components/overview/count/ExpDuration.tsx
ts/webui/src/components/overview/count/ExpDuration.tsx
+1
-1
ts/webui/src/components/overview/count/TrialCount.tsx
ts/webui/src/components/overview/count/TrialCount.tsx
+1
-1
ts/webui/src/components/slideNav/TrialConfigPanel.tsx
ts/webui/src/components/slideNav/TrialConfigPanel.tsx
+39
-51
No files found.
ts/nni_manager/core/nnimanager.ts
View file @
063d6b74
...
...
@@ -175,12 +175,14 @@ class NNIManager implements Manager {
nextSequenceId
:
0
,
revision
:
0
};
this
.
config
=
config
;
this
.
log
.
info
(
`Starting experiment:
${
this
.
experimentProfile
.
id
}
`
);
await
this
.
storeExperimentProfile
();
this
.
log
.
info
(
'
Setup training service...
'
);
this
.
trainingService
=
await
this
.
initTrainingService
(
config
);
if
(
this
.
trainingService
===
undefined
)
{
this
.
log
.
info
(
'
Setup training service...
'
);
this
.
trainingService
=
await
this
.
initTrainingService
(
config
);
}
this
.
log
.
info
(
'
Setup tuner...
'
);
const
dispatcherCommand
:
string
=
getMsgDispatcherCommand
(
config
);
...
...
@@ -198,18 +200,22 @@ class NNIManager implements Manager {
}
public
async
resumeExperiment
(
readonly
:
boolean
):
Promise
<
void
>
{
this
.
log
.
info
(
`Resuming experiment:
${
this
.
experimentProfile
.
id
}
`
);
//Fetch back the experiment profile
const
experimentId
:
string
=
getExperimentId
();
this
.
log
.
info
(
`Resuming experiment:
${
experimentId
}
`
);
this
.
experimentProfile
=
await
this
.
dataStore
.
getExperimentProfile
(
experimentId
);
this
.
readonly
=
readonly
;
if
(
readonly
)
{
this
.
setStatus
(
'
VIEWED
'
);
return
Promise
.
resolve
();
}
this
.
log
.
info
(
'
Setup training service...
'
);
const
config
:
ExperimentConfig
=
this
.
experimentProfile
.
params
;
this
.
trainingService
=
await
this
.
initTrainingService
(
config
);
this
.
config
=
config
;
if
(
this
.
trainingService
===
undefined
)
{
this
.
log
.
info
(
'
Setup training service...
'
);
this
.
trainingService
=
await
this
.
initTrainingService
(
config
);
}
this
.
log
.
info
(
'
Setup tuner...
'
);
const
dispatcherCommand
:
string
=
getMsgDispatcherCommand
(
config
);
...
...
@@ -254,12 +260,35 @@ class NNIManager implements Manager {
return
this
.
dataStore
.
getTrialJob
(
trialJobId
);
}
public
async
setClusterMetadata
(
_key
:
string
,
_value
:
string
):
Promise
<
void
>
{
throw
new
Error
(
'
Calling removed API setClusterMetadata
'
);
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
// Hack for supporting v2 config, need refactor
if
(
this
.
trainingService
===
undefined
)
{
this
.
log
.
info
(
'
Setup training service...
'
);
switch
(
key
)
{
case
'
kubeflow_config
'
:
{
const
kubeflowModule
=
await
import
(
'
../training_service/kubernetes/kubeflow/kubeflowTrainingService
'
);
this
.
trainingService
=
new
kubeflowModule
.
KubeflowTrainingService
();
break
;
}
case
'
frameworkcontroller_config
'
:
{
const
fcModule
=
await
import
(
'
../training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService
'
);
this
.
trainingService
=
new
fcModule
.
FrameworkControllerTrainingService
();
break
;
}
case
'
adl_config
'
:
{
const
adlModule
=
await
import
(
'
../training_service/kubernetes/adl/adlTrainingService
'
);
this
.
trainingService
=
new
adlModule
.
AdlTrainingService
();
break
;
}
default
:
throw
new
Error
(
"
Setup training service failed.
"
);
}
}
await
this
.
trainingService
.
setClusterMetadata
(
key
,
value
);
}
public
getClusterMetadata
(
_
key
:
string
):
Promise
<
string
>
{
throw
new
Error
(
'
Calling removed API
getClusterMetadata
'
);
public
getClusterMetadata
(
key
:
string
):
Promise
<
string
>
{
return
this
.
trainingService
.
getClusterMetadata
(
key
);
}
public
async
getTrialJobStatistics
():
Promise
<
TrialJobStatistics
[]
>
{
...
...
@@ -404,8 +433,17 @@ class NNIManager implements Manager {
}
private
async
initTrainingService
(
config
:
ExperimentConfig
):
Promise
<
TrainingService
>
{
this
.
config
=
config
;
const
platform
=
Array
.
isArray
(
config
.
trainingService
)
?
'
hybrid
'
:
config
.
trainingService
.
platform
;
let
platform
:
string
;
if
(
Array
.
isArray
(
config
.
trainingService
))
{
platform
=
'
hybrid
'
;
}
else
if
(
config
.
trainingService
.
platform
)
{
platform
=
config
.
trainingService
.
platform
;
}
else
{
platform
=
(
config
as
any
).
trainingServicePlatform
;
}
if
(
!
platform
)
{
throw
new
Error
(
'
Cannot detect training service platform
'
);
}
if
([
'
remote
'
,
'
pai
'
,
'
aml
'
,
'
hybrid
'
].
includes
(
platform
))
{
const
module_
=
await
import
(
'
../training_service/reusable/routerTrainingService
'
);
...
...
ts/nni_manager/rest_server/restValidationSchemas.ts
View file @
063d6b74
...
...
@@ -131,6 +131,9 @@ export namespace ValidationSchemas {
maxTrialNumPerGpu
:
joi
.
number
(),
useActiveGpu
:
joi
.
boolean
(),
}),
adl_config
:
joi
.
object
({
// eslint-disable-line @typescript-eslint/camelcase
// hack for v2 configuration
}),
kubeflow_config
:
joi
.
object
({
// eslint-disable-line @typescript-eslint/camelcase
operator
:
joi
.
string
().
min
(
1
).
required
(),
storage
:
joi
.
string
().
min
(
1
),
...
...
@@ -194,6 +197,8 @@ export namespace ValidationSchemas {
nni_manager_ip
:
joi
.
object
({
// eslint-disable-line @typescript-eslint/camelcase
nniManagerIp
:
joi
.
string
().
min
(
1
)
}),
version_check
:
joi
.
boolean
(),
// eslint-disable-line @typescript-eslint/camelcase
log_collection
:
joi
.
string
(),
// eslint-disable-line @typescript-eslint/camelcase
remote_config
:
joi
.
object
({
// eslint-disable-line @typescript-eslint/camelcase
reuse
:
joi
.
boolean
()
}),
...
...
ts/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts
View file @
063d6b74
...
...
@@ -19,6 +19,7 @@ import {validateCodeDir} from '../../common/util';
import
{
NFSConfig
}
from
'
../kubernetesConfig
'
;
import
{
KubernetesTrialJobDetail
}
from
'
../kubernetesData
'
;
import
{
KubernetesTrainingService
}
from
'
../kubernetesTrainingService
'
;
import
{
KubernetesJobRestServer
}
from
'
../kubernetesJobRestServer
'
;
import
{
FrameworkControllerClientFactory
}
from
'
./frameworkcontrollerApiClient
'
;
import
{
FrameworkControllerClusterConfig
,
...
...
@@ -52,7 +53,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
}
public
async
run
():
Promise
<
void
>
{
this
.
kubernetesJobRestServer
=
component
.
get
(
FrameworkController
JobRestServer
);
this
.
kubernetesJobRestServer
=
new
Kubernetes
JobRestServer
(
this
);
if
(
this
.
kubernetesJobRestServer
===
undefined
)
{
throw
new
Error
(
'
kubernetesJobRestServer not initialized!
'
);
}
...
...
@@ -140,10 +141,11 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
const
trialLocalTempFolder
:
string
=
path
.
join
(
getExperimentRootDir
(),
'
trials-local
'
,
trialJobId
);
let
frameworkcontrollerJobName
:
string
=
`nniexp
${
this
.
experimentId
}
trial
${
trialJobId
}
`
.
toLowerCase
();
// Create frameworkcontroller job based on generated frameworkcontroller job resource config
let
frameworkcontrollerJobConfig
=
JSON
.
parse
(
JSON
.
stringify
(
this
.
fcTemplate
));
let
frameworkcontrollerJobConfig
:
any
;
if
(
this
.
fcTemplate
!==
undefined
)
{
// Create frameworkcontroller job based on generated frameworkcontroller job resource config
frameworkcontrollerJobConfig
=
JSON
.
parse
(
JSON
.
stringify
(
this
.
fcTemplate
));
// add a custom name extension to the job name and apply it to the custom template
frameworkcontrollerJobName
+=
"
xx
"
+
this
.
fcTemplate
.
metadata
.
name
;
// Process custom task roles commands
...
...
ts/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts
View file @
063d6b74
...
...
@@ -19,6 +19,7 @@ import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey';
import
{
validateCodeDir
}
from
'
../../common/util
'
;
import
{
NFSConfig
}
from
'
../kubernetesConfig
'
;
import
{
KubernetesTrialJobDetail
}
from
'
../kubernetesData
'
;
import
{
KubernetesJobRestServer
}
from
'
../kubernetesJobRestServer
'
;
import
{
KubernetesTrainingService
}
from
'
../kubernetesTrainingService
'
;
import
{
KubeflowOperatorClientFactory
}
from
'
./kubeflowApiClient
'
;
import
{
KubeflowClusterConfig
,
KubeflowClusterConfigAzure
,
KubeflowClusterConfigFactory
,
KubeflowClusterConfigNFS
,
...
...
@@ -46,7 +47,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
public
async
run
():
Promise
<
void
>
{
this
.
log
.
info
(
'
Run Kubeflow training service.
'
);
this
.
kubernetesJobRestServer
=
component
.
get
(
Kubeflow
JobRestServer
);
this
.
kubernetesJobRestServer
=
new
Kubernetes
JobRestServer
(
this
);
if
(
this
.
kubernetesJobRestServer
===
undefined
)
{
throw
new
Error
(
'
kubernetesJobRestServer not initialized!
'
);
}
...
...
ts/nni_manager/training_service/kubernetes/kubernetesJobRestServer.ts
View file @
063d6b74
...
...
@@ -16,7 +16,6 @@ import { KubernetesTrainingService } from './kubernetesTrainingService';
export
class
KubernetesJobRestServer
extends
ClusterJobRestServer
{
@
Inject
private
readonly
kubernetesTrainingService
?
:
KubernetesTrainingService
;
/**
* constructor to provide NNIRestServer's own rest property, e.g. port
*/
...
...
ts/nni_manager/training_service/remote_machine/extends/linuxCommands.ts
View file @
063d6b74
...
...
@@ -146,6 +146,10 @@ class LinuxCommands extends OsCommands {
public
fileExistCommand
(
filePath
:
string
):
string
{
return
`test -e
${
filePath
}
&& echo True || echo False`
;
}
public
getCurrentPath
():
string
{
return
`pwd`
;
}
}
export
{
LinuxCommands
};
ts/nni_manager/training_service/remote_machine/extends/windowsCommands.ts
View file @
063d6b74
...
...
@@ -134,6 +134,10 @@ class WindowsCommands extends OsCommands {
public
fileExistCommand
(
filePath
:
string
):
string
{
return
`powershell Test-Path
${
filePath
}
-PathType Leaf`
;
}
public
getCurrentPath
():
string
{
return
`chdir`
;
}
}
export
{
WindowsCommands
};
ts/nni_manager/training_service/remote_machine/osCommands.ts
View file @
063d6b74
...
...
@@ -30,6 +30,7 @@ abstract class OsCommands {
public
abstract
executeScript
(
script
:
string
,
isFile
:
boolean
):
string
;
public
abstract
setPythonPath
(
pythonPath
:
string
|
undefined
,
command
:
string
|
undefined
):
string
|
undefined
;
public
abstract
fileExistCommand
(
filePath
:
string
):
string
|
undefined
;
public
abstract
getCurrentPath
():
string
;
public
joinPath
(...
paths
:
string
[]):
string
{
let
dir
:
string
=
paths
.
filter
((
path
:
any
)
=>
path
!==
''
).
join
(
this
.
pathSpliter
);
...
...
ts/nni_manager/training_service/remote_machine/shellExecutor.ts
View file @
063d6b74
...
...
@@ -169,6 +169,16 @@ class ShellExecutor {
return
this
.
tempPath
;
}
public
async
getCurrentPath
():
Promise
<
string
>
{
const
commandText
=
this
.
osCommands
&&
this
.
osCommands
.
getCurrentPath
();
const
commandResult
=
await
this
.
execute
(
commandText
);
if
(
commandResult
.
exitCode
==
0
)
{
return
commandResult
.
stdout
;
}
else
{
throw
Error
(
commandResult
.
stderr
);
}
}
public
getRemoteScriptsPath
(
experimentId
:
string
):
string
{
return
this
.
joinPath
(
this
.
getRemoteExperimentRootDir
(
experimentId
),
'
scripts
'
);
}
...
...
ts/nni_manager/training_service/reusable/environment.ts
View file @
063d6b74
...
...
@@ -128,6 +128,10 @@ export class EnvironmentInformation {
export
abstract
class
EnvironmentService
{
public
async
init
():
Promise
<
void
>
{
return
;
}
public
abstract
get
hasStorageService
():
boolean
;
public
abstract
refreshEnvironmentsStatus
(
environments
:
EnvironmentInformation
[]):
Promise
<
void
>
;
public
abstract
stopEnvironment
(
environment
:
EnvironmentInformation
):
Promise
<
void
>
;
...
...
ts/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts
View file @
063d6b74
...
...
@@ -9,15 +9,16 @@ import * as component from '../../../common/component';
import
{
getExperimentId
}
from
'
../../../common/experimentStartupInfo
'
;
import
{
getLogger
,
Logger
}
from
'
../../../common/log
'
;
import
{
getExperimentRootDir
}
from
'
../../../common/utils
'
;
import
{
TrialConfigMetadataKey
}
from
'
../../common/
trialConfigMetadataKey
'
;
import
{
ExperimentConfig
,
AmlConfig
,
flattenConfig
}
from
'
../../
../
common/
experimentConfig
'
;
import
{
validateCodeDir
}
from
'
../../common/util
'
;
import
{
AMLClient
}
from
'
../aml/amlClient
'
;
import
{
AMLClusterConfig
,
AMLEnvironmentInformation
,
AMLTrialConfig
}
from
'
../aml/amlConfig
'
;
import
{
AMLEnvironmentInformation
}
from
'
../aml/amlConfig
'
;
import
{
EnvironmentInformation
,
EnvironmentService
}
from
'
../environment
'
;
import
{
EventEmitter
}
from
"
events
"
;
import
{
AMLCommandChannel
}
from
'
../channels/amlCommandChannel
'
;
import
{
SharedStorageService
}
from
'
../sharedStorage
'
interface
FlattenAmlConfig
extends
ExperimentConfig
,
AmlConfig
{
}
/**
* Collector AML jobs info from AML cluster, and update aml job status locally
...
...
@@ -26,15 +27,16 @@ import { SharedStorageService } from '../sharedStorage'
export
class
AMLEnvironmentService
extends
EnvironmentService
{
private
readonly
log
:
Logger
=
getLogger
();
public
amlClusterConfig
:
AMLClusterConfig
|
undefined
;
public
amlTrialConfig
:
AMLTrialConfig
|
undefined
;
private
experimentId
:
string
;
private
experimentRootDir
:
string
;
private
config
:
FlattenAmlConfig
;
constructor
()
{
constructor
(
config
:
ExperimentConfig
)
{
super
();
this
.
experimentId
=
getExperimentId
();
this
.
experimentRootDir
=
getExperimentRootDir
();
this
.
config
=
flattenConfig
(
config
,
'
aml
'
);
validateCodeDir
(
this
.
config
.
trialCodeDirectory
);
}
public
get
hasStorageService
():
boolean
{
...
...
@@ -53,27 +55,6 @@ export class AMLEnvironmentService extends EnvironmentService {
return
'
aml
'
;
}
public
async
config
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
switch
(
key
)
{
case
TrialConfigMetadataKey
.
AML_CLUSTER_CONFIG
:
this
.
amlClusterConfig
=
<
AMLClusterConfig
>
JSON
.
parse
(
value
);
break
;
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
{
if
(
this
.
amlClusterConfig
===
undefined
)
{
this
.
log
.
error
(
'
aml cluster config is not initialized
'
);
break
;
}
this
.
amlTrialConfig
=
<
AMLTrialConfig
>
JSON
.
parse
(
value
);
// Validate to make sure codeDir doesn't have too many files
await
validateCodeDir
(
this
.
amlTrialConfig
.
codeDir
);
break
;
}
default
:
this
.
log
.
debug
(
`AML not proccessed metadata key: '
${
key
}
', value: '
${
value
}
'`
);
}
}
public
async
refreshEnvironmentsStatus
(
environments
:
EnvironmentInformation
[]):
Promise
<
void
>
{
environments
.
forEach
(
async
(
environment
)
=>
{
const
amlClient
=
(
environment
as
AMLEnvironmentInformation
).
amlClient
;
...
...
@@ -107,12 +88,6 @@ export class AMLEnvironmentService extends EnvironmentService {
}
public
async
startEnvironment
(
environment
:
EnvironmentInformation
):
Promise
<
void
>
{
if
(
this
.
amlClusterConfig
===
undefined
)
{
throw
new
Error
(
'
AML Cluster config is not initialized
'
);
}
if
(
this
.
amlTrialConfig
===
undefined
)
{
throw
new
Error
(
'
AML trial config is not initialized
'
);
}
const
amlEnvironment
:
AMLEnvironmentInformation
=
environment
as
AMLEnvironmentInformation
;
const
environmentLocalTempFolder
=
path
.
join
(
this
.
experimentRootDir
,
"
environment-temp
"
);
if
(
!
fs
.
existsSync
(
environmentLocalTempFolder
))
{
...
...
@@ -126,22 +101,24 @@ export class AMLEnvironmentService extends EnvironmentService {
amlEnvironment
.
command
=
`mv envs outputs/envs && cd outputs &&
${
amlEnvironment
.
command
}
`
;
}
amlEnvironment
.
command
=
`import os\nos.system('
${
amlEnvironment
.
command
}
')`
;
amlEnvironment
.
useActiveGpu
=
this
.
amlClusterConfig
.
useActiveGpu
;
amlEnvironment
.
maxTrialNumberPerGpu
=
this
.
amlClusterC
onfig
.
maxTrialNumPerGpu
;
amlEnvironment
.
useActiveGpu
=
!!
this
.
config
.
deprecated
.
useActiveGpu
;
amlEnvironment
.
maxTrialNumberPerGpu
=
this
.
c
onfig
.
maxTrialNum
ber
PerGpu
;
await
fs
.
promises
.
writeFile
(
path
.
join
(
environmentLocalTempFolder
,
'
nni_script.py
'
),
amlEnvironment
.
command
,
{
encoding
:
'
utf8
'
});
const
amlClient
=
new
AMLClient
(
this
.
amlClusterC
onfig
.
subscriptionId
,
this
.
amlClusterC
onfig
.
resourceGroup
,
this
.
amlClusterC
onfig
.
workspaceName
,
this
.
c
onfig
.
subscriptionId
,
this
.
c
onfig
.
resourceGroup
,
this
.
c
onfig
.
workspaceName
,
this
.
experimentId
,
this
.
amlClusterC
onfig
.
computeTarget
,
this
.
amlTrialConfig
.
i
mage
,
this
.
c
onfig
.
computeTarget
,
this
.
config
.
dockerI
mage
,
'
nni_script.py
'
,
environmentLocalTempFolder
);
amlEnvironment
.
id
=
await
amlClient
.
submit
();
this
.
log
.
debug
(
'
aml: before getTrackingUrl
'
);
amlEnvironment
.
trackingUrl
=
await
amlClient
.
getTrackingUrl
();
this
.
log
.
debug
(
'
aml: after getTrackingUrl
'
);
amlEnvironment
.
amlClient
=
amlClient
;
}
...
...
ts/nni_manager/training_service/reusable/environments/environmentServiceFactory.ts
View file @
063d6b74
...
...
@@ -13,7 +13,7 @@ export class EnvironmentServiceFactory {
case
'
remote
'
:
return
new
RemoteEnvironmentService
(
config
);
case
'
aml
'
:
return
new
AMLEnvironmentService
();
return
new
AMLEnvironmentService
(
config
);
case
'
openpai
'
:
return
new
OpenPaiEnvironmentService
(
config
);
default
:
...
...
ts/nni_manager/training_service/reusable/environments/remoteEnvironmentService.ts
View file @
063d6b74
...
...
@@ -27,7 +27,7 @@ export class RemoteEnvironmentService extends EnvironmentService {
private
readonly
environmentExecutorManagerMap
:
Map
<
string
,
ExecutorManager
>
;
private
readonly
remoteMachineMetaOccupiedMap
:
Map
<
RemoteMachineConfig
,
boolean
>
;
private
readonly
log
:
Logger
;
private
sshConnectionPromises
:
any
[];
private
sshConnectionPromises
:
Promise
<
void
[]
>
;
private
experimentRootDir
:
string
;
private
remoteExperimentRootDir
:
string
=
""
;
private
experimentId
:
string
;
...
...
@@ -39,7 +39,6 @@ export class RemoteEnvironmentService extends EnvironmentService {
this
.
environmentExecutorManagerMap
=
new
Map
<
string
,
ExecutorManager
>
();
this
.
machineExecutorManagerMap
=
new
Map
<
RemoteMachineConfig
,
ExecutorManager
>
();
this
.
remoteMachineMetaOccupiedMap
=
new
Map
<
RemoteMachineConfig
,
boolean
>
();
this
.
sshConnectionPromises
=
[];
this
.
experimentRootDir
=
getExperimentRootDir
();
this
.
experimentId
=
getExperimentId
();
this
.
log
=
getLogger
();
...
...
@@ -50,9 +49,18 @@ export class RemoteEnvironmentService extends EnvironmentService {
throw
new
Error
(
`codeDir
${
this
.
config
.
trialCodeDirectory
}
is not a directory`
);
}
this
.
sshConnectionPromises
=
this
.
config
.
machineList
.
map
(
this
.
sshConnectionPromises
=
Promise
.
all
(
this
.
config
.
machineList
.
map
(
machine
=>
this
.
initRemoteMachineOnConnected
(
machine
)
);
));
}
public
async
init
():
Promise
<
void
>
{
await
this
.
sshConnectionPromises
;
this
.
log
.
info
(
'
ssh connection initialized!
'
);
Array
.
from
(
this
.
machineExecutorManagerMap
.
keys
()).
forEach
(
rmMeta
=>
{
// initialize remoteMachineMetaOccupiedMap, false means not occupied
this
.
remoteMachineMetaOccupiedMap
.
set
(
rmMeta
,
false
);
});
}
public
get
prefetchedEnvironmentCount
():
number
{
...
...
@@ -204,16 +212,6 @@ export class RemoteEnvironmentService extends EnvironmentService {
}
public
async
startEnvironment
(
environment
:
EnvironmentInformation
):
Promise
<
void
>
{
if
(
this
.
sshConnectionPromises
.
length
>
0
)
{
await
Promise
.
all
(
this
.
sshConnectionPromises
);
this
.
log
.
info
(
'
ssh connection initialized!
'
);
// set sshConnectionPromises to [] to avoid log information duplicated
this
.
sshConnectionPromises
=
[];
Array
.
from
(
this
.
machineExecutorManagerMap
.
keys
()).
forEach
(
rmMeta
=>
{
// initialize remoteMachineMetaOccupiedMap, false means not occupied
this
.
remoteMachineMetaOccupiedMap
.
set
(
rmMeta
,
false
);
});
}
const
remoteEnvironment
:
RemoteMachineEnvironmentInformation
=
environment
as
RemoteMachineEnvironmentInformation
;
remoteEnvironment
.
status
=
'
WAITING
'
;
// schedule machine for environment, generate command
...
...
@@ -238,7 +236,10 @@ export class RemoteEnvironmentService extends EnvironmentService {
const
executor
=
await
this
.
getExecutor
(
environment
.
id
);
if
(
environment
.
useSharedStorage
)
{
this
.
remoteExperimentRootDir
=
component
.
get
<
SharedStorageService
>
(
SharedStorageService
).
remoteWorkingRoot
;
const
remoteMountCommand
=
component
.
get
<
SharedStorageService
>
(
SharedStorageService
).
remoteMountCommand
.
replace
(
/echo -e /g
,
`echo `
).
replace
(
/echo /g
,
`echo -e `
);
if
(
!
this
.
remoteExperimentRootDir
.
startsWith
(
'
/
'
))
{
this
.
remoteExperimentRootDir
=
executor
.
joinPath
((
await
executor
.
getCurrentPath
()).
trim
(),
this
.
remoteExperimentRootDir
);
}
const
remoteMountCommand
=
component
.
get
<
SharedStorageService
>
(
SharedStorageService
).
remoteMountCommand
.
replace
(
/echo -e /g
,
`echo `
).
replace
(
/echo /g
,
`echo -e `
).
replace
(
/
\\\$
/g
,
`\\\\\
\$
`
);
const
result
=
await
executor
.
executeScript
(
remoteMountCommand
,
false
,
false
);
if
(
result
.
exitCode
!==
0
)
{
throw
new
Error
(
`Mount shared storage on remote machine failed.\n ERROR:
${
result
.
stderr
}
`
);
...
...
ts/nni_manager/training_service/reusable/trialDispatcher.ts
View file @
063d6b74
...
...
@@ -122,7 +122,6 @@ class TrialDispatcher implements TrainingService {
this
.
environmentServiceList
.
push
(
env
);
}
// FIXME: max?
this
.
environmentMaintenceLoopInterval
=
Math
.
max
(
...
this
.
environmentServiceList
.
map
((
env
)
=>
env
.
environmentMaintenceLoopInterval
)
);
...
...
@@ -211,6 +210,7 @@ class TrialDispatcher implements TrainingService {
}
public
async
run
():
Promise
<
void
>
{
await
Promise
.
all
(
this
.
environmentServiceList
.
map
(
env
=>
env
.
init
()));
for
(
const
environmentService
of
this
.
environmentServiceList
)
{
const
runnerSettings
:
RunnerSettings
=
new
RunnerSettings
();
...
...
@@ -497,9 +497,10 @@ class TrialDispatcher implements TrainingService {
liveEnvironmentsCount
++
;
if
(
environment
.
status
===
"
RUNNING
"
&&
environment
.
isRunnerReady
)
{
// if environment is not reusable and used, stop and not count as idle;
const
reuseMode
=
Array
.
isArray
(
this
.
config
.
trainingService
)
||
(
this
.
config
.
trainingService
as
any
).
reuseMode
;
if
(
0
===
environment
.
runningTrialCount
&&
!
(
this
.
config
as
any
).
reuseMod
e
&&
reuseMode
===
fals
e
&&
environment
.
assignedTrialCount
>
0
)
{
if
(
environment
.
environmentService
===
undefined
)
{
...
...
ts/webui/src/App.tsx
View file @
063d6b74
...
...
@@ -237,7 +237,7 @@ class App extends React.Component<{}, AppState> {
}
// experiment status and /trial-jobs api's status could decide website update
if
([
'
DONE
'
,
'
ERROR
'
,
'
STOPPED
'
].
includes
(
EXPERIMENT
.
status
)
||
TRIALS
.
jobListError
())
{
if
([
'
DONE
'
,
'
ERROR
'
,
'
STOPPED
'
,
'
VIEWED
'
].
includes
(
EXPERIMENT
.
status
)
||
TRIALS
.
jobListError
())
{
// experiment finished, refresh once more to ensure consistency
this
.
setState
(()
=>
({
interval
:
0
,
isUpdate
:
false
}));
return
;
...
...
ts/webui/src/components/modals/ExperimentSummaryPanel.tsx
View file @
063d6b74
...
...
@@ -54,7 +54,7 @@ class ExperimentSummaryPanel extends React.Component<ExpDrawerProps, ExpDrawerSt
this
.
setState
({
experiment
:
JSON
.
stringify
(
result
,
null
,
4
)
});
}
if
([
'
DONE
'
,
'
ERROR
'
,
'
STOPPED
'
].
includes
(
EXPERIMENT
.
status
))
{
if
([
'
DONE
'
,
'
ERROR
'
,
'
STOPPED
'
,
'
VIEWED
'
].
includes
(
EXPERIMENT
.
status
))
{
if
(
this
.
refreshId
!==
null
||
this
.
refreshId
!==
undefined
)
{
window
.
clearInterval
(
this
.
refreshId
);
}
...
...
ts/webui/src/components/overview/count/EditExperimentParam.tsx
View file @
063d6b74
...
...
@@ -30,6 +30,7 @@ export const EditExperimentParam = (): any => {
const
{
title
,
field
,
editType
,
maxExecDuration
,
maxTrialNum
,
trialConcurrency
,
updateOverviewPage
}
=
useContext
(
EditExpeParamContext
);
const
originMaxDurationStr
=
EXPERIMENT
.
profile
.
params
.
maxExperimentDuration
;
const
{
maxDurationUnit
,
changeMaxDurationUnit
}
=
useContext
(
AppContext
);
const
[
unit
,
setUnit
]
=
useState
(
maxDurationUnit
);
let
defaultVal
=
''
;
...
...
@@ -101,13 +102,7 @@ export const EditExperimentParam = (): any => {
}
if
(
isMaxDuration
)
{
const
maxDura
=
JSON
.
parse
(
editInputVal
);
if
(
unit
===
'
m
'
)
{
newProfile
.
params
[
field
]
=
maxDura
*
60
;
}
else
if
(
unit
===
'
h
'
)
{
newProfile
.
params
[
field
]
=
maxDura
*
3600
;
}
else
{
newProfile
.
params
[
field
]
=
maxDura
*
24
*
60
*
60
;
}
newProfile
.
params
[
field
]
=
`
${
maxDura
}${
unit
}
`
;
}
else
{
newProfile
.
params
[
field
]
=
parseInt
(
editInputVal
,
10
);
}
...
...
@@ -118,8 +113,11 @@ export const EditExperimentParam = (): any => {
params
:
{
update_type
:
editType
}
});
if
(
res
.
status
===
200
)
{
if
(
isMaxDuration
)
{
changeMaxDurationUnit
(
unit
);
}
showMessageInfo
(
`Successfully updated experiment's
${
field
}
`
,
'
success
'
);
changeMaxDurationUnit
(
unit
);
updateOverviewPage
(
);
}
}
catch
(
error
)
{
if
(
error
.
response
&&
error
.
response
.
data
.
error
)
{
...
...
@@ -132,9 +130,14 @@ export const EditExperimentParam = (): any => {
showMessageInfo
(
`Failed to update trial
${
field
}
\nUnknown error`
,
'
error
'
);
}
setEditValInput
(
defaultVal
);
// confirm trial config panel val
if
(
isMaxDuration
)
{
newProfile
.
params
[
field
]
=
originMaxDurationStr
;
}
else
{
newProfile
.
params
[
field
]
=
beforeParam
;
}
}
showPencil
();
updateOverviewPage
();
}
function
cancelEdit
():
void
{
...
...
@@ -162,7 +165,7 @@ export const EditExperimentParam = (): any => {
<
EditExpeParamContext
.
Consumer
>
{
(
value
):
React
.
ReactNode
=>
{
let
editClassName
=
''
;
if
(
value
.
field
===
'
maxEx
ec
Duration
'
)
{
if
(
value
.
field
===
'
maxEx
periment
Duration
'
)
{
editClassName
=
isShowPencil
?
'
noEditDuration
'
:
'
editDuration
'
;
}
return
(
...
...
ts/webui/src/components/overview/count/ExpDuration.tsx
View file @
063d6b74
...
...
@@ -50,7 +50,7 @@ export const ExpDuration = (): any => (
<
EditExpeParamContext
.
Provider
value
=
{
{
editType
:
CONTROLTYPE
[
0
],
field
:
'
maxEx
ec
Duration
'
,
field
:
'
maxEx
periment
Duration
'
,
title
:
'
Max duration
'
,
maxExecDuration
:
maxExecDurationStr
,
maxTrialNum
:
EXPERIMENT
.
maxTrialNumber
,
...
...
ts/webui/src/components/overview/count/TrialCount.tsx
View file @
063d6b74
...
...
@@ -89,7 +89,7 @@ export const TrialCount = (): any => {
<
EditExpeParamContext
.
Provider
value
=
{
{
title
:
MAX_TRIAL_NUMBERS
,
field
:
'
maxTrialNum
'
,
field
:
'
maxTrialNum
ber
'
,
editType
:
CONTROLTYPE
[
1
],
maxExecDuration
:
''
,
maxTrialNum
:
EXPERIMENT
.
maxTrialNumber
,
...
...
ts/webui/src/components/slideNav/TrialConfigPanel.tsx
View file @
063d6b74
...
...
@@ -3,8 +3,7 @@ import { Stack, Panel, PrimaryButton } from '@fluentui/react';
import
{
EXPERIMENT
}
from
'
../../static/datamodel
'
;
import
MonacoEditor
from
'
react-monaco-editor
'
;
import
{
MONACO
}
from
'
../../static/const
'
;
import
{
AppContext
}
from
'
../../App
'
;
import
{
convertDuration
,
convertTimeAsUnit
,
caclMonacoEditorHeight
}
from
'
../../static/function
'
;
import
{
convertDuration
,
caclMonacoEditorHeight
}
from
'
../../static/function
'
;
import
{
prettyStringify
}
from
'
../../static/json_util
'
;
import
lodash
from
'
lodash
'
;
import
'
../../static/style/logDrawer.scss
'
;
...
...
@@ -69,56 +68,45 @@ class TrialConfigPanel extends React.Component<LogDrawerProps, LogDrawerState> {
const
prettyWidth
=
innerWidth
>
1400
?
100
:
60
;
const
showProfile
=
JSON
.
stringify
(
profile
,
filter
,
2
);
return
(
<
AppContext
.
Consumer
>
{
(
value
):
React
.
ReactNode
=>
{
const
unit
=
value
.
maxDurationUnit
;
profile
.
params
.
maxExecDuration
=
`
${
convertTimeAsUnit
(
unit
,
profile
.
params
.
maxExecDuration
)}${
unit
}
`
;
const
showProfile
=
JSON
.
stringify
(
profile
,
filter
,
2
);
return
(
<
Stack
>
<
Panel
isOpen
=
{
true
}
hasCloseButton
=
{
false
}
isFooterAtBottom
=
{
true
}
isLightDismiss
=
{
true
}
onLightDismissClick
=
{
hideConfigPanel
}
>
<
div
className
=
'panel'
>
{
panelName
===
'
search space
'
?
(
<
div
>
<
div
className
=
'panelName'
>
Search space
</
div
>
<
MonacoEditor
height
=
{
monacoEditorHeight
}
language
=
'json'
theme
=
'vs-light'
value
=
{
prettyStringify
(
EXPERIMENT
.
searchSpace
,
prettyWidth
,
2
)
}
options
=
{
MONACO
}
/>
</
div
>
)
:
(
<
div
className
=
'profile'
>
<
div
className
=
'panelName'
>
Config
</
div
>
<
MonacoEditor
width
=
'100%'
height
=
{
monacoEditorHeight
}
language
=
'json'
theme
=
'vs-light'
value
=
{
showProfile
}
options
=
{
MONACO
}
/>
</
div
>
)
}
<
PrimaryButton
text
=
'Close'
className
=
'configClose'
onClick
=
{
hideConfigPanel
}
/>
</
div
>
</
Panel
>
</
Stack
>
);
}
}
</
AppContext
.
Consumer
>
<
Stack
>
<
Panel
isOpen
=
{
true
}
hasCloseButton
=
{
false
}
isFooterAtBottom
=
{
true
}
isLightDismiss
=
{
true
}
onLightDismissClick
=
{
hideConfigPanel
}
>
<
div
className
=
'panel'
>
{
panelName
===
'
search space
'
?
(
<
div
>
<
div
className
=
'panelName'
>
Search space
</
div
>
<
MonacoEditor
height
=
{
monacoEditorHeight
}
language
=
'json'
theme
=
'vs-light'
value
=
{
prettyStringify
(
EXPERIMENT
.
searchSpace
,
prettyWidth
,
2
)
}
options
=
{
MONACO
}
/>
</
div
>
)
:
(
<
div
className
=
'profile'
>
<
div
className
=
'panelName'
>
Config
</
div
>
<
MonacoEditor
width
=
'100%'
height
=
{
monacoEditorHeight
}
language
=
'json'
theme
=
'vs-light'
value
=
{
showProfile
}
options
=
{
MONACO
}
/>
</
div
>
)
}
<
PrimaryButton
text
=
'Close'
className
=
'configClose'
onClick
=
{
hideConfigPanel
}
/>
</
div
>
</
Panel
>
</
Stack
>
);
}
}
...
...
Prev
1
2
3
4
5
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment