Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
87ed70cd
Unverified
Commit
87ed70cd
authored
Aug 24, 2018
by
fishyds
Committed by
GitHub
Aug 24, 2018
Browse files
Merge pull request #4 from Microsoft/merge-from-dogfood-v1-0824
[Code merge] Merge code from dogfood-v1 branch
parents
f1f6f880
61d47a4d
Changes
43
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
342 additions
and
189 deletions
+342
-189
src/nni_manager/training_service/local/gpuScheduler.ts
src/nni_manager/training_service/local/gpuScheduler.ts
+40
-16
src/nni_manager/training_service/local/localTrainingService.ts
...ni_manager/training_service/local/localTrainingService.ts
+24
-14
src/nni_manager/training_service/local/localTrainingServiceForGPU.ts
...ager/training_service/local/localTrainingServiceForGPU.ts
+10
-12
src/nni_manager/training_service/remote_machine/metricsCollector.ts
...nager/training_service/remote_machine/metricsCollector.ts
+15
-3
src/nni_manager/training_service/remote_machine/remoteMachineData.ts
...ager/training_service/remote_machine/remoteMachineData.ts
+0
-24
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
...ng_service/remote_machine/remoteMachineTrainingService.ts
+28
-25
src/nni_manager/training_service/remote_machine/sshClientUtility.ts
...nager/training_service/remote_machine/sshClientUtility.ts
+36
-18
src/nni_manager/training_service/test/remoteMachineTrainingService.test.ts
...raining_service/test/remoteMachineTrainingService.test.ts
+5
-5
src/nni_manager/training_service/test/sshClientUtility.test.ts
...ni_manager/training_service/test/sshClientUtility.test.ts
+113
-0
src/nni_manager/types/node-nvidia-smi/index.d.ts
src/nni_manager/types/node-nvidia-smi/index.d.ts
+10
-8
src/sdk/pynni/nni/protocol.py
src/sdk/pynni/nni/protocol.py
+7
-1
src/sdk/pynni/nni/tuner.py
src/sdk/pynni/nni/tuner.py
+2
-0
src/webui/README.md
src/webui/README.md
+2
-4
src/webui/public/index.html
src/webui/public/index.html
+0
-16
src/webui/src/App.css
src/webui/src/App.css
+4
-0
src/webui/src/App.tsx
src/webui/src/App.tsx
+2
-1
src/webui/src/components/Control.tsx
src/webui/src/components/Control.tsx
+3
-2
src/webui/src/components/Sessionpro.tsx
src/webui/src/components/Sessionpro.tsx
+27
-26
tools/nnicmd/constants.py
tools/nnicmd/constants.py
+1
-1
tools/nnicmd/launcher.py
tools/nnicmd/launcher.py
+13
-13
No files found.
src/nni_manager/training_service/local/gpuScheduler.ts
View file @
87ed70cd
...
...
@@ -19,9 +19,11 @@
'
use strict
'
;
import
*
as
assert
from
'
assert
'
;
import
*
as
nodeNvidiaSmi
from
'
node-nvidia-smi
'
;
import
{
delay
}
from
'
../../common/utils
'
;
import
{
GPUInfo
,
GPUSummary
}
from
'
../common/gpuData
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
/* Example of nvidia-smi result
{
...
...
@@ -287,9 +289,13 @@ class GPUScheduler {
private
gpuSummary
!
:
GPUSummary
;
private
stopping
:
boolean
;
private
log
:
Logger
;
private
nvdmNotFoundRegex
:
RegExp
;
/**
 * Set up the scheduler's initial state: not stopping, a logger instance, and
 * the pattern used to recognize a "nvidia-smi: not found" error message.
 */
constructor() {
    this.stopping = false;
    this.log = getLogger();
    // Deliberately no `g` flag: a global regex remembers `lastIndex` between
    // `test()` calls, so re-testing the same message on this shared instance
    // could report a false negative.
    this.nvdmNotFoundRegex = /nvidia-smi: not found/i;
}
public
async
run
():
Promise
<
void
>
{
...
...
@@ -297,7 +303,11 @@ class GPUScheduler {
try
{
this
.
gpuSummary
=
await
this
.
readGPUSummary
();
}
catch
(
error
)
{
console
.
error
(
'
Read GPU summary failed with error
'
,
error
);
this
.
log
.
error
(
'
Read GPU summary failed with error:
'
,
error
);
// If the nvidia-smi command is not found, break the GPU summary reading loop to avoid unnecessary periodic checking
if
(
this
.
nvdmNotFoundRegex
.
test
(
error
))
{
break
;
}
}
await
delay
(
5000
);
}
...
...
@@ -315,28 +325,42 @@ class GPUScheduler {
this
.
stopping
=
true
;
}
/**
 * Convert the `gpu` section of a parsed nvidia-smi report into a GPUInfo list.
 *
 * The `gpu` field is declared as either a single EmbededGPUSummary or an array
 * of them, so its runtime shape is checked directly rather than being derived
 * from the `attached_gpus` count.
 *
 * @param data parsed nvidia-smi output
 * @returns one GPUInfo per entry found in `data.nvidia_smi_log.gpu`
 * @throws AssertionError when `attached_gpus` does not parse to a positive number
 */
private generateEmbededGPUSummary(data: nodeNvidiaSmi.GPUInfo): GPUInfo[] {
    const gpuNumber: number = parseInt(data.nvidia_smi_log.attached_gpus, 10);
    // Also rejects NaN, since `NaN > 0` is false
    assert(gpuNumber > 0);

    const gpu: nodeNvidiaSmi.EmbededGPUSummary | nodeNvidiaSmi.EmbededGPUSummary[] = data.nvidia_smi_log.gpu;
    const embededGPUSummaries: nodeNvidiaSmi.EmbededGPUSummary[] = Array.isArray(gpu) ? gpu : [gpu];

    return embededGPUSummaries.map(
        (embededGPUSummary: nodeNvidiaSmi.EmbededGPUSummary): GPUInfo =>
            this.convertGPUSummaryToInfo(embededGPUSummary)
    );
}
/**
 * Build a GPUInfo from one per-GPU entry of the nvidia-smi report.
 *
 * The numeric strings of the entry are parsed before being handed to the
 * GPUInfo constructor; the first constructor argument is 1 when the entry's
 * `process` field is an object and 0 otherwise.
 *
 * @param embededGPUSummary a single GPU entry from the nvidia-smi report
 * @returns the GPUInfo built from that entry
 */
private convertGPUSummaryToInfo(embededGPUSummary: nodeNvidiaSmi.EmbededGPUSummary): GPUInfo {
    const processFlag: number = typeof embededGPUSummary.process === 'object' ? 1 : 0;
    const memoryUtil: number = parseFloat(embededGPUSummary.utilization.memory_util);
    const gpuUtil: number = parseFloat(embededGPUSummary.utilization.gpu_util);
    const minorNumber: number = parseInt(embededGPUSummary.minor_number, 10);

    return new GPUInfo(processFlag, memoryUtil, gpuUtil, minorNumber);
}
private
readGPUSummary
():
Promise
<
GPUSummary
>
{
return
new
Promise
((
resolve
:
Function
,
reject
:
Function
):
void
=>
{
nodeNvidiaSmi
((
error
:
Error
,
data
:
nodeNvidiaSmi
.
GPUInfo
)
=>
{
if
(
error
!==
undefined
)
{
if
(
error
)
{
reject
(
error
);
}
else
{
const
gpuNumber
:
number
=
parseInt
(
data
.
nvidia_smi_log
.
attached_gpus
,
10
);
const
gpuSummary
:
GPUSummary
=
new
GPUSummary
(
parseInt
(
data
.
nvidia_smi_log
.
attached_gpus
,
10
)
,
gpuNumber
,
Date
().
toString
(),
data
.
nvidia_smi_log
.
gpu
.
map
((
gpuInfo
:
{
minor_number
:
string
;
utilization
:
{
gpu_util
:
string
;
memory_util
:
string
;
};
process
:
string
|
object
;
})
=>
new
GPUInfo
(
typeof
gpuInfo
.
process
===
'
object
'
?
1
:
0
,
parseFloat
(
gpuInfo
.
utilization
.
memory_util
),
parseFloat
(
gpuInfo
.
utilization
.
gpu_util
),
parseInt
(
gpuInfo
.
minor_number
,
10
)
))
this
.
generateEmbededGPUSummary
(
data
)
);
resolve
(
gpuSummary
);
}
...
...
src/nni_manager/training_service/local/localTrainingService.ts
View file @
87ed70cd
...
...
@@ -27,6 +27,8 @@ import * as path from 'path';
import
*
as
ts
from
'
tail-stream
'
;
import
{
NNIError
,
NNIErrorNames
}
from
'
../../common/errors
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
TrialConfig
}
from
'
../common/trialConfig
'
;
import
{
TrialConfigMetadataKey
}
from
'
../common/trialConfigMetadataKey
'
;
import
{
HostJobApplicationForm
,
JobApplicationForm
,
TrainingService
,
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobMetric
,
TrialJobStatus
...
...
@@ -92,9 +94,8 @@ class LocalTrainingService implements TrainingService {
private
initialized
:
boolean
;
private
stopping
:
boolean
;
private
rootDir
!
:
string
;
private
codeDir
!
:
string
;
private
command
!
:
string
;
private
log
:
Logger
;
protected
log
:
Logger
;
protected
localTrailConfig
?:
TrialConfig
;
constructor
()
{
this
.
eventEmitter
=
new
EventEmitter
();
...
...
@@ -227,11 +228,12 @@ class LocalTrainingService implements TrainingService {
this
.
initialized
=
true
;
}
switch
(
key
)
{
case
'
codeDir
'
:
this
.
codeDir
=
value
;
break
;
case
'
command
'
:
this
.
command
=
value
;
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
this
.
localTrailConfig
=
<
TrialConfig
>
JSON
.
parse
(
value
);
// Parse trial config failed, throw Error
if
(
!
this
.
localTrailConfig
)
{
throw
new
Error
(
'
trial config parsed failed
'
);
}
break
;
default
:
}
...
...
@@ -239,10 +241,14 @@ class LocalTrainingService implements TrainingService {
public
getClusterMetadata
(
key
:
string
):
Promise
<
string
>
{
switch
(
key
)
{
case
'
codeDir
'
:
return
Promise
.
resolve
(
this
.
codeDir
);
case
'
command
'
:
return
Promise
.
resolve
(
this
.
command
);
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
let
getResult
:
Promise
<
string
>
;
if
(
!
this
.
localTrailConfig
)
{
getResult
=
Promise
.
reject
(
new
NNIError
(
NNIErrorNames
.
NOT_FOUND
,
`
${
key
}
is never set yet`
));
}
else
{
getResult
=
Promise
.
resolve
(
!
this
.
localTrailConfig
?
''
:
JSON
.
stringify
(
this
.
localTrailConfig
));
}
return
getResult
;
default
:
return
Promise
.
reject
(
new
NNIError
(
NNIErrorNames
.
NOT_FOUND
,
'
Key not found
'
));
}
...
...
@@ -292,14 +298,18 @@ class LocalTrainingService implements TrainingService {
const
variables
:
{
key
:
string
;
value
:
string
}[]
=
this
.
getEnvironmentVariables
(
trialJobDetail
,
resource
);
const
runScriptLines
:
string
[]
=
[];
if
(
!
this
.
localTrailConfig
)
{
throw
new
Error
(
'
trial config is not initialized
'
);
}
runScriptLines
.
push
(
'
#!/bin/bash
'
,
`cd
${
this
.
codeDir
}
`
);
`cd
${
this
.
localTrailConfig
.
codeDir
}
`
);
for
(
const
variable
of
variables
)
{
runScriptLines
.
push
(
`export
${
variable
.
key
}
=
${
variable
.
value
}
`
);
}
runScriptLines
.
push
(
`eval
${
this
.
command
}
2>
${
path
.
join
(
trialJobDetail
.
workingDirectory
,
'
.nni
'
,
'
stderr
'
)}
`
,
`eval
${
this
.
localTrailConfig
.
command
}
2>
${
path
.
join
(
trialJobDetail
.
workingDirectory
,
'
.nni
'
,
'
stderr
'
)}
`
,
`echo $?
\`
date +%s%3N
\`
>
${
path
.
join
(
trialJobDetail
.
workingDirectory
,
'
.nni
'
,
'
state
'
)}
`
);
await
cpp
.
exec
(
`mkdir -p
${
trialJobDetail
.
workingDirectory
}
`
);
...
...
src/nni_manager/training_service/local/localTrainingServiceForGPU.ts
View file @
87ed70cd
...
...
@@ -22,6 +22,7 @@
import
{
TrialJobDetail
,
TrialJobStatus
}
from
'
../../common/trainingService
'
;
import
{
GPUScheduler
}
from
'
./gpuScheduler
'
;
import
{
LocalTrainingService
}
from
'
./localTrainingService
'
;
import
{
TrialConfigMetadataKey
}
from
'
../common/trialConfigMetadataKey
'
;
/** A TrialJobDetail that additionally carries the list of GPU indices associated with the job. */
type
LocalTrialJobDetailForGPU
=
TrialJobDetail
&
{
gpuIndices
:
number
[]
};
...
...
@@ -52,8 +53,14 @@ class LocalTrainingServiceForGPU extends LocalTrainingService {
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
await
super
.
setClusterMetadata
(
key
,
value
);
switch
(
key
)
{
case
'
requiredGPUNum
'
:
this
.
requiredGPUNum
=
parseInt
(
value
,
10
);
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
if
(
this
.
localTrailConfig
!==
undefined
)
{
this
.
requiredGPUNum
=
this
.
localTrailConfig
.
gpuNum
;
}
else
{
// If no valid trial config is initialized, set requiredGPUNum to 0 as fallback value.
this
.
requiredGPUNum
=
0
;
}
this
.
log
.
info
(
'
required GPU number is
'
+
this
.
requiredGPUNum
);
if
(
this
.
gpuScheduler
===
undefined
)
{
this
.
gpuScheduler
=
new
GPUScheduler
();
}
...
...
@@ -62,15 +69,6 @@ class LocalTrainingServiceForGPU extends LocalTrainingService {
}
}
public
getClusterMetadata
(
key
:
string
):
Promise
<
string
>
{
switch
(
key
)
{
case
'
requiredGPUNum
'
:
return
Promise
.
resolve
(
`
${
this
.
requiredGPUNum
}
`
);
default
:
return
super
.
getClusterMetadata
(
key
);
}
}
public
cleanUp
():
Promise
<
void
>
{
if
(
this
.
gpuScheduler
!==
undefined
)
{
this
.
gpuScheduler
.
stop
();
...
...
@@ -80,7 +78,7 @@ class LocalTrainingServiceForGPU extends LocalTrainingService {
}
protected
onTrialJobStatusChanged
(
trialJob
:
LocalTrialJobDetailForGPU
,
oldStatus
:
TrialJobStatus
):
void
{
if
(
trialJob
.
gpuIndices
.
length
!==
0
)
{
if
(
trialJob
.
gpuIndices
!==
undefined
&&
trialJob
.
gpuIndices
.
length
!==
0
)
{
if
(
oldStatus
===
'
RUNNING
'
&&
trialJob
.
status
!==
'
RUNNING
'
)
{
for
(
const
index
of
trialJob
.
gpuIndices
)
{
this
.
availableGPUIndices
[
index
]
=
false
;
...
...
src/nni_manager/training_service/remote_machine/metricsCollector.ts
View file @
87ed70cd
...
...
@@ -24,7 +24,7 @@ import { EventEmitter } from 'events';
import
*
as
path
from
'
path
'
;
import
{
Client
}
from
'
ssh2
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
TrialJobStatus
}
from
'
../../common/trainingService
'
;
import
{
TrialJobStatus
,
TrialJobDetail
}
from
'
../../common/trainingService
'
;
import
{
JobMetrics
,
RemoteCommandResult
,
RemoteMachineMeta
,
RemoteMachineTrialJobDetail
}
from
'
./remoteMachineData
'
;
import
{
SSHClientUtility
}
from
'
./sshClientUtility
'
;
...
...
@@ -56,8 +56,12 @@ export class MetricsCollector {
if
(
rmMetrics
!==
undefined
&&
rmMetrics
.
length
>
0
)
{
rmMetrics
.
forEach
((
jobMetrics
)
=>
{
const
trialJobId
:
string
=
jobMetrics
.
jobId
;
const
trialJobDetail
:
RemoteMachineTrialJobDetail
=
<
RemoteMachineTrialJobDetail
>
this
.
trialJobsMap
.
get
(
trialJobId
);
assert
(
trialJobDetail
);
// If the job is no longer running, remove its GPU reservation
if
(
!
[
'
RUNNING
'
].
includes
(
jobMetrics
.
jobStatus
))
{
trialJobDetail
.
status
=
jobMetrics
.
jobStatus
;
this
.
log
.
info
(
`Set trialjob
${
trialJobDetail
.
id
}
status to
${
trialJobDetail
.
status
}
`
);
runningJobsMap
.
forEach
((
jobIds
:
string
[],
rmMeta
:
RemoteMachineMeta
)
=>
{
// If the remote machine has no GPU, gpuReservation is not initialized, so check whether it is undefined
if
(
rmMeta
.
gpuReservation
!==
undefined
)
{
...
...
@@ -81,11 +85,19 @@ export class MetricsCollector {
if
(
status
.
includes
(
trialJob
.
status
))
{
if
(
map
.
has
(
trialJob
.
rmMeta
))
{
const
ids
=
map
.
get
(
trialJob
.
rmMeta
);
if
(
ids
!==
undefined
)
{
if
(
ids
!==
undefined
&&
!
ids
.
includes
(
id
)
)
{
ids
.
push
(
id
);
}
}
else
{
map
.
set
(
trialJob
.
rmMeta
,
[
id
]);
let
initJobIds
:
string
[]
=
[
id
];
// If the remote machine has jobs reserve GPU, also put that jobs into list to get metrics data
if
(
trialJob
.
rmMeta
.
gpuReservation
!==
undefined
)
{
const
concatJobIds
:
string
[]
=
initJobIds
.
concat
(
Array
.
from
(
trialJob
.
rmMeta
.
gpuReservation
.
values
()));
initJobIds
=
concatJobIds
.
filter
((
item
,
pos
)
=>
concatJobIds
.
indexOf
(
item
)
===
pos
);
}
map
.
set
(
trialJob
.
rmMeta
,
initJobIds
);
}
}
});
...
...
src/nni_manager/training_service/remote_machine/remoteMachineData.ts
View file @
87ed70cd
...
...
@@ -23,15 +23,6 @@ import { Client } from 'ssh2';
import
{
JobApplicationForm
,
TrialJobDetail
,
TrialJobStatus
}
from
'
../../common/trainingService
'
;
import
{
GPUSummary
}
from
'
../common/gpuData
'
;
/**
* Enum of key for remote machine metadata for configuration
*/
export
enum
RemoteMachineMetadataKey
{
MACHINE_LIST
=
'
machine_list
'
,
TRIAL_CONFIG
=
'
trial_config
'
,
EXPERIMENT_ID
=
'
experimentId
'
,
RANDOM_SCHEDULER
=
'
random_scheduler
'
}
/**
* Metadata of remote machine for configuration and status query
...
...
@@ -54,21 +45,6 @@ export class RemoteMachineMeta {
}
}
/**
* Configuration for trial job on remote machine
*/
export
class
RemoteMachineTrialConfig
{
public
readonly
command
:
string
;
public
readonly
codeDir
:
string
;
public
readonly
gpuNum
:
number
;
constructor
(
command
:
string
,
codeDir
:
string
,
gpuNum
:
number
)
{
this
.
command
=
command
;
this
.
codeDir
=
codeDir
;
this
.
gpuNum
=
gpuNum
;
}
}
/**
* The execution result for command executed on remote machine
*/
...
...
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
View file @
87ed70cd
...
...
@@ -37,12 +37,14 @@ import {
}
from
'
../../common/trainingService
'
;
import
{
delay
,
getExperimentRootDir
,
uniqueString
}
from
'
../../common/utils
'
;
import
{
GPUSummary
}
from
'
../common/gpuData
'
;
import
{
TrialConfig
}
from
'
../common/trialConfig
'
;
import
{
TrialConfigMetadataKey
}
from
'
../common/trialConfigMetadataKey
'
;
import
{
GPUScheduler
}
from
'
./gpuScheduler
'
;
import
{
MetricsCollector
}
from
'
./metricsCollector
'
;
import
{
HOSTJOBSHELLFORMAT
,
RemoteCommandResult
,
RemoteMachineMeta
,
RemoteMachineMetadataKey
,
REMOTEMACHINERUNSHELLFORMAT
,
RemoteMachineScheduleInfo
,
RemoteMachineScheduleResult
,
RemoteMachineTrialConfig
,
RemoteMachineTrialJobDetail
,
ScheduleResultType
HOSTJOBSHELLFORMAT
,
RemoteCommandResult
,
RemoteMachineMeta
,
REMOTEMACHINERUNSHELLFORMAT
,
RemoteMachineScheduleInfo
,
RemoteMachineScheduleResult
,
RemoteMachineTrialJobDetail
,
ScheduleResultType
}
from
'
./remoteMachineData
'
;
import
{
SSHClientUtility
}
from
'
./sshClientUtility
'
;
...
...
@@ -56,7 +58,7 @@ class RemoteMachineTrainingService implements TrainingService {
// Experiment root directory
private
expRootDir
:
string
;
private
remoteExpRootDir
:
string
;
private
trialConfig
:
RemoteMachine
TrialConfig
|
undefined
;
private
trialConfig
:
TrialConfig
|
undefined
;
private
gpuScheduler
:
GPUScheduler
;
private
jobQueue
:
string
[];
private
timer
:
ObservableTimer
;
...
...
@@ -89,11 +91,11 @@ class RemoteMachineTrainingService implements TrainingService {
// Remove trial job with trialJobId from job queue
this
.
jobQueue
.
shift
();
}
else
{
// Break the while loop since no GPU resource is available right now,
// Break the while loop since no GPU resource is available right now,
// Wait to schedule job in next time iteration
break
;
}
}
;
}
const
metricsCollector
:
MetricsCollector
=
new
MetricsCollector
(
this
.
machineSSHClientMap
,
this
.
trialJobsMap
,
this
.
remoteExpRootDir
,
this
.
metricsEmitter
);
await
metricsCollector
.
collectMetrics
();
...
...
@@ -186,6 +188,7 @@ class RemoteMachineTrainingService implements TrainingService {
form
);
this
.
jobQueue
.
push
(
trialJobId
);
this
.
trialJobsMap
.
set
(
trialJobId
,
trialJobDetail
);
return
Promise
.
resolve
(
trialJobDetail
);
}
else
{
return
Promise
.
reject
(
new
Error
(
`Job form not supported:
${
JSON
.
stringify
(
form
)}
, jobType should be HOST or TRIAL.`
));
...
...
@@ -207,7 +210,7 @@ class RemoteMachineTrainingService implements TrainingService {
// Remove the job with trialJobId from job queue
const
index
:
number
=
this
.
jobQueue
.
indexOf
(
trialJobId
);
if
(
index
>=
0
)
{
if
(
index
>=
0
)
{
this
.
jobQueue
.
splice
(
index
,
1
);
}
...
...
@@ -243,11 +246,11 @@ class RemoteMachineTrainingService implements TrainingService {
*/
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
switch
(
key
)
{
case
RemoteMachine
MetadataKey
.
MACHINE_LIST
:
case
TrialConfig
MetadataKey
.
MACHINE_LIST
:
await
this
.
setupConnections
(
value
);
break
;
case
RemoteMachine
MetadataKey
.
TRIAL_CONFIG
:
const
remoteMachineTrailConfig
:
RemoteMachine
TrialConfig
=
<
RemoteMachine
TrialConfig
>
JSON
.
parse
(
value
);
case
TrialConfig
MetadataKey
.
TRIAL_CONFIG
:
const
remoteMachineTrailConfig
:
TrialConfig
=
<
TrialConfig
>
JSON
.
parse
(
value
);
// Parse trial config failed, throw Error
if
(
!
remoteMachineTrailConfig
)
{
throw
new
Error
(
'
trial config parsed failed
'
);
...
...
@@ -294,14 +297,14 @@ class RemoteMachineTrainingService implements TrainingService {
deferred
.
resolve
();
}
}).
on
(
'
error
'
,
(
err
:
Error
)
=>
{
// SSH connection error, reject with error message
deferred
.
reject
(
new
Error
(
err
.
message
));
}).
connect
({
host
:
rmMeta
.
ip
,
port
:
rmMeta
.
port
,
username
:
rmMeta
.
username
,
password
:
rmMeta
.
passwd
});
// SSH connection error, reject with error message
deferred
.
reject
(
new
Error
(
err
.
message
));
}).
connect
({
host
:
rmMeta
.
ip
,
port
:
rmMeta
.
port
,
username
:
rmMeta
.
username
,
password
:
rmMeta
.
passwd
});
});
return
deferred
.
promise
;
...
...
@@ -312,16 +315,16 @@ class RemoteMachineTrainingService implements TrainingService {
//TO DO: Should we mk experiments rootDir here?
const
nniRootDir
:
string
=
'
/tmp/nni
'
;
await
SSHClientUtility
.
remoteExeCommand
(
`mkdir -p
${
this
.
remoteExpRootDir
}
`
,
conn
);
// Copy NNI scripts to remote experiment working directory
const
remoteScriptsDir
:
string
=
this
.
getRemoteScriptsPath
();
await
SSHClientUtility
.
remoteExeCommand
(
`mkdir -p
${
remoteScriptsDir
}
`
,
conn
);
await
SSHClientUtility
.
copyDirectoryToRemote
(
'
./scripts
'
,
remoteScriptsDir
,
conn
);
await
SSHClientUtility
.
remoteExeCommand
(
`chmod 777
${
nniRootDir
}
${
nniRootDir
}
/*
${
nniRootDir
}
/scripts/*`
,
conn
);
//Begin to execute gpu_metrics_collection scripts
SSHClientUtility
.
remoteExeCommand
(
`cd
${
remoteScriptsDir
}
&& python3 gpu_metrics_collector.py`
,
conn
);
this
.
timer
.
subscribe
(
async
(
tick
:
number
)
=>
{
const
cmdresult
:
RemoteCommandResult
=
await
SSHClientUtility
.
remoteExeCommand
(
...
...
@@ -351,7 +354,7 @@ class RemoteMachineTrainingService implements TrainingService {
this
.
log
.
error
(
errorMessage
);
deferred
.
reject
();
throw
new
NNIError
(
NNIErrorNames
.
RESOURCE_NOT_AVAILABLE
,
errorMessage
);
}
else
if
(
rmScheduleResult
.
resultType
==
ScheduleResultType
.
SUCCEED
}
else
if
(
rmScheduleResult
.
resultType
==
=
ScheduleResultType
.
SUCCEED
&&
rmScheduleResult
.
scheduleInfo
!==
undefined
)
{
const
rmScheduleInfo
:
RemoteMachineScheduleInfo
=
rmScheduleResult
.
scheduleInfo
;
const
trialWorkingFolder
:
string
=
path
.
join
(
this
.
remoteExpRootDir
,
'
trials
'
,
trialJobId
);
...
...
@@ -364,11 +367,11 @@ class RemoteMachineTrainingService implements TrainingService {
trialJobDetail
.
rmMeta
=
rmScheduleInfo
.
rmMeta
;
deferred
.
resolve
(
true
);
}
else
if
(
rmScheduleResult
.
resultType
==
ScheduleResultType
.
TMP_NO_AVAILABLE_GPU
)
{
}
else
if
(
rmScheduleResult
.
resultType
==
=
ScheduleResultType
.
TMP_NO_AVAILABLE_GPU
)
{
this
.
log
.
info
(
`Right now no available GPU can be allocated for trial
${
trialJobId
}
, will try to schedule later`
);
deferred
.
resolve
(
false
);
}
else
{
deferred
.
reject
(
'
Invalid schedule resutl type:
'
+
rmScheduleResult
.
resultType
);
deferred
.
reject
(
`
Invalid schedule resutl type:
${
rmScheduleResult
.
resultType
}
`
);
}
return
deferred
.
promise
;
...
...
@@ -394,7 +397,7 @@ class RemoteMachineTrainingService implements TrainingService {
trialWorkingFolder
,
trialJobId
,
path
.
join
(
trialWorkingFolder
,
'
.nni
'
,
'
jobpid
'
),
// Set CUDA_VISIBLE_DEVICES environment variable based on cuda_visible_device
// Set CUDA_VISIBLE_DEVICES environment variable based on cuda_visible_device
// If no valid cuda_visible_device is defined, set CUDA_VISIBLE_DEVICES to empty string to hide GPU device
(
typeof
cuda_visible_device
===
'
string
'
&&
cuda_visible_device
.
length
>
0
)
?
`CUDA_VISIBLE_DEVICES=
${
cuda_visible_device
}
`
:
`CUDA_VISIBLE_DEVICES=" " `
,
...
...
src/nni_manager/training_service/remote_machine/sshClientUtility.ts
View file @
87ed70cd
...
...
@@ -19,14 +19,16 @@
'
use strict
'
;
import
*
as
assert
from
'
assert
'
;
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
path
from
'
path
'
;
import
*
as
os
from
'
os
'
;
import
{
Client
,
ClientChannel
,
SFTPWrapper
}
from
'
ssh2
'
;
import
*
as
stream
from
"
stream
"
;
import
*
as
stream
from
'
stream
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
NNIError
,
NNIErrorNames
}
from
'
../../common/errors
'
;
import
{
getExperimentRootDir
}
from
'
../../common/utils
'
;
import
{
getLogger
}
from
'
../../common/log
'
;
import
{
uniqueString
}
from
'
../../common/utils
'
;
import
{
RemoteCommandResult
}
from
'
./remoteMachineData
'
;
/**
...
...
@@ -43,17 +45,18 @@ export namespace SSHClientUtility {
*/
export
async
function
copyDirectoryToRemote
(
localDirectory
:
string
,
remoteDirectory
:
string
,
sshClient
:
Client
)
:
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
localCompressedDir
:
string
=
path
.
join
(
getExperimentRootDir
(),
'
directory.tar.gz
'
);
const
remoteCompressedDir
:
string
=
path
.
join
(
remoteDirectory
,
'
directory.tar.gz
'
);
const
tmpTarName
:
string
=
`
${
uniqueString
(
10
)}
.tar.gz`
;
const
localTarPath
:
string
=
path
.
join
(
os
.
tmpdir
(),
tmpTarName
);
const
remoteTarPath
:
string
=
path
.
join
(
os
.
tmpdir
(),
tmpTarName
);
// Compress files in local directory to experiment root directory
await
cpp
.
exec
(
`tar -czf
${
local
CompressedDir
}
-C
${
localDirectory
}
.`
);
await
cpp
.
exec
(
`tar -czf
${
local
TarPath
}
-C
${
localDirectory
}
.`
);
// Copy the compressed file to remoteDirectory and delete it
await
copyFileToRemote
(
local
CompressedDir
,
remoteCompressedDir
,
sshClient
);
await
cpp
.
exec
(
`rm
${
local
CompressedDir
}
`
);
await
copyFileToRemote
(
local
TarPath
,
remoteTarPath
,
sshClient
);
await
cpp
.
exec
(
`rm
${
local
TarPath
}
`
);
// Decompress the remote compressed file in and delete it
await
remoteExeCommand
(
`tar -oxzf
${
remote
CompressedDir
}
-C
${
remoteDirectory
}
`
,
sshClient
);
await
remoteExeCommand
(
`rm
${
remote
CompressedDir
}
`
,
sshClient
);
await
remoteExeCommand
(
`tar -oxzf
${
remote
TarPath
}
-C
${
remoteDirectory
}
`
,
sshClient
);
await
remoteExeCommand
(
`rm
${
remote
TarPath
}
`
,
sshClient
);
deferred
.
resolve
();
return
deferred
.
promise
;
...
...
@@ -65,18 +68,23 @@ export namespace SSHClientUtility {
* @param remoteFilePath the target path in remote machine
* @param sshClient SSH Client
*/
export
function
copyFileToRemote
(
localFilePath
:
string
,
remoteFilePath
:
string
,
sshClient
:
Client
)
:
Promise
<
string
>
{
const
deferred
:
Deferred
<
string
>
=
new
Deferred
<
string
>
();
export
function
copyFileToRemote
(
localFilePath
:
string
,
remoteFilePath
:
string
,
sshClient
:
Client
)
:
Promise
<
boolean
>
{
assert
(
sshClient
!==
undefined
);
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
sshClient
.
sftp
((
err
:
Error
,
sftp
:
SFTPWrapper
)
=>
{
if
(
err
)
{
deferred
.
reject
();
getLogger
().
error
(
`copyFileToRemote:
${
err
.
message
}
,
${
localFilePath
}
,
${
remoteFilePath
}
`
);
deferred
.
reject
(
err
);
return
;
}
assert
(
sftp
!==
undefined
);
sftp
.
fastPut
(
localFilePath
,
remoteFilePath
,
(
fastPutErr
:
Error
)
=>
{
sftp
.
end
();
if
(
fastPutErr
)
{
deferred
.
reject
();
deferred
.
reject
(
fastPutErr
);
}
else
{
deferred
.
resolve
(
'
success
'
);
deferred
.
resolve
(
true
);
}
});
});
...
...
@@ -97,14 +105,16 @@ export namespace SSHClientUtility {
client
.
exec
(
command
,
(
err
:
Error
,
channel
:
ClientChannel
)
=>
{
if
(
err
)
{
getLogger
().
error
(
`remoteExeCommand:
${
err
.
message
}
`
);
deferred
.
reject
(
err
);
return
;
}
channel
.
on
(
'
data
'
,
function
(
data
:
any
,
dataStderr
:
any
)
{
channel
.
on
(
'
data
'
,
(
data
:
any
,
dataStderr
:
any
)
=>
{
if
(
dataStderr
)
{
stderr
+=
data
.
toString
();
}
else
{
}
else
{
stdout
+=
data
.
toString
();
}
}).
on
(
'
exit
'
,
(
code
,
signal
)
=>
{
...
...
@@ -124,7 +134,10 @@ export namespace SSHClientUtility {
const
deferred
:
Deferred
<
string
>
=
new
Deferred
<
string
>
();
sshClient
.
sftp
((
err
:
Error
,
sftp
:
SFTPWrapper
)
=>
{
if
(
err
)
{
getLogger
().
error
(
`getRemoteFileContent:
${
err
.
message
}
`
);
deferred
.
reject
(
new
Error
(
`SFTP error:
${
err
.
message
}
`
));
return
;
}
try
{
const
sftpStream
:
stream
.
Readable
=
sftp
.
createReadStream
(
filePath
);
...
...
@@ -133,11 +146,16 @@ export namespace SSHClientUtility {
sftpStream
.
on
(
'
data
'
,
(
data
:
Buffer
|
string
)
=>
{
dataBuffer
+=
data
;
}).
on
(
'
error
'
,
(
streamErr
:
Error
)
=>
{
sftp
.
end
();
deferred
.
reject
(
new
NNIError
(
NNIErrorNames
.
NOT_FOUND
,
streamErr
.
message
));
}).
on
(
'
end
'
,
()
=>
{
// sftp connection need to be released manually once operation is done
sftp
.
end
();
deferred
.
resolve
(
dataBuffer
);
});
}
catch
(
error
)
{
getLogger
().
error
(
`getRemoteFileContent:
${
error
.
message
}
`
);
sftp
.
end
();
deferred
.
reject
(
new
Error
(
`SFTP error:
${
error
.
message
}
`
));
}
});
...
...
src/nni_manager/training_service/test/remoteMachineTrainingService.test.ts
View file @
87ed70cd
...
...
@@ -27,7 +27,7 @@ import * as tmp from 'tmp';
import
*
as
component
from
'
../../common/component
'
;
import
{
TrialJobApplicationForm
,
TrialJobDetail
,
TrainingService
}
from
'
../../common/trainingService
'
;
import
{
cleanupUnitTest
,
delay
,
prepareUnitTest
}
from
'
../../common/utils
'
;
import
{
RemoteMachine
MetadataKey
}
from
'
../
remote_machine/remoteMachineData
'
;
import
{
TrialConfig
MetadataKey
}
from
'
../
common/trialConfigMetadataKey
'
;
import
{
RemoteMachineTrainingService
}
from
'
../remote_machine/remoteMachineTrainingService
'
;
// copy mockedTrail.py to local folder
...
...
@@ -95,9 +95,9 @@ describe('Unit Test for RemoteMachineTrainingService', () => {
if
(
skip
)
{
return
;
}
await
remoteMachineTrainingService
.
setClusterMetadata
(
RemoteMachine
MetadataKey
.
MACHINE_LIST
,
machineList
);
await
remoteMachineTrainingService
.
setClusterMetadata
(
TrialConfig
MetadataKey
.
MACHINE_LIST
,
machineList
);
await
remoteMachineTrainingService
.
setClusterMetadata
(
RemoteMachine
MetadataKey
.
TRIAL_CONFIG
,
`{"command":"sleep 1h && echo ","codeDir":"
${
localCodeDir
}
","gpuNum":1}`
);
TrialConfig
MetadataKey
.
TRIAL_CONFIG
,
`{"command":"sleep 1h && echo ","codeDir":"
${
localCodeDir
}
","gpuNum":1}`
);
const
form
:
TrialJobApplicationForm
=
{
jobType
:
'
TRIAL
'
,
hyperParameters
:
'
mock hyperparameters
'
...
...
@@ -126,11 +126,11 @@ describe('Unit Test for RemoteMachineTrainingService', () => {
return
;
}
// set machine list'
await
remoteMachineTrainingService
.
setClusterMetadata
(
RemoteMachine
MetadataKey
.
MACHINE_LIST
,
machineList
);
await
remoteMachineTrainingService
.
setClusterMetadata
(
TrialConfig
MetadataKey
.
MACHINE_LIST
,
machineList
);
// set meta data
const
trialConfig
:
string
=
`{\"command\":\"python3 mockedTrial.py\", \"codeDir\":\"
${
localCodeDir
}
\",\"gpuNum\":0}`
await
remoteMachineTrainingService
.
setClusterMetadata
(
RemoteMachine
MetadataKey
.
TRIAL_CONFIG
,
trialConfig
);
await
remoteMachineTrainingService
.
setClusterMetadata
(
TrialConfig
MetadataKey
.
TRIAL_CONFIG
,
trialConfig
);
// submit job
const
form
:
TrialJobApplicationForm
=
{
...
...
src/nni_manager/training_service/test/sshClientUtility.test.ts
0 → 100644
View file @
87ed70cd
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'
use strict
'
;
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
fs
from
'
fs
'
;
import
{
Client
}
from
'
ssh2
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
SSHClientUtility
}
from
'
../remote_machine/sshClientUtility
'
;
const
LOCALFILE
:
string
=
'
/tmp/sshclientUTData
'
;
const
REMOTEFILE
:
string
=
'
/tmp/sshclientUTData
'
;
/**
 * Upload LOCALFILE to REMOTEFILE over a new SFTP session opened on `conn`.
 *
 * @param conn SSH client used to open the SFTP session
 * @returns a promise that resolves when the upload completes and rejects with
 *          the SFTP or upload error otherwise
 */
async function copyFile(conn: Client): Promise<void> {
    const transferDone: Deferred<void> = new Deferred<void>();

    conn.sftp((sftpErr, session) => {
        if (sftpErr) {
            transferDone.reject(sftpErr);

            return;
        }

        session.fastPut(LOCALFILE, REMOTEFILE, (putErr: Error) => {
            // End the SFTP session whether or not the upload succeeded
            session.end();
            if (!putErr) {
                transferDone.resolve();

                return;
            }
            transferDone.reject(putErr);
        });
    });

    return transferDone.promise;
}
/**
 * Call SSHClientUtility.copyFileToRemote 500 times in sequence on `conn`,
 * logging the iteration number before each call.
 *
 * @param conn SSH client passed to every copy call
 */
async function copyFileToRemoteLoop(conn: Client): Promise<void> {
    let round: number = 0;
    while (round < 500) {
        console.log(round);
        await SSHClientUtility.copyFileToRemote(LOCALFILE, REMOTEFILE, conn);
        round += 1;
    }
}
/**
 * Run the `ls` command through SSHClientUtility.remoteExeCommand 500 times in
 * sequence on `conn`, logging the iteration number before each call.
 *
 * @param conn SSH client passed to every command execution
 */
async function remoteExeCommandLoop(conn: Client): Promise<void> {
    let round: number = 0;
    while (round < 500) {
        console.log(round);
        await SSHClientUtility.remoteExeCommand('ls', conn);
        round += 1;
    }
}
/**
 * Read REMOTEFILE through SSHClientUtility.getRemoteFileContent 500 times in
 * sequence on `conn`, logging the iteration number before each call.
 *
 * @param conn SSH client passed to every read
 */
async function getRemoteFileContentLoop(conn: Client): Promise<void> {
    let round: number = 0;
    while (round < 500) {
        console.log(round);
        await SSHClientUtility.getRemoteFileContent(REMOTEFILE, conn);
        round += 1;
    }
}
/**
 * Stress test for SSHClientUtility: once connected, runs several upload,
 * command-execution and file-read loops concurrently over one SSH connection.
 *
 * The test needs remote machine settings from ../../.vscode/rminfo.json and is
 * skipped when that file cannot be read or parsed.
 */
describe('sshClientUtility test', () => {
    // Enabled by default; the catch block below is the only place that disables the test.
    let skip: boolean = false;
    let rmMeta: any;
    try {
        rmMeta = JSON.parse(fs.readFileSync('../../.vscode/rminfo.json', 'utf8'));
    } catch (err) {
        skip = true;
    }

    before(async () => {
        await cpp.exec(`echo '1234' > ${LOCALFILE}`);
    });

    after(() => {
        fs.unlinkSync(LOCALFILE);
    });

    it('Test SSHClientUtility', (done) => {
        if (skip) {
            done();

            return;
        }

        const conn: Client = new Client();
        let finished: boolean = false;

        // Report the outcome exactly once and always release the SSH connection.
        const finish = (err?: Error): void => {
            if (finished) {
                return;
            }
            finished = true;
            conn.end();
            done(err);
        };

        conn.on('ready', async () => {
            try {
                await copyFile(conn);
                await Promise.all([
                    copyFileToRemoteLoop(conn),
                    copyFileToRemoteLoop(conn),
                    copyFileToRemoteLoop(conn),
                    remoteExeCommandLoop(conn),
                    getRemoteFileContentLoop(conn)
                ]);
                finish();
            } catch (err) {
                // Surface the failure to the test runner instead of leaving an unhandled rejection
                finish(err instanceof Error ? err : new Error(String(err)));
            }
        }).on('error', (err: Error) => {
            // Connection-level failures (e.g. host unreachable, bad credentials) fail the test
            finish(err);
        }).connect(rmMeta);
    });
});
src/nni_manager/types/node-nvidia-smi/index.d.ts
View file @
87ed70cd
...
...
@@ -2,17 +2,19 @@ declare module 'node-nvidia-smi' {
function
smi
(
callback
:
(
error
:
Error
,
data
:
smi
.
GPUInfo
)
=>
void
):
void
;
namespace
smi
{
interface
EmbededGPUSummary
{
minor_number
:
string
;
utilization
:
{
gpu_util
:
string
;
memory_util
:
string
;
};
process
:
string
|
object
;
}
interface
GPUInfo
{
nvidia_smi_log
:
{
attached_gpus
:
string
;
gpu
:
{
minor_number
:
string
;
utilization
:
{
gpu_util
:
string
;
memory_util
:
string
;
};
process
:
string
|
object
;
}[];
gpu
:
EmbededGPUSummary
[]
|
EmbededGPUSummary
;
};
}
}
...
...
src/sdk/pynni/nni/protocol.py
View file @
87ed70cd
...
...
@@ -18,7 +18,7 @@
# OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# ==================================================================================================
import
logging
from
enum
import
Enum
...
...
@@ -64,8 +64,14 @@ def receive():
Returns a tuple of command (CommandType) and payload (str)
"""
header
=
_in_file
.
read
(
8
)
logging
.
getLogger
(
__name__
).
debug
(
'Received command, header: [%s]'
%
header
)
if
header
is
None
or
len
(
header
)
<
8
:
# Pipe EOF encountered
logging
.
getLogger
(
__name__
).
debug
(
'Pipe EOF encountered'
)
return
None
,
None
length
=
int
(
header
[
2
:])
data
=
_in_file
.
read
(
length
)
command
=
CommandType
(
header
[:
2
])
data
=
data
.
decode
(
'utf8'
)
logging
.
getLogger
(
__name__
).
debug
(
'Received command, data: [%s]'
%
data
)
return
command
,
data
src/sdk/pynni/nni/tuner.py
View file @
87ed70cd
...
...
@@ -127,6 +127,8 @@ def _handle_request(tuner):
_logger
.
debug
(
'waiting receive_message'
)
command
,
data
=
receive
()
if
command
is
None
:
return
False
_logger
.
debug
(
command
)
_logger
.
debug
(
data
)
...
...
src/webui/README.md
View file @
87ed70cd
...
...
@@ -18,7 +18,6 @@ Click the tab "Overview".
*
See good performance trial.
*
See search_space json.
*
See complete trial cdf graph.
### View job accuracy
...
...
@@ -29,7 +28,7 @@ Click the tab "Optimization Progress" to see the point graph of all trials. Hove
Click the tab "Hyper Parameter" to see the parallel graph.
*
You can select the percentage to cut down some lines.
*
Choose two ax
e
s to swap its positions
*
Choose two axes to swap their positions.
### View trial status
...
...
@@ -39,11 +38,10 @@ Click the tab "Trial Status" to see the status of the all trials. Specifically:
*
Trial detail: trial's id, trial's duration, start time, end time, status and accuracy.
*
Kill: you can kill a job whose status is running.
*
Tensor: you can see a job in the tensorflow graph, it will link to the Tensorboard page.
*
Log: click the button, you can see the log about NNI and pai.
### Control
Click the tab "Control" to add a new trial or update the search_space file.
Click the tab "Control" to add a new trial or update the search_space file
and some experiment parameters
.
### View Tensorboard Graph
...
...
src/webui/public/index.html
View file @
87ed70cd
...
...
@@ -4,22 +4,6 @@
<head>
<meta
charset=
"utf-8"
>
<meta
name=
"viewport"
content=
"width=device-width, initial-scale=1, shrink-to-fit=no"
>
<meta
name=
"theme-color"
content=
"#000000"
>
<!--
manifest.json provides metadata used when your web app is added to the
homescreen on Android. See https://developers.google.com/web/fundamentals/engage-and-retain/web-app-manifest/
-->
<link
rel=
"manifest"
href=
"%PUBLIC_URL%/manifest.json"
>
<link
rel=
"shortcut icon"
href=
"%PUBLIC_URL%/icon.jpg"
>
<!--
Notice the use of %PUBLIC_URL% in the tags above.
It will be replaced with the URL of the `public` folder during the build.
Only files inside the `public` folder can be referenced from the HTML.
Unlike "/favicon.ico" or "favicon.ico", "%PUBLIC_URL%/favicon.ico" will
work correctly both with client-side routing and a non-root public URL.
Learn how to configure a non-root public URL by running `npm run build`.
-->
<title>
Neural Network Intelligence
</title>
</head>
...
...
src/webui/src/App.css
View file @
87ed70cd
.header_title
{
width
:
100%
;
height
:
60px
;
line-height
:
60px
;
font-size
:
24px
;
font-family
:
'Segoe UI'
,
Tahoma
,
Geneva
,
Verdana
,
sans-serif
;
color
:
white
;
background-color
:
rgb
(
60
,
141
,
188
)
;
user-select
:
none
;
text-align
:
center
;
...
...
src/webui/src/App.tsx
View file @
87ed70cd
...
...
@@ -6,7 +6,8 @@ class App extends React.Component<{}, {}> {
render
()
{
return
(
<
div
className
=
"App"
>
<
header
className
=
"header_title"
><
img
src
=
{
require
(
'
./logo.jpg
'
)
}
alt
=
""
/></
header
>
{
/* <header className="header_title"><img src={require('./logo.jpg')} alt=""/></header> */
}
<
header
className
=
"header_title"
>
Neural Network Intelligence
</
header
>
<
div
className
=
"content"
>
<
SlideBar
/>
<
div
className
=
"right"
>
{
this
.
props
.
children
}
</
div
>
...
...
src/webui/src/components/Control.tsx
View file @
87ed70cd
...
...
@@ -203,7 +203,7 @@ class Control extends React.Component<{}, ControlState> {
// update trial number parameters
trialParameterMess
=
(
exper
:
Experiments
,
str
:
string
)
=>
{
this
.
getUpdateExample
();
axios
(
`
${
MANAGER_IP
}
/experiment`
,
{
method
:
'
PUT
'
,
headers
:
{
...
...
@@ -216,6 +216,7 @@ class Control extends React.Component<{}, ControlState> {
}).
then
(
res
=>
{
if
(
res
.
status
===
200
)
{
message
.
success
(
`Update
${
str
.
toLocaleLowerCase
()}
successfully`
);
this
.
getUpdateExample
();
}
else
{
message
.
error
(
`Update
${
str
.
toLocaleLowerCase
()}
failed`
);
}
...
...
@@ -284,8 +285,8 @@ class Control extends React.Component<{}, ControlState> {
}
userUpdateSeaspace
=
()
=>
{
this
.
updateSearchLoad
();
this
.
getUpdateExample
();
const
{
updateSearch
}
=
this
.
state
;
if
(
updateSearch
!==
''
||
updateSearch
!==
null
)
{
const
{
experiment
}
=
this
.
state
;
...
...
src/webui/src/components/Sessionpro.tsx
View file @
87ed70cd
...
...
@@ -2,12 +2,12 @@ import * as React from 'react';
import
axios
from
'
axios
'
;
import
{
Table
,
Select
,
Row
,
Col
,
Icon
}
from
'
antd
'
;
import
{
MANAGER_IP
,
overviewItem
,
roundNum
}
from
'
../const
'
;
import
ReactEcharts
from
'
echarts-for-react
'
;
//
import ReactEcharts from 'echarts-for-react';
const
Option
=
Select
.
Option
;
import
JSONTree
from
'
react-json-tree
'
;
require
(
'
echarts/lib/chart/line
'
);
require
(
'
echarts/lib/component/tooltip
'
);
require
(
'
echarts/lib/component/title
'
);
//
require('echarts/lib/chart/line');
//
require('echarts/lib/component/tooltip');
//
require('echarts/lib/component/title');
require
(
'
../style/sessionpro.css
'
);
interface
TableObj
{
...
...
@@ -266,26 +266,26 @@ class Sessionpro extends React.Component<{}, SessionState> {
});
}
// draw CDF
const
{
trialRun
}
=
this
.
state
;
if
(
this
.
_isMounted
)
{
this
.
setState
({
option
:
this
.
getOption
(
trialRun
)
});
}
//
const { trialRun } = this.state;
//
if (this._isMounted) {
//
this.setState({
//
option: this.getOption(trialRun)
//
});
//
}
// CDF graph 'No data' judge
if
(
trialRun
.
length
===
0
)
{
if
(
this
.
_isMounted
)
{
this
.
setState
({
noData
:
'
No data
'
});
}
}
else
{
if
(
this
.
_isMounted
)
{
this
.
setState
({
noData
:
''
});
}
}
//
if (trialRun.length === 0) {
//
if (this._isMounted) {
//
this.setState({
//
noData: 'No data'
//
});
//
}
//
} else {
//
if (this._isMounted) {
//
this.setState({
//
noData: ''
//
});
//
}
//
}
}
});
}
...
...
@@ -372,7 +372,8 @@ class Sessionpro extends React.Component<{}, SessionState> {
};
const
{
trialProfile
,
searchSpace
,
tunerAssessor
,
tableData
,
option
,
noData
trialProfile
,
searchSpace
,
tunerAssessor
,
tableData
,
// option, noData
}
=
this
.
state
;
let
running
;
if
(
trialProfile
.
endTime
===
'
not over
'
)
{
...
...
@@ -500,13 +501,13 @@ class Sessionpro extends React.Component<{}, SessionState> {
scroll
=
{
{
x
:
'
100%
'
,
y
:
540
}
}
/>
</
div
>
<
div
className
=
"cdf"
>
{
/*
<div className="cdf">
<ReactEcharts
option={option}
style={{ height: 500, padding: '0px' }}
/>
<div className="addNodata">{noData}</div>
</
div
>
</div>
*/
}
</
div
>
);
}
...
...
tools/nnicmd/constants.py
View file @
87ed70cd
...
...
@@ -43,7 +43,7 @@ WARNING_INFO = 'Waining: %s'
EXPERIMENT_SUCCESS_INFO
=
'Start experiment success! The experiment id is %s, and the restful server post is %s.
\n
'
\
'You can use these commands to get more information about this experiment:
\n
'
\
' commands description
\n
'
\
'1. nnictl experiment
l
s
list all
of experiments
\n
'
\
'1. nnictl experiment s
how
show the information
of experiments
\n
'
\
'2. nnictl trial ls list all of trial jobs
\n
'
\
'3. nnictl stop stop a experiment
\n
'
\
'4. nnictl trial kill kill a trial job by id
\n
'
\
...
...
tools/nnicmd/launcher.py
View file @
87ed70cd
...
...
@@ -54,13 +54,20 @@ def start_rest_server(manager, port, platform, mode, experiment_id=None):
process
=
Popen
(
cmds
,
stdout
=
stdout_file
,
stderr
=
stderr_file
)
return
process
def
set_
loc
al_config
(
experiment_config
,
port
):
'''
Call setClusterMetadata (rest PUT /parameters/cluster-metadata) to pass platform and machineList"
'''
def
set_
tri
al_config
(
experiment_config
,
port
):
'''
set trial configuration
'''
request_data
=
dict
()
request_data
[
'codeDir'
]
=
experiment_config
[
'trial'
][
'trialCodeDir'
]
request_data
[
'command'
]
=
experiment_config
[
'trial'
][
'trialCommand'
]
value_dict
=
dict
()
value_dict
[
'command'
]
=
experiment_config
[
'trial'
][
'trialCommand'
]
value_dict
[
'codeDir'
]
=
experiment_config
[
'trial'
][
'trialCodeDir'
]
value_dict
[
'gpuNum'
]
=
experiment_config
[
'trial'
][
'trialGpuNum'
]
request_data
[
'trial_config'
]
=
value_dict
response
=
rest_put
(
cluster_metadata_url
(
port
),
json
.
dumps
(
request_data
),
20
)
return
True
if
response
and
response
.
status_code
==
200
else
False
return
True
if
response
.
status_code
==
200
else
False
def
set_local_config
(
experiment_config
,
port
):
'''set local configuration'''
return
set_trial_config
(
experiment_config
,
port
)
def
set_remote_config
(
experiment_config
,
port
):
'''Call setClusterMetadata to pass trial'''
...
...
@@ -72,14 +79,7 @@ def set_remote_config(experiment_config, port):
return
False
#set trial_config
request_data
=
dict
()
value_dict
=
dict
()
value_dict
[
'command'
]
=
experiment_config
[
'trial'
][
'trialCommand'
]
value_dict
[
'codeDir'
]
=
experiment_config
[
'trial'
][
'trialCodeDir'
]
value_dict
[
'gpuNum'
]
=
experiment_config
[
'trial'
][
'trialGpuNum'
]
request_data
[
'trial_config'
]
=
value_dict
response
=
rest_put
(
cluster_metadata_url
(
port
),
json
.
dumps
(
request_data
),
20
)
return
True
if
response
.
status_code
==
200
else
False
return
set_trial_config
(
experiment_config
,
port
)
def
set_experiment
(
experiment_config
,
mode
,
port
):
'''Call startExperiment (rest POST /experiment) with yaml file content'''
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment