Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
61d47a4d
Commit
61d47a4d
authored
Aug 24, 2018
by
Deshui Yu
Browse files
[Code merge] Merge code from dogfood-v1 branch
parent
f1f6f880
Changes
43
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
342 additions
and
189 deletions
+342
-189
src/nni_manager/training_service/local/gpuScheduler.ts
src/nni_manager/training_service/local/gpuScheduler.ts
+40
-16
src/nni_manager/training_service/local/localTrainingService.ts
...ni_manager/training_service/local/localTrainingService.ts
+24
-14
src/nni_manager/training_service/local/localTrainingServiceForGPU.ts
...ager/training_service/local/localTrainingServiceForGPU.ts
+10
-12
src/nni_manager/training_service/remote_machine/metricsCollector.ts
...nager/training_service/remote_machine/metricsCollector.ts
+15
-3
src/nni_manager/training_service/remote_machine/remoteMachineData.ts
...ager/training_service/remote_machine/remoteMachineData.ts
+0
-24
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
...ng_service/remote_machine/remoteMachineTrainingService.ts
+28
-25
src/nni_manager/training_service/remote_machine/sshClientUtility.ts
...nager/training_service/remote_machine/sshClientUtility.ts
+36
-18
src/nni_manager/training_service/test/remoteMachineTrainingService.test.ts
...raining_service/test/remoteMachineTrainingService.test.ts
+5
-5
src/nni_manager/training_service/test/sshClientUtility.test.ts
...ni_manager/training_service/test/sshClientUtility.test.ts
+113
-0
src/nni_manager/types/node-nvidia-smi/index.d.ts
src/nni_manager/types/node-nvidia-smi/index.d.ts
+10
-8
src/sdk/pynni/nni/protocol.py
src/sdk/pynni/nni/protocol.py
+7
-1
src/sdk/pynni/nni/tuner.py
src/sdk/pynni/nni/tuner.py
+2
-0
src/webui/README.md
src/webui/README.md
+2
-4
src/webui/public/index.html
src/webui/public/index.html
+0
-16
src/webui/src/App.css
src/webui/src/App.css
+4
-0
src/webui/src/App.tsx
src/webui/src/App.tsx
+2
-1
src/webui/src/components/Control.tsx
src/webui/src/components/Control.tsx
+3
-2
src/webui/src/components/Sessionpro.tsx
src/webui/src/components/Sessionpro.tsx
+27
-26
tools/nnicmd/constants.py
tools/nnicmd/constants.py
+1
-1
tools/nnicmd/launcher.py
tools/nnicmd/launcher.py
+13
-13
No files found.
src/nni_manager/training_service/local/gpuScheduler.ts
View file @
61d47a4d
...
@@ -19,9 +19,11 @@
...
@@ -19,9 +19,11 @@
'
use strict
'
;
'
use strict
'
;
import
*
as
assert
from
'
assert
'
;
import
*
as
nodeNvidiaSmi
from
'
node-nvidia-smi
'
;
import
*
as
nodeNvidiaSmi
from
'
node-nvidia-smi
'
;
import
{
delay
}
from
'
../../common/utils
'
;
import
{
delay
}
from
'
../../common/utils
'
;
import
{
GPUInfo
,
GPUSummary
}
from
'
../common/gpuData
'
;
import
{
GPUInfo
,
GPUSummary
}
from
'
../common/gpuData
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
/* Example of nvidia-smi result
/* Example of nvidia-smi result
{
{
...
@@ -287,9 +289,13 @@ class GPUScheduler {
...
@@ -287,9 +289,13 @@ class GPUScheduler {
private
gpuSummary
!
:
GPUSummary
;
private
gpuSummary
!
:
GPUSummary
;
private
stopping
:
boolean
;
private
stopping
:
boolean
;
private
log
:
Logger
;
private
nvdmNotFoundRegex
:
RegExp
;
constructor
()
{
constructor
()
{
this
.
stopping
=
false
;
this
.
stopping
=
false
;
this
.
log
=
getLogger
();
this
.
nvdmNotFoundRegex
=
/nvidia-smi: not found/gi
;
}
}
public
async
run
():
Promise
<
void
>
{
public
async
run
():
Promise
<
void
>
{
...
@@ -297,7 +303,11 @@ class GPUScheduler {
...
@@ -297,7 +303,11 @@ class GPUScheduler {
try
{
try
{
this
.
gpuSummary
=
await
this
.
readGPUSummary
();
this
.
gpuSummary
=
await
this
.
readGPUSummary
();
}
catch
(
error
)
{
}
catch
(
error
)
{
console
.
error
(
'
Read GPU summary failed with error
'
,
error
);
this
.
log
.
error
(
'
Read GPU summary failed with error:
'
,
error
);
// If the nvidia-smi command is not found, break out of the GPU summary reading loop to avoid unnecessary periodic checking
if
(
this
.
nvdmNotFoundRegex
.
test
(
error
))
{
break
;
}
}
}
await
delay
(
5000
);
await
delay
(
5000
);
}
}
...
@@ -315,28 +325,42 @@ class GPUScheduler {
...
@@ -315,28 +325,42 @@ class GPUScheduler {
this
.
stopping
=
true
;
this
.
stopping
=
true
;
}
}
private
generateEmbededGPUSummary
(
data
:
nodeNvidiaSmi
.
GPUInfo
)
:
GPUInfo
[]
{
let
gpuInfos
:
GPUInfo
[]
=
[];
const
gpuNumber
:
number
=
parseInt
(
data
.
nvidia_smi_log
.
attached_gpus
,
10
);
assert
(
gpuNumber
>
0
);
if
(
gpuNumber
==
1
)
{
const
embededGPUSummary
=
<
nodeNvidiaSmi
.
EmbededGPUSummary
>
data
.
nvidia_smi_log
.
gpu
;
gpuInfos
.
push
(
this
.
convertGPUSummaryToInfo
(
embededGPUSummary
));
}
else
{
const
embededGPUSummaryArray
=
<
nodeNvidiaSmi
.
EmbededGPUSummary
[]
>
data
.
nvidia_smi_log
.
gpu
;
gpuInfos
=
embededGPUSummaryArray
.
map
(
embededGPUSummary
=>
this
.
convertGPUSummaryToInfo
(
embededGPUSummary
));
}
return
gpuInfos
;
}
private
convertGPUSummaryToInfo
(
embededGPUSummary
:
nodeNvidiaSmi
.
EmbededGPUSummary
)
:
GPUInfo
{
return
new
GPUInfo
(
typeof
embededGPUSummary
.
process
===
'
object
'
?
1
:
0
,
parseFloat
(
embededGPUSummary
.
utilization
.
memory_util
),
parseFloat
(
embededGPUSummary
.
utilization
.
gpu_util
),
parseInt
(
embededGPUSummary
.
minor_number
,
10
));
}
private
readGPUSummary
():
Promise
<
GPUSummary
>
{
private
readGPUSummary
():
Promise
<
GPUSummary
>
{
return
new
Promise
((
resolve
:
Function
,
reject
:
Function
):
void
=>
{
return
new
Promise
((
resolve
:
Function
,
reject
:
Function
):
void
=>
{
nodeNvidiaSmi
((
error
:
Error
,
data
:
nodeNvidiaSmi
.
GPUInfo
)
=>
{
nodeNvidiaSmi
((
error
:
Error
,
data
:
nodeNvidiaSmi
.
GPUInfo
)
=>
{
if
(
error
!==
undefined
)
{
if
(
error
)
{
reject
(
error
);
reject
(
error
);
}
else
{
}
else
{
const
gpuNumber
:
number
=
parseInt
(
data
.
nvidia_smi_log
.
attached_gpus
,
10
);
const
gpuSummary
:
GPUSummary
=
new
GPUSummary
(
const
gpuSummary
:
GPUSummary
=
new
GPUSummary
(
parseInt
(
data
.
nvidia_smi_log
.
attached_gpus
,
10
)
,
gpuNumber
,
Date
().
toString
(),
Date
().
toString
(),
data
.
nvidia_smi_log
.
gpu
.
map
((
gpuInfo
:
{
this
.
generateEmbededGPUSummary
(
data
)
minor_number
:
string
;
utilization
:
{
gpu_util
:
string
;
memory_util
:
string
;
};
process
:
string
|
object
;
})
=>
new
GPUInfo
(
typeof
gpuInfo
.
process
===
'
object
'
?
1
:
0
,
parseFloat
(
gpuInfo
.
utilization
.
memory_util
),
parseFloat
(
gpuInfo
.
utilization
.
gpu_util
),
parseInt
(
gpuInfo
.
minor_number
,
10
)
))
);
);
resolve
(
gpuSummary
);
resolve
(
gpuSummary
);
}
}
...
...
src/nni_manager/training_service/local/localTrainingService.ts
View file @
61d47a4d
...
@@ -27,6 +27,8 @@ import * as path from 'path';
...
@@ -27,6 +27,8 @@ import * as path from 'path';
import
*
as
ts
from
'
tail-stream
'
;
import
*
as
ts
from
'
tail-stream
'
;
import
{
NNIError
,
NNIErrorNames
}
from
'
../../common/errors
'
;
import
{
NNIError
,
NNIErrorNames
}
from
'
../../common/errors
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
TrialConfig
}
from
'
../common/trialConfig
'
;
import
{
TrialConfigMetadataKey
}
from
'
../common/trialConfigMetadataKey
'
;
import
{
import
{
HostJobApplicationForm
,
JobApplicationForm
,
TrainingService
,
TrialJobApplicationForm
,
HostJobApplicationForm
,
JobApplicationForm
,
TrainingService
,
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobMetric
,
TrialJobStatus
TrialJobDetail
,
TrialJobMetric
,
TrialJobStatus
...
@@ -92,9 +94,8 @@ class LocalTrainingService implements TrainingService {
...
@@ -92,9 +94,8 @@ class LocalTrainingService implements TrainingService {
private
initialized
:
boolean
;
private
initialized
:
boolean
;
private
stopping
:
boolean
;
private
stopping
:
boolean
;
private
rootDir
!
:
string
;
private
rootDir
!
:
string
;
private
codeDir
!
:
string
;
protected
log
:
Logger
;
private
command
!
:
string
;
protected
localTrailConfig
?:
TrialConfig
;
private
log
:
Logger
;
constructor
()
{
constructor
()
{
this
.
eventEmitter
=
new
EventEmitter
();
this
.
eventEmitter
=
new
EventEmitter
();
...
@@ -227,11 +228,12 @@ class LocalTrainingService implements TrainingService {
...
@@ -227,11 +228,12 @@ class LocalTrainingService implements TrainingService {
this
.
initialized
=
true
;
this
.
initialized
=
true
;
}
}
switch
(
key
)
{
switch
(
key
)
{
case
'
codeDir
'
:
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
this
.
codeDir
=
value
;
this
.
localTrailConfig
=
<
TrialConfig
>
JSON
.
parse
(
value
);
break
;
// Parse trial config failed, throw Error
case
'
command
'
:
if
(
!
this
.
localTrailConfig
)
{
this
.
command
=
value
;
throw
new
Error
(
'
trial config parsed failed
'
);
}
break
;
break
;
default
:
default
:
}
}
...
@@ -239,10 +241,14 @@ class LocalTrainingService implements TrainingService {
...
@@ -239,10 +241,14 @@ class LocalTrainingService implements TrainingService {
public
getClusterMetadata
(
key
:
string
):
Promise
<
string
>
{
public
getClusterMetadata
(
key
:
string
):
Promise
<
string
>
{
switch
(
key
)
{
switch
(
key
)
{
case
'
codeDir
'
:
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
return
Promise
.
resolve
(
this
.
codeDir
);
let
getResult
:
Promise
<
string
>
;
case
'
command
'
:
if
(
!
this
.
localTrailConfig
)
{
return
Promise
.
resolve
(
this
.
command
);
getResult
=
Promise
.
reject
(
new
NNIError
(
NNIErrorNames
.
NOT_FOUND
,
`
${
key
}
is never set yet`
));
}
else
{
getResult
=
Promise
.
resolve
(
!
this
.
localTrailConfig
?
''
:
JSON
.
stringify
(
this
.
localTrailConfig
));
}
return
getResult
;
default
:
default
:
return
Promise
.
reject
(
new
NNIError
(
NNIErrorNames
.
NOT_FOUND
,
'
Key not found
'
));
return
Promise
.
reject
(
new
NNIError
(
NNIErrorNames
.
NOT_FOUND
,
'
Key not found
'
));
}
}
...
@@ -292,14 +298,18 @@ class LocalTrainingService implements TrainingService {
...
@@ -292,14 +298,18 @@ class LocalTrainingService implements TrainingService {
const
variables
:
{
key
:
string
;
value
:
string
}[]
=
this
.
getEnvironmentVariables
(
trialJobDetail
,
resource
);
const
variables
:
{
key
:
string
;
value
:
string
}[]
=
this
.
getEnvironmentVariables
(
trialJobDetail
,
resource
);
const
runScriptLines
:
string
[]
=
[];
const
runScriptLines
:
string
[]
=
[];
if
(
!
this
.
localTrailConfig
)
{
throw
new
Error
(
'
trial config is not initialized
'
);
}
runScriptLines
.
push
(
runScriptLines
.
push
(
'
#!/bin/bash
'
,
'
#!/bin/bash
'
,
`cd
${
this
.
codeDir
}
`
);
`cd
${
this
.
localTrailConfig
.
codeDir
}
`
);
for
(
const
variable
of
variables
)
{
for
(
const
variable
of
variables
)
{
runScriptLines
.
push
(
`export
${
variable
.
key
}
=
${
variable
.
value
}
`
);
runScriptLines
.
push
(
`export
${
variable
.
key
}
=
${
variable
.
value
}
`
);
}
}
runScriptLines
.
push
(
runScriptLines
.
push
(
`eval
${
this
.
command
}
2>
${
path
.
join
(
trialJobDetail
.
workingDirectory
,
'
.nni
'
,
'
stderr
'
)}
`
,
`eval
${
this
.
localTrailConfig
.
command
}
2>
${
path
.
join
(
trialJobDetail
.
workingDirectory
,
'
.nni
'
,
'
stderr
'
)}
`
,
`echo $?
\`
date +%s%3N
\`
>
${
path
.
join
(
trialJobDetail
.
workingDirectory
,
'
.nni
'
,
'
state
'
)}
`
);
`echo $?
\`
date +%s%3N
\`
>
${
path
.
join
(
trialJobDetail
.
workingDirectory
,
'
.nni
'
,
'
state
'
)}
`
);
await
cpp
.
exec
(
`mkdir -p
${
trialJobDetail
.
workingDirectory
}
`
);
await
cpp
.
exec
(
`mkdir -p
${
trialJobDetail
.
workingDirectory
}
`
);
...
...
src/nni_manager/training_service/local/localTrainingServiceForGPU.ts
View file @
61d47a4d
...
@@ -22,6 +22,7 @@
...
@@ -22,6 +22,7 @@
import
{
TrialJobDetail
,
TrialJobStatus
}
from
'
../../common/trainingService
'
;
import
{
TrialJobDetail
,
TrialJobStatus
}
from
'
../../common/trainingService
'
;
import
{
GPUScheduler
}
from
'
./gpuScheduler
'
;
import
{
GPUScheduler
}
from
'
./gpuScheduler
'
;
import
{
LocalTrainingService
}
from
'
./localTrainingService
'
;
import
{
LocalTrainingService
}
from
'
./localTrainingService
'
;
import
{
TrialConfigMetadataKey
}
from
'
../common/trialConfigMetadataKey
'
;
type
LocalTrialJobDetailForGPU
=
TrialJobDetail
&
{
gpuIndices
:
number
[]
};
type
LocalTrialJobDetailForGPU
=
TrialJobDetail
&
{
gpuIndices
:
number
[]
};
...
@@ -52,8 +53,14 @@ class LocalTrainingServiceForGPU extends LocalTrainingService {
...
@@ -52,8 +53,14 @@ class LocalTrainingServiceForGPU extends LocalTrainingService {
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
await
super
.
setClusterMetadata
(
key
,
value
);
await
super
.
setClusterMetadata
(
key
,
value
);
switch
(
key
)
{
switch
(
key
)
{
case
'
requiredGPUNum
'
:
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
this
.
requiredGPUNum
=
parseInt
(
value
,
10
);
if
(
this
.
localTrailConfig
!==
undefined
)
{
this
.
requiredGPUNum
=
this
.
localTrailConfig
.
gpuNum
;
}
else
{
// If no valid trial config is initialized, set requiredGPUNum to 0 as fallback value.
this
.
requiredGPUNum
=
0
;
}
this
.
log
.
info
(
'
required GPU number is
'
+
this
.
requiredGPUNum
);
if
(
this
.
gpuScheduler
===
undefined
)
{
if
(
this
.
gpuScheduler
===
undefined
)
{
this
.
gpuScheduler
=
new
GPUScheduler
();
this
.
gpuScheduler
=
new
GPUScheduler
();
}
}
...
@@ -62,15 +69,6 @@ class LocalTrainingServiceForGPU extends LocalTrainingService {
...
@@ -62,15 +69,6 @@ class LocalTrainingServiceForGPU extends LocalTrainingService {
}
}
}
}
public
getClusterMetadata
(
key
:
string
):
Promise
<
string
>
{
switch
(
key
)
{
case
'
requiredGPUNum
'
:
return
Promise
.
resolve
(
`
${
this
.
requiredGPUNum
}
`
);
default
:
return
super
.
getClusterMetadata
(
key
);
}
}
public
cleanUp
():
Promise
<
void
>
{
public
cleanUp
():
Promise
<
void
>
{
if
(
this
.
gpuScheduler
!==
undefined
)
{
if
(
this
.
gpuScheduler
!==
undefined
)
{
this
.
gpuScheduler
.
stop
();
this
.
gpuScheduler
.
stop
();
...
@@ -80,7 +78,7 @@ class LocalTrainingServiceForGPU extends LocalTrainingService {
...
@@ -80,7 +78,7 @@ class LocalTrainingServiceForGPU extends LocalTrainingService {
}
}
protected
onTrialJobStatusChanged
(
trialJob
:
LocalTrialJobDetailForGPU
,
oldStatus
:
TrialJobStatus
):
void
{
protected
onTrialJobStatusChanged
(
trialJob
:
LocalTrialJobDetailForGPU
,
oldStatus
:
TrialJobStatus
):
void
{
if
(
trialJob
.
gpuIndices
.
length
!==
0
)
{
if
(
trialJob
.
gpuIndices
!==
undefined
&&
trialJob
.
gpuIndices
.
length
!==
0
)
{
if
(
oldStatus
===
'
RUNNING
'
&&
trialJob
.
status
!==
'
RUNNING
'
)
{
if
(
oldStatus
===
'
RUNNING
'
&&
trialJob
.
status
!==
'
RUNNING
'
)
{
for
(
const
index
of
trialJob
.
gpuIndices
)
{
for
(
const
index
of
trialJob
.
gpuIndices
)
{
this
.
availableGPUIndices
[
index
]
=
false
;
this
.
availableGPUIndices
[
index
]
=
false
;
...
...
src/nni_manager/training_service/remote_machine/metricsCollector.ts
View file @
61d47a4d
...
@@ -24,7 +24,7 @@ import { EventEmitter } from 'events';
...
@@ -24,7 +24,7 @@ import { EventEmitter } from 'events';
import
*
as
path
from
'
path
'
;
import
*
as
path
from
'
path
'
;
import
{
Client
}
from
'
ssh2
'
;
import
{
Client
}
from
'
ssh2
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
TrialJobStatus
}
from
'
../../common/trainingService
'
;
import
{
TrialJobStatus
,
TrialJobDetail
}
from
'
../../common/trainingService
'
;
import
{
JobMetrics
,
RemoteCommandResult
,
RemoteMachineMeta
,
RemoteMachineTrialJobDetail
}
from
'
./remoteMachineData
'
;
import
{
JobMetrics
,
RemoteCommandResult
,
RemoteMachineMeta
,
RemoteMachineTrialJobDetail
}
from
'
./remoteMachineData
'
;
import
{
SSHClientUtility
}
from
'
./sshClientUtility
'
;
import
{
SSHClientUtility
}
from
'
./sshClientUtility
'
;
...
@@ -56,8 +56,12 @@ export class MetricsCollector {
...
@@ -56,8 +56,12 @@ export class MetricsCollector {
if
(
rmMetrics
!==
undefined
&&
rmMetrics
.
length
>
0
)
{
if
(
rmMetrics
!==
undefined
&&
rmMetrics
.
length
>
0
)
{
rmMetrics
.
forEach
((
jobMetrics
)
=>
{
rmMetrics
.
forEach
((
jobMetrics
)
=>
{
const
trialJobId
:
string
=
jobMetrics
.
jobId
;
const
trialJobId
:
string
=
jobMetrics
.
jobId
;
const
trialJobDetail
:
RemoteMachineTrialJobDetail
=
<
RemoteMachineTrialJobDetail
>
this
.
trialJobsMap
.
get
(
trialJobId
);
assert
(
trialJobDetail
);
// If the job is no longer alive, remove its GPU reservation
// If the job is no longer alive, remove its GPU reservation
if
(
!
[
'
RUNNING
'
].
includes
(
jobMetrics
.
jobStatus
))
{
if
(
!
[
'
RUNNING
'
].
includes
(
jobMetrics
.
jobStatus
))
{
trialJobDetail
.
status
=
jobMetrics
.
jobStatus
;
this
.
log
.
info
(
`Set trialjob
${
trialJobDetail
.
id
}
status to
${
trialJobDetail
.
status
}
`
);
runningJobsMap
.
forEach
((
jobIds
:
string
[],
rmMeta
:
RemoteMachineMeta
)
=>
{
runningJobsMap
.
forEach
((
jobIds
:
string
[],
rmMeta
:
RemoteMachineMeta
)
=>
{
// If the remote machine has no GPU, gpuReservation is not initialized, so check whether it is undefined
// If the remote machine has no GPU, gpuReservation is not initialized, so check whether it is undefined
if
(
rmMeta
.
gpuReservation
!==
undefined
)
{
if
(
rmMeta
.
gpuReservation
!==
undefined
)
{
...
@@ -81,11 +85,19 @@ export class MetricsCollector {
...
@@ -81,11 +85,19 @@ export class MetricsCollector {
if
(
status
.
includes
(
trialJob
.
status
))
{
if
(
status
.
includes
(
trialJob
.
status
))
{
if
(
map
.
has
(
trialJob
.
rmMeta
))
{
if
(
map
.
has
(
trialJob
.
rmMeta
))
{
const
ids
=
map
.
get
(
trialJob
.
rmMeta
);
const
ids
=
map
.
get
(
trialJob
.
rmMeta
);
if
(
ids
!==
undefined
)
{
if
(
ids
!==
undefined
&&
!
ids
.
includes
(
id
)
)
{
ids
.
push
(
id
);
ids
.
push
(
id
);
}
}
}
else
{
}
else
{
map
.
set
(
trialJob
.
rmMeta
,
[
id
]);
let
initJobIds
:
string
[]
=
[
id
];
// If the remote machine has jobs that reserved GPUs, also put those jobs into the list so their metrics data is collected
if
(
trialJob
.
rmMeta
.
gpuReservation
!==
undefined
)
{
const
concatJobIds
:
string
[]
=
initJobIds
.
concat
(
Array
.
from
(
trialJob
.
rmMeta
.
gpuReservation
.
values
()));
initJobIds
=
concatJobIds
.
filter
((
item
,
pos
)
=>
concatJobIds
.
indexOf
(
item
)
===
pos
);
}
map
.
set
(
trialJob
.
rmMeta
,
initJobIds
);
}
}
}
}
});
});
...
...
src/nni_manager/training_service/remote_machine/remoteMachineData.ts
View file @
61d47a4d
...
@@ -23,15 +23,6 @@ import { Client } from 'ssh2';
...
@@ -23,15 +23,6 @@ import { Client } from 'ssh2';
import
{
JobApplicationForm
,
TrialJobDetail
,
TrialJobStatus
}
from
'
../../common/trainingService
'
;
import
{
JobApplicationForm
,
TrialJobDetail
,
TrialJobStatus
}
from
'
../../common/trainingService
'
;
import
{
GPUSummary
}
from
'
../common/gpuData
'
;
import
{
GPUSummary
}
from
'
../common/gpuData
'
;
/**
* Enum of key for remote machine metadata for configuration
*/
export
enum
RemoteMachineMetadataKey
{
MACHINE_LIST
=
'
machine_list
'
,
TRIAL_CONFIG
=
'
trial_config
'
,
EXPERIMENT_ID
=
'
experimentId
'
,
RANDOM_SCHEDULER
=
'
random_scheduler
'
}
/**
/**
* Metadata of remote machine for configuration and status query
* Metadata of remote machine for configuration and status query
...
@@ -54,21 +45,6 @@ export class RemoteMachineMeta {
...
@@ -54,21 +45,6 @@ export class RemoteMachineMeta {
}
}
}
}
/**
* Configuration for trial job on remote machine
*/
export
class
RemoteMachineTrialConfig
{
public
readonly
command
:
string
;
public
readonly
codeDir
:
string
;
public
readonly
gpuNum
:
number
;
constructor
(
command
:
string
,
codeDir
:
string
,
gpuNum
:
number
)
{
this
.
command
=
command
;
this
.
codeDir
=
codeDir
;
this
.
gpuNum
=
gpuNum
;
}
}
/**
/**
* The execution result for command executed on remote machine
* The execution result for command executed on remote machine
*/
*/
...
...
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
View file @
61d47a4d
...
@@ -37,12 +37,14 @@ import {
...
@@ -37,12 +37,14 @@ import {
}
from
'
../../common/trainingService
'
;
}
from
'
../../common/trainingService
'
;
import
{
delay
,
getExperimentRootDir
,
uniqueString
}
from
'
../../common/utils
'
;
import
{
delay
,
getExperimentRootDir
,
uniqueString
}
from
'
../../common/utils
'
;
import
{
GPUSummary
}
from
'
../common/gpuData
'
;
import
{
GPUSummary
}
from
'
../common/gpuData
'
;
import
{
TrialConfig
}
from
'
../common/trialConfig
'
;
import
{
TrialConfigMetadataKey
}
from
'
../common/trialConfigMetadataKey
'
;
import
{
GPUScheduler
}
from
'
./gpuScheduler
'
;
import
{
GPUScheduler
}
from
'
./gpuScheduler
'
;
import
{
MetricsCollector
}
from
'
./metricsCollector
'
;
import
{
MetricsCollector
}
from
'
./metricsCollector
'
;
import
{
import
{
HOSTJOBSHELLFORMAT
,
RemoteCommandResult
,
RemoteMachineMeta
,
RemoteMachineMetadataKey
,
HOSTJOBSHELLFORMAT
,
RemoteCommandResult
,
RemoteMachineMeta
,
REMOTEMACHINERUNSHELLFORMAT
,
RemoteMachineScheduleInfo
,
RemoteMachineScheduleResult
,
REMOTEMACHINERUNSHELLFORMAT
,
RemoteMachineScheduleInfo
,
RemoteMachineScheduleResult
,
RemoteMachineTrialConfig
,
RemoteMachineTrialJobDetail
,
ScheduleResultType
RemoteMachineTrialJobDetail
,
ScheduleResultType
}
from
'
./remoteMachineData
'
;
}
from
'
./remoteMachineData
'
;
import
{
SSHClientUtility
}
from
'
./sshClientUtility
'
;
import
{
SSHClientUtility
}
from
'
./sshClientUtility
'
;
...
@@ -56,7 +58,7 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -56,7 +58,7 @@ class RemoteMachineTrainingService implements TrainingService {
// Experiment root directory
// Experiment root directory
private
expRootDir
:
string
;
private
expRootDir
:
string
;
private
remoteExpRootDir
:
string
;
private
remoteExpRootDir
:
string
;
private
trialConfig
:
RemoteMachine
TrialConfig
|
undefined
;
private
trialConfig
:
TrialConfig
|
undefined
;
private
gpuScheduler
:
GPUScheduler
;
private
gpuScheduler
:
GPUScheduler
;
private
jobQueue
:
string
[];
private
jobQueue
:
string
[];
private
timer
:
ObservableTimer
;
private
timer
:
ObservableTimer
;
...
@@ -89,11 +91,11 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -89,11 +91,11 @@ class RemoteMachineTrainingService implements TrainingService {
// Remove trial job with trialJobId from job queue
// Remove trial job with trialJobId from job queue
this
.
jobQueue
.
shift
();
this
.
jobQueue
.
shift
();
}
else
{
}
else
{
// Break the while loop since no GPU resource is available right now,
// Break the while loop since no GPU resource is available right now,
// Wait to schedule job in next time iteration
// Wait to schedule job in next time iteration
break
;
break
;
}
}
}
;
}
const
metricsCollector
:
MetricsCollector
=
new
MetricsCollector
(
const
metricsCollector
:
MetricsCollector
=
new
MetricsCollector
(
this
.
machineSSHClientMap
,
this
.
trialJobsMap
,
this
.
remoteExpRootDir
,
this
.
metricsEmitter
);
this
.
machineSSHClientMap
,
this
.
trialJobsMap
,
this
.
remoteExpRootDir
,
this
.
metricsEmitter
);
await
metricsCollector
.
collectMetrics
();
await
metricsCollector
.
collectMetrics
();
...
@@ -186,6 +188,7 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -186,6 +188,7 @@ class RemoteMachineTrainingService implements TrainingService {
form
);
form
);
this
.
jobQueue
.
push
(
trialJobId
);
this
.
jobQueue
.
push
(
trialJobId
);
this
.
trialJobsMap
.
set
(
trialJobId
,
trialJobDetail
);
this
.
trialJobsMap
.
set
(
trialJobId
,
trialJobDetail
);
return
Promise
.
resolve
(
trialJobDetail
);
return
Promise
.
resolve
(
trialJobDetail
);
}
else
{
}
else
{
return
Promise
.
reject
(
new
Error
(
`Job form not supported:
${
JSON
.
stringify
(
form
)}
, jobType should be HOST or TRIAL.`
));
return
Promise
.
reject
(
new
Error
(
`Job form not supported:
${
JSON
.
stringify
(
form
)}
, jobType should be HOST or TRIAL.`
));
...
@@ -207,7 +210,7 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -207,7 +210,7 @@ class RemoteMachineTrainingService implements TrainingService {
// Remove the job with trialJobId from job queue
// Remove the job with trialJobId from job queue
const
index
:
number
=
this
.
jobQueue
.
indexOf
(
trialJobId
);
const
index
:
number
=
this
.
jobQueue
.
indexOf
(
trialJobId
);
if
(
index
>=
0
)
{
if
(
index
>=
0
)
{
this
.
jobQueue
.
splice
(
index
,
1
);
this
.
jobQueue
.
splice
(
index
,
1
);
}
}
...
@@ -243,11 +246,11 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -243,11 +246,11 @@ class RemoteMachineTrainingService implements TrainingService {
*/
*/
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
switch
(
key
)
{
switch
(
key
)
{
case
RemoteMachine
MetadataKey
.
MACHINE_LIST
:
case
TrialConfig
MetadataKey
.
MACHINE_LIST
:
await
this
.
setupConnections
(
value
);
await
this
.
setupConnections
(
value
);
break
;
break
;
case
RemoteMachine
MetadataKey
.
TRIAL_CONFIG
:
case
TrialConfig
MetadataKey
.
TRIAL_CONFIG
:
const
remoteMachineTrailConfig
:
RemoteMachine
TrialConfig
=
<
RemoteMachine
TrialConfig
>
JSON
.
parse
(
value
);
const
remoteMachineTrailConfig
:
TrialConfig
=
<
TrialConfig
>
JSON
.
parse
(
value
);
// Parse trial config failed, throw Error
// Parse trial config failed, throw Error
if
(
!
remoteMachineTrailConfig
)
{
if
(
!
remoteMachineTrailConfig
)
{
throw
new
Error
(
'
trial config parsed failed
'
);
throw
new
Error
(
'
trial config parsed failed
'
);
...
@@ -294,14 +297,14 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -294,14 +297,14 @@ class RemoteMachineTrainingService implements TrainingService {
deferred
.
resolve
();
deferred
.
resolve
();
}
}
}).
on
(
'
error
'
,
(
err
:
Error
)
=>
{
}).
on
(
'
error
'
,
(
err
:
Error
)
=>
{
// SSH connection error, reject with error message
// SSH connection error, reject with error message
deferred
.
reject
(
new
Error
(
err
.
message
));
deferred
.
reject
(
new
Error
(
err
.
message
));
}).
connect
({
}).
connect
({
host
:
rmMeta
.
ip
,
host
:
rmMeta
.
ip
,
port
:
rmMeta
.
port
,
port
:
rmMeta
.
port
,
username
:
rmMeta
.
username
,
username
:
rmMeta
.
username
,
password
:
rmMeta
.
passwd
password
:
rmMeta
.
passwd
});
});
});
});
return
deferred
.
promise
;
return
deferred
.
promise
;
...
@@ -312,16 +315,16 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -312,16 +315,16 @@ class RemoteMachineTrainingService implements TrainingService {
// TODO: Should we create the experiment's rootDir here?
// TODO: Should we create the experiment's rootDir here?
const
nniRootDir
:
string
=
'
/tmp/nni
'
;
const
nniRootDir
:
string
=
'
/tmp/nni
'
;
await
SSHClientUtility
.
remoteExeCommand
(
`mkdir -p
${
this
.
remoteExpRootDir
}
`
,
conn
);
await
SSHClientUtility
.
remoteExeCommand
(
`mkdir -p
${
this
.
remoteExpRootDir
}
`
,
conn
);
// Copy NNI scripts to remote experiment working directory
// Copy NNI scripts to remote experiment working directory
const
remoteScriptsDir
:
string
=
this
.
getRemoteScriptsPath
();
const
remoteScriptsDir
:
string
=
this
.
getRemoteScriptsPath
();
await
SSHClientUtility
.
remoteExeCommand
(
`mkdir -p
${
remoteScriptsDir
}
`
,
conn
);
await
SSHClientUtility
.
remoteExeCommand
(
`mkdir -p
${
remoteScriptsDir
}
`
,
conn
);
await
SSHClientUtility
.
copyDirectoryToRemote
(
'
./scripts
'
,
remoteScriptsDir
,
conn
);
await
SSHClientUtility
.
copyDirectoryToRemote
(
'
./scripts
'
,
remoteScriptsDir
,
conn
);
await
SSHClientUtility
.
remoteExeCommand
(
`chmod 777
${
nniRootDir
}
${
nniRootDir
}
/*
${
nniRootDir
}
/scripts/*`
,
conn
);
await
SSHClientUtility
.
remoteExeCommand
(
`chmod 777
${
nniRootDir
}
${
nniRootDir
}
/*
${
nniRootDir
}
/scripts/*`
,
conn
);
//Begin to execute gpu_metrics_collection scripts
//Begin to execute gpu_metrics_collection scripts
SSHClientUtility
.
remoteExeCommand
(
`cd
${
remoteScriptsDir
}
&& python3 gpu_metrics_collector.py`
,
conn
);
SSHClientUtility
.
remoteExeCommand
(
`cd
${
remoteScriptsDir
}
&& python3 gpu_metrics_collector.py`
,
conn
);
this
.
timer
.
subscribe
(
this
.
timer
.
subscribe
(
async
(
tick
:
number
)
=>
{
async
(
tick
:
number
)
=>
{
const
cmdresult
:
RemoteCommandResult
=
await
SSHClientUtility
.
remoteExeCommand
(
const
cmdresult
:
RemoteCommandResult
=
await
SSHClientUtility
.
remoteExeCommand
(
...
@@ -351,7 +354,7 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -351,7 +354,7 @@ class RemoteMachineTrainingService implements TrainingService {
this
.
log
.
error
(
errorMessage
);
this
.
log
.
error
(
errorMessage
);
deferred
.
reject
();
deferred
.
reject
();
throw
new
NNIError
(
NNIErrorNames
.
RESOURCE_NOT_AVAILABLE
,
errorMessage
);
throw
new
NNIError
(
NNIErrorNames
.
RESOURCE_NOT_AVAILABLE
,
errorMessage
);
}
else
if
(
rmScheduleResult
.
resultType
==
ScheduleResultType
.
SUCCEED
}
else
if
(
rmScheduleResult
.
resultType
==
=
ScheduleResultType
.
SUCCEED
&&
rmScheduleResult
.
scheduleInfo
!==
undefined
)
{
&&
rmScheduleResult
.
scheduleInfo
!==
undefined
)
{
const
rmScheduleInfo
:
RemoteMachineScheduleInfo
=
rmScheduleResult
.
scheduleInfo
;
const
rmScheduleInfo
:
RemoteMachineScheduleInfo
=
rmScheduleResult
.
scheduleInfo
;
const
trialWorkingFolder
:
string
=
path
.
join
(
this
.
remoteExpRootDir
,
'
trials
'
,
trialJobId
);
const
trialWorkingFolder
:
string
=
path
.
join
(
this
.
remoteExpRootDir
,
'
trials
'
,
trialJobId
);
...
@@ -364,11 +367,11 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -364,11 +367,11 @@ class RemoteMachineTrainingService implements TrainingService {
trialJobDetail
.
rmMeta
=
rmScheduleInfo
.
rmMeta
;
trialJobDetail
.
rmMeta
=
rmScheduleInfo
.
rmMeta
;
deferred
.
resolve
(
true
);
deferred
.
resolve
(
true
);
}
else
if
(
rmScheduleResult
.
resultType
==
ScheduleResultType
.
TMP_NO_AVAILABLE_GPU
)
{
}
else
if
(
rmScheduleResult
.
resultType
==
=
ScheduleResultType
.
TMP_NO_AVAILABLE_GPU
)
{
this
.
log
.
info
(
`Right now no available GPU can be allocated for trial
${
trialJobId
}
, will try to schedule later`
);
this
.
log
.
info
(
`Right now no available GPU can be allocated for trial
${
trialJobId
}
, will try to schedule later`
);
deferred
.
resolve
(
false
);
deferred
.
resolve
(
false
);
}
else
{
}
else
{
deferred
.
reject
(
'
Invalid schedule resutl type:
'
+
rmScheduleResult
.
resultType
);
deferred
.
reject
(
`
Invalid schedule resutl type:
${
rmScheduleResult
.
resultType
}
`
);
}
}
return
deferred
.
promise
;
return
deferred
.
promise
;
...
@@ -394,7 +397,7 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -394,7 +397,7 @@ class RemoteMachineTrainingService implements TrainingService {
trialWorkingFolder
,
trialWorkingFolder
,
trialJobId
,
trialJobId
,
path
.
join
(
trialWorkingFolder
,
'
.nni
'
,
'
jobpid
'
),
path
.
join
(
trialWorkingFolder
,
'
.nni
'
,
'
jobpid
'
),
// Set CUDA_VISIBLE_DEVICES environment variable based on cuda_visible_device
// Set CUDA_VISIBLE_DEVICES environment variable based on cuda_visible_device
// If no valid cuda_visible_device is defined, set CUDA_VISIBLE_DEVICES to empty string to hide GPU device
// If no valid cuda_visible_device is defined, set CUDA_VISIBLE_DEVICES to empty string to hide GPU device
(
typeof
cuda_visible_device
===
'
string
'
&&
cuda_visible_device
.
length
>
0
)
?
(
typeof
cuda_visible_device
===
'
string
'
&&
cuda_visible_device
.
length
>
0
)
?
`CUDA_VISIBLE_DEVICES=
${
cuda_visible_device
}
`
:
`CUDA_VISIBLE_DEVICES=" " `
,
`CUDA_VISIBLE_DEVICES=
${
cuda_visible_device
}
`
:
`CUDA_VISIBLE_DEVICES=" " `
,
...
...
src/nni_manager/training_service/remote_machine/sshClientUtility.ts
View file @
61d47a4d
...
@@ -19,14 +19,16 @@
...
@@ -19,14 +19,16 @@
'
use strict
'
;
'
use strict
'
;
import
*
as
assert
from
'
assert
'
;
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
path
from
'
path
'
;
import
*
as
path
from
'
path
'
;
import
*
as
os
from
'
os
'
;
import
{
Client
,
ClientChannel
,
SFTPWrapper
}
from
'
ssh2
'
;
import
{
Client
,
ClientChannel
,
SFTPWrapper
}
from
'
ssh2
'
;
import
*
as
stream
from
"
stream
"
;
import
*
as
stream
from
'
stream
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
NNIError
,
NNIErrorNames
}
from
'
../../common/errors
'
;
import
{
NNIError
,
NNIErrorNames
}
from
'
../../common/errors
'
;
import
{
getExperimentRootDir
}
from
'
../../common/utils
'
;
import
{
getLogger
}
from
'
../../common/log
'
;
import
{
uniqueString
}
from
'
../../common/utils
'
;
import
{
RemoteCommandResult
}
from
'
./remoteMachineData
'
;
import
{
RemoteCommandResult
}
from
'
./remoteMachineData
'
;
/**
/**
...
@@ -43,17 +45,18 @@ export namespace SSHClientUtility {
...
@@ -43,17 +45,18 @@ export namespace SSHClientUtility {
*/
*/
export
async
function
copyDirectoryToRemote
(
localDirectory
:
string
,
remoteDirectory
:
string
,
sshClient
:
Client
)
:
Promise
<
void
>
{
export
async
function
copyDirectoryToRemote
(
localDirectory
:
string
,
remoteDirectory
:
string
,
sshClient
:
Client
)
:
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
localCompressedDir
:
string
=
path
.
join
(
getExperimentRootDir
(),
'
directory.tar.gz
'
);
const
tmpTarName
:
string
=
`
${
uniqueString
(
10
)}
.tar.gz`
;
const
remoteCompressedDir
:
string
=
path
.
join
(
remoteDirectory
,
'
directory.tar.gz
'
);
const
localTarPath
:
string
=
path
.
join
(
os
.
tmpdir
(),
tmpTarName
);
const
remoteTarPath
:
string
=
path
.
join
(
os
.
tmpdir
(),
tmpTarName
);
// Compress files in local directory to experiment root directory
// Compress files in local directory to experiment root directory
await
cpp
.
exec
(
`tar -czf
${
local
CompressedDir
}
-C
${
localDirectory
}
.`
);
await
cpp
.
exec
(
`tar -czf
${
local
TarPath
}
-C
${
localDirectory
}
.`
);
// Copy the compressed file to remoteDirectory and delete it
// Copy the compressed file to remoteDirectory and delete it
await
copyFileToRemote
(
local
CompressedDir
,
remoteCompressedDir
,
sshClient
);
await
copyFileToRemote
(
local
TarPath
,
remoteTarPath
,
sshClient
);
await
cpp
.
exec
(
`rm
${
local
CompressedDir
}
`
);
await
cpp
.
exec
(
`rm
${
local
TarPath
}
`
);
// Decompress the remote compressed file in and delete it
// Decompress the remote compressed file in and delete it
await
remoteExeCommand
(
`tar -oxzf
${
remote
CompressedDir
}
-C
${
remoteDirectory
}
`
,
sshClient
);
await
remoteExeCommand
(
`tar -oxzf
${
remote
TarPath
}
-C
${
remoteDirectory
}
`
,
sshClient
);
await
remoteExeCommand
(
`rm
${
remote
CompressedDir
}
`
,
sshClient
);
await
remoteExeCommand
(
`rm
${
remote
TarPath
}
`
,
sshClient
);
deferred
.
resolve
();
deferred
.
resolve
();
return
deferred
.
promise
;
return
deferred
.
promise
;
...
@@ -65,18 +68,23 @@ export namespace SSHClientUtility {
...
@@ -65,18 +68,23 @@ export namespace SSHClientUtility {
* @param remoteFilePath the target path in remote machine
* @param remoteFilePath the target path in remote machine
* @param sshClient SSH Client
* @param sshClient SSH Client
*/
*/
export
function
copyFileToRemote
(
localFilePath
:
string
,
remoteFilePath
:
string
,
sshClient
:
Client
)
:
Promise
<
string
>
{
export
function
copyFileToRemote
(
localFilePath
:
string
,
remoteFilePath
:
string
,
sshClient
:
Client
)
:
Promise
<
boolean
>
{
const
deferred
:
Deferred
<
string
>
=
new
Deferred
<
string
>
();
assert
(
sshClient
!==
undefined
);
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
sshClient
.
sftp
((
err
:
Error
,
sftp
:
SFTPWrapper
)
=>
{
sshClient
.
sftp
((
err
:
Error
,
sftp
:
SFTPWrapper
)
=>
{
if
(
err
)
{
if
(
err
)
{
deferred
.
reject
();
getLogger
().
error
(
`copyFileToRemote:
${
err
.
message
}
,
${
localFilePath
}
,
${
remoteFilePath
}
`
);
deferred
.
reject
(
err
);
return
;
}
}
assert
(
sftp
!==
undefined
);
sftp
.
fastPut
(
localFilePath
,
remoteFilePath
,
(
fastPutErr
:
Error
)
=>
{
sftp
.
fastPut
(
localFilePath
,
remoteFilePath
,
(
fastPutErr
:
Error
)
=>
{
sftp
.
end
();
sftp
.
end
();
if
(
fastPutErr
)
{
if
(
fastPutErr
)
{
deferred
.
reject
();
deferred
.
reject
(
fastPutErr
);
}
else
{
}
else
{
deferred
.
resolve
(
'
success
'
);
deferred
.
resolve
(
true
);
}
}
});
});
});
});
...
@@ -97,14 +105,16 @@ export namespace SSHClientUtility {
...
@@ -97,14 +105,16 @@ export namespace SSHClientUtility {
client
.
exec
(
command
,
(
err
:
Error
,
channel
:
ClientChannel
)
=>
{
client
.
exec
(
command
,
(
err
:
Error
,
channel
:
ClientChannel
)
=>
{
if
(
err
)
{
if
(
err
)
{
getLogger
().
error
(
`remoteExeCommand:
${
err
.
message
}
`
);
deferred
.
reject
(
err
);
deferred
.
reject
(
err
);
return
;
}
}
channel
.
on
(
'
data
'
,
function
(
data
:
any
,
dataStderr
:
any
)
{
channel
.
on
(
'
data
'
,
(
data
:
any
,
dataStderr
:
any
)
=>
{
if
(
dataStderr
)
{
if
(
dataStderr
)
{
stderr
+=
data
.
toString
();
stderr
+=
data
.
toString
();
}
}
else
{
else
{
stdout
+=
data
.
toString
();
stdout
+=
data
.
toString
();
}
}
}).
on
(
'
exit
'
,
(
code
,
signal
)
=>
{
}).
on
(
'
exit
'
,
(
code
,
signal
)
=>
{
...
@@ -124,7 +134,10 @@ export namespace SSHClientUtility {
...
@@ -124,7 +134,10 @@ export namespace SSHClientUtility {
const
deferred
:
Deferred
<
string
>
=
new
Deferred
<
string
>
();
const
deferred
:
Deferred
<
string
>
=
new
Deferred
<
string
>
();
sshClient
.
sftp
((
err
:
Error
,
sftp
:
SFTPWrapper
)
=>
{
sshClient
.
sftp
((
err
:
Error
,
sftp
:
SFTPWrapper
)
=>
{
if
(
err
)
{
if
(
err
)
{
getLogger
().
error
(
`getRemoteFileContent:
${
err
.
message
}
`
);
deferred
.
reject
(
new
Error
(
`SFTP error:
${
err
.
message
}
`
));
deferred
.
reject
(
new
Error
(
`SFTP error:
${
err
.
message
}
`
));
return
;
}
}
try
{
try
{
const
sftpStream
:
stream
.
Readable
=
sftp
.
createReadStream
(
filePath
);
const
sftpStream
:
stream
.
Readable
=
sftp
.
createReadStream
(
filePath
);
...
@@ -133,11 +146,16 @@ export namespace SSHClientUtility {
...
@@ -133,11 +146,16 @@ export namespace SSHClientUtility {
sftpStream
.
on
(
'
data
'
,
(
data
:
Buffer
|
string
)
=>
{
sftpStream
.
on
(
'
data
'
,
(
data
:
Buffer
|
string
)
=>
{
dataBuffer
+=
data
;
dataBuffer
+=
data
;
}).
on
(
'
error
'
,
(
streamErr
:
Error
)
=>
{
}).
on
(
'
error
'
,
(
streamErr
:
Error
)
=>
{
sftp
.
end
();
deferred
.
reject
(
new
NNIError
(
NNIErrorNames
.
NOT_FOUND
,
streamErr
.
message
));
deferred
.
reject
(
new
NNIError
(
NNIErrorNames
.
NOT_FOUND
,
streamErr
.
message
));
}).
on
(
'
end
'
,
()
=>
{
}).
on
(
'
end
'
,
()
=>
{
// sftp connection need to be released manually once operation is done
sftp
.
end
();
deferred
.
resolve
(
dataBuffer
);
deferred
.
resolve
(
dataBuffer
);
});
});
}
catch
(
error
)
{
}
catch
(
error
)
{
getLogger
().
error
(
`getRemoteFileContent:
${
error
.
message
}
`
);
sftp
.
end
();
deferred
.
reject
(
new
Error
(
`SFTP error:
${
error
.
message
}
`
));
deferred
.
reject
(
new
Error
(
`SFTP error:
${
error
.
message
}
`
));
}
}
});
});
...
...
src/nni_manager/training_service/test/remoteMachineTrainingService.test.ts
View file @
61d47a4d
...
@@ -27,7 +27,7 @@ import * as tmp from 'tmp';
...
@@ -27,7 +27,7 @@ import * as tmp from 'tmp';
import
*
as
component
from
'
../../common/component
'
;
import
*
as
component
from
'
../../common/component
'
;
import
{
TrialJobApplicationForm
,
TrialJobDetail
,
TrainingService
}
from
'
../../common/trainingService
'
;
import
{
TrialJobApplicationForm
,
TrialJobDetail
,
TrainingService
}
from
'
../../common/trainingService
'
;
import
{
cleanupUnitTest
,
delay
,
prepareUnitTest
}
from
'
../../common/utils
'
;
import
{
cleanupUnitTest
,
delay
,
prepareUnitTest
}
from
'
../../common/utils
'
;
import
{
RemoteMachine
MetadataKey
}
from
'
../
remote_machine/remoteMachineData
'
;
import
{
TrialConfig
MetadataKey
}
from
'
../
common/trialConfigMetadataKey
'
;
import
{
RemoteMachineTrainingService
}
from
'
../remote_machine/remoteMachineTrainingService
'
;
import
{
RemoteMachineTrainingService
}
from
'
../remote_machine/remoteMachineTrainingService
'
;
// copy mockedTrail.py to local folder
// copy mockedTrail.py to local folder
...
@@ -95,9 +95,9 @@ describe('Unit Test for RemoteMachineTrainingService', () => {
...
@@ -95,9 +95,9 @@ describe('Unit Test for RemoteMachineTrainingService', () => {
if
(
skip
)
{
if
(
skip
)
{
return
;
return
;
}
}
await
remoteMachineTrainingService
.
setClusterMetadata
(
RemoteMachine
MetadataKey
.
MACHINE_LIST
,
machineList
);
await
remoteMachineTrainingService
.
setClusterMetadata
(
TrialConfig
MetadataKey
.
MACHINE_LIST
,
machineList
);
await
remoteMachineTrainingService
.
setClusterMetadata
(
await
remoteMachineTrainingService
.
setClusterMetadata
(
RemoteMachine
MetadataKey
.
TRIAL_CONFIG
,
`{"command":"sleep 1h && echo ","codeDir":"
${
localCodeDir
}
","gpuNum":1}`
);
TrialConfig
MetadataKey
.
TRIAL_CONFIG
,
`{"command":"sleep 1h && echo ","codeDir":"
${
localCodeDir
}
","gpuNum":1}`
);
const
form
:
TrialJobApplicationForm
=
{
const
form
:
TrialJobApplicationForm
=
{
jobType
:
'
TRIAL
'
,
jobType
:
'
TRIAL
'
,
hyperParameters
:
'
mock hyperparameters
'
hyperParameters
:
'
mock hyperparameters
'
...
@@ -126,11 +126,11 @@ describe('Unit Test for RemoteMachineTrainingService', () => {
...
@@ -126,11 +126,11 @@ describe('Unit Test for RemoteMachineTrainingService', () => {
return
;
return
;
}
}
// set machine list'
// set machine list'
await
remoteMachineTrainingService
.
setClusterMetadata
(
RemoteMachine
MetadataKey
.
MACHINE_LIST
,
machineList
);
await
remoteMachineTrainingService
.
setClusterMetadata
(
TrialConfig
MetadataKey
.
MACHINE_LIST
,
machineList
);
// set meta data
// set meta data
const
trialConfig
:
string
=
`{\"command\":\"python3 mockedTrial.py\", \"codeDir\":\"
${
localCodeDir
}
\",\"gpuNum\":0}`
const
trialConfig
:
string
=
`{\"command\":\"python3 mockedTrial.py\", \"codeDir\":\"
${
localCodeDir
}
\",\"gpuNum\":0}`
await
remoteMachineTrainingService
.
setClusterMetadata
(
RemoteMachine
MetadataKey
.
TRIAL_CONFIG
,
trialConfig
);
await
remoteMachineTrainingService
.
setClusterMetadata
(
TrialConfig
MetadataKey
.
TRIAL_CONFIG
,
trialConfig
);
// submit job
// submit job
const
form
:
TrialJobApplicationForm
=
{
const
form
:
TrialJobApplicationForm
=
{
...
...
src/nni_manager/training_service/test/sshClientUtility.test.ts
0 → 100644
View file @
61d47a4d
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'
use strict
'
;
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
fs
from
'
fs
'
;
import
{
Client
}
from
'
ssh2
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
SSHClientUtility
}
from
'
../remote_machine/sshClientUtility
'
;
const
LOCALFILE
:
string
=
'
/tmp/sshclientUTData
'
;
const
REMOTEFILE
:
string
=
'
/tmp/sshclientUTData
'
;
/**
 * Upload LOCALFILE to REMOTEFILE through a new SFTP session opened on `conn`.
 *
 * @param conn an ssh2 client to open the SFTP session on
 * @returns a promise that resolves when fastPut reports success and rejects
 *          with the error reported by either the SFTP session request or fastPut
 */
async function copyFile(conn: Client): Promise<void> {
    const uploadDone: Deferred<void> = new Deferred<void>();

    conn.sftp((err, sftp) => {
        if (err) {
            // Could not open the SFTP session at all; nothing to clean up.
            uploadDone.reject(err);

            return;
        }

        const onPutFinished = (fastPutErr: Error): void => {
            // Close the SFTP session before settling the promise, whatever the outcome.
            sftp.end();
            if (!fastPutErr) {
                uploadDone.resolve();
            } else {
                uploadDone.reject(fastPutErr);
            }
        };

        sftp.fastPut(LOCALFILE, REMOTEFILE, onPutFinished);
    });

    return uploadDone.promise;
}
/**
 * Call SSHClientUtility.copyFileToRemote 500 times in sequence on the same
 * connection, logging the iteration index before each call.
 *
 * @param conn the ssh2 client passed to every copyFileToRemote call
 */
async function copyFileToRemoteLoop(conn: Client): Promise<void> {
    let round: number = 0;
    while (round < 500) {
        console.log(round);
        // Each upload is awaited before the next one starts.
        await SSHClientUtility.copyFileToRemote(LOCALFILE, REMOTEFILE, conn);
        round++;
    }
}
/**
 * Run `ls` through SSHClientUtility.remoteExeCommand 500 times in sequence on
 * the same connection, logging the iteration index before each call.
 *
 * @param conn the ssh2 client passed to every remoteExeCommand call
 */
async function remoteExeCommandLoop(conn: Client): Promise<void> {
    let round: number = 0;
    while (round < 500) {
        console.log(round);
        // Each command is awaited before the next one starts.
        await SSHClientUtility.remoteExeCommand('ls', conn);
        round++;
    }
}
/**
 * Read REMOTEFILE through SSHClientUtility.getRemoteFileContent 500 times in
 * sequence on the same connection, logging the iteration index before each call.
 *
 * @param conn the ssh2 client passed to every getRemoteFileContent call
 */
async function getRemoteFileContentLoop(conn: Client): Promise<void> {
    let round: number = 0;
    while (round < 500) {
        console.log(round);
        // Each read is awaited before the next one starts; the content is discarded.
        await SSHClientUtility.getRemoteFileContent(REMOTEFILE, conn);
        round++;
    }
}
/**
 * Stress test for SSHClientUtility: after one plain SFTP upload, it runs three
 * upload loops, one remote-command loop and one remote-read loop concurrently
 * on a single SSH connection.
 *
 * Connection settings are read from '../../.vscode/rminfo.json'.
 */
describe('sshClientUtility test', () => {
    // NOTE(review): `skip` starts as true and is never set to false, so the test
    // body below is currently always skipped, even when rminfo.json is readable.
    // Kept as-is because this may be a deliberate disable; if the intent is
    // "skip only when rminfo.json is missing", initialise this to false.
    let skip: boolean = true;
    let rmMeta: any;
    try {
        rmMeta = JSON.parse(fs.readFileSync('../../.vscode/rminfo.json', 'utf8'));
    } catch (err) {
        skip = true;
    }

    // Create the local fixture file that the upload helpers send to the remote side.
    before(async () => {
        await cpp.exec(`echo '1234' > ${LOCALFILE}`);
    });

    // Remove the local fixture file.
    after(() => {
        fs.unlinkSync(LOCALFILE);
    });

    it('Test SSHClientUtility', (done) => {
        if (skip) {
            done();

            return;
        }

        const conn: Client = new Client();

        // Settle the test exactly once and always close the SSH connection, so a
        // failure is reported to mocha instead of leaving the test hanging.
        let finished: boolean = false;
        const finish = (err?: any): void => {
            if (finished) {
                return;
            }
            finished = true;
            conn.end();
            done(err);
        };

        conn.on('ready', async () => {
            try {
                await copyFile(conn);
                await Promise.all([
                    copyFileToRemoteLoop(conn),
                    copyFileToRemoteLoop(conn),
                    copyFileToRemoteLoop(conn),
                    remoteExeCommandLoop(conn),
                    getRemoteFileContentLoop(conn)
                ]);
                finish();
            } catch (err) {
                // A rejected upload/command/read would otherwise be an unhandled rejection.
                finish(err);
            }
        }).on('error', finish) // without a listener, a connection error is thrown as an uncaught exception
          .connect(rmMeta);
    });
});
src/nni_manager/types/node-nvidia-smi/index.d.ts
View file @
61d47a4d
...
@@ -2,17 +2,19 @@ declare module 'node-nvidia-smi' {
...
@@ -2,17 +2,19 @@ declare module 'node-nvidia-smi' {
function
smi
(
callback
:
(
error
:
Error
,
data
:
smi
.
GPUInfo
)
=>
void
):
void
;
function
smi
(
callback
:
(
error
:
Error
,
data
:
smi
.
GPUInfo
)
=>
void
):
void
;
namespace
smi
{
namespace
smi
{
interface
EmbededGPUSummary
{
minor_number
:
string
;
utilization
:
{
gpu_util
:
string
;
memory_util
:
string
;
};
process
:
string
|
object
;
}
interface
GPUInfo
{
interface
GPUInfo
{
nvidia_smi_log
:
{
nvidia_smi_log
:
{
attached_gpus
:
string
;
attached_gpus
:
string
;
gpu
:
{
gpu
:
EmbededGPUSummary
[]
|
EmbededGPUSummary
;
minor_number
:
string
;
utilization
:
{
gpu_util
:
string
;
memory_util
:
string
;
};
process
:
string
|
object
;
}[];
};
};
}
}
}
}
...
...
src/sdk/pynni/nni/protocol.py
View file @
61d47a4d
...
@@ -18,7 +18,7 @@
...
@@ -18,7 +18,7 @@
# OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# ==================================================================================================
# ==================================================================================================
import
logging
from
enum
import
Enum
from
enum
import
Enum
...
@@ -64,8 +64,14 @@ def receive():
...
@@ -64,8 +64,14 @@ def receive():
Returns a tuple of command (CommandType) and payload (str)
Returns a tuple of command (CommandType) and payload (str)
"""
"""
header
=
_in_file
.
read
(
8
)
header
=
_in_file
.
read
(
8
)
logging
.
getLogger
(
__name__
).
debug
(
'Received command, header: [%s]'
%
header
)
if
header
is
None
or
len
(
header
)
<
8
:
# Pipe EOF encountered
logging
.
getLogger
(
__name__
).
debug
(
'Pipe EOF encountered'
)
return
None
,
None
length
=
int
(
header
[
2
:])
length
=
int
(
header
[
2
:])
data
=
_in_file
.
read
(
length
)
data
=
_in_file
.
read
(
length
)
command
=
CommandType
(
header
[:
2
])
command
=
CommandType
(
header
[:
2
])
data
=
data
.
decode
(
'utf8'
)
data
=
data
.
decode
(
'utf8'
)
logging
.
getLogger
(
__name__
).
debug
(
'Received command, data: [%s]'
%
data
)
return
command
,
data
return
command
,
data
src/sdk/pynni/nni/tuner.py
View file @
61d47a4d
...
@@ -127,6 +127,8 @@ def _handle_request(tuner):
...
@@ -127,6 +127,8 @@ def _handle_request(tuner):
_logger
.
debug
(
'waiting receive_message'
)
_logger
.
debug
(
'waiting receive_message'
)
command
,
data
=
receive
()
command
,
data
=
receive
()
if
command
is
None
:
return
False
_logger
.
debug
(
command
)
_logger
.
debug
(
command
)
_logger
.
debug
(
data
)
_logger
.
debug
(
data
)
...
...
src/webui/README.md
View file @
61d47a4d
...
@@ -18,7 +18,6 @@ Click the tab "Overview".
...
@@ -18,7 +18,6 @@ Click the tab "Overview".
*
See good performance trial.
*
See good performance trial.
*
See search_space json.
*
See search_space json.
*
See complete trial cdf graph.
### View job accuracy
### View job accuracy
...
@@ -29,7 +28,7 @@ Click the tab "Optimization Progress" to see the point graph of all trials. Hove
...
@@ -29,7 +28,7 @@ Click the tab "Optimization Progress" to see the point graph of all trials. Hove
Click the tab "Hyper Parameter" to see the parallel graph.
Click the tab "Hyper Parameter" to see the parallel graph.
*
You can select the percentage to cut down some lines.
*
You can select the percentage to cut down some lines.
*
Choose two ax
e
s to swap its positions
*
Choose two ax
i
s to swap its positions
### View trial status
### View trial status
...
@@ -39,11 +38,10 @@ Click the tab "Trial Status" to see the status of the all trials. Specifically:
...
@@ -39,11 +38,10 @@ Click the tab "Trial Status" to see the status of the all trials. Specifically:
*
Trial detail: trial's id, trial's duration, start time, end time, status and accuracy.
*
Trial detail: trial's id, trial's duration, start time, end time, status and accuracy.
*
Kill: you can kill a job that status is running.
*
Kill: you can kill a job that status is running.
*
Tensor: you can see a job in the tensorflow graph, it will link to the Tensorboard page.
*
Tensor: you can see a job in the tensorflow graph, it will link to the Tensorboard page.
*
Log: click the button, you can see the log about NNI and pai.
### Control
### Control
Click the tab "Control" to add a new trial or update the search_space file.
Click the tab "Control" to add a new trial or update the search_space file
and some experiment parameters
.
### View Tensorboard Graph
### View Tensorboard Graph
...
...
src/webui/public/index.html
View file @
61d47a4d
...
@@ -4,22 +4,6 @@
...
@@ -4,22 +4,6 @@
<head>
<head>
<meta
charset=
"utf-8"
>
<meta
charset=
"utf-8"
>
<meta
name=
"viewport"
content=
"width=device-width, initial-scale=1, shrink-to-fit=no"
>
<meta
name=
"viewport"
content=
"width=device-width, initial-scale=1, shrink-to-fit=no"
>
<meta
name=
"theme-color"
content=
"#000000"
>
<!--
manifest.json provides metadata used when your web app is added to the
homescreen on Android. See https://developers.google.com/web/fundamentals/engage-and-retain/web-app-manifest/
-->
<link
rel=
"manifest"
href=
"%PUBLIC_URL%/manifest.json"
>
<link
rel=
"shortcut icon"
href=
"%PUBLIC_URL%/icon.jpg"
>
<!--
Notice the use of %PUBLIC_URL% in the tags above.
It will be replaced with the URL of the `public` folder during the build.
Only files inside the `public` folder can be referenced from the HTML.
Unlike "/favicon.ico" or "favicon.ico", "%PUBLIC_URL%/favicon.ico" will
work correctly both with client-side routing and a non-root public URL.
Learn how to configure a non-root public URL by running `npm run build`.
-->
<title>
Neural Network Intelligence
</title>
<title>
Neural Network Intelligence
</title>
</head>
</head>
...
...
src/webui/src/App.css
View file @
61d47a4d
.header_title
{
.header_title
{
width
:
100%
;
width
:
100%
;
height
:
60px
;
height
:
60px
;
line-height
:
60px
;
font-size
:
24px
;
font-family
:
'Segoe UI'
,
Tahoma
,
Geneva
,
Verdana
,
sans-serif
;
color
:
white
;
background-color
:
rgb
(
60
,
141
,
188
)
;
background-color
:
rgb
(
60
,
141
,
188
)
;
user-select
:
none
;
user-select
:
none
;
text-align
:
center
;
text-align
:
center
;
...
...
src/webui/src/App.tsx
View file @
61d47a4d
...
@@ -6,7 +6,8 @@ class App extends React.Component<{}, {}> {
...
@@ -6,7 +6,8 @@ class App extends React.Component<{}, {}> {
render
()
{
render
()
{
return
(
return
(
<
div
className
=
"App"
>
<
div
className
=
"App"
>
<
header
className
=
"header_title"
><
img
src
=
{
require
(
'
./logo.jpg
'
)
}
alt
=
""
/></
header
>
{
/* <header className="header_title"><img src={require('./logo.jpg')} alt=""/></header> */
}
<
header
className
=
"header_title"
>
Neural Network Intelligence
</
header
>
<
div
className
=
"content"
>
<
div
className
=
"content"
>
<
SlideBar
/>
<
SlideBar
/>
<
div
className
=
"right"
>
{
this
.
props
.
children
}
</
div
>
<
div
className
=
"right"
>
{
this
.
props
.
children
}
</
div
>
...
...
src/webui/src/components/Control.tsx
View file @
61d47a4d
...
@@ -203,7 +203,7 @@ class Control extends React.Component<{}, ControlState> {
...
@@ -203,7 +203,7 @@ class Control extends React.Component<{}, ControlState> {
// update trial number parameters
// update trial number parameters
trialParameterMess
=
(
exper
:
Experiments
,
str
:
string
)
=>
{
trialParameterMess
=
(
exper
:
Experiments
,
str
:
string
)
=>
{
this
.
getUpdateExample
();
axios
(
`
${
MANAGER_IP
}
/experiment`
,
{
axios
(
`
${
MANAGER_IP
}
/experiment`
,
{
method
:
'
PUT
'
,
method
:
'
PUT
'
,
headers
:
{
headers
:
{
...
@@ -216,6 +216,7 @@ class Control extends React.Component<{}, ControlState> {
...
@@ -216,6 +216,7 @@ class Control extends React.Component<{}, ControlState> {
}).
then
(
res
=>
{
}).
then
(
res
=>
{
if
(
res
.
status
===
200
)
{
if
(
res
.
status
===
200
)
{
message
.
success
(
`Update
${
str
.
toLocaleLowerCase
()}
successfully`
);
message
.
success
(
`Update
${
str
.
toLocaleLowerCase
()}
successfully`
);
this
.
getUpdateExample
();
}
else
{
}
else
{
message
.
error
(
`Update
${
str
.
toLocaleLowerCase
()}
failed`
);
message
.
error
(
`Update
${
str
.
toLocaleLowerCase
()}
failed`
);
}
}
...
@@ -284,8 +285,8 @@ class Control extends React.Component<{}, ControlState> {
...
@@ -284,8 +285,8 @@ class Control extends React.Component<{}, ControlState> {
}
}
userUpdateSeaspace
=
()
=>
{
userUpdateSeaspace
=
()
=>
{
this
.
updateSearchLoad
();
this
.
updateSearchLoad
();
this
.
getUpdateExample
();
const
{
updateSearch
}
=
this
.
state
;
const
{
updateSearch
}
=
this
.
state
;
if
(
updateSearch
!==
''
||
updateSearch
!==
null
)
{
if
(
updateSearch
!==
''
||
updateSearch
!==
null
)
{
const
{
experiment
}
=
this
.
state
;
const
{
experiment
}
=
this
.
state
;
...
...
src/webui/src/components/Sessionpro.tsx
View file @
61d47a4d
...
@@ -2,12 +2,12 @@ import * as React from 'react';
...
@@ -2,12 +2,12 @@ import * as React from 'react';
import
axios
from
'
axios
'
;
import
axios
from
'
axios
'
;
import
{
Table
,
Select
,
Row
,
Col
,
Icon
}
from
'
antd
'
;
import
{
Table
,
Select
,
Row
,
Col
,
Icon
}
from
'
antd
'
;
import
{
MANAGER_IP
,
overviewItem
,
roundNum
}
from
'
../const
'
;
import
{
MANAGER_IP
,
overviewItem
,
roundNum
}
from
'
../const
'
;
import
ReactEcharts
from
'
echarts-for-react
'
;
//
import ReactEcharts from 'echarts-for-react';
const
Option
=
Select
.
Option
;
const
Option
=
Select
.
Option
;
import
JSONTree
from
'
react-json-tree
'
;
import
JSONTree
from
'
react-json-tree
'
;
require
(
'
echarts/lib/chart/line
'
);
//
require('echarts/lib/chart/line');
require
(
'
echarts/lib/component/tooltip
'
);
//
require('echarts/lib/component/tooltip');
require
(
'
echarts/lib/component/title
'
);
//
require('echarts/lib/component/title');
require
(
'
../style/sessionpro.css
'
);
require
(
'
../style/sessionpro.css
'
);
interface
TableObj
{
interface
TableObj
{
...
@@ -266,26 +266,26 @@ class Sessionpro extends React.Component<{}, SessionState> {
...
@@ -266,26 +266,26 @@ class Sessionpro extends React.Component<{}, SessionState> {
});
});
}
}
// draw CDF
// draw CDF
const
{
trialRun
}
=
this
.
state
;
//
const { trialRun } = this.state;
if
(
this
.
_isMounted
)
{
//
if (this._isMounted) {
this
.
setState
({
//
this.setState({
option
:
this
.
getOption
(
trialRun
)
//
option: this.getOption(trialRun)
});
//
});
}
//
}
// CDF graph 'No data' judge
// CDF graph 'No data' judge
if
(
trialRun
.
length
===
0
)
{
//
if (trialRun.length === 0) {
if
(
this
.
_isMounted
)
{
//
if (this._isMounted) {
this
.
setState
({
//
this.setState({
noData
:
'
No data
'
//
noData: 'No data'
});
//
});
}
//
}
}
else
{
//
} else {
if
(
this
.
_isMounted
)
{
//
if (this._isMounted) {
this
.
setState
({
//
this.setState({
noData
:
''
//
noData: ''
});
//
});
}
//
}
}
//
}
}
}
});
});
}
}
...
@@ -372,7 +372,8 @@ class Sessionpro extends React.Component<{}, SessionState> {
...
@@ -372,7 +372,8 @@ class Sessionpro extends React.Component<{}, SessionState> {
};
};
const
{
const
{
trialProfile
,
searchSpace
,
tunerAssessor
,
tableData
,
option
,
noData
trialProfile
,
searchSpace
,
tunerAssessor
,
tableData
,
// option, noData
}
=
this
.
state
;
}
=
this
.
state
;
let
running
;
let
running
;
if
(
trialProfile
.
endTime
===
'
not over
'
)
{
if
(
trialProfile
.
endTime
===
'
not over
'
)
{
...
@@ -500,13 +501,13 @@ class Sessionpro extends React.Component<{}, SessionState> {
...
@@ -500,13 +501,13 @@ class Sessionpro extends React.Component<{}, SessionState> {
scroll
=
{
{
x
:
'
100%
'
,
y
:
540
}
}
scroll
=
{
{
x
:
'
100%
'
,
y
:
540
}
}
/>
/>
</
div
>
</
div
>
<
div
className
=
"cdf"
>
{
/*
<div className="cdf">
<ReactEcharts
<ReactEcharts
option={option}
option={option}
style={{ height: 500, padding: '0px' }}
style={{ height: 500, padding: '0px' }}
/>
/>
<div className="addNodata">{noData}</div>
<div className="addNodata">{noData}</div>
</
div
>
</div>
*/
}
</
div
>
</
div
>
);
);
}
}
...
...
tools/nnicmd/constants.py
View file @
61d47a4d
...
@@ -43,7 +43,7 @@ WARNING_INFO = 'Waining: %s'
...
@@ -43,7 +43,7 @@ WARNING_INFO = 'Waining: %s'
EXPERIMENT_SUCCESS_INFO
=
'Start experiment success! The experiment id is %s, and the restful server post is %s.
\n
'
\
EXPERIMENT_SUCCESS_INFO
=
'Start experiment success! The experiment id is %s, and the restful server post is %s.
\n
'
\
'You can use these commands to get more information about this experiment:
\n
'
\
'You can use these commands to get more information about this experiment:
\n
'
\
' commands description
\n
'
\
' commands description
\n
'
\
'1. nnictl experiment
l
s
list all
of experiments
\n
'
\
'1. nnictl experiment s
how
show the information
of experiments
\n
'
\
'2. nnictl trial ls list all of trial jobs
\n
'
\
'2. nnictl trial ls list all of trial jobs
\n
'
\
'3. nnictl stop stop a experiment
\n
'
\
'3. nnictl stop stop a experiment
\n
'
\
'4. nnictl trial kill kill a trial job by id
\n
'
\
'4. nnictl trial kill kill a trial job by id
\n
'
\
...
...
tools/nnicmd/launcher.py
View file @
61d47a4d
...
@@ -54,13 +54,20 @@ def start_rest_server(manager, port, platform, mode, experiment_id=None):
...
@@ -54,13 +54,20 @@ def start_rest_server(manager, port, platform, mode, experiment_id=None):
process
=
Popen
(
cmds
,
stdout
=
stdout_file
,
stderr
=
stderr_file
)
process
=
Popen
(
cmds
,
stdout
=
stdout_file
,
stderr
=
stderr_file
)
return
process
return
process
def
set_
loc
al_config
(
experiment_config
,
port
):
def
set_
tri
al_config
(
experiment_config
,
port
):
'''
Call setClusterMetadata (rest PUT /parameters/cluster-metadata) to pass platform and machineList"
'''
'''
set trial configuration
'''
request_data
=
dict
()
request_data
=
dict
()
request_data
[
'codeDir'
]
=
experiment_config
[
'trial'
][
'trialCodeDir'
]
value_dict
=
dict
()
request_data
[
'command'
]
=
experiment_config
[
'trial'
][
'trialCommand'
]
value_dict
[
'command'
]
=
experiment_config
[
'trial'
][
'trialCommand'
]
value_dict
[
'codeDir'
]
=
experiment_config
[
'trial'
][
'trialCodeDir'
]
value_dict
[
'gpuNum'
]
=
experiment_config
[
'trial'
][
'trialGpuNum'
]
request_data
[
'trial_config'
]
=
value_dict
response
=
rest_put
(
cluster_metadata_url
(
port
),
json
.
dumps
(
request_data
),
20
)
response
=
rest_put
(
cluster_metadata_url
(
port
),
json
.
dumps
(
request_data
),
20
)
return
True
if
response
and
response
.
status_code
==
200
else
False
return
True
if
response
.
status_code
==
200
else
False
def
set_local_config
(
experiment_config
,
port
):
'''set local configuration'''
return
set_trial_config
(
experiment_config
,
port
)
def
set_remote_config
(
experiment_config
,
port
):
def
set_remote_config
(
experiment_config
,
port
):
'''Call setClusterMetadata to pass trial'''
'''Call setClusterMetadata to pass trial'''
...
@@ -72,14 +79,7 @@ def set_remote_config(experiment_config, port):
...
@@ -72,14 +79,7 @@ def set_remote_config(experiment_config, port):
return
False
return
False
#set trial_config
#set trial_config
request_data
=
dict
()
return
set_trial_config
(
experiment_config
,
port
)
value_dict
=
dict
()
value_dict
[
'command'
]
=
experiment_config
[
'trial'
][
'trialCommand'
]
value_dict
[
'codeDir'
]
=
experiment_config
[
'trial'
][
'trialCodeDir'
]
value_dict
[
'gpuNum'
]
=
experiment_config
[
'trial'
][
'trialGpuNum'
]
request_data
[
'trial_config'
]
=
value_dict
response
=
rest_put
(
cluster_metadata_url
(
port
),
json
.
dumps
(
request_data
),
20
)
return
True
if
response
.
status_code
==
200
else
False
def
set_experiment
(
experiment_config
,
mode
,
port
):
def
set_experiment
(
experiment_config
,
mode
,
port
):
'''Call startExperiment (rest POST /experiment) with yaml file content'''
'''Call startExperiment (rest POST /experiment) with yaml file content'''
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment