Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
252f36f8
Commit
252f36f8
authored
Aug 20, 2018
by
Deshui Yu
Browse files
NNI dogfood version 1
parent
781cea26
Changes
214
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
2713 additions
and
0 deletions
+2713
-0
src/nni_manager/core/test/assessor.py
src/nni_manager/core/test/assessor.py
+47
-0
src/nni_manager/core/test/dataStore.test.ts
src/nni_manager/core/test/dataStore.test.ts
+156
-0
src/nni_manager/core/test/dummy_assessor.py
src/nni_manager/core/test/dummy_assessor.py
+25
-0
src/nni_manager/core/test/hyperopt.py
src/nni_manager/core/test/hyperopt.py
+30
-0
src/nni_manager/core/test/ipcInterface.test.ts
src/nni_manager/core/test/ipcInterface.test.ts
+141
-0
src/nni_manager/core/test/ipcInterfaceTerminate.test.ts
src/nni_manager/core/test/ipcInterfaceTerminate.test.ts
+95
-0
src/nni_manager/core/test/mockedDatastore.ts
src/nni_manager/core/test/mockedDatastore.ts
+252
-0
src/nni_manager/core/test/mockedTrainingService.ts
src/nni_manager/core/test/mockedTrainingService.ts
+132
-0
src/nni_manager/core/test/nnimanager.test.ts
src/nni_manager/core/test/nnimanager.test.ts
+156
-0
src/nni_manager/core/test/sqlDatabase.test.ts
src/nni_manager/core/test/sqlDatabase.test.ts
+188
-0
src/nni_manager/core/trialJobs.ts
src/nni_manager/core/trialJobs.ts
+128
-0
src/nni_manager/main.ts
src/nni_manager/main.ts
+105
-0
src/nni_manager/package.json
src/nni_manager/package.json
+52
-0
src/nni_manager/rest_server/restHandler.ts
src/nni_manager/rest_server/restHandler.ts
+292
-0
src/nni_manager/rest_server/server.ts
src/nni_manager/rest_server/server.ts
+107
-0
src/nni_manager/rest_server/tensorboard.ts
src/nni_manager/rest_server/tensorboard.ts
+151
-0
src/nni_manager/rest_server/test/mockedNNIManager.ts
src/nni_manager/rest_server/test/mockedNNIManager.ts
+174
-0
src/nni_manager/rest_server/test/restserver.test.ts
src/nni_manager/rest_server/test/restserver.test.ts
+208
-0
src/nni_manager/scripts/gpu_metrics_collector.py
src/nni_manager/scripts/gpu_metrics_collector.py
+82
-0
src/nni_manager/scripts/metrics_reader.py
src/nni_manager/scripts/metrics_reader.py
+192
-0
No files found.
src/nni_manager/core/test/assessor.py
0 → 100644
View file @
252f36f8
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# Channels used to talk to the process that launched this script:
# file descriptor 3 is opened for binary reading and file descriptor 4 for
# binary writing. Both descriptors must already be open when the script starts.
_in_file = open(3, 'rb')
_out_file = open(4, 'wb')
def send(command, data):
    """Write one framed command to the output channel and flush it.

    The frame mirrors what ``receive`` parses: the command code, then the
    payload length as six zero-padded decimal digits, then the UTF-8
    encoded payload.

    Args:
        command: Command code; must encode to exactly two bytes.
        data: Payload text; its UTF-8 encoding must be at most 999999
            bytes long so that the length fits into six digits.

    Raises:
        ValueError: If the command code or the payload would break the
            fixed-width header. Nothing is written in that case.
    """
    command = command.encode('utf8')
    data = data.encode('utf8')
    # The header is fixed-width (2 + 6 bytes); reject anything that would
    # shift the frame boundaries for the reader.
    if len(command) != 2:
        raise ValueError('Command code must be exactly 2 bytes')
    if len(data) > 999999:
        raise ValueError('Command too long')
    msg = b'%b%06d%b' % (command, len(data), data)
    _out_file.write(msg)
    _out_file.flush()
def receive():
    """Read one framed command from the input channel.

    A frame starts with an 8-byte header: a 2-byte command code followed by
    six bytes holding the payload length in decimal. The payload of that
    many bytes follows.

    Returns:
        A ``(command, data)`` tuple, both decoded as UTF-8 text.
    """
    frame_header = _in_file.read(8)
    payload_size = int(frame_header[2:])
    code = str(frame_header[:2], 'utf8')
    payload = str(_in_file.read(payload_size), 'utf8')
    return code, payload
print
(
receive
())
send
(
'KI'
,
''
)
print
(
receive
())
send
(
'KI'
,
'hello'
)
send
(
'KI'
,
'世界'
)
src/nni_manager/core/test/dataStore.test.ts
0 → 100644
View file @
252f36f8
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'
use strict
'
;
import
{
expect
}
from
'
chai
'
;
import
{
Container
,
Scope
}
from
'
typescript-ioc
'
;
import
*
as
component
from
'
../../common/component
'
;
import
{
Database
,
DataStore
,
TrialJobInfo
}
from
'
../../common/datastore
'
;
import
{
setExperimentStartupInfo
}
from
'
../../common/experimentStartupInfo
'
;
import
{
ExperimentProfile
,
TrialJobStatistics
}
from
'
../../common/manager
'
;
import
{
TrialJobStatus
}
from
'
../../common/trainingService
'
;
import
{
cleanupUnitTest
,
prepareUnitTest
}
from
'
../../common/utils
'
;
import
{
NNIDataStore
}
from
'
../nniDataStore
'
;
import
{
SqlDB
}
from
'
../sqlDatabase
'
;
// Test suite for the DataStore resolved through the IoC container, with
// Database bound to SqlDB and DataStore bound to NNIDataStore.
describe('Unit test for dataStore', () => {
    let ds: DataStore;

    before(async () => {
        prepareUnitTest();
        // Register the concrete implementations as singletons before resolving DataStore.
        Container.bind(Database).to(SqlDB).scope(Scope.Singleton);
        Container.bind(DataStore).to(NNIDataStore).scope(Scope.Singleton);
        ds = component.get(DataStore);
        await ds.init();
    });

    after(() => {
        cleanupUnitTest();
    });

    // Looking up an id that was never stored is expected to yield undefined.
    it('test emtpy experiment profile', async () => {
        const result: ExperimentProfile = await ds.getExperimentProfile('abc');
        expect(result).to.equal(undefined, 'Should not get any profile');
    });

    it('test experiment profiles CRUD', async () => {
        const profile: ExperimentProfile = {
            params: {
                authorName: 'test1',
                experimentName: 'exp1',
                trialConcurrency: 2,
                maxExecDuration: 10,
                maxTrialNum: 5,
                searchSpace: `{
"dropout_rate": {
"_type": "uniform",
"_value": [0.1, 0.5]
},
"batch_size": {
"_type": "choice",
"_value": [50, 250, 500]
}
}`,
                tuner: {
                    tunerCommand: 'python3 tunner.py',
                    tunerCwd: '/tmp',
                    tunerCheckpointDirectory: '/tmp/cp',
                    tunerGpuNum: 0
                }
            },
            id: 'exp123',
            execDuration: 0,
            startTime: new Date(),
            endTime: new Date(),
            revision: 0
        }
        const id: string = profile.id;
        // Store revisions 0..4 of the same profile; the revision is bumped after each store.
        for (let i: number = 0; i < 5; i++) {
            await ds.storeExperimentProfile(profile);
            profile.revision += 1;
        }
        // The last revision that was actually stored is 4.
        const result: ExperimentProfile = await ds.getExperimentProfile(id);
        expect(result.revision).to.equal(4);
    });

    // Event fixture: two jobs ('111' and '222') walking through their states.
    const testEventRecords: {
        event: string;
        jobId: string;
        data?: string;
    }[] = [
        {
            event: 'WAITING',
            jobId: '111'
        },
        {
            event: 'WAITING',
            jobId: '222'
        },
        {
            event: 'RUNNING',
            jobId: '111'
        },
        {
            event: 'RUNNING',
            jobId: '222'
        },
        {
            event: 'SUCCEEDED',
            jobId: '111',
            data: 'lr: 0.001'
        },
        {
            event: 'FAILED',
            jobId: '222'
        }
    ];

    // Metric fixture: one periodical and one final metric, both for job '111'.
    // tslint:disable-next-line:no-any
    const metricsData: any = [
        {
            trial_job_id: '111',
            parameter_id: 'abc',
            type: 'PERIODICAL',
            value: 'acc: 0.88',
            timestamp: new Date()
        },
        {
            trial_job_id: '111',
            parameter_id: 'abc',
            type: 'FINAL',
            value: 'acc: 0.88',
            timestamp: new Date()
        }
    ];

    it('test trial job events store /query', async () => {
        for (const event of testEventRecords) {
            await ds.storeTrialJobEvent(<TrialJobStatus>event.event, event.jobId, event.data);
        }
        for (const metrics of metricsData) {
            // Each metric is handed over as a JSON string.
            await ds.storeMetricData(metrics.trial_job_id, JSON.stringify(metrics));
        }
        const jobs: TrialJobInfo[] = await ds.listTrialJobs();
        expect(jobs.length).to.equals(2, 'There should be 2 jobs');

        const statistics: TrialJobStatistics[] = await ds.getTrialJobStatistics();
        expect(statistics.length).to.equals(2, 'There should be 2 statistics');
    });
});
src/nni_manager/core/test/dummy_assessor.py
0 → 100644
View file @
252f36f8
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
from
nni.assessor
import
Assessor
,
AssessResult
class DummyAssessor(Assessor):
    """Assessor used in tests that gives every trial the same verdict."""

    def assess_trial(self, trial_job_id, trial_history):
        # Both arguments are ignored; the result is always AssessResult.Good.
        return AssessResult.Good
# Create the assessor and call run() on it as soon as the script is executed.
# NOTE(review): run() is not defined in this file; presumably inherited from
# nni.assessor.Assessor - confirm its behavior there.
DummyAssessor().run()
src/nni_manager/core/test/hyperopt.py
0 → 100644
View file @
252f36f8
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
from
nni.tuner
import
Tuner
class TestTuner(Tuner):
    """Tuner used in tests that returns fixed values and keeps no state."""

    def generate_parameters(self, trial_id):
        # The same parameter set is returned regardless of trial_id.
        return {'lr': 0.01}

    def receive_trial_result(self, parameter_id, parameters, reward):
        # Results are ignored.
        pass

    def update_search_space(self, search_space):
        # The search space is not inspected; True is always returned.
        return True
# Create the tuner and call run() on it as soon as the script is executed.
# NOTE(review): run() is not defined in this file; presumably inherited from
# nni.tuner.Tuner - confirm its behavior there.
TestTuner().run()
src/nni_manager/core/test/ipcInterface.test.ts
0 → 100644
View file @
252f36f8
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'
use strict
'
;
import
*
as
assert
from
'
assert
'
;
import
{
ChildProcess
,
spawn
}
from
'
child_process
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
cleanupUnitTest
,
prepareUnitTest
}
from
'
../../common/utils
'
;
import
*
as
CommandType
from
'
../commands
'
;
import
{
createAssessorInterface
,
createTunerInterface
,
IpcInterface
}
from
'
../ipcInterface
'
;
// Lines read from the child process's stdout; replaced as a whole when the
// process exits with code 0.
let sentCommands: {[key: string]: string}[] = [];
// Every command delivered to the onCommand() callback, in arrival order.
const receivedCommands: {[key: string]: string}[] = [];

// Errors caught from the two sendCommand() calls in runProcess() that are
// wrapped in try/catch; they stay undefined if nothing was thrown.
let commandTooLong: Error | undefined;
let rejectCommandType: Error | undefined;
/**
 * Spawn the fake assessor script, exchange commands with it through the IPC
 * interface and record the outcome in the module-level variables
 * (sentCommands, receivedCommands, commandTooLong, rejectCommandType).
 *
 * @returns a promise resolved with `null` when the process exits with code 0,
 *          or with an Error when spawning fails or the exit code is not 0.
 *          The promise is never rejected.
 */
function runProcess(): Promise<Error | null> {
    // the process is intended to throw error, do not reject
    const deferred: Deferred<Error | null> = new Deferred<Error | null>();

    // create fake assessor process
    const stdio: {}[] = ['ignore', 'pipe', process.stderr, 'pipe', 'pipe'];
    const proc: ChildProcess = spawn('python3 assessor.py', [], { stdio, cwd: 'core/test', shell: true });

    // record its sent/received commands on exit
    proc.on('error', (error: Error): void => {
        deferred.resolve(error);
    });
    proc.on('exit', (code: number): void => {
        if (code !== 0) {
            deferred.resolve(new Error(`return code: ${code}`));
        } else {
            // read() returns null when nothing is buffered. Treat that as empty
            // output so the promise still resolves instead of the handler
            // throwing and leaving the caller waiting forever.
            // tslint:disable-next-line:no-any
            const output: any = proc.stdout.read();
            sentCommands = (output === null ? '' : output.toString()).split('\n');
            deferred.resolve(null);
        }
    });

    // create IPC interface
    const assessor: IpcInterface = createAssessorInterface(proc);
    assessor.onCommand((commandType: string, content: string): void => {
        receivedCommands.push({ commandType, content });
    });

    // Command #1: ok
    assessor.sendCommand('IN');

    // Command #2: ok
    assessor.sendCommand('ME', '123');

    // Command #3: too long
    try {
        assessor.sendCommand('ME', 'x'.repeat(1_000_000));
    } catch (error) {
        commandTooLong = error;
    }

    // Command #4: not assessor command
    try {
        assessor.sendCommand('GE', '1');
    } catch (error) {
        rejectCommandType = error;
    }

    return deferred.promise;
}
// Assertions over the state recorded by runProcess(), which is awaited once
// in the before() hook.
describe('core/protocol', (): void => {

    before(async () => {
        prepareUnitTest();
        await runProcess();
    });

    after(() => {
        cleanupUnitTest();
    });

    it('should have sent 2 successful commands', (): void => {
        // Two printed lines plus the empty string that split('\n') yields
        // after the final newline.
        assert.equal(sentCommands.length, 3);
        assert.equal(sentCommands[2], '');
    });

    it('sendCommand() should work without content', (): void => {
        // The expected text is the Python tuple as printed by the child script.
        assert.equal(sentCommands[0], '(\'IN\', \'\')');
    });

    it('sendCommand() should work with content', (): void => {
        assert.equal(sentCommands[1], '(\'ME\', \'123\')');
    });

    it('sendCommand() should throw on too long command', (): void => {
        assert.equal((<Error>commandTooLong).name, 'RangeError');
        assert.equal((<Error>commandTooLong).message, 'Command too long');
    });

    it('sendCommand() should throw on wrong command type', (): void => {
        assert.equal((<Error>rejectCommandType).name, 'AssertionError [ERR_ASSERTION]');
    });

    it('should have received 3 commands', (): void => {
        assert.equal(receivedCommands.length, 3);
    });

    it('onCommand() should work without content', (): void => {
        assert.deepStrictEqual(receivedCommands[0], { commandType: 'KI', content: '' });
    });

    it('onCommand() should work with content', (): void => {
        assert.deepStrictEqual(receivedCommands[1], { commandType: 'KI', content: 'hello' });
    });

    it('onCommand() should work with Unicode content', (): void => {
        assert.deepStrictEqual(receivedCommands[2], { commandType: 'KI', content: '世界' });
    });

});
src/nni_manager/core/test/ipcInterfaceTerminate.test.ts
0 → 100644
View file @
252f36f8
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'
use strict
'
;
import
*
as
assert
from
'
assert
'
;
import
{
ChildProcess
,
spawn
}
from
'
child_process
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
cleanupUnitTest
,
prepareUnitTest
}
from
'
../../common/utils
'
;
import
*
as
CommandType
from
'
../commands
'
;
import
{
createAssessorInterface
,
IpcInterface
}
from
'
../ipcInterface
'
;
// IPC interface to the spawned process; assigned by startProcess().
let assessor: IpcInterface | undefined;
// Set to true once the child process emits 'exit' or 'error'.
let procExit: boolean = false;
// Set to true on an 'error' event, or on 'exit' with a code other than 0.
let procError: boolean = false;
/**
 * Launch `python3 dummy_assessor.py` in `core/test`, mirror its exit/error
 * state into the module-level flags and attach an IPC interface that logs
 * every command it receives.
 */
function startProcess(): void {
    // Child process setup: stdin ignored, stdout piped, stderr shared with
    // this process, plus two extra piped descriptors.
    const childStdio: {}[] = ['ignore', 'pipe', process.stderr, 'pipe', 'pipe'];
    const child: ChildProcess = spawn(
        'python3 dummy_assessor.py',
        [],
        { stdio: childStdio, cwd: 'core/test', shell: true }
    );

    // A spawn failure counts as both "exited" and "failed".
    child.on('error', (_err: Error): void => {
        procExit = true;
        procError = true;
    });

    // A regular exit is a failure only when the exit code is not 0.
    child.on('exit', (exitCode: number): void => {
        procExit = true;
        procError = exitCode !== 0;
    });

    // Wire up the IPC interface and echo whatever the child sends.
    const ipc: IpcInterface = createAssessorInterface(child);
    assessor = ipc;
    ipc.onCommand((commandType: string, content: string): void => {
        console.log(commandType, content);  // tslint:disable-line:no-console
    });
}
// Checks the child process state one second after a regular command and one
// second after a TERMINATE command.
describe('core/ipcInterface.terminate', (): void => {

    before(() => {
        prepareUnitTest();
        startProcess();
    });

    after(() => {
        cleanupUnitTest();
    });

    it('normal', () => {
        (<IpcInterface>assessor).sendCommand(
            CommandType.REPORT_METRIC_DATA,
            '{"trial_job_id":"A","type":"periodical","value":1}');

        const deferred: Deferred<void> = new Deferred<void>();

        // After 1000 ms the process must neither have exited nor failed.
        setTimeout(
            () => {
                assert.ok(!procExit);
                assert.ok(!procError);
                deferred.resolve();
            },
            1000);

        return deferred.promise;
    });

    it('terminate', () => {
        (<IpcInterface>assessor).sendCommand(CommandType.TERMINATE);

        const deferred: Deferred<void> = new Deferred<void>();

        // After 1000 ms the process must have exited, and without an error.
        setTimeout(
            () => {
                assert.ok(procExit);
                assert.ok(!procError);
                deferred.resolve();
            },
            1000);

        return deferred.promise;
    });

});
src/nni_manager/core/test/mockedDatastore.ts
0 → 100644
View file @
252f36f8
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'
use strict
'
;
import
{
assert
}
from
'
console
'
;
import
*
as
fs
from
'
fs
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
DataStore
,
MetricData
,
MetricDataRecord
,
MetricType
,
TrialJobEvent
,
TrialJobEventRecord
,
TrialJobInfo
}
from
'
../../common/datastore
'
;
import
{
ExperimentProfile
,
TrialJobStatistics
}
from
'
../../common/manager
'
;
import
{
TrialJobStatus
}
from
'
../../common/trainingService
'
;
/**
 * Minimal record store for tests: records live in an array, an optional key
 * maps to a record's array position, and the whole state is written to a
 * JSON file after every save.
 */
class SimpleDb {
    private name: string = '';
    private fileName: string = '';

    private db: Array<any> = new Array();
    private map: Map<string, number> = new Map<string, number>();  // map key to data index

    constructor (name: string, filename: string) {
        this.name = name;
        this.fileName = filename;
    }

    /**
     * Store a record. With a key that is already known the existing record is
     * overwritten; otherwise the record is appended. The state is persisted
     * afterwards.
     */
    async saveData(data: any, key?: string): Promise<void> {
        const known: number | undefined = key ? this.map.get(key) : undefined;

        let slot: number;
        if (known === undefined) {
            // push() returns the new length, so the record sits at length - 1
            slot = this.db.push(data) - 1;
        } else {
            slot = known;
            this.db[slot] = data;
        }

        if (key) {
            this.map.set(key, slot);
        }
        await this.persist();
    }

    /** Resolve with the backing array itself (not a copy). */
    listAllData(): Promise<Array<any>> {
        return Promise.resolve(this.db);
    }

    /**
     * Resolve with the record stored under `key`, or with `undefined` (after
     * logging) when the key is unknown. Rejects when the key maps to an
     * unusable position.
     */
    getData(key: string): Promise<any> {
        if (!this.map.has(key)) {
            console.log(`Key not found: ${this.name}, ${key}`);

            return Promise.resolve(undefined);
        }

        const position: number | undefined = this.map.get(key);
        if (position === undefined || !(position >= 0)) {
            return Promise.reject(new Error(`Key or index not found: ${this.name}, ${key}`));
        }

        return Promise.resolve(this.db[position]);
    }

    /**
     * Write name, records and key index to the backing file synchronously,
     * then return an already-resolved promise.
     */
    persist(): Promise<void> {
        const snapshot: string = JSON.stringify(
            {
                name: this.name,
                data: this.db,
                index: JSON.stringify([...this.map])
            },
            null,
            4
        );
        fs.writeFileSync(this.fileName, snapshot);

        return Promise.resolve();
    }
}
/**
 * DataStore implementation for tests. Experiment profiles, trial job events
 * and metrics are kept in three SimpleDb instances; trial job information is
 * derived by replaying the stored events.
 */
class MockedDataStore implements DataStore {

    private dbExpProfile: SimpleDb = new SimpleDb('exp_profile', './exp_profile.json');
    private dbTrialJobs: SimpleDb = new SimpleDb('trial_jobs', './trial_jobs.json');
    private dbMetrics: SimpleDb = new SimpleDb('metrics', './metrics.json');

    /** Nothing to set up; resolves immediately. */
    init(): Promise<void> {
        return Promise.resolve();
    }

    /** Nothing to tear down; resolves immediately. */
    close(): Promise<void> {
        return Promise.resolve();
    }

    /** Store the profile under its id, replacing any profile stored under the same id. */
    async storeExperimentProfile(experimentProfile: ExperimentProfile): Promise<void> {
        await this.dbExpProfile.saveData(experimentProfile, experimentProfile.id);
    }

    /** Return the profile stored under `experimentId`, or undefined if there is none. */
    async getExperimentProfile(experimentId: string): Promise<ExperimentProfile> {
        return await this.dbExpProfile.getData(experimentId);
    }

    /** Append an event record, stamped with the current time, to the event log. */
    async storeTrialJobEvent(event: TrialJobEvent, trialJobId: string, data?: string | undefined): Promise<void> {
        const dataRecord: TrialJobEventRecord = {
            event: event,
            timestamp: new Date(),
            trialJobId: trialJobId,
            data: data
        };
        await this.dbTrialJobs.saveData(dataRecord);
    }

    /** Count the known trial jobs per status; one entry per status that occurs. */
    async getTrialJobStatistics(): Promise<any[]> {
        const jobs: TrialJobInfo[] = await this.listTrialJobs();
        const counts: Map<TrialJobStatus, number> = new Map();
        for (const job of jobs) {
            counts.set(job.status, (counts.get(job.status) || 0) + 1);
        }

        const result: TrialJobStatistics[] = [];
        counts.forEach((value: number, key: TrialJobStatus) => {
            result.push({
                trialJobStatus: key,
                trialJobNumber: value
            });
        });

        return result;
    }

    /**
     * List trial jobs reconstructed from the event log, optionally restricted
     * to one status. Jobs in status 'SUCCEEDED' also get their final metric
     * attached.
     */
    async listTrialJobs(status?: TrialJobStatus): Promise<TrialJobInfo[]> {
        const trialJobEvents: TrialJobEventRecord[] = await this.dbTrialJobs.listAllData();
        const jobsById: Map<string, TrialJobInfo> = this.getTrialJobsByReplayEvents(trialJobEvents);

        const result: TrialJobInfo[] = [];
        for (const jobInfo of jobsById.values()) {
            if (status && jobInfo.status !== status) {
                continue;
            }
            if (jobInfo.status === 'SUCCEEDED') {
                jobInfo.finalMetricData = await this.getFinalMetricData(jobInfo.id);
            }
            result.push(jobInfo);
        }

        return result;
    }

    /**
     * Parse `data` as JSON metric data and store it as a metric record,
     * stamped with the current time.
     */
    async storeMetricData(trialJobId: string, data: string): Promise<void> {
        const metrics = JSON.parse(data) as MetricData;
        // `assert` here is the one imported from 'console' at the top of the file.
        assert(trialJobId === metrics.trial_job_id);
        await this.dbMetrics.saveData({
            trialJobId: metrics.trial_job_id,
            parameterId: metrics.parameter_id,
            type: metrics.type,
            data: metrics.value,
            timestamp: new Date()
        });
    }

    /** Return all stored metric records of the given type for the given trial job. */
    async getMetricData(trialJobId: string, metricType: MetricType): Promise<MetricDataRecord[]> {
        const result: MetricDataRecord[] = [];
        const allMetrics = await this.dbMetrics.listAllData();
        allMetrics.forEach((value) => {
            const metrics = <MetricDataRecord>value;
            if (metrics.type === metricType && metrics.trialJobId === trialJobId) {
                result.push(metrics);
            }
        });

        return result;
    }

    /** Not supported by this mock; always throws. */
    public getTrialJob(trialJobId: string): Promise<TrialJobInfo> {
        throw new Error("Method not implemented.");
    }

    /** Return the single 'FINAL' metric record of a trial job, or undefined if there is not exactly one. */
    private async getFinalMetricData(trialJobId: string): Promise<any> {
        const metrics: MetricDataRecord[] = await this.getMetricData(trialJobId, "FINAL");
        assert(metrics.length <= 1);

        return metrics.length === 1 ? metrics[0] : undefined;
    }

    /** Map an event to the job status it leads to; most events are status names already. */
    private getJobStatusByLatestEvent(event: TrialJobEvent): TrialJobStatus {
        switch (event) {
            case 'USER_TO_CANCEL':
                return 'USER_CANCELED';
            case 'ADD_CUSTOMIZED':
                return 'WAITING';
            default:
                return <TrialJobStatus>event;
        }
    }

    /**
     * Rebuild one TrialJobInfo per job id by applying the events in order.
     * Start and end times are taken from the recorded event timestamps, so
     * repeated listings report the same times for the same job.
     */
    private getTrialJobsByReplayEvents(trialJobEvents: TrialJobEventRecord[]): Map<string, TrialJobInfo> {
        const map: Map<string, TrialJobInfo> = new Map();
        // assume data is stored by time ASC order
        for (const record of trialJobEvents) {
            let jobInfo: TrialJobInfo | undefined = map.get(record.trialJobId);
            if (jobInfo === undefined) {
                jobInfo = {
                    id: record.trialJobId,
                    status: this.getJobStatusByLatestEvent(record.event)
                };
            }

            switch (record.event) {
                case 'RUNNING':
                    jobInfo.startTime = record.timestamp;
                    break;
                case 'SUCCEEDED':
                case 'FAILED':
                case 'USER_CANCELED':
                case 'SYS_CANCELED':
                    jobInfo.endTime = record.timestamp;
                    break;
                default:
            }
            jobInfo.status = this.getJobStatusByLatestEvent(record.event);
            map.set(record.trialJobId, jobInfo);
        }

        return map;
    }
}
export
{
MockedDataStore
};
\ No newline at end of file
src/nni_manager/core/test/mockedTrainingService.ts
0 → 100644
View file @
252f36f8
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
* MIT License
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'
use strict
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
Provider
}
from
'
typescript-ioc
'
;
import
{
MethodNotImplementedError
}
from
'
../../common/errors
'
;
import
{
TrainingService
,
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobMetric
}
from
'
../../common/trainingService
'
;
// IoC provider whose get() returns a new MockedTrainingService on every call.
const testTrainingServiceProvider: Provider = {
    get: () => { return new MockedTrainingService(); }
};
/**
 * TrainingService implementation for tests. It knows two fixed trial jobs
 * (ids '1234' and '3456') and one cluster metadata key ('mockedMetadataKey').
 */
class MockedTrainingService extends TrainingService {
    // Value returned for 'mockedMetadataKey' until setClusterMetadata() replaces it.
    public mockedMetaDataValue: string = "default";
    public jobDetail1: TrialJobDetail = {
        id: '1234',
        status: 'SUCCEEDED',
        submitTime: new Date(),
        startTime: new Date(),
        endTime: new Date(),
        tags: ['test'],
        url: 'http://test',
        workingDirectory: '/tmp/mocked',
        form: {
            jobType: 'TRIAL'
        }
    };
    public jobDetail2: TrialJobDetail = {
        id: '3456',
        status: 'SUCCEEDED',
        submitTime: new Date(),
        startTime: new Date(),
        endTime: new Date(),
        tags: ['test'],
        url: 'http://test',
        workingDirectory: '/tmp/mocked',
        form: {
            jobType: 'TRIAL'
        }
    };

    /** Resolve with the two fixed job details. */
    public listTrialJobs(): Promise<TrialJobDetail[]> {
        const deferred = new Deferred<TrialJobDetail[]>();
        deferred.resolve([this.jobDetail1, this.jobDetail2]);

        return deferred.promise;
    }

    /**
     * Resolve with the job detail for id '1234' or '3456'; reject with an
     * Error naming the id for anything else.
     */
    public getTrialJob(trialJobId: string): Promise<TrialJobDetail> {
        const deferred = new Deferred<TrialJobDetail>();
        if (trialJobId === '1234') {
            deferred.resolve(this.jobDetail1);
        } else if (trialJobId === '3456') {
            deferred.resolve(this.jobDetail2);
        } else {
            // Always reject with a reason so that failures are diagnosable.
            deferred.reject(new Error(`Unknown trial job id: ${trialJobId}`));
        }

        return deferred.promise;
    }

    /** No-op. */
    async run(): Promise<void> {

    }

    /** No-op; the listener is not stored. */
    public addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void {
    }

    /** No-op. */
    public removeTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void {
    }

    /** Return a promise that this mock never resolves or rejects. */
    public submitTrialJob(form: TrialJobApplicationForm): Promise<TrialJobDetail> {
        const deferred = new Deferred<TrialJobDetail>();

        return deferred.promise;
    }

    /** Resolve for the two known job ids; reject with 'job id error' otherwise. */
    public cancelTrialJob(trialJobId: string): Promise<void> {
        const deferred = new Deferred<void>();
        if (trialJobId === '1234' || trialJobId === '3456') {
            deferred.resolve();
        } else {
            deferred.reject('job id error');
        }

        return deferred.promise;
    }

    /** Store `value` for 'mockedMetadataKey'; reject with 'key error' for any other key. */
    public setClusterMetadata(key: string, value: string): Promise<void> {
        const deferred = new Deferred<void>();
        if (key === 'mockedMetadataKey') {
            this.mockedMetaDataValue = value;
            deferred.resolve();
        } else {
            deferred.reject('key error');
        }

        return deferred.promise;
    }

    /** Resolve with the stored value for 'mockedMetadataKey'; reject with 'key error' for any other key. */
    public getClusterMetadata(key: string): Promise<string> {
        const deferred = new Deferred<string>();
        if (key === 'mockedMetadataKey') {
            deferred.resolve(this.mockedMetaDataValue);
        } else {
            deferred.reject('key error');
        }

        return deferred.promise;
    }

    /** Not supported by this mock; always throws synchronously. */
    public cleanUp(): Promise<void> {
        throw new MethodNotImplementedError();
    }
}
export
{
MockedTrainingService
,
testTrainingServiceProvider
}
\ No newline at end of file
src/nni_manager/core/test/nnimanager.test.ts
0 → 100644
View file @
252f36f8
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'
use strict
'
;
import
{
assert
,
expect
}
from
'
chai
'
;
import
{
Container
,
Scope
}
from
'
typescript-ioc
'
;
import
*
as
component
from
'
../../common/component
'
;
import
{
Database
,
DataStore
}
from
'
../../common/datastore
'
;
import
{
Manager
}
from
'
../../common/manager
'
;
import
{
TrainingService
}
from
'
../../common/trainingService
'
;
import
{
cleanupUnitTest
,
prepareUnitTest
}
from
'
../../common/utils
'
;
import
{
NNIDataStore
}
from
'
../nniDataStore
'
;
import
{
NNIManager
}
from
'
../nnimanager
'
;
import
{
SqlDB
}
from
'
../sqlDatabase
'
;
import
{
MockedTrainingService
}
from
'
./mockedTrainingService
'
;
/**
 * Prepare the IoC container for the NNIManager unit tests.
 *
 * Calls prepareUnitTest(), registers the training service (mocked),
 * manager, database and data store implementations as singletons, and
 * then initializes the data store.
 */
async function initContainer(): Promise<void> {
    prepareUnitTest();

    Container.bind(TrainingService)
        .to(MockedTrainingService)
        .scope(Scope.Singleton);
    Container.bind(Manager)
        .to(NNIManager)
        .scope(Scope.Singleton);
    Container.bind(Database)
        .to(SqlDB)
        .scope(Scope.Singleton);
    Container.bind(DataStore)
        .to(NNIDataStore)
        .scope(Scope.Singleton);

    const dataStore: DataStore = component.get<DataStore>(DataStore);
    await dataStore.init();
}
// Unit tests for NNIManager running against MockedTrainingService.
// Every asynchronous test is an async function, so a rejected promise or a
// failed assertion reaches mocha directly instead of being swallowed by a
// trailing `.catch`.
describe('Unit test for nnimanager', function () {
    // A regular function (not an arrow) is required so that `this` is the mocha suite context.
    this.timeout(10000);

    let nniManager: Manager;

    // The only metadata key MockedTrainingService accepts.
    const ClusterMetadataKey = 'mockedMetadataKey';

    const experimentParams = {
        authorName: 'zql',
        experimentName: 'naive_experiment',
        trialConcurrency: 2,
        maxExecDuration: 5,
        maxTrialNum: 2,
        searchSpace: '{"x":1}',
        tuner: {
            tunerCommand: 'python3 hyperopt.py',
            tunerCwd: 'core/test',
            tunerCheckpointDirectory: '',
            tunerGpuNum: 1
        },
        assessor: {
            assessorCommand: 'python3 dummy_assessor.py',
            assessorCwd: 'core/test',
            assessorCheckpointDirectory: '',
            assessorGpuNum: 1
        }
    };

    before(async () => {
        await initContainer();
        nniManager = component.get(Manager);
    });

    after(() => {
        cleanupUnitTest();
    });

    it('test resumeExperiment', () => {
        // TODO: add resume experiment unit test
    });

    it('test startExperiment', async () => {
        const experimentId = await nniManager.startExperiment(experimentParams);
        expect(experimentId.length).to.be.equal(8);
    });

    it('test listTrialJobs', () => {
        // FIXME: not implemented. Intended check:
        //   const trialJobDetails = await nniManager.listTrialJobs();
        //   expect(trialJobDetails.length).to.be.equal(2);
    });

    it('test getTrialJob valid', async () => {
        // Query an existing id.
        const trialJobDetail = await nniManager.getTrialJob('1234');
        expect(trialJobDetail.id).to.be.equal('1234');
    });

    it('test getTrialJob with invalid id', async () => {
        // Query a non-existent id: the call must reject.
        // The outcome is recorded in a flag and asserted outside the try/catch,
        // so the assertion itself cannot be caught and silently discarded.
        let rejected = false;
        try {
            await nniManager.getTrialJob('4567');
        } catch (error) {
            rejected = true;
        }
        assert.isTrue(rejected, 'getTrialJob should reject for an unknown trial job id');
    });

    it('test getClusterMetadata', async () => {
        // The default value is "default".
        const value = await nniManager.getClusterMetadata(ClusterMetadataKey);
        expect(value).to.equal('default');
    });

    it('test setClusterMetadata and getClusterMetadata', async () => {
        // Set a valid key, then read it back.
        await nniManager.setClusterMetadata(ClusterMetadataKey, 'newdata');
        const value = await nniManager.getClusterMetadata(ClusterMetadataKey);
        expect(value).to.equal('newdata');
    });

    // TODO: complete the unit tests below (they only check that the call resolves).
    it('test cancelTrialJobByUser', async () => {
        await nniManager.cancelTrialJobByUser('1234');
    });

    it('test addCustomizedTrialJob', async () => {
        await nniManager.addCustomizedTrialJob('hyperParams');
    });
});
src/nni_manager/core/test/sqlDatabase.test.ts
0 → 100644
View file @
252f36f8
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'
use strict
'
;
import
*
as
assert
from
'
assert
'
;
import
*
as
os
from
'
os
'
;
import
*
as
path
from
'
path
'
;
import
{
Container
}
from
'
typescript-ioc
'
;
import
*
as
component
from
'
../../common/component
'
;
import
{
Database
,
MetricDataRecord
,
TrialJobEvent
,
TrialJobEventRecord
}
from
'
../../common/datastore
'
;
import
{
setExperimentStartupInfo
}
from
'
../../common/experimentStartupInfo
'
;
import
{
ExperimentParams
,
ExperimentProfile
}
from
'
../../common/manager
'
;
import
{
cleanupUnitTest
,
getDefaultDatabaseDir
,
mkDirP
,
prepareUnitTest
}
from
'
../../common/utils
'
;
import
{
SqlDB
}
from
'
../sqlDatabase
'
;
// ---------------------------------------------------------------------------
// Test fixtures. The numeric comments give each record's index, which the
// tests below use to state the expected query results.
// ---------------------------------------------------------------------------

/** Build an experiment profile; `endTime` is set (after `startTime`) only when `finished` is true. */
function makeProfile(params: ExperimentParams, id: string, revision: number, finished: boolean): ExperimentProfile {
    const startTime: Date = new Date();
    const endTime: Date | undefined = finished ? new Date() : undefined;

    return { params, id, execDuration: 0, startTime, endTime, revision };
}

/** Build a trial job event record stamped with the current time. */
function makeEvent(
    event: TrialJobEventRecord['event'],
    trialJobId: TrialJobEventRecord['trialJobId'],
    data?: TrialJobEventRecord['data']
): TrialJobEventRecord {
    return { timestamp: new Date(), event, trialJobId, data };
}

/** Build a metric data record stamped with the current time. */
function makeMetric(
    trialJobId: MetricDataRecord['trialJobId'],
    parameterId: MetricDataRecord['parameterId'],
    type: MetricDataRecord['type'],
    sequence: MetricDataRecord['sequence'],
    data: MetricDataRecord['data']
): MetricDataRecord {
    return { timestamp: new Date(), trialJobId, parameterId, type, sequence, data };
}

const expParams1: ExperimentParams = {
    authorName: 'ZhangSan',
    experimentName: 'Exp1',
    trialConcurrency: 3,
    maxExecDuration: 100,
    maxTrialNum: 5,
    searchSpace: 'SS',
    tuner: {
        tunerCommand: './tuner.sh',
        tunerCwd: '.',
        tunerCheckpointDirectory: '/tmp',
        tunerGpuNum: 0
    }
};

const expParams2: ExperimentParams = {
    authorName: 'LiSi',
    experimentName: 'Exp2',
    trialConcurrency: 5,
    maxExecDuration: 1000,
    maxTrialNum: 5,
    searchSpace: '',
    tuner: {
        tunerCommand: 'python tuner.py',
        tunerCwd: '/tmp',
        tunerCheckpointDirectory: '/tmp'
    },
    assessor: {
        assessorCommand: 'python assessor.py',
        assessorCwd: '/tmp',
        assessorCheckpointDirectory: '/tmp'
    }
};

const profiles: ExperimentProfile[] = [
    makeProfile(expParams1, '#1', 1, false),  // 0
    makeProfile(expParams1, '#1', 2, true),   // 1
    makeProfile(expParams2, '#2', 2, true),   // 2
    makeProfile(expParams2, '#2', 3, true)    // 3
];

const events: TrialJobEventRecord[] = [
    makeEvent('WAITING', 'A', 'hello'),  // 0
    makeEvent('UNKNOWN', 'B', 'world'),  // 1
    makeEvent('RUNNING', 'B'),           // 2
    makeEvent('RUNNING', 'A', '123'),    // 3
    makeEvent('FAILED', 'A')             // 4
];

const metrics: MetricDataRecord[] = [
    makeMetric('A', '1', 'PERIODICAL', 0, 1.1),  // 0
    makeMetric('B', '2', 'PERIODICAL', 0, 2.1),  // 1
    makeMetric('A', '1', 'PERIODICAL', 1, 1.2),  // 2
    makeMetric('A', '1', 'FINAL', 0, 1.3),       // 3
    makeMetric('C', '2', 'PERIODICAL', 1, 2.1),  // 4
    makeMetric('C', '2', 'FINAL', 0, 2.2)        // 5
];
// tslint:disable-next-line:no-any
/**
 * Assert that a record read back from the database matches the value that was stored.
 *
 * The record's timestamp is only sanity-checked: it must be later than
 * 2018-07-01 (local time) and not later than "now". Every other enumerable
 * key of `value` must be loosely equal (`==`) in `record`.
 *
 * @param record record returned by a query
 * @param value the originally stored value
 */
function assertRecordEqual(record: any, value: any): void {
    assert.ok(record.timestamp > new Date(2018, 6, 1));
    // Inclusive upper bound: a record created in the current millisecond is
    // still valid, and a strict comparison would fail spuriously for it.
    assert.ok(record.timestamp <= new Date());

    for (const key in value) { // tslint:disable-line:no-for-in
        if (key !== 'timestamp') {
            assert.equal(record[key], value[key]);
        }
    }
}
// tslint:disable-next-line:no-any
/**
 * Assert that a query result matches a selection of the stored inputs.
 *
 * `records` must have as many entries as `indices`, and the record at
 * position i must match `inputs[indices[i]]` according to assertRecordEqual.
 *
 * @param records records returned by a query
 * @param inputs all values that were stored
 * @param indices indices into `inputs` of the expected records, in order
 */
function assertRecordsEqual(records: any[], inputs: any[], indices: number[]): void {
    assert.equal(records.length, indices.length);

    indices.forEach((inputIndex: number, position: number): void => {
        assertRecordEqual(records[position], inputs[inputIndex]);
    });
}
// Tests for SqlDB: the fixtures are stored once in `before`, and each test
// runs one query and compares the result with the expected fixture entries.
describe('core/sqlDatabase', () => {
    // Assigned in `before`; every test runs after that hook.
    let db: SqlDB;

    before(async () => {
        prepareUnitTest();

        const dbDir: string = getDefaultDatabaseDir();
        await mkDirP(dbDir);

        db = new SqlDB();
        await db.init(true, dbDir);

        for (const profile of profiles) {
            await db.storeExperimentProfile(profile);
        }
        for (const { event, trialJobId, data } of events) {
            await db.storeTrialJobEvent(<TrialJobEvent>event, trialJobId, data);
        }
        for (const metric of metrics) {
            await db.storeMetricData(metric.trialJobId, JSON.stringify(metric));
        }
    });

    after(() => {
        cleanupUnitTest();
    });

    it('queryExperimentProfile without revision', async () => {
        const found: ExperimentProfile[] = await db.queryExperimentProfile('#1');
        // Expected order: revision 2 (profiles[1]) first, then revision 1 (profiles[0]).
        assert.equal(found.length, 2);
        assert.deepEqual(found[0], profiles[1]);
        assert.deepEqual(found[1], profiles[0]);
    });

    it('queryExperimentProfile with revision', async () => {
        const found: ExperimentProfile[] = await db.queryExperimentProfile('#1', 2);
        assert.equal(found.length, 1);
        assert.deepEqual(found[0], profiles[1]);
    });

    it('queryLatestExperimentProfile', async () => {
        const latest: ExperimentProfile = await db.queryLatestExperimentProfile('#2');
        assert.deepEqual(latest, profiles[3]);
    });

    it('queryTrialJobEventByEvent without trialJobId', async () => {
        const found: TrialJobEventRecord[] = await db.queryTrialJobEvent(undefined, 'RUNNING');
        assertRecordsEqual(found, events, [2, 3]);
    });

    it('queryTrialJobEventByEvent with trialJobId', async () => {
        const found: TrialJobEventRecord[] = await db.queryTrialJobEvent('A', 'RUNNING');
        assertRecordsEqual(found, events, [3]);
    });

    it('queryTrialJobEventById', async () => {
        const found: TrialJobEventRecord[] = await db.queryTrialJobEvent('B');
        assertRecordsEqual(found, events, [1, 2]);
    });

    it('queryMetricDataByType without trialJobId', async () => {
        const found: MetricDataRecord[] = await db.queryMetricData(undefined, 'FINAL');
        assertRecordsEqual(found, metrics, [3, 5]);
    });

    it('queryMetricDataByType with trialJobId', async () => {
        const found: MetricDataRecord[] = await db.queryMetricData('A', 'PERIODICAL');
        assertRecordsEqual(found, metrics, [0, 2]);
    });

    it('queryMetricDataById', async () => {
        const found: MetricDataRecord[] = await db.queryMetricData('B');
        assertRecordsEqual(found, metrics, [1]);
    });

    it('empty result', async () => {
        const found: MetricDataRecord[] = await db.queryMetricData('X');
        assert.equal(found.length, 0);
    });
});
src/nni_manager/core/trialJobs.ts
0 → 100644
View file @
252f36f8
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'
use strict
'
;
import
*
as
assert
from
'
assert
'
;
import
{
EventEmitter
}
from
'
events
'
;
import
{
TrainingService
,
TrialJobDetail
,
TrialJobStatus
}
from
'
../common/trainingService
'
;
import
{
delay
}
from
'
../common/utils
'
;
/** Events delivered to listeners: a trial job status, or 'EXPERIMENT_DONE' when the polling loop ends. */
type TrialJobMaintainerEvent = TrialJobStatus | 'EXPERIMENT_DONE';

/**
 * Tracks a set of trial jobs and polls the training service for their status.
 *
 * Status changes are published on an internal EventEmitter under the single
 * event name 'all'; listeners are registered through `on()`.
 */
class TrialJobs {
    private eventEmitter: EventEmitter = new EventEmitter();
    private trialJobs: Map<string, TrialJobDetail> = new Map<string, TrialJobDetail>();
    private noMoreTrials: boolean = false;
    private stopLoop: boolean = false;
    private trainingService: TrainingService;
    private pastExecDuration: number; // second
    private maxExecDuration: number; // second

    /**
     * @param trainingService service queried for trial job details
     * @param pastExecDuration execution time already consumed, in seconds
     * @param maxExecDuration maximum total execution time, in seconds
     */
    constructor(
        trainingService: TrainingService,
        pastExecDuration: number, // second
        maxExecDuration: number // second
    ) {
        this.trainingService = trainingService;
        this.pastExecDuration = pastExecDuration;
        this.maxExecDuration = maxExecDuration;
    }

    /** Start (or keep) tracking a trial job under the given id. */
    public setTrialJob(key: string, value: TrialJobDetail): void {
        this.trialJobs.set(key, value);
    }

    /** Return the tracked detail for the given id, or undefined when it is not tracked. */
    public getTrialJob(key: string): TrialJobDetail | undefined {
        return this.trialJobs.get(key);
    }

    /** Signal that no further trials will be added; `run()` ends once the tracked set is empty. */
    public setNoMoreTrials(): void {
        this.noMoreTrials = true;
    }

    /** Ask `run()` to leave its loop at the next iteration. */
    public setStopLoop(): void {
        this.stopLoop = true;
    }

    /** Replace the maximum execution duration (seconds). */
    public updateMaxExecDuration(duration: number): void {
        this.maxExecDuration = duration;
    }

    /** Register a listener for all trial job events. */
    public on(listener: (event: TrialJobMaintainerEvent, trialJobDetail: TrialJobDetail) => void): void {
        this.eventEmitter.addListener('all', listener);
    }

    /**
     * Query the training service once for every tracked trial job and emit
     * events for the statuses that matter:
     *  - SUCCEEDED / USER_CANCELED / FAILED / SYS_CANCELED: emit, then stop tracking the job;
     *  - RUNNING: emit and store the new detail only on the WAITING -> RUNNING transition;
     *  - WAITING / UNKNOWN / anything else: nothing.
     */
    public async requestTrialJobsStatus(): Promise<void> {
        // Iterate over a snapshot of the ids because entries are deleted inside the loop.
        const trackedIds: string[] = Array.from(this.trialJobs.keys());

        for (const trialJobId of trackedIds) {
            const latest: TrialJobDetail = await this.trainingService.getTrialJob(trialJobId);

            switch (latest.status) {
                case 'SUCCEEDED':
                case 'USER_CANCELED':
                case 'FAILED':
                case 'SYS_CANCELED':
                    // In the current version, failed / system-canceled jobs are not retried.
                    // TO DO: push this job to queue for retry
                    this.eventEmitter.emit('all', latest.status, latest);
                    this.trialJobs.delete(trialJobId);
                    break;
                case 'RUNNING': {
                    const previous = this.trialJobs.get(trialJobId);
                    assert(previous);
                    if (previous !== undefined && previous.status === 'WAITING') {
                        this.trialJobs.set(trialJobId, latest);
                        this.eventEmitter.emit('all', latest.status, latest);
                    }
                    break;
                }
                case 'WAITING':
                case 'UNKNOWN':
                    // Do nothing
                    break;
                default:
                    // TO DO: add warning in log
            }
        }
    }

    /**
     * Poll trial job statuses every 5 seconds until the execution time
     * (past + elapsed) reaches `maxExecDuration`, the loop is stopped, or no
     * more trials are expected and none is tracked. Emits 'EXPERIMENT_DONE'
     * (without a trial job detail) when the loop ends.
     */
    public async run(): Promise<void> {
        const startedAt: number = Date.now();
        const consumedSeconds = (): number => (Date.now() - startedAt) / 1000 + this.pastExecDuration;

        while (consumedSeconds() < this.maxExecDuration) {
            const allTrialsDone: boolean = this.noMoreTrials && this.trialJobs.size === 0;
            if (this.stopLoop || allTrialsDone) {
                break;
            }
            await this.requestTrialJobsStatus();
            await delay(5000);
        }

        this.eventEmitter.emit('all', 'EXPERIMENT_DONE');
    }
}
export
{
TrialJobs
,
TrialJobMaintainerEvent
};
src/nni_manager/main.ts
0 → 100644
View file @
252f36f8
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'
use strict
'
;
import
{
Container
,
Scope
}
from
'
typescript-ioc
'
;
import
*
as
component
from
'
./common/component
'
;
import
{
Database
,
DataStore
}
from
'
./common/datastore
'
;
import
{
setExperimentStartupInfo
}
from
'
./common/experimentStartupInfo
'
;
import
{
getLogger
,
Logger
}
from
'
./common/log
'
;
import
{
Manager
}
from
'
./common/manager
'
;
import
{
TrainingService
}
from
'
./common/trainingService
'
;
import
{
parseArg
,
uniqueString
,
mkDirP
,
getLogDir
}
from
'
./common/utils
'
;
import
{
NNIDataStore
}
from
'
./core/nniDataStore
'
;
import
{
NNIManager
}
from
'
./core/nnimanager
'
;
import
{
SqlDB
}
from
'
./core/sqlDatabase
'
;
import
{
RestServer
}
from
'
./rest_server/server
'
;
import
{
LocalTrainingService
}
from
'
./training_service/local/localTrainingService
'
;
import
{
RemoteMachineTrainingService
}
from
'
./training_service/remote_machine/remoteMachineTrainingService
'
;
/**
 * Record how this process was started.
 *
 * In 'new' mode a fresh 8-character experiment id is generated; in any other
 * mode the supplied id of the experiment to resume is used.
 *
 * @param startExpMode start mode given on the command line
 * @param resumeExperimentId id of the experiment to resume (ignored in 'new' mode)
 */
function initStartupInfo(startExpMode: string, resumeExperimentId: string): void {
    const isNewExperiment: boolean = startExpMode === 'new';

    setExperimentStartupInfo(
        isNewExperiment,
        isNewExperiment ? uniqueString(8) : resumeExperimentId
    );
}
/**
 * Register the singleton implementations in the IoC container and
 * initialize the data store.
 *
 * @param platformMode 'local' selects LocalTrainingService, 'remote' selects
 *                     RemoteMachineTrainingService
 * @throws Error when `platformMode` is neither 'local' nor 'remote'
 */
async function initContainer(platformMode: string): Promise<void> {
    if (platformMode === 'local') {
        Container.bind(TrainingService).to(LocalTrainingService).scope(Scope.Singleton);
    } else if (platformMode === 'remote') {
        Container.bind(TrainingService).to(RemoteMachineTrainingService).scope(Scope.Singleton);
    } else {
        // Report the value that was actually passed in, not a module-level variable.
        throw new Error(`Error: unsupported mode: ${platformMode}`);
    }
    Container.bind(Manager).to(NNIManager).scope(Scope.Singleton);
    Container.bind(Database).to(SqlDB).scope(Scope.Singleton);
    Container.bind(DataStore).to(NNIDataStore).scope(Scope.Singleton);

    const ds: DataStore = component.get(DataStore);
    await ds.init();
}
/** Print the command-line synopsis to the console. */
function usage(): void {
    const synopsis: string =
        'usage: node main.js --port <port> --mode <local/remote> --start_mode <new/resume> --experiment_id <id>';
    console.info(synopsis);
}
// ---------------------------------------------------------------------------
// Entry point: parse the command line, record the startup info, create the
// log directory and start the REST server.
// ---------------------------------------------------------------------------

// --port / -p: optional; the default REST server port is used when absent.
const strPort: string = parseArg(['--port', '-p']);
let port: number = RestServer.DEFAULT_PORT;
if (strPort && strPort.length > 0) {
    port = parseInt(strPort, 10);
}

// --mode / -m: must be 'local' or 'remote'.
const mode: string = parseArg(['--mode', '-m']);
const modeIsSupported: boolean = ['local', 'remote'].includes(mode);
if (!modeIsSupported) {
    usage();
    process.exit(1);
}

// --start_mode / -s: must be 'new' or 'resume'.
const startMode: string = parseArg(['--start_mode', '-s']);
const startModeIsSupported: boolean = ['new', 'resume'].includes(startMode);
if (!startModeIsSupported) {
    usage();
    process.exit(1);
}

// --experiment_id / -id: required (non-blank) when resuming.
const experimentId: string = parseArg(['--experiment_id', '-id']);
const resumeWithoutId: boolean = startMode === 'resume' && experimentId.trim().length < 1;
if (resumeWithoutId) {
    usage();
    process.exit(1);
}

initStartupInfo(startMode, experimentId);

/** Initialize the container and start the REST server; failures are logged, not rethrown. */
async function launchRestServer(): Promise<void> {
    const log: Logger = getLogger();
    try {
        await initContainer(mode);
        const restServer: RestServer = component.get(RestServer);
        await restServer.start(port);
        log.info(`Rest server listening on: ${restServer.endPoint}`);
    } catch (err) {
        log.error(`${err.stack}`);
    }
}

/** Report a failure of the log-directory step on the console (the logger may be unusable). */
function reportLogDirFailure(err: Error): void {
    console.error(`Failed to create log dir: ${err.stack}`);
}

mkDirP(getLogDir())
    .then(launchRestServer)
    .catch(reportLogDirFailure);
src/nni_manager/package.json
0 → 100644
View file @
252f36f8
{
"name"
:
"nni"
,
"version"
:
"1.0.0"
,
"main"
:
"index.js"
,
"scripts"
:
{
"postbuild"
:
"cp -f --parent scripts/*.py ./dist/"
,
"build"
:
"tsc"
,
"test"
:
"mocha -r ts-node/register -t 15000 --recursive **/*.test.ts --colors"
,
"start"
:
"node dist/main.js"
},
"license"
:
"MIT"
,
"dependencies"
:
{
"chai-as-promised"
:
"^7.1.1"
,
"child-process-promise"
:
"^2.2.1"
,
"express"
:
"^4.16.3"
,
"node-nvidia-smi"
:
"^1.0.0"
,
"rx"
:
"^4.1.0"
,
"serve"
:
"^9.6.0"
,
"sqlite3"
:
"^4.0.2"
,
"ssh2"
:
"^0.6.1"
,
"stream-buffers"
:
"^3.0.2"
,
"tail-stream"
:
"^0.3.4"
,
"tree-kill"
:
"^1.2.0"
,
"ts-deferred"
:
"^1.0.4"
,
"typescript-ioc"
:
"^1.2.4"
,
"typescript-string-operations"
:
"^1.3.1"
},
"devDependencies"
:
{
"@types/chai"
:
"^4.1.4"
,
"@types/chai-as-promised"
:
"^7.1.0"
,
"@types/express"
:
"^4.16.0"
,
"@types/mocha"
:
"^5.2.5"
,
"@types/node"
:
"^10.5.5"
,
"@types/request"
:
"^2.47.1"
,
"@types/rx"
:
"^4.1.1"
,
"@types/sqlite3"
:
"^3.1.3"
,
"@types/ssh2"
:
"^0.5.35"
,
"@types/stream-buffers"
:
"^3.0.2"
,
"@types/tmp"
:
"^0.0.33"
,
"chai"
:
"^4.1.2"
,
"mocha"
:
"^5.2.0"
,
"request"
:
"^2.87.0"
,
"tmp"
:
"^0.0.33"
,
"ts-node"
:
"^7.0.0"
,
"tslint"
:
"^5.11.0"
,
"tslint-microsoft-contrib"
:
"^5.1.0"
,
"typescript"
:
"^3.0.1"
},
"engines"
:
{
"node"
:
">=10.0.0"
}
}
src/nni_manager/rest_server/restHandler.ts
0 → 100644
View file @
252f36f8
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'
use strict
'
;
import
{
Request
,
Response
,
Router
}
from
'
express
'
;
import
*
as
path
from
'
path
'
;
import
*
as
component
from
'
../common/component
'
;
import
{
DataStore
,
MetricDataRecord
,
TrialJobInfo
}
from
'
../common/datastore
'
;
import
{
NNIError
,
NNIErrorNames
}
from
'
../common/errors
'
;
import
{
isNewExperiment
}
from
'
../common/experimentStartupInfo
'
;
import
{
getLogger
,
Logger
}
from
'
../common/log
'
;
import
{
ExperimentProfile
,
Manager
,
TrialJobStatistics
}
from
'
../common/manager
'
;
import
{
RestServer
}
from
'
./server
'
;
import
{
TensorBoard
}
from
'
./tensorboard
'
;
/**
 * Builds the express Router that exposes the NNI manager over REST.
 *
 * Each private method registers exactly one route on the router passed to
 * it. Errors are translated into HTTP responses by `handle_error`.
 */
class NNIRestHandler {
    private restServer: RestServer;
    private nniManager: Manager;
    private tb: TensorBoard;
    private log: Logger;

    /**
     * @param rs the REST server hosting this handler; it is stopped when the
     *           experiment is stopped or the data store fails to initialize
     */
    constructor(rs: RestServer) {
        this.nniManager = component.get(Manager);
        this.restServer = rs;
        this.tb = new TensorBoard();
        this.log = getLogger();
    }

    /**
     * Create the router: a logging / CORS / content-type middleware followed
     * by all API routes.
     */
    public createRestHandler(): Router {
        const router: Router = Router();

        // tslint:disable-next-line:typedef
        router.use((req: Request, res: Response, next) => {
            this.log.info(`${req.method}: ${req.url}: body:\n${JSON.stringify(req.body, undefined, 4)}`);
            res.header('Access-Control-Allow-Origin', '*');
            res.header('Access-Control-Allow-Headers', 'Origin, X-Requested-With, Content-Type, Accept');
            res.header('Access-Control-Allow-Methods', 'PUT,POST,GET,DELETE,OPTIONS');

            res.setHeader('Content-Type', 'application/json');
            next();
        });

        this.checkStatus(router);
        this.getExperimentProfile(router);
        this.updateExperimentProfile(router);
        this.startExperiment(router);
        this.stopExperiment(router);
        this.getTrialJobStatistics(router);
        this.setClusterMetaData(router);
        this.listTrialJobs(router);
        this.getTrialJob(router);
        this.addTrialJob(router);
        this.cancelTrialJob(router);
        this.getMetricData(router);
        this.getExample(router);
        this.getTriedParameters(router);
        this.startTensorBoard(router);
        this.stopTensorBoard(router);

        return router;
    }

    /**
     * Log an error and report it to the client: 404 for an NNIError named
     * NOT_FOUND, 500 otherwise, with `{ error: <message> }` as the body.
     * When the response has already been sent, the error is only logged.
     */
    private handle_error(err: Error, res: Response): void {
        this.log.info(err);

        // Some handlers reply before they finish (stopExperiment sends the
        // response and then stops the server). Setting the status or body at
        // that point would throw, so there is nothing more to do here.
        if (res.headersSent) {
            return;
        }

        if (err instanceof NNIError && err.name === NNIErrorNames.NOT_FOUND) {
            res.status(404);
        } else {
            res.status(500);
        }
        res.send({
            error: err.message
        });
    }

    // TODO add validators for request params, query, body

    /** GET /check-status: initialize the data store; on failure report the error and stop the REST server. */
    private checkStatus(router: Router): void {
        router.get('/check-status', (req: Request, res: Response) => {
            const ds: DataStore = component.get<DataStore>(DataStore);
            ds.init().then(() => {
                res.send();
            }).catch(async (err: Error) => {
                this.handle_error(err, res);
                this.log.error(err.message);
                this.log.error(`Database initialize failed, stopping rest server...`);
                await this.restServer.stop();
            });
        });
    }

    /** GET /experiment: return the experiment profile. */
    private getExperimentProfile(router: Router): void {
        router.get('/experiment', (req: Request, res: Response) => {
            this.nniManager.getExperimentProfile().then((profile: ExperimentProfile) => {
                res.send(profile);
            }).catch((err: Error) => {
                this.handle_error(err, res);
            });
        });
    }

    /** PUT /experiment?update_type=...: update the experiment profile from the request body. */
    private updateExperimentProfile(router: Router): void {
        router.put('/experiment', (req: Request, res: Response) => {
            this.nniManager.updateExperimentProfile(req.body, req.query.update_type).then(() => {
                res.send();
            }).catch((err: Error) => {
                this.handle_error(err, res);
            });
        });
    }

    /**
     * POST /experiment: start a new experiment from the request body and
     * reply with its id, or resume the existing experiment when this process
     * was not started for a new one.
     */
    private startExperiment(router: Router): void {
        router.post('/experiment', (req: Request, res: Response) => {
            if (isNewExperiment()) {
                this.nniManager.startExperiment(req.body).then((eid: string) => {
                    res.send({
                        experiment_id: eid
                    });
                }).catch((err: Error) => {
                    this.handle_error(err, res);
                });
            } else {
                this.nniManager.resumeExperiment().then(() => {
                    res.send();
                }).catch((err: Error) => {
                    this.handle_error(err, res);
                });
            }
        });
    }

    /**
     * DELETE /experiment: clean up TensorBoard, stop the experiment, reply,
     * and then stop the REST server itself.
     */
    private stopExperiment(router: Router): void {
        router.delete('/experiment', async (req: Request, res: Response) => {
            try {
                await this.tb.cleanUp();
                await this.nniManager.stopExperiment();
                // Reply first: the server cannot answer once it has been stopped.
                res.send();
                this.log.debug('Stopping rest server');
                await this.restServer.stop();
            } catch (err) {
                this.handle_error(err, res);
            }
        });
    }

    /** GET /job-statistics: return the trial job statistics. */
    private getTrialJobStatistics(router: Router): void {
        router.get('/job-statistics', (req: Request, res: Response) => {
            this.nniManager.getTrialJobStatistics().then((statistics: TrialJobStatistics[]) => {
                res.send(statistics);
            }).catch((err: Error) => {
                this.handle_error(err, res);
            });
        });
    }

    /**
     * PUT /experiment/cluster-metadata: set every key of the JSON body as
     * cluster metadata, one after another; each value is passed on as a JSON string.
     */
    private setClusterMetaData(router: Router): void {
        router.put('/experiment/cluster-metadata', async (req: Request, res: Response) => {
            // tslint:disable-next-line:no-any
            const metadata: any = req.body;
            const keys: string[] = Object.keys(metadata);
            try {
                for (const key of keys) {
                    await this.nniManager.setClusterMetadata(key, JSON.stringify(metadata[key]));
                }
                res.send();
            } catch (err) {
                this.handle_error(err, res);
            }
        });
    }

    /** GET /trial-jobs?status=...: list trial jobs, adding the stderr path to failed ones. */
    private listTrialJobs(router: Router): void {
        router.get('/trial-jobs', (req: Request, res: Response) => {
            this.nniManager.listTrialJobs(req.query.status).then((jobInfos: TrialJobInfo[]) => {
                jobInfos.forEach((trialJob: TrialJobInfo) => {
                    this.setErrorPathForFailedJob(trialJob);
                });
                res.send(jobInfos);
            }).catch((err: Error) => {
                this.handle_error(err, res);
            });
        });
    }

    /** GET /trial-jobs/:id: return one trial job, adding the stderr path when it failed. */
    private getTrialJob(router: Router): void {
        router.get('/trial-jobs/:id', (req: Request, res: Response) => {
            this.nniManager.getTrialJob(req.params.id).then((jobDetail: TrialJobInfo) => {
                const jobInfo: TrialJobInfo = this.setErrorPathForFailedJob(jobDetail);
                res.send(jobInfo);
            }).catch((err: Error) => {
                this.handle_error(err, res);
            });
        });
    }

    /** POST /trial-jobs: add a customized trial job; the body is passed on as a JSON string. */
    private addTrialJob(router: Router): void {
        router.post('/trial-jobs', async (req: Request, res: Response) => {
            this.nniManager.addCustomizedTrialJob(JSON.stringify(req.body)).then(() => {
                res.send();
            }).catch((err: Error) => {
                this.handle_error(err, res);
            });
        });
    }

    /** DELETE /trial-jobs/:id: cancel a trial job on behalf of the user. */
    private cancelTrialJob(router: Router): void {
        router.delete('/trial-jobs/:id', async (req: Request, res: Response) => {
            this.nniManager.cancelTrialJobByUser(req.params.id).then(() => {
                res.send();
            }).catch((err: Error) => {
                this.handle_error(err, res);
            });
        });
    }

    /** GET /metric-data/:job_id?type=...: return the metric data of a trial job. */
    private getMetricData(router: Router): void {
        router.get('/metric-data/:job_id', async (req: Request, res: Response) => {
            this.nniManager.getMetricData(req.params.job_id, req.query.type).then((metricsData: MetricDataRecord[]) => {
                res.send(metricsData);
            }).catch((err: Error) => {
                this.handle_error(err, res);
            });
        });
    }

    /**
     * POST /tensorboard?job_ids=a,b[&tensorboard_cmd=...]: start TensorBoard
     * for the given trial jobs and reply with its endpoint. Replies 400 when
     * `job_ids` is missing.
     */
    private startTensorBoard(router: Router): void {
        router.post('/tensorboard', async (req: Request, res: Response) => {
            // Without this check `.split` would throw inside the async handler,
            // leaving an unhandled rejection and a request that never completes.
            if (typeof req.query.job_ids !== 'string') {
                res.status(400);
                res.send({
                    error: 'Query parameter job_ids is required'
                });

                return;
            }
            try {
                const jobIds: string[] = req.query.job_ids.split(',');
                const tensorboardCmd: string | undefined = req.query.tensorboard_cmd;
                const endPoint: string = await this.tb.startTensorBoard(jobIds, tensorboardCmd);
                res.send({ endPoint: endPoint });
            } catch (err) {
                this.handle_error(err, res);
            }
        });
    }

    /** DELETE /tensorboard?endpoint=...: stop the TensorBoard instance at the given endpoint. */
    private stopTensorBoard(router: Router): void {
        router.delete('/tensorboard', async (req: Request, res: Response) => {
            const endPoint: string = req.query.endpoint;
            this.tb.stopTensorBoard(endPoint).then(() => {
                res.send();
            }).catch((err: Error) => {
                this.handle_error(err, res);
            });
        });
    }

    /** GET /example: placeholder; replies 501 so the client is not left waiting. */
    private getExample(router: Router): void {
        router.get('/example', (req: Request, res: Response) => {
            res.status(501);
            res.send({
                error: 'Not implemented'
            });
        });
    }

    /** GET /tried-parameters: placeholder; replies 501 so the client is not left waiting. */
    private getTriedParameters(router: Router): void {
        router.get('/tried-parameters', (req: Request, res: Response) => {
            res.status(501);
            res.send({
                error: 'Not implemented'
            });
        });
    }

    /**
     * For a FAILED job that has a log path, set `stderrPath` to
     * `<logPath>/.nni/stderr`. The object is modified in place and returned;
     * any other input is returned unchanged.
     */
    private setErrorPathForFailedJob(jobInfo: TrialJobInfo): TrialJobInfo {
        if (jobInfo === undefined || jobInfo.status !== 'FAILED' || jobInfo.logPath === undefined) {
            return jobInfo;
        }
        jobInfo.stderrPath = path.join(jobInfo.logPath, '.nni', 'stderr');

        return jobInfo;
    }
}
/**
 * Create the express Router serving the NNI REST API.
 *
 * @param rs the REST server that will host the router
 */
export function createRestHandler(rs: RestServer): Router {
    return new NNIRestHandler(rs).createRestHandler();
}
src/nni_manager/rest_server/server.ts
0 → 100644
View file @
252f36f8
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'
use strict
'
;
import
*
as
bodyParser
from
'
body-parser
'
;
import
*
as
express
from
'
express
'
;
import
*
as
http
from
'
http
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
*
as
component
from
'
../common/component
'
;
import
{
getLogger
,
Logger
}
from
'
../common/log
'
;
import
{
Manager
}
from
'
../common/manager
'
;
import
{
createRestHandler
}
from
'
./restHandler
'
;
/**
 * Singleton HTTP server that mounts the NNI REST handler under API_ROOT_URL.
 *
 * start() and stop() are each one-shot: the first call creates a Deferred and
 * every later call returns that same promise.
 */
@component.Singleton
export class RestServer {
    public static readonly DEFAULT_PORT: number = 51188;
    private readonly API_ROOT_URL: string = '/api/v1/nni';
    private hostName: string = '0.0.0.0';
    private port: number = RestServer.DEFAULT_PORT;
    // Assigned lazily by start() / stop(); undefined until the first call.
    private startTask!: Deferred<void>;
    private stopTask!: Deferred<void>;
    private app: express.Application = express();
    private server!: http.Server;
    private log: Logger = getLogger();

    /** Base URL built from the configured host name and port. */
    get endPoint(): string {
        // tslint:disable-next-line:no-http-string
        return `http://${this.hostName}:${this.port}`;
    }

    /**
     * Register the REST handler and start listening.
     *
     * @param port optional port; a falsy value (undefined or 0) keeps the current port
     * @param hostName optional host name; a falsy value keeps the current host name
     * @returns promise resolved on the 'listening' event and rejected on the server 'error' event
     */
    public start(port?: number, hostName?: string): Promise<void> {
        if (this.startTask !== undefined) {
            return this.startTask.promise;
        }
        this.startTask = new Deferred<void>();

        this.registerRestHandler();

        if (hostName) {
            this.hostName = hostName;
        }
        if (port) {
            this.port = port;
        }

        this.server = this.app.listen(this.port, this.hostName).on('listening', () => {
            this.startTask.resolve();
        }).on('error', (e: Error) => {
            this.startTask.reject(e);
        });

        return this.startTask.promise;
    }

    /**
     * Stop the server.
     *
     * Resolves immediately when start() was never called or when starting failed;
     * otherwise waits for the start to complete and then closes the server.
     *
     * @returns promise resolved on the 'close' event and rejected with the error
     *          emitted while closing
     */
    public stop(): Promise<void> {
        if (this.stopTask !== undefined) {
            return this.stopTask.promise;
        }
        this.stopTask = new Deferred<void>();

        if (this.startTask === undefined) {
            // Never started: nothing to close.
            this.stopTask.resolve();

            return this.stopTask.promise;
        } else {
            this.startTask.promise.then(
                () => { // Started
                    this.server.close().on('close', () => {
                        this.log.info('Rest server stopped.');
                        this.stopTask.resolve();
                    }).on('error', (e: Error) => {
                        this.log.error(`Error occurred stopping Rest server: ${e.message}`);
                        // Forward the error so that awaiters get a meaningful rejection reason.
                        this.stopTask.reject(e);
                    });
                },
                () => { // Start task rejected
                    this.stopTask.resolve();
                }
            );
        }

        return this.stopTask.promise;
    }

    /** Install the JSON body parser and mount the REST handler under API_ROOT_URL. */
    private registerRestHandler(): void {
        this.app.use(bodyParser.json());
        this.app.use(this.API_ROOT_URL, createRestHandler(this));
    }
}
src/nni_manager/rest_server/tensorboard.ts
0 → 100644
View file @
252f36f8
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'
use strict
'
;
import
*
as
component
from
'
../common/component
'
;
import
{
DataStore
,
TrialJobInfo
}
from
'
../common/datastore
'
;
import
{
NNIErrorNames
}
from
'
../common/errors
'
;
import
{
getLogger
,
Logger
}
from
'
../common/log
'
;
import
{
HostJobApplicationForm
,
TrainingService
,
TrialJobStatus
}
from
'
../common/trainingService
'
;
/**
 * Starts and stops tensorboard processes by submitting 'HOST' jobs to the
 * training service, keeping track of one tensorboard job id per host.
 */
export class TensorBoard {
    private DEFAULT_PORT: number = 6006;
    private TENSORBOARD_COMMAND: string = 'PATH=$PATH:~/.local/bin:/usr/local/bin tensorboard';
    // host -> id of the tensorboard job submitted for that host
    private tbJobMap: Map<string, string>;
    private trainingService: TrainingService;
    private dataStore: DataStore;
    private log: Logger = getLogger();

    constructor() {
        this.tbJobMap = new Map();
        this.trainingService = component.get(TrainingService);
        this.dataStore = component.get(DataStore);
    }

    /**
     * Start tensorboard on the host of the first trial job, pointing it at the
     * log directories of all the given trial jobs.
     *
     * A tensorboard job already tracked for that host and still RUNNING/WAITING is
     * cancelled first.
     *
     * @param trialJobIds trial jobs whose log directories are shown; must not be empty
     * @param tbCmd optional command replacing the default tensorboard command (ignored when blank)
     * @param port optional port, defaults to DEFAULT_PORT
     * @returns the tensorboard endpoint, `http://<host>:<port>`
     */
    public async startTensorBoard(trialJobIds: string[], tbCmd?: string, port?: number): Promise<string> {
        let tensorBoardPort: number = this.DEFAULT_PORT;
        if (port !== undefined) {
            tensorBoardPort = port;
        }
        const host: string = await this.getJobHost(trialJobIds);
        const tbEndpoint: string = `http://${host}:${tensorBoardPort}`;

        try {
            if (await this.isTensorBoardRunningOnHost(host)) {
                await this.stopHostTensorBoard(host);
            }
        } catch (error) {
            if (error.name !== NNIErrorNames.NOT_FOUND) {
                throw error;
            } else {
                // The tracked job is unknown to the training service: forget it.
                this.tbJobMap.delete(host);
            }
        }

        const logDirs: string[] = [];
        for (const id of trialJobIds) {
            logDirs.push(await this.getLogDir(id));
        }

        let tensorBoardCmd: string = this.TENSORBOARD_COMMAND;
        if (tbCmd !== undefined && tbCmd.trim().length > 0) {
            tensorBoardCmd = tbCmd;
        }
        // NOTE(review): tbCmd and the log directories are interpolated into the command
        // line unescaped - confirm that callers only pass trusted values.
        const cmd: string = `${tensorBoardCmd} --logdir ${logDirs.join(':')} --port ${tensorBoardPort}`;

        const form: HostJobApplicationForm = {
            jobType: 'HOST',
            host: host,
            cmd: cmd
        };

        const jobId: string = (await this.trainingService.submitTrialJob(form)).id;
        this.tbJobMap.set(host, jobId);

        return tbEndpoint;
    }

    /**
     * Cancel every tracked tensorboard job. Failures are logged, not propagated.
     */
    public async cleanUp(): Promise<void> {
        const stopTensorBoardTasks: Promise<void>[] = [];
        this.tbJobMap.forEach((jobId: string, host: string) => {
            stopTensorBoardTasks.push(this.stopHostTensorBoard(host).catch((err: Error) => {
                this.log.error(`Error occurred stopping tensorboard service: ${err.message}`);
            }));
        });
        await Promise.all(stopTensorBoardTasks);
    }

    /**
     * Stop the tensorboard job tracked for the host part of the given endpoint.
     *
     * @param endPoint endpoint previously returned by startTensorBoard
     * @throws Error when the endpoint does not look like `<scheme>://<host>:<port>`
     */
    public stopTensorBoard(endPoint: string): Promise<void> {
        const host: string = this.getEndPointHost(endPoint);

        return this.stopHostTensorBoard(host);
    }

    /** Cancel the job tracked for the host; resolves immediately when none is tracked. */
    private stopHostTensorBoard(host: string): Promise<void> {
        const jobId: string | undefined = this.tbJobMap.get(host);
        if (jobId === undefined) {
            return Promise.resolve();
        }

        return this.trainingService.cancelTrialJob(jobId);
    }

    /** True when a job is tracked for the host and its status is RUNNING or WAITING. */
    private async isTensorBoardRunningOnHost(host: string): Promise<boolean> {
        const jobId: string | undefined = this.tbJobMap.get(host);
        if (jobId === undefined) {
            return false;
        }

        const status: TrialJobStatus = (await this.trainingService.getTrialJob(jobId)).status;

        return ['RUNNING', 'WAITING'].includes(status);
    }

    /**
     * Host of the first trial job, taken from its logPath.
     *
     * @throws Error when no trial job is given or the logPath is missing or malformed
     */
    private async getJobHost(trialJobIds: string[]): Promise<string> {
        if (trialJobIds === undefined || trialJobIds.length < 1) {
            throw new Error('No trial job specified.');
        }
        const jobInfo: TrialJobInfo = await this.dataStore.getTrialJob(trialJobIds[0]);

        return this.splitLogPath(jobInfo)[0];
    }

    /**
     * Log directory of a trial job, taken from its logPath.
     *
     * @throws Error when the logPath is missing or carries no directory part
     */
    private async getLogDir(trialJobId: string): Promise<string> {
        const jobInfo: TrialJobInfo = await this.dataStore.getTrialJob(trialJobId);
        const logDir: string | undefined = this.splitLogPath(jobInfo)[1];
        if (logDir === undefined) {
            throw new Error(`Invalid job logPath: ${jobInfo.logPath}`);
        }

        return logDir;
    }

    /**
     * Split a job logPath into the ':'-separated parts that follow '://'.
     * With a logPath shaped like `<scheme>://<host>:<dir>`, element 0 is the host and
     * element 1 the directory.
     *
     * @throws Error when the logPath is undefined or contains no '://'
     */
    private splitLogPath(jobInfo: TrialJobInfo): string[] {
        const logPath: string | undefined = jobInfo.logPath;
        if (logPath === undefined) {
            throw new Error(`Failed to find job logPath: ${jobInfo.id}`);
        }
        const location: string | undefined = logPath.split('://')[1];
        if (location === undefined) {
            // Fail with a readable message instead of a TypeError on undefined.split()
            throw new Error(`Invalid job logPath: ${logPath}`);
        }

        return location.split(':'); //TODO use url parse
    }

    /** Extract the host from an endpoint of the form `<scheme>://<host>:<port>`. */
    private getEndPointHost(endPoint: string): string {
        const parts: RegExpMatchArray | null = endPoint.match(/.*:\/\/(.*):(.*)/);
        if (parts !== null) {
            return parts[1];
        } else {
            throw new Error(`Invalid endPoint: ${endPoint}`);
        }
    }
}
src/nni_manager/rest_server/test/mockedNNIManager.ts
0 → 100644
View file @
252f36f8
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'
use strict
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
Provider
}
from
'
typescript-ioc
'
;
import
{
MetricDataRecord
,
MetricType
,
TrialJobInfo
}
from
'
../../common/datastore
'
;
import
{
MethodNotImplementedError
}
from
'
../../common/errors
'
;
import
{
ExperimentParams
,
ExperimentProfile
,
Manager
,
ProfileUpdateType
,
TrialJobStatistics
}
from
'
../../common/manager
'
;
import
{
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobStatus
}
from
'
../../common/trainingService
'
;
// IoC provider used by the unit tests: each lookup hands out a new MockedNNIManager.
export const testManagerProvider: Provider = {
    get: (): Manager => new MockedNNIManager()
};
/**
 * Manager stand-in for the REST server unit tests. Every method returns fixed,
 * canned data; stopExperiment and getMetricData are not implemented and throw.
 */
export class MockedNNIManager extends Manager {
    public updateExperimentProfile(experimentProfile: ExperimentProfile, updateType: ProfileUpdateType): Promise<void> {
        return Promise.resolve();
    }

    /** Always reports two RUNNING jobs and one FAILED job. */
    public getTrialJobStatistics(): Promise<TrialJobStatistics[]> {
        const statistics: TrialJobStatistics[] = [
            { trialJobStatus: 'RUNNING', trialJobNumber: 2 },
            { trialJobStatus: 'FAILED', trialJobNumber: 1 }
        ];
        const deferred: Deferred<TrialJobStatistics[]> = new Deferred<TrialJobStatistics[]>();
        deferred.resolve(statistics);

        return deferred.promise;
    }

    public addCustomizedTrialJob(hyperParams: string): Promise<void> {
        return Promise.resolve();
    }

    public resumeExperiment(): Promise<void> {
        return Promise.resolve();
    }

    /** Resolves with a canned job detail in the RUNNING state, whatever the form. */
    public submitTrialJob(form: TrialJobApplicationForm): Promise<TrialJobDetail> {
        const deferred: Deferred<TrialJobDetail> = new Deferred<TrialJobDetail>();
        deferred.resolve(this.createMockedJobDetail('RUNNING'));

        return deferred.promise;
    }

    public cancelTrialJobByUser(trialJobId: string): Promise<void> {
        return Promise.resolve();
    }

    public getClusterMetadata(key: string): Promise<string> {
        return Promise.resolve('METAVALUE1');
    }

    public startExperiment(experimentParams: ExperimentParams): Promise<string> {
        return Promise.resolve('id-1234');
    }

    /** Rejects with 'Test Error' for the key 'exception_test_key'; resolves for any other key. */
    public setClusterMetadata(key: string, value: string): Promise<void> {
        const deferred: Deferred<void> = new Deferred<void>();
        if (key === 'exception_test_key') {
            deferred.reject(new Error('Test Error'));
        } else {
            deferred.resolve();
        }

        return deferred.promise;
    }

    /** Resolves with a canned job detail in the SUCCEEDED state, whatever the id. */
    public getTrialJob(trialJobId: string): Promise<TrialJobDetail> {
        const deferred: Deferred<TrialJobDetail> = new Deferred<TrialJobDetail>();
        deferred.resolve(this.createMockedJobDetail('SUCCEEDED'));

        return deferred.promise;
    }

    public stopExperiment(): Promise<void> {
        throw new MethodNotImplementedError();
    }

    public getMetricData(trialJobId: string, metricType: MetricType): Promise<MetricDataRecord[]> {
        throw new MethodNotImplementedError();
    }

    public getExperimentProfile(): Promise<ExperimentProfile> {
        const profile: ExperimentProfile = {
            params: {
                authorName: 'test',
                experimentName: 'exp1',
                trialConcurrency: 2,
                maxExecDuration: 30,
                maxTrialNum: 3,
                searchSpace: '{lr: 0.01}',
                tuner: {
                    tunerCommand: 'python3 tuner.py',
                    tunerCwd: '/tmp/tunner',
                    tunerCheckpointDirectory: ''
                }
            },
            id: '2345',
            execDuration: 0,
            startTime: new Date(),
            endTime: new Date(),
            revision: 0
        };

        return Promise.resolve(profile);
    }

    /** Always resolves with one SUCCEEDED and one FAILED job; the status filter is ignored. */
    public listTrialJobs(status?: TrialJobStatus): Promise<TrialJobInfo[]> {
        const jobs: TrialJobInfo[] = [
            {
                id: '1234',
                status: 'SUCCEEDED',
                startTime: new Date(),
                endTime: new Date(),
                finalMetricData: 'lr: 0.01, val accuracy: 0.89, batch size: 256'
            },
            {
                id: '3456',
                status: 'FAILED',
                startTime: new Date(),
                endTime: new Date(),
                finalMetricData: ''
            }
        ];

        return Promise.resolve(jobs);
    }

    // Canned trial job detail shared by submitTrialJob and getTrialJob; only the status differs.
    private createMockedJobDetail(status: TrialJobStatus): TrialJobDetail {
        return {
            id: '1234',
            status: status,
            submitTime: new Date(),
            startTime: new Date(),
            endTime: new Date(),
            tags: ['test'],
            // tslint:disable-next-line:no-http-string
            url: 'http://test',
            workingDirectory: '/tmp/mocked',
            form: {
                jobType: 'TRIAL'
            }
        };
    }
}
src/nni_manager/rest_server/test/restserver.test.ts
0 → 100644
View file @
252f36f8
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'
use strict
'
;
// tslint:disable-next-line:no-implicit-dependencies
import
{
assert
,
expect
}
from
'
chai
'
;
// tslint:disable-next-line:no-implicit-dependencies
import
*
as
request
from
'
request
'
;
import
{
Container
}
from
'
typescript-ioc
'
;
import
*
as
component
from
'
../../common/component
'
;
import
{
DataStore
}
from
'
../../common/datastore
'
;
import
{
ExperimentProfile
,
Manager
}
from
'
../../common/manager
'
;
import
{
TrainingService
}
from
'
../../common/trainingService
'
;
import
{
cleanupUnitTest
,
prepareUnitTest
}
from
'
../../common/utils
'
;
import
{
MockedDataStore
}
from
'
../../core/test/mockedDatastore
'
;
import
{
MockedTrainingService
}
from
'
../../core/test/mockedTrainingService
'
;
import
{
RestServer
}
from
'
../server
'
;
import
{
testManagerProvider
}
from
'
./mockedNNIManager
'
;
// End-to-end tests of the REST server: a real RestServer is started on its default
// port with the manager, data store and training service replaced by mocks.
describe('Unit test for rest server', () => {

    let ROOT_URL: string;

    before((done: Mocha.Done) => {
        prepareUnitTest();
        Container.bind(Manager).provider(testManagerProvider);
        Container.bind(DataStore).to(MockedDataStore);
        Container.bind(TrainingService).to(MockedTrainingService);
        const restServer: RestServer = component.get(RestServer);
        restServer.start().then(
            () => {
                ROOT_URL = `${restServer.endPoint}/api/v1/nni`;
                done();
            },
            (e: Error) => {
                // Report the failure through done(): throwing inside a promise handler
                // would only surface as an unhandled rejection plus a hook timeout.
                done(new Error(`Failed to start rest server: ${e.message}`));
            }
        );
    });

    after(() => {
        component.get<RestServer>(RestServer).stop();
        cleanupUnitTest();
    });

    it('Test GET check-status', (done: Mocha.Done) => {
        request.get(`${ROOT_URL}/check-status`, (err: Error, res: request.Response) => {
            if (err) {
                assert.fail(err.message);
            } else {
                expect(res.statusCode).to.equal(200);
            }
            done();
        });
    });

    it('Test GET trial-jobs/:id', (done: Mocha.Done) => {
        // tslint:disable-next-line:no-any
        request.get(`${ROOT_URL}/trial-jobs/1234`, (err: Error, res: request.Response, body: any) => {
            if (err) {
                assert.fail(err.message);
            } else {
                expect(res.statusCode).to.equal(200);
                expect(JSON.parse(body).id).to.equal('1234');
            }
            done();
        });
    });

    it('Test GET experiment', (done: Mocha.Done) => {
        request.get(`${ROOT_URL}/experiment`, (err: Error, res: request.Response) => {
            if (err) {
                assert.fail(err.message);
            } else {
                expect(res.statusCode).to.equal(200);
            }
            done();
        });
    });

    it('Test GET trial-jobs', (done: Mocha.Done) => {
        request.get(`${ROOT_URL}/trial-jobs`, (err: Error, res: request.Response) => {
            // Check the transport error first: res is not available when the request failed.
            if (err) {
                assert.fail(err.message);
            } else {
                expect(res.statusCode).to.equal(200);
            }
            done();
        });
    });

    it('Test change concurrent-trial-jobs', (done: Mocha.Done) => {
        // tslint:disable-next-line:no-any
        request.get(`${ROOT_URL}/experiment`, (err: Error, res: request.Response, body: any) => {
            if (err) {
                assert.fail(err.message);
            } else {
                expect(res.statusCode).to.equal(200);
                const profile: ExperimentProfile = JSON.parse(body);
                if (profile.params && profile.params.trialConcurrency) {
                    profile.params.trialConcurrency = 10;
                }

                // Send the modified profile back to the server.
                const req: request.Options = {
                    uri: `${ROOT_URL}/experiment`,
                    method: 'PUT',
                    json: true,
                    body: profile
                };
                request(req, (error: Error, response: request.Response) => {
                    if (error) {
                        assert.fail(error.message);
                    } else {
                        expect(response.statusCode).to.equal(200);
                    }
                    done();
                });
            }
        });
    });

    it('Test PUT experiment/cluster-metadata exception', (done: Mocha.Done) => {
        // The mocked manager rejects this key, which must surface as HTTP 500.
        const req: request.Options = {
            uri: `${ROOT_URL}/experiment/cluster-metadata`,
            method: 'PUT',
            json: true,
            body: {
                exception_test_key: 'test'
            }
        };
        request(req, (err: Error, res: request.Response) => {
            if (err) {
                assert.fail(err.message);
            } else {
                expect(res.statusCode).to.equal(500);
            }
            done();
        });
    });

    it('Test PUT experiment/cluster-metadata', (done: Mocha.Done) => {
        const req: request.Options = {
            uri: `${ROOT_URL}/experiment/cluster-metadata`,
            method: 'PUT',
            json: true,
            body: {
                MACHINE_LIST: [{
                    ip: '10.10.10.101',
                    port: 22,
                    username: 'test',
                    passwd: '1234'
                }, {
                    ip: '10.10.10.102',
                    port: 22,
                    username: 'test',
                    passwd: '1234'
                }]
            }
        };
        request(req, (err: Error, res: request.Response) => {
            if (err) {
                assert.fail(err.message);
            } else {
                expect(res.statusCode).to.equal(200);
            }
            done();
        });
    });

    it('Test POST experiment', (done: Mocha.Done) => {
        const req: request.Options = {
            uri: `${ROOT_URL}/experiment`,
            method: 'POST',
            json: true,
            body: {
                author: 'test',
                trial: {
                    entrypoint: 'python',
                    args: 'mnist.py'
                }
            }
        };
        // tslint:disable-next-line:no-any
        request(req, (err: Error, res: request.Response, body: any) => {
            if (err) {
                assert.fail(err.message);
            } else {
                expect(res.statusCode).to.equal(200);
                expect(body.experiment_id).to.equal('id-1234');
            }
            done();
        });
    });
});
src/nni_manager/scripts/gpu_metrics_collector.py
0 → 100644
View file @
252f36f8
#!/usr/bin/python
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import
json
import
os
import
subprocess
import
sys
import
time
from
xml.dom
import
minidom
def check_ready_to_run():
    """Return True when no other GPU metrics collector process is running.

    Uses ``pgrep -fx`` to list the processes whose full command line is exactly
    ``python3 gpu_metrics_collector.py`` and ignores the current process.

    Raises:
        subprocess.CalledProcessError: if pgrep fails for a reason other than
            "no process matched".
    """
    try:
        pgrep_output = subprocess.check_output(
            ['pgrep', '-fx', 'python3 gpu_metrics_collector.py'])
    except subprocess.CalledProcessError as err:
        # pgrep exits with status 1 when nothing matches (for example when this
        # script was launched with a different command line): nobody else runs.
        if err.returncode == 1:
            return True
        raise
    current_pid = os.getpid()
    # Filter instead of list.remove(): the current pid is not guaranteed to be listed.
    other_pids = [int(pid) for pid in pgrep_output.splitlines() if int(pid) != current_pid]
    return not other_pids
def main(argv):
    """Poll ``nvidia-smi`` every 5 seconds and append the result to ./gpu_metrics.

    Exits with status 2 when check_ready_to_run() reports that another collector
    is already running. Otherwise the metrics file is truncated, made
    world-accessible (mode 0o777) and the function loops forever.

    Args:
        argv: command-line arguments; currently unused.
    """
    if not check_ready_to_run():
        # GPU metrics collector is already running. Exit
        sys.exit(2)

    # Create (or truncate) the metrics file before the first sample is appended.
    with open("./gpu_metrics", "w"):
        pass
    os.chmod("./gpu_metrics", 0o777)

    cmd = 'nvidia-smi -q -x'
    while True:
        try:
            smi_output = subprocess.check_output(cmd, shell=True)
            parse_nvidia_smi_result(smi_output, '.')
        except Exception:
            # Keep polling after a failed sample, but let KeyboardInterrupt and
            # SystemExit through so that the collector can be stopped.
            for e in sys.exc_info():
                print("job exporter error {}".format(e))
        # TODO: change to sleep time configurable via arguments
        time.sleep(5)
def _utilization_value(gpu, tag_name):
    """Return the text of <utilization>/<tag_name> of a <gpu> node without the percent sign."""
    utilization = gpu.getElementsByTagName('utilization')[0]
    raw_value = utilization.getElementsByTagName(tag_name)[0].childNodes[0].data
    return raw_value.replace("%", "").strip()


def parse_nvidia_smi_result(smi, outputDir):
    """Parse ``nvidia-smi -q -x`` XML and append one JSON line to <outputDir>/gpu_metrics.

    The record holds a local "Timestamp", the "gpuCount" and, per <gpu> element,
    its index, the gpu and memory utilization (as strings without '%') and the
    number of <process_info> entries. The record is also printed to stdout.

    Any error while parsing or writing is reported on stdout and swallowed, so a
    bad sample never stops the caller.

    Args:
        smi: XML document produced by nvidia-smi (str or bytes).
        outputDir: directory containing the gpu_metrics file.
    """
    try:
        xmldoc = minidom.parseString(smi)
        gpuList = xmldoc.getElementsByTagName('gpu')
        outPut = {
            "Timestamp": time.asctime(time.localtime()),
            "gpuCount": len(gpuList),
            "gpuInfos": [],
        }
        for gpuIndex, gpu in enumerate(gpuList):
            processes = gpu.getElementsByTagName('processes')
            outPut["gpuInfos"].append({
                'index': gpuIndex,
                'gpuUtil': _utilization_value(gpu, 'gpu_util'),
                'gpuMemUtil': _utilization_value(gpu, 'memory_util'),
                'activeProcessNum': len(processes[0].getElementsByTagName('process_info')),
            })
        print(outPut)
        # Closing the file at the end of the with-block flushes the record.
        with open(os.path.join(outputDir, "gpu_metrics"), 'a') as outputFile:
            outputFile.write("{}\n".format(json.dumps(outPut, sort_keys=True)))
    except Exception as err:
        # Report what went wrong instead of hiding it; KeyboardInterrupt and
        # SystemExit are deliberately not caught here.
        print('xmldoc parsing error: {}'.format(err))
# Script entry point: when the file is executed directly, run main() with the
# command-line arguments that follow the script name.
if
__name__
==
"__main__"
:
main
(
sys
.
argv
[
1
:])
src/nni_manager/scripts/metrics_reader.py
0 → 100644
View file @
252f36f8
# ============================================================================================================================== #
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# ============================================================================================================================== #
import
argparse
import
errno
import
json
import
os
import
re
# Files of a trial job, relative to the trial job directory.
METRICS_FILENAME = '.nni/metrics'
OFFSET_FILENAME = '.nni/metrics_offset'
JOB_CODE_FILENAME = '.nni/code'
JOB_PID_FILENAME = '.nni/jobpid'
# Content of the job code file: "<return code> <timestamp>"
JOB_CODE_PATTERN = re.compile(r'^(\d+)\s+(\d+)$')
# A metric record is MAGIC, then a LEN_FIELD_SIZE-character decimal length, then the data.
LEN_FIELD_SIZE = 6
MAGIC = 'ME'


class TrialMetricsReader():
    '''
    Read metrics data from a trial job
    '''
    def __init__(self, trial_job_dir):
        self.trial_job_dir = trial_job_dir
        self.offset_filename = os.path.join(trial_job_dir, OFFSET_FILENAME)
        self.metrics_filename = os.path.join(trial_job_dir, METRICS_FILENAME)
        self.jobcode_filename = os.path.join(trial_job_dir, JOB_CODE_FILENAME)
        self.jobpid_filemame = os.path.join(trial_job_dir, JOB_PID_FILENAME)

    def _metrics_file_is_empty(self):
        '''
        Return True when the metrics file is missing or has a size of 0
        '''
        if not os.path.isfile(self.metrics_filename):
            return True
        statinfo = os.stat(self.metrics_filename)
        return statinfo.st_size == 0

    def _get_offset(self):
        '''
        Return the offset stored in the offset file, or 0 when that file does not exist
        '''
        offset = 0
        if os.path.isfile(self.offset_filename):
            with open(self.offset_filename, 'r') as f:
                offset = int(f.readline())
        return offset

    def _write_offset(self, offset):
        '''
        Persist the offset. Raises ValueError when it is negative or beyond the
        current size of the metrics file.
        '''
        statinfo = os.stat(self.metrics_filename)
        if offset < 0 or offset > statinfo.st_size:
            raise ValueError('offset value is invalid: {}'.format(offset))

        with open(self.offset_filename, 'w') as f:
            f.write(str(offset) + '\n')

    def _read_all_available_records(self, offset):
        '''
        Read every record of the metrics file starting at offset and persist the
        position reached. Returns the list of record payloads.

        Raises ValueError on a bad magic string, a missing or non-numeric length
        field, an empty record or a truncated record; the stored offset is left
        unchanged in that case.
        '''
        new_offset = offset
        metrics = []
        with open(self.metrics_filename, 'r') as f:
            f.seek(offset)
            while True:
                magic_string = f.read(len(MAGIC))
                # empty data means EOF
                if not magic_string:
                    break
                # A record that does not start with MAGIC means that the offset is
                # out of sync or the file is corrupted: do not parse what follows.
                if magic_string != MAGIC:
                    raise ValueError(
                        "metric file {} format error after offset: {}.".format(self.metrics_filename, new_offset))
                strdatalen = f.read(LEN_FIELD_SIZE)
                # empty data means EOF
                if not strdatalen:
                    raise ValueError(
                        "metric file {} format error after offset: {}.".format(self.metrics_filename, new_offset))
                datalen = int(strdatalen)
                data = f.read(datalen)

                if datalen > 0 and len(data) == datalen:
                    new_offset = f.tell()
                    metrics.append(data)
                else:
                    raise ValueError(
                        "metric file {} format error after offset: {}.".format(self.metrics_filename, new_offset))
        self._write_offset(new_offset)
        return metrics

    def _pid_exists(self, pid):
        '''
        Return True when a process with this pid exists. Raises ValueError for pid 0.
        '''
        if pid < 0:
            return False
        if pid == 0:
            # According to "man 2 kill" PID 0 refers to every process
            # in the process group of the calling process.
            # On certain systems 0 is a valid PID but we have no way
            # to know that in a portable fashion.
            raise ValueError('invalid PID 0')
        try:
            # Signal 0 performs the existence and permission checks without sending anything
            os.kill(pid, 0)
        except OSError as err:
            if err.errno == errno.ESRCH:
                # ESRCH == No such process
                return False
            elif err.errno == errno.EPERM:
                # EPERM clearly means there's a process to deny access to
                return True
            else:
                # According to "man 2 kill" possible error values are
                # (EINVAL, EPERM, ESRCH)
                raise
        else:
            return True

    def read_trial_metrics(self):
        '''
        Read available metrics data for a trial
        '''
        if self._metrics_file_is_empty():
            return []

        offset = self._get_offset()
        return self._read_all_available_records(offset)

    def read_trial_status(self):
        '''
        Return a (status, timestamp) tuple for the trial:
        ('UNKNOWN', -1) without a pid file, ('RUNNING', -1) while the recorded pid
        is alive, otherwise the result of reading the job code file.
        '''
        if os.path.isfile(self.jobpid_filemame):
            with open(self.jobpid_filemame, 'r') as f:
                jobpid = int(f.readline())
                if self._pid_exists(jobpid):
                    return 'RUNNING', -1
                else:
                    return self._read_job_return_code()
        else:
            # raise ValueError('offset value is invalid: {}'.format(offset))
            return 'UNKNOWN', -1

    def _read_job_return_code(self):
        '''
        Map the return code stored in the job code file to a status and return
        (status, timestamp): 0 is SUCCEEDED, 141 is USER_CANCELED, anything else FAILED.

        Raises ValueError when the file is missing or its first line is malformed.
        '''
        if os.path.isfile(self.jobcode_filename):
            with open(self.jobcode_filename, 'r') as f:
                job_return_code = f.readline()
                match = JOB_CODE_PATTERN.match(job_return_code)
                if match:
                    return_code = int(match.group(1))
                    timestamp = int(match.group(2))
                    if return_code == 0:
                        status = 'SUCCEEDED'
                    elif return_code == 141:
                        status = 'USER_CANCELED'
                    else:
                        status = 'FAILED'
                    return status, timestamp
                else:
                    raise ValueError('Job code file format incorrect')
        else:
            raise ValueError('job return code file doesnt exist: {}'.format(self.jobcode_filename))
def read_experiment_metrics(args):
    '''
    Read metrics data for specified trial jobs

    args.trial_job_ids is a comma separated list of trial job ids and
    args.experiment_dir the experiment root; every trial is looked up in
    <experiment_dir>/trials/<trial job id>.

    A JSON list with one object per readable trial (jobId, metrics, jobStatus,
    endTimestamp) is printed to stdout. A trial that cannot be read is left out
    of the list and the reason is reported on stderr.
    '''
    # Imported here because this module does not import sys at file level.
    import sys

    trial_job_ids = [job_id.strip() for job_id in args.trial_job_ids.strip().split(',')]

    results = []
    for trial_job_id in trial_job_ids:
        try:
            trial_job_dir = os.path.join(args.experiment_dir, 'trials', trial_job_id)
            reader = TrialMetricsReader(trial_job_dir)
            # Read the status before the metrics: reading metrics advances the
            # persisted offset, so a failure afterwards would drop those records.
            job_status, end_timestamp = reader.read_trial_status()
            metrics = reader.read_trial_metrics()
        except Exception as err:
            # stdout carries the JSON result, so errors are reported on stderr.
            sys.stderr.write('Failed to read trial job {}: {}\n'.format(trial_job_id, err))
            continue
        results.append({
            'jobId': trial_job_id,
            'metrics': metrics,
            'jobStatus': job_status,
            'endTimestamp': end_timestamp,
        })
    print(json.dumps(results))
# Script entry point. Both --experiment_dir and --trial_job_ids are required;
# parse_known_args() is used, so unrecognized arguments are collected in UNKNOWN
# (which is not used afterwards) instead of causing an error.
if
__name__
==
'__main__'
:
PARSER
=
argparse
.
ArgumentParser
()
PARSER
.
add_argument
(
"--experiment_dir"
,
type
=
str
,
help
=
"Root directory of experiment"
,
required
=
True
)
PARSER
.
add_argument
(
"--trial_job_ids"
,
type
=
str
,
help
=
"Trial job ids splited with ','"
,
required
=
True
)
ARGS
,
UNKNOWN
=
PARSER
.
parse_known_args
()
read_experiment_metrics
(
ARGS
)
Prev
1
2
3
4
5
6
7
8
…
11
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment