Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
9f4485c1
Unverified
Commit
9f4485c1
authored
Feb 25, 2019
by
SparkSnail
Committed by
GitHub
Feb 25, 2019
Browse files
Merge pull request #130 from Microsoft/master
merge master
parents
1ee97350
51fbf695
Changes
39
Hide whitespace changes
Inline
Side-by-side
Showing
19 changed files
with
344 additions
and
50 deletions
+344
-50
src/nni_manager/common/trainingService.ts
src/nni_manager/common/trainingService.ts
+1
-0
src/nni_manager/core/test/mockedDatastore.ts
src/nni_manager/core/test/mockedDatastore.ts
+18
-0
src/nni_manager/core/test/nnimanager.test.ts
src/nni_manager/core/test/nnimanager.test.ts
+159
-19
src/nni_manager/package.json
src/nni_manager/package.json
+1
-1
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowJobInfoCollector.ts
...g_service/kubernetes/kubeflow/kubeflowJobInfoCollector.ts
+1
-0
src/nni_manager/training_service/local/localTrainingService.ts
...ni_manager/training_service/local/localTrainingService.ts
+2
-1
src/nni_manager/training_service/pai/paiData.ts
src/nni_manager/training_service/pai/paiData.ts
+2
-1
src/nni_manager/training_service/pai/paiJobInfoCollector.ts
src/nni_manager/training_service/pai/paiJobInfoCollector.ts
+6
-2
src/nni_manager/training_service/pai/paiTrainingService.ts
src/nni_manager/training_service/pai/paiTrainingService.ts
+4
-3
src/nni_manager/training_service/remote_machine/remoteMachineData.ts
...ager/training_service/remote_machine/remoteMachineData.ts
+2
-1
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
...ng_service/remote_machine/remoteMachineTrainingService.ts
+14
-3
src/nni_manager/training_service/test/localTrainingService.test.ts
...anager/training_service/test/localTrainingService.test.ts
+97
-5
src/webui/src/components/SlideBar.tsx
src/webui/src/components/SlideBar.tsx
+2
-1
test/config_test.py
test/config_test.py
+3
-3
test/pipelines-it-kubeflow.yml
test/pipelines-it-kubeflow.yml
+1
-0
test/pipelines-it-pai.yml
test/pipelines-it-pai.yml
+1
-0
tools/nni_cmd/launcher.py
tools/nni_cmd/launcher.py
+3
-1
tools/nni_trial_tool/trial_keeper.py
tools/nni_trial_tool/trial_keeper.py
+26
-8
tools/setup.py
tools/setup.py
+1
-1
No files found.
src/nni_manager/common/trainingService.ts
View file @
9f4485c1
...
@@ -71,6 +71,7 @@ interface TrialJobDetail {
...
@@ -71,6 +71,7 @@ interface TrialJobDetail {
readonly
workingDirectory
:
string
;
readonly
workingDirectory
:
string
;
readonly
form
:
JobApplicationForm
;
readonly
form
:
JobApplicationForm
;
readonly
sequenceId
:
number
;
readonly
sequenceId
:
number
;
isEarlyStopped
?:
boolean
;
}
}
interface
HostJobDetail
{
interface
HostJobDetail
{
...
...
src/nni_manager/core/test/mockedDatastore.ts
View file @
9f4485c1
...
@@ -99,7 +99,25 @@ class MockedDataStore implements DataStore {
...
@@ -99,7 +99,25 @@ class MockedDataStore implements DataStore {
private
dbTrialJobs
:
SimpleDb
=
new
SimpleDb
(
'
trial_jobs
'
,
'
./trial_jobs.json
'
);
private
dbTrialJobs
:
SimpleDb
=
new
SimpleDb
(
'
trial_jobs
'
,
'
./trial_jobs.json
'
);
private
dbMetrics
:
SimpleDb
=
new
SimpleDb
(
'
metrics
'
,
'
./metrics.json
'
);
private
dbMetrics
:
SimpleDb
=
new
SimpleDb
(
'
metrics
'
,
'
./metrics.json
'
);
trailJob1
=
{
event
:
'
ADD_CUSTOMIZED
'
,
timestamp
:
Date
.
now
(),
trialJobId
:
"
4321
"
,
data
:
''
}
metrics1
=
{
timestamp
:
Date
.
now
(),
trialJobId
:
'
4321
'
,
parameterId
:
'
param1
'
,
type
:
'
CUSTOM
'
,
sequence
:
21
,
data
:
''
}
init
():
Promise
<
void
>
{
init
():
Promise
<
void
>
{
this
.
dbTrialJobs
.
saveData
(
this
.
trailJob1
);
this
.
dbMetrics
.
saveData
(
this
.
metrics1
);
return
Promise
.
resolve
();
return
Promise
.
resolve
();
}
}
...
...
src/nni_manager/core/test/nnimanager.test.ts
View file @
9f4485c1
...
@@ -19,25 +19,27 @@
...
@@ -19,25 +19,27 @@
'
use strict
'
;
'
use strict
'
;
import
*
as
os
from
'
os
'
;
import
{
assert
,
expect
}
from
'
chai
'
;
import
{
assert
,
expect
}
from
'
chai
'
;
import
{
Container
,
Scope
}
from
'
typescript-ioc
'
;
import
{
Container
,
Scope
}
from
'
typescript-ioc
'
;
import
*
as
component
from
'
../../common/component
'
;
import
*
as
component
from
'
../../common/component
'
;
import
{
Database
,
DataStore
}
from
'
../../common/datastore
'
;
import
{
Database
,
DataStore
}
from
'
../../common/datastore
'
;
import
{
Manager
}
from
'
../../common/manager
'
;
import
{
Manager
,
ExperimentProfile
}
from
'
../../common/manager
'
;
import
{
TrainingService
}
from
'
../../common/trainingService
'
;
import
{
TrainingService
}
from
'
../../common/trainingService
'
;
import
{
cleanupUnitTest
,
prepareUnitTest
}
from
'
../../common/utils
'
;
import
{
cleanupUnitTest
,
prepareUnitTest
}
from
'
../../common/utils
'
;
import
{
NNIDataStore
}
from
'
../nniDataStore
'
;
import
{
NNIDataStore
}
from
'
../nniDataStore
'
;
import
{
NNIManager
}
from
'
../nnimanager
'
;
import
{
NNIManager
}
from
'
../nnimanager
'
;
import
{
SqlDB
}
from
'
../sqlDatabase
'
;
import
{
SqlDB
}
from
'
../sqlDatabase
'
;
import
{
MockedTrainingService
}
from
'
./mockedTrainingService
'
;
import
{
MockedTrainingService
}
from
'
./mockedTrainingService
'
;
import
{
MockedDataStore
}
from
'
./mockedDatastore
'
;
async
function
initContainer
():
Promise
<
void
>
{
async
function
initContainer
():
Promise
<
void
>
{
prepareUnitTest
();
prepareUnitTest
();
Container
.
bind
(
TrainingService
).
to
(
MockedTrainingService
).
scope
(
Scope
.
Singleton
);
Container
.
bind
(
TrainingService
).
to
(
MockedTrainingService
).
scope
(
Scope
.
Singleton
);
Container
.
bind
(
Manager
).
to
(
NNIManager
).
scope
(
Scope
.
Singleton
);
Container
.
bind
(
Manager
).
to
(
NNIManager
).
scope
(
Scope
.
Singleton
);
Container
.
bind
(
Database
).
to
(
SqlDB
).
scope
(
Scope
.
Singleton
);
Container
.
bind
(
Database
).
to
(
SqlDB
).
scope
(
Scope
.
Singleton
);
Container
.
bind
(
DataStore
).
to
(
NNI
DataStore
).
scope
(
Scope
.
Singleton
);
Container
.
bind
(
DataStore
).
to
(
Mocked
DataStore
).
scope
(
Scope
.
Singleton
);
await
component
.
get
<
DataStore
>
(
DataStore
).
init
();
await
component
.
get
<
DataStore
>
(
DataStore
).
init
();
}
}
...
@@ -51,9 +53,9 @@ describe('Unit test for nnimanager', function () {
...
@@ -51,9 +53,9 @@ describe('Unit test for nnimanager', function () {
let
experimentParams
=
{
let
experimentParams
=
{
authorName
:
'
zql
'
,
authorName
:
'
zql
'
,
experimentName
:
'
naive_experiment
'
,
experimentName
:
'
naive_experiment
'
,
trialConcurrency
:
2
,
trialConcurrency
:
3
,
maxExecDuration
:
5
,
maxExecDuration
:
5
,
maxTrialNum
:
2
,
maxTrialNum
:
3
,
trainingServicePlatform
:
'
local
'
,
trainingServicePlatform
:
'
local
'
,
searchSpace
:
'
{"x":1}
'
,
searchSpace
:
'
{"x":1}
'
,
tuner
:
{
tuner
:
{
...
@@ -71,36 +73,74 @@ describe('Unit test for nnimanager', function () {
...
@@ -71,36 +73,74 @@ describe('Unit test for nnimanager', function () {
}
}
}
}
let
updateExperimentParams
=
{
authorName
:
''
,
experimentName
:
'
another_experiment
'
,
trialConcurrency
:
2
,
maxExecDuration
:
6
,
maxTrialNum
:
2
,
trainingServicePlatform
:
'
local
'
,
searchSpace
:
'
{"y":2}
'
,
tuner
:
{
className
:
'
TPE
'
,
classArgs
:
{
optimize_mode
:
'
maximize
'
},
checkpointDir
:
''
,
gpuNum
:
0
},
assessor
:
{
className
:
'
Medianstop
'
,
checkpointDir
:
''
,
gpuNum
:
1
}
}
let
experimentProfile
=
{
params
:
updateExperimentParams
,
id
:
'
test
'
,
execDuration
:
0
,
maxSequenceId
:
0
,
revision
:
0
}
before
(
async
()
=>
{
before
(
async
()
=>
{
await
initContainer
();
await
initContainer
();
nniManager
=
component
.
get
(
Manager
);
nniManager
=
component
.
get
(
Manager
);
const
expId
:
string
=
await
nniManager
.
startExperiment
(
experimentParams
);
const
expId
:
string
=
await
nniManager
.
startExperiment
(
experimentParams
);
assert
(
expId
);
assert
.
strictEqual
(
expId
,
'
unittest
'
);
})
;
})
after
(
async
()
=>
{
after
(
async
()
=>
{
await
nniManager
.
stopExperiment
();
await
setTimeout
(()
=>
{
nniManager
.
stopExperiment
()
},
15000
)
;
cleanupUnitTest
();
cleanupUnitTest
();
})
})
it
(
'
test resumeExperiment
'
,
()
=>
{
//TODO: add resume experiment unit test
it
(
'
test addCustomizedTrialJob
'
,
()
=>
{
return
nniManager
.
addCustomizedTrialJob
(
'
hyperParams
'
).
then
(()
=>
{
}).
catch
((
error
)
=>
{
assert
.
fail
(
error
);
})
})
})
it
(
'
test listTrialJobs
'
,
()
=>
{
it
(
'
test listTrialJobs
'
,
()
=>
{
//FIXME: not implemented
return
nniManager
.
listTrialJobs
().
then
(
function
(
trialjobdetails
)
{
//return nniManager.listTrialJobs().then(function (trialJobDetails) {
expect
(
trialjobdetails
.
length
).
to
.
be
.
equal
(
2
);
// expect(trialJobDetails.length).to.be.equal(2);
}).
catch
((
error
)
=>
{
//}).catch(function (error) {
assert
.
fail
(
error
);
// assert.fail(error);
})
//})
})
})
it
(
'
test getTrialJob valid
'
,
()
=>
{
it
(
'
test getTrialJob valid
'
,
()
=>
{
//query a exist id
//query a exist id
return
nniManager
.
getTrialJob
(
'
1234
'
).
then
(
function
(
trialJobDetail
)
{
return
nniManager
.
getTrialJob
(
'
1234
'
).
then
(
function
(
trialJobDetail
)
{
expect
(
trialJobDetail
.
id
).
to
.
be
.
equal
(
'
1234
'
);
expect
(
trialJobDetail
.
id
).
to
.
be
.
equal
(
'
1234
'
);
}).
catch
(
function
(
error
)
{
}).
catch
((
error
)
=>
{
assert
.
fail
(
error
);
assert
.
fail
(
error
);
})
})
})
})
...
@@ -132,7 +172,6 @@ describe('Unit test for nnimanager', function () {
...
@@ -132,7 +172,6 @@ describe('Unit test for nnimanager', function () {
})
})
})
})
//TODO: complete ut
it
(
'
test cancelTrialJobByUser
'
,
()
=>
{
it
(
'
test cancelTrialJobByUser
'
,
()
=>
{
return
nniManager
.
cancelTrialJobByUser
(
'
1234
'
).
then
(()
=>
{
return
nniManager
.
cancelTrialJobByUser
(
'
1234
'
).
then
(()
=>
{
...
@@ -141,11 +180,112 @@ describe('Unit test for nnimanager', function () {
...
@@ -141,11 +180,112 @@ describe('Unit test for nnimanager', function () {
})
})
})
})
it
(
'
test addCustomizedTrialJob
'
,
()
=>
{
it
(
'
test getExperimentProfile
'
,
()
=>
{
return
nniManager
.
addCustomizedTrialJob
(
'
hyperParams
'
).
then
(()
=>
{
return
nniManager
.
getExperimentProfile
().
then
((
experimentProfile
)
=>
{
expect
(
experimentProfile
.
id
).
to
.
be
.
equal
(
'
unittest
'
);
expect
(
experimentProfile
.
logDir
).
to
.
be
.
equal
(
os
.
homedir
()
+
'
/nni/experiments/unittest
'
);
}).
catch
((
error
)
=>
{
}).
catch
((
error
)
=>
{
assert
.
fail
(
error
);
assert
.
fail
(
error
);
})
})
})
})
it
(
'
test updateExperimentProfile TRIAL_CONCURRENCY
'
,
()
=>
{
return
nniManager
.
updateExperimentProfile
(
experimentProfile
,
'
TRIAL_CONCURRENCY
'
).
then
(()
=>
{
nniManager
.
getExperimentProfile
().
then
((
updateProfile
)
=>
{
expect
(
updateProfile
.
params
.
trialConcurrency
).
to
.
be
.
equal
(
2
);
});
}).
catch
((
error
)
=>
{
assert
.
fail
(
error
);
})
})
it
(
'
test updateExperimentProfile MAX_EXEC_DURATION
'
,
()
=>
{
return
nniManager
.
updateExperimentProfile
(
experimentProfile
,
'
MAX_EXEC_DURATION
'
).
then
(()
=>
{
nniManager
.
getExperimentProfile
().
then
((
updateProfile
)
=>
{
expect
(
updateProfile
.
params
.
maxExecDuration
).
to
.
be
.
equal
(
6
);
});
}).
catch
((
error
)
=>
{
assert
.
fail
(
error
);
})
})
it
(
'
test updateExperimentProfile SEARCH_SPACE
'
,
()
=>
{
return
nniManager
.
updateExperimentProfile
(
experimentProfile
,
'
SEARCH_SPACE
'
).
then
(()
=>
{
nniManager
.
getExperimentProfile
().
then
((
updateProfile
)
=>
{
expect
(
updateProfile
.
params
.
searchSpace
).
to
.
be
.
equal
(
'
{"y":2}
'
);
});
}).
catch
((
error
)
=>
{
assert
.
fail
(
error
);
})
})
it
(
'
test updateExperimentProfile MAX_TRIAL_NUM
'
,
()
=>
{
return
nniManager
.
updateExperimentProfile
(
experimentProfile
,
'
MAX_TRIAL_NUM
'
).
then
(()
=>
{
nniManager
.
getExperimentProfile
().
then
((
updateProfile
)
=>
{
expect
(
updateProfile
.
params
.
maxTrialNum
).
to
.
be
.
equal
(
2
);
});
}).
catch
((
error
)
=>
{
assert
.
fail
(
error
);
})
})
it
(
'
test getStatus
'
,
()
=>
{
assert
.
strictEqual
(
nniManager
.
getStatus
().
status
,
'
RUNNING
'
);
})
it
(
'
test getMetricData with trialJobId
'
,
()
=>
{
//query a exist trialJobId
return
nniManager
.
getMetricData
(
'
4321
'
,
'
CUSTOM
'
).
then
((
metricData
)
=>
{
expect
(
metricData
.
length
).
to
.
be
.
equal
(
1
);
expect
(
metricData
[
0
].
trialJobId
).
to
.
be
.
equal
(
'
4321
'
);
expect
(
metricData
[
0
].
parameterId
).
to
.
be
.
equal
(
'
param1
'
);
}).
catch
((
error
)
=>
{
assert
.
fail
(
error
);
})
})
it
(
'
test getMetricData with invalid trialJobId
'
,
()
=>
{
//query an invalid trialJobId
return
nniManager
.
getMetricData
(
'
43210
'
,
'
CUSTOM
'
).
then
((
metricData
)
=>
{
assert
.
fail
();
}).
catch
((
error
)
=>
{
})
})
it
(
'
test getTrialJobStatistics
'
,
()
=>
{
// get 3 trial jobs (init, addCustomizedTrialJob, cancelTrialJobByUser)
return
nniManager
.
getTrialJobStatistics
().
then
(
function
(
trialJobStatistics
)
{
expect
(
trialJobStatistics
.
length
).
to
.
be
.
equal
(
2
);
if
(
trialJobStatistics
[
0
].
trialJobStatus
===
'
WAITING
'
)
{
expect
(
trialJobStatistics
[
0
].
trialJobNumber
).
to
.
be
.
equal
(
2
);
expect
(
trialJobStatistics
[
1
].
trialJobNumber
).
to
.
be
.
equal
(
1
);
}
else
{
expect
(
trialJobStatistics
[
1
].
trialJobNumber
).
to
.
be
.
equal
(
2
);
expect
(
trialJobStatistics
[
0
].
trialJobNumber
).
to
.
be
.
equal
(
1
);
}
}).
catch
((
error
)
=>
{
assert
.
fail
(
error
);
})
})
it
(
'
test addCustomizedTrialJob reach maxTrialNum
'
,
()
=>
{
// test currSubmittedTrialNum reach maxTrialNum
return
nniManager
.
addCustomizedTrialJob
(
'
hyperParam
'
).
then
(()
=>
{
nniManager
.
getTrialJobStatistics
().
then
(
function
(
trialJobStatistics
)
{
if
(
trialJobStatistics
[
0
].
trialJobStatus
===
'
WAITING
'
)
expect
(
trialJobStatistics
[
0
].
trialJobNumber
).
to
.
be
.
equal
(
2
);
else
expect
(
trialJobStatistics
[
1
].
trialJobNumber
).
to
.
be
.
equal
(
2
);
})
}).
catch
((
error
)
=>
{
assert
.
fail
(
error
);
})
})
it
(
'
test resumeExperiment
'
,
async
()
=>
{
//TODO: add resume experiment unit test
})
})
})
src/nni_manager/package.json
View file @
9f4485c1
...
@@ -5,7 +5,7 @@
...
@@ -5,7 +5,7 @@
"scripts"
:
{
"scripts"
:
{
"postbuild"
:
"cp -rf config ./dist/"
,
"postbuild"
:
"cp -rf config ./dist/"
,
"build"
:
"tsc"
,
"build"
:
"tsc"
,
"test"
:
"nyc mocha -r ts-node/register -t 15000 --recursive **/*.test.ts --exclude node_modules/**/**/*.test.ts
--exclude core/test/nnimanager.test.ts
--colors"
,
"test"
:
"nyc mocha -r ts-node/register -t 15000 --recursive **/*.test.ts --exclude node_modules/**/**/*.test.ts --colors"
,
"start"
:
"node dist/main.js"
,
"start"
:
"node dist/main.js"
,
"tslint"
:
"tslint -p ."
"tslint"
:
"tslint -p ."
},
},
...
...
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowJobInfoCollector.ts
View file @
9f4485c1
...
@@ -46,6 +46,7 @@ export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector{
...
@@ -46,6 +46,7 @@ export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector{
try
{
try
{
kubernetesJobInfo
=
await
kubernetesCRDClient
.
getKubernetesJob
(
kubernetesTrialJob
.
kubernetesJobName
);
kubernetesJobInfo
=
await
kubernetesCRDClient
.
getKubernetesJob
(
kubernetesTrialJob
.
kubernetesJobName
);
}
catch
(
error
)
{
}
catch
(
error
)
{
// Notice: it maynot be a 'real' error since cancel trial job can also cause getKubernetesJob failed.
this
.
log
.
error
(
`Get job
${
kubernetesTrialJob
.
kubernetesJobName
}
info failed, error is
${
error
}
`
);
this
.
log
.
error
(
`Get job
${
kubernetesTrialJob
.
kubernetesJobName
}
info failed, error is
${
error
}
`
);
//This is not treat as a error status
//This is not treat as a error status
return
Promise
.
resolve
();
return
Promise
.
resolve
();
...
...
src/nni_manager/training_service/local/localTrainingService.ts
View file @
9f4485c1
...
@@ -255,7 +255,7 @@ class LocalTrainingService implements TrainingService {
...
@@ -255,7 +255,7 @@ class LocalTrainingService implements TrainingService {
}
}
if
(
trialJob
.
pid
===
undefined
){
if
(
trialJob
.
pid
===
undefined
){
this
.
setTrialJobStatus
(
trialJob
,
'
USER_CANCELED
'
);
this
.
setTrialJobStatus
(
trialJob
,
'
USER_CANCELED
'
);
return
;
return
Promise
.
resolve
()
;
}
}
if
(
trialJob
.
form
.
jobType
===
'
TRIAL
'
)
{
if
(
trialJob
.
form
.
jobType
===
'
TRIAL
'
)
{
await
tkill
(
trialJob
.
pid
,
'
SIGKILL
'
);
await
tkill
(
trialJob
.
pid
,
'
SIGKILL
'
);
...
@@ -265,6 +265,7 @@ class LocalTrainingService implements TrainingService {
...
@@ -265,6 +265,7 @@ class LocalTrainingService implements TrainingService {
throw
new
Error
(
`Job type not supported:
${
trialJob
.
form
.
jobType
}
`
);
throw
new
Error
(
`Job type not supported:
${
trialJob
.
form
.
jobType
}
`
);
}
}
this
.
setTrialJobStatus
(
trialJob
,
getJobCancelStatus
(
isEarlyStopped
));
this
.
setTrialJobStatus
(
trialJob
,
getJobCancelStatus
(
isEarlyStopped
));
return
Promise
.
resolve
();
}
}
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
...
...
src/nni_manager/training_service/pai/paiData.ts
View file @
9f4485c1
...
@@ -34,6 +34,7 @@ export class PAITrialJobDetail implements TrialJobDetail {
...
@@ -34,6 +34,7 @@ export class PAITrialJobDetail implements TrialJobDetail {
public
form
:
JobApplicationForm
;
public
form
:
JobApplicationForm
;
public
sequenceId
:
number
;
public
sequenceId
:
number
;
public
hdfsLogPath
:
string
;
public
hdfsLogPath
:
string
;
public
isEarlyStopped
?:
boolean
;
constructor
(
id
:
string
,
status
:
TrialJobStatus
,
paiJobName
:
string
,
constructor
(
id
:
string
,
status
:
TrialJobStatus
,
paiJobName
:
string
,
submitTime
:
number
,
workingDirectory
:
string
,
form
:
JobApplicationForm
,
sequenceId
:
number
,
hdfsLogPath
:
string
)
{
submitTime
:
number
,
workingDirectory
:
string
,
form
:
JobApplicationForm
,
sequenceId
:
number
,
hdfsLogPath
:
string
)
{
...
@@ -63,7 +64,7 @@ export const PAI_TRIAL_COMMAND_FORMAT: string =
...
@@ -63,7 +64,7 @@ export const PAI_TRIAL_COMMAND_FORMAT: string =
`export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4}
`export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4}
&& cd $NNI_SYS_DIR && sh install_nni.sh
&& cd $NNI_SYS_DIR && sh install_nni.sh
&& python3 -m nni_trial_tool.trial_keeper --trial_command '{5}' --nnimanager_ip '{6}' --nnimanager_port '{7}'
&& python3 -m nni_trial_tool.trial_keeper --trial_command '{5}' --nnimanager_ip '{6}' --nnimanager_port '{7}'
--pai_hdfs_output_dir '{8}' --pai_hdfs_host '{9}' --pai_user_name {10} --nni_hdfs_exp_dir '{11}'`
;
--pai_hdfs_output_dir '{8}' --pai_hdfs_host '{9}' --pai_user_name {10} --nni_hdfs_exp_dir '{11}'
--webhdfs_path '/webhdfs/api/v1'
`
;
export
const
PAI_OUTPUT_DIR_FORMAT
:
string
=
export
const
PAI_OUTPUT_DIR_FORMAT
:
string
=
`hdfs://{0}:9000/`
;
`hdfs://{0}:9000/`
;
...
...
src/nni_manager/training_service/pai/paiJobInfoCollector.ts
View file @
9f4485c1
...
@@ -103,8 +103,12 @@ export class PAIJobInfoCollector {
...
@@ -103,8 +103,12 @@ export class PAIJobInfoCollector {
paiTrialJob
.
status
=
'
SUCCEEDED
'
;
paiTrialJob
.
status
=
'
SUCCEEDED
'
;
break
;
break
;
case
'
STOPPED
'
:
case
'
STOPPED
'
:
if
(
paiTrialJob
.
status
!==
'
EARLY_STOPPED
'
)
{
if
(
paiTrialJob
.
isEarlyStopped
!==
undefined
)
{
paiTrialJob
.
status
=
'
USER_CANCELED
'
;
paiTrialJob
.
status
=
paiTrialJob
.
isEarlyStopped
===
true
?
'
EARLY_STOPPED
'
:
'
USER_CANCELED
'
;
}
else
{
// if paiTrialJob's isEarlyStopped is undefined, that mean we didn't stop it via cancellation, mark it as SYS_CANCELLED by PAI
paiTrialJob
.
status
=
'
SYS_CANCELED
'
;
}
}
break
;
break
;
case
'
FAILED
'
:
case
'
FAILED
'
:
...
...
src/nni_manager/training_service/pai/paiTrainingService.ts
View file @
9f4485c1
...
@@ -324,14 +324,15 @@ class PAITrainingService implements TrainingService {
...
@@ -324,14 +324,15 @@ class PAITrainingService implements TrainingService {
"
Authorization
"
:
'
Bearer
'
+
this
.
paiToken
"
Authorization
"
:
'
Bearer
'
+
this
.
paiToken
}
}
};
};
// Set trialjobDetail's early stopped field, to mark the job's cancellation source
trialJobDetail
.
isEarlyStopped
=
isEarlyStopped
;
request
(
stopJobRequest
,
(
error
:
Error
,
response
:
request
.
Response
,
body
:
any
)
=>
{
request
(
stopJobRequest
,
(
error
:
Error
,
response
:
request
.
Response
,
body
:
any
)
=>
{
if
(
error
||
response
.
statusCode
>=
400
)
{
if
(
error
||
response
.
statusCode
>=
400
)
{
this
.
log
.
error
(
`PAI Training service: stop trial
${
trialJobId
}
to PAI Cluster failed!`
);
this
.
log
.
error
(
`PAI Training service: stop trial
${
trialJobId
}
to PAI Cluster failed!`
);
deferred
.
reject
(
error
?
error
.
message
:
'
Stop trial failed, http code:
'
+
response
.
statusCode
);
deferred
.
reject
(
error
?
error
.
message
:
'
Stop trial failed, http code:
'
+
response
.
statusCode
);
}
else
{
}
else
{
if
(
isEarlyStopped
)
{
trialJobDetail
.
status
=
'
EARLY_STOPPED
'
;
}
deferred
.
resolve
();
deferred
.
resolve
();
}
}
});
});
...
...
src/nni_manager/training_service/remote_machine/remoteMachineData.ts
View file @
9f4485c1
...
@@ -80,6 +80,7 @@ export class RemoteMachineTrialJobDetail implements TrialJobDetail {
...
@@ -80,6 +80,7 @@ export class RemoteMachineTrialJobDetail implements TrialJobDetail {
public
form
:
JobApplicationForm
;
public
form
:
JobApplicationForm
;
public
sequenceId
:
number
;
public
sequenceId
:
number
;
public
rmMeta
?:
RemoteMachineMeta
;
public
rmMeta
?:
RemoteMachineMeta
;
public
isEarlyStopped
?:
boolean
;
constructor
(
id
:
string
,
status
:
TrialJobStatus
,
submitTime
:
number
,
constructor
(
id
:
string
,
status
:
TrialJobStatus
,
submitTime
:
number
,
workingDirectory
:
string
,
form
:
JobApplicationForm
,
sequenceId
:
number
)
{
workingDirectory
:
string
,
form
:
JobApplicationForm
,
sequenceId
:
number
)
{
...
@@ -114,7 +115,7 @@ export NNI_PLATFORM=remote NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={
...
@@ -114,7 +115,7 @@ export NNI_PLATFORM=remote NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={
cd $NNI_SYS_DIR
cd $NNI_SYS_DIR
sh install_nni.sh
sh install_nni.sh
echo $$ >{6}
echo $$ >{6}
python3 -m nni_trial_tool.trial_keeper --trial_command '{7}' --nnimanager_ip '{8}' --nnimanager_port '{9}'
python3 -m nni_trial_tool.trial_keeper --trial_command '{7}' --nnimanager_ip '{8}' --nnimanager_port '{9}'
1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr
echo $?
\`
date +%s%3N
\`
>{10}`
;
echo $?
\`
date +%s%3N
\`
>{10}`
;
export
const
HOST_JOB_SHELL_FORMAT
:
string
=
export
const
HOST_JOB_SHELL_FORMAT
:
string
=
...
...
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
View file @
9f4485c1
...
@@ -48,7 +48,7 @@ import {
...
@@ -48,7 +48,7 @@ import {
GPU_COLLECTOR_FORMAT
GPU_COLLECTOR_FORMAT
}
from
'
./remoteMachineData
'
;
}
from
'
./remoteMachineData
'
;
import
{
SSHClientUtility
}
from
'
./sshClientUtility
'
;
import
{
SSHClientUtility
}
from
'
./sshClientUtility
'
;
import
{
validateCodeDir
}
from
'
../common/util
'
;
import
{
validateCodeDir
}
from
'
../common/util
'
;
import
{
RemoteMachineJobRestServer
}
from
'
./remoteMachineJobRestServer
'
;
import
{
RemoteMachineJobRestServer
}
from
'
./remoteMachineJobRestServer
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../common/containerJobData
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../common/containerJobData
'
;
import
{
mkDirP
}
from
'
../../common/utils
'
;
import
{
mkDirP
}
from
'
../../common/utils
'
;
...
@@ -279,8 +279,9 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -279,8 +279,9 @@ class RemoteMachineTrainingService implements TrainingService {
const
jobpidPath
:
string
=
this
.
getJobPidPath
(
trialJob
.
id
);
const
jobpidPath
:
string
=
this
.
getJobPidPath
(
trialJob
.
id
);
try
{
try
{
// Mark the toEarlyStop tag here
trialJob
.
isEarlyStopped
=
isEarlyStopped
;
await
SSHClientUtility
.
remoteExeCommand
(
`pkill -P
\`
cat
${
jobpidPath
}
\`
`
,
sshClient
);
await
SSHClientUtility
.
remoteExeCommand
(
`pkill -P
\`
cat
${
jobpidPath
}
\`
`
,
sshClient
);
trialJob
.
status
=
getJobCancelStatus
(
isEarlyStopped
);
}
catch
(
error
)
{
}
catch
(
error
)
{
// Not handle the error since pkill failed will not impact trial job's current status
// Not handle the error since pkill failed will not impact trial job's current status
this
.
log
.
error
(
`remoteTrainingService.cancelTrialJob:
${
error
.
message
}
`
);
this
.
log
.
error
(
`remoteTrainingService.cancelTrialJob:
${
error
.
message
}
`
);
...
@@ -482,6 +483,11 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -482,6 +483,11 @@ class RemoteMachineTrainingService implements TrainingService {
if
(
trialJobDetail
===
undefined
)
{
if
(
trialJobDetail
===
undefined
)
{
throw
new
NNIError
(
NNIErrorNames
.
INVALID_JOB_DETAIL
,
`Invalid job detail information for trial job
${
trialJobId
}
`
);
throw
new
NNIError
(
NNIErrorNames
.
INVALID_JOB_DETAIL
,
`Invalid job detail information for trial job
${
trialJobId
}
`
);
}
}
// If job is not WATIING, Don't prepare and resolve true immediately
if
(
trialJobDetail
.
status
!==
'
WAITING
'
)
{
deferred
.
resolve
(
true
);
return
deferred
.
promise
;
}
// get an ssh client from scheduler
// get an ssh client from scheduler
const
rmScheduleResult
:
RemoteMachineScheduleResult
=
this
.
gpuScheduler
.
scheduleMachine
(
this
.
trialConfig
.
gpuNum
,
trialJobId
);
const
rmScheduleResult
:
RemoteMachineScheduleResult
=
this
.
gpuScheduler
.
scheduleMachine
(
this
.
trialConfig
.
gpuNum
,
trialJobId
);
if
(
rmScheduleResult
.
resultType
===
ScheduleResultType
.
REQUIRE_EXCEED_TOTAL
)
{
if
(
rmScheduleResult
.
resultType
===
ScheduleResultType
.
REQUIRE_EXCEED_TOTAL
)
{
...
@@ -640,7 +646,12 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -640,7 +646,12 @@ class RemoteMachineTrainingService implements TrainingService {
if
(
parseInt
(
code
,
10
)
===
0
)
{
if
(
parseInt
(
code
,
10
)
===
0
)
{
trialJob
.
status
=
'
SUCCEEDED
'
;
trialJob
.
status
=
'
SUCCEEDED
'
;
}
else
{
}
else
{
trialJob
.
status
=
'
FAILED
'
;
// isEarlyStopped is never set, mean it's not cancelled by NNI, so if the process's exit code >0, mark it as FAILED
if
(
trialJob
.
isEarlyStopped
===
undefined
)
{
trialJob
.
status
=
'
FAILED
'
;
}
else
{
trialJob
.
status
=
getJobCancelStatus
(
trialJob
.
isEarlyStopped
);
}
}
}
trialJob
.
endTime
=
parseInt
(
timestamp
,
10
);
trialJob
.
endTime
=
parseInt
(
timestamp
,
10
);
}
}
...
...
src/nni_manager/training_service/test/localTrainingService.test.ts
View file @
9f4485c1
...
@@ -19,14 +19,106 @@
...
@@ -19,14 +19,106 @@
'
use strict
'
;
'
use strict
'
;
import
{
TrainingService
}
from
'
../../common/trainingService
'
;
import
*
as
assert
from
'
assert
'
;
import
{
LocalTrainingService
}
from
'
../local/localTrainingService
'
;
import
*
as
chai
from
'
chai
'
;
import
*
as
chaiAsPromised
from
'
chai-as-promised
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
tmp
from
'
tmp
'
;
import
*
as
component
from
'
../../common/component
'
;
import
*
as
component
from
'
../../common/component
'
;
import
{
TrialJobApplicationForm
,
TrialJobDetail
,
TrainingService
}
from
'
../../common/trainingService
'
;
import
{
cleanupUnitTest
,
delay
,
prepareUnitTest
}
from
'
../../common/utils
'
;
import
{
TrialConfigMetadataKey
}
from
'
../common/trialConfigMetadataKey
'
;
import
{
LocalTrainingServiceForGPU
}
from
'
../local/localTrainingServiceForGPU
'
;
// TODO: copy mockedTrail.py to local folder
const
localCodeDir
:
string
=
tmp
.
dirSync
().
name
const
mockedTrialPath
:
string
=
'
./training_service/test/mockedTrial.py
'
fs
.
copyFileSync
(
mockedTrialPath
,
localCodeDir
+
'
/mockedTrial.py
'
)
describe
(
'
Unit Test for LocalTrainingService
'
,
()
=>
{
describe
(
'
Unit Test for LocalTrainingService
'
,
()
=>
{
let
trainingService
:
TrainingService
let
trialConfig
:
any
=
`{"command":"sleep 1h && echo hello","codeDir":"
${
localCodeDir
}
","gpuNum":1}`
let
localTrainingService
:
LocalTrainingServiceForGPU
;
before
(()
=>
{
chai
.
should
();
chai
.
use
(
chaiAsPromised
);
prepareUnitTest
();
});
after
(()
=>
{
cleanupUnitTest
();
});
beforeEach
(()
=>
{
localTrainingService
=
component
.
get
(
LocalTrainingServiceForGPU
);
localTrainingService
.
run
();
});
afterEach
(()
=>
{
localTrainingService
.
cleanUp
();
});
it
(
'
List empty trial jobs
'
,
async
()
=>
{
//trial jobs should be empty, since there are no submitted jobs
chai
.
expect
(
await
localTrainingService
.
listTrialJobs
()).
to
.
be
.
empty
;
});
it
(
'
setClusterMetadata and getClusterMetadata
'
,
async
()
=>
{
await
localTrainingService
.
setClusterMetadata
(
TrialConfigMetadataKey
.
TRIAL_CONFIG
,
trialConfig
);
localTrainingService
.
getClusterMetadata
(
TrialConfigMetadataKey
.
TRIAL_CONFIG
).
then
((
data
)
=>
{
chai
.
expect
(
data
).
to
.
be
.
equals
(
trialConfig
);
});
});
it
(
'
Submit job and Cancel job
'
,
async
()
=>
{
await
localTrainingService
.
setClusterMetadata
(
TrialConfigMetadataKey
.
TRIAL_CONFIG
,
trialConfig
);
// submit job
const
form
:
TrialJobApplicationForm
=
{
jobType
:
'
TRIAL
'
,
hyperParameters
:
{
value
:
'
mock hyperparameters
'
,
index
:
0
}
};
const
jobDetail
:
TrialJobDetail
=
await
localTrainingService
.
submitTrialJob
(
form
);
chai
.
expect
(
jobDetail
.
status
).
to
.
be
.
equals
(
'
WAITING
'
);
await
localTrainingService
.
cancelTrialJob
(
jobDetail
.
id
);
chai
.
expect
(
jobDetail
.
status
).
to
.
be
.
equals
(
'
USER_CANCELED
'
);
}).
timeout
(
20000
);
it
(
'
Read metrics, Add listener, and remove listener
'
,
async
()
=>
{
// set meta data
const
trialConfig
:
string
=
`{\"command\":\"python3 mockedTrial.py\", \"codeDir\":\"
${
localCodeDir
}
\",\"gpuNum\":0}`
await
localTrainingService
.
setClusterMetadata
(
TrialConfigMetadataKey
.
TRIAL_CONFIG
,
trialConfig
);
// submit job
const
form
:
TrialJobApplicationForm
=
{
jobType
:
'
TRIAL
'
,
hyperParameters
:
{
value
:
'
mock hyperparameters
'
,
index
:
0
}
};
const
jobDetail
:
TrialJobDetail
=
await
localTrainingService
.
submitTrialJob
(
form
);
chai
.
expect
(
jobDetail
.
status
).
to
.
be
.
equals
(
'
WAITING
'
);
localTrainingService
.
listTrialJobs
().
then
((
jobList
)
=>
{
chai
.
expect
(
jobList
.
length
).
to
.
be
.
equals
(
1
);
});
// Add metrics listeners
const
listener1
=
function
f1
(
metric
:
any
)
{
chai
.
expect
(
metric
.
id
).
to
.
be
.
equals
(
jobDetail
.
id
);
}
localTrainingService
.
addTrialJobMetricListener
(
listener1
);
// Wait to collect metric
await
delay
(
1000
);
await
localTrainingService
.
cancelTrialJob
(
jobDetail
.
id
);
localTrainingService
.
removeTrialJobMetricListener
(
listener1
);
}).
timeout
(
20000
);
beforeEach
(
async
()
=>
{
it
(
'
Test multiphaseSupported
'
,
()
=>
{
trainingService
=
component
.
get
(
LocalTrainingServic
e
)
;
chai
.
expect
(
localTrainingService
.
isMultiPhaseJobSupported
).
to
.
be
.
equals
(
tru
e
)
})
})
});
});
\ No newline at end of file
src/webui/src/components/SlideBar.tsx
View file @
9f4485c1
...
@@ -182,6 +182,7 @@ class SlideBar extends React.Component<{}, SliderState> {
...
@@ -182,6 +182,7 @@ class SlideBar extends React.Component<{}, SliderState> {
render
()
{
render
()
{
const
{
version
,
menuVisible
}
=
this
.
state
;
const
{
version
,
menuVisible
}
=
this
.
state
;
const
feed
=
`https://github.com/Microsoft/nni/issues/new?labels=
${
version
}
`
;
const
menu
=
(
const
menu
=
(
<
Menu
onClick
=
{
this
.
handleMenuClick
}
>
<
Menu
onClick
=
{
this
.
handleMenuClick
}
>
<
Menu
.
Item
key
=
"1"
>
Experiment Parameters
</
Menu
.
Item
>
<
Menu
.
Item
key
=
"1"
>
Experiment Parameters
</
Menu
.
Item
>
...
@@ -221,7 +222,7 @@ class SlideBar extends React.Component<{}, SliderState> {
...
@@ -221,7 +222,7 @@ class SlideBar extends React.Component<{}, SliderState> {
Download
<
Icon
type
=
"down"
/>
Download
<
Icon
type
=
"down"
/>
</
a
>
</
a
>
</
Dropdown
>
</
Dropdown
>
<
a
href
=
"https://github.com/Microsoft/nni/issues/new?labels=v0.5.1"
target
=
"_blank"
>
<
a
href
=
{
feed
}
target
=
"_blank"
>
<
img
<
img
src
=
{
require
(
'
../static/img/icon/issue.png
'
)
}
src
=
{
require
(
'
../static/img/icon/issue.png
'
)
}
alt
=
"NNI github issue"
alt
=
"NNI github issue"
...
...
test/config_test.py
View file @
9f4485c1
...
@@ -38,7 +38,6 @@ def gen_new_config(config_file, training_service='local'):
...
@@ -38,7 +38,6 @@ def gen_new_config(config_file, training_service='local'):
new_config_file
=
config_file
+
'.tmp'
new_config_file
=
config_file
+
'.tmp'
ts
=
get_yml_content
(
'training_service.yml'
)[
training_service
]
ts
=
get_yml_content
(
'training_service.yml'
)[
training_service
]
print
(
config
)
print
(
ts
)
print
(
ts
)
# hack for kubeflow trial config
# hack for kubeflow trial config
...
@@ -64,7 +63,6 @@ def run_test(config_file, training_service, local_gpu=False):
...
@@ -64,7 +63,6 @@ def run_test(config_file, training_service, local_gpu=False):
return
return
try
:
try
:
print
(
'Testing %s...'
%
config_file
)
proc
=
subprocess
.
run
([
'nnictl'
,
'create'
,
'--config'
,
new_config_file
])
proc
=
subprocess
.
run
([
'nnictl'
,
'create'
,
'--config'
,
new_config_file
])
assert
proc
.
returncode
==
0
,
'`nnictl create` failed with code %d'
%
proc
.
returncode
assert
proc
.
returncode
==
0
,
'`nnictl create` failed with code %d'
%
proc
.
returncode
...
@@ -109,8 +107,10 @@ def run(args):
...
@@ -109,8 +107,10 @@ def run(args):
try
:
try
:
# sleep 5 seconds here, to make sure previous stopped exp has enough time to exit to avoid port conflict
# sleep 5 seconds here, to make sure previous stopped exp has enough time to exit to avoid port conflict
time
.
sleep
(
5
)
time
.
sleep
(
5
)
print
(
GREEN
+
'Testing:'
+
config_file
+
CLEAR
)
begin_time
=
time
.
time
()
run_test
(
config_file
,
args
.
ts
,
args
.
local_gpu
)
run_test
(
config_file
,
args
.
ts
,
args
.
local_gpu
)
print
(
GREEN
+
'Test %s: TEST PASS'
%
(
config_file
)
+
CLEAR
)
print
(
GREEN
+
'Test %s: TEST PASS
IN %d mins
'
%
(
config_file
,
(
time
.
time
()
-
begin_time
)
/
60
)
+
CLEAR
)
except
Exception
as
error
:
except
Exception
as
error
:
print
(
RED
+
'Test %s: TEST FAIL'
%
(
config_file
)
+
CLEAR
)
print
(
RED
+
'Test %s: TEST FAIL'
%
(
config_file
)
+
CLEAR
)
print
(
'%r'
%
error
)
print
(
'%r'
%
error
)
...
...
test/pipelines-it-kubeflow.yml
View file @
9f4485c1
jobs
:
jobs
:
-
job
:
'
integration_test_kubeflow'
-
job
:
'
integration_test_kubeflow'
timeoutInMinutes
:
0
pool
:
'
NNI
CI
KUBE
CLI'
pool
:
'
NNI
CI
KUBE
CLI'
variables
:
variables
:
...
...
test/pipelines-it-pai.yml
View file @
9f4485c1
jobs
:
jobs
:
-
job
:
'
integration_test_pai'
-
job
:
'
integration_test_pai'
timeoutInMinutes
:
0
pool
:
'
NNI
CI
PAI
CLI'
pool
:
'
NNI
CI
PAI
CLI'
variables
:
variables
:
...
...
tools/nni_cmd/launcher.py
View file @
9f4485c1
...
@@ -168,7 +168,9 @@ def set_remote_config(experiment_config, port, config_file_name):
...
@@ -168,7 +168,9 @@ def set_remote_config(experiment_config, port, config_file_name):
with
open
(
stderr_full_path
,
'a+'
)
as
fout
:
with
open
(
stderr_full_path
,
'a+'
)
as
fout
:
fout
.
write
(
json
.
dumps
(
json
.
loads
(
err_message
),
indent
=
4
,
sort_keys
=
True
,
separators
=
(
','
,
':'
)))
fout
.
write
(
json
.
dumps
(
json
.
loads
(
err_message
),
indent
=
4
,
sort_keys
=
True
,
separators
=
(
','
,
':'
)))
return
False
,
err_message
return
False
,
err_message
result
,
message
=
setNNIManagerIp
(
experiment_config
,
port
,
config_file_name
)
if
not
result
:
return
result
,
message
#set trial_config
#set trial_config
return
set_trial_config
(
experiment_config
,
port
,
config_file_name
),
err_message
return
set_trial_config
(
experiment_config
,
port
,
config_file_name
),
err_message
...
...
tools/nni_trial_tool/trial_keeper.py
View file @
9f4485c1
...
@@ -48,10 +48,25 @@ def main_loop(args):
...
@@ -48,10 +48,25 @@ def main_loop(args):
# redirect trial keeper's stdout and stderr to syslog
# redirect trial keeper's stdout and stderr to syslog
trial_syslogger_stdout
=
RemoteLogger
(
args
.
nnimanager_ip
,
args
.
nnimanager_port
,
'trial'
,
StdOutputType
.
Stdout
)
trial_syslogger_stdout
=
RemoteLogger
(
args
.
nnimanager_ip
,
args
.
nnimanager_port
,
'trial'
,
StdOutputType
.
Stdout
)
sys
.
stdout
=
sys
.
stderr
=
trial_keeper_syslogger
sys
.
stdout
=
sys
.
stderr
=
trial_keeper_syslogger
# backward compatibility
hdfs_host
=
None
hdfs_output_dir
=
None
if
args
.
hdfs_host
:
hdfs_host
=
args
.
hdfs_host
elif
args
.
pai_hdfs_host
:
hdfs_host
=
args
.
pai_hdfs_host
if
args
.
hdfs_output_dir
:
hdfs_output_dir
=
args
.
hdfs_output_dir
elif
args
.
pai_hdfs_output_dir
:
hdfs_output_dir
=
args
.
pai_hdfs_output_dir
if
args
.
pai_
hdfs_host
is
not
None
and
args
.
nni_hdfs_exp_dir
is
not
None
:
if
hdfs_host
is
not
None
and
args
.
nni_hdfs_exp_dir
is
not
None
:
try
:
try
:
hdfs_client
=
HdfsClient
(
hosts
=
'{0}:{1}'
.
format
(
args
.
pai_hdfs_host
,
'50070'
),
user_name
=
args
.
pai_user_name
,
timeout
=
5
)
if
args
.
webhdfs_path
:
hdfs_client
=
HdfsClient
(
hosts
=
'{0}:80'
.
format
(
hdfs_host
),
user_name
=
args
.
pai_user_name
,
webhdfs_path
=
args
.
webhdfs_path
,
timeout
=
5
)
else
:
# backward compatibility
hdfs_client
=
HdfsClient
(
hosts
=
'{0}:{1}'
.
format
(
hdfs_host
,
'50070'
),
user_name
=
args
.
pai_user_name
,
timeout
=
5
)
except
Exception
as
e
:
except
Exception
as
e
:
nni_log
(
LogType
.
Error
,
'Create HDFS client error: '
+
str
(
e
))
nni_log
(
LogType
.
Error
,
'Create HDFS client error: '
+
str
(
e
))
raise
e
raise
e
...
@@ -67,14 +82,14 @@ def main_loop(args):
...
@@ -67,14 +82,14 @@ def main_loop(args):
# child worker process exits and all stdout data is read
# child worker process exits and all stdout data is read
if
retCode
is
not
None
and
log_pipe_stdout
.
set_process_exit
()
and
log_pipe_stdout
.
is_read_completed
==
True
:
if
retCode
is
not
None
and
log_pipe_stdout
.
set_process_exit
()
and
log_pipe_stdout
.
is_read_completed
==
True
:
nni_log
(
LogType
.
Info
,
'subprocess terminated. Exit code is {}. Quit'
.
format
(
retCode
))
nni_log
(
LogType
.
Info
,
'subprocess terminated. Exit code is {}. Quit'
.
format
(
retCode
))
if
args
.
pai_
hdfs_output_dir
is
not
None
:
if
hdfs_output_dir
is
not
None
:
# Copy local directory to hdfs for OpenPAI
# Copy local directory to hdfs for OpenPAI
nni_local_output_dir
=
os
.
environ
[
'NNI_OUTPUT_DIR'
]
nni_local_output_dir
=
os
.
environ
[
'NNI_OUTPUT_DIR'
]
try
:
try
:
if
copyDirectoryToHdfs
(
nni_local_output_dir
,
args
.
pai_
hdfs_output_dir
,
hdfs_client
):
if
copyDirectoryToHdfs
(
nni_local_output_dir
,
hdfs_output_dir
,
hdfs_client
):
nni_log
(
LogType
.
Info
,
'copy directory from {0} to {1} success!'
.
format
(
nni_local_output_dir
,
args
.
pai_
hdfs_output_dir
))
nni_log
(
LogType
.
Info
,
'copy directory from {0} to {1} success!'
.
format
(
nni_local_output_dir
,
hdfs_output_dir
))
else
:
else
:
nni_log
(
LogType
.
Info
,
'copy directory from {0} to {1} failed!'
.
format
(
nni_local_output_dir
,
args
.
pai_
hdfs_output_dir
))
nni_log
(
LogType
.
Info
,
'copy directory from {0} to {1} failed!'
.
format
(
nni_local_output_dir
,
hdfs_output_dir
))
except
Exception
as
e
:
except
Exception
as
e
:
nni_log
(
LogType
.
Error
,
'HDFS copy directory got exception: '
+
str
(
e
))
nni_log
(
LogType
.
Error
,
'HDFS copy directory got exception: '
+
str
(
e
))
raise
e
raise
e
...
@@ -95,10 +110,13 @@ if __name__ == '__main__':
...
@@ -95,10 +110,13 @@ if __name__ == '__main__':
PARSER
.
add_argument
(
'--trial_command'
,
type
=
str
,
help
=
'Command to launch trial process'
)
PARSER
.
add_argument
(
'--trial_command'
,
type
=
str
,
help
=
'Command to launch trial process'
)
PARSER
.
add_argument
(
'--nnimanager_ip'
,
type
=
str
,
default
=
'localhost'
,
help
=
'NNI manager rest server IP'
)
PARSER
.
add_argument
(
'--nnimanager_ip'
,
type
=
str
,
default
=
'localhost'
,
help
=
'NNI manager rest server IP'
)
PARSER
.
add_argument
(
'--nnimanager_port'
,
type
=
str
,
default
=
'8081'
,
help
=
'NNI manager rest server port'
)
PARSER
.
add_argument
(
'--nnimanager_port'
,
type
=
str
,
default
=
'8081'
,
help
=
'NNI manager rest server port'
)
PARSER
.
add_argument
(
'--pai_hdfs_output_dir'
,
type
=
str
,
help
=
'the output dir of hdfs'
)
PARSER
.
add_argument
(
'--pai_hdfs_output_dir'
,
type
=
str
,
help
=
'the output dir of pai_hdfs'
)
# backward compatibility
PARSER
.
add_argument
(
'--pai_hdfs_host'
,
type
=
str
,
help
=
'the host of hdfs'
)
PARSER
.
add_argument
(
'--hdfs_output_dir'
,
type
=
str
,
help
=
'the output dir of hdfs'
)
PARSER
.
add_argument
(
'--pai_hdfs_host'
,
type
=
str
,
help
=
'the host of pai_hdfs'
)
# backward compatibility
PARSER
.
add_argument
(
'--hdfs_host'
,
type
=
str
,
help
=
'the host of hdfs'
)
PARSER
.
add_argument
(
'--pai_user_name'
,
type
=
str
,
help
=
'the username of hdfs'
)
PARSER
.
add_argument
(
'--pai_user_name'
,
type
=
str
,
help
=
'the username of hdfs'
)
PARSER
.
add_argument
(
'--nni_hdfs_exp_dir'
,
type
=
str
,
help
=
'nni experiment directory in hdfs'
)
PARSER
.
add_argument
(
'--nni_hdfs_exp_dir'
,
type
=
str
,
help
=
'nni experiment directory in hdfs'
)
PARSER
.
add_argument
(
'--webhdfs_path'
,
type
=
str
,
help
=
'the webhdfs path used in webhdfs URL'
)
args
,
unknown
=
PARSER
.
parse_known_args
()
args
,
unknown
=
PARSER
.
parse_known_args
()
if
args
.
trial_command
is
None
:
if
args
.
trial_command
is
None
:
exit
(
1
)
exit
(
1
)
...
...
tools/setup.py
View file @
9f4485c1
...
@@ -12,7 +12,7 @@ setuptools.setup(
...
@@ -12,7 +12,7 @@ setuptools.setup(
'psutil'
,
'psutil'
,
'astor'
,
'astor'
,
'schema'
,
'schema'
,
'
pyhdfs
'
'
PythonWebHDFS
'
],
],
author
=
'Microsoft NNI Team'
,
author
=
'Microsoft NNI Team'
,
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment