OpenDAS / nni · Commits

Commit cfda0dae
NNI on Windows for NNI Local mode (#937)
Authored Apr 22, 2019 by demianzhang; committed by SparkSnail, Apr 22, 2019.
Parent: 88ceed71

Changes: 42 files in the commit. The diff view is paginated; this page shows 20 changed files with 284 additions and 123 deletions (+284 −123).
src/nni_manager/training_service/local/gpuScheduler.ts                            +18 −21
src/nni_manager/training_service/local/localTrainingService.ts                    +41 −30
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts   +2 −2
src/nni_manager/training_service/test/localTrainingService.test.ts                +1 −1
src/sdk/pynni/nni/common.py                                                       +2 −1
src/sdk/pynni/nni/platform/local.py                                               +6 −1
test/generate_ts_config.py                                                        +16 −0
test/pipelines-it-local-windows.yml                                               +38 −0
test/unittest.ps1                                                                 +27 −0
test/utils.py                                                                     +11 −4
tools/nni_annotation/__init__.py                                                  +11 −6
tools/nni_cmd/command_utils.py                                                    +55 −0
tools/nni_cmd/common_utils.py                                                     +15 −0
tools/nni_cmd/constants.py                                                        +6 −5
tools/nni_cmd/launcher.py                                                         +18 −24
tools/nni_cmd/nnictl.py                                                           +2 −0
tools/nni_cmd/nnictl_utils.py                                                     +5 −22
tools/nni_cmd/package_management.py                                               +4 −3
tools/nni_cmd/ssh_utils.py                                                        +3 −3
tools/nni_gpu_tool/gpu_metrics_collector.py                                       +3 −0
src/nni_manager/training_service/local/gpuScheduler.ts

@@ -25,9 +25,10 @@ import * as fs from 'fs';
 import * as os from 'os';
 import * as path from 'path';
 import { String } from 'typescript-string-operations';
+import { execMkdir, getScriptName, getgpuMetricsCollectorScriptContent, execScript, execTail, execRemove, execKill } from '../common/util'
 import { getLogger, Logger } from '../../common/log';
 import { delay } from '../../common/utils';
-import { GPU_INFO_COLLECTOR_FORMAT, GPUInfo, GPUSummary } from '../common/gpuData';
+import { GPUInfo, GPUSummary } from '../common/gpuData';

 /**
  * GPUScheduler for local training service

@@ -57,6 +58,19 @@ class GPUScheduler {
         }
     }

+    /**
+     * Generate gpu metric collector shell script in local machine,
+     * used to run in remote machine, and will be deleted after uploaded from local.
+     */
+    private async runGpuMetricsCollectorScript(): Promise<void> {
+        await execMkdir(this.gpuMetricCollectorScriptFolder);
+        //generate gpu_metrics_collector script
+        let gpuMetricsCollectorScriptPath: string = path.join(this.gpuMetricCollectorScriptFolder, getScriptName('gpu_metrics_collector'));
+        const gpuMetricsCollectorScriptContent: string = getgpuMetricsCollectorScriptContent(this.gpuMetricCollectorScriptFolder);
+        await fs.promises.writeFile(gpuMetricsCollectorScriptPath, gpuMetricsCollectorScriptContent, { encoding: 'utf8' });
+        execScript(gpuMetricsCollectorScriptPath)
+    }
+
     public getAvailableGPUIndices(): number[] {
         if (this.gpuSummary !== undefined) {
             return this.gpuSummary.gpuInfos.filter((info: GPUInfo) => info.activeProcessNum === 0)

@@ -78,33 +92,16 @@ class GPUScheduler {
         this.stopping = true;
         try {
             const pid: string = await fs.promises.readFile(path.join(this.gpuMetricCollectorScriptFolder, 'pid'), 'utf8');
-            await cpp.exec(`pkill -P ${pid}`);
-            await cpp.exec(`rm -rf ${this.gpuMetricCollectorScriptFolder}`);
+            await execKill(pid);
+            await execRemove(this.gpuMetricCollectorScriptFolder);
         } catch (error) {
             this.log.error(`GPU scheduler error: ${error}`);
         }
     }

-    /**
-     * Generate gpu metric collector shell script in local machine,
-     * used to run in remote machine, and will be deleted after uploaded from local.
-     */
-    private async runGpuMetricsCollectorScript(): Promise<void> {
-        await cpp.exec(`mkdir -p ${this.gpuMetricCollectorScriptFolder}`);
-        //generate gpu_metrics_collector.sh
-        const gpuMetricsCollectorScriptPath: string = path.join(this.gpuMetricCollectorScriptFolder, 'gpu_metrics_collector.sh');
-        const gpuMetricsCollectorScriptContent: string = String.Format(
-            GPU_INFO_COLLECTOR_FORMAT, this.gpuMetricCollectorScriptFolder, path.join(this.gpuMetricCollectorScriptFolder, 'pid'));
-        await fs.promises.writeFile(gpuMetricsCollectorScriptPath, gpuMetricsCollectorScriptContent, { encoding: 'utf8' });
-        cp.exec(`bash ${gpuMetricsCollectorScriptPath}`);
-    }
-
     private async updateGPUSummary(): Promise<void> {
-        const cmdresult: cpp.childProcessPromise.Result = await cpp.exec(`tail -n 1 ${path.join(this.gpuMetricCollectorScriptFolder, 'gpu_metrics')}`);
+        const cmdresult: cpp.childProcessPromise.Result = await execTail(path.join(this.gpuMetricCollectorScriptFolder, 'gpu_metrics'));
         if (cmdresult && cmdresult.stdout) {
             this.gpuSummary = <GPUSummary>JSON.parse(cmdresult.stdout);
         } else {
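The cross-platform helpers imported above from '../common/util' (execMkdir, getScriptName, getgpuMetricsCollectorScriptContent, execScript, execTail, execRemove, execKill) are what let the scheduler run on both Windows and Linux, but their implementation is not shown on this page of the diff. The sketch below is only an assumption about their shape, written against Node's child_process and child-process-promise APIs: the names come from the import list, every body is hypothetical, and getgpuMetricsCollectorScriptContent is left out because its script template is not visible here.

// Hypothetical sketch only -- not the actual src/nni_manager/training_service/common/util.ts.
import * as cp from 'child_process';
import * as cpp from 'child-process-promise';

// Scripts are PowerShell on Windows and bash elsewhere.
export function getScriptName(fileNamePrefix: string): string {
    return process.platform === 'win32' ? `${fileNamePrefix}.ps1` : `${fileNamePrefix}.sh`;
}

// Create a directory with the platform's own shell.
export async function execMkdir(directory: string): Promise<void> {
    if (process.platform === 'win32') {
        await cpp.exec(`powershell.exe New-Item -Path "${directory}" -ItemType "directory" -Force`);
    } else {
        await cpp.exec(`mkdir -p '${directory}'`);
    }
}

// Launch a generated script and hand the ChildProcess back to the caller.
export function execScript(filePath: string): cp.ChildProcess {
    if (process.platform === 'win32') {
        return cp.exec(`powershell.exe -file "${filePath}"`);
    }
    return cp.exec(`bash '${filePath}'`);
}

// Read the last line of the gpu_metrics file.
export async function execTail(filePath: string): Promise<cpp.childProcessPromise.Result> {
    if (process.platform === 'win32') {
        return cpp.exec(`powershell.exe Get-Content "${filePath}" -Tail 1`);
    }
    return cpp.exec(`tail -n 1 '${filePath}'`);
}

// Stop the collector process (tree) started by execScript.
export async function execKill(pid: string): Promise<void> {
    if (process.platform === 'win32') {
        await cpp.exec(`cmd /c taskkill /PID ${pid} /T /F`);
    } else {
        await cpp.exec(`pkill -P ${pid}`);
    }
}

// Remove the collector's working folder.
export async function execRemove(directory: string): Promise<void> {
    if (process.platform === 'win32') {
        await cpp.exec(`powershell.exe Remove-Item "${directory}" -Recurse -Force`);
    } else {
        await cpp.exec(`rm -rf '${directory}'`);
    }
}

Whatever the real bodies look like, the pattern the diff relies on is the same in every helper: branch on process.platform and run either a PowerShell/cmd command or the original POSIX command.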
src/nni_manager/training_service/local/localTrainingService.ts

@@ -18,7 +18,6 @@
  */
 'use strict';

 import * as cpp from 'child-process-promise';
 import * as cp from 'child_process';
 import { EventEmitter } from 'events';

@@ -32,7 +31,8 @@ import {
     HostJobApplicationForm, HyperParameters, JobApplicationForm, TrainingService,
     TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, TrialJobStatus
 } from '../../common/trainingService';
-import { delay, generateParamFileName, getExperimentRootDir, getJobCancelStatus, uniqueString } from '../../common/utils';
+import { delay, generateParamFileName, getExperimentRootDir, getJobCancelStatus, uniqueString, isAlive, getNewLine } from '../../common/utils';
+import { execMkdir, getScriptName, execScript, setEnvironmentVariable, execNewFile } from '../common/util'
 import { TrialConfig } from '../common/trialConfig';
 import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
 import { GPUScheduler } from './gpuScheduler';

@@ -169,14 +169,7 @@ class LocalTrainingService implements TrainingService {
             return this.getHostJob(trialJobId);
         }
         if (trialJob.status === 'RUNNING') {
-            let alive: boolean = false;
-            try {
-                await cpp.exec(`kill -0 ${trialJob.pid}`);
-                alive = true;
-            } catch (error) {
-                //ignore
-            }
-
+            let alive: boolean = await isAlive(trialJob.pid);
             if (!alive) {
                 trialJob.endTime = Date.now();
                 this.setTrialJobStatus(trialJob, 'FAILED');

@@ -284,7 +277,9 @@ class LocalTrainingService implements TrainingService {
     public async setClusterMetadata(key: string, value: string): Promise<void> {
         if (!this.initialized) {
            this.rootDir = getExperimentRootDir();
-            await cpp.exec(`mkdir -p ${this.rootDir}`);
+            if (!fs.existsSync(this.rootDir)) {
+                await cpp.exec(`powershell.exe mkdir ${this.rootDir}`);
+            }
             this.initialized = true;
         }
         switch (key) {

@@ -381,7 +376,7 @@ class LocalTrainingService implements TrainingService {
         envVariables.push({
             key: 'CUDA_VISIBLE_DEVICES',
-            value: this.gpuScheduler === undefined ? '' : resource.gpuIndices.join(',')
+            value: this.gpuScheduler === undefined ? '-1' : resource.gpuIndices.join(',')
         });

         return envVariables;

@@ -465,36 +460,52 @@ class LocalTrainingService implements TrainingService {
         }
     }

+    private getScript(localTrailConfig: TrialConfig, workingDirectory: string): string[] {
+        let script: string[] = [];
+        if (process.platform === "win32") {
+            script.push(
+                `cmd /c ${localTrailConfig.command} 2>${path.join(workingDirectory, 'stderr')}`,
+                `$NOW_DATE = [int64](([datetime]::UtcNow)-(get-date "1/1/1970")).TotalSeconds`,
+                `$NOW_DATE = "$NOW_DATE" + "000"`,
+                `Write $LASTEXITCODE " " $NOW_DATE | Out-File ${path.join(workingDirectory, '.nni', 'state')} -NoNewline -encoding utf8`);
+        } else {
+            script.push(
+                `eval ${localTrailConfig.command} 2>${path.join(workingDirectory, 'stderr')}`,
+                `echo $? \`date +%s000\` >${path.join(workingDirectory, '.nni', 'state')}`);
+        }
+
+        return script;
+    }
+
     private async runTrialJob(trialJobId: string, resource: {gpuIndices: number[]}): Promise<void> {
         const trialJobDetail: LocalTrialJobDetail = <LocalTrialJobDetail>this.jobMap.get(trialJobId);
         const variables: { key: string; value: string }[] = this.getEnvironmentVariables(trialJobDetail, resource);

-        const runScriptLines: string[] = [];

         if (!this.localTrailConfig) {
             throw new Error('trial config is not initialized');
         }
-        runScriptLines.push(
-            '#!/bin/bash',
-            `cd ${this.localTrailConfig.codeDir}`);
+        const runScriptLines: string[] = [];
+        if (process.platform !== "win32") {
+            runScriptLines.push('#!/bin/bash');
+        }
+        runScriptLines.push(`cd ${this.localTrailConfig.codeDir}`);
         for (const variable of variables) {
-            runScriptLines.push(`export ${variable.key}=${variable.value}`);
+            runScriptLines.push(setEnvironmentVariable(variable));
         }
-        runScriptLines.push(
-            `eval ${this.localTrailConfig.command} 2>${path.join(trialJobDetail.workingDirectory, 'stderr')}`,
-            `echo $? \`date +%s000\` >${path.join(trialJobDetail.workingDirectory, '.nni', 'state')}`);
-        await cpp.exec(`mkdir -p ${trialJobDetail.workingDirectory}`);
-        await cpp.exec(`mkdir -p ${path.join(trialJobDetail.workingDirectory, '.nni')}`);
-        await cpp.exec(`touch ${path.join(trialJobDetail.workingDirectory, '.nni', 'metrics')}`);
-        await fs.promises.writeFile(path.join(trialJobDetail.workingDirectory, 'run.sh'),
-            runScriptLines.join('\n'), { encoding: 'utf8', mode: 0o777 });
+        const scripts: string[] = this.getScript(this.localTrailConfig, trialJobDetail.workingDirectory);
+        scripts.forEach(script => {
+            runScriptLines.push(script);
+        });
+        await execMkdir(trialJobDetail.workingDirectory);
+        await execMkdir(path.join(trialJobDetail.workingDirectory, '.nni'));
+        await execNewFile(path.join(trialJobDetail.workingDirectory, '.nni', 'metrics'));
+        const scriptName: string = getScriptName('run');
+        await fs.promises.writeFile(path.join(trialJobDetail.workingDirectory, scriptName),
+            runScriptLines.join(getNewLine()), { encoding: 'utf8', mode: 0o777 });
         await this.writeParameterFile(trialJobDetail.workingDirectory, (<TrialJobApplicationForm>trialJobDetail.form).hyperParameters);
-        const process: cp.ChildProcess = cp.exec(`bash ${path.join(trialJobDetail.workingDirectory, 'run.sh')}`);
+        const trialJobProcess: cp.ChildProcess = execScript(path.join(trialJobDetail.workingDirectory, scriptName));

         this.setTrialJobStatus(trialJobDetail, 'RUNNING');
         trialJobDetail.startTime = Date.now();
-        trialJobDetail.pid = process.pid;
+        trialJobDetail.pid = trialJobProcess.pid;
         this.setExtraProperties(trialJobDetail, resource);

         let buffer: Buffer = Buffer.alloc(0);
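As with the gpuScheduler changes, the helpers this file now leans on (isAlive and getNewLine from '../../common/utils', setEnvironmentVariable and execNewFile from '../common/util') are defined elsewhere in the commit and do not appear on this page. A hedged sketch of their likely shape, with hypothetical bodies:

// Hypothetical sketch only -- the real helpers live in common/utils.ts and ../common/util.ts.
import * as cpp from 'child-process-promise';

// Line separator for the generated trial script: CRLF for PowerShell, LF for bash.
export function getNewLine(): string {
    return process.platform === 'win32' ? '\r\n' : '\n';
}

// Render one environment-variable assignment in the target shell's syntax.
export function setEnvironmentVariable(variable: { key: string; value: string }): string {
    if (process.platform === 'win32') {
        return `$env:${variable.key}="${variable.value}"`;
    }
    return `export ${variable.key}=${variable.value}`;
}

// Create an empty file (used for .nni/metrics) without relying on `touch`.
export async function execNewFile(filePath: string): Promise<void> {
    if (process.platform === 'win32') {
        await cpp.exec(`powershell.exe New-Item -Path "${filePath}" -ItemType "file" -Force`);
    } else {
        await cpp.exec(`touch '${filePath}'`);
    }
}

// Check whether a trial process is still running without shelling out to `kill -0`.
export async function isAlive(pid: number): Promise<boolean> {
    try {
        if (process.platform === 'win32') {
            const result = await cpp.exec(`tasklist /FI "PID eq ${pid}"`);
            return result.stdout.indexOf(String(pid)) !== -1;
        }
        await cpp.exec(`kill -0 ${pid}`);
        return true;
    } catch (error) {
        return false;
    }
}

Whatever the exact implementations, the point of the rewrite is visible in the hunk above: the run script is now named through getScriptName('run') (presumably run.ps1 on Windows and run.sh elsewhere), its lines are joined with getNewLine(), environment variables are emitted per platform, and the script is launched through execScript instead of a hard-coded `bash run.sh`.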
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts

@@ -46,7 +46,7 @@ import {
     RemoteMachineScheduleInfo, RemoteMachineScheduleResult, SSHClient, SSHClientManager,
     RemoteMachineTrialJobDetail, ScheduleResultType, REMOTEMACHINE_TRIAL_COMMAND_FORMAT
 } from './remoteMachineData';
-import { GPU_INFO_COLLECTOR_FORMAT } from '../common/gpuData';
+import { GPU_INFO_COLLECTOR_FORMAT_LINUX } from '../common/gpuData';
 import { SSHClientUtility } from './sshClientUtility';
 import { validateCodeDir } from '../common/util';
 import { RemoteMachineJobRestServer } from './remoteMachineJobRestServer';

@@ -452,7 +452,7 @@ class RemoteMachineTrainingService implements TrainingService {
         let gpuMetricsCollectorScriptPath: string = path.join(gpuMetricCollectorScriptFolder, userName, 'gpu_metrics_collector.sh');
         const remoteGPUScriptsDir: string = this.getRemoteScriptsPath(userName); // This directory is used to store gpu_metrics and pid created by script
         const gpuMetricsCollectorScriptContent: string = String.Format(
-            GPU_INFO_COLLECTOR_FORMAT, remoteGPUScriptsDir, path.join(remoteGPUScriptsDir, 'pid'),
+            GPU_INFO_COLLECTOR_FORMAT_LINUX, remoteGPUScriptsDir, path.join(remoteGPUScriptsDir, 'pid'),
         );
src/nni_manager/training_service/test/localTrainingService.test.ts

@@ -31,7 +31,7 @@ import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
 import { LocalTrainingService } from '../local/localTrainingService';

 // TODO: copy mockedTrail.py to local folder
-const localCodeDir: string = tmp.dirSync().name
+const localCodeDir: string = tmp.dirSync().name.split('\\').join('\\\\');
 const mockedTrialPath: string = './training_service/test/mockedTrial.py'
 fs.copyFileSync(mockedTrialPath, localCodeDir + '/mockedTrial.py')
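The only change in this test is escaping the temporary directory path. On Windows, tmp.dirSync().name returns a path containing single backslashes, which would be eaten as escape characters when the string is later re-parsed (presumably when it is embedded into the generated trial configuration); doubling them keeps the literal path intact, and on Linux the transformation is a no-op because the path contains no backslashes. A small illustration with a made-up path:

// Illustration only: doubling backslashes so a Windows temp path survives re-parsing.
const rawDir: string = 'C:\\Users\\build\\AppData\\Local\\Temp\\tmp-1234'; // roughly what tmp.dirSync().name yields on Windows
const escapedDir: string = rawDir.split('\\').join('\\\\');                // the same transformation as in the test

console.log(rawDir);     // C:\Users\build\AppData\Local\Temp\tmp-1234
console.log(escapedDir); // C:\\Users\\build\\AppData\\Local\\Temp\\tmp-1234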
src/sdk/pynni/nni/common.py

@@ -33,7 +33,8 @@ log_level_map = {
     'debug': logging.DEBUG
 }

-_time_format = '%m/%d/%Y, %I:%M:%S %P'
+_time_format = '%m/%d/%Y, %I:%M:%S %p'

 class _LoggerFileWrapper(TextIOBase):
     def __init__(self, logger_file):
         self.file = logger_file
src/sdk/pynni/nni/platform/local.py

@@ -19,6 +19,7 @@
 # ==================================================================================================

 import os
+import sys
 import json
 import time
 import subprocess

@@ -87,7 +88,11 @@ def send_metric(string):
     assert len(data) < 1000000, 'Metric too long'
     _metric_file.write(b'ME%06d%b' % (len(data), data))
     _metric_file.flush()
-    subprocess.run(['touch', _metric_file.name], check=True)
+    if sys.platform == "win32":
+        file = open(_metric_file.name)
+        file.close()
+    else:
+        subprocess.run(['touch', _metric_file.name], check=True)

 def get_sequence_id():
     return trial_env_vars.NNI_TRIAL_SEQ_ID
test/generate_ts_config.py

@@ -18,6 +18,8 @@
 # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

+import sys
+import glob
 import argparse
 from utils import get_yml_content, dump_yml_content

@@ -69,6 +71,19 @@ def update_training_service_config(args):
     dump_yml_content(TRAINING_SERVICE_FILE, config)

+def convert_command():
+    '''convert command by platform'''
+    if sys.platform != 'win32':
+        return None
+    config_files = glob.glob('./**/*.yml') + glob.glob('./**/**/*.yml')
+    for config_file in config_files:
+        print('processing {}'.format(config_file))
+        yml_content = get_yml_content(config_file)
+        if yml_content.get('trial'):
+            if yml_content['trial'].get('command'):
+                yml_content['trial']['command'] = yml_content['trial']['command'].replace('python3', 'python')
+        dump_yml_content(config_file, yml_content)
+
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument("--ts", type=str, choices=['pai', 'kubeflow', 'remote'], default='pai')

@@ -96,3 +111,4 @@ if __name__ == '__main__':
     args = parser.parse_args()

     update_training_service_config(args)
+    convert_command()
test/pipelines-it-local-windows.yml (new file, mode 100644)

jobs:
- job: 'Test'
  steps:
  - script: |
      powershell.exe -file install.ps1
    displayName: 'Install nni toolkit via source code'
  - script: |
      python -m pip install scikit-learn==0.20.0 --user
      python -m pip install keras==2.1.6 --user
      python -m pip install https://download.pytorch.org/whl/cu90/torch-0.4.1-cp36-cp36m-win_amd64.whl --user
      python -m pip install torchvision --user
      python -m pip install tensorflow-gpu==1.11.0 --user
    displayName: 'Install dependencies for integration tests'
  - script: |
      cd test
      python generate_ts_config.py
    displayName: 'generate config files'
  - script: |
      cd test
      python config_test.py --ts local --local_gpu --exclude smac,bohb
    displayName: 'Examples and advanced features tests on local machine'
  - script: |
      cd test
      powershell.exe -file unittest.ps1
    displayName: 'unit test'
  - script: |
      cd test
      python naive_test.py
    displayName: 'Naive test'
  - script: |
      cd test
      python tuner_test.py
    displayName: 'Built-in tuners / assessors tests'
  - script: |
      cd test
      python metrics_test.py
    displayName: 'Trial job metrics test'
test/unittest.ps1 (new file, mode 100644)

$CWD = $PWD

# -------------For python unittest-------------

## ------Run annotation test------
echo ""
echo "===========================Testing: nni_annotation==========================="
cd $CWD/../tools/
python -m unittest -v nni_annotation/test_annotation.py

## Export certain environment variables for unittest code to work
$env:NNI_TRIAL_JOB_ID = "test_trial_job_id"
$env:NNI_PLATFORM = "unittest"

## ------Run sdk test------
echo ""
echo "===========================Testing: nni_sdk==========================="
cd $CWD/../src/sdk/pynni/
python -m unittest discover -v tests

# -------------For typescript unittest-------------
cd $CWD/../src/nni_manager
echo ""
echo "===========================Testing: nni_manager==========================="
npm run test
test/utils.py

@@ -22,6 +22,7 @@ import contextlib
 import collections
 import json
 import os
+import sys
 import subprocess
 import requests
 import ruamel.yaml as yaml

@@ -65,7 +66,7 @@ def dump_yml_content(file_path, content):
 def setup_experiment(installed=True):
     '''setup the experiment if nni is not installed'''
     if not installed:
-        os.environ['PATH'] = os.environ['PATH'] + ':' + os.environ['PWD']
+        os.environ['PATH'] = os.environ['PATH'] + ':' + os.getcwd()
         sdk_path = os.path.abspath('../src/sdk/pynni')
         cmd_path = os.path.abspath('../tools')
         pypath = os.environ.get('PYTHONPATH')

@@ -79,7 +80,7 @@ def fetch_nni_log_path(experiment_url):
     '''get nni's log path from nni's experiment url'''
     experiment_profile = requests.get(experiment_url)
     experiment_id = json.loads(experiment_profile.text)['id']
-    experiment_path = os.path.join(os.environ['HOME'], 'nni/experiments', experiment_id)
+    experiment_path = os.path.join(os.path.expanduser('~'), 'nni', 'experiments', experiment_id)
     nnimanager_log_path = os.path.join(experiment_path, 'log', 'nnimanager.log')

     return nnimanager_log_path

@@ -87,7 +88,10 @@ def fetch_nni_log_path(experiment_url):
 def is_experiment_done(nnimanager_log_path):
     '''check if the experiment is done successfully'''
     assert os.path.exists(nnimanager_log_path), 'Experiment starts failed'
-    cmds = ['cat', nnimanager_log_path, '|', 'grep', EXPERIMENT_DONE_SIGNAL]
+    if sys.platform == "win32":
+        cmds = ['type', nnimanager_log_path, '|', 'find', EXPERIMENT_DONE_SIGNAL]
+    else:
+        cmds = ['cat', nnimanager_log_path, '|', 'grep', EXPERIMENT_DONE_SIGNAL]
     completed_process = subprocess.run(' '.join(cmds), shell=True)

     return completed_process.returncode == 0

@@ -112,7 +116,10 @@ def print_stderr(trial_jobs_url):
     for trial_job in trial_jobs:
         if trial_job['status'] == 'FAILED':
             stderr_path = trial_job['stderrPath'].split(':')[-1]
-            subprocess.run(['cat', stderr_path])
+            if sys.platform == "win32":
+                subprocess.run(['type', stderr_path], shell=True)
+            else:
+                subprocess.run(['cat', stderr_path])

 def parse_max_duration_time(max_exec_duration):
     unit = max_exec_duration[-1]
tools/nni_annotation/__init__.py

@@ -20,6 +20,7 @@
 import os
+import sys
 import shutil

 from . import code_generator

@@ -28,6 +29,9 @@ from . import search_space_generator
 __all__ = ['generate_search_space', 'expand_annotations']

+slash = '/'
+if sys.platform == "win32":
+    slash = '\\'

 def generate_search_space(code_dir):
     """Generate search space from Python source code.

@@ -35,8 +39,8 @@ def generate_search_space(code_dir):
         code_dir: directory path of source files (str)
     """
     search_space = {}

-    if code_dir.endswith('/'):
+    if code_dir.endswith(slash):
         code_dir = code_dir[:-1]

     for subdir, _, files in os.walk(code_dir):

@@ -44,9 +48,9 @@ def generate_search_space(code_dir):
         if subdir == code_dir:
             package = ''
         else:
-            assert subdir.startswith(code_dir + '/'), subdir
+            assert subdir.startswith(code_dir + slash), subdir
             prefix_len = len(code_dir) + 1
-            package = subdir[prefix_len:].replace('/', '.') + '.'
+            package = subdir[prefix_len:].replace(slash, '.') + '.'
         for file_name in files:
             if file_name.endswith('.py'):

@@ -76,9 +80,10 @@ def expand_annotations(src_dir, dst_dir):
         src_dir: directory path of user code (str)
         dst_dir: directory to place generated files (str)
     """
-    if src_dir[-1] == '/':
+    if src_dir[-1] == slash:
         src_dir = src_dir[:-1]
-    if dst_dir[-1] == '/':
+    if dst_dir[-1] == slash:
         dst_dir = dst_dir[:-1]

     annotated = False
tools/nni_cmd/command_utils.py (new file, mode 100644)

from subprocess import call, check_output
import sys
import os
import signal
import psutil
from .common_utils import print_error, print_normal, print_warning

def check_output_command(file_path, head=None, tail=None):
    '''call check_output command to read content from a file'''
    if os.path.exists(file_path):
        if sys.platform == 'win32':
            cmds = ['powershell.exe', 'type', file_path]
            if head:
                cmds += ['|', 'select', '-first', str(head)]
            elif tail:
                cmds += ['|', 'select', '-last', str(tail)]
            return check_output(cmds, shell=True).decode('utf-8')
        else:
            cmds = ['cat', file_path]
            if head:
                cmds = ['head', '-' + str(head), file_path]
            elif tail:
                cmds = ['tail', '-' + str(tail), file_path]
            return check_output(cmds, shell=False).decode('utf-8')
    else:
        print_error('{0} does not exist!'.format(file_path))
        exit(1)

def kill_command(pid):
    '''kill command'''
    if sys.platform == 'win32':
        process = psutil.Process(pid=pid)
        process.send_signal(signal.CTRL_BREAK_EVENT)
    else:
        cmds = ['kill', str(pid)]
        call(cmds)

def install_package_command(package_name):
    '''install python package from pip'''
    #TODO refactor python logic
    if sys.platform == "win32":
        cmds = 'python -m pip install --user {0}'.format(package_name)
    else:
        cmds = 'python3 -m pip install --user {0}'.format(package_name)
    call(cmds, shell=True)

def install_requirements_command(requirements_path):
    '''install requirements.txt'''
    cmds = 'cd ' + requirements_path + ' && {0} -m pip install --user -r requirements.txt'
    #TODO refactor python logic
    if sys.platform == "win32":
        cmds = cmds.format('python')
    else:
        cmds = cmds.format('python3')
    call(cmds, shell=True)
tools/nni_cmd/common_utils.py

@@ -18,10 +18,13 @@
 # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 import os
+import sys
 import json
 import ruamel.yaml as yaml
 import psutil
 import socket
+from pathlib import Path
 from .constants import ERROR_INFO, NORMAL_INFO, WARNING_INFO, COLOR_RED_FORMAT, COLOR_YELLOW_FORMAT

 def get_yml_content(file_path):

@@ -71,3 +74,15 @@ def detect_port(port):
         return True
     except:
         return False
+
+def get_user():
+    if sys.platform == 'win32':
+        return os.environ['USERNAME']
+    else:
+        return os.environ['USER']
+
+def get_python_dir(sitepackages_path):
+    if sys.platform == "win32":
+        return str(Path(sitepackages_path))
+    else:
+        return str(Path(sitepackages_path).parents[2])
\ No newline at end of file
tools/nni_cmd/constants.py

@@ -19,8 +19,9 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 import os
+from colorama import Fore

-NNICTL_HOME_DIR = os.path.join(os.environ['HOME'], '.local', 'nnictl')
+NNICTL_HOME_DIR = os.path.join(os.path.expanduser('~'), '.local', 'nnictl')

 ERROR_INFO = 'ERROR: %s'

@@ -32,7 +33,7 @@ DEFAULT_REST_PORT = 8080
 REST_TIME_OUT = 20

-EXPERIMENT_SUCCESS_INFO = '\033[1;32;32mSuccessfully started experiment!\n\033[0m' \
+EXPERIMENT_SUCCESS_INFO = Fore.GREEN + 'Successfully started experiment!\n' + Fore.RESET + \
                           '-----------------------------------------------------------------------\n' \
                           'The experiment id is %s\n' \
                           'The Web UI urls are: %s\n' \

@@ -94,11 +95,11 @@ TUNERS_NO_NEED_TO_IMPORT_DATA = {
     'Hyperband'
 }

-COLOR_RED_FORMAT = '\033[1;31;31m%s\033[0m'
+COLOR_RED_FORMAT = Fore.RED + '%s'

-COLOR_GREEN_FORMAT = '\033[1;32;32m%s\033[0m'
+COLOR_GREEN_FORMAT = Fore.GREEN + '%s'

-COLOR_YELLOW_FORMAT = '\033[1;33;33m%s\033[0m'
+COLOR_YELLOW_FORMAT = Fore.YELLOW + '%s'

 SCHEMA_TYPE_ERROR = '%s should be %s type!'
tools/nni_cmd/launcher.py

@@ -32,12 +32,13 @@ from .launcher_utils import validate_all_content
 from .rest_utils import rest_put, rest_post, check_rest_server, check_rest_server_quick, check_response
 from .url_utils import cluster_metadata_url, experiment_url, get_local_urls
 from .config_utils import Config, Experiments
-from .common_utils import get_yml_content, get_json_content, print_error, print_normal, print_warning, detect_process, detect_port
+from .common_utils import get_yml_content, get_json_content, print_error, print_normal, print_warning, detect_process, detect_port, get_user, get_python_dir
 from .constants import *
 import random
 import site
 import time
 from pathlib import Path
+from .command_utils import check_output_command, kill_command

 def get_log_path(config_file_name):
     '''generate stdout and stderr log path'''

@@ -49,14 +50,10 @@ def print_log_content(config_file_name):
     '''print log information'''
     stdout_full_path, stderr_full_path = get_log_path(config_file_name)
     print_normal(' Stdout:')
-    stdout_cmds = ['cat', stdout_full_path]
-    stdout_content = check_output(stdout_cmds)
-    print(stdout_content.decode('utf-8'))
+    print(check_output_command(stdout_full_path))
     print('\n\n')
     print_normal(' Stderr:')
-    stderr_cmds = ['cat', stderr_full_path]
-    stderr_content = check_output(stderr_cmds)
-    print(stderr_content.decode('utf-8'))
+    print(check_output_command(stderr_full_path))

 def get_nni_installation_path():
     ''' Find nni lib from the following locations in order

@@ -67,7 +64,7 @@ def get_nni_installation_path():
     Return None if nothing is found
     '''
     def _generate_installation_path(sitepackages_path):
-        python_dir = str(Path(sitepackages_path).parents[2])
+        python_dir = get_python_dir(sitepackages_path)
         entry_file = os.path.join(python_dir, 'nni', 'main.js')
         if os.path.isfile(entry_file):
             return python_dir

@@ -132,7 +129,11 @@ def start_rest_server(port, platform, mode, config_file_name, experiment_id=None
     log_header = LOG_HEADER % str(time_now)
     stdout_file.write(log_header)
     stderr_file.write(log_header)
-    process = Popen(cmds, cwd=entry_dir, stdout=stdout_file, stderr=stderr_file)
+    if sys.platform == 'win32':
+        from subprocess import CREATE_NEW_PROCESS_GROUP
+        process = Popen(cmds, cwd=entry_dir, stdout=stdout_file, stderr=stderr_file, creationflags=CREATE_NEW_PROCESS_GROUP)
+    else:
+        process = Popen(cmds, cwd=entry_dir, stdout=stdout_file, stderr=stderr_file)
     return process, str(time_now)

 def set_trial_config(experiment_config, port, config_file_name):

@@ -357,7 +358,7 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen
     nni_config.set_config('restServerPid', rest_process.pid)
     # Deal with annotation
     if experiment_config.get('useAnnotation'):
-        path = os.path.join(tempfile.gettempdir(), os.environ['USER'], 'nni', 'annotation')
+        path = os.path.join(tempfile.gettempdir(), get_user(), 'nni', 'annotation')
         if not os.path.isdir(path):
             os.makedirs(path)
         path = tempfile.mkdtemp(dir=path)

@@ -380,8 +381,7 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen
         print_error('Restful server start failed!')
         print_log_content(config_file_name)
         try:
-            cmds = ['kill', str(rest_process.pid)]
-            call(cmds)
+            kill_command(rest_process.pid)
         except Exception:
             raise Exception(ERROR_INFO % 'Rest server stopped!')
         exit(1)

@@ -395,8 +395,7 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen
     else:
         print_error('Failed! Error is: {}'.format(err_msg))
         try:
-            cmds = ['kill', str(rest_process.pid)]
-            call(cmds)
+            kill_command(rest_process.pid)
         except Exception:
             raise Exception(ERROR_INFO % 'Rest server stopped!')
         exit(1)

@@ -409,8 +408,7 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen
     else:
         print_error('Set local config failed!')
         try:
-            cmds = ['kill', str(rest_process.pid)]
-            call(cmds)
+            kill_command(rest_process.pid)
         except Exception:
             raise Exception(ERROR_INFO % 'Rest server stopped!')
         exit(1)

@@ -425,8 +423,7 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen
         if err_msg:
             print_error('Failed! Error is: {}'.format(err_msg))
             try:
-                cmds = ['kill', str(rest_process.pid)]
-                call(cmds)
+                kill_command(rest_process.pid)
             except Exception:
                 raise Exception(ERROR_INFO % 'Restful server stopped!')
             exit(1)

@@ -441,8 +438,7 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen
         if err_msg:
             print_error('Failed! Error is: {}'.format(err_msg))
             try:
-                cmds = ['pkill', str(rest_process.pid)]
-                call(cmds)
+                kill_command(rest_process.pid)
             except Exception:
                 raise Exception(ERROR_INFO % 'Restful server stopped!')
             exit(1)

@@ -457,8 +453,7 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen
         if err_msg:
             print_error('Failed! Error is: {}'.format(err_msg))
             try:
-                cmds = ['pkill', str(rest_process.pid)]
-                call(cmds)
+                kill_command(rest_process.pid)
             except Exception:
                 raise Exception(ERROR_INFO % 'Restful server stopped!')
             exit(1)

@@ -477,8 +472,7 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen
         print_error('Start experiment failed!')
         print_log_content(config_file_name)
         try:
-            cmds = ['kill', str(rest_process.pid)]
-            call(cmds)
+            kill_command(rest_process.pid)
         except Exception:
             raise Exception(ERROR_INFO % 'Restful server stopped!')
         exit(1)
tools/nni_cmd/nnictl.py

@@ -27,6 +27,8 @@ from .nnictl_utils import *
 from .package_management import *
 from .constants import *
 from .tensorboard_utils import *
+from colorama import init
+init(autoreset=True)

 if os.environ.get('COVERAGE_PROCESS_START'):
     import coverage
tools/nni_cmd/nnictl_utils.py

@@ -24,7 +24,6 @@ import psutil
 import json
 import datetime
 import time
-from subprocess import call, check_output
 from .rest_utils import rest_get, rest_delete, check_rest_server_quick, check_response
 from .config_utils import Config, Experiments

@@ -32,6 +31,7 @@ from .url_utils import trial_jobs_url, experiment_url, trial_job_id_url
 from .constants import NNICTL_HOME_DIR, EXPERIMENT_INFORMATION_FORMAT, EXPERIMENT_DETAIL_FORMAT, \
      EXPERIMENT_MONITOR_INFO, TRIAL_MONITOR_HEAD, TRIAL_MONITOR_CONTENT, TRIAL_MONITOR_TAIL, REST_TIME_OUT
 from .common_utils import print_normal, print_error, print_warning, detect_process
+from .command_utils import check_output_command, kill_command

 def get_experiment_time(port):
     '''get the startTime and endTime of an experiment'''

@@ -219,14 +219,12 @@ def stop_experiment(args):
         rest_port = nni_config.get_config('restServerPort')
         rest_pid = nni_config.get_config('restServerPid')
         if rest_pid:
-            stop_rest_cmds = ['kill', str(rest_pid)]
-            call(stop_rest_cmds)
+            kill_command(rest_pid)
             tensorboard_pid_list = nni_config.get_config('tensorboardPidList')
             if tensorboard_pid_list:
                 for tensorboard_pid in tensorboard_pid_list:
                     try:
-                        cmds = ['kill', '-9', str(tensorboard_pid)]
-                        call(cmds)
+                        kill_command(tensorboard_pid)
                     except Exception as exception:
                         print_error(exception)
                 nni_config.set_config('tensorboardPidList', [])

@@ -303,14 +301,6 @@ def experiment_status(args):
     else:
         print(json.dumps(json.loads(response.text), indent=4, sort_keys=True, separators=(',', ':')))

-def get_log_content(file_name, cmds):
-    '''use cmds to read config content'''
-    if os.path.exists(file_name):
-        rest = check_output(cmds)
-        print(rest.decode('utf-8'))
-    else:
-        print_normal('NULL!')
-
 def log_internal(args, filetype):
     '''internal function to call get_log_content'''
     file_name = get_config_filename(args)

@@ -318,15 +308,8 @@ def log_internal(args, filetype):
         file_full_path = os.path.join(NNICTL_HOME_DIR, file_name, 'stdout')
     else:
         file_full_path = os.path.join(NNICTL_HOME_DIR, file_name, 'stderr')
-    if args.head:
-        get_log_content(file_full_path, ['head', '-' + str(args.head), file_full_path])
-    elif args.tail:
-        get_log_content(file_full_path, ['tail', '-' + str(args.tail), file_full_path])
-    elif args.path:
-        print_normal('The path of stdout file is: ' + file_full_path)
-    else:
-        get_log_content(file_full_path, ['cat', file_full_path])
+    print(check_output_command(file_full_path, head=args.head, tail=args.tail))

 def log_stdout(args):
     '''get stdout log'''
     log_internal(args, 'stdout')
tools/nni_cmd/package_management.py

@@ -20,17 +20,18 @@
 import nni
 import os
+import sys
 from subprocess import call
 from .constants import PACKAGE_REQUIREMENTS
 from .common_utils import print_normal, print_error
+from .command_utils import install_requirements_command

 def process_install(package_name):
     if PACKAGE_REQUIREMENTS.get(package_name) is None:
         print_error('{0} is not supported!' % package_name)
     else:
         requirements_path = os.path.join(nni.__path__[0], PACKAGE_REQUIREMENTS[package_name])
-        cmds = 'cd ' + requirements_path + ' && python3 -m pip install --user -r requirements.txt'
-        call(cmds, shell=True)
+        install_requirements_command(requirements_path)

 def package_install(args):
     '''install packages'''

@@ -39,4 +40,4 @@ def package_install(args):
 def package_show(args):
     '''show all packages'''
-    print(' '.join(PACKAGE_REQUIREMENTS.keys()))
\ No newline at end of file
+    print(' '.join(PACKAGE_REQUIREMENTS.keys()))
tools/nni_cmd/ssh_utils.py

@@ -21,14 +21,14 @@
 import os
 from .common_utils import print_error
 from subprocess import call
+from .command_utils import install_package_command

 def check_environment():
     '''check if paramiko is installed'''
     try:
         import paramiko
     except:
-        cmds = 'python3 -m pip install --user paramiko'
-        call(cmds, shell=True)
+        install_package_command('paramiko')

 def copy_remote_directory_to_local(sftp, remote_path, local_path):
     '''copy remote directory to local machine'''

@@ -56,4 +56,4 @@ def create_ssh_sftp_client(host_ip, port, username, password):
         sftp = paramiko.SFTPClient.from_transport(conn)
         return sftp
     except Exception as exception:
-        print_error('Create ssh client error %s\n' % exception)
\ No newline at end of file
+        print_error('Create ssh client error %s\n' % exception)
tools/nni_gpu_tool/gpu_metrics_collector.py

@@ -25,6 +25,9 @@ import time
 from xml.dom import minidom

 def check_ready_to_run():
+    #TODO check process in windows
+    if sys.platform == 'win32':
+        return True
     pgrep_output = subprocess.check_output('pgrep -fx \'python3 -m nni_gpu_tool.gpu_metrics_collector\'', shell=True)
     pidList = []
     for pid in pgrep_output.splitlines():