Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
0663218b
Unverified
Commit
0663218b
authored
Apr 22, 2019
by
SparkSnail
Committed by
GitHub
Apr 22, 2019
Browse files
Merge pull request #163 from Microsoft/master
merge master
parents
6c9360a5
cf983800
Changes
116
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
491 additions
and
99 deletions
+491
-99
examples/trials/cifar10_pytorch/utils.py
examples/trials/cifar10_pytorch/utils.py
+1
-1
install.ps1
install.ps1
+127
-0
setup.py
setup.py
+3
-2
src/nni_manager/common/datastore.ts
src/nni_manager/common/datastore.ts
+1
-1
src/nni_manager/common/manager.ts
src/nni_manager/common/manager.ts
+1
-0
src/nni_manager/common/utils.ts
src/nni_manager/common/utils.ts
+102
-6
src/nni_manager/core/commands.ts
src/nni_manager/core/commands.ts
+3
-0
src/nni_manager/core/nnimanager.ts
src/nni_manager/core/nnimanager.ts
+16
-16
src/nni_manager/core/test/dataStore.test.ts
src/nni_manager/core/test/dataStore.test.ts
+1
-0
src/nni_manager/core/test/ipcInterface.test.ts
src/nni_manager/core/test/ipcInterface.test.ts
+10
-5
src/nni_manager/core/test/ipcInterfaceTerminate.test.ts
src/nni_manager/core/test/ipcInterfaceTerminate.test.ts
+2
-4
src/nni_manager/core/test/nnimanager.test.ts
src/nni_manager/core/test/nnimanager.test.ts
+2
-1
src/nni_manager/package.json
src/nni_manager/package.json
+1
-2
src/nni_manager/rest_server/restHandler.ts
src/nni_manager/rest_server/restHandler.ts
+11
-0
src/nni_manager/rest_server/test/mockedNNIManager.ts
src/nni_manager/rest_server/test/mockedNNIManager.ts
+3
-0
src/nni_manager/training_service/common/gpuData.ts
src/nni_manager/training_service/common/gpuData.ts
+8
-1
src/nni_manager/training_service/common/util.ts
src/nni_manager/training_service/common/util.ts
+134
-1
src/nni_manager/training_service/local/gpuScheduler.ts
src/nni_manager/training_service/local/gpuScheduler.ts
+18
-21
src/nni_manager/training_service/local/localTrainingService.ts
...ni_manager/training_service/local/localTrainingService.ts
+45
-36
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
...ng_service/remote_machine/remoteMachineTrainingService.ts
+2
-2
No files found.
examples/trials/cifar10_pytorch/utils.py
View file @
0663218b
...
...
@@ -43,7 +43,7 @@ def init_params(net):
term_width
=
0
try
:
_
,
term_width
=
os
.
popen
(
'stty size'
,
'r'
).
read
().
split
()
term_width
=
os
.
get_terminal_size
().
columns
except
Exception
as
exception
:
term_width
=
200
term_width
=
int
(
term_width
)
...
...
install.ps1
0 → 100644
View file @
0663218b
# Use TLS 1.2 for .NET web requests made by this PowerShell process.
[Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12
# Switches for the Node.js / Yarn unpack steps.
$install_node = $true
$install_yarn = $true

# nodejs
# Download locations of the Node.js zip and the Yarn tarball.
$nodeUrl = "https://aka.ms/nni/nodejs-download/win64"
$yarnUrl = "https://yarnpkg.com/latest.tar.gz"
# Wildcard patterns used later to locate the directories the archives unpack into.
$unzipNodeDir = "node-v*"
$unzipYarnDir = "yarn-v*"
# Per-user folder that holds the downloaded archives and their unpacked contents.
$NNI_DEPENDENCY_FOLDER = "C:\tmp\$env:USERNAME"
# Verify that python (>= 3.5) and pip are reachable through PATH before doing anything else.
$WHICH_PYTHON = where.exe python
if($WHICH_PYTHON -eq $null){
    throw "Can not find python"
}
else{
    # `python -V` prints e.g. "Python 3.10.4" (Python 2 prints it on stderr, hence 2>&1).
    $pyVersion = & python -V 2>&1
    # Parse major.minor numerically. Taking a fixed 3-character substring and casting it to
    # [double] would read "3.10" as 3.1 and wrongly reject Python 3.10 and newer.
    if(([string]$pyVersion) -match '(\d+)\.(\d+)'){
        $pyVersion = [version]("{0}.{1}" -f $Matches[1], $Matches[2])
    }
    else{
        throw "Can not determine python version"
    }
    if($pyVersion -lt [version]"3.5"){
        throw "python version should >= 3.5"
    }
}

$WHICH_PIP = where.exe pip
if($WHICH_PIP -eq $null){
    throw "Can not find pip"
}
# Set PYTHONIOENCODING for every python process started from this script.
$env:PYTHONIOENCODING = "UTF-8"
if($env:VIRTUAL_ENV){
    # A virtual environment is active: use its Scripts folder both as the python location
    # and as the folder that receives node.exe; the NNI package folder sits at the venv root.
    $NNI_PYTHON3 = $env:VIRTUAL_ENV + "\Scripts"
    $NNI_PKG_FOLDER = $env:VIRTUAL_ENV + "\nni"
    $NNI_PYTHON_SCRIPTS = $NNI_PYTHON3
}
else{
    # No virtual environment: ask python for the first entry of site.getsitepackages().
    # NOTE(review): presumably this is the interpreter's install root on Windows, so that
    # "$NNI_PYTHON3\python" resolves - confirm.
    $NNI_PYTHON3 = $(python -c 'import site; from pathlib import Path; print(Path(site.getsitepackages()[0]))')
    $NNI_PKG_FOLDER = $NNI_PYTHON3 + "\nni"
    $NNI_PYTHON_SCRIPTS = $NNI_PYTHON3 + "\Scripts"
}

# Command line handed to `cmd /c` later; the doubled quotes produce a quoted python path.
$PIP_INSTALL = """$NNI_PYTHON3\python"" -m pip install ."

# Create the dependency cache folder on first run.
if(!(Test-Path $NNI_DEPENDENCY_FOLDER)){
    New-Item $NNI_DEPENDENCY_FOLDER -ItemType Directory
}
# Archive and unpack locations inside the dependency cache folder.
$NNI_NODE_ZIP = $NNI_DEPENDENCY_FOLDER + "\nni-node.zip"
$NNI_NODE_FOLDER = $NNI_DEPENDENCY_FOLDER + "\nni-node"
$NNI_YARN_TARBALL = $NNI_DEPENDENCY_FOLDER + "\nni-yarn.tar.gz"
$NNI_YARN_FOLDER = $NNI_DEPENDENCY_FOLDER + "\nni-yarn"
$NNI_YARN = $NNI_YARN_FOLDER + "\bin\yarn"

## Version number
# The output of `git describe --tags` replaces the placeholder version string in
# setup.py and package.json further down.
$NNI_VERSION_VALUE = $(git describe --tags)
$NNI_VERSION_TEMPLATE = "999.0.0-developing"
# Download the Node.js zip unless it is already in the cache folder.
if(!(Test-Path $NNI_NODE_ZIP)){
    Write-Host "Downloading Node..."
    (New-Object Net.WebClient).DownloadFile($nodeUrl, $NNI_NODE_ZIP)
}

# Download the Yarn tarball unless it is already in the cache folder.
if(!(Test-Path $NNI_YARN_TARBALL)){
    Write-Host "Downloading Yarn..."
    (New-Object Net.WebClient).DownloadFile($yarnUrl, $NNI_YARN_TARBALL)
}

# -split takes a regex ('\\' matches one backslash) while -join inserts the literal
# two-character string '\\', so every backslash is doubled. The doubled form is what gets
# embedded in the Python string literals written to extract.py below.
$NNI_YARN_TARBALL = $NNI_YARN_TARBALL -split '\\' -join '\\'
$NNI_DEPENDENCY_FOLDER = $NNI_DEPENDENCY_FOLDER -split '\\' -join '\\'
$SCRIPT_PATH = $NNI_DEPENDENCY_FOLDER + '\extract.py'
# Lines of a small Python script that unpacks the Yarn tarball into the cache folder.
$SCRIPT = "import tarfile",
    ("tar = tarfile.open(""{0}"")" -f $NNI_YARN_TARBALL),
    ("tar.extractall(""{0}"")" -f $NNI_DEPENDENCY_FOLDER),
    "tar.close()"
[System.IO.File]::WriteAllLines($SCRIPT_PATH, $SCRIPT)

# Load the .NET zip support used by the Unzip function.
Add-Type -AssemblyName System.IO.Compression.FileSystem
# Extract every entry of the zip archive $zipfile into the directory $outpath.
# Relies on the System.IO.Compression.FileSystem assembly being loaded.
function Unzip([string]$zipfile, [string]$outpath)
{
    [System.IO.Compression.ZipFile]::ExtractToDirectory($zipfile, $outpath)
}
# Unpack Node.js and Yarn into the dependency cache folder.
# Each step is controlled by its own switch; previously $install_yarn was defined but never
# read and the Yarn step was gated on $install_node.
if ($install_node) {
    ### nodejs install
    if(!(Test-Path $NNI_NODE_FOLDER)){
        Unzip $NNI_NODE_ZIP $NNI_DEPENDENCY_FOLDER
        # Resolve the versioned "node-v*" directory and give it a stable name.
        $unzipNodeDir = Get-ChildItem "$NNI_DEPENDENCY_FOLDER\$unzipNodeDir"
        Rename-Item $unzipNodeDir "nni-node"
    }
    # Put node.exe next to the python scripts so it is found once that folder is on PATH.
    Copy-Item "$NNI_NODE_FOLDER\node.exe" $NNI_PYTHON_SCRIPTS -Recurse -Force
}

if ($install_yarn) {
    ### yarn install
    if(!(Test-Path $NNI_YARN_FOLDER)){
        # Run the generated extract.py with the selected python to unpack the tarball.
        cmd /C """$NNI_PYTHON3\python""" $SCRIPT_PATH
        # Resolve the versioned "yarn-v*" directory and give it a stable name.
        $unzipYarnDir = Get-ChildItem "$NNI_DEPENDENCY_FOLDER\$unzipYarnDir"
        Rename-Item $unzipYarnDir "nni-yarn"
    }
}
## install-python-modules:
### Installing Python SDK
# Stamp the real version into setup.py, then pip-install the current directory.
(Get-Content setup.py).replace($NNI_VERSION_TEMPLATE, $NNI_VERSION_VALUE) | Set-Content setup.py
cmd /c $PIP_INSTALL

# Building NNI Manager
# Prepend the scripts folder (which received node.exe) to PATH for the yarn runs below.
$env:PATH = $NNI_PYTHON_SCRIPTS + ';' + $env:PATH
cd src\nni_manager
cmd /c $NNI_YARN
cmd /c $NNI_YARN build
# Copy the config folder into the build output.
Copy-Item config -Destination .\dist\ -Recurse -Force
# Building WebUI
cd ..\webui
cmd /c $NNI_YARN
cmd /c $NNI_YARN build

# Back to the directory the script started in (two levels up from src\webui).
cd ..\..

## install-node-modules
# Ensure the package folder exists, then remove it so the copy below starts clean.
if(!(Test-Path $NNI_PKG_FOLDER)){
    New-Item $NNI_PKG_FOLDER -ItemType Directory
}
Remove-Item $NNI_PKG_FOLDER -Recurse -Force
Copy-Item "src\nni_manager\dist" $NNI_PKG_FOLDER -Recurse
Copy-Item "src\nni_manager\package.json" $NNI_PKG_FOLDER
$PKG_JSON = $NNI_PKG_FOLDER + "\package.json"
# Stamp the real version into the installed package.json.
(Get-Content $PKG_JSON).replace($NNI_VERSION_TEMPLATE, $NNI_VERSION_VALUE) | Set-Content $PKG_JSON
# Install production dependencies inside the package folder.
cmd /c $NNI_YARN --prod --cwd $NNI_PKG_FOLDER
# Copy the built WebUI into the package's static folder.
$NNI_PKG_FOLDER_STATIC = $NNI_PKG_FOLDER + "\static"
Copy-Item "src\webui\build" $NNI_PKG_FOLDER_STATIC -Recurse
setup.py
View file @
0663218b
...
...
@@ -51,11 +51,12 @@ setup(
'json_tricks'
,
'numpy'
,
'psutil'
,
'
py
yaml'
,
'
ruamel.
yaml'
,
'requests'
,
'scipy'
,
'schema'
,
'PythonWebHDFS'
'PythonWebHDFS'
,
'colorama'
],
entry_points
=
{
...
...
src/nni_manager/common/datastore.ts
View file @
0663218b
...
...
@@ -22,7 +22,7 @@
import
{
ExperimentProfile
,
TrialJobStatistics
}
from
'
./manager
'
;
import
{
TrialJobDetail
,
TrialJobStatus
}
from
'
./trainingService
'
;
type
TrialJobEvent
=
TrialJobStatus
|
'
USER_TO_CANCEL
'
|
'
ADD_CUSTOMIZED
'
|
'
ADD_HYPERPARAMETER
'
;
type
TrialJobEvent
=
TrialJobStatus
|
'
USER_TO_CANCEL
'
|
'
ADD_CUSTOMIZED
'
|
'
ADD_HYPERPARAMETER
'
|
'
IMPORT_DATA
'
;
type
MetricType
=
'
PERIODICAL
'
|
'
FINAL
'
|
'
CUSTOM
'
|
'
REQUEST_PARAMETER
'
;
interface
ExperimentProfileRecord
{
...
...
src/nni_manager/common/manager.ts
View file @
0663218b
...
...
@@ -99,6 +99,7 @@ abstract class Manager {
public
abstract
stopExperiment
():
Promise
<
void
>
;
public
abstract
getExperimentProfile
():
Promise
<
ExperimentProfile
>
;
public
abstract
updateExperimentProfile
(
experimentProfile
:
ExperimentProfile
,
updateType
:
ProfileUpdateType
):
Promise
<
void
>
;
public
abstract
importData
(
data
:
string
):
Promise
<
void
>
;
public
abstract
addCustomizedTrialJob
(
hyperParams
:
string
):
Promise
<
void
>
;
public
abstract
cancelTrialJobByUser
(
trialJobId
:
string
):
Promise
<
void
>
;
...
...
src/nni_manager/common/utils.ts
View file @
0663218b
...
...
@@ -22,6 +22,8 @@
import
*
as
assert
from
'
assert
'
;
import
{
randomBytes
}
from
'
crypto
'
;
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
cp
from
'
child_process
'
;
import
{
ChildProcess
,
spawn
,
StdioOptions
}
from
'
child_process
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
os
from
'
os
'
;
import
*
as
path
from
'
path
'
;
...
...
@@ -32,6 +34,7 @@ import * as util from 'util';
import
{
Database
,
DataStore
}
from
'
./datastore
'
;
import
{
ExperimentStartupInfo
,
getExperimentId
,
getExperimentStartupInfo
,
setExperimentStartupInfo
}
from
'
./experimentStartupInfo
'
;
import
{
Manager
}
from
'
./manager
'
;
import
{
TrialConfig
}
from
'
../training_service/common/trialConfig
'
;
import
{
HyperParameters
,
TrainingService
,
TrialJobStatus
}
from
'
./trainingService
'
;
import
{
getLogger
}
from
'
./log
'
;
...
...
@@ -146,6 +149,23 @@ function parseArg(names: string[]): string {
return
''
;
}
/**
 * Encode class arguments for use inside a command line string.
 * On win32 the value is JSON-encoded once. On every other platform the JSON text is
 * JSON-encoded a second time, which wraps it in double quotes and escapes the inner ones.
 */
function encodeCmdLineArgs(args: any): any {
    const encodedOnce: string = JSON.stringify(args);

    return process.platform === 'win32' ? encodedOnce : JSON.stringify(encodedOnce);
}
/**
 * Name of the Python executable for the current platform:
 * 'python' on win32, 'python3' everywhere else.
 */
function getCmdPy(): string {
    return process.platform === 'win32' ? 'python' : 'python3';
}
/**
* Generate command line to start automl algorithm(s),
* either start advisor or start a process which runs tuner and assessor
...
...
@@ -179,8 +199,7 @@ function getMsgDispatcherCommand(tuner: any, assessor: any, advisor: any, multiP
if
(
!
tuner
&&
!
advisor
)
{
throw
new
Error
(
'
Error: specify neither tuner nor advisor is not allowed
'
);
}
let
command
:
string
=
`python3 -m nni`
;
let
command
:
string
=
`
${
getCmdPy
()}
-m nni`
;
if
(
multiPhase
)
{
command
+=
'
--multi_phase
'
;
}
...
...
@@ -192,7 +211,7 @@ function getMsgDispatcherCommand(tuner: any, assessor: any, advisor: any, multiP
if
(
advisor
)
{
command
+=
` --advisor_class_name
${
advisor
.
className
}
`
;
if
(
advisor
.
classArgs
!==
undefined
)
{
command
+=
` --advisor_args
${
JSON
.
stringify
(
JSON
.
stringify
(
advisor
.
classArgs
)
)
}
`
;
command
+=
` --advisor_args
${
encodeCmdLineArgs
(
advisor
.
classArgs
)}
`
;
}
if
(
advisor
.
codeDir
!==
undefined
&&
advisor
.
codeDir
.
length
>
1
)
{
command
+=
` --advisor_directory
${
advisor
.
codeDir
}
`
;
...
...
@@ -203,7 +222,7 @@ function getMsgDispatcherCommand(tuner: any, assessor: any, advisor: any, multiP
}
else
{
command
+=
` --tuner_class_name
${
tuner
.
className
}
`
;
if
(
tuner
.
classArgs
!==
undefined
)
{
command
+=
` --tuner_args
${
JSON
.
stringify
(
JSON
.
stringify
(
tuner
.
classArgs
)
)
}
`
;
command
+=
` --tuner_args
${
encodeCmdLineArgs
(
tuner
.
classArgs
)}
`
;
}
if
(
tuner
.
codeDir
!==
undefined
&&
tuner
.
codeDir
.
length
>
1
)
{
command
+=
` --tuner_directory
${
tuner
.
codeDir
}
`
;
...
...
@@ -215,7 +234,7 @@ function getMsgDispatcherCommand(tuner: any, assessor: any, advisor: any, multiP
if
(
assessor
!==
undefined
&&
assessor
.
className
!==
undefined
)
{
command
+=
` --assessor_class_name
${
assessor
.
className
}
`
;
if
(
assessor
.
classArgs
!==
undefined
)
{
command
+=
` --assessor_args
${
JSON
.
stringify
(
JSON
.
stringify
(
assessor
.
classArgs
)
)
}
`
;
command
+=
` --assessor_args
${
encodeCmdLineArgs
(
assessor
.
classArgs
)}
`
;
}
if
(
assessor
.
codeDir
!==
undefined
&&
assessor
.
codeDir
.
length
>
1
)
{
command
+=
` --assessor_directory
${
assessor
.
codeDir
}
`
;
...
...
@@ -363,6 +382,83 @@ async function getVersion(): Promise<string> {
return
deferred
.
promise
;
}
/**
 * Run a command as a ChildProcess.
 *
 * On win32 the command string is split on spaces into an executable and its arguments and
 * spawned without a shell. On every other platform the whole string is handed to a shell.
 *
 * @param command full command line to run
 * @param stdio stdio configuration passed through to spawn
 * @param newCwd working directory of the child process
 * @param newEnv environment of the child process
 * @returns the spawned ChildProcess
 */
function getTunerProc(command: string, stdio: StdioOptions, newCwd: string, newEnv: any): ChildProcess {
    if (process.platform !== 'win32') {
        return spawn(command, [], { stdio, cwd: newCwd, env: newEnv, shell: true });
    }

    // Drop empty tokens: a command without arguments, or one with leading/repeated spaces,
    // would otherwise hand empty-string arguments (or an empty executable) to spawn.
    const tokens: string[] = command.split(' ').filter((token: string) => token.length > 0);
    const executable: string = tokens.length > 0 ? tokens[0] : command;

    return spawn(executable, tokens.slice(1), { stdio, cwd: newCwd, env: newEnv, shell: false });
}
/**
 * Judge whether the process with the given pid is alive.
 * win32: alive when `Get-Process -Id <pid>` prints anything; a failing command counts as dead.
 * other platforms: alive when `kill -0 <pid>` succeeds.
 * Never rejects; every failure is reported as `false`.
 */
async function isAlive(pid: any): Promise<boolean> {
    if (process.platform === 'win32') {
        try {
            const output: string = cp
                .execSync(`powershell.exe Get-Process -Id ${pid} -ErrorAction SilentlyContinue`)
                .toString();

            return output.length > 0;
        } catch (error) {
            return false;
        }
    }

    try {
        await cpp.exec(`kill -0 ${pid}`);

        return true;
    } catch (error) {
        // the probe failed, so the process is reported as not alive
        return false;
    }
}
/**
 * Kill the process with the given pid.
 * Uses `taskkill /PID <pid> /F` on win32 and `kill -9 <pid>` elsewhere.
 * Errors from the kill command are swallowed, so the returned promise always resolves.
 */
async function killPid(pid: any): Promise<void> {
    const onWindows: boolean = process.platform === 'win32';
    try {
        await cpp.exec(onWindows ? `cmd /c taskkill /PID ${pid} /F` : `kill -9 ${pid}`);
    } catch (error) {
        // pid does not exist, do nothing here
    }
}
/**
 * Line terminator for the current platform: "\r\n" on win32, "\n" everywhere else.
 */
function getNewLine(): string {
    return process.platform === 'win32' ? '\r\n' : '\n';
}
export
{
countFilesRecursively
,
getRemoteTmpDir
,
generateParamFileName
,
getMsgDispatcherCommand
,
getCheckpointDir
,
getLogDir
,
getExperimentRootDir
,
getJobCancelStatus
,
getDefaultDatabaseDir
,
getIPV4Address
,
mkDirP
,
delay
,
prepareUnitTest
,
parseArg
,
cleanupUnitTest
,
uniqueString
,
randomSelect
,
getLogLevel
,
getVersion
};
mkDirP
,
delay
,
prepareUnitTest
,
parseArg
,
cleanupUnitTest
,
uniqueString
,
randomSelect
,
getLogLevel
,
getVersion
,
getCmdPy
,
getTunerProc
,
isAlive
,
killPid
,
getNewLine
};
src/nni_manager/core/commands.ts
View file @
0663218b
...
...
@@ -22,6 +22,7 @@ const INITIALIZE = 'IN';
const
REQUEST_TRIAL_JOBS
=
'
GE
'
;
const
REPORT_METRIC_DATA
=
'
ME
'
;
const
UPDATE_SEARCH_SPACE
=
'
SS
'
;
const
IMPORT_DATA
=
'
FD
'
const
ADD_CUSTOMIZED_TRIAL_JOB
=
'
AD
'
;
const
TRIAL_END
=
'
EN
'
;
const
TERMINATE
=
'
TE
'
;
...
...
@@ -38,6 +39,7 @@ const TUNER_COMMANDS: Set<string> = new Set([
REQUEST_TRIAL_JOBS
,
REPORT_METRIC_DATA
,
UPDATE_SEARCH_SPACE
,
IMPORT_DATA
,
ADD_CUSTOMIZED_TRIAL_JOB
,
TERMINATE
,
PING
,
...
...
@@ -62,6 +64,7 @@ export {
REQUEST_TRIAL_JOBS
,
REPORT_METRIC_DATA
,
UPDATE_SEARCH_SPACE
,
IMPORT_DATA
,
ADD_CUSTOMIZED_TRIAL_JOB
,
TRIAL_END
,
TERMINATE
,
...
...
src/nni_manager/core/nnimanager.ts
View file @
0663218b
...
...
@@ -35,10 +35,10 @@ import {
import
{
TrainingService
,
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobMetric
,
TrialJobStatus
}
from
'
../common/trainingService
'
;
import
{
delay
,
getCheckpointDir
,
getExperimentRootDir
,
getLogDir
,
getMsgDispatcherCommand
,
mkDirP
,
get
LogLevel
}
from
'
../common/utils
'
;
import
{
delay
,
getCheckpointDir
,
getExperimentRootDir
,
getLogDir
,
getMsgDispatcherCommand
,
mkDirP
,
get
TunerProc
,
getLogLevel
,
isAlive
,
killPid
}
from
'
../common/utils
'
;
import
{
ADD_CUSTOMIZED_TRIAL_JOB
,
INITIALIZE
,
INITIALIZED
,
KILL_TRIAL_JOB
,
NEW_TRIAL_JOB
,
NO_MORE_TRIAL_JOBS
,
PING
,
REPORT_METRIC_DATA
,
REQUEST_TRIAL_JOBS
,
SEND_TRIAL_JOB_PARAMETER
,
TERMINATE
,
TRIAL_END
,
UPDATE_SEARCH_SPACE
REPORT_METRIC_DATA
,
REQUEST_TRIAL_JOBS
,
SEND_TRIAL_JOB_PARAMETER
,
TERMINATE
,
TRIAL_END
,
UPDATE_SEARCH_SPACE
,
IMPORT_DATA
}
from
'
./commands
'
;
import
{
createDispatcherInterface
,
IpcInterface
}
from
'
./ipcInterface
'
;
...
...
@@ -99,6 +99,17 @@ class NNIManager implements Manager {
return
this
.
storeExperimentProfile
();
}
/**
 * Send data to the dispatcher with the IMPORT_DATA command and record an 'IMPORT_DATA'
 * trial job event (with an empty trial job id) in the data store.
 * @param data payload forwarded unchanged to the dispatcher and to the data store
 * @returns the promise of the data store write; rejects with
 *          Error('tuner has not been setup') when no dispatcher exists yet
 */
public importData(data: string): Promise<void> {
    if (this.dispatcher !== undefined) {
        this.dispatcher.sendCommand(IMPORT_DATA, data);

        return this.dataStore.storeTrialJobEvent('IMPORT_DATA', '', data);
    }

    return Promise.reject(new Error('tuner has not been setup'));
}
public
addCustomizedTrialJob
(
hyperParams
:
string
):
Promise
<
void
>
{
if
(
this
.
currSubmittedTrialNum
>=
this
.
experimentProfile
.
params
.
maxTrialNum
)
{
return
Promise
.
reject
(
...
...
@@ -290,12 +301,7 @@ class NNIManager implements Manager {
NNI_INCLUDE_INTERMEDIATE_RESULTS
:
includeIntermediateResultsEnv
};
let
newEnv
=
Object
.
assign
({},
process
.
env
,
nniEnv
);
const
tunerProc
:
ChildProcess
=
spawn
(
command
,
[],
{
stdio
,
cwd
:
newCwd
,
env
:
newEnv
,
shell
:
true
});
const
tunerProc
:
ChildProcess
=
getTunerProc
(
command
,
stdio
,
newCwd
,
newEnv
);
this
.
dispatcherPid
=
tunerProc
.
pid
;
this
.
dispatcher
=
createDispatcherInterface
(
tunerProc
);
...
...
@@ -341,16 +347,10 @@ class NNIManager implements Manager {
// gracefully terminate tuner and assessor here, wait at most 30 seconds.
for
(
let
i
:
number
=
0
;
i
<
30
;
i
++
)
{
if
(
!
tunerAlive
)
{
break
;
}
try
{
await
cpp
.
exec
(
`kill -0
${
this
.
dispatcherPid
}
`
);
}
catch
(
error
)
{
tunerAlive
=
false
;
}
tunerAlive
=
await
isAlive
(
this
.
dispatcherPid
);
await
delay
(
1000
);
}
try
{
await
cpp
.
exec
(
`kill -9
${
this
.
dispatcherPid
}
`
);
}
catch
(
error
)
{
// this.tunerPid does not exist, do nothing here
}
await
killPid
(
this
.
dispatcherPid
);
const
trialJobList
:
TrialJobDetail
[]
=
await
this
.
trainingService
.
listTrialJobs
();
// TO DO: to promise all
for
(
const
trialJob
of
trialJobList
)
{
...
...
src/nni_manager/core/test/dataStore.test.ts
View file @
0663218b
...
...
@@ -42,6 +42,7 @@ describe('Unit test for dataStore', () => {
});
after
(()
=>
{
ds
.
close
();
cleanupUnitTest
();
});
...
...
src/nni_manager/core/test/ipcInterface.test.ts
View file @
0663218b
...
...
@@ -18,11 +18,10 @@
*/
'
use strict
'
;
import
*
as
assert
from
'
assert
'
;
import
{
ChildProcess
,
spawn
,
StdioOptions
}
from
'
child_process
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
cleanupUnitTest
,
prepareUnitTest
}
from
'
../../common/utils
'
;
import
{
cleanupUnitTest
,
prepareUnitTest
,
getTunerProc
,
getCmdPy
}
from
'
../../common/utils
'
;
import
*
as
CommandType
from
'
../commands
'
;
import
{
createDispatcherInterface
,
IpcInterface
}
from
'
../ipcInterface
'
;
import
{
NNIError
}
from
'
../../common/errors
'
;
...
...
@@ -39,15 +38,21 @@ function runProcess(): Promise<Error | null> {
// create fake assessor process
const
stdio
:
StdioOptions
=
[
'
ignore
'
,
'
pipe
'
,
process
.
stderr
,
'
pipe
'
,
'
pipe
'
];
const
proc
:
ChildProcess
=
spawn
(
'
python3 assessor.py
'
,
[],
{
stdio
,
cwd
:
'
core/test
'
,
shell
:
true
})
;
const
command
:
string
=
getCmdPy
()
+
'
assessor.py
'
;
const
proc
:
ChildProcess
=
getTunerProc
(
command
,
stdio
,
'
core/test
'
,
process
.
env
);
// record its sent/received commands on exit
proc
.
on
(
'
error
'
,
(
error
:
Error
):
void
=>
{
deferred
.
resolve
(
error
);
});
proc
.
on
(
'
exit
'
,
(
code
:
number
):
void
=>
{
if
(
code
!==
0
)
{
deferred
.
resolve
(
new
Error
(
`return code:
${
code
}
`
));
}
else
{
sentCommands
=
proc
.
stdout
.
read
().
toString
().
split
(
'
\n
'
);
let
str
=
proc
.
stdout
.
read
().
toString
();
if
(
str
.
search
(
"
\r\n
"
)
!=-
1
){
sentCommands
=
str
.
split
(
"
\r\n
"
);
}
else
{
sentCommands
=
str
.
split
(
'
\n
'
);
}
deferred
.
resolve
(
null
);
}
});
...
...
src/nni_manager/core/test/ipcInterfaceTerminate.test.ts
View file @
0663218b
...
...
@@ -22,7 +22,7 @@
import
*
as
assert
from
'
assert
'
;
import
{
ChildProcess
,
spawn
,
StdioOptions
}
from
'
child_process
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
cleanupUnitTest
,
prepareUnitTest
,
getMsgDispatcherCommand
}
from
'
../../common/utils
'
;
import
{
cleanupUnitTest
,
prepareUnitTest
,
getMsgDispatcherCommand
,
getTunerProc
}
from
'
../../common/utils
'
;
import
*
as
CommandType
from
'
../commands
'
;
import
{
createDispatcherInterface
,
IpcInterface
}
from
'
../ipcInterface
'
;
...
...
@@ -50,9 +50,7 @@ function startProcess(): void {
// advisor
undefined
);
const
proc
:
ChildProcess
=
spawn
(
dispatcherCmd
,
[],
{
stdio
,
cwd
:
'
core/test
'
,
shell
:
true
});
const
proc
:
ChildProcess
=
getTunerProc
(
dispatcherCmd
,
stdio
,
'
core/test
'
,
process
.
env
);
proc
.
on
(
'
error
'
,
(
error
:
Error
):
void
=>
{
procExit
=
true
;
procError
=
true
;
...
...
src/nni_manager/core/test/nnimanager.test.ts
View file @
0663218b
...
...
@@ -33,6 +33,7 @@ import { NNIManager } from '../nnimanager';
import
{
SqlDB
}
from
'
../sqlDatabase
'
;
import
{
MockedTrainingService
}
from
'
./mockedTrainingService
'
;
import
{
MockedDataStore
}
from
'
./mockedDatastore
'
;
import
*
as
path
from
'
path
'
;
async
function
initContainer
():
Promise
<
void
>
{
prepareUnitTest
();
...
...
@@ -183,7 +184,7 @@ describe('Unit test for nnimanager', function () {
it
(
'
test getExperimentProfile
'
,
()
=>
{
return
nniManager
.
getExperimentProfile
().
then
((
experimentProfile
)
=>
{
expect
(
experimentProfile
.
id
).
to
.
be
.
equal
(
'
unittest
'
);
expect
(
experimentProfile
.
logDir
).
to
.
be
.
equal
(
os
.
homedir
()
+
'
/nni/
experiments
/
unittest
'
);
expect
(
experimentProfile
.
logDir
).
to
.
be
.
equal
(
path
.
join
(
os
.
homedir
()
,
'
nni
'
,
'
experiments
'
,
'
unittest
'
)
)
;
}).
catch
((
error
)
=>
{
assert
.
fail
(
error
);
...
...
src/nni_manager/package.json
View file @
0663218b
...
...
@@ -3,7 +3,6 @@
"version"
:
"999.0.0-developing"
,
"main"
:
"index.js"
,
"scripts"
:
{
"postbuild"
:
"cp -rf config ./dist/"
,
"build"
:
"tsc"
,
"test"
:
"nyc mocha -r ts-node/register -t 15000 --recursive **/*.test.ts --exclude node_modules/**/**/*.test.ts --colors"
,
"start"
:
"node dist/main.js"
,
...
...
@@ -35,7 +34,7 @@
"@types/express"
:
"^4.16.0"
,
"@types/glob"
:
"^7.1.1"
,
"@types/mocha"
:
"^5.2.5"
,
"@types/node"
:
"
^
10.12.18"
,
"@types/node"
:
"10.12.18"
,
"@types/request"
:
"^2.47.1"
,
"@types/rx"
:
"^4.1.1"
,
"@types/sqlite3"
:
"^3.1.3"
,
...
...
src/nni_manager/rest_server/restHandler.ts
View file @
0663218b
...
...
@@ -63,6 +63,7 @@ class NNIRestHandler {
this
.
checkStatus
(
router
);
this
.
getExperimentProfile
(
router
);
this
.
updateExperimentProfile
(
router
);
this
.
importData
(
router
);
this
.
startExperiment
(
router
);
this
.
getTrialJobStatistics
(
router
);
this
.
setClusterMetaData
(
router
);
...
...
@@ -145,6 +146,16 @@ class NNIRestHandler {
});
}
/**
 * Register the POST /experiment/import-data route.
 * The request body is serialized to JSON and passed to nniManager.importData; success yields
 * an empty response, failure is delegated to handle_error.
 */
private importData(router: Router): void {
    const onImportRequest = (req: Request, res: Response): void => {
        this.nniManager.importData(JSON.stringify(req.body))
            .then(() => {
                res.send();
            })
            .catch((err: Error) => {
                this.handle_error(err, res);
            });
    };

    router.post('/experiment/import-data', onImportRequest);
}
private
startExperiment
(
router
:
Router
):
void
{
router
.
post
(
'
/experiment
'
,
expressJoi
(
ValidationSchemas
.
STARTEXPERIMENT
),
(
req
:
Request
,
res
:
Response
)
=>
{
if
(
isNewExperiment
())
{
...
...
src/nni_manager/rest_server/test/mockedNNIManager.ts
View file @
0663218b
...
...
@@ -46,6 +46,9 @@ export class MockedNNIManager extends Manager {
public
updateExperimentProfile
(
experimentProfile
:
ExperimentProfile
,
updateType
:
ProfileUpdateType
):
Promise
<
void
>
{
return
Promise
.
resolve
();
}
/**
 * Mocked importData: ignores the data and returns an already resolved promise.
 */
public importData(data: string): Promise<void> {
    return Promise.resolve();
}
public
getTrialJobStatistics
():
Promise
<
TrialJobStatistics
[]
>
{
const
deferred
:
Deferred
<
TrialJobStatistics
[]
>
=
new
Deferred
<
TrialJobStatistics
[]
>
();
deferred
.
resolve
([{
...
...
src/nni_manager/training_service/common/gpuData.ts
View file @
0663218b
...
...
@@ -59,10 +59,17 @@ export class GPUSummary {
}
}
export
const
GPU_INFO_COLLECTOR_FORMAT
:
string
=
export
const
GPU_INFO_COLLECTOR_FORMAT
_LINUX
:
string
=
`
#!/bin/bash
export METRIC_OUTPUT_DIR={0}
echo $$ >{1}
python3 -m nni_gpu_tool.gpu_metrics_collector
`
export
const
GPU_INFO_COLLECTOR_FORMAT_WINDOWS
:
string
=
`
$env:METRIC_OUTPUT_DIR="{0}"
$app = Start-Process "python" -ArgumentList "-m nni_gpu_tool.gpu_metrics_collector" -passthru -NoNewWindow
Write $app.ID | Out-File {1} -NoNewline -encoding utf8
`
\ No newline at end of file
src/nni_manager/training_service/common/util.ts
View file @
0663218b
...
...
@@ -22,6 +22,12 @@ import { getLogger } from "common/log";
'
use strict
'
;
import
{
countFilesRecursively
}
from
'
../../common/utils
'
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
cp
from
'
child_process
'
;
import
{
GPU_INFO_COLLECTOR_FORMAT_LINUX
,
GPU_INFO_COLLECTOR_FORMAT_WINDOWS
}
from
'
./gpuData
'
import
*
as
path
from
'
path
'
;
import
{
String
}
from
'
typescript-string-operations
'
;
import
{
file
}
from
"
../../node_modules/@types/tmp
"
;
/**
* Validate codeDir, calculate file count recursively under codeDir, and throw error if any rule is broken
...
...
@@ -46,3 +52,130 @@ export async function validateCodeDir(codeDir: string) : Promise<number> {
return
fileCount
;
}
/**
 * Create a new directory by running the platform's shell command
 * (`New-Item ... -ItemType "directory" -Force` on win32, `mkdir -p` elsewhere).
 * NOTE(review): the path is interpolated into the command unquoted, so a path containing
 * spaces is presumably split into several arguments - confirm against callers.
 * @param directory path of the directory to create
 */
export async function execMkdir(directory: string): Promise<void> {
    const mkdirCommand: string = process.platform === 'win32'
        ? `powershell.exe New-Item -Path ${directory} -ItemType "directory" -Force`
        : `mkdir -p ${directory}`;

    await cpp.exec(mkdirCommand);
}
/**
 * Create a new file by running the platform's shell command
 * (`New-Item ... -ItemType "file" -Force` on win32, `touch` elsewhere).
 * @param filename path of the file to create
 */
export async function execNewFile(filename: string): Promise<void> {
    const newFileCommand: string = process.platform === 'win32'
        ? `powershell.exe New-Item -Path ${filename} -ItemType "file" -Force`
        : `touch ${filename}`;

    await cpp.exec(newFileCommand);
}
/**
 * Run a script file with the platform's interpreter
 * (`powershell.exe -file` on win32, `bash` elsewhere) without waiting for it to finish.
 * @param filePath path of the script to run
 * @returns the ChildProcess created by child_process.exec
 */
export function execScript(filePath: string): cp.ChildProcess {
    const interpreter: string = process.platform === 'win32' ? 'powershell.exe -file' : 'bash';

    return cp.exec(`${interpreter} ${filePath}`);
}
/**
 * Read the last line of a file through the platform's shell command
 * (`Get-Content ... -Tail 1` on win32, `tail -n 1` elsewhere).
 * @param filePath path of the file to read
 * @returns the result of the executed command
 */
export async function execTail(filePath: string): Promise<cpp.childProcessPromise.Result> {
    const tailCommand: string = process.platform === 'win32'
        ? `powershell.exe Get-Content ${filePath} -Tail 1`
        : `tail -n 1 ${filePath}`;
    const tailResult: cpp.childProcessPromise.Result = await cpp.exec(tailCommand);

    return tailResult;
}
/**
 * Delete a directory and everything inside it.
 * Runs `Remove-Item -Recurse -Force` on win32 and `rm -rf` elsewhere.
 * @param directory path of the directory to delete
 */
export async function execRemove(directory: string): Promise<void> {
    if (process.platform === 'win32') {
        // -Recurse -Force mirrors `rm -rf`: without them Remove-Item does not delete a
        // non-empty directory non-interactively.
        await cpp.exec(`powershell.exe Remove-Item ${directory} -Recurse -Force`);
    } else {
        await cpp.exec(`rm -rf ${directory}`);
    }
}
/**
 * Kill processes by pid through the platform's shell command:
 * `taskkill /PID <pid> /T /F` on win32, `pkill -P <pid>` elsewhere.
 * @param pid process id handed to the kill command
 */
export async function execKill(pid: string): Promise<void> {
    const killCommand: string = process.platform === 'win32'
        ? `cmd /c taskkill /PID ${pid} /T /F`
        : `pkill -P ${pid}`;

    await cpp.exec(killCommand);
}
/**
 * Build the shell statement that sets an environment variable.
 * @param variable key/value pair to set
 * @returns `$env:KEY="VALUE"` on win32, `export KEY=VALUE` elsewhere
 */
export function setEnvironmentVariable(variable: { key: string; value: string }): string {
    const { key, value } = variable;

    return process.platform === 'win32' ? `$env:${key}="${value}"` : `export ${key}=${value}`;
}
/**
 * Generate a script file name for the current platform.
 * @param fileNamePrefix file name without extension
 * @returns the prefix followed by '.ps1' on win32 or '.sh' elsewhere
 */
export function getScriptName(fileNamePrefix: string): string {
    const extension: string = process.platform === 'win32' ? '.ps1' : '.sh';

    return fileNamePrefix + extension;
}
/**
 * Generate the content of the GPU metrics collector script for the current platform.
 * The platform's template is filled with the script folder ({0}) and the path of the
 * 'pid' file inside that folder ({1}).
 * @param gpuMetricCollectorScriptFolder folder substituted into the template
 * @returns the formatted script text
 */
export function getgpuMetricsCollectorScriptContent(gpuMetricCollectorScriptFolder: string): string {
    const template: string = process.platform === 'win32'
        ? GPU_INFO_COLLECTOR_FORMAT_WINDOWS
        : GPU_INFO_COLLECTOR_FORMAT_LINUX;
    const pidFilePath: string = path.join(gpuMetricCollectorScriptFolder, 'pid');

    return String.Format(template, gpuMetricCollectorScriptFolder, pidFilePath);
}
src/nni_manager/training_service/local/gpuScheduler.ts
View file @
0663218b
...
...
@@ -25,9 +25,10 @@ import * as fs from 'fs';
import
*
as
os
from
'
os
'
;
import
*
as
path
from
'
path
'
;
import
{
String
}
from
'
typescript-string-operations
'
;
import
{
execMkdir
,
getScriptName
,
getgpuMetricsCollectorScriptContent
,
execScript
,
execTail
,
execRemove
,
execKill
}
from
'
../common/util
'
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
delay
}
from
'
../../common/utils
'
;
import
{
GPU_INFO_COLLECTOR_FORMAT
,
GPUInfo
,
GPUSummary
}
from
'
../common/gpuData
'
;
import
{
GPUInfo
,
GPUSummary
}
from
'
../common/gpuData
'
;
/**
* GPUScheduler for local training service
...
...
@@ -57,6 +58,19 @@ class GPUScheduler {
}
}
/**
 * Create the collector script folder, write the platform-specific GPU metrics collector
 * script into it and start the script without waiting for it to finish.
 */
private async runGpuMetricsCollectorScript(): Promise<void> {
    await execMkdir(this.gpuMetricCollectorScriptFolder);

    // generate gpu_metrics_collector script
    const scriptFileName: string = getScriptName('gpu_metrics_collector');
    const scriptPath: string = path.join(this.gpuMetricCollectorScriptFolder, scriptFileName);
    const scriptContent: string = getgpuMetricsCollectorScriptContent(this.gpuMetricCollectorScriptFolder);
    await fs.promises.writeFile(scriptPath, scriptContent, { encoding: 'utf8' });

    execScript(scriptPath);
}
public
getAvailableGPUIndices
():
number
[]
{
if
(
this
.
gpuSummary
!==
undefined
)
{
return
this
.
gpuSummary
.
gpuInfos
.
filter
((
info
:
GPUInfo
)
=>
info
.
activeProcessNum
===
0
)
...
...
@@ -78,33 +92,16 @@ class GPUScheduler {
this
.
stopping
=
true
;
try
{
const
pid
:
string
=
await
fs
.
promises
.
readFile
(
path
.
join
(
this
.
gpuMetricCollectorScriptFolder
,
'
pid
'
),
'
utf8
'
);
await
cpp
.
exec
(
`pkill -P
${
pid
}
`
);
await
cpp
.
exec
(
`rm -rf
${
this
.
gpuMetricCollectorScriptFolder
}
`
);
await
execKill
(
pid
);
await
execRemove
(
this
.
gpuMetricCollectorScriptFolder
);
}
catch
(
error
)
{
this
.
log
.
error
(
`GPU scheduler error:
${
error
}
`
);
}
}
/**
* Generate gpu metric collector shell script in local machine,
* used to run in remote machine, and will be deleted after uploaded from local.
*/
private
async
runGpuMetricsCollectorScript
():
Promise
<
void
>
{
await
cpp
.
exec
(
`mkdir -p
${
this
.
gpuMetricCollectorScriptFolder
}
`
);
//generate gpu_metrics_collector.sh
const
gpuMetricsCollectorScriptPath
:
string
=
path
.
join
(
this
.
gpuMetricCollectorScriptFolder
,
'
gpu_metrics_collector.sh
'
);
const
gpuMetricsCollectorScriptContent
:
string
=
String
.
Format
(
GPU_INFO_COLLECTOR_FORMAT
,
this
.
gpuMetricCollectorScriptFolder
,
path
.
join
(
this
.
gpuMetricCollectorScriptFolder
,
'
pid
'
)
);
await
fs
.
promises
.
writeFile
(
gpuMetricsCollectorScriptPath
,
gpuMetricsCollectorScriptContent
,
{
encoding
:
'
utf8
'
});
cp
.
exec
(
`bash
${
gpuMetricsCollectorScriptPath
}
`
);
}
private
async
updateGPUSummary
():
Promise
<
void
>
{
const
cmdresult
:
cpp
.
childProcessPromise
.
Result
=
await
cpp
.
exec
(
`tail -n 1
${
path
.
join
(
this
.
gpuMetricCollectorScriptFolder
,
'
gpu_metrics
'
)
}
`
);
await
execTail
(
path
.
join
(
this
.
gpuMetricCollectorScriptFolder
,
'
gpu_metrics
'
));
if
(
cmdresult
&&
cmdresult
.
stdout
)
{
this
.
gpuSummary
=
<
GPUSummary
>
JSON
.
parse
(
cmdresult
.
stdout
);
}
else
{
...
...
src/nni_manager/training_service/local/localTrainingService.ts
View file @
0663218b
...
...
@@ -18,7 +18,6 @@
*/
'
use strict
'
;
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
cp
from
'
child_process
'
;
import
{
EventEmitter
}
from
'
events
'
;
...
...
@@ -32,7 +31,8 @@ import {
HostJobApplicationForm
,
HyperParameters
,
JobApplicationForm
,
TrainingService
,
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobMetric
,
TrialJobStatus
}
from
'
../../common/trainingService
'
;
import
{
delay
,
generateParamFileName
,
getExperimentRootDir
,
getJobCancelStatus
,
uniqueString
}
from
'
../../common/utils
'
;
import
{
delay
,
generateParamFileName
,
getExperimentRootDir
,
getJobCancelStatus
,
uniqueString
,
isAlive
,
getNewLine
}
from
'
../../common/utils
'
;
import
{
execMkdir
,
getScriptName
,
execScript
,
setEnvironmentVariable
,
execNewFile
}
from
'
../common/util
'
import
{
TrialConfig
}
from
'
../common/trialConfig
'
;
import
{
TrialConfigMetadataKey
}
from
'
../common/trialConfigMetadataKey
'
;
import
{
GPUScheduler
}
from
'
./gpuScheduler
'
;
...
...
@@ -169,14 +169,7 @@ class LocalTrainingService implements TrainingService {
return
this
.
getHostJob
(
trialJobId
);
}
if
(
trialJob
.
status
===
'
RUNNING
'
)
{
let
alive
:
boolean
=
false
;
try
{
await
cpp
.
exec
(
`kill -0
${
trialJob
.
pid
}
`
);
alive
=
true
;
}
catch
(
error
)
{
//ignore
}
let
alive
:
boolean
=
await
isAlive
(
trialJob
.
pid
);
if
(
!
alive
)
{
trialJob
.
endTime
=
Date
.
now
();
this
.
setTrialJobStatus
(
trialJob
,
'
FAILED
'
);
...
...
@@ -284,7 +277,9 @@ class LocalTrainingService implements TrainingService {
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
if
(
!
this
.
initialized
)
{
this
.
rootDir
=
getExperimentRootDir
();
await
cpp
.
exec
(
`mkdir -p
${
this
.
rootDir
}
`
);
if
(
!
fs
.
existsSync
(
this
.
rootDir
)){
await
cpp
.
exec
(
`powershell.exe mkdir
${
this
.
rootDir
}
`
);
}
this
.
initialized
=
true
;
}
switch
(
key
)
{
...
...
@@ -369,7 +364,7 @@ class LocalTrainingService implements TrainingService {
private
getEnvironmentVariables
(
trialJobDetail
:
TrialJobDetail
,
resource
?
:
{
gpuIndices
:
number
[]
}):
{
key
:
string
;
value
:
string
}[]
{
resource
:
{
gpuIndices
:
number
[]
}):
{
key
:
string
;
value
:
string
}[]
{
const
envVariables
:
{
key
:
string
;
value
:
string
}[]
=
[
{
key
:
'
NNI_PLATFORM
'
,
value
:
'
local
'
},
{
key
:
'
NNI_SYS_DIR
'
,
value
:
trialJobDetail
.
workingDirectory
},
...
...
@@ -379,12 +374,10 @@ class LocalTrainingService implements TrainingService {
{
key
:
'
MULTI_PHASE
'
,
value
:
this
.
isMultiPhase
.
toString
()
}
];
if
(
resource
!==
undefined
&&
resource
.
gpuIndices
.
length
>
0
)
{
envVariables
.
push
({
key
:
'
CUDA_VISIBLE_DEVICES
'
,
value
:
this
.
gpuScheduler
===
undefined
?
''
:
resource
.
gpuIndices
.
join
(
'
,
'
)
value
:
this
.
gpuScheduler
===
undefined
?
'
-1
'
:
resource
.
gpuIndices
.
join
(
'
,
'
)
});
}
return
envVariables
;
}
...
...
@@ -467,36 +460,52 @@ class LocalTrainingService implements TrainingService {
}
}
private
getScript
(
localTrailConfig
:
TrialConfig
,
workingDirectory
:
string
):
string
[]{
let
script
:
string
[]
=
[];
if
(
process
.
platform
===
"
win32
"
)
{
script
.
push
(
`cmd /c
${
localTrailConfig
.
command
}
2>
${
path
.
join
(
workingDirectory
,
'
stderr
'
)}
`
,
`$NOW_DATE = [int64](([datetime]::UtcNow)-(get-date "1/1/1970")).TotalSeconds`
,
`$NOW_DATE = "$NOW_DATE" + "000"`
,
`Write $LASTEXITCODE " " $NOW_DATE | Out-File
${
path
.
join
(
workingDirectory
,
'
.nni
'
,
'
state
'
)}
-NoNewline -encoding utf8`
);
}
else
{
script
.
push
(
`eval
${
localTrailConfig
.
command
}
2>
${
path
.
join
(
workingDirectory
,
'
stderr
'
)}
`
,
`echo $?
\`
date +%s000
\`
>
${
path
.
join
(
workingDirectory
,
'
.nni
'
,
'
state
'
)}
`
);
}
return
script
;
}
private
async
runTrialJob
(
trialJobId
:
string
,
resource
:
{
gpuIndices
:
number
[]}):
Promise
<
void
>
{
const
trialJobDetail
:
LocalTrialJobDetail
=
<
LocalTrialJobDetail
>
this
.
jobMap
.
get
(
trialJobId
);
const
variables
:
{
key
:
string
;
value
:
string
}[]
=
this
.
getEnvironmentVariables
(
trialJobDetail
,
resource
);
const
runScriptLines
:
string
[]
=
[];
if
(
!
this
.
localTrailConfig
)
{
throw
new
Error
(
'
trial config is not initialized
'
);
}
runScriptLines
.
push
(
'
#!/bin/bash
'
,
`cd
${
this
.
localTrailConfig
.
codeDir
}
`
);
const
runScriptLines
:
string
[]
=
[];
if
(
process
.
platform
!==
"
win32
"
){
runScriptLines
.
push
(
'
#!/bin/bash
'
);
}
runScriptLines
.
push
(
`cd
${
this
.
localTrailConfig
.
codeDir
}
`
);
for
(
const
variable
of
variables
)
{
runScriptLines
.
push
(
`export
${
variable
.
key
}
=
${
variable
.
value
}
`
);
runScriptLines
.
push
(
setEnvironmentVariable
(
variable
)
);
}
runScriptLines
.
push
(
`eval
${
this
.
localTrailConfig
.
command
}
2>
${
path
.
join
(
trialJobDetail
.
workingDirectory
,
'
stderr
'
)}
`
,
`echo $?
\`
date +%s000
\`
>
${
path
.
join
(
trialJobDetail
.
workingDirectory
,
'
.nni
'
,
'
state
'
)}
`
);
await
cpp
.
exec
(
`m
kdir
-p
${
trialJobDetail
.
workingDirectory
}
`
);
await
cpp
.
exec
(
`m
kdir
-p
${
path
.
join
(
trialJobDetail
.
workingDirectory
,
'
.nni
'
)
}
`
);
await
cpp
.
exec
(
`touch
${
path
.
join
(
trialJobDetail
.
workingDirectory
,
'
.nni
'
,
'
metrics
'
)
}
`
);
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialJobDetail
.
workingDirectory
,
'
run.sh
'
),
runScriptLines
.
join
(
'
\n
'
),
{
encoding
:
'
utf8
'
,
mode
:
0o777
});
const
scripts
:
string
[]
=
this
.
getScript
(
this
.
localTrailConfig
,
trialJobDetail
.
workingDirectory
);
scripts
.
forEach
(
script
=>
{
runScriptLines
.
push
(
script
);
});
await
exec
M
kdir
(
trialJobDetail
.
workingDirectory
);
await
exec
M
kdir
(
path
.
join
(
trialJobDetail
.
workingDirectory
,
'
.nni
'
));
await
execNewFile
(
path
.
join
(
trialJobDetail
.
workingDirectory
,
'
.nni
'
,
'
metrics
'
));
const
scriptName
:
string
=
getScriptName
(
'
run
'
);
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialJobDetail
.
workingDirectory
,
scriptName
),
runScriptLines
.
join
(
getNewLine
()
),
{
encoding
:
'
utf8
'
,
mode
:
0o777
});
await
this
.
writeParameterFile
(
trialJobDetail
.
workingDirectory
,
(
<
TrialJobApplicationForm
>
trialJobDetail
.
form
).
hyperParameters
);
const
process
:
cp
.
ChildProcess
=
cp
.
exec
(
`bash
${
path
.
join
(
trialJobDetail
.
workingDirectory
,
'
run.sh
'
)}
`
);
const
trialJobProcess
:
cp
.
ChildProcess
=
execScript
(
path
.
join
(
trialJobDetail
.
workingDirectory
,
scriptName
));
this
.
setTrialJobStatus
(
trialJobDetail
,
'
RUNNING
'
);
trialJobDetail
.
startTime
=
Date
.
now
();
trialJobDetail
.
pid
=
p
rocess
.
pid
;
trialJobDetail
.
pid
=
trialJobP
rocess
.
pid
;
this
.
setExtraProperties
(
trialJobDetail
,
resource
);
let
buffer
:
Buffer
=
Buffer
.
alloc
(
0
);
...
...
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
View file @
0663218b
...
...
@@ -46,7 +46,7 @@ import {
RemoteMachineScheduleInfo
,
RemoteMachineScheduleResult
,
SSHClient
,
SSHClientManager
,
RemoteMachineTrialJobDetail
,
ScheduleResultType
,
REMOTEMACHINE_TRIAL_COMMAND_FORMAT
}
from
'
./remoteMachineData
'
;
import
{
GPU_INFO_COLLECTOR_FORMAT
}
from
'
../common/gpuData
'
;
import
{
GPU_INFO_COLLECTOR_FORMAT
_LINUX
}
from
'
../common/gpuData
'
;
import
{
SSHClientUtility
}
from
'
./sshClientUtility
'
;
import
{
validateCodeDir
}
from
'
../common/util
'
;
import
{
RemoteMachineJobRestServer
}
from
'
./remoteMachineJobRestServer
'
;
...
...
@@ -452,7 +452,7 @@ class RemoteMachineTrainingService implements TrainingService {
let
gpuMetricsCollectorScriptPath
:
string
=
path
.
join
(
gpuMetricCollectorScriptFolder
,
userName
,
'
gpu_metrics_collector.sh
'
);
const
remoteGPUScriptsDir
:
string
=
this
.
getRemoteScriptsPath
(
userName
);
// This directory is used to store gpu_metrics and pid created by script
const
gpuMetricsCollectorScriptContent
:
string
=
String
.
Format
(
GPU_INFO_COLLECTOR_FORMAT
,
GPU_INFO_COLLECTOR_FORMAT
_LINUX
,
remoteGPUScriptsDir
,
path
.
join
(
remoteGPUScriptsDir
,
'
pid
'
),
);
...
...
Prev
1
2
3
4
5
6
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment