Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
0663218b
Unverified
Commit
0663218b
authored
Apr 22, 2019
by
SparkSnail
Committed by
GitHub
Apr 22, 2019
Browse files
Merge pull request #163 from Microsoft/master
merge master
parents
6c9360a5
cf983800
Changes
116
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
491 additions
and
99 deletions
+491
-99
examples/trials/cifar10_pytorch/utils.py
examples/trials/cifar10_pytorch/utils.py
+1
-1
install.ps1
install.ps1
+127
-0
setup.py
setup.py
+3
-2
src/nni_manager/common/datastore.ts
src/nni_manager/common/datastore.ts
+1
-1
src/nni_manager/common/manager.ts
src/nni_manager/common/manager.ts
+1
-0
src/nni_manager/common/utils.ts
src/nni_manager/common/utils.ts
+102
-6
src/nni_manager/core/commands.ts
src/nni_manager/core/commands.ts
+3
-0
src/nni_manager/core/nnimanager.ts
src/nni_manager/core/nnimanager.ts
+16
-16
src/nni_manager/core/test/dataStore.test.ts
src/nni_manager/core/test/dataStore.test.ts
+1
-0
src/nni_manager/core/test/ipcInterface.test.ts
src/nni_manager/core/test/ipcInterface.test.ts
+10
-5
src/nni_manager/core/test/ipcInterfaceTerminate.test.ts
src/nni_manager/core/test/ipcInterfaceTerminate.test.ts
+2
-4
src/nni_manager/core/test/nnimanager.test.ts
src/nni_manager/core/test/nnimanager.test.ts
+2
-1
src/nni_manager/package.json
src/nni_manager/package.json
+1
-2
src/nni_manager/rest_server/restHandler.ts
src/nni_manager/rest_server/restHandler.ts
+11
-0
src/nni_manager/rest_server/test/mockedNNIManager.ts
src/nni_manager/rest_server/test/mockedNNIManager.ts
+3
-0
src/nni_manager/training_service/common/gpuData.ts
src/nni_manager/training_service/common/gpuData.ts
+8
-1
src/nni_manager/training_service/common/util.ts
src/nni_manager/training_service/common/util.ts
+134
-1
src/nni_manager/training_service/local/gpuScheduler.ts
src/nni_manager/training_service/local/gpuScheduler.ts
+18
-21
src/nni_manager/training_service/local/localTrainingService.ts
...ni_manager/training_service/local/localTrainingService.ts
+45
-36
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
...ng_service/remote_machine/remoteMachineTrainingService.ts
+2
-2
No files found.
examples/trials/cifar10_pytorch/utils.py
View file @
0663218b
...
...
@@ -43,7 +43,7 @@ def init_params(net):
term_width
=
0
try
:
_
,
term_width
=
os
.
popen
(
'stty size'
,
'r'
).
read
().
split
()
term_width
=
os
.
get_terminal_size
().
columns
except
Exception
as
exception
:
term_width
=
200
term_width
=
int
(
term_width
)
...
...
install.ps1
0 → 100644
View file @
0663218b
[
Net.ServicePointManager
]::
SecurityProtocol
=
[
Net.SecurityProtocolType
]::
Tls12
$install_node
=
$true
$install_yarn
=
$true
# nodejs
$nodeUrl
=
"https://aka.ms/nni/nodejs-download/win64"
$yarnUrl
=
"https://yarnpkg.com/latest.tar.gz"
$unzipNodeDir
=
"node-v*"
$unzipYarnDir
=
"yarn-v*"
$NNI_DEPENDENCY_FOLDER
=
"C:\tmp\
$
env
:
USERNAME
"
# Locate python on PATH and make sure it is at least version 3.5.
$WHICH_PYTHON = where.exe python
if ($WHICH_PYTHON -eq $null) {
    throw "Can not find python"
}
else {
    # `python -V` prints e.g. "Python 3.7.2"; older interpreters write it to stderr, hence 2>&1.
    $pyVersion = & python -V 2>&1
    # Parse "major.minor" with a regex instead of a fixed-width substring:
    # substring(7,3) turns "Python 3.10.1" into "3.1", which was then rejected as < 3.5.
    if ([string]$pyVersion -notmatch '(\d+)\.(\d+)') {
        throw "Can not determine python version"
    }
    $pyVersion = "{0}.{1}" -f $Matches[1], $Matches[2]
    # Compare as [version] so that 3.10 sorts after 3.5 (a [double] compare would not).
    if ([version]$pyVersion -lt [version]"3.5") {
        throw "python version should >= 3.5"
    }
}
$WHICH_PIP
=
where.exe
pip
if
(
$WHICH_PIP
-eq
$null
){
throw
"Can not find pip"
}
$
env
:
PYTHONIOENCODING
=
"UTF-8"
if
(
$
env
:
VIRTUAL_ENV
){
$NNI_PYTHON3
=
$
env
:
VIRTUAL_ENV
+
"\Scripts"
$NNI_PKG_FOLDER
=
$
env
:
VIRTUAL_ENV
+
"\nni"
$NNI_PYTHON_SCRIPTS
=
$NNI_PYTHON3
}
else
{
$NNI_PYTHON3
=
$
(
python
-c
'import site; from pathlib import Path; print(Path(site.getsitepackages()[0]))'
)
$NNI_PKG_FOLDER
=
$NNI_PYTHON3
+
"\nni"
$NNI_PYTHON_SCRIPTS
=
$NNI_PYTHON3
+
"\Scripts"
}
$PIP_INSTALL
=
"""
$NNI_PYTHON3
\python"" -m pip install ."
if
(
!
(
Test-Path
$NNI_DEPENDENCY_FOLDER
)){
New-Item
$NNI_DEPENDENCY_FOLDER
-ItemType
Directory
}
$NNI_NODE_ZIP
=
$NNI_DEPENDENCY_FOLDER
+
"\nni-node.zip"
$NNI_NODE_FOLDER
=
$NNI_DEPENDENCY_FOLDER
+
"\nni-node"
$NNI_YARN_TARBALL
=
$NNI_DEPENDENCY_FOLDER
+
"\nni-yarn.tar.gz"
$NNI_YARN_FOLDER
=
$NNI_DEPENDENCY_FOLDER
+
"\nni-yarn"
$NNI_YARN
=
$NNI_YARN_FOLDER
+
"\bin\yarn"
## Version number
$NNI_VERSION_VALUE
=
$
(
git
describe
--tags
)
$NNI_VERSION_TEMPLATE
=
"999.0.0-developing"
if
(
!
(
Test-Path
$NNI_NODE_ZIP
)){
Write-Host
"Downloading Node..."
(
New-Object
Net.WebClient
)
.
DownloadFile
(
$nodeUrl
,
$NNI_NODE_ZIP
)
}
if
(
!
(
Test-Path
$NNI_YARN_TARBALL
)){
Write-Host
"Downloading Yarn..."
(
New-Object
Net.WebClient
)
.
DownloadFile
(
$yarnUrl
,
$NNI_YARN_TARBALL
)
}
$NNI_YARN_TARBALL
=
$NNI_YARN_TARBALL
-split
'\\'
-join
'\\'
$NNI_DEPENDENCY_FOLDER
=
$NNI_DEPENDENCY_FOLDER
-split
'\\'
-join
'\\'
$SCRIPT_PATH
=
$NNI_DEPENDENCY_FOLDER
+
'\extract.py'
$SCRIPT
=
"import tarfile"
,
(
"tar = tarfile.open(""{0}"")"
-f
$NNI_YARN_TARBALL
),
(
"tar.extractall(""{0}"")"
-f
$NNI_DEPENDENCY_FOLDER
),
"tar.close()"
[
System.IO.File
]::
WriteAllLines
(
$SCRIPT_PATH
,
$SCRIPT
)
Add-Type
-AssemblyName
System.IO.Compression.FileSystem
# Extract the zip archive $zipfile into the directory $outpath.
# Relies on the System.IO.Compression.FileSystem assembly being loaded beforehand.
function Unzip {
    param(
        [string] $zipfile,
        [string] $outpath
    )

    [System.IO.Compression.ZipFile]::ExtractToDirectory($zipfile, $outpath)
}
if
(
$install_node
)
{
### nodejs install
if
(
!
(
Test-Path
$NNI_NODE_FOLDER
)){
Unzip
$NNI_NODE_ZIP
$NNI_DEPENDENCY_FOLDER
$unzipNodeDir
=
Get-ChildItem
"
$NNI_DEPENDENCY_FOLDER
\
$unzipNodeDir
"
Rename-Item
$unzipNodeDir
"nni-node"
}
Copy-Item
"
$NNI_NODE_FOLDER
\node.exe"
$NNI_PYTHON_SCRIPTS
-Recurse
-Force
### yarn install
if
(
!
(
Test-Path
$NNI_YARN_FOLDER
)){
cmd
/C
"""
$NNI_PYTHON3
\python"""
$SCRIPT_PATH
$unzipYarnDir
=
Get-ChildItem
"
$NNI_DEPENDENCY_FOLDER
\
$unzipYarnDir
"
Rename-Item
$unzipYarnDir
"nni-yarn"
}
}
## install-python-modules:
### Installing Python SDK
(
Get-Content
setup.py
)
.
replace
(
$NNI_VERSION_TEMPLATE
,
$NNI_VERSION_VALUE
)
|
Set-Content
setup.py
cmd
/c
$PIP_INSTALL
# Building NNI Manager
$
env
:
PATH
=
$NNI_PYTHON_SCRIPTS
+
';'
+
$
env
:
PATH
cd
src\nni_manager
cmd
/c
$NNI_YARN
cmd
/c
$NNI_YARN
build
Copy-Item
config
-Destination
.
\dist\
-Recurse
-Force
# Building WebUI
cd
..
\webui
cmd
/c
$NNI_YARN
cmd
/c
$NNI_YARN
build
cd
..
\..
## install-node-modules
if
(
!
(
Test-Path
$NNI_PKG_FOLDER
)){
New-Item
$NNI_PKG_FOLDER
-ItemType
Directory
}
Remove-Item
$NNI_PKG_FOLDER
-Recurse
-Force
Copy-Item
"src\nni_manager\dist"
$NNI_PKG_FOLDER
-Recurse
Copy-Item
"src\nni_manager\package.json"
$NNI_PKG_FOLDER
$PKG_JSON
=
$NNI_PKG_FOLDER
+
"\package.json"
(
Get-Content
$PKG_JSON
)
.
replace
(
$NNI_VERSION_TEMPLATE
,
$NNI_VERSION_VALUE
)
|
Set-Content
$PKG_JSON
cmd
/c
$NNI_YARN
--prod
--cwd
$NNI_PKG_FOLDER
$NNI_PKG_FOLDER_STATIC
=
$NNI_PKG_FOLDER
+
"\static"
Copy-Item
"src\webui\build"
$NNI_PKG_FOLDER_STATIC
-Recurse
setup.py
View file @
0663218b
...
...
@@ -51,11 +51,12 @@ setup(
'json_tricks'
,
'numpy'
,
'psutil'
,
'
py
yaml'
,
'
ruamel.
yaml'
,
'requests'
,
'scipy'
,
'schema'
,
'PythonWebHDFS'
'PythonWebHDFS'
,
'colorama'
],
entry_points
=
{
...
...
src/nni_manager/common/datastore.ts
View file @
0663218b
...
...
@@ -22,7 +22,7 @@
import
{
ExperimentProfile
,
TrialJobStatistics
}
from
'
./manager
'
;
import
{
TrialJobDetail
,
TrialJobStatus
}
from
'
./trainingService
'
;
type
TrialJobEvent
=
TrialJobStatus
|
'
USER_TO_CANCEL
'
|
'
ADD_CUSTOMIZED
'
|
'
ADD_HYPERPARAMETER
'
;
type
TrialJobEvent
=
TrialJobStatus
|
'
USER_TO_CANCEL
'
|
'
ADD_CUSTOMIZED
'
|
'
ADD_HYPERPARAMETER
'
|
'
IMPORT_DATA
'
;
type
MetricType
=
'
PERIODICAL
'
|
'
FINAL
'
|
'
CUSTOM
'
|
'
REQUEST_PARAMETER
'
;
interface
ExperimentProfileRecord
{
...
...
src/nni_manager/common/manager.ts
View file @
0663218b
...
...
@@ -99,6 +99,7 @@ abstract class Manager {
public
abstract
stopExperiment
():
Promise
<
void
>
;
public
abstract
getExperimentProfile
():
Promise
<
ExperimentProfile
>
;
public
abstract
updateExperimentProfile
(
experimentProfile
:
ExperimentProfile
,
updateType
:
ProfileUpdateType
):
Promise
<
void
>
;
public
abstract
importData
(
data
:
string
):
Promise
<
void
>
;
public
abstract
addCustomizedTrialJob
(
hyperParams
:
string
):
Promise
<
void
>
;
public
abstract
cancelTrialJobByUser
(
trialJobId
:
string
):
Promise
<
void
>
;
...
...
src/nni_manager/common/utils.ts
View file @
0663218b
...
...
@@ -22,6 +22,8 @@
import
*
as
assert
from
'
assert
'
;
import
{
randomBytes
}
from
'
crypto
'
;
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
cp
from
'
child_process
'
;
import
{
ChildProcess
,
spawn
,
StdioOptions
}
from
'
child_process
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
os
from
'
os
'
;
import
*
as
path
from
'
path
'
;
...
...
@@ -32,6 +34,7 @@ import * as util from 'util';
import
{
Database
,
DataStore
}
from
'
./datastore
'
;
import
{
ExperimentStartupInfo
,
getExperimentId
,
getExperimentStartupInfo
,
setExperimentStartupInfo
}
from
'
./experimentStartupInfo
'
;
import
{
Manager
}
from
'
./manager
'
;
import
{
TrialConfig
}
from
'
../training_service/common/trialConfig
'
;
import
{
HyperParameters
,
TrainingService
,
TrialJobStatus
}
from
'
./trainingService
'
;
import
{
getLogger
}
from
'
./log
'
;
...
...
@@ -146,6 +149,23 @@ function parseArg(names: string[]): string {
return
''
;
}
/**
 * Encode class arguments so they can be appended to a command line.
 *
 * On Windows the value is JSON-encoded once; on every other platform the
 * JSON text is JSON-encoded a second time, yielding a quoted string literal.
 * NOTE(review): presumably the extra layer survives shell word-splitting on
 * POSIX, where the dispatcher is spawned through a shell - confirm.
 *
 * @param args value to serialize
 * @returns the encoded representation
 */
function encodeCmdLineArgs(args: any): any {
    const encodedOnce: string = JSON.stringify(args);

    return process.platform === 'win32' ? encodedOnce : JSON.stringify(encodedOnce);
}
/**
 * Name of the Python executable for the current platform.
 *
 * @returns 'python' on Windows, 'python3' everywhere else
 */
function getCmdPy(): string {
    return process.platform === 'win32' ? 'python' : 'python3';
}
/**
* Generate command line to start automl algorithm(s),
* either start advisor or start a process which runs tuner and assessor
...
...
@@ -179,8 +199,7 @@ function getMsgDispatcherCommand(tuner: any, assessor: any, advisor: any, multiP
if
(
!
tuner
&&
!
advisor
)
{
throw
new
Error
(
'
Error: specify neither tuner nor advisor is not allowed
'
);
}
let
command
:
string
=
`python3 -m nni`
;
let
command
:
string
=
`
${
getCmdPy
()}
-m nni`
;
if
(
multiPhase
)
{
command
+=
'
--multi_phase
'
;
}
...
...
@@ -192,7 +211,7 @@ function getMsgDispatcherCommand(tuner: any, assessor: any, advisor: any, multiP
if
(
advisor
)
{
command
+=
` --advisor_class_name
${
advisor
.
className
}
`
;
if
(
advisor
.
classArgs
!==
undefined
)
{
command
+=
` --advisor_args
${
JSON
.
stringify
(
JSON
.
stringify
(
advisor
.
classArgs
)
)
}
`
;
command
+=
` --advisor_args
${
encodeCmdLineArgs
(
advisor
.
classArgs
)}
`
;
}
if
(
advisor
.
codeDir
!==
undefined
&&
advisor
.
codeDir
.
length
>
1
)
{
command
+=
` --advisor_directory
${
advisor
.
codeDir
}
`
;
...
...
@@ -203,7 +222,7 @@ function getMsgDispatcherCommand(tuner: any, assessor: any, advisor: any, multiP
}
else
{
command
+=
` --tuner_class_name
${
tuner
.
className
}
`
;
if
(
tuner
.
classArgs
!==
undefined
)
{
command
+=
` --tuner_args
${
JSON
.
stringify
(
JSON
.
stringify
(
tuner
.
classArgs
)
)
}
`
;
command
+=
` --tuner_args
${
encodeCmdLineArgs
(
tuner
.
classArgs
)}
`
;
}
if
(
tuner
.
codeDir
!==
undefined
&&
tuner
.
codeDir
.
length
>
1
)
{
command
+=
` --tuner_directory
${
tuner
.
codeDir
}
`
;
...
...
@@ -215,7 +234,7 @@ function getMsgDispatcherCommand(tuner: any, assessor: any, advisor: any, multiP
if
(
assessor
!==
undefined
&&
assessor
.
className
!==
undefined
)
{
command
+=
` --assessor_class_name
${
assessor
.
className
}
`
;
if
(
assessor
.
classArgs
!==
undefined
)
{
command
+=
` --assessor_args
${
JSON
.
stringify
(
JSON
.
stringify
(
assessor
.
classArgs
)
)
}
`
;
command
+=
` --assessor_args
${
encodeCmdLineArgs
(
assessor
.
classArgs
)}
`
;
}
if
(
assessor
.
codeDir
!==
undefined
&&
assessor
.
codeDir
.
length
>
1
)
{
command
+=
` --assessor_directory
${
assessor
.
codeDir
}
`
;
...
...
@@ -363,6 +382,83 @@ async function getVersion(): Promise<string> {
return
deferred
.
promise
;
}
/**
 * Spawn `command` as a child process.
 *
 * On Windows the command string is split on single spaces: the first piece is
 * the executable, the remaining pieces are its arguments, and no shell is used.
 * On other platforms the whole string is handed to a shell unchanged.
 *
 * @param command full command line to run
 * @param stdio stdio configuration forwarded to spawn
 * @param newCwd working directory of the child
 * @param newEnv environment of the child
 * @returns the spawned ChildProcess
 */
function getTunerProc(command: string, stdio: StdioOptions, newCwd: string, newEnv: any): ChildProcess {
    const onWindows: boolean = process.platform === "win32";
    let executable: string = command;
    let argv: string[] = [];
    if (onWindows) {
        // Everything before the first space is the program, the rest are its arguments.
        executable = command.split(" ", 1)[0];
        argv = command.substr(executable.length + 1).split(" ");
    }

    return spawn(executable, argv, { stdio, cwd: newCwd, env: newEnv, shell: !onWindows });
}
/**
 * Check whether a process with the given pid is alive.
 *
 * Windows: runs `Get-Process` through powershell.exe synchronously and treats
 * any non-empty output as "alive". Other platforms: `kill -0 <pid>` succeeding
 * means "alive". Any failure of the probe command is reported as not alive.
 *
 * @param pid process id to probe
 * @returns true when the probe indicates a live process, false otherwise
 */
async function isAlive(pid: any): Promise<boolean> {
    if (process.platform === 'win32') {
        try {
            const probeOutput: string =
                cp.execSync(`powershell.exe Get-Process -Id ${pid} -ErrorAction SilentlyContinue`).toString();

            return Boolean(probeOutput);
        } catch (error) {
            // probe failed: treat as not alive
            return false;
        }
    }

    try {
        await cpp.exec(`kill -0 ${pid}`);

        return true;
    } catch (error) {
        //ignore
        return false;
    }
}
/**
 * Forcefully kill the process with the given pid.
 *
 * Uses `taskkill /F` on Windows and `kill -9` elsewhere. Failures of the kill
 * command are deliberately swallowed, so the returned promise always resolves.
 *
 * @param pid process id to kill
 */
async function killPid(pid: any): Promise<void> {
    try {
        const killCommand: string = process.platform === "win32"
            ? `cmd /c taskkill /PID ${pid} /F`
            : `kill -9 ${pid}`;
        await cpp.exec(killCommand);
    } catch (error) {
        // pid does not exist, do nothing here
    }
}
/**
 * Line terminator for the current platform.
 *
 * @returns "\r\n" on Windows, "\n" everywhere else
 */
function getNewLine(): string {
    return process.platform === "win32" ? "\r\n" : "\n";
}
export
{
countFilesRecursively
,
getRemoteTmpDir
,
generateParamFileName
,
getMsgDispatcherCommand
,
getCheckpointDir
,
getLogDir
,
getExperimentRootDir
,
getJobCancelStatus
,
getDefaultDatabaseDir
,
getIPV4Address
,
mkDirP
,
delay
,
prepareUnitTest
,
parseArg
,
cleanupUnitTest
,
uniqueString
,
randomSelect
,
getLogLevel
,
getVersion
};
mkDirP
,
delay
,
prepareUnitTest
,
parseArg
,
cleanupUnitTest
,
uniqueString
,
randomSelect
,
getLogLevel
,
getVersion
,
getCmdPy
,
getTunerProc
,
isAlive
,
killPid
,
getNewLine
};
src/nni_manager/core/commands.ts
View file @
0663218b
...
...
@@ -22,6 +22,7 @@ const INITIALIZE = 'IN';
const
REQUEST_TRIAL_JOBS
=
'
GE
'
;
const
REPORT_METRIC_DATA
=
'
ME
'
;
const
UPDATE_SEARCH_SPACE
=
'
SS
'
;
const
IMPORT_DATA
=
'
FD
'
const
ADD_CUSTOMIZED_TRIAL_JOB
=
'
AD
'
;
const
TRIAL_END
=
'
EN
'
;
const
TERMINATE
=
'
TE
'
;
...
...
@@ -38,6 +39,7 @@ const TUNER_COMMANDS: Set<string> = new Set([
REQUEST_TRIAL_JOBS
,
REPORT_METRIC_DATA
,
UPDATE_SEARCH_SPACE
,
IMPORT_DATA
,
ADD_CUSTOMIZED_TRIAL_JOB
,
TERMINATE
,
PING
,
...
...
@@ -62,6 +64,7 @@ export {
REQUEST_TRIAL_JOBS
,
REPORT_METRIC_DATA
,
UPDATE_SEARCH_SPACE
,
IMPORT_DATA
,
ADD_CUSTOMIZED_TRIAL_JOB
,
TRIAL_END
,
TERMINATE
,
...
...
src/nni_manager/core/nnimanager.ts
View file @
0663218b
...
...
@@ -35,10 +35,10 @@ import {
import
{
TrainingService
,
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobMetric
,
TrialJobStatus
}
from
'
../common/trainingService
'
;
import
{
delay
,
getCheckpointDir
,
getExperimentRootDir
,
getLogDir
,
getMsgDispatcherCommand
,
mkDirP
,
get
LogLevel
}
from
'
../common/utils
'
;
import
{
delay
,
getCheckpointDir
,
getExperimentRootDir
,
getLogDir
,
getMsgDispatcherCommand
,
mkDirP
,
get
TunerProc
,
getLogLevel
,
isAlive
,
killPid
}
from
'
../common/utils
'
;
import
{
ADD_CUSTOMIZED_TRIAL_JOB
,
INITIALIZE
,
INITIALIZED
,
KILL_TRIAL_JOB
,
NEW_TRIAL_JOB
,
NO_MORE_TRIAL_JOBS
,
PING
,
REPORT_METRIC_DATA
,
REQUEST_TRIAL_JOBS
,
SEND_TRIAL_JOB_PARAMETER
,
TERMINATE
,
TRIAL_END
,
UPDATE_SEARCH_SPACE
REPORT_METRIC_DATA
,
REQUEST_TRIAL_JOBS
,
SEND_TRIAL_JOB_PARAMETER
,
TERMINATE
,
TRIAL_END
,
UPDATE_SEARCH_SPACE
,
IMPORT_DATA
}
from
'
./commands
'
;
import
{
createDispatcherInterface
,
IpcInterface
}
from
'
./ipcInterface
'
;
...
...
@@ -99,6 +99,17 @@ class NNIManager implements Manager {
return
this
.
storeExperimentProfile
();
}
/**
 * Forward imported data to the dispatcher and record the import as a trial job event.
 *
 * @param data payload sent with the IMPORT_DATA command and stored with the event
 * @returns the promise from storing the event; rejects with an Error when no dispatcher exists
 */
public importData(data: string): Promise<void> {
    if (this.dispatcher !== undefined) {
        this.dispatcher.sendCommand(IMPORT_DATA, data);

        return this.dataStore.storeTrialJobEvent('IMPORT_DATA', '', data);
    }

    return Promise.reject(new Error('tuner has not been setup'));
}
public
addCustomizedTrialJob
(
hyperParams
:
string
):
Promise
<
void
>
{
if
(
this
.
currSubmittedTrialNum
>=
this
.
experimentProfile
.
params
.
maxTrialNum
)
{
return
Promise
.
reject
(
...
...
@@ -290,12 +301,7 @@ class NNIManager implements Manager {
NNI_INCLUDE_INTERMEDIATE_RESULTS
:
includeIntermediateResultsEnv
};
let
newEnv
=
Object
.
assign
({},
process
.
env
,
nniEnv
);
const
tunerProc
:
ChildProcess
=
spawn
(
command
,
[],
{
stdio
,
cwd
:
newCwd
,
env
:
newEnv
,
shell
:
true
});
const
tunerProc
:
ChildProcess
=
getTunerProc
(
command
,
stdio
,
newCwd
,
newEnv
);
this
.
dispatcherPid
=
tunerProc
.
pid
;
this
.
dispatcher
=
createDispatcherInterface
(
tunerProc
);
...
...
@@ -341,16 +347,10 @@ class NNIManager implements Manager {
// gracefully terminate tuner and assessor here, wait at most 30 seconds.
for
(
let
i
:
number
=
0
;
i
<
30
;
i
++
)
{
if
(
!
tunerAlive
)
{
break
;
}
try
{
await
cpp
.
exec
(
`kill -0
${
this
.
dispatcherPid
}
`
);
}
catch
(
error
)
{
tunerAlive
=
false
;
}
tunerAlive
=
await
isAlive
(
this
.
dispatcherPid
);
await
delay
(
1000
);
}
try
{
await
cpp
.
exec
(
`kill -9
${
this
.
dispatcherPid
}
`
);
}
catch
(
error
)
{
// this.tunerPid does not exist, do nothing here
}
await
killPid
(
this
.
dispatcherPid
);
const
trialJobList
:
TrialJobDetail
[]
=
await
this
.
trainingService
.
listTrialJobs
();
// TO DO: to promise all
for
(
const
trialJob
of
trialJobList
)
{
...
...
src/nni_manager/core/test/dataStore.test.ts
View file @
0663218b
...
...
@@ -42,6 +42,7 @@ describe('Unit test for dataStore', () => {
});
after
(()
=>
{
ds
.
close
();
cleanupUnitTest
();
});
...
...
src/nni_manager/core/test/ipcInterface.test.ts
View file @
0663218b
...
...
@@ -18,11 +18,10 @@
*/
'
use strict
'
;
import
*
as
assert
from
'
assert
'
;
import
{
ChildProcess
,
spawn
,
StdioOptions
}
from
'
child_process
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
cleanupUnitTest
,
prepareUnitTest
}
from
'
../../common/utils
'
;
import
{
cleanupUnitTest
,
prepareUnitTest
,
getTunerProc
,
getCmdPy
}
from
'
../../common/utils
'
;
import
*
as
CommandType
from
'
../commands
'
;
import
{
createDispatcherInterface
,
IpcInterface
}
from
'
../ipcInterface
'
;
import
{
NNIError
}
from
'
../../common/errors
'
;
...
...
@@ -39,15 +38,21 @@ function runProcess(): Promise<Error | null> {
// create fake assessor process
const
stdio
:
StdioOptions
=
[
'
ignore
'
,
'
pipe
'
,
process
.
stderr
,
'
pipe
'
,
'
pipe
'
];
const
proc
:
ChildProcess
=
spawn
(
'
python3 assessor.py
'
,
[],
{
stdio
,
cwd
:
'
core/test
'
,
shell
:
true
})
;
const
command
:
string
=
getCmdPy
()
+
'
assessor.py
'
;
const
proc
:
ChildProcess
=
getTunerProc
(
command
,
stdio
,
'
core/test
'
,
process
.
env
);
// record its sent/received commands on exit
proc
.
on
(
'
error
'
,
(
error
:
Error
):
void
=>
{
deferred
.
resolve
(
error
);
});
proc
.
on
(
'
exit
'
,
(
code
:
number
):
void
=>
{
if
(
code
!==
0
)
{
deferred
.
resolve
(
new
Error
(
`return code:
${
code
}
`
));
}
else
{
sentCommands
=
proc
.
stdout
.
read
().
toString
().
split
(
'
\n
'
);
let
str
=
proc
.
stdout
.
read
().
toString
();
if
(
str
.
search
(
"
\r\n
"
)
!=-
1
){
sentCommands
=
str
.
split
(
"
\r\n
"
);
}
else
{
sentCommands
=
str
.
split
(
'
\n
'
);
}
deferred
.
resolve
(
null
);
}
});
...
...
src/nni_manager/core/test/ipcInterfaceTerminate.test.ts
View file @
0663218b
...
...
@@ -22,7 +22,7 @@
import
*
as
assert
from
'
assert
'
;
import
{
ChildProcess
,
spawn
,
StdioOptions
}
from
'
child_process
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
cleanupUnitTest
,
prepareUnitTest
,
getMsgDispatcherCommand
}
from
'
../../common/utils
'
;
import
{
cleanupUnitTest
,
prepareUnitTest
,
getMsgDispatcherCommand
,
getTunerProc
}
from
'
../../common/utils
'
;
import
*
as
CommandType
from
'
../commands
'
;
import
{
createDispatcherInterface
,
IpcInterface
}
from
'
../ipcInterface
'
;
...
...
@@ -50,9 +50,7 @@ function startProcess(): void {
// advisor
undefined
);
const
proc
:
ChildProcess
=
spawn
(
dispatcherCmd
,
[],
{
stdio
,
cwd
:
'
core/test
'
,
shell
:
true
});
const
proc
:
ChildProcess
=
getTunerProc
(
dispatcherCmd
,
stdio
,
'
core/test
'
,
process
.
env
);
proc
.
on
(
'
error
'
,
(
error
:
Error
):
void
=>
{
procExit
=
true
;
procError
=
true
;
...
...
src/nni_manager/core/test/nnimanager.test.ts
View file @
0663218b
...
...
@@ -33,6 +33,7 @@ import { NNIManager } from '../nnimanager';
import
{
SqlDB
}
from
'
../sqlDatabase
'
;
import
{
MockedTrainingService
}
from
'
./mockedTrainingService
'
;
import
{
MockedDataStore
}
from
'
./mockedDatastore
'
;
import
*
as
path
from
'
path
'
;
async
function
initContainer
():
Promise
<
void
>
{
prepareUnitTest
();
...
...
@@ -183,7 +184,7 @@ describe('Unit test for nnimanager', function () {
it
(
'
test getExperimentProfile
'
,
()
=>
{
return
nniManager
.
getExperimentProfile
().
then
((
experimentProfile
)
=>
{
expect
(
experimentProfile
.
id
).
to
.
be
.
equal
(
'
unittest
'
);
expect
(
experimentProfile
.
logDir
).
to
.
be
.
equal
(
os
.
homedir
()
+
'
/nni/
experiments
/
unittest
'
);
expect
(
experimentProfile
.
logDir
).
to
.
be
.
equal
(
path
.
join
(
os
.
homedir
()
,
'
nni
'
,
'
experiments
'
,
'
unittest
'
)
)
;
}).
catch
((
error
)
=>
{
assert
.
fail
(
error
);
...
...
src/nni_manager/package.json
View file @
0663218b
...
...
@@ -3,7 +3,6 @@
"version"
:
"999.0.0-developing"
,
"main"
:
"index.js"
,
"scripts"
:
{
"postbuild"
:
"cp -rf config ./dist/"
,
"build"
:
"tsc"
,
"test"
:
"nyc mocha -r ts-node/register -t 15000 --recursive **/*.test.ts --exclude node_modules/**/**/*.test.ts --colors"
,
"start"
:
"node dist/main.js"
,
...
...
@@ -35,7 +34,7 @@
"@types/express"
:
"^4.16.0"
,
"@types/glob"
:
"^7.1.1"
,
"@types/mocha"
:
"^5.2.5"
,
"@types/node"
:
"
^
10.12.18"
,
"@types/node"
:
"10.12.18"
,
"@types/request"
:
"^2.47.1"
,
"@types/rx"
:
"^4.1.1"
,
"@types/sqlite3"
:
"^3.1.3"
,
...
...
src/nni_manager/rest_server/restHandler.ts
View file @
0663218b
...
...
@@ -63,6 +63,7 @@ class NNIRestHandler {
this
.
checkStatus
(
router
);
this
.
getExperimentProfile
(
router
);
this
.
updateExperimentProfile
(
router
);
this
.
importData
(
router
);
this
.
startExperiment
(
router
);
this
.
getTrialJobStatistics
(
router
);
this
.
setClusterMetaData
(
router
);
...
...
@@ -144,6 +145,16 @@ class NNIRestHandler {
});
});
}
/**
 * Register POST /experiment/import-data.
 *
 * The request body is re-serialized to JSON text and passed to the manager's
 * importData; success sends an empty response, failure goes to handle_error.
 *
 * @param router router on which the route is registered
 */
private importData(router: Router): void {
    router.post('/experiment/import-data', (req: Request, res: Response) => {
        const payload: string = JSON.stringify(req.body);
        this.nniManager.importData(payload)
            .then(() => {
                res.send();
            })
            .catch((err: Error) => {
                this.handle_error(err, res);
            });
    });
}
private
startExperiment
(
router
:
Router
):
void
{
router
.
post
(
'
/experiment
'
,
expressJoi
(
ValidationSchemas
.
STARTEXPERIMENT
),
(
req
:
Request
,
res
:
Response
)
=>
{
...
...
src/nni_manager/rest_server/test/mockedNNIManager.ts
View file @
0663218b
...
...
@@ -46,6 +46,9 @@ export class MockedNNIManager extends Manager {
public
updateExperimentProfile
(
experimentProfile
:
ExperimentProfile
,
updateType
:
ProfileUpdateType
):
Promise
<
void
>
{
return
Promise
.
resolve
();
}
public
importData
(
data
:
string
):
Promise
<
void
>
{
return
Promise
.
resolve
();
}
public
getTrialJobStatistics
():
Promise
<
TrialJobStatistics
[]
>
{
const
deferred
:
Deferred
<
TrialJobStatistics
[]
>
=
new
Deferred
<
TrialJobStatistics
[]
>
();
deferred
.
resolve
([{
...
...
src/nni_manager/training_service/common/gpuData.ts
View file @
0663218b
...
...
@@ -59,10 +59,17 @@ export class GPUSummary {
}
}
export
const
GPU_INFO_COLLECTOR_FORMAT
:
string
=
export
const
GPU_INFO_COLLECTOR_FORMAT
_LINUX
:
string
=
`
#!/bin/bash
export METRIC_OUTPUT_DIR={0}
echo $$ >{1}
python3 -m nni_gpu_tool.gpu_metrics_collector
`
export
const
GPU_INFO_COLLECTOR_FORMAT_WINDOWS
:
string
=
`
$env:METRIC_OUTPUT_DIR="{0}"
$app = Start-Process "python" -ArgumentList "-m nni_gpu_tool.gpu_metrics_collector" -passthru -NoNewWindow
Write $app.ID | Out-File {1} -NoNewline -encoding utf8
`
\ No newline at end of file
src/nni_manager/training_service/common/util.ts
View file @
0663218b
...
...
@@ -22,6 +22,12 @@ import { getLogger } from "common/log";
'
use strict
'
;
import
{
countFilesRecursively
}
from
'
../../common/utils
'
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
cp
from
'
child_process
'
;
import
{
GPU_INFO_COLLECTOR_FORMAT_LINUX
,
GPU_INFO_COLLECTOR_FORMAT_WINDOWS
}
from
'
./gpuData
'
import
*
as
path
from
'
path
'
;
import
{
String
}
from
'
typescript-string-operations
'
;
import
{
file
}
from
"
../../node_modules/@types/tmp
"
;
/**
* Validate codeDir, calculate file count recursively under codeDir, and throw error if any rule is broken
...
...
@@ -45,4 +51,131 @@ export async function validateCodeDir(codeDir: string) : Promise<number> {
}
return
fileCount
;
}
\ No newline at end of file
}
/**
 * Create a new directory by shelling out.
 *
 * Windows: `New-Item ... -ItemType "directory" -Force` via powershell.exe.
 * Other platforms: `mkdir -p`.
 * The path is interpolated into the command line as-is (no quoting).
 *
 * @param directory path of the directory to create
 */
export async function execMkdir(directory: string): Promise<void> {
    const mkdirCommand: string = process.platform === 'win32'
        ? `powershell.exe New-Item -Path ${directory} -ItemType "directory" -Force`
        : `mkdir -p ${directory}`;
    await cpp.exec(mkdirCommand);
}
/**
 * Create a new file by shelling out.
 *
 * Windows: `New-Item ... -ItemType "file" -Force` via powershell.exe.
 * Other platforms: `touch`.
 * The path is interpolated into the command line as-is (no quoting).
 *
 * @param filename path of the file to create
 */
export async function execNewFile(filename: string): Promise<void> {
    const createCommand: string = process.platform === 'win32'
        ? `powershell.exe New-Item -Path ${filename} -ItemType "file" -Force`
        : `touch ${filename}`;
    await cpp.exec(createCommand);
}
/**
 * Run a script file without waiting for it to finish.
 *
 * Windows: `powershell.exe -file <path>`. Other platforms: `bash <path>`.
 *
 * @param filePath path of the script to run
 * @returns the ChildProcess started for the script
 */
export function execScript(filePath: string): cp.ChildProcess {
    const launchCommand: string = process.platform === 'win32'
        ? `powershell.exe -file ${filePath}`
        : `bash ${filePath}`;

    return cp.exec(launchCommand);
}
/**
 * Read the last line of a file by shelling out.
 *
 * Windows: `Get-Content <path> -Tail 1` via powershell.exe.
 * Other platforms: `tail -n 1 <path>`.
 *
 * @param filePath path of the file to read
 * @returns the result of the executed command (the caller reads its stdout)
 */
export async function execTail(filePath: string): Promise<cpp.childProcessPromise.Result> {
    const tailCommand: string = process.platform === 'win32'
        ? `powershell.exe Get-Content ${filePath} -Tail 1`
        : `tail -n 1 ${filePath}`;
    const tailResult: cpp.childProcessPromise.Result = await cpp.exec(tailCommand);

    return tailResult;
}
/**
 * Delete a directory and everything below it by shelling out.
 *
 * Windows: `Remove-Item <path> -Recurse -Force` via powershell.exe.
 * Other platforms: `rm -rf <path>`.
 * The path is interpolated into the command line as-is (no quoting).
 *
 * @param directory path of the directory to delete
 */
export async function execRemove(directory: string): Promise<void> {
    if (process.platform === 'win32') {
        // -Recurse -Force mirrors `rm -rf`: without them Remove-Item cannot delete
        // a non-empty directory non-interactively.
        await cpp.exec(`powershell.exe Remove-Item ${directory} -Recurse -Force`);
    } else {
        await cpp.exec(`rm -rf ${directory}`);
    }

    return Promise.resolve();
}
/**
 * Kill processes related to a pid by shelling out.
 *
 * Windows: `taskkill /PID <pid> /T /F`. Other platforms: `pkill -P <pid>`.
 *
 * @param pid process id passed to the kill command
 */
export async function execKill(pid: string): Promise<void> {
    const killCommand: string = process.platform === 'win32'
        ? `cmd /c taskkill /PID ${pid} /T /F`
        : `pkill -P ${pid}`;
    await cpp.exec(killCommand);
}
/**
 * Build the script statement that sets one environment variable.
 *
 * @param variable key/value pair to set
 * @returns `$env:KEY="VALUE"` on Windows, `export KEY=VALUE` elsewhere
 */
export function setEnvironmentVariable(variable: { key: string; value: string }): string {
    const { key, value } = variable;

    return process.platform === 'win32'
        ? `$env:${key}="${value}"`
        : `export ${key}=${value}`;
}
/**
 * Build a script file name for the current platform.
 *
 * @param fileNamePrefix file name without extension
 * @returns the prefix followed by '.ps1' on Windows or '.sh' elsewhere
 */
export function getScriptName(fileNamePrefix: string): string {
    const extension: string = process.platform === 'win32' ? '.ps1' : '.sh';

    return fileNamePrefix + extension;
}
/**
 * Render the GPU metrics collector script for the current platform.
 *
 * The platform-specific template receives the script folder as {0} and the
 * path of a file named 'pid' inside that folder as {1}.
 *
 * @param gpuMetricCollectorScriptFolder folder substituted into the template
 * @returns the rendered script text
 */
export function getgpuMetricsCollectorScriptContent(gpuMetricCollectorScriptFolder: string): string {
    const scriptTemplate: string = process.platform === 'win32'
        ? GPU_INFO_COLLECTOR_FORMAT_WINDOWS
        : GPU_INFO_COLLECTOR_FORMAT_LINUX;
    const pidFilePath: string = path.join(gpuMetricCollectorScriptFolder, 'pid');

    return String.Format(scriptTemplate, gpuMetricCollectorScriptFolder, pidFilePath);
}
src/nni_manager/training_service/local/gpuScheduler.ts
View file @
0663218b
...
...
@@ -25,9 +25,10 @@ import * as fs from 'fs';
import
*
as
os
from
'
os
'
;
import
*
as
path
from
'
path
'
;
import
{
String
}
from
'
typescript-string-operations
'
;
import
{
execMkdir
,
getScriptName
,
getgpuMetricsCollectorScriptContent
,
execScript
,
execTail
,
execRemove
,
execKill
}
from
'
../common/util
'
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
delay
}
from
'
../../common/utils
'
;
import
{
GPU_INFO_COLLECTOR_FORMAT
,
GPUInfo
,
GPUSummary
}
from
'
../common/gpuData
'
;
import
{
GPUInfo
,
GPUSummary
}
from
'
../common/gpuData
'
;
/**
* GPUScheduler for local training service
...
...
@@ -57,6 +58,19 @@ class GPUScheduler {
}
}
/**
 * Write the platform-specific GPU metrics collector script into
 * gpuMetricCollectorScriptFolder and start it.
 *
 * The folder is created first; the script is launched without waiting for it
 * to finish.
 */
private async runGpuMetricsCollectorScript(): Promise<void> {
    await execMkdir(this.gpuMetricCollectorScriptFolder);
    // generate gpu_metrics_collector script
    const scriptFileName: string = getScriptName('gpu_metrics_collector');
    const scriptPath: string = path.join(this.gpuMetricCollectorScriptFolder, scriptFileName);
    const scriptContent: string = getgpuMetricsCollectorScriptContent(this.gpuMetricCollectorScriptFolder);
    await fs.promises.writeFile(scriptPath, scriptContent, { encoding: 'utf8' });
    execScript(scriptPath);
}
public
getAvailableGPUIndices
():
number
[]
{
if
(
this
.
gpuSummary
!==
undefined
)
{
return
this
.
gpuSummary
.
gpuInfos
.
filter
((
info
:
GPUInfo
)
=>
info
.
activeProcessNum
===
0
)
...
...
@@ -78,33 +92,16 @@ class GPUScheduler {
this
.
stopping
=
true
;
try
{
const
pid
:
string
=
await
fs
.
promises
.
readFile
(
path
.
join
(
this
.
gpuMetricCollectorScriptFolder
,
'
pid
'
),
'
utf8
'
);
await
cpp
.
exec
(
`pkill -P
${
pid
}
`
);
await
cpp
.
exec
(
`rm -rf
${
this
.
gpuMetricCollectorScriptFolder
}
`
);
await
execKill
(
pid
);
await
execRemove
(
this
.
gpuMetricCollectorScriptFolder
);
}
catch
(
error
)
{
this
.
log
.
error
(
`GPU scheduler error:
${
error
}
`
);
}
}
/**
* Generate gpu metric collector shell script in local machine,
* used to run in remote machine, and will be deleted after uploaded from local.
*/
private
async
runGpuMetricsCollectorScript
():
Promise
<
void
>
{
await
cpp
.
exec
(
`mkdir -p
${
this
.
gpuMetricCollectorScriptFolder
}
`
);
//generate gpu_metrics_collector.sh
const
gpuMetricsCollectorScriptPath
:
string
=
path
.
join
(
this
.
gpuMetricCollectorScriptFolder
,
'
gpu_metrics_collector.sh
'
);
const
gpuMetricsCollectorScriptContent
:
string
=
String
.
Format
(
GPU_INFO_COLLECTOR_FORMAT
,
this
.
gpuMetricCollectorScriptFolder
,
path
.
join
(
this
.
gpuMetricCollectorScriptFolder
,
'
pid
'
)
);
await
fs
.
promises
.
writeFile
(
gpuMetricsCollectorScriptPath
,
gpuMetricsCollectorScriptContent
,
{
encoding
:
'
utf8
'
});
cp
.
exec
(
`bash
${
gpuMetricsCollectorScriptPath
}
`
);
}
private
async
updateGPUSummary
():
Promise
<
void
>
{
const
cmdresult
:
cpp
.
childProcessPromise
.
Result
=
await
cpp
.
exec
(
`tail -n 1
${
path
.
join
(
this
.
gpuMetricCollectorScriptFolder
,
'
gpu_metrics
'
)
}
`
);
await
execTail
(
path
.
join
(
this
.
gpuMetricCollectorScriptFolder
,
'
gpu_metrics
'
));
if
(
cmdresult
&&
cmdresult
.
stdout
)
{
this
.
gpuSummary
=
<
GPUSummary
>
JSON
.
parse
(
cmdresult
.
stdout
);
}
else
{
...
...
src/nni_manager/training_service/local/localTrainingService.ts
View file @
0663218b
...
...
@@ -18,7 +18,6 @@
*/
'
use strict
'
;
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
cp
from
'
child_process
'
;
import
{
EventEmitter
}
from
'
events
'
;
...
...
@@ -32,7 +31,8 @@ import {
HostJobApplicationForm
,
HyperParameters
,
JobApplicationForm
,
TrainingService
,
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobMetric
,
TrialJobStatus
}
from
'
../../common/trainingService
'
;
import
{
delay
,
generateParamFileName
,
getExperimentRootDir
,
getJobCancelStatus
,
uniqueString
}
from
'
../../common/utils
'
;
import
{
delay
,
generateParamFileName
,
getExperimentRootDir
,
getJobCancelStatus
,
uniqueString
,
isAlive
,
getNewLine
}
from
'
../../common/utils
'
;
import
{
execMkdir
,
getScriptName
,
execScript
,
setEnvironmentVariable
,
execNewFile
}
from
'
../common/util
'
import
{
TrialConfig
}
from
'
../common/trialConfig
'
;
import
{
TrialConfigMetadataKey
}
from
'
../common/trialConfigMetadataKey
'
;
import
{
GPUScheduler
}
from
'
./gpuScheduler
'
;
...
...
@@ -169,14 +169,7 @@ class LocalTrainingService implements TrainingService {
return
this
.
getHostJob
(
trialJobId
);
}
if
(
trialJob
.
status
===
'
RUNNING
'
)
{
let
alive
:
boolean
=
false
;
try
{
await
cpp
.
exec
(
`kill -0
${
trialJob
.
pid
}
`
);
alive
=
true
;
}
catch
(
error
)
{
//ignore
}
let
alive
:
boolean
=
await
isAlive
(
trialJob
.
pid
);
if
(
!
alive
)
{
trialJob
.
endTime
=
Date
.
now
();
this
.
setTrialJobStatus
(
trialJob
,
'
FAILED
'
);
...
...
@@ -284,7 +277,9 @@ class LocalTrainingService implements TrainingService {
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
if
(
!
this
.
initialized
)
{
this
.
rootDir
=
getExperimentRootDir
();
await
cpp
.
exec
(
`mkdir -p
${
this
.
rootDir
}
`
);
if
(
!
fs
.
existsSync
(
this
.
rootDir
)){
await
cpp
.
exec
(
`powershell.exe mkdir
${
this
.
rootDir
}
`
);
}
this
.
initialized
=
true
;
}
switch
(
key
)
{
...
...
@@ -369,7 +364,7 @@ class LocalTrainingService implements TrainingService {
private
getEnvironmentVariables
(
trialJobDetail
:
TrialJobDetail
,
resource
?
:
{
gpuIndices
:
number
[]
}):
{
key
:
string
;
value
:
string
}[]
{
resource
:
{
gpuIndices
:
number
[]
}):
{
key
:
string
;
value
:
string
}[]
{
const
envVariables
:
{
key
:
string
;
value
:
string
}[]
=
[
{
key
:
'
NNI_PLATFORM
'
,
value
:
'
local
'
},
{
key
:
'
NNI_SYS_DIR
'
,
value
:
trialJobDetail
.
workingDirectory
},
...
...
@@ -379,12 +374,10 @@ class LocalTrainingService implements TrainingService {
{
key
:
'
MULTI_PHASE
'
,
value
:
this
.
isMultiPhase
.
toString
()
}
];
if
(
resource
!==
undefined
&&
resource
.
gpuIndices
.
length
>
0
)
{
envVariables
.
push
({
key
:
'
CUDA_VISIBLE_DEVICES
'
,
value
:
this
.
gpuScheduler
===
undefined
?
''
:
resource
.
gpuIndices
.
join
(
'
,
'
)
});
}
envVariables
.
push
({
key
:
'
CUDA_VISIBLE_DEVICES
'
,
value
:
this
.
gpuScheduler
===
undefined
?
'
-1
'
:
resource
.
gpuIndices
.
join
(
'
,
'
)
});
return
envVariables
;
}
...
...
@@ -467,36 +460,52 @@ class LocalTrainingService implements TrainingService {
}
}
private
getScript
(
localTrailConfig
:
TrialConfig
,
workingDirectory
:
string
):
string
[]{
let
script
:
string
[]
=
[];
if
(
process
.
platform
===
"
win32
"
)
{
script
.
push
(
`cmd /c
${
localTrailConfig
.
command
}
2>
${
path
.
join
(
workingDirectory
,
'
stderr
'
)}
`
,
`$NOW_DATE = [int64](([datetime]::UtcNow)-(get-date "1/1/1970")).TotalSeconds`
,
`$NOW_DATE = "$NOW_DATE" + "000"`
,
`Write $LASTEXITCODE " " $NOW_DATE | Out-File
${
path
.
join
(
workingDirectory
,
'
.nni
'
,
'
state
'
)}
-NoNewline -encoding utf8`
);
}
else
{
script
.
push
(
`eval
${
localTrailConfig
.
command
}
2>
${
path
.
join
(
workingDirectory
,
'
stderr
'
)}
`
,
`echo $?
\`
date +%s000
\`
>
${
path
.
join
(
workingDirectory
,
'
.nni
'
,
'
state
'
)}
`
);
}
return
script
;
}
private
async
runTrialJob
(
trialJobId
:
string
,
resource
:
{
gpuIndices
:
number
[]}):
Promise
<
void
>
{
const
trialJobDetail
:
LocalTrialJobDetail
=
<
LocalTrialJobDetail
>
this
.
jobMap
.
get
(
trialJobId
);
const
variables
:
{
key
:
string
;
value
:
string
}[]
=
this
.
getEnvironmentVariables
(
trialJobDetail
,
resource
);
const
runScriptLines
:
string
[]
=
[];
if
(
!
this
.
localTrailConfig
)
{
throw
new
Error
(
'
trial config is not initialized
'
);
}
runScriptLines
.
push
(
'
#!/bin/bash
'
,
`cd
${
this
.
localTrailConfig
.
codeDir
}
`
);
const
runScriptLines
:
string
[]
=
[];
if
(
process
.
platform
!==
"
win32
"
){
runScriptLines
.
push
(
'
#!/bin/bash
'
);
}
runScriptLines
.
push
(
`cd
${
this
.
localTrailConfig
.
codeDir
}
`
);
for
(
const
variable
of
variables
)
{
runScriptLines
.
push
(
`export
${
variable
.
key
}
=
${
variable
.
value
}
`
);
runScriptLines
.
push
(
setEnvironmentVariable
(
variable
)
);
}
runScriptLines
.
push
(
`eval
${
this
.
localTrailConfig
.
command
}
2>
${
path
.
join
(
trialJobDetail
.
workingDirectory
,
'
stderr
'
)}
`
,
`echo $?
\`
date +%s000
\`
>
${
path
.
join
(
trialJobDetail
.
workingDirectory
,
'
.nni
'
,
'
state
'
)}
`
);
await
cpp
.
exec
(
`m
kdir
-p
${
trialJobDetail
.
workingDirectory
}
`
);
await
cpp
.
exec
(
`m
kdir
-p
${
path
.
join
(
trialJobDetail
.
workingDirectory
,
'
.nni
'
)
}
`
);
await
cpp
.
exec
(
`touch
${
path
.
join
(
trialJobDetail
.
workingDirectory
,
'
.nni
'
,
'
metrics
'
)
}
`
);
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialJobDetail
.
workingDirectory
,
'
run.sh
'
),
runScriptLines
.
join
(
'
\n
'
),
{
encoding
:
'
utf8
'
,
mode
:
0o777
});
const
scripts
:
string
[]
=
this
.
getScript
(
this
.
localTrailConfig
,
trialJobDetail
.
workingDirectory
);
scripts
.
forEach
(
script
=>
{
runScriptLines
.
push
(
script
);
});
await
exec
M
kdir
(
trialJobDetail
.
workingDirectory
);
await
exec
M
kdir
(
path
.
join
(
trialJobDetail
.
workingDirectory
,
'
.nni
'
));
await
execNewFile
(
path
.
join
(
trialJobDetail
.
workingDirectory
,
'
.nni
'
,
'
metrics
'
));
const
scriptName
:
string
=
getScriptName
(
'
run
'
);
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialJobDetail
.
workingDirectory
,
scriptName
),
runScriptLines
.
join
(
getNewLine
()
),
{
encoding
:
'
utf8
'
,
mode
:
0o777
});
await
this
.
writeParameterFile
(
trialJobDetail
.
workingDirectory
,
(
<
TrialJobApplicationForm
>
trialJobDetail
.
form
).
hyperParameters
);
const
process
:
cp
.
ChildProcess
=
cp
.
exec
(
`bash
${
path
.
join
(
trialJobDetail
.
workingDirectory
,
'
run.sh
'
)}
`
);
const
trialJobProcess
:
cp
.
ChildProcess
=
execScript
(
path
.
join
(
trialJobDetail
.
workingDirectory
,
scriptName
));
this
.
setTrialJobStatus
(
trialJobDetail
,
'
RUNNING
'
);
trialJobDetail
.
startTime
=
Date
.
now
();
trialJobDetail
.
pid
=
p
rocess
.
pid
;
trialJobDetail
.
pid
=
trialJobP
rocess
.
pid
;
this
.
setExtraProperties
(
trialJobDetail
,
resource
);
let
buffer
:
Buffer
=
Buffer
.
alloc
(
0
);
...
...
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
View file @
0663218b
...
...
@@ -46,7 +46,7 @@ import {
RemoteMachineScheduleInfo
,
RemoteMachineScheduleResult
,
SSHClient
,
SSHClientManager
,
RemoteMachineTrialJobDetail
,
ScheduleResultType
,
REMOTEMACHINE_TRIAL_COMMAND_FORMAT
}
from
'
./remoteMachineData
'
;
import
{
GPU_INFO_COLLECTOR_FORMAT
}
from
'
../common/gpuData
'
;
import
{
GPU_INFO_COLLECTOR_FORMAT
_LINUX
}
from
'
../common/gpuData
'
;
import
{
SSHClientUtility
}
from
'
./sshClientUtility
'
;
import
{
validateCodeDir
}
from
'
../common/util
'
;
import
{
RemoteMachineJobRestServer
}
from
'
./remoteMachineJobRestServer
'
;
...
...
@@ -452,7 +452,7 @@ class RemoteMachineTrainingService implements TrainingService {
let
gpuMetricsCollectorScriptPath
:
string
=
path
.
join
(
gpuMetricCollectorScriptFolder
,
userName
,
'
gpu_metrics_collector.sh
'
);
const
remoteGPUScriptsDir
:
string
=
this
.
getRemoteScriptsPath
(
userName
);
// This directory is used to store gpu_metrics and pid created by script
const
gpuMetricsCollectorScriptContent
:
string
=
String
.
Format
(
GPU_INFO_COLLECTOR_FORMAT
,
GPU_INFO_COLLECTOR_FORMAT
_LINUX
,
remoteGPUScriptsDir
,
path
.
join
(
remoteGPUScriptsDir
,
'
pid
'
),
);
...
...
Prev
1
2
3
4
5
6
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment