Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
ae7a72bc
"vscode:/vscode.git/clone" did not exist on "d13964dc4444d24b73e3ec1f6f8250e385d72cec"
Commit
ae7a72bc
authored
Jun 19, 2019
by
Hongarc
Committed by
Chi Song
Jun 19, 2019
Browse files
Remove all whitespace at end of line (#1162)
parent
14c1b31c
Changes
176
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
100 additions
and
100 deletions
+100
-100
src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts
.../training_service/kubernetes/kubernetesTrainingService.ts
+14
-14
src/nni_manager/training_service/local/gpuScheduler.ts
src/nni_manager/training_service/local/gpuScheduler.ts
+2
-2
src/nni_manager/training_service/local/localTrainingService.ts
...ni_manager/training_service/local/localTrainingService.ts
+1
-1
src/nni_manager/training_service/pai/hdfsClientUtility.ts
src/nni_manager/training_service/pai/hdfsClientUtility.ts
+11
-11
src/nni_manager/training_service/pai/paiConfig.ts
src/nni_manager/training_service/pai/paiConfig.ts
+5
-5
src/nni_manager/training_service/pai/paiData.ts
src/nni_manager/training_service/pai/paiData.ts
+6
-6
src/nni_manager/training_service/pai/paiJobInfoCollector.ts
src/nni_manager/training_service/pai/paiJobInfoCollector.ts
+6
-6
src/nni_manager/training_service/pai/paiJobRestServer.ts
src/nni_manager/training_service/pai/paiJobRestServer.ts
+1
-1
src/nni_manager/training_service/pai/paiTrialConfig.ts
src/nni_manager/training_service/pai/paiTrialConfig.ts
+1
-1
src/nni_manager/training_service/remote_machine/remoteMachineData.ts
...ager/training_service/remote_machine/remoteMachineData.ts
+10
-10
src/nni_manager/training_service/remote_machine/remoteMachineJobRestServer.ts
...ning_service/remote_machine/remoteMachineJobRestServer.ts
+1
-1
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
...ng_service/remote_machine/remoteMachineTrainingService.ts
+19
-19
src/nni_manager/training_service/test/hdfsClientUtility.test.ts
...i_manager/training_service/test/hdfsClientUtility.test.ts
+4
-4
src/nni_manager/training_service/test/kubeflowTrainingService.test.ts
...ger/training_service/test/kubeflowTrainingService.test.ts
+2
-2
src/nni_manager/training_service/test/localTrainingService.test.ts
...anager/training_service/test/localTrainingService.test.ts
+2
-2
src/nni_manager/training_service/test/paiTrainingService.test.ts
..._manager/training_service/test/paiTrainingService.test.ts
+1
-1
src/nni_manager/types/child-process-promise/index.d.ts
src/nni_manager/types/child-process-promise/index.d.ts
+1
-1
src/sdk/pynni/nni/__main__.py
src/sdk/pynni/nni/__main__.py
+1
-1
src/sdk/pynni/nni/bohb_advisor/bohb_advisor.py
src/sdk/pynni/nni/bohb_advisor/bohb_advisor.py
+11
-11
src/sdk/pynni/nni/bohb_advisor/config_generator.py
src/sdk/pynni/nni/bohb_advisor/config_generator.py
+1
-1
No files found.
src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts
View file @
ae7a72bc
...
...
@@ -63,13 +63,13 @@ abstract class KubernetesTrainingService {
protected
kubernetesClusterConfig
?:
KubernetesClusterConfig
;
protected
versionCheck
:
boolean
=
true
;
protected
logCollection
:
string
;
constructor
()
{
this
.
log
=
getLogger
();
this
.
metricsEmitter
=
new
EventEmitter
();
this
.
trialJobsMap
=
new
Map
<
string
,
KubernetesTrialJobDetail
>
();
this
.
trialLocalNFSTempFolder
=
path
.
join
(
getExperimentRootDir
(),
'
trials-nfs-tmp
'
);
this
.
experimentId
=
getExperimentId
();
this
.
experimentId
=
getExperimentId
();
this
.
nextTrialSequenceId
=
-
1
;
this
.
CONTAINER_MOUNT_PATH
=
'
/tmp/mount
'
;
this
.
genericK8sClient
=
new
GeneralK8sClient
();
...
...
@@ -86,8 +86,8 @@ abstract class KubernetesTrainingService {
public
async
listTrialJobs
():
Promise
<
TrialJobDetail
[]
>
{
const
jobs
:
TrialJobDetail
[]
=
[];
for
(
const
[
key
,
value
]
of
this
.
trialJobsMap
)
{
for
(
const
[
key
,
value
]
of
this
.
trialJobsMap
)
{
if
(
value
.
form
.
jobType
===
'
TRIAL
'
)
{
jobs
.
push
(
await
this
.
getTrialJob
(
key
));
}
...
...
@@ -102,7 +102,7 @@ abstract class KubernetesTrainingService {
if
(
!
kubernetesTrialJob
)
{
return
Promise
.
reject
(
`trial job
${
trialJobId
}
not found`
)
}
}
return
Promise
.
resolve
(
kubernetesTrialJob
);
}
...
...
@@ -114,7 +114,7 @@ abstract class KubernetesTrainingService {
public
removeTrialJobMetricListener
(
listener
:
(
metric
:
TrialJobMetric
)
=>
void
)
{
this
.
metricsEmitter
.
off
(
'
metric
'
,
listener
);
}
public
get
isMultiPhaseJobSupported
():
boolean
{
return
false
;
}
...
...
@@ -153,7 +153,7 @@ abstract class KubernetesTrainingService {
{
apiVersion
:
'
v1
'
,
kind
:
'
Secret
'
,
metadata
:
{
metadata
:
{
name
:
this
.
azureStorageSecretName
,
namespace
:
'
default
'
,
labels
:
{
...
...
@@ -174,15 +174,15 @@ abstract class KubernetesTrainingService {
}
return
Promise
.
resolve
();
}
/**
* Genereate run script for different roles(like worker or ps)
* @param trialJobId trial job id
* @param trialWorkingFolder working folder
* @param command
* @param command
* @param trialSequenceId sequence id
*/
protected
async
generateRunScript
(
platform
:
string
,
trialJobId
:
string
,
trialWorkingFolder
:
string
,
protected
async
generateRunScript
(
platform
:
string
,
trialJobId
:
string
,
trialWorkingFolder
:
string
,
command
:
string
,
trialSequenceId
:
string
,
roleName
:
string
,
gpuNum
:
number
):
Promise
<
string
>
{
let
nvidia_script
:
string
=
''
;
// Nvidia devcie plugin for K8S has a known issue that requesting zero GPUs allocates all GPUs
...
...
@@ -229,7 +229,7 @@ abstract class KubernetesTrainingService {
const
errorMessage
:
string
=
`CancelTrialJob: trial job id
${
trialJobId
}
not found`
;
this
.
log
.
error
(
errorMessage
);
return
Promise
.
reject
(
errorMessage
);
}
}
if
(
!
this
.
kubernetesCRDClient
)
{
const
errorMessage
:
string
=
`CancelTrialJob: trial job id
${
trialJobId
}
failed because operatorClient is undefined`
;
this
.
log
.
error
(
errorMessage
);
...
...
@@ -268,8 +268,8 @@ abstract class KubernetesTrainingService {
kubernetesTrialJob
.
status
=
'
SYS_CANCELED
'
;
}
}
// Delete all kubernetes jobs whose expId label is current experiment id
// Delete all kubernetes jobs whose expId label is current experiment id
try
{
if
(
this
.
kubernetesCRDClient
)
{
await
this
.
kubernetesCRDClient
.
deleteKubernetesJob
(
new
Map
(
...
...
@@ -290,7 +290,7 @@ abstract class KubernetesTrainingService {
this
.
log
.
error
(
`Unmount
${
this
.
trialLocalNFSTempFolder
}
failed, error is
${
error
}
`
);
}
// Stop kubernetes rest server
// Stop kubernetes rest server
if
(
!
this
.
kubernetesJobRestServer
)
{
throw
new
Error
(
'
kubernetesJobRestServer not initialized!
'
);
}
...
...
src/nni_manager/training_service/local/gpuScheduler.ts
View file @
ae7a72bc
...
...
@@ -59,8 +59,8 @@ class GPUScheduler {
}
/**
* Generate gpu metric collector shell script in local machine,
* used to run in remote machine, and will be deleted after uploaded from local.
* Generate gpu metric collector shell script in local machine,
* used to run in remote machine, and will be deleted after uploaded from local.
*/
private
async
runGpuMetricsCollectorScript
():
Promise
<
void
>
{
await
execMkdir
(
this
.
gpuMetricCollectorScriptFolder
);
...
...
src/nni_manager/training_service/local/localTrainingService.ts
View file @
ae7a72bc
...
...
@@ -532,7 +532,7 @@ class LocalTrainingService implements TrainingService {
}
const
scripts
:
string
[]
=
this
.
getScript
(
this
.
localTrailConfig
,
trialJobDetail
.
workingDirectory
);
scripts
.
forEach
(
script
=>
{
runScriptLines
.
push
(
script
);
runScriptLines
.
push
(
script
);
});
await
execMkdir
(
trialJobDetail
.
workingDirectory
);
await
execMkdir
(
path
.
join
(
trialJobDetail
.
workingDirectory
,
'
.nni
'
));
...
...
src/nni_manager/training_service/pai/hdfsClientUtility.ts
View file @
ae7a72bc
...
...
@@ -57,7 +57,7 @@ export namespace HDFSClientUtility {
/**
* Copy a local file to hdfs directory
*
*
* @param localFilePath local file path(source)
* @param hdfsFilePath hdfs file path(target)
* @param hdfsClient hdfs client
...
...
@@ -87,7 +87,7 @@ export namespace HDFSClientUtility {
/**
* Recursively copy local directory to hdfs directory
*
*
* @param localDirectory local directory
* @param hdfsDirectory HDFS directory
* @param hdfsClient HDFS client
...
...
@@ -118,7 +118,7 @@ export namespace HDFSClientUtility {
/**
* Read content from HDFS file
*
*
* @param hdfsPath HDFS file path
* @param hdfsClient HDFS client
*/
...
...
@@ -141,7 +141,7 @@ export namespace HDFSClientUtility {
// Concat the data chunk to buffer
buffer
=
Buffer
.
concat
([
buffer
,
chunk
]);
});
remoteFileStream
.
on
(
'
finish
'
,
function
onFinish
()
{
// Upload is done, resolve
deferred
.
resolve
(
buffer
);
...
...
@@ -152,7 +152,7 @@ export namespace HDFSClientUtility {
/**
* Check if an HDFS path already exists
*
*
* @param hdfsPath target path need to check in HDFS
* @param hdfsClient HDFS client
*/
...
...
@@ -164,7 +164,7 @@ export namespace HDFSClientUtility {
let
timeoutId
:
NodeJS
.
Timer
const
delayTimeout
:
Promise
<
boolean
>
=
new
Promise
<
boolean
>
((
resolve
:
Function
,
reject
:
Function
)
:
void
=>
{
// Set timeout and reject the promise once reach timeout (5 seconds)
// Set timeout and reject the promise once reach timeout (5 seconds)
timeoutId
=
setTimeout
(()
=>
deferred
.
reject
(
`Check HDFS path
${
hdfsPath
}
exists timeout`
),
5000
);
});
...
...
@@ -173,9 +173,9 @@ export namespace HDFSClientUtility {
/**
* Mkdir in HDFS, use default permission 755
*
*
* @param hdfsPath the path in HDFS. It could be either file or directory
* @param hdfsClient
* @param hdfsClient
*/
export
function
mkdir
(
hdfsPath
:
string
,
hdfsClient
:
any
)
:
Promise
<
boolean
>
{
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
...
...
@@ -193,9 +193,9 @@ export namespace HDFSClientUtility {
/**
* Read directory contents
*
*
* @param hdfsPath the path in HDFS. It could be either file or directory
* @param hdfsClient
* @param hdfsClient
*/
export
async
function
readdir
(
hdfsPath
:
string
,
hdfsClient
:
any
)
:
Promise
<
string
[]
>
{
const
deferred
:
Deferred
<
string
[]
>
=
new
Deferred
<
string
[]
>
();
...
...
@@ -218,7 +218,7 @@ export namespace HDFSClientUtility {
/**
* Delete HDFS path
* @param hdfsPath the path in HDFS. It could be either file or directory
* @param hdfsClient
* @param hdfsClient
* @param recursive Mark if need to delete recursively
*/
export
function
deletePath
(
hdfsPath
:
string
,
hdfsClient
:
any
,
recursive
:
boolean
=
true
)
:
Promise
<
boolean
>
{
...
...
src/nni_manager/training_service/pai/paiConfig.ts
View file @
ae7a72bc
...
...
@@ -36,7 +36,7 @@ export class PAITaskRole {
public
readonly
command
:
string
;
//Shared memory for one task in the task role
public
readonly
shmMB
?:
number
;
/**
* Constructor
* @param name Name for the task role
...
...
@@ -52,7 +52,7 @@ export class PAITaskRole {
this
.
cpuNumber
=
cpuNumber
;
this
.
memoryMB
=
memoryMB
;
this
.
gpuNumber
=
gpuNumber
;
this
.
command
=
command
;
this
.
command
=
command
;
this
.
shmMB
=
shmMB
;
}
}
...
...
@@ -83,7 +83,7 @@ export class PAIJobConfig{
* @param outputDir Output directory on HDFS
* @param taskRoles List of taskRole, one task role at least
*/
constructor
(
jobName
:
string
,
image
:
string
,
dataDir
:
string
,
outputDir
:
string
,
codeDir
:
string
,
constructor
(
jobName
:
string
,
image
:
string
,
dataDir
:
string
,
outputDir
:
string
,
codeDir
:
string
,
taskRoles
:
PAITaskRole
[],
virtualCluster
:
string
)
{
this
.
jobName
=
jobName
;
this
.
image
=
image
;
...
...
@@ -117,7 +117,7 @@ export class NNIPAITrialConfig extends TrialConfig{
public
readonly
cpuNum
:
number
;
public
readonly
memoryMB
:
number
;
public
readonly
image
:
string
;
public
readonly
dataDir
:
string
;
public
readonly
dataDir
:
string
;
public
outputDir
:
string
;
//The virtual cluster job runs on. If omitted, the job will run on default virtual cluster
...
...
@@ -125,7 +125,7 @@ export class NNIPAITrialConfig extends TrialConfig{
//Shared memory for one task in the task role
public
shmMB
?:
number
;
constructor
(
command
:
string
,
codeDir
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
constructor
(
command
:
string
,
codeDir
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
,
dataDir
:
string
,
outputDir
:
string
,
virtualCluster
?:
string
,
shmMB
?:
number
)
{
super
(
command
,
codeDir
,
gpuNum
);
this
.
cpuNum
=
cpuNum
;
...
...
src/nni_manager/training_service/pai/paiData.ts
View file @
ae7a72bc
...
...
@@ -36,7 +36,7 @@ export class PAITrialJobDetail implements TrialJobDetail {
public
hdfsLogPath
:
string
;
public
isEarlyStopped
?:
boolean
;
constructor
(
id
:
string
,
status
:
TrialJobStatus
,
paiJobName
:
string
,
constructor
(
id
:
string
,
status
:
TrialJobStatus
,
paiJobName
:
string
,
submitTime
:
number
,
workingDirectory
:
string
,
form
:
JobApplicationForm
,
sequenceId
:
number
,
hdfsLogPath
:
string
)
{
this
.
id
=
id
;
this
.
status
=
status
;
...
...
@@ -50,7 +50,7 @@ export class PAITrialJobDetail implements TrialJobDetail {
}
}
export
const
PAI_INSTALL_NNI_SHELL_FORMAT
:
string
=
export
const
PAI_INSTALL_NNI_SHELL_FORMAT
:
string
=
`#!/bin/bash
if python3 -c 'import nni' > /dev/null 2>&1; then
# nni module is already installed, skip
...
...
@@ -62,12 +62,12 @@ fi`;
export
const
PAI_TRIAL_COMMAND_FORMAT
:
string
=
`export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4}
&& cd $NNI_SYS_DIR && sh install_nni.sh
&& python3 -m nni_trial_tool.trial_keeper --trial_command '{5}' --nnimanager_ip '{6}' --nnimanager_port '{7}'
&& cd $NNI_SYS_DIR && sh install_nni.sh
&& python3 -m nni_trial_tool.trial_keeper --trial_command '{5}' --nnimanager_ip '{6}' --nnimanager_port '{7}'
--pai_hdfs_output_dir '{8}' --pai_hdfs_host '{9}' --pai_user_name {10} --nni_hdfs_exp_dir '{11}' --webhdfs_path '/webhdfs/api/v1' --nni_manager_version '{12}' --log_collection '{13}'`
;
export
const
PAI_OUTPUT_DIR_FORMAT
:
string
=
export
const
PAI_OUTPUT_DIR_FORMAT
:
string
=
`hdfs://{0}:9000/`
;
export
const
PAI_LOG_PATH_FORMAT
:
string
=
export
const
PAI_LOG_PATH_FORMAT
:
string
=
`http://{0}/webhdfs/explorer.html#{1}`
src/nni_manager/training_service/pai/paiJobInfoCollector.ts
View file @
ae7a72bc
...
...
@@ -44,7 +44,7 @@ export class PAIJobInfoCollector {
public
async
retrieveTrialStatus
(
paiToken
?
:
string
,
paiClusterConfig
?:
PAIClusterConfig
)
:
Promise
<
void
>
{
if
(
!
paiClusterConfig
||
!
paiToken
)
{
return
Promise
.
resolve
();
return
Promise
.
resolve
();
}
const
updatePaiTrialJobs
:
Promise
<
void
>
[]
=
[];
...
...
@@ -76,7 +76,7 @@ export class PAIJobInfoCollector {
"
Authorization
"
:
'
Bearer
'
+
paiToken
}
};
//TODO : pass in request timeout param?
//TODO : pass in request timeout param?
request
(
getJobInfoRequest
,
(
error
:
Error
,
response
:
request
.
Response
,
body
:
any
)
=>
{
if
(
error
||
response
.
statusCode
>=
500
)
{
this
.
log
.
error
(
`PAI Training service: get job info for trial
${
paiTrialJob
.
id
}
from PAI Cluster failed!`
);
...
...
@@ -87,7 +87,7 @@ export class PAIJobInfoCollector {
}
else
{
if
(
response
.
body
.
jobStatus
&&
response
.
body
.
jobStatus
.
state
)
{
switch
(
response
.
body
.
jobStatus
.
state
)
{
case
'
WAITING
'
:
case
'
WAITING
'
:
paiTrialJob
.
status
=
'
WAITING
'
;
break
;
case
'
RUNNING
'
:
...
...
@@ -96,7 +96,7 @@ export class PAIJobInfoCollector {
paiTrialJob
.
startTime
=
response
.
body
.
jobStatus
.
appLaunchedTime
;
}
if
(
!
paiTrialJob
.
url
)
{
paiTrialJob
.
url
=
response
.
body
.
jobStatus
.
appTrackingUrl
;
paiTrialJob
.
url
=
response
.
body
.
jobStatus
.
appTrackingUrl
;
}
break
;
case
'
SUCCEEDED
'
:
...
...
@@ -104,7 +104,7 @@ export class PAIJobInfoCollector {
break
;
case
'
STOPPED
'
:
if
(
paiTrialJob
.
isEarlyStopped
!==
undefined
)
{
paiTrialJob
.
status
=
paiTrialJob
.
isEarlyStopped
===
true
?
paiTrialJob
.
status
=
paiTrialJob
.
isEarlyStopped
===
true
?
'
EARLY_STOPPED
'
:
'
USER_CANCELED
'
;
}
else
{
// if paiTrialJob's isEarlyStopped is undefined, that mean we didn't stop it via cancellation, mark it as SYS_CANCELLED by PAI
...
...
@@ -112,7 +112,7 @@ export class PAIJobInfoCollector {
}
break
;
case
'
FAILED
'
:
paiTrialJob
.
status
=
'
FAILED
'
;
paiTrialJob
.
status
=
'
FAILED
'
;
break
;
default
:
paiTrialJob
.
status
=
'
UNKNOWN
'
;
...
...
src/nni_manager/training_service/pai/paiJobRestServer.ts
View file @
ae7a72bc
...
...
@@ -26,7 +26,7 @@ import { ClusterJobRestServer } from '../common/clusterJobRestServer'
/**
* PAI Training service Rest server, provides rest API to support pai job metrics update
*
*
*/
@
component
.
Singleton
export
class
PAIJobRestServer
extends
ClusterJobRestServer
{
...
...
src/nni_manager/training_service/pai/paiTrialConfig.ts
View file @
ae7a72bc
...
...
@@ -25,7 +25,7 @@ export class PAITrialConfig extends TrialConfig{
public
readonly
cpuNum
:
number
;
public
readonly
memoryMB
:
number
;
public
readonly
image
:
string
;
public
readonly
dataDir
:
string
;
public
readonly
dataDir
:
string
;
public
readonly
outputDir
:
string
;
constructor
(
command
:
string
,
codeDir
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
,
dataDir
:
string
,
outputDir
:
string
)
{
...
...
src/nni_manager/training_service/remote_machine/remoteMachineData.ts
View file @
ae7a72bc
...
...
@@ -112,7 +112,7 @@ export class SSHClient {
this
.
sshClient
=
sshClient
;
this
.
usedConnectionNumber
=
usedConnectionNumber
;
}
public
get
getSSHClientInstance
():
Client
{
return
this
.
sshClient
;
}
...
...
@@ -151,7 +151,7 @@ export class SSHClientManager {
port
:
this
.
rmMeta
.
port
,
username
:
this
.
rmMeta
.
username
};
if
(
this
.
rmMeta
.
passwd
)
{
connectConfig
.
password
=
this
.
rmMeta
.
passwd
;
connectConfig
.
password
=
this
.
rmMeta
.
passwd
;
}
else
if
(
this
.
rmMeta
.
sshKeyPath
)
{
if
(
!
fs
.
existsSync
(
this
.
rmMeta
.
sshKeyPath
))
{
//SSh key path is not a valid file, reject
...
...
@@ -171,10 +171,10 @@ export class SSHClientManager {
// SSH connection error, reject with error message
deferred
.
reject
(
new
Error
(
err
.
message
));
}).
connect
(
connectConfig
);
return
deferred
.
promise
;
}
/**
* find a available ssh client in ssh array, if no ssh client available, return undefined
*/
...
...
@@ -191,7 +191,7 @@ export class SSHClientManager {
//init a new ssh client if could not get an available one
return
await
this
.
initNewSSHClient
();
}
/**
* add a new ssh client to sshClientArray
* @param sshClient
...
...
@@ -199,14 +199,14 @@ export class SSHClientManager {
public
addNewSSHClient
(
client
:
Client
)
{
this
.
sshClientArray
.
push
(
new
SSHClient
(
client
,
1
));
}
/**
* first ssh clilent instance is used for gpu collector and host job
*/
public
getFirstSSHClient
()
{
return
this
.
sshClientArray
[
0
].
getSSHClientInstance
;
}
/**
* close all of ssh client
*/
...
...
@@ -215,7 +215,7 @@ export class SSHClientManager {
sshClient
.
getSSHClientInstance
.
end
();
}
}
/**
* retrieve resource, minus a number for given ssh client
* @param client
...
...
@@ -231,7 +231,7 @@ export class SSHClientManager {
}
}
}
}
}
export
type
RemoteMachineScheduleResult
=
{
scheduleInfo
:
RemoteMachineScheduleInfo
|
undefined
;
resultType
:
ScheduleResultType
};
...
...
@@ -242,7 +242,7 @@ export enum ScheduleResultType {
/* Schedule succeeded*/
SUCCEED
,
/* Temporarily, no enough available GPU right now */
/* Temporarily, no enough available GPU right now */
TMP_NO_AVAILABLE_GPU
,
/* Cannot match requirement even if all GPU are a*/
...
...
src/nni_manager/training_service/remote_machine/remoteMachineJobRestServer.ts
View file @
ae7a72bc
...
...
@@ -26,7 +26,7 @@ import { ClusterJobRestServer } from '../common/clusterJobRestServer'
/**
* RemoteMachine Training service Rest server, provides rest RemoteMachine to support remotemachine job metrics update
*
*
*/
@
component
.
Singleton
export
class
RemoteMachineJobRestServer
extends
ClusterJobRestServer
{
...
...
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
View file @
ae7a72bc
...
...
@@ -125,10 +125,10 @@ class RemoteMachineTrainingService implements TrainingService {
}
this
.
log
.
info
(
'
Remote machine training service exit.
'
);
}
/**
* give trial a ssh connection
* @param trial
* @param trial
*/
public
async
allocateSSHClientForTrial
(
trial
:
RemoteMachineTrialJobDetail
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
...
...
@@ -144,10 +144,10 @@ class RemoteMachineTrainingService implements TrainingService {
deferred
.
resolve
();
return
deferred
.
promise
;
}
/**
* If a trial is finished, release the connection resource
* @param trial
* @param trial
*/
public
releaseTrialSSHClient
(
trial
:
RemoteMachineTrialJobDetail
):
void
{
if
(
!
trial
.
rmMeta
)
{
...
...
@@ -167,7 +167,7 @@ class RemoteMachineTrainingService implements TrainingService {
const
jobs
:
TrialJobDetail
[]
=
[];
const
deferred
:
Deferred
<
TrialJobDetail
[]
>
=
new
Deferred
<
TrialJobDetail
[]
>
();
for
(
const
[
key
,
value
]
of
this
.
trialJobsMap
)
{
for
(
const
[
key
,
value
]
of
this
.
trialJobsMap
)
{
if
(
value
.
form
.
jobType
===
'
TRIAL
'
)
{
jobs
.
push
(
await
this
.
getTrialJob
(
key
));
}
...
...
@@ -275,12 +275,12 @@ class RemoteMachineTrainingService implements TrainingService {
return
trialJobDetail
;
}
/**
* remove gpu reversion when job is not running
*/
private
updateGpuReservation
()
{
for
(
const
[
key
,
value
]
of
this
.
trialJobsMap
)
{
for
(
const
[
key
,
value
]
of
this
.
trialJobsMap
)
{
if
(
!
[
'
WAITING
'
,
'
RUNNING
'
].
includes
(
value
.
status
))
{
this
.
gpuScheduler
.
removeGpuReservation
(
key
,
this
.
trialJobsMap
);
}
...
...
@@ -371,7 +371,7 @@ class RemoteMachineTrainingService implements TrainingService {
await
validateCodeDir
(
remoteMachineTrailConfig
.
codeDir
);
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
return
Promise
.
reject
(
new
Error
(
error
));
return
Promise
.
reject
(
new
Error
(
error
));
}
this
.
trialConfig
=
remoteMachineTrailConfig
;
...
...
@@ -400,16 +400,16 @@ class RemoteMachineTrainingService implements TrainingService {
return
deferred
.
promise
;
}
/**
* cleanup() has a time out of 10s to clean remote connections
* cleanup() has a time out of 10s to clean remote connections
*/
public
async
cleanUp
():
Promise
<
void
>
{
this
.
log
.
info
(
'
Stopping remote machine training service...
'
);
this
.
stopping
=
true
;
await
Promise
.
race
([
delay
(
10000
),
this
.
cleanupConnections
()]);
}
/**
* stop gpu_metric_collector process in remote machine and remove unused scripts
*/
...
...
@@ -430,8 +430,8 @@ class RemoteMachineTrainingService implements TrainingService {
}
return
Promise
.
resolve
();
}
}
/**
* Generate gpu metric collector directory to store temp gpu metric collector script files
*/
...
...
@@ -441,8 +441,8 @@ class RemoteMachineTrainingService implements TrainingService {
}
/**
* Generate gpu metric collector shell script in local machine,
* used to run in remote machine, and will be deleted after uploaded from local.
* Generate gpu metric collector shell script in local machine,
* used to run in remote machine, and will be deleted after uploaded from local.
*/
private
async
generateGpuMetricsCollectorScript
(
userName
:
string
):
Promise
<
void
>
{
let
gpuMetricCollectorScriptFolder
:
string
=
this
.
getLocalGpuMetricCollectorDir
();
...
...
@@ -451,9 +451,9 @@ class RemoteMachineTrainingService implements TrainingService {
let
gpuMetricsCollectorScriptPath
:
string
=
path
.
join
(
gpuMetricCollectorScriptFolder
,
userName
,
'
gpu_metrics_collector.sh
'
);
const
remoteGPUScriptsDir
:
string
=
this
.
getRemoteScriptsPath
(
userName
);
// This directory is used to store gpu_metrics and pid created by script
const
gpuMetricsCollectorScriptContent
:
string
=
String
.
Format
(
GPU_INFO_COLLECTOR_FORMAT_LINUX
,
remoteGPUScriptsDir
,
unixPathJoin
(
remoteGPUScriptsDir
,
'
pid
'
),
GPU_INFO_COLLECTOR_FORMAT_LINUX
,
remoteGPUScriptsDir
,
unixPathJoin
(
remoteGPUScriptsDir
,
'
pid
'
),
);
await
fs
.
promises
.
writeFile
(
gpuMetricsCollectorScriptPath
,
gpuMetricsCollectorScriptContent
,
{
encoding
:
'
utf8
'
});
}
...
...
@@ -589,7 +589,7 @@ class RemoteMachineTrainingService implements TrainingService {
}
else
{
command
=
`CUDA_VISIBLE_DEVICES=" "
${
this
.
trialConfig
.
command
}
`
;
}
const
nniManagerIp
=
this
.
nniManagerIpConfig
?
this
.
nniManagerIpConfig
.
nniManagerIp
:
getIPV4Address
();
if
(
!
this
.
remoteRestServerPort
)
{
const
restServer
:
RemoteMachineJobRestServer
=
component
.
get
(
RemoteMachineJobRestServer
);
...
...
src/nni_manager/training_service/test/hdfsClientUtility.test.ts
View file @
ae7a72bc
...
...
@@ -37,7 +37,7 @@ describe('WebHDFS', function () {
{
"user": "user1",
"port": 50070,
"host": "10.0.0.0"
"host": "10.0.0.0"
}
*/
let
skip
:
boolean
=
false
;
...
...
@@ -45,7 +45,7 @@ describe('WebHDFS', function () {
let
hdfsClient
:
any
;
try
{
testHDFSInfo
=
JSON
.
parse
(
fs
.
readFileSync
(
'
../../.vscode/hdfsInfo.json
'
,
'
utf8
'
));
console
.
log
(
testHDFSInfo
);
console
.
log
(
testHDFSInfo
);
hdfsClient
=
WebHDFS
.
createClient
({
user
:
testHDFSInfo
.
user
,
port
:
testHDFSInfo
.
port
,
...
...
@@ -120,7 +120,7 @@ describe('WebHDFS', function () {
chai
.
expect
(
actualFileData
).
to
.
be
.
equals
(
testFileData
);
const
testHDFSDirPath
:
string
=
path
.
join
(
'
/nni_unittest_
'
+
uniqueString
(
6
)
+
'
_dir
'
);
await
HDFSClientUtility
.
copyDirectoryToHdfs
(
tmpLocalDirectoryPath
,
testHDFSDirPath
,
hdfsClient
);
const
files
:
any
[]
=
await
HDFSClientUtility
.
readdir
(
testHDFSDirPath
,
hdfsClient
);
...
...
@@ -133,7 +133,7 @@ describe('WebHDFS', function () {
// Cleanup
rmdir
(
tmpLocalDirectoryPath
);
let
deleteRestult
:
boolean
=
await
HDFSClientUtility
.
deletePath
(
testHDFSFilePath
,
hdfsClient
);
chai
.
expect
(
deleteRestult
).
to
.
be
.
equals
(
true
);
...
...
src/nni_manager/training_service/test/kubeflowTrainingService.test.ts
View file @
ae7a72bc
...
...
@@ -63,7 +63,7 @@ describe('Unit Test for KubeflowTrainingService', () => {
if
(
skip
)
{
return
;
}
kubeflowTrainingService
=
component
.
get
(
KubeflowTrainingService
);
kubeflowTrainingService
=
component
.
get
(
KubeflowTrainingService
);
});
afterEach
(()
=>
{
...
...
@@ -78,6 +78,6 @@ describe('Unit Test for KubeflowTrainingService', () => {
return
;
}
await
kubeflowTrainingService
.
setClusterMetadata
(
TrialConfigMetadataKey
.
KUBEFLOW_CLUSTER_CONFIG
,
testKubeflowConfig
),
await
kubeflowTrainingService
.
setClusterMetadata
(
TrialConfigMetadataKey
.
TRIAL_CONFIG
,
testKubeflowTrialConfig
);
await
kubeflowTrainingService
.
setClusterMetadata
(
TrialConfigMetadataKey
.
TRIAL_CONFIG
,
testKubeflowTrialConfig
);
});
});
\ No newline at end of file
src/nni_manager/training_service/test/localTrainingService.test.ts
View file @
ae7a72bc
...
...
@@ -63,7 +63,7 @@ describe('Unit Test for LocalTrainingService', () => {
//trial jobs should be empty, since there are no submitted jobs
chai
.
expect
(
await
localTrainingService
.
listTrialJobs
()).
to
.
be
.
empty
;
});
it
(
'
setClusterMetadata and getClusterMetadata
'
,
async
()
=>
{
await
localTrainingService
.
setClusterMetadata
(
TrialConfigMetadataKey
.
TRIAL_CONFIG
,
trialConfig
);
localTrainingService
.
getClusterMetadata
(
TrialConfigMetadataKey
.
TRIAL_CONFIG
).
then
((
data
)
=>
{
...
...
@@ -87,7 +87,7 @@ describe('Unit Test for LocalTrainingService', () => {
await
localTrainingService
.
cancelTrialJob
(
jobDetail
.
id
);
chai
.
expect
(
jobDetail
.
status
).
to
.
be
.
equals
(
'
USER_CANCELED
'
);
}).
timeout
(
20000
);
it
(
'
Read metrics, Add listener, and remove listener
'
,
async
()
=>
{
// set meta data
const
trialConfig
:
string
=
`{\"command\":\"python3 mockedTrial.py\", \"codeDir\":\"
${
localCodeDir
}
\",\"gpuNum\":0}`
...
...
src/nni_manager/training_service/test/paiTrainingService.test.ts
View file @
ae7a72bc
...
...
@@ -89,7 +89,7 @@ describe('Unit Test for PAITrainingService', () => {
chai
.
expect
(
trialDetail
.
status
).
to
.
be
.
equals
(
'
WAITING
'
);
}
catch
(
error
)
{
console
.
log
(
'
Submit job failed:
'
+
error
);
chai
.
assert
(
error
)
chai
.
assert
(
error
)
}
});
});
\ No newline at end of file
src/nni_manager/types/child-process-promise/index.d.ts
View file @
ae7a72bc
...
...
@@ -7,5 +7,5 @@ declare module 'child-process-promise' {
stderr
:
string
,
message
:
string
}
}
}
}
\ No newline at end of file
src/sdk/pynni/nni/__main__.py
View file @
ae7a72bc
...
...
@@ -154,7 +154,7 @@ def main():
assessor
=
None
if
args
.
tuner_class_name
in
ModuleName
:
tuner
=
create_builtin_class_instance
(
args
.
tuner_class_name
,
args
.
tuner_class_name
,
args
.
tuner_args
)
else
:
tuner
=
create_customized_class_instance
(
...
...
src/sdk/pynni/nni/bohb_advisor/bohb_advisor.py
View file @
ae7a72bc
...
...
@@ -81,7 +81,7 @@ def create_bracket_parameter_id(brackets_id, brackets_curr_decay, increased_id=-
class
Bracket
():
"""
A bracket in BOHB, all the information of a bracket is managed by
A bracket in BOHB, all the information of a bracket is managed by
an instance of this class.
Parameters
...
...
@@ -251,7 +251,7 @@ class BOHB(MsgDispatcherBase):
BOHB performs robust and efficient hyperparameter optimization
at scale by combining the speed of Hyperband searches with the
guidance and guarantees of convergence of Bayesian Optimization.
Instead of sampling new configurations at random, BOHB uses
Instead of sampling new configurations at random, BOHB uses
kernel density estimators to select promising candidates.
Parameters
...
...
@@ -335,7 +335,7 @@ class BOHB(MsgDispatcherBase):
pass
def
handle_initialize
(
self
,
data
):
"""Initialize Tuner, including creating Bayesian optimization-based parametric models
"""Initialize Tuner, including creating Bayesian optimization-based parametric models
and search space formations
Parameters
...
...
@@ -403,7 +403,7 @@ class BOHB(MsgDispatcherBase):
If this function is called, Command will be sent by BOHB:
a. If there is a parameter need to run, will return "NewTrialJob" with a dict:
{
{
'parameter_id': id of new hyperparameter
'parameter_source': 'algorithm'
'parameters': value of new hyperparameter
...
...
@@ -458,30 +458,30 @@ class BOHB(MsgDispatcherBase):
var
,
lower
=
search_space
[
var
][
"_value"
][
0
],
upper
=
search_space
[
var
][
"_value"
][
1
]))
elif
_type
==
'quniform'
:
cs
.
add_hyperparameter
(
CSH
.
UniformFloatHyperparameter
(
var
,
lower
=
search_space
[
var
][
"_value"
][
0
],
upper
=
search_space
[
var
][
"_value"
][
1
],
var
,
lower
=
search_space
[
var
][
"_value"
][
0
],
upper
=
search_space
[
var
][
"_value"
][
1
],
q
=
search_space
[
var
][
"_value"
][
2
]))
elif
_type
==
'loguniform'
:
cs
.
add_hyperparameter
(
CSH
.
UniformFloatHyperparameter
(
var
,
lower
=
search_space
[
var
][
"_value"
][
0
],
upper
=
search_space
[
var
][
"_value"
][
1
],
var
,
lower
=
search_space
[
var
][
"_value"
][
0
],
upper
=
search_space
[
var
][
"_value"
][
1
],
log
=
True
))
elif
_type
==
'qloguniform'
:
cs
.
add_hyperparameter
(
CSH
.
UniformFloatHyperparameter
(
var
,
lower
=
search_space
[
var
][
"_value"
][
0
],
upper
=
search_space
[
var
][
"_value"
][
1
],
var
,
lower
=
search_space
[
var
][
"_value"
][
0
],
upper
=
search_space
[
var
][
"_value"
][
1
],
q
=
search_space
[
var
][
"_value"
][
2
],
log
=
True
))
elif
_type
==
'normal'
:
cs
.
add_hyperparameter
(
CSH
.
NormalFloatHyperparameter
(
var
,
mu
=
search_space
[
var
][
"_value"
][
1
],
sigma
=
search_space
[
var
][
"_value"
][
2
]))
elif
_type
==
'qnormal'
:
cs
.
add_hyperparameter
(
CSH
.
NormalFloatHyperparameter
(
var
,
mu
=
search_space
[
var
][
"_value"
][
1
],
sigma
=
search_space
[
var
][
"_value"
][
2
],
var
,
mu
=
search_space
[
var
][
"_value"
][
1
],
sigma
=
search_space
[
var
][
"_value"
][
2
],
q
=
search_space
[
var
][
"_value"
][
3
]))
elif
_type
==
'lognormal'
:
cs
.
add_hyperparameter
(
CSH
.
NormalFloatHyperparameter
(
var
,
mu
=
search_space
[
var
][
"_value"
][
1
],
sigma
=
search_space
[
var
][
"_value"
][
2
],
var
,
mu
=
search_space
[
var
][
"_value"
][
1
],
sigma
=
search_space
[
var
][
"_value"
][
2
],
log
=
True
))
elif
_type
==
'qlognormal'
:
cs
.
add_hyperparameter
(
CSH
.
NormalFloatHyperparameter
(
var
,
mu
=
search_space
[
var
][
"_value"
][
1
],
sigma
=
search_space
[
var
][
"_value"
][
2
],
var
,
mu
=
search_space
[
var
][
"_value"
][
1
],
sigma
=
search_space
[
var
][
"_value"
][
2
],
q
=
search_space
[
var
][
"_value"
][
3
],
log
=
True
))
else
:
raise
ValueError
(
...
...
@@ -553,7 +553,7 @@ class BOHB(MsgDispatcherBase):
self
.
brackets
[
s
].
set_config_perf
(
int
(
i
),
data
[
'parameter_id'
],
sys
.
maxsize
,
value
)
self
.
completed_hyper_configs
.
append
(
data
)
_parameters
=
self
.
parameters
[
data
[
'parameter_id'
]]
_parameters
.
pop
(
_KEY
)
# update BO with loss, max_s budget, hyperparameters
...
...
src/sdk/pynni/nni/bohb_advisor/config_generator.py
View file @
ae7a72bc
...
...
@@ -117,7 +117,7 @@ class CG_BOHB(object):
seperated by budget. This function sample a configuration from
largest budget. Firstly we sample "num_samples" configurations,
then prefer one with the largest l(x)/g(x).
Parameters:
-----------
info_dict: dict
...
...
Prev
1
2
3
4
5
6
7
8
9
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment