Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
cb15be49
Unverified
Commit
cb15be49
authored
Dec 11, 2019
by
chicm-ms
Committed by
GitHub
Dec 11, 2019
Browse files
Enable eslint for nni_manager (#1845)
* enable eslint * remove tslint
parent
a2210436
Changes
42
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
8 additions
and
72 deletions
+8
-72
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts
...frameworkcontroller/frameworkcontrollerTrainingService.ts
+0
-5
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowApiClient.ts
...training_service/kubernetes/kubeflow/kubeflowApiClient.ts
+0
-3
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowConfig.ts
...er/training_service/kubernetes/kubeflow/kubeflowConfig.ts
+0
-6
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowJobInfoCollector.ts
...g_service/kubernetes/kubeflow/kubeflowJobInfoCollector.ts
+2
-3
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts
...ng_service/kubernetes/kubeflow/kubeflowTrainingService.ts
+0
-4
src/nni_manager/training_service/kubernetes/kubernetesApiClient.ts
...anager/training_service/kubernetes/kubernetesApiClient.ts
+0
-1
src/nni_manager/training_service/kubernetes/kubernetesConfig.ts
...i_manager/training_service/kubernetes/kubernetesConfig.ts
+0
-2
src/nni_manager/training_service/kubernetes/kubernetesJobRestServer.ts
...er/training_service/kubernetes/kubernetesJobRestServer.ts
+0
-1
src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts
.../training_service/kubernetes/kubernetesTrainingService.ts
+1
-6
src/nni_manager/training_service/local/gpuScheduler.ts
src/nni_manager/training_service/local/gpuScheduler.ts
+0
-1
src/nni_manager/training_service/local/localTrainingService.ts
...ni_manager/training_service/local/localTrainingService.ts
+2
-5
src/nni_manager/training_service/pai/hdfsClientUtility.ts
src/nni_manager/training_service/pai/hdfsClientUtility.ts
+0
-5
src/nni_manager/training_service/pai/paiData.ts
src/nni_manager/training_service/pai/paiData.ts
+0
-1
src/nni_manager/training_service/pai/paiJobInfoCollector.ts
src/nni_manager/training_service/pai/paiJobInfoCollector.ts
+0
-4
src/nni_manager/training_service/pai/paiJobRestServer.ts
src/nni_manager/training_service/pai/paiJobRestServer.ts
+0
-1
src/nni_manager/training_service/pai/paiTrainingService.ts
src/nni_manager/training_service/pai/paiTrainingService.ts
+1
-13
src/nni_manager/training_service/remote_machine/gpuScheduler.ts
...i_manager/training_service/remote_machine/gpuScheduler.ts
+0
-2
src/nni_manager/training_service/remote_machine/remoteMachineData.ts
...ager/training_service/remote_machine/remoteMachineData.ts
+0
-1
src/nni_manager/training_service/remote_machine/remoteMachineJobRestServer.ts
...ning_service/remote_machine/remoteMachineJobRestServer.ts
+0
-1
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
...ng_service/remote_machine/remoteMachineTrainingService.ts
+2
-7
No files found.
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts
View file @
cb15be49
...
...
@@ -101,7 +101,6 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
this
.
trialJobsMap
.
set
(
trialJobId
,
trialJobDetail
);
// Create frameworkcontroller job based on generated frameworkcontroller job resource config
// tslint:disable-next-line:no-any
const
frameworkcontrollerJobConfig
:
any
=
await
this
.
prepareFrameworkControllerConfig
(
trialJobId
,
trialWorkingFolder
,
frameworkcontrollerJobName
);
await
this
.
kubernetesCRDClient
.
createKubernetesJob
(
frameworkcontrollerJobConfig
);
...
...
@@ -112,7 +111,6 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
return
Promise
.
resolve
(
trialJobDetail
);
}
// tslint:disable:no-redundant-jsdoc no-any no-unsafe-any
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
switch
(
key
)
{
case
TrialConfigMetadataKey
.
NNI_MANAGER_IP
:
...
...
@@ -171,7 +169,6 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
return
Promise
.
resolve
();
}
// tslint:enable: no-any no-unsafe-any
/**
* upload code files to nfs or azureStorage
...
...
@@ -256,7 +253,6 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
}
}
// tslint:disable: no-any no-unsafe-any
private
async
prepareFrameworkControllerConfig
(
trialJobId
:
string
,
trialWorkingFolder
:
string
,
frameworkcontrollerJobName
:
string
):
Promise
<
any
>
{
...
...
@@ -447,7 +443,6 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
}
};
}
// tslint:enable: no-any no-unsafe-any
}
export
{
FrameworkControllerTrainingService
};
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowApiClient.ts
View file @
cb15be49
...
...
@@ -8,7 +8,6 @@ import { GeneralK8sClient, KubernetesCRDClient } from '../kubernetesApiClient';
import
{
KubeflowOperator
}
from
'
./kubeflowConfig
'
;
// tslint:disable: no-unsafe-any no-any completed-docs
class
TFOperatorClientV1Alpha2
extends
KubernetesCRDClient
{
/**
* constructor, to initialize tfjob CRD definition
...
...
@@ -130,7 +129,6 @@ class KubeflowOperatorClientFactory {
/**
* Factory method to generate operator client
*/
// tslint:disable-next-line:function-name
public
static
createClient
(
kubeflowOperator
:
KubeflowOperator
,
operatorApiVersion
:
string
):
KubernetesCRDClient
{
switch
(
kubeflowOperator
)
{
case
'
tf-operator
'
:
{
...
...
@@ -169,5 +167,4 @@ class KubeflowOperatorClientFactory {
}
}
// tslint:enable: no-unsafe-any
export
{
KubeflowOperatorClientFactory
,
GeneralK8sClient
};
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowConfig.ts
View file @
cb15be49
...
...
@@ -26,7 +26,6 @@ export class KubeflowClusterConfig extends KubernetesClusterConfig {
}
}
// tslint:disable:completed-docs
export
class
KubeflowClusterConfigNFS
extends
KubernetesClusterConfigNFS
{
public
readonly
operator
:
KubeflowOperator
;
constructor
(
...
...
@@ -43,7 +42,6 @@ export class KubeflowClusterConfigNFS extends KubernetesClusterConfigNFS {
return
'
nfs
'
;
}
// tslint:disable-next-line:function-name
public
static
getInstance
(
jsonObject
:
object
):
KubeflowClusterConfigNFS
{
const
kubeflowClusterConfigObjectNFS
:
KubeflowClusterConfigNFS
=
<
KubeflowClusterConfigNFS
>
jsonObject
;
assert
(
kubeflowClusterConfigObjectNFS
!==
undefined
);
...
...
@@ -75,7 +73,6 @@ export class KubeflowClusterConfigAzure extends KubernetesClusterConfigAzure {
return
'
azureStorage
'
;
}
// tslint:disable-next-line:function-name
public
static
getInstance
(
jsonObject
:
object
):
KubeflowClusterConfigAzure
{
const
kubeflowClusterConfigObjectAzure
:
KubeflowClusterConfigAzure
=
<
KubeflowClusterConfigAzure
>
jsonObject
;
...
...
@@ -91,7 +88,6 @@ export class KubeflowClusterConfigAzure extends KubernetesClusterConfigAzure {
export
class
KubeflowClusterConfigFactory
{
// tslint:disable-next-line:function-name
public
static
generateKubeflowClusterConfig
(
jsonObject
:
object
):
KubeflowClusterConfig
{
const
storageConfig
:
StorageConfig
=
<
StorageConfig
>
jsonObject
;
if
(
storageConfig
===
undefined
)
{
...
...
@@ -156,8 +152,6 @@ export class KubeflowTrialConfigPytorch extends KubeflowTrialConfig {
}
export
class
KubeflowTrialConfigFactory
{
// tslint:disable-next-line:function-name
public
static
generateKubeflowTrialConfig
(
jsonObject
:
object
,
operator
:
KubeflowOperator
):
KubeflowTrialConfig
{
if
(
operator
===
'
tf-operator
'
)
{
const
kubeflowTrialConfigObject
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
jsonObject
;
...
...
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowJobInfoCollector.ts
View file @
cb15be49
...
...
@@ -26,7 +26,6 @@ export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector {
return
Promise
.
reject
(
'
kubernetesCRDClient is undefined
'
);
}
// tslint:disable:no-any no-unsafe-any
let
kubernetesJobInfo
:
any
;
try
{
kubernetesJobInfo
=
await
kubernetesCRDClient
.
getKubernetesJob
(
kubernetesTrialJob
.
kubernetesJobName
);
...
...
@@ -37,7 +36,7 @@ export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector {
// This is not treated as an error status
return
Promise
.
resolve
();
}
/* eslint-disable require-atomic-updates */
if
(
kubernetesJobInfo
.
status
&&
kubernetesJobInfo
.
status
.
conditions
)
{
const
latestCondition
:
any
=
kubernetesJobInfo
.
status
.
conditions
[
kubernetesJobInfo
.
status
.
conditions
.
length
-
1
];
const
tfJobType
:
KubeflowJobStatus
=
<
KubeflowJobStatus
>
latestCondition
.
type
;
...
...
@@ -63,7 +62,7 @@ export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector {
default
:
}
}
// tslint:enable:no-any no-unsafe-any
/* eslint-enable require-atomic-updates */
return
Promise
.
resolve
();
}
...
...
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts
View file @
cb15be49
...
...
@@ -27,7 +27,6 @@ import { KubeflowClusterConfig, KubeflowClusterConfigAzure, KubeflowClusterConfi
import
{
KubeflowJobInfoCollector
}
from
'
./kubeflowJobInfoCollector
'
;
import
{
KubeflowJobRestServer
}
from
'
./kubeflowJobRestServer
'
;
// tslint:disable: no-unsafe-any no-any
/**
* Training Service implementation for Kubeflow
* Refer https://github.com/kubeflow/kubeflow for more info about Kubeflow
...
...
@@ -108,7 +107,6 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
return
Promise
.
resolve
(
trialJobDetail
);
}
// tslint:disable:no-redundant-jsdoc
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
switch
(
key
)
{
case
TrialConfigMetadataKey
.
NNI_MANAGER_IP
:
...
...
@@ -461,7 +459,6 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
replicas
:
replicaNumber
,
template
:
{
metadata
:
{
// tslint:disable-next-line:no-null-keyword
creationTimestamp
:
null
},
spec
:
spec
...
...
@@ -469,5 +466,4 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
}
}
}
// tslint:enable: no-unsafe-any no-any
export
{
KubeflowTrainingService
};
src/nni_manager/training_service/kubernetes/kubernetesApiClient.ts
View file @
cb15be49
...
...
@@ -10,7 +10,6 @@ import { getLogger, Logger } from '../../common/log';
/**
* Generic Kubernetes client, target version >= 1.9
*/
// tslint:disable: no-any no-unsafe-any
class
GeneralK8sClient
{
protected
readonly
client
:
any
;
protected
readonly
log
:
Logger
=
getLogger
();
...
...
src/nni_manager/training_service/kubernetes/kubernetesConfig.ts
View file @
cb15be49
...
...
@@ -6,7 +6,6 @@
export
type
KubernetesStorageKind
=
'
nfs
'
|
'
azureStorage
'
;
import
{
MethodNotImplementedError
}
from
'
../../common/errors
'
;
// tslint:disable: completed-docs function-name
export
abstract
class
KubernetesClusterConfig
{
public
readonly
storage
?:
KubernetesStorageKind
;
public
readonly
apiVersion
:
string
;
...
...
@@ -91,7 +90,6 @@ export class KubernetesClusterConfigAzure extends KubernetesClusterConfig {
}
}
// tslint:disable-next-line:no-unnecessary-class
export
class
KubernetesClusterConfigFactory
{
public
static
generateKubernetesClusterConfig
(
jsonObject
:
object
):
KubernetesClusterConfig
{
...
...
src/nni_manager/training_service/kubernetes/kubernetesJobRestServer.ts
View file @
cb15be49
...
...
@@ -25,7 +25,6 @@ export class KubernetesJobRestServer extends ClusterJobRestServer {
this
.
kubernetesTrainingService
=
kubernetesTrainingService
;
}
// tslint:disable-next-line:no-any
protected
handleTrialMetrics
(
jobId
:
string
,
metrics
:
any
[]):
void
{
if
(
this
.
kubernetesTrainingService
===
undefined
)
{
throw
Error
(
'
kubernetesTrainingService not initialized!
'
);
...
...
src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts
View file @
cb15be49
...
...
@@ -61,7 +61,6 @@ abstract class KubernetesTrainingService {
this
.
logCollection
=
'
none
'
;
}
// tslint:disable:no-any
public
generatePodResource
(
memory
:
number
,
cpuNum
:
number
,
gpuNum
:
number
):
any
{
const
resources
:
any
=
{
memory
:
`
${
memory
}
Mi`
,
...
...
@@ -73,7 +72,7 @@ abstract class KubernetesTrainingService {
}
return
resources
;
}
// tslint:enable:no-any
}
public
async
listTrialJobs
():
Promise
<
TrialJobDetail
[]
>
{
const
jobs
:
TrialJobDetail
[]
=
[];
...
...
@@ -197,7 +196,6 @@ abstract class KubernetesTrainingService {
await
this
.
kubernetesJobRestServer
.
stop
();
this
.
log
.
info
(
'
Kubernetes Training service rest server stopped successfully.
'
);
}
catch
(
error
)
{
// tslint:disable-next-line: no-unsafe-any
this
.
log
.
error
(
`Kubernetes Training service rest server stopped failed, error:
${
error
.
message
}
`
);
return
Promise
.
reject
(
error
);
...
...
@@ -206,7 +204,6 @@ abstract class KubernetesTrainingService {
return
Promise
.
resolve
();
}
// tslint:disable: no-unsafe-any no-any
protected
async
createAzureStorage
(
vaultName
:
string
,
valutKeyName
:
string
):
Promise
<
void
>
{
try
{
const
result
:
any
=
await
cpp
.
exec
(
`az keyvault secret show --name
${
valutKeyName
}
--vault-name
${
vaultName
}
`
);
...
...
@@ -253,7 +250,6 @@ abstract class KubernetesTrainingService {
return
Promise
.
resolve
();
}
// tslint:enable: no-unsafe-any no-any
/**
* Generate run script for different roles (like worker or ps)
...
...
@@ -271,7 +267,6 @@ abstract class KubernetesTrainingService {
if
(
gpuNum
===
0
)
{
nvidiaScript
=
'
export CUDA_VISIBLE_DEVICES=
'
;
}
// tslint:disable-next-line: strict-boolean-expressions
const
nniManagerIp
:
string
=
this
.
nniManagerIpConfig
?
this
.
nniManagerIpConfig
.
nniManagerIp
:
getIPV4Address
();
const
version
:
string
=
this
.
versionCheck
?
await
getVersion
()
:
''
;
const
runScript
:
string
=
String
.
Format
(
...
...
src/nni_manager/training_service/local/gpuScheduler.ts
View file @
cb15be49
...
...
@@ -86,7 +86,6 @@ class GPUScheduler {
runGpuMetricsCollector
(
this
.
gpuMetricCollectorScriptFolder
);
}
// tslint:disable:non-literal-fs-path
private
async
updateGPUSummary
():
Promise
<
void
>
{
const
gpuMetricPath
:
string
=
path
.
join
(
this
.
gpuMetricCollectorScriptFolder
,
'
gpu_metrics
'
);
if
(
fs
.
existsSync
(
gpuMetricPath
))
{
...
...
src/nni_manager/training_service/local/localTrainingService.ts
View file @
cb15be49
...
...
@@ -31,7 +31,6 @@ import { GPUScheduler } from './gpuScheduler';
* success: true if the buffer contains at least one complete command; otherwise false
* remain: remaining data after the first command
*/
// tslint:disable:newline-per-chained-call informative-docs
function
decodeCommand
(
data
:
Buffer
):
[
boolean
,
string
,
string
,
Buffer
]
{
if
(
data
.
length
<
8
)
{
return
[
false
,
''
,
''
,
data
];
...
...
@@ -46,7 +45,6 @@ function decodeCommand(data: Buffer): [boolean, string, string, Buffer] {
return
[
true
,
commandType
,
content
,
remain
];
}
// tslint:enable:newline-per-chained-call informative-docs
/**
* LocalTrialJobDetail
...
...
@@ -252,7 +250,6 @@ class LocalTrainingService implements TrainingService {
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
if
(
!
this
.
initialized
)
{
this
.
rootDir
=
getExperimentRootDir
();
// tslint:disable-next-line:non-literal-fs-path
if
(
!
fs
.
existsSync
(
this
.
rootDir
))
{
await
cpp
.
exec
(
`powershell.exe mkdir
${
this
.
rootDir
}
`
);
}
...
...
@@ -524,8 +521,8 @@ class LocalTrainingService implements TrainingService {
await
this
.
writeParameterFile
(
trialJobDetail
.
workingDirectory
,
trialJobDetail
.
form
.
hyperParameters
);
const
trialJobProcess
:
cp
.
ChildProcess
=
runScript
(
path
.
join
(
trialJobDetail
.
workingDirectory
,
scriptName
));
this
.
setTrialJobStatus
(
trialJobDetail
,
'
RUNNING
'
);
trialJobDetail
.
startTime
=
Date
.
now
();
trialJobDetail
.
pid
=
trialJobProcess
.
pid
;
trialJobDetail
.
startTime
=
Date
.
now
();
// eslint-disable-line require-atomic-updates
trialJobDetail
.
pid
=
trialJobProcess
.
pid
;
// eslint-disable-line require-atomic-updates
this
.
setExtraProperties
(
trialJobDetail
,
resource
);
let
buffer
:
Buffer
=
Buffer
.
alloc
(
0
);
...
...
src/nni_manager/training_service/pai/hdfsClientUtility.ts
View file @
cb15be49
...
...
@@ -17,7 +17,6 @@ export namespace HDFSClientUtility {
* @param hdfsUserName HDFS user name
*/
export
function
hdfsExpRootDir
(
hdfsUserName
:
string
):
string
{
// tslint:disable-next-line:prefer-template
return
'
/
'
+
unixPathJoin
(
hdfsUserName
,
'
nni
'
,
'
experiments
'
,
getExperimentId
());
}
...
...
@@ -47,10 +46,8 @@ export namespace HDFSClientUtility {
* @param hdfsFilePath hdfs file path(target)
* @param hdfsClient hdfs client
*/
// tslint:disable: no-unsafe-any non-literal-fs-path no-any
export
async
function
copyFileToHdfs
(
localFilePath
:
string
,
hdfsFilePath
:
string
,
hdfsClient
:
any
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
// tslint:disable-next-line:non-literal-fs-path
fs
.
exists
(
localFilePath
,
(
exists
:
boolean
)
=>
{
// Detect if local file exists
if
(
exists
)
{
...
...
@@ -90,7 +87,6 @@ export namespace HDFSClientUtility {
for
(
const
fileName
of
fileNameArray
)
{
const
fullFilePath
:
string
=
path
.
join
(
localDirectory
,
fileName
);
try
{
// tslint:disable-next-line:non-literal-fs-path
if
(
fs
.
lstatSync
(
fullFilePath
)
.
isFile
())
{
await
copyFileToHdfs
(
fullFilePath
,
path
.
join
(
hdfsDirectory
,
fileName
),
hdfsClient
);
...
...
@@ -227,5 +223,4 @@ export namespace HDFSClientUtility {
return
deferred
.
promise
;
}
// tslint:enable: no-unsafe-any non-literal-fs-path no-any
}
src/nni_manager/training_service/pai/paiData.ts
View file @
cb15be49
...
...
@@ -52,6 +52,5 @@ export const PAI_TRIAL_COMMAND_FORMAT: string =
--pai_hdfs_output_dir '{9}' --pai_hdfs_host '{10}' --pai_user_name {11} --nni_hdfs_exp_dir '{12}' --webhdfs_path '/webhdfs/api/v1' \
--nni_manager_version '{13}' --log_collection '{14}'`
;
// tslint:disable:no-http-string
export
const
PAI_LOG_PATH_FORMAT
:
string
=
`http://{0}/webhdfs/explorer.html#{1}`
;
src/nni_manager/training_service/pai/paiJobInfoCollector.ts
View file @
cb15be49
...
...
@@ -3,7 +3,6 @@
'
use strict
'
;
// tslint:disable-next-line:no-implicit-dependencies
import
*
as
request
from
'
request
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
NNIError
,
NNIErrorNames
}
from
'
../../common/errors
'
;
...
...
@@ -54,7 +53,6 @@ export class PAIJobInfoCollector {
// Rest call to get PAI job info and update status
// Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API
const
getJobInfoRequest
:
request
.
Options
=
{
// tslint:disable-next-line:no-http-string
uri
:
`http://
${
paiClusterConfig
.
host
}
/rest-server/api/v1/user/
${
paiClusterConfig
.
userName
}
/jobs/
${
paiTrialJob
.
paiJobName
}
`
,
method
:
'
GET
'
,
json
:
true
,
...
...
@@ -64,7 +62,6 @@ export class PAIJobInfoCollector {
}
};
// tslint:disable: no-unsafe-any no-any cyclomatic-complexity
//TODO : pass in request timeout param?
request
(
getJobInfoRequest
,
(
error
:
Error
,
response
:
request
.
Response
,
body
:
any
)
=>
{
if
((
error
!==
undefined
&&
error
!==
null
)
||
response
.
statusCode
>=
500
)
{
...
...
@@ -128,5 +125,4 @@ export class PAIJobInfoCollector {
return
deferred
.
promise
;
}
// tslint:enable: no-unsafe-any no-any
}
src/nni_manager/training_service/pai/paiJobRestServer.ts
View file @
cb15be49
...
...
@@ -34,7 +34,6 @@ export class PAIJobRestServer extends ClusterJobRestServer {
this
.
paiTrainingService
=
component
.
get
(
PAITrainingService
);
}
// tslint:disable-next-line:no-any
protected
handleTrialMetrics
(
jobId
:
string
,
metrics
:
any
[]):
void
{
// Split metrics array into single metric, then emit
// Warning: If metrics are not split into single ones, the behavior will be UNKNOWN
...
...
src/nni_manager/training_service/pai/paiTrainingService.ts
View file @
cb15be49
...
...
@@ -5,7 +5,6 @@
import
*
as
fs
from
'
fs
'
;
import
*
as
path
from
'
path
'
;
// tslint:disable-next-line:no-implicit-dependencies
import
*
as
request
from
'
request
'
;
import
*
as
component
from
'
../../common/component
'
;
...
...
@@ -45,7 +44,6 @@ class PAITrainingService implements TrainingService {
private
paiClusterConfig
?:
PAIClusterConfig
;
private
readonly
jobQueue
:
string
[];
private
stopping
:
boolean
=
false
;
// tslint:disable-next-line:no-any
private
hdfsClient
:
any
;
private
paiToken
?
:
string
;
private
paiTokenUpdateTime
?:
number
;
...
...
@@ -171,7 +169,6 @@ class PAITrainingService implements TrainingService {
return
true
;
}
// tslint:disable:no-http-string
public
cancelTrialJob
(
trialJobId
:
string
,
isEarlyStopped
:
boolean
=
false
):
Promise
<
void
>
{
const
trialJobDetail
:
PAITrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
...
...
@@ -203,7 +200,6 @@ class PAITrainingService implements TrainingService {
// Set trialjobDetail's early stopped field, to mark the job's cancellation source
trialJobDetail
.
isEarlyStopped
=
isEarlyStopped
;
// tslint:disable-next-line:no-any
request
(
stopJobRequest
,
(
error
:
Error
,
response
:
request
.
Response
,
body
:
any
)
=>
{
if
((
error
!==
undefined
&&
error
!==
null
)
||
response
.
statusCode
>=
400
)
{
this
.
log
.
error
(
`PAI Training service: stop trial
${
trialJobId
}
to PAI Cluster failed!`
);
...
...
@@ -217,8 +213,6 @@ class PAITrainingService implements TrainingService {
return
deferred
.
promise
;
}
// tslint:disable: no-unsafe-any no-any
// tslint:disable-next-line:max-func-body-length
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
...
...
@@ -298,7 +292,6 @@ class PAITrainingService implements TrainingService {
return
deferred
.
promise
;
}
// tslint:enable: no-unsafe-any
public
getClusterMetadata
(
key
:
string
):
Promise
<
string
>
{
const
deferred
:
Deferred
<
string
>
=
new
Deferred
<
string
>
();
...
...
@@ -319,7 +312,6 @@ class PAITrainingService implements TrainingService {
deferred
.
resolve
();
this
.
log
.
info
(
'
PAI Training service rest server stopped successfully.
'
);
}
catch
(
error
)
{
// tslint:disable-next-line: no-unsafe-any
this
.
log
.
error
(
`PAI Training service rest server stopped failed, error:
${
error
.
message
}
`
);
deferred
.
reject
(
error
);
}
...
...
@@ -331,7 +323,6 @@ class PAITrainingService implements TrainingService {
return
this
.
metricsEmitter
;
}
// tslint:disable-next-line:max-func-body-length
private
async
submitTrialJobToPAI
(
trialJobId
:
string
):
Promise
<
boolean
>
{
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
const
trialJobDetail
:
PAITrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
...
...
@@ -383,7 +374,6 @@ class PAITrainingService implements TrainingService {
}
const
hdfsCodeDir
:
string
=
HDFSClientUtility
.
getHdfsTrialWorkDir
(
this
.
paiClusterConfig
.
userName
,
trialJobId
);
const
hdfsOutputDir
:
string
=
unixPathJoin
(
hdfsCodeDir
,
'
nnioutput
'
);
// tslint:disable-next-line: strict-boolean-expressions
const
nniManagerIp
:
string
=
this
.
nniManagerIpConfig
?
this
.
nniManagerIpConfig
.
nniManagerIp
:
getIPV4Address
();
const
version
:
string
=
this
.
versionCheck
?
await
getVersion
()
:
''
;
const
nniPaiTrialCommand
:
string
=
String
.
Format
(
...
...
@@ -407,7 +397,6 @@ class PAITrainingService implements TrainingService {
)
.
replace
(
/
\r\n
|
\n
|
\r
/gm
,
''
);
// tslint:disable-next-line:no-console
this
.
log
.
info
(
`nniPAItrial command is
${
nniPaiTrialCommand
.
trim
()}
`
);
const
paiTaskRoles
:
PAITaskRole
[]
=
[
new
PAITaskRole
(
...
...
@@ -449,7 +438,7 @@ class PAITrainingService implements TrainingService {
await
HDFSClientUtility
.
copyDirectoryToHdfs
(
trialLocalTempFolder
,
hdfsCodeDir
,
this
.
hdfsClient
);
}
catch
(
error
)
{
this
.
log
.
error
(
`PAI Training service: copy
${
this
.
paiTrialConfig
.
codeDir
}
to HDFS
${
hdfsCodeDir
}
failed, error is
${
error
}
`
);
trialJobDetail
.
status
=
'
FAILED
'
;
trialJobDetail
.
status
=
'
FAILED
'
;
// eslint-disable-line require-atomic-updates
deferred
.
resolve
(
true
);
return
deferred
.
promise
;
...
...
@@ -467,7 +456,6 @@ class PAITrainingService implements TrainingService {
Authorization
:
`Bearer
${
this
.
paiToken
}
`
}
};
// tslint:disable:no-any no-unsafe-any
request
(
submitJobRequest
,
(
error
:
Error
,
response
:
request
.
Response
,
body
:
any
)
=>
{
if
((
error
!==
undefined
&&
error
!==
null
)
||
response
.
statusCode
>=
400
)
{
const
errorMessage
:
string
=
(
error
!==
undefined
&&
error
!==
null
)
?
error
.
message
:
...
...
src/nni_manager/training_service/remote_machine/gpuScheduler.ts
View file @
cb15be49
...
...
@@ -148,7 +148,6 @@ export class GPUScheduler {
}
}
this
.
log
.
debug
(
`designated gpu indices:
${
designatedGpuIndices
}
`
);
// tslint:disable: strict-boolean-expressions
rmMeta
.
gpuSummary
.
gpuInfos
.
forEach
((
gpuInfo
:
GPUInfo
)
=>
{
// if the GPU has active process, OR be reserved by a job,
// or index not in gpuIndices configuration in machineList,
...
...
@@ -174,7 +173,6 @@ export class GPUScheduler {
return
totalResourceMap
;
}
// tslint:enable: strict-boolean-expressions
private
selectMachine
(
rmMetas
:
RemoteMachineMeta
[]):
RemoteMachineMeta
{
assert
(
rmMetas
!==
undefined
&&
rmMetas
.
length
>
0
);
...
...
src/nni_manager/training_service/remote_machine/remoteMachineData.ts
View file @
cb15be49
...
...
@@ -186,7 +186,6 @@ export class SSHClientManager {
/**
* Create a new ssh connection client and initialize it
*/
// tslint:disable:non-literal-fs-path
private
initNewSSHClient
():
Promise
<
Client
>
{
const
deferred
:
Deferred
<
Client
>
=
new
Deferred
<
Client
>
();
const
conn
:
Client
=
new
Client
();
...
...
src/nni_manager/training_service/remote_machine/remoteMachineJobRestServer.ts
View file @
cb15be49
...
...
@@ -25,7 +25,6 @@ export class RemoteMachineJobRestServer extends ClusterJobRestServer {
this
.
remoteMachineTrainingService
=
component
.
get
(
RemoteMachineTrainingService
);
}
// tslint:disable-next-line:no-any
protected
handleTrialMetrics
(
jobId
:
string
,
metrics
:
any
[]):
void
{
// Split metrics array into single metric, then emit
// Warning: If metrics are not split into single ones, the behavior will be UNKNOWN
...
...
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
View file @
cb15be49
...
...
@@ -206,7 +206,6 @@ class RemoteMachineTrainingService implements TrainingService {
* Submit trial job
* @param form trial job description form
*/
// tslint:disable-next-line:informative-docs
public
async
submitTrialJob
(
form
:
TrialJobApplicationForm
):
Promise
<
TrialJobDetail
>
{
if
(
this
.
trialConfig
===
undefined
)
{
throw
new
Error
(
'
trial config is not initialized
'
);
...
...
@@ -255,7 +254,6 @@ class RemoteMachineTrainingService implements TrainingService {
* Cancel trial job
* @param trialJobId ID of trial job
*/
// tslint:disable:informative-docs no-unsafe-any
public
async
cancelTrialJob
(
trialJobId
:
string
,
isEarlyStopped
:
boolean
=
false
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
trialJob
:
RemoteMachineTrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
...
...
@@ -319,7 +317,6 @@ class RemoteMachineTrainingService implements TrainingService {
throw
new
Error
(
'
trial config parsed failed
'
);
}
// codeDir is not a valid directory, throw Error
// tslint:disable-next-line:non-literal-fs-path
if
(
!
fs
.
lstatSync
(
remoteMachineTrailConfig
.
codeDir
)
.
isDirectory
())
{
throw
new
Error
(
`codeDir
${
remoteMachineTrailConfig
.
codeDir
}
is not a directory`
);
...
...
@@ -438,7 +435,6 @@ class RemoteMachineTrainingService implements TrainingService {
await
SSHClientUtility
.
remoteExeCommand
(
`chmod 777
${
nniRootDir
}
${
nniRootDir
}
/*
${
nniRootDir
}
/scripts/*`
,
conn
);
//Begin to execute gpu_metrics_collection scripts
// tslint:disable-next-line: no-floating-promises
const
script
=
getGpuMetricsCollectorBashScriptContent
(
remoteGpuScriptCollectorDir
);
SSHClientUtility
.
remoteExeCommand
(
`bash -c '
${
script
}
'`
,
conn
);
...
...
@@ -549,7 +545,6 @@ class RemoteMachineTrainingService implements TrainingService {
command
=
`CUDA_VISIBLE_DEVICES=" "
${
this
.
trialConfig
.
command
}
`
;
}
}
// tslint:disable-next-line: strict-boolean-expressions
const
nniManagerIp
:
string
=
this
.
nniManagerIpConfig
?
this
.
nniManagerIpConfig
.
nniManagerIp
:
getIPV4Address
();
if
(
this
.
remoteRestServerPort
===
undefined
)
{
const
restServer
:
RemoteMachineJobRestServer
=
component
.
get
(
RemoteMachineJobRestServer
);
...
...
@@ -587,7 +582,6 @@ class RemoteMachineTrainingService implements TrainingService {
// Copy files in codeDir to remote working directory
await
SSHClientUtility
.
copyDirectoryToRemote
(
trialLocalTempFolder
,
trialWorkingFolder
,
sshClient
,
this
.
remoteOS
);
// Execute command in remote machine
// tslint:disable-next-line: no-floating-promises
SSHClientUtility
.
remoteExeCommand
(
`bash
${
unixPathJoin
(
trialWorkingFolder
,
'
run.sh
'
)}
`
,
sshClient
);
}
...
...
@@ -604,6 +598,7 @@ class RemoteMachineTrainingService implements TrainingService {
const
deferred
:
Deferred
<
TrialJobDetail
>
=
new
Deferred
<
TrialJobDetail
>
();
const
jobpidPath
:
string
=
this
.
getJobPidPath
(
trialJob
.
id
);
const
trialReturnCodeFilePath
:
string
=
unixPathJoin
(
this
.
remoteExpRootDir
,
'
trials
'
,
trialJob
.
id
,
'
.nni
'
,
'
code
'
);
/* eslint-disable require-atomic-updates */
try
{
const
killResult
:
number
=
(
await
SSHClientUtility
.
remoteExeCommand
(
`kill -0
\`
cat
${
jobpidPath
}
\`
`
,
sshClient
)).
exitCode
;
// if the process of jobpid is not alive any more
...
...
@@ -640,7 +635,7 @@ class RemoteMachineTrainingService implements TrainingService {
deferred
.
resolve
(
trialJob
);
}
}
/* eslint-enable require-atomic-updates */
return
deferred
.
promise
;
}
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment