Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
ae7a72bc
Commit
ae7a72bc
authored
Jun 19, 2019
by
Hongarc
Committed by
Chi Song
Jun 19, 2019
Browse files
Remove all whitespace at end of line (#1162)
parent
14c1b31c
Changes
176
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
185 additions
and
185 deletions
+185
-185
src/nni_manager/training_service/common/clusterJobRestServer.ts
...i_manager/training_service/common/clusterJobRestServer.ts
+5
-5
src/nni_manager/training_service/common/containerJobData.ts
src/nni_manager/training_service/common/containerJobData.ts
+2
-2
src/nni_manager/training_service/common/gpuData.ts
src/nni_manager/training_service/common/gpuData.ts
+2
-2
src/nni_manager/training_service/common/util.ts
src/nni_manager/training_service/common/util.ts
+11
-11
src/nni_manager/training_service/kubernetes/azureStorageClientUtils.ts
...er/training_service/kubernetes/azureStorageClientUtils.ts
+28
-28
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerApiClient.ts
...netes/frameworkcontroller/frameworkcontrollerApiClient.ts
+1
-1
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerConfig.ts
...bernetes/frameworkcontroller/frameworkcontrollerConfig.ts
+9
-9
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerJobInfoCollector.ts
...rameworkcontroller/frameworkcontrollerJobInfoCollector.ts
+4
-4
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerJobRestServer.ts
...s/frameworkcontroller/frameworkcontrollerJobRestServer.ts
+2
-2
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts
...frameworkcontroller/frameworkcontrollerTrainingService.ts
+37
-37
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowApiClient.ts
...training_service/kubernetes/kubeflow/kubeflowApiClient.ts
+3
-3
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowConfig.ts
...er/training_service/kubernetes/kubeflow/kubeflowConfig.ts
+8
-8
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowJobInfoCollector.ts
...g_service/kubernetes/kubeflow/kubeflowJobInfoCollector.ts
+7
-7
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowJobRestServer.ts
...ning_service/kubernetes/kubeflow/kubeflowJobRestServer.ts
+2
-2
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts
...ng_service/kubernetes/kubeflow/kubeflowTrainingService.ts
+44
-44
src/nni_manager/training_service/kubernetes/kubernetesApiClient.ts
...anager/training_service/kubernetes/kubernetesApiClient.ts
+7
-7
src/nni_manager/training_service/kubernetes/kubernetesConfig.ts
...i_manager/training_service/kubernetes/kubernetesConfig.ts
+9
-9
src/nni_manager/training_service/kubernetes/kubernetesData.ts
...nni_manager/training_service/kubernetes/kubernetesData.ts
+1
-1
src/nni_manager/training_service/kubernetes/kubernetesJobInfoCollector.ts
...training_service/kubernetes/kubernetesJobInfoCollector.ts
+1
-1
src/nni_manager/training_service/kubernetes/kubernetesJobRestServer.ts
...er/training_service/kubernetes/kubernetesJobRestServer.ts
+2
-2
No files found.
src/nni_manager/training_service/common/clusterJobRestServer.ts
View file @
ae7a72bc
...
...
@@ -32,7 +32,7 @@ import { Writable } from 'stream';
/**
* Cluster Job Training service Rest server, provides rest API to support Cluster job metrics update
*
*
*/
@
component
.
Singleton
export
abstract
class
ClusterJobRestServer
extends
RestServer
{
...
...
@@ -52,8 +52,8 @@ export abstract class ClusterJobRestServer extends RestServer{
super
();
const
basePort
:
number
=
getBasePort
();
assert
(
basePort
&&
basePort
>
1024
);
this
.
port
=
basePort
+
1
;
this
.
port
=
basePort
+
1
;
}
public
get
clusterRestServerPort
():
number
{
...
...
@@ -62,11 +62,11 @@ export abstract class ClusterJobRestServer extends RestServer{
}
return
this
.
port
;
}
public
get
getErrorMessage
():
string
|
undefined
{
return
this
.
errorMessage
;
}
public
set
setEnableVersionCheck
(
versionCheck
:
boolean
)
{
this
.
enableVersionCheck
=
versionCheck
;
}
...
...
src/nni_manager/training_service/common/containerJobData.ts
View file @
ae7a72bc
...
...
@@ -19,12 +19,12 @@
'
use strict
'
;
export
const
CONTAINER_INSTALL_NNI_SHELL_FORMAT
:
string
=
export
const
CONTAINER_INSTALL_NNI_SHELL_FORMAT
:
string
=
`#!/bin/bash
if python3 -c 'import nni' > /dev/null 2>&1; then
# nni module is already installed, skip
return
else
# Install nni
python3 -m pip install --user --upgrade nni
python3 -m pip install --user --upgrade nni
fi`
;
\ No newline at end of file
src/nni_manager/training_service/common/gpuData.ts
View file @
ae7a72bc
...
...
@@ -59,7 +59,7 @@ export class GPUSummary {
}
}
export
const
GPU_INFO_COLLECTOR_FORMAT_LINUX
:
string
=
export
const
GPU_INFO_COLLECTOR_FORMAT_LINUX
:
string
=
`
#!/bin/bash
export METRIC_OUTPUT_DIR={0}
...
...
@@ -67,7 +67,7 @@ echo $$ >{1}
python3 -m nni_gpu_tool.gpu_metrics_collector
`
export
const
GPU_INFO_COLLECTOR_FORMAT_WINDOWS
:
string
=
export
const
GPU_INFO_COLLECTOR_FORMAT_WINDOWS
:
string
=
`
$env:METRIC_OUTPUT_DIR="{0}"
$app = Start-Process "python" -ArgumentList "-m nni_gpu_tool.gpu_metrics_collector" -passthru -NoNewWindow
...
...
src/nni_manager/training_service/common/util.ts
View file @
ae7a72bc
...
...
@@ -34,7 +34,7 @@ import { file } from "../../node_modules/@types/tmp";
/**
* Validate codeDir, calculate file count recursively under codeDir, and throw error if any rule is broken
*
*
* @param codeDir codeDir in nni config file
* @returns file number under codeDir
*/
...
...
@@ -48,9 +48,9 @@ export async function validateCodeDir(codeDir: string) : Promise<number> {
}
if
(
fileCount
&&
fileCount
>
1000
)
{
const
errMessage
:
string
=
`Too many files(
${
fileCount
}
found}) in
${
codeDir
}
,`
const
errMessage
:
string
=
`Too many files(
${
fileCount
}
found}) in
${
codeDir
}
,`
+
` please check if it's a valid code dir`
;
throw
new
Error
(
errMessage
);
throw
new
Error
(
errMessage
);
}
return
fileCount
;
...
...
@@ -58,7 +58,7 @@ export async function validateCodeDir(codeDir: string) : Promise<number> {
/**
* create a new directory
* @param directory
* @param directory
*/
export
async
function
execMkdir
(
directory
:
string
):
Promise
<
void
>
{
if
(
process
.
platform
===
'
win32
'
)
{
...
...
@@ -85,7 +85,7 @@ export async function execCopydir(source: string, destination: string): Promise<
/**
* create a new file
* @param filename
* @param filename
*/
export
async
function
execNewFile
(
filename
:
string
):
Promise
<
void
>
{
if
(
process
.
platform
===
'
win32
'
)
{
...
...
@@ -110,7 +110,7 @@ export function execScript(filePath: string): cp.ChildProcess {
/**
* output the last line of a file
* @param filePath
* @param filePath
*/
export
async
function
execTail
(
filePath
:
string
):
Promise
<
cpp
.
childProcessPromise
.
Result
>
{
let
cmdresult
:
cpp
.
childProcessPromise
.
Result
;
...
...
@@ -124,7 +124,7 @@ export async function execTail(filePath: string): Promise<cpp.childProcessPromis
/**
* delete a directory
* @param directory
* @param directory
*/
export
async
function
execRemove
(
directory
:
string
):
Promise
<
void
>
{
if
(
process
.
platform
===
'
win32
'
)
{
...
...
@@ -137,7 +137,7 @@ export async function execRemove(directory: string): Promise<void> {
/**
* kill a process
* @param pid
* @param pid
*/
export
async
function
execKill
(
pid
:
string
):
Promise
<
void
>
{
if
(
process
.
platform
===
'
win32
'
)
{
...
...
@@ -151,7 +151,7 @@ export async function execKill(pid: string): Promise<void> {
/**
* set environment variable
* @param variable
* @returns command string
* @returns command string
*/
export
function
setEnvironmentVariable
(
variable
:
{
key
:
string
;
value
:
string
}):
string
{
if
(
process
.
platform
===
'
win32
'
)
{
...
...
@@ -191,7 +191,7 @@ export async function tarAdd(tar_path: string, source_path: string): Promise<voi
/**
* generate script file name
* @param fileNamePrefix
* @param fileNamePrefix
*/
export
function
getScriptName
(
fileNamePrefix
:
string
):
string
{
if
(
process
.
platform
===
'
win32
'
)
{
...
...
@@ -203,7 +203,7 @@ export function getScriptName(fileNamePrefix: string): string {
/**
* generate script file
* @param gpuMetricCollectorScriptFolder
* @param gpuMetricCollectorScriptFolder
*/
export
function
getgpuMetricsCollectorScriptContent
(
gpuMetricCollectorScriptFolder
:
string
):
string
{
if
(
process
.
platform
===
'
win32
'
)
{
...
...
src/nni_manager/training_service/kubernetes/azureStorageClientUtils.ts
View file @
ae7a72bc
...
...
@@ -26,11 +26,11 @@ import { getLogger } from '../../common/log';
import
{
mkDirP
}
from
'
../../common/utils
'
;
export
namespace
AzureStorageClientUtility
{
/**
* create azure share
* @param fileServerClient
* @param azureShare
* @param fileServerClient
* @param azureShare
*/
export
async
function
createShare
(
fileServerClient
:
any
,
azureShare
:
any
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
...
...
@@ -44,12 +44,12 @@ export namespace AzureStorageClientUtility {
})
return
deferred
.
promise
;
}
/**
* Create a new directory (NOT recursively) in azure file storage.
* @param fileServerClient
* @param azureFoler
* @param azureShare
* @param fileServerClient
* @param azureFoler
* @param azureShare
*/
export
async
function
createDirectory
(
fileServerClient
:
any
,
azureFoler
:
any
,
azureShare
:
any
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
...
...
@@ -67,7 +67,7 @@ export namespace AzureStorageClientUtility {
/**
* Create a new directory recursively in azure file storage
* @param fileServerClient
* @param azureDirectory
* @param azureDirectory
*/
export
async
function
createDirectoryRecursive
(
fileServerClient
:
any
,
azureDirectory
:
any
,
azureShare
:
any
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
...
...
@@ -81,14 +81,14 @@ export namespace AzureStorageClientUtility {
deferred
.
resolve
();
return
deferred
.
promise
;
}
/**
* upload a file to azure storage
* @param fileServerClient
* @param azureDirectory
* @param azureFileName
* @param azureShare
* @param localFilePath
* @param fileServerClient
* @param azureDirectory
* @param azureFileName
* @param azureShare
* @param localFilePath
*/
async
function
uploadFileToAzure
(
fileServerClient
:
any
,
azureDirectory
:
any
,
azureFileName
:
any
,
azureShare
:
any
,
localFilePath
:
any
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
...
...
@@ -96,20 +96,20 @@ export namespace AzureStorageClientUtility {
if
(
error
){
getLogger
().
error
(
`Upload file failed:,
${
error
}
`
);
deferred
.
reject
(
error
);
}
else
{
}
else
{
deferred
.
resolve
();
}
})
return
deferred
.
promise
;
}
/**
* download a file from azure storage
* @param fileServerClient
* @param azureDirectory
* @param azureFileName
* @param azureShare
* @param localFilePath
* @param fileServerClient
* @param azureDirectory
* @param azureFileName
* @param azureShare
* @param localFilePath
*/
async
function
downloadFile
(
fileServerClient
:
any
,
azureDirectory
:
any
,
azureFileName
:
any
,
azureShare
:
any
,
localFilePath
:
any
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
...
...
@@ -118,7 +118,7 @@ export namespace AzureStorageClientUtility {
getLogger
().
error
(
`Download file failed:,
${
error
}
`
);
deferred
.
reject
(
error
);
}
else
{
deferred
.
resolve
();
deferred
.
resolve
();
}
})
return
deferred
.
promise
;
...
...
@@ -153,13 +153,13 @@ export namespace AzureStorageClientUtility {
deferred
.
resolve
();
return
deferred
.
promise
;
}
/**
* download a directory from azure
* @param fileServerClient
* @param azureDirectory
* @param azureShare
* @param localDirectory
* @param fileServerClient
* @param azureDirectory
* @param azureShare
* @param localDirectory
*/
export
async
function
downloadDirectory
(
fileServerClient
:
any
,
azureDirectory
:
any
,
azureShare
:
any
,
localDirectory
:
any
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
...
...
@@ -184,7 +184,7 @@ export namespace AzureStorageClientUtility {
const
fullFilePath
:
string
=
path
.
join
(
localDirectory
,
fileName
.
name
);
downloadFile
(
fileServerClient
,
azureDirectory
,
fileName
.
name
,
azureShare
,
fullFilePath
)
}
for
(
var
directoryName
of
result
[
'
entries
'
][
'
directories
'
]){
const
fullDirectoryPath
:
string
=
path
.
join
(
localDirectory
,
directoryName
.
name
)
const
fullAzureDirectory
:
string
=
path
.
join
(
azureDirectory
,
directoryName
.
name
)
...
...
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerApiClient.ts
View file @
ae7a72bc
...
...
@@ -47,7 +47,7 @@ class FrameworkControllerClientV1 extends FrameworkControllerClient {
public
get
containerName
():
string
{
return
'
framework
'
;
}
}
}
export
{
FrameworkControllerClient
,
GeneralK8sClient
};
...
...
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerConfig.ts
View file @
ae7a72bc
...
...
@@ -40,8 +40,8 @@ export class FrameworkControllerTrialConfigTemplate extends KubernetesTrialConfi
public
readonly
frameworkAttemptCompletionPolicy
:
FrameworkAttemptCompletionPolicy
;
public
readonly
name
:
string
;
public
readonly
taskNum
:
number
;
constructor
(
taskNum
:
number
,
command
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
,
constructor
(
taskNum
:
number
,
command
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
,
frameworkAttemptCompletionPolicy
:
FrameworkAttemptCompletionPolicy
)
{
super
(
command
,
gpuNum
,
cpuNum
,
memoryMB
,
image
);
this
.
frameworkAttemptCompletionPolicy
=
frameworkAttemptCompletionPolicy
;
...
...
@@ -71,8 +71,8 @@ export class FrameworkControllerClusterConfig extends KubernetesClusterConfig {
export
class
FrameworkControllerClusterConfigNFS
extends
KubernetesClusterConfigNFS
{
public
readonly
serviceAccountName
:
string
;
constructor
(
serviceAccountName
:
string
,
apiVersion
:
string
,
serviceAccountName
:
string
,
apiVersion
:
string
,
nfs
:
NFSConfig
,
storage
?:
KubernetesStorageKind
)
{
...
...
@@ -94,12 +94,12 @@ export class FrameworkControllerClusterConfigNFS extends KubernetesClusterConfig
export
class
FrameworkControllerClusterConfigAzure
extends
KubernetesClusterConfigAzure
{
public
readonly
serviceAccountName
:
string
;
constructor
(
serviceAccountName
:
string
,
apiVersion
:
string
,
keyVault
:
keyVaultConfig
,
azureStorage
:
AzureStorage
,
serviceAccountName
:
string
,
apiVersion
:
string
,
keyVault
:
keyVaultConfig
,
azureStorage
:
AzureStorage
,
storage
?:
KubernetesStorageKind
)
{
super
(
apiVersion
,
keyVault
,
azureStorage
,
storage
);
...
...
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerJobInfoCollector.ts
View file @
ae7a72bc
...
...
@@ -32,7 +32,7 @@ export class FrameworkControllerJobInfoCollector extends KubernetesJobInfoCollec
super
(
jobMap
);
}
protected
async
retrieveSingleTrialJobInfo
(
kubernetesCRDClient
:
KubernetesCRDClient
|
undefined
,
protected
async
retrieveSingleTrialJobInfo
(
kubernetesCRDClient
:
KubernetesCRDClient
|
undefined
,
kubernetesTrialJob
:
KubernetesTrialJobDetail
)
:
Promise
<
void
>
{
if
(
!
this
.
statusesNeedToCheck
.
includes
(
kubernetesTrialJob
.
status
))
{
return
Promise
.
resolve
();
...
...
@@ -44,7 +44,7 @@ export class FrameworkControllerJobInfoCollector extends KubernetesJobInfoCollec
let
kubernetesJobInfo
:
any
;
try
{
kubernetesJobInfo
=
await
kubernetesCRDClient
.
getKubernetesJob
(
kubernetesTrialJob
.
kubernetesJobName
);
kubernetesJobInfo
=
await
kubernetesCRDClient
.
getKubernetesJob
(
kubernetesTrialJob
.
kubernetesJobName
);
}
catch
(
error
)
{
this
.
log
.
error
(
`Get job
${
kubernetesTrialJob
.
kubernetesJobName
}
info failed, error is
${
error
}
`
);
//This is not treated as an error status
...
...
@@ -71,9 +71,9 @@ export class FrameworkControllerJobInfoCollector extends KubernetesJobInfoCollec
break
;
case
'
Failed
'
:
kubernetesTrialJob
.
status
=
'
FAILED
'
;
break
;
break
;
}
kubernetesTrialJob
.
endTime
=
Date
.
parse
(
<
string
>
kubernetesJobInfo
.
status
.
completionTime
);
kubernetesTrialJob
.
endTime
=
Date
.
parse
(
<
string
>
kubernetesJobInfo
.
status
.
completionTime
);
break
;
default
:
break
;
...
...
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerJobRestServer.ts
View file @
ae7a72bc
...
...
@@ -25,11 +25,11 @@ import { KubernetesJobRestServer } from '../kubernetesJobRestServer'
/**
* frameworkcontroller Training service Rest server, provides rest API to support frameworkcontroller job metrics update
*
*
*/
@
component
.
Singleton
export
class
FrameworkControllerJobRestServer
extends
KubernetesJobRestServer
{
constructor
()
{
super
(
component
.
get
(
FrameworkControllerTrainingService
));
}
}
}
\ No newline at end of file
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts
View file @
ae7a72bc
...
...
@@ -37,7 +37,7 @@ import { KubernetesTrialJobDetail } from '../kubernetesData';
import
{
validateCodeDir
}
from
'
../../common/util
'
;
import
{
AzureStorageClientUtility
}
from
'
../azureStorageClientUtils
'
;
import
{
KubernetesTrainingService
}
from
'
../kubernetesTrainingService
'
;
import
{
FrameworkControllerTrialConfig
,
FrameworkControllerClusterConfig
,
FrameworkControllerClusterConfigAzure
,
FrameworkControllerClusterConfigNFS
,
import
{
FrameworkControllerTrialConfig
,
FrameworkControllerClusterConfig
,
FrameworkControllerClusterConfigAzure
,
FrameworkControllerClusterConfigNFS
,
FrameworkControllerClusterConfigFactory
}
from
'
./frameworkcontrollerConfig
'
;
import
{
FrameworkControllerJobRestServer
}
from
'
./frameworkcontrollerJobRestServer
'
;
import
{
FrameworkControllerClient
}
from
'
./frameworkcontrollerApiClient
'
;
...
...
@@ -56,7 +56,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
constructor
()
{
super
();
this
.
fcJobInfoCollector
=
new
FrameworkControllerJobInfoCollector
(
this
.
trialJobsMap
);
this
.
experimentId
=
getExperimentId
();
this
.
experimentId
=
getExperimentId
();
this
.
nextTrialSequenceId
=
-
1
;
}
...
...
@@ -69,7 +69,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
this
.
kubernetesJobRestServer
.
setEnableVersionCheck
=
this
.
versionCheck
;
this
.
log
.
info
(
`frameworkcontroller Training service rest server listening on:
${
this
.
kubernetesJobRestServer
.
endPoint
}
`
);
while
(
!
this
.
stopping
)
{
// collect metrics for frameworkcontroller jobs by interacting with Kubernetes API server
// collect metrics for frameworkcontroller jobs by interacting with Kubernetes API server
await
delay
(
3000
);
await
this
.
fcJobInfoCollector
.
retrieveTrialStatus
(
this
.
kubernetesCRDClient
);
if
(
this
.
kubernetesJobRestServer
.
getErrorMessage
)
{
...
...
@@ -101,10 +101,10 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
//Generate the port used for taskRole
this
.
generateContainerPort
();
await
this
.
prepareRunScript
(
trialLocalTempFolder
,
curTrialSequenceId
,
trialJobId
,
trialWorkingFolder
,
form
);
//upload code files
let
trialJobOutputUrl
:
string
=
await
this
.
uploadCodeFiles
(
trialJobId
,
trialLocalTempFolder
);
const
trialJobDetail
:
KubernetesTrialJobDetail
=
new
KubernetesTrialJobDetail
(
trialJobId
,
'
WAITING
'
,
...
...
@@ -116,14 +116,14 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
trialJobOutputUrl
);
// Set trial job detail until create frameworkcontroller job successfully
// Set trial job detail until create frameworkcontroller job successfully
this
.
trialJobsMap
.
set
(
trialJobId
,
trialJobDetail
);
// Create frameworkcontroller job based on generated frameworkcontroller job resource config
const
frameworkcontrollerJobConfig
=
await
this
.
prepareFrameworkControllerConfig
(
trialJobId
,
trialWorkingFolder
,
frameworkcontrollerJobName
);
await
this
.
kubernetesCRDClient
.
createKubernetesJob
(
frameworkcontrollerJobConfig
);
// Set trial job detail until create frameworkcontroller job successfully
// Set trial job detail until create frameworkcontroller job successfully
this
.
trialJobsMap
.
set
(
trialJobId
,
trialJobDetail
);
return
Promise
.
resolve
(
trialJobDetail
);
...
...
@@ -131,8 +131,8 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
/**
* upload code files to nfs or azureStorage
* @param trialJobId
* @param trialLocalTempFolder
* @param trialJobId
* @param trialLocalTempFolder
* return: trialJobOutputUrl
*/
private
async
uploadCodeFiles
(
trialJobId
:
string
,
trialLocalTempFolder
:
string
):
Promise
<
string
>
{
...
...
@@ -145,7 +145,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
if
(
this
.
fcClusterConfig
.
storageType
===
'
azureStorage
'
)
{
try
{
//upload local files to azure storage
await
AzureStorageClientUtility
.
uploadDirectory
(
this
.
azureStorageClient
,
await
AzureStorageClientUtility
.
uploadDirectory
(
this
.
azureStorageClient
,
`nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
,
this
.
azureStorageShare
,
`
${
trialLocalTempFolder
}
`
);
trialJobOutputUrl
=
`https://
${
this
.
azureStorageAccountName
}
.file.core.windows.net/
${
this
.
azureStorageShare
}
/
${
path
.
join
(
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
...
...
@@ -155,21 +155,21 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
}
}
else
if
(
this
.
fcClusterConfig
.
storageType
===
'
nfs
'
)
{
let
nfsFrameworkControllerClusterConfig
:
FrameworkControllerClusterConfigNFS
=
<
FrameworkControllerClusterConfigNFS
>
this
.
fcClusterConfig
;
// Create work dir for current trial in NFS directory
// Create work dir for current trial in NFS directory
await
cpp
.
exec
(
`mkdir -p
${
this
.
trialLocalNFSTempFolder
}
/nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
);
// Copy code files from local dir to NFS mounted dir
await
cpp
.
exec
(
`cp -r
${
trialLocalTempFolder
}
/*
${
this
.
trialLocalNFSTempFolder
}
/nni/
${
getExperimentId
()}
/
${
trialJobId
}
/.`
);
const
nfsConfig
:
NFSConfig
=
nfsFrameworkControllerClusterConfig
.
nfs
;
trialJobOutputUrl
=
`nfs://
${
nfsConfig
.
server
}
:
${
path
.
join
(
nfsConfig
.
path
,
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
}
return
Promise
.
resolve
(
trialJobOutputUrl
);
}
/**
* generate trial's command for frameworkcontroller
* expose port and execute injector.sh before executing user's command
* @param command
* @param command
*/
private
generateCommandScript
(
command
:
string
):
string
{
let
portScript
=
''
;
...
...
@@ -181,7 +181,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
}
return
`
${
portScript
}
. /mnt/frameworkbarrier/injector.sh &&
${
command
}
`
;
}
private
async
prepareRunScript
(
trialLocalTempFolder
:
string
,
curTrialSequenceId
:
number
,
trialJobId
:
string
,
trialWorkingFolder
:
string
,
form
:
JobApplicationForm
):
Promise
<
void
>
{
if
(
!
this
.
fcTrialConfig
)
{
throw
new
Error
(
'
frameworkcontroller trial config is not initialized
'
);
...
...
@@ -196,7 +196,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
await
cpp
.
exec
(
`mkdir -p
${
trialLocalTempFolder
}
`
);
for
(
let
taskRole
of
this
.
fcTrialConfig
.
taskRoles
)
{
const
runScriptContent
:
string
=
await
this
.
generateRunScript
(
'
frameworkcontroller
'
,
trialJobId
,
trialWorkingFolder
,
const
runScriptContent
:
string
=
await
this
.
generateRunScript
(
'
frameworkcontroller
'
,
trialJobId
,
trialWorkingFolder
,
this
.
generateCommandScript
(
taskRole
.
command
),
curTrialSequenceId
.
toString
(),
taskRole
.
name
,
taskRole
.
gpuNum
);
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
`run_
${
taskRole
.
name
}
.sh`
),
runScriptContent
,
{
encoding
:
'
utf8
'
});
}
...
...
@@ -204,11 +204,11 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
// Write file content ( parameter.cfg ) to local tmp folders
const
trialForm
:
TrialJobApplicationForm
=
(
<
TrialJobApplicationForm
>
form
)
if
(
trialForm
&&
trialForm
.
hyperParameters
)
{
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
generateParamFileName
(
trialForm
.
hyperParameters
)),
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
generateParamFileName
(
trialForm
.
hyperParameters
)),
trialForm
.
hyperParameters
.
value
,
{
encoding
:
'
utf8
'
});
}
}
private
async
prepareFrameworkControllerConfig
(
trialJobId
:
string
,
trialWorkingFolder
:
string
,
frameworkcontrollerJobName
:
string
):
Promise
<
any
>
{
if
(
!
this
.
fcTrialConfig
)
{
...
...
@@ -222,18 +222,18 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
resource
.
limits
=
Object
.
assign
({},
resource
.
requests
);
podResources
.
push
(
resource
);
}
// Generate frameworkcontroller job resource config object
// Generate frameworkcontroller job resource config object
const
frameworkcontrollerJobConfig
:
any
=
this
.
generateFrameworkControllerJobConfig
(
trialJobId
,
trialWorkingFolder
,
frameworkcontrollerJobName
,
podResources
);
return
Promise
.
resolve
(
frameworkcontrollerJobConfig
);
}
}
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
switch
(
key
)
{
case
TrialConfigMetadataKey
.
NNI_MANAGER_IP
:
this
.
nniManagerIpConfig
=
<
NNIManagerIpConfig
>
JSON
.
parse
(
value
);
break
;
case
TrialConfigMetadataKey
.
FRAMEWORKCONTROLLER_CLUSTER_CONFIG
:
let
frameworkcontrollerClusterJsonObject
=
JSON
.
parse
(
value
);
this
.
fcClusterConfig
=
FrameworkControllerClusterConfigFactory
.
generateFrameworkControllerClusterConfig
(
frameworkcontrollerClusterJsonObject
);
...
...
@@ -253,7 +253,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
nfsFrameworkControllerClusterConfig
.
nfs
.
server
,
nfsFrameworkControllerClusterConfig
.
nfs
.
path
);
}
}
this
.
kubernetesCRDClient
=
FrameworkControllerClient
.
generateFrameworkControllerClient
();
break
;
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
...
...
@@ -269,7 +269,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
await
validateCodeDir
(
this
.
fcTrialConfig
.
codeDir
);
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
return
Promise
.
reject
(
new
Error
(
error
));
return
Promise
.
reject
(
new
Error
(
error
));
}
break
;
case
TrialConfigMetadataKey
.
VERSION_CHECK
:
...
...
@@ -284,7 +284,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
return
Promise
.
resolve
();
}
private
generateContainerPort
()
{
if
(
!
this
.
fcTrialConfig
)
{
throw
new
Error
(
'
frameworkcontroller trial config is not initialized
'
);
...
...
@@ -312,7 +312,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
if
(
!
this
.
fcTrialConfig
)
{
throw
new
Error
(
'
frameworkcontroller trial config is not initialized
'
);
}
let
taskRoles
=
[];
for
(
let
index
in
this
.
fcTrialConfig
.
taskRoles
)
{
let
containerPort
=
this
.
fcContainerPortMap
.
get
(
this
.
fcTrialConfig
.
taskRoles
[
index
].
name
);
...
...
@@ -320,8 +320,8 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
throw
new
Error
(
'
Container port is not initialized
'
);
}
let
taskRole
=
this
.
generateTaskRoleConfig
(
trialWorkingFolder
,
this
.
fcTrialConfig
.
taskRoles
[
index
].
image
,
trialWorkingFolder
,
this
.
fcTrialConfig
.
taskRoles
[
index
].
image
,
`run_
${
this
.
fcTrialConfig
.
taskRoles
[
index
].
name
}
.sh`
,
podResources
[
index
],
containerPort
...
...
@@ -330,17 +330,17 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
name
:
this
.
fcTrialConfig
.
taskRoles
[
index
].
name
,
taskNumber
:
this
.
fcTrialConfig
.
taskRoles
[
index
].
taskNum
,
frameworkAttemptCompletionPolicy
:
{
minFailedTaskCount
:
this
.
fcTrialConfig
.
taskRoles
[
index
].
frameworkAttemptCompletionPolicy
.
minFailedTaskCount
,
minFailedTaskCount
:
this
.
fcTrialConfig
.
taskRoles
[
index
].
frameworkAttemptCompletionPolicy
.
minFailedTaskCount
,
minSucceededTaskCount
:
this
.
fcTrialConfig
.
taskRoles
[
index
].
frameworkAttemptCompletionPolicy
.
minSucceededTaskCount
},
task
:
taskRole
});
}
return
{
apiVersion
:
`frameworkcontroller.microsoft.com/v1`
,
kind
:
'
Framework
'
,
metadata
:
{
metadata
:
{
name
:
frameworkcontrollerJobName
,
namespace
:
'
default
'
,
labels
:
{
...
...
@@ -356,7 +356,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
};
}
private
generateTaskRoleConfig
(
trialWorkingFolder
:
string
,
replicaImage
:
string
,
runScriptFile
:
string
,
podResources
:
any
,
containerPort
:
number
):
any
{
if
(
!
this
.
fcClusterConfig
)
{
...
...
@@ -366,7 +366,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
if
(
!
this
.
fcTrialConfig
)
{
throw
new
Error
(
'
frameworkcontroller trial config is not initialized
'
);
}
let
volumeSpecMap
=
new
Map
<
string
,
object
>
();
if
(
this
.
fcClusterConfig
.
storageType
===
'
azureStorage
'
){
volumeSpecMap
.
set
(
'
nniVolumes
'
,
[
...
...
@@ -395,7 +395,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
emptyDir
:
{}
}])
}
let
containers
=
[
{
name
:
'
framework
'
,
...
...
@@ -420,7 +420,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
name
:
'
frameworkbarrier
'
,
image
:
'
frameworkcontroller/frameworkbarrier
'
,
volumeMounts
:
[
{
{
name
:
'
frameworkbarrier-volume
'
,
mountPath
:
'
/mnt/frameworkbarrier
'
}]
...
...
@@ -432,8 +432,8 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
volumes
:
volumeSpecMap
.
get
(
'
nniVolumes
'
),
hostNetwork
:
false
};
if
(
this
.
fcClusterConfig
.
serviceAccountName
)
{
spec
.
serviceAccountName
=
this
.
fcClusterConfig
.
serviceAccountName
;
if
(
this
.
fcClusterConfig
.
serviceAccountName
)
{
spec
.
serviceAccountName
=
this
.
fcClusterConfig
.
serviceAccountName
;
}
let
taskRole
=
{
pod
:
{
...
...
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowApiClient.ts
View file @
ae7a72bc
...
...
@@ -27,7 +27,7 @@ abstract class KubeflowOperatorClient extends KubernetesCRDClient{
/**
* Factory method to generate operator client
*/
public
static
generateOperatorClient
(
kubeflowOperator
:
KubeflowOperator
,
public
static
generateOperatorClient
(
kubeflowOperator
:
KubeflowOperator
,
operatorApiVersion
:
string
):
KubernetesCRDClient
{
switch
(
kubeflowOperator
)
{
case
'
tf-operator
'
:
{
...
...
@@ -78,7 +78,7 @@ class TFOperatorClientV1Alpha2 extends KubeflowOperatorClient {
public
get
containerName
():
string
{
return
'
tensorflow
'
;
}
}
}
class
TFOperatorClientV1Beta1
extends
KubernetesCRDClient
{
...
...
@@ -97,7 +97,7 @@ class TFOperatorClientV1Beta1 extends KubernetesCRDClient {
public
get
containerName
():
string
{
return
'
tensorflow
'
;
}
}
}
class
TFOperatorClientV1Beta2
extends
KubernetesCRDClient
{
...
...
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowConfig.ts
View file @
ae7a72bc
...
...
@@ -41,8 +41,8 @@ export class KubeflowClusterConfig extends KubernetesClusterConfig {
export
class
KubeflowClusterConfigNFS
extends
KubernetesClusterConfigNFS
{
public
readonly
operator
:
KubeflowOperator
;
constructor
(
operator
:
KubeflowOperator
,
apiVersion
:
string
,
operator
:
KubeflowOperator
,
apiVersion
:
string
,
nfs
:
NFSConfig
,
storage
?:
KubernetesStorageKind
)
{
...
...
@@ -68,12 +68,12 @@ export class KubeflowClusterConfigNFS extends KubernetesClusterConfigNFS {
export
class
KubeflowClusterConfigAzure
extends
KubernetesClusterConfigAzure
{
public
readonly
operator
:
KubeflowOperator
;
constructor
(
operator
:
KubeflowOperator
,
apiVersion
:
string
,
keyVault
:
keyVaultConfig
,
azureStorage
:
AzureStorage
,
operator
:
KubeflowOperator
,
apiVersion
:
string
,
keyVault
:
keyVaultConfig
,
azureStorage
:
AzureStorage
,
storage
?:
KubernetesStorageKind
)
{
super
(
apiVersion
,
keyVault
,
azureStorage
,
storage
);
...
...
@@ -124,7 +124,7 @@ export class KubeflowTrialConfig extends KubernetesTrialConfig {
export
class
KubeflowTrialConfigTemplate
extends
KubernetesTrialConfigTemplate
{
public
readonly
replicas
:
number
;
constructor
(
replicas
:
number
,
command
:
string
,
gpuNum
:
number
,
constructor
(
replicas
:
number
,
command
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
)
{
super
(
command
,
gpuNum
,
cpuNum
,
memoryMB
,
image
);
this
.
replicas
=
replicas
;
...
...
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowJobInfoCollector.ts
View file @
ae7a72bc
...
...
@@ -32,7 +32,7 @@ export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector{
super
(
jobMap
);
}
protected
async
retrieveSingleTrialJobInfo
(
kubernetesCRDClient
:
KubernetesCRDClient
|
undefined
,
protected
async
retrieveSingleTrialJobInfo
(
kubernetesCRDClient
:
KubernetesCRDClient
|
undefined
,
kubernetesTrialJob
:
KubernetesTrialJobDetail
)
:
Promise
<
void
>
{
if
(
!
this
.
statusesNeedToCheck
.
includes
(
kubernetesTrialJob
.
status
))
{
return
Promise
.
resolve
();
...
...
@@ -44,9 +44,9 @@ export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector{
let
kubernetesJobInfo
:
any
;
try
{
kubernetesJobInfo
=
await
kubernetesCRDClient
.
getKubernetesJob
(
kubernetesTrialJob
.
kubernetesJobName
);
kubernetesJobInfo
=
await
kubernetesCRDClient
.
getKubernetesJob
(
kubernetesTrialJob
.
kubernetesJobName
);
}
catch
(
error
)
{
// Notice: it maynot be a 'real' error since cancel trial job can also cause getKubernetesJob failed.
// Notice: it maynot be a 'real' error since cancel trial job can also cause getKubernetesJob failed.
this
.
log
.
error
(
`Get job
${
kubernetesTrialJob
.
kubernetesJobName
}
info failed, error is
${
error
}
`
);
//This is not treat as a error status
return
Promise
.
resolve
();
...
...
@@ -58,8 +58,8 @@ export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector{
switch
(
tfJobType
)
{
case
'
Created
'
:
kubernetesTrialJob
.
status
=
'
WAITING
'
;
kubernetesTrialJob
.
startTime
=
Date
.
parse
(
<
string
>
latestCondition
.
lastUpdateTime
);
break
;
kubernetesTrialJob
.
startTime
=
Date
.
parse
(
<
string
>
latestCondition
.
lastUpdateTime
);
break
;
case
'
Running
'
:
kubernetesTrialJob
.
status
=
'
RUNNING
'
;
if
(
!
kubernetesTrialJob
.
startTime
)
{
...
...
@@ -68,11 +68,11 @@ export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector{
break
;
case
'
Failed
'
:
kubernetesTrialJob
.
status
=
'
FAILED
'
;
kubernetesTrialJob
.
endTime
=
Date
.
parse
(
<
string
>
latestCondition
.
lastUpdateTime
);
kubernetesTrialJob
.
endTime
=
Date
.
parse
(
<
string
>
latestCondition
.
lastUpdateTime
);
break
;
case
'
Succeeded
'
:
kubernetesTrialJob
.
status
=
'
SUCCEEDED
'
;
kubernetesTrialJob
.
endTime
=
Date
.
parse
(
<
string
>
latestCondition
.
lastUpdateTime
);
kubernetesTrialJob
.
endTime
=
Date
.
parse
(
<
string
>
latestCondition
.
lastUpdateTime
);
break
;
default
:
break
;
...
...
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowJobRestServer.ts
View file @
ae7a72bc
...
...
@@ -25,7 +25,7 @@ import { KubernetesJobRestServer } from '../kubernetesJobRestServer'
/**
* Kubeflow Training service Rest server, provides rest API to support kubeflow job metrics update
*
*
*/
@
component
.
Singleton
export
class
KubeflowJobRestServer
extends
KubernetesJobRestServer
{
...
...
@@ -34,5 +34,5 @@ export class KubeflowJobRestServer extends KubernetesJobRestServer{
*/
constructor
()
{
super
(
component
.
get
(
KubeflowTrainingService
));
}
}
}
\ No newline at end of file
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts
View file @
ae7a72bc
...
...
@@ -57,9 +57,9 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
private
kubeflowJobInfoCollector
:
KubeflowJobInfoCollector
;
constructor
()
{
super
();
super
();
this
.
kubeflowJobInfoCollector
=
new
KubeflowJobInfoCollector
(
this
.
trialJobsMap
);
this
.
experimentId
=
getExperimentId
();
this
.
experimentId
=
getExperimentId
();
this
.
nextTrialSequenceId
=
-
1
;
this
.
log
.
info
(
'
Construct Kubeflow training service.
'
);
}
...
...
@@ -74,7 +74,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
this
.
kubernetesJobRestServer
.
setEnableVersionCheck
=
this
.
versionCheck
;
this
.
log
.
info
(
`Kubeflow Training service rest server listening on:
${
this
.
kubernetesJobRestServer
.
endPoint
}
`
);
while
(
!
this
.
stopping
)
{
// collect metrics for Kubeflow jobs by interacting with Kubernetes API server
// collect metrics for Kubeflow jobs by interacting with Kubernetes API server
await
delay
(
3000
);
await
this
.
kubeflowJobInfoCollector
.
retrieveTrialStatus
(
this
.
kubernetesCRDClient
);
if
(
this
.
kubernetesJobRestServer
.
getErrorMessage
)
{
...
...
@@ -113,22 +113,22 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
curTrialSequenceId
,
trialJobOutputUrl
);
// Generate kubeflow job resource config object
// Generate kubeflow job resource config object
const
kubeflowJobConfig
:
any
=
await
this
.
prepareKubeflowConfig
(
trialJobId
,
trialWorkingFolder
,
kubeflowJobName
);
// Create kubeflow job based on generated kubeflow job resource config
await
this
.
kubernetesCRDClient
.
createKubernetesJob
(
kubeflowJobConfig
);
// Set trial job detail until create Kubeflow job successfully
// Set trial job detail until create Kubeflow job successfully
this
.
trialJobsMap
.
set
(
trialJobId
,
trialJobDetail
);
return
Promise
.
resolve
(
trialJobDetail
);
}
/**
* upload code files to nfs or azureStroage
* @param trialJobId
* @param trialLocalTempFolder
* @param trialJobId
* @param trialLocalTempFolder
* return: trialJobOutputUrl
*/
private
async
uploadCodeFiles
(
trialJobId
:
string
,
trialLocalTempFolder
:
string
):
Promise
<
string
>
{
...
...
@@ -138,14 +138,14 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
let
trialJobOutputUrl
:
string
=
''
;
assert
(
!
this
.
kubeflowClusterConfig
.
storage
||
this
.
kubeflowClusterConfig
.
storage
===
'
azureStorage
'
assert
(
!
this
.
kubeflowClusterConfig
.
storage
||
this
.
kubeflowClusterConfig
.
storage
===
'
azureStorage
'
||
this
.
kubeflowClusterConfig
.
storage
===
'
nfs
'
);
if
(
this
.
kubeflowClusterConfig
.
storage
===
'
azureStorage
'
)
{
try
{
//upload local files to azure storage
await
AzureStorageClientUtility
.
uploadDirectory
(
this
.
azureStorageClient
,
await
AzureStorageClientUtility
.
uploadDirectory
(
this
.
azureStorageClient
,
`nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
,
this
.
azureStorageShare
,
`
${
trialLocalTempFolder
}
`
);
trialJobOutputUrl
=
`https://
${
this
.
azureStorageAccountName
}
.file.core.windows.net/
${
this
.
azureStorageShare
}
/
${
path
.
join
(
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
...
...
@@ -155,18 +155,18 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
}
}
else
if
(
this
.
kubeflowClusterConfig
.
storage
===
'
nfs
'
||
this
.
kubeflowClusterConfig
.
storage
===
undefined
)
{
let
nfsKubeflowClusterConfig
:
KubeflowClusterConfigNFS
=
<
KubeflowClusterConfigNFS
>
this
.
kubeflowClusterConfig
;
// Creat work dir for current trial in NFS directory
// Creat work dir for current trial in NFS directory
await
cpp
.
exec
(
`mkdir -p
${
this
.
trialLocalNFSTempFolder
}
/nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
);
// Copy code files from local dir to NFS mounted dir
await
cpp
.
exec
(
`cp -r
${
trialLocalTempFolder
}
/*
${
this
.
trialLocalNFSTempFolder
}
/nni/
${
getExperimentId
()}
/
${
trialJobId
}
/.`
);
const
nfsConfig
:
NFSConfig
=
nfsKubeflowClusterConfig
.
nfs
;
trialJobOutputUrl
=
`nfs://
${
nfsConfig
.
server
}
:
${
path
.
join
(
nfsConfig
.
path
,
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
}
return
Promise
.
resolve
(
trialJobOutputUrl
);
}
private
async
prepareRunScript
(
trialLocalTempFolder
:
string
,
trialJobId
:
string
,
trialWorkingFolder
:
string
,
curTrialSequenceId
:
number
,
form
:
JobApplicationForm
):
Promise
<
void
>
{
if
(
!
this
.
kubeflowClusterConfig
)
{
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
...
...
@@ -181,7 +181,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
}
else
{
throw
Error
(
`operator
${
this
.
kubeflowClusterConfig
.
operator
}
is invalid`
)
}
//create tmp trial working folder locally.
await
cpp
.
exec
(
`mkdir -p
${
path
.
dirname
(
trialLocalTempFolder
)}
`
);
await
cpp
.
exec
(
`cp -r
${
kubeflowTrialConfig
.
codeDir
}
${
trialLocalTempFolder
}
`
);
...
...
@@ -193,7 +193,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
// Write worker file content run_worker.sh to local tmp folders
if
(
kubeflowTrialConfig
.
worker
)
{
const
workerRunScriptContent
:
string
=
await
this
.
generateRunScript
(
'
kubeflow
'
,
trialJobId
,
trialWorkingFolder
,
const
workerRunScriptContent
:
string
=
await
this
.
generateRunScript
(
'
kubeflow
'
,
trialJobId
,
trialWorkingFolder
,
kubeflowTrialConfig
.
worker
.
command
,
curTrialSequenceId
.
toString
(),
'
worker
'
,
kubeflowTrialConfig
.
worker
.
gpuNum
);
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
run_worker.sh
'
),
workerRunScriptContent
,
{
encoding
:
'
utf8
'
});
...
...
@@ -202,7 +202,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
let
tensorflowTrialConfig
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
if
(
tensorflowTrialConfig
.
ps
){
const
psRunScriptContent
:
string
=
await
this
.
generateRunScript
(
'
kubeflow
'
,
trialJobId
,
trialWorkingFolder
,
const
psRunScriptContent
:
string
=
await
this
.
generateRunScript
(
'
kubeflow
'
,
trialJobId
,
trialWorkingFolder
,
tensorflowTrialConfig
.
ps
.
command
,
curTrialSequenceId
.
toString
(),
'
ps
'
,
tensorflowTrialConfig
.
ps
.
gpuNum
);
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
run_ps.sh
'
),
psRunScriptContent
,
{
encoding
:
'
utf8
'
});
}
...
...
@@ -210,7 +210,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
else
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
pytorch-operator
'
)
{
let
pytorchTrialConfig
:
KubeflowTrialConfigPytorch
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
if
(
pytorchTrialConfig
.
master
){
const
masterRunScriptContent
:
string
=
await
this
.
generateRunScript
(
'
kubeflow
'
,
trialJobId
,
trialWorkingFolder
,
const
masterRunScriptContent
:
string
=
await
this
.
generateRunScript
(
'
kubeflow
'
,
trialJobId
,
trialWorkingFolder
,
pytorchTrialConfig
.
master
.
command
,
curTrialSequenceId
.
toString
(),
'
master
'
,
pytorchTrialConfig
.
master
.
gpuNum
);
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
run_master.sh
'
),
masterRunScriptContent
,
{
encoding
:
'
utf8
'
});
}
...
...
@@ -218,11 +218,11 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
// Write file content ( parameter.cfg ) to local tmp folders
const
trialForm
:
TrialJobApplicationForm
=
(
<
TrialJobApplicationForm
>
form
)
if
(
trialForm
&&
trialForm
.
hyperParameters
)
{
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
generateParamFileName
(
trialForm
.
hyperParameters
)),
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
generateParamFileName
(
trialForm
.
hyperParameters
)),
trialForm
.
hyperParameters
.
value
,
{
encoding
:
'
utf8
'
});
}
}
private
async
prepareKubeflowConfig
(
trialJobId
:
string
,
trialWorkingFolder
:
string
,
kubeflowJobName
:
string
):
Promise
<
any
>
{
if
(
!
this
.
kubeflowClusterConfig
)
{
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
...
...
@@ -241,10 +241,10 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
}
else
{
throw
Error
(
`operator
${
this
.
kubeflowClusterConfig
.
operator
}
is invalid`
)
}
const
workerPodResources
:
any
=
{};
if
(
kubeflowTrialConfig
.
worker
)
{
workerPodResources
.
requests
=
this
.
generatePodResource
(
kubeflowTrialConfig
.
worker
.
memoryMB
,
kubeflowTrialConfig
.
worker
.
cpuNum
,
workerPodResources
.
requests
=
this
.
generatePodResource
(
kubeflowTrialConfig
.
worker
.
memoryMB
,
kubeflowTrialConfig
.
worker
.
cpuNum
,
kubeflowTrialConfig
.
worker
.
gpuNum
)
}
workerPodResources
.
limits
=
Object
.
assign
({},
workerPodResources
.
requests
);
...
...
@@ -253,30 +253,30 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
let
tensorflowTrialConfig
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
if
(
tensorflowTrialConfig
.
ps
)
{
nonWorkerResources
.
requests
=
this
.
generatePodResource
(
tensorflowTrialConfig
.
ps
.
memoryMB
,
tensorflowTrialConfig
.
ps
.
cpuNum
,
nonWorkerResources
.
requests
=
this
.
generatePodResource
(
tensorflowTrialConfig
.
ps
.
memoryMB
,
tensorflowTrialConfig
.
ps
.
cpuNum
,
tensorflowTrialConfig
.
ps
.
gpuNum
)
nonWorkerResources
.
limits
=
Object
.
assign
({},
nonWorkerResources
.
requests
);
nonWorkerResources
.
limits
=
Object
.
assign
({},
nonWorkerResources
.
requests
);
}
}
else
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
pytorch-operator
'
){
let
pyTorchTrialConfig
:
KubeflowTrialConfigPytorch
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
nonWorkerResources
.
requests
=
this
.
generatePodResource
(
pyTorchTrialConfig
.
master
.
memoryMB
,
pyTorchTrialConfig
.
master
.
cpuNum
,
nonWorkerResources
.
requests
=
this
.
generatePodResource
(
pyTorchTrialConfig
.
master
.
memoryMB
,
pyTorchTrialConfig
.
master
.
cpuNum
,
pyTorchTrialConfig
.
master
.
gpuNum
)
nonWorkerResources
.
limits
=
Object
.
assign
({},
nonWorkerResources
.
requests
);
}
nonWorkerResources
.
limits
=
Object
.
assign
({},
nonWorkerResources
.
requests
);
// Generate kubeflow job resource config object
}
// Generate kubeflow job resource config object
const
kubeflowJobConfig
:
any
=
this
.
generateKubeflowJobConfig
(
trialJobId
,
trialWorkingFolder
,
kubeflowJobName
,
workerPodResources
,
nonWorkerResources
);
return
Promise
.
resolve
(
kubeflowJobConfig
);
}
}
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
switch
(
key
)
{
case
TrialConfigMetadataKey
.
NNI_MANAGER_IP
:
this
.
nniManagerIpConfig
=
<
NNIManagerIpConfig
>
JSON
.
parse
(
value
);
break
;
case
TrialConfigMetadataKey
.
KUBEFLOW_CLUSTER_CONFIG
:
let
kubeflowClusterJsonObject
=
JSON
.
parse
(
value
);
this
.
kubeflowClusterConfig
=
KubeflowClusterConfigFactory
.
generateKubeflowClusterConfig
(
kubeflowClusterJsonObject
);
...
...
@@ -296,7 +296,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
nfsKubeflowClusterConfig
.
nfs
.
server
,
nfsKubeflowClusterConfig
.
nfs
.
path
);
}
}
this
.
kubernetesCRDClient
=
KubeflowOperatorClient
.
generateOperatorClient
(
this
.
kubeflowClusterConfig
.
operator
,
this
.
kubeflowClusterConfig
.
apiVersion
);
break
;
...
...
@@ -304,13 +304,13 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
if
(
!
this
.
kubeflowClusterConfig
){
this
.
log
.
error
(
'
kubeflow cluster config is not initialized
'
);
return
Promise
.
reject
(
new
Error
(
'
kubeflow cluster config is not initialized
'
));
return
Promise
.
reject
(
new
Error
(
'
kubeflow cluster config is not initialized
'
));
}
assert
(
this
.
kubeflowClusterConfig
!==
undefined
)
let
kubeflowTrialJsonObjsect
=
JSON
.
parse
(
value
);
this
.
kubeflowTrialConfig
=
KubeflowTrialConfigFactory
.
generateKubeflowTrialConfig
(
kubeflowTrialJsonObjsect
,
kubeflowTrialJsonObjsect
,
this
.
kubeflowClusterConfig
.
operator
);
...
...
@@ -319,7 +319,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
await
validateCodeDir
(
this
.
kubeflowTrialConfig
.
codeDir
);
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
return
Promise
.
reject
(
new
Error
(
error
));
return
Promise
.
reject
(
new
Error
(
error
));
}
break
;
case
TrialConfigMetadataKey
.
VERSION_CHECK
:
...
...
@@ -361,11 +361,11 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
if
(
this
.
kubeflowTrialConfig
.
operatorType
===
'
tf-operator
'
)
{
let
tensorflowTrialConfig
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
replicaSpecsObj
.
Worker
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
tensorflowTrialConfig
.
worker
.
replicas
,
replicaSpecsObj
.
Worker
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
tensorflowTrialConfig
.
worker
.
replicas
,
tensorflowTrialConfig
.
worker
.
image
,
'
run_worker.sh
'
,
workerPodResources
);
if
(
tensorflowTrialConfig
.
ps
){
replicaSpecsObj
.
Ps
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
tensorflowTrialConfig
.
ps
.
replicas
,
replicaSpecsObj
.
Ps
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
tensorflowTrialConfig
.
ps
.
replicas
,
tensorflowTrialConfig
.
ps
.
image
,
'
run_ps.sh
'
,
nonWorkerPodResources
);
}
replicaSpecsObjMap
.
set
(
this
.
kubernetesCRDClient
.
jobKind
,
{
'
tfReplicaSpecs
'
:
replicaSpecsObj
})
...
...
@@ -373,19 +373,19 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
else
if
(
this
.
kubeflowTrialConfig
.
operatorType
===
'
pytorch-operator
'
)
{
let
pytorchTrialConfig
:
KubeflowTrialConfigPytorch
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
if
(
pytorchTrialConfig
.
worker
)
{
replicaSpecsObj
.
Worker
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
pytorchTrialConfig
.
worker
.
replicas
,
replicaSpecsObj
.
Worker
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
pytorchTrialConfig
.
worker
.
replicas
,
pytorchTrialConfig
.
worker
.
image
,
'
run_worker.sh
'
,
workerPodResources
);
}
replicaSpecsObj
.
Master
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
pytorchTrialConfig
.
master
.
replicas
,
replicaSpecsObj
.
Master
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
pytorchTrialConfig
.
master
.
replicas
,
pytorchTrialConfig
.
master
.
image
,
'
run_master.sh
'
,
nonWorkerPodResources
);
replicaSpecsObjMap
.
set
(
this
.
kubernetesCRDClient
.
jobKind
,
{
'
pytorchReplicaSpecs
'
:
replicaSpecsObj
})
}
return
{
apiVersion
:
`kubeflow.org/
${
this
.
kubernetesCRDClient
.
apiVersion
}
`
,
kind
:
this
.
kubernetesCRDClient
.
jobKind
,
metadata
:
{
metadata
:
{
name
:
kubeflowJobName
,
namespace
:
'
default
'
,
labels
:
{
...
...
@@ -395,7 +395,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
}
},
spec
:
replicaSpecsObjMap
.
get
(
this
.
kubernetesCRDClient
.
jobKind
)
};
};
}
/**
...
...
src/nni_manager/training_service/kubernetes/kubernetesApiClient.ts
View file @
ae7a72bc
...
...
@@ -39,7 +39,7 @@ class GeneralK8sClient {
}
public
async
createSecret
(
secretManifest
:
any
):
Promise
<
boolean
>
{
let
result
:
Promise
<
boolean
>
;
let
result
:
Promise
<
boolean
>
;
const
response
:
any
=
await
this
.
client
.
api
.
v1
.
namespaces
(
'
default
'
).
secrets
.
post
({
body
:
secretManifest
});
if
(
response
.
statusCode
&&
(
response
.
statusCode
>=
200
&&
response
.
statusCode
<=
299
))
{
result
=
Promise
.
resolve
(
true
);
...
...
@@ -65,8 +65,8 @@ abstract class KubernetesCRDClient {
public
abstract
get
containerName
():
string
;
public
get
jobKind
():
string
{
if
(
this
.
crdSchema
&&
this
.
crdSchema
.
spec
if
(
this
.
crdSchema
&&
this
.
crdSchema
.
spec
&&
this
.
crdSchema
.
spec
.
names
&&
this
.
crdSchema
.
spec
.
names
.
kind
)
{
return
this
.
crdSchema
.
spec
.
names
.
kind
;
...
...
@@ -76,15 +76,15 @@ abstract class KubernetesCRDClient {
}
public
get
apiVersion
():
string
{
if
(
this
.
crdSchema
&&
this
.
crdSchema
.
spec
if
(
this
.
crdSchema
&&
this
.
crdSchema
.
spec
&&
this
.
crdSchema
.
spec
.
version
)
{
return
this
.
crdSchema
.
spec
.
version
;
}
else
{
throw
new
Error
(
'
KubeflowOperatorClient: get apiVersion failed, version is undefined in crd schema!
'
);
}
}
public
async
createKubernetesJob
(
jobManifest
:
any
):
Promise
<
boolean
>
{
let
result
:
Promise
<
boolean
>
;
const
response
:
any
=
await
this
.
operator
.
post
({
body
:
jobManifest
});
...
...
@@ -117,7 +117,7 @@ abstract class KubernetesCRDClient {
qs
:
{
labelSelector
:
matchQuery
,
propagationPolicy
:
"
Background
"
}
}
});
if
(
deleteResult
.
statusCode
&&
deleteResult
.
statusCode
>=
200
&&
deleteResult
.
statusCode
<=
299
)
{
result
=
Promise
.
resolve
(
true
);
...
...
src/nni_manager/training_service/kubernetes/kubernetesConfig.ts
View file @
ae7a72bc
...
...
@@ -25,7 +25,7 @@ import { MethodNotImplementedError } from '../../common/errors';
export
abstract
class
KubernetesClusterConfig
{
public
readonly
storage
?:
KubernetesStorageKind
;
public
readonly
apiVersion
:
string
;
constructor
(
apiVersion
:
string
,
storage
?:
KubernetesStorageKind
)
{
this
.
storage
=
storage
;
this
.
apiVersion
=
apiVersion
;
...
...
@@ -48,7 +48,7 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig {
public
readonly
nfs
:
NFSConfig
;
constructor
(
apiVersion
:
string
,
apiVersion
:
string
,
nfs
:
NFSConfig
,
storage
?:
KubernetesStorageKind
)
{
...
...
@@ -73,11 +73,11 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig {
export
class
KubernetesClusterConfigAzure
extends
KubernetesClusterConfig
{
public
readonly
keyVault
:
keyVaultConfig
;
public
readonly
azureStorage
:
AzureStorage
;
constructor
(
apiVersion
:
string
,
keyVault
:
keyVaultConfig
,
azureStorage
:
AzureStorage
,
apiVersion
:
string
,
keyVault
:
keyVaultConfig
,
azureStorage
:
AzureStorage
,
storage
?:
KubernetesStorageKind
)
{
super
(
apiVersion
,
storage
);
...
...
@@ -151,7 +151,7 @@ export class keyVaultConfig {
export
class
AzureStorage
{
/**The azure share to storage files */
public
readonly
azureShare
:
string
;
/**The account name of sotrage service */
public
readonly
accountName
:
string
;
constructor
(
azureShare
:
string
,
accountName
:
string
){
...
...
@@ -178,8 +178,8 @@ export class KubernetesTrialConfigTemplate {
/** Required GPU number for trial job. The number should be in [0,100] */
public
readonly
gpuNum
:
number
;
constructor
(
command
:
string
,
gpuNum
:
number
,
constructor
(
command
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
)
{
this
.
command
=
command
;
this
.
gpuNum
=
gpuNum
;
...
...
src/nni_manager/training_service/kubernetes/kubernetesData.ts
View file @
ae7a72bc
...
...
@@ -40,7 +40,7 @@ export class KubernetesTrialJobDetail implements TrialJobDetail {
public
queryJobFailedCount
:
number
;
constructor
(
id
:
string
,
status
:
TrialJobStatus
,
submitTime
:
number
,
workingDirectory
:
string
,
form
:
JobApplicationForm
,
workingDirectory
:
string
,
form
:
JobApplicationForm
,
kubernetesJobName
:
string
,
sequenceId
:
number
,
url
:
string
)
{
this
.
id
=
id
;
this
.
status
=
status
;
...
...
src/nni_manager/training_service/kubernetes/kubernetesJobInfoCollector.ts
View file @
ae7a72bc
...
...
@@ -57,7 +57,7 @@ export class KubernetesJobInfoCollector {
await
Promise
.
all
(
updateKubernetesTrialJobs
);
}
protected
async
retrieveSingleTrialJobInfo
(
kubernetesCRDClient
:
KubernetesCRDClient
|
undefined
,
protected
async
retrieveSingleTrialJobInfo
(
kubernetesCRDClient
:
KubernetesCRDClient
|
undefined
,
kubernetesTrialJob
:
KubernetesTrialJobDetail
)
:
Promise
<
void
>
{
throw
new
MethodNotImplementedError
();
}
...
...
src/nni_manager/training_service/kubernetes/kubernetesJobRestServer.ts
View file @
ae7a72bc
...
...
@@ -26,7 +26,7 @@ import { ClusterJobRestServer } from '../common/clusterJobRestServer'
/**
* Kubeflow Training service Rest server, provides rest API to support kubeflow job metrics update
*
*
*/
@
component
.
Singleton
export
class
KubernetesJobRestServer
extends
ClusterJobRestServer
{
...
...
@@ -53,5 +53,5 @@ export class KubernetesJobRestServer extends ClusterJobRestServer{
data
:
singleMetric
});
}
}
}
}
\ No newline at end of file
Prev
1
2
3
4
5
6
7
8
9
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment