Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
ba8dccd6
Commit
ba8dccd6
authored
Jun 23, 2019
by
suiguoxin
Browse files
Merge branch 'master' of
https://github.com/microsoft/nni
parents
56a1575b
150ee83a
Changes
198
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
870 additions
and
726 deletions
+870
-726
src/nni_manager/training_service/common/gpuData.ts
src/nni_manager/training_service/common/gpuData.ts
+4
-4
src/nni_manager/training_service/common/jobMetrics.ts
src/nni_manager/training_service/common/jobMetrics.ts
+4
-1
src/nni_manager/training_service/common/trialConfig.ts
src/nni_manager/training_service/common/trialConfig.ts
+4
-4
src/nni_manager/training_service/common/util.ts
src/nni_manager/training_service/common/util.ts
+57
-40
src/nni_manager/training_service/kubernetes/azureStorageClientUtils.ts
...er/training_service/kubernetes/azureStorageClientUtils.ts
+109
-79
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerApiClient.ts
...netes/frameworkcontroller/frameworkcontrollerApiClient.ts
+15
-6
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerConfig.ts
...bernetes/frameworkcontroller/frameworkcontrollerConfig.ts
+30
-25
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerJobInfoCollector.ts
...rameworkcontroller/frameworkcontrollerJobInfoCollector.ts
+26
-18
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerJobRestServer.ts
...s/frameworkcontroller/frameworkcontrollerJobRestServer.ts
+5
-5
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts
...frameworkcontroller/frameworkcontrollerTrainingService.ts
+191
-171
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowApiClient.ts
...training_service/kubernetes/kubeflow/kubeflowApiClient.ts
+30
-21
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowConfig.ts
...er/training_service/kubernetes/kubeflow/kubeflowConfig.ts
+41
-28
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowJobInfoCollector.ts
...g_service/kubernetes/kubeflow/kubeflowJobInfoCollector.ts
+21
-18
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowJobRestServer.ts
...ning_service/kubernetes/kubeflow/kubeflowJobRestServer.ts
+5
-5
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts
...ng_service/kubernetes/kubeflow/kubeflowTrainingService.ts
+232
-219
src/nni_manager/training_service/kubernetes/kubernetesApiClient.ts
...anager/training_service/kubernetes/kubernetesApiClient.ts
+34
-25
src/nni_manager/training_service/kubernetes/kubernetesConfig.ts
...i_manager/training_service/kubernetes/kubernetesConfig.ts
+41
-35
src/nni_manager/training_service/kubernetes/kubernetesData.ts
...nni_manager/training_service/kubernetes/kubernetesData.ts
+4
-5
src/nni_manager/training_service/kubernetes/kubernetesJobInfoCollector.ts
...training_service/kubernetes/kubernetesJobInfoCollector.ts
+8
-9
src/nni_manager/training_service/kubernetes/kubernetesJobRestServer.ts
...er/training_service/kubernetes/kubernetesJobRestServer.ts
+9
-8
No files found.
src/nni_manager/training_service/common/gpuData.ts
View file @
ba8dccd6
...
...
@@ -65,11 +65,11 @@ export const GPU_INFO_COLLECTOR_FORMAT_LINUX: string =
export METRIC_OUTPUT_DIR={0}
echo $$ >{1}
python3 -m nni_gpu_tool.gpu_metrics_collector
`
`
;
export
const
GPU_INFO_COLLECTOR_FORMAT_WINDOWS
:
string
=
`
$env:METRIC_OUTPUT_DIR="{0}"
$app = Start-Process "python" -ArgumentList "-m nni_gpu_tool.gpu_metrics_collector" -passthru -NoNewWindow
Write $app.ID | Out-File {1} -NoNewline -encoding utf8
`
\ No newline at end of file
`
;
src/nni_manager/training_service/common/jobMetrics.ts
View file @
ba8dccd6
...
...
@@ -21,7 +21,10 @@
import
{
TrialJobStatus
}
from
'
../../common/trainingService
'
;
// tslint:disable-next-line:max-classes-per-file
/**
* Trial job metrics class
* Representing trial job metrics properties
*/
export
class
JobMetrics
{
public
readonly
jobId
:
string
;
public
readonly
metrics
:
string
[];
...
...
src/nni_manager/training_service/common/trialConfig.ts
View file @
ba8dccd6
...
...
@@ -24,13 +24,13 @@
* Representing trial job configurable properties
*/
export
class
TrialConfig
{
/
**
Trail command
*/
/
/
Trail command
public
readonly
command
:
string
;
/
**
Code directory
*/
/
/
Code directory
public
readonly
codeDir
:
string
;
/
**
Required GPU number for trial job. The number should be in [0,100]
*/
/
/
Required GPU number for trial job. The number should be in [0,100]
public
readonly
gpuNum
:
number
;
/**
...
...
src/nni_manager/training_service/common/util.ts
View file @
ba8dccd6
import
{
getLogger
}
from
"
common/log
"
;
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
...
...
@@ -21,16 +19,15 @@ import { getLogger } from "common/log";
'
use strict
'
;
import
{
countFilesRecursively
}
from
'
../../common/utils
'
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
cp
from
'
child_process
'
;
import
*
as
os
from
'
os
'
;
import
*
as
fs
from
'
fs
'
;
import
{
getNewLine
}
from
'
../../common/utils
'
;
import
{
GPU_INFO_COLLECTOR_FORMAT_LINUX
,
GPU_INFO_COLLECTOR_FORMAT_WINDOWS
}
from
'
./gpuData
'
;
import
*
as
os
from
'
os
'
;
import
*
as
path
from
'
path
'
;
import
{
String
}
from
'
typescript-string-operations
'
;
import
{
file
}
from
"
../../node_modules/@types/tmp
"
;
import
{
countFilesRecursively
,
getNewLine
,
validateFileNameRecursively
}
from
'
../../common/utils
'
;
import
{
file
}
from
'
../../node_modules/@types/tmp
'
;
import
{
GPU_INFO_COLLECTOR_FORMAT_LINUX
,
GPU_INFO_COLLECTOR_FORMAT_WINDOWS
}
from
'
./gpuData
'
;
/**
* Validate codeDir, calculate file count recursively under codeDir, and throw error if any rule is broken
...
...
@@ -38,24 +35,36 @@ import { file } from "../../node_modules/@types/tmp";
* @param codeDir codeDir in nni config file
* @returns file number under codeDir
*/
// tslint:disable: no-redundant-jsdoc
export
async
function
validateCodeDir
(
codeDir
:
string
)
:
Promise
<
number
>
{
let
fileCount
:
number
|
undefined
;
let
fileNameValid
:
boolean
=
true
;
try
{
fileCount
=
await
countFilesRecursively
(
codeDir
);
}
catch
(
error
)
{
}
catch
(
error
)
{
throw
new
Error
(
`Call count file error:
${
error
}
`
);
}
try
{
fileNameValid
=
await
validateFileNameRecursively
(
codeDir
);
}
catch
(
error
)
{
throw
new
Error
(
`Validate file name error:
${
error
}
`
);
}
if
(
fileCount
&&
fileCount
>
1000
)
{
if
(
fileCount
!==
undefined
&&
fileCount
>
1000
)
{
const
errMessage
:
string
=
`Too many files(
${
fileCount
}
found}) in
${
codeDir
}
,`
+
` please check if it's a valid code dir`
;
throw
new
Error
(
errMessage
);
}
if
(
!
fileNameValid
)
{
const
errMessage
:
string
=
`File name in
${
codeDir
}
is not valid, please check file names, only support digit number、alphabet and (.-_) in file name.`
;
throw
new
Error
(
errMessage
);
}
return
fileCount
;
}
/**
* crete a new directory
* @param directory
...
...
@@ -66,6 +75,7 @@ export async function execMkdir(directory: string): Promise<void> {
}
else
{
await
cpp
.
exec
(
`mkdir -p
${
directory
}
`
);
}
return
Promise
.
resolve
();
}
...
...
@@ -80,6 +90,7 @@ export async function execCopydir(source: string, destination: string): Promise<
}
else
{
await
cpp
.
exec
(
`cp -r
${
source
}
${
destination
}
`
);
}
return
Promise
.
resolve
();
}
...
...
@@ -93,16 +104,17 @@ export async function execNewFile(filename: string): Promise<void> {
}
else
{
await
cpp
.
exec
(
`touch
${
filename
}
`
);
}
return
Promise
.
resolve
();
}
/**
* run script
* run script
using powershell or bash
* @param filePath
*/
export
function
exec
Script
(
filePath
:
string
):
cp
.
ChildProcess
{
export
function
run
Script
(
filePath
:
string
):
cp
.
ChildProcess
{
if
(
process
.
platform
===
'
win32
'
)
{
return
cp
.
exec
(
`powershell.exe -file
${
filePath
}
`
);
return
cp
.
exec
(
`powershell.exe
-ExecutionPolicy Bypass
-file
${
filePath
}
`
);
}
else
{
return
cp
.
exec
(
`bash
${
filePath
}
`
);
}
...
...
@@ -119,6 +131,7 @@ export async function execTail(filePath: string): Promise<cpp.childProcessPromis
}
else
{
cmdresult
=
await
cpp
.
exec
(
`tail -n 1
${
filePath
}
`
);
}
return
Promise
.
resolve
(
cmdresult
);
}
...
...
@@ -132,6 +145,7 @@ export async function execRemove(directory: string): Promise<void> {
}
else
{
await
cpp
.
exec
(
`rm -rf
${
directory
}
`
);
}
return
Promise
.
resolve
();
}
...
...
@@ -145,37 +159,39 @@ export async function execKill(pid: string): Promise<void> {
}
else
{
await
cpp
.
exec
(
`pkill -P
${
pid
}
`
);
}
return
Promise
.
resolve
();
}
/**
*
set
environment variable
*
get command of setting
environment variable
* @param variable
* @returns command string
*/
export
function
setEnvironmentVariable
(
variable
:
{
key
:
string
;
value
:
string
}):
string
{
if
(
process
.
platform
===
'
win32
'
)
{
return
`$env:
${
variable
.
key
}
="
${
variable
.
value
}
"`
;
}
else
{
}
else
{
return
`export
${
variable
.
key
}
=
${
variable
.
value
}
`
;
}
}
/**
* Compress files in directory to tar file
* @param source
_p
ath
* @param tar
_p
ath
* @param source
P
ath
* @param tar
P
ath
*/
export
async
function
tarAdd
(
tar
_p
ath
:
string
,
source
_p
ath
:
string
):
Promise
<
void
>
{
export
async
function
tarAdd
(
tar
P
ath
:
string
,
source
P
ath
:
string
):
Promise
<
void
>
{
if
(
process
.
platform
===
'
win32
'
)
{
tar_path
=
tar_path
.
split
(
'
\\
'
).
join
(
'
\\\\
'
);
source_path
=
source_path
.
split
(
'
\\
'
).
join
(
'
\\\\
'
);
let
script
:
string
[]
=
[];
const
tarFilePath
:
string
=
tarPath
.
split
(
'
\\
'
)
.
join
(
'
\\\\
'
);
const
sourceFilePath
:
string
=
sourcePath
.
split
(
'
\\
'
)
.
join
(
'
\\\\
'
);
const
script
:
string
[]
=
[];
script
.
push
(
`import os`
,
`import tarfile`
,
String
.
Format
(
`tar = tarfile.open("{0}","w:gz")\r\nfor root,dir,files in os.walk("{1}"):`
,
tar
_p
ath
,
source
_p
ath
),
String
.
Format
(
`tar = tarfile.open("{0}","w:gz")\r\nfor root,dir,files in os.walk("{1}"):`
,
tar
FileP
ath
,
source
FileP
ath
),
` for file in files:`
,
` fullpath = os.path.join(root,file)`
,
` tar.add(fullpath, arcname=file)`
,
...
...
@@ -184,8 +200,9 @@ export async function tarAdd(tar_path: string, source_path: string): Promise<voi
const
tarScript
:
string
=
path
.
join
(
os
.
tmpdir
(),
'
tar.py
'
);
await
cpp
.
exec
(
`python
${
tarScript
}
`
);
}
else
{
await
cpp
.
exec
(
`tar -czf
${
tar
_p
ath
}
-C
${
source
_p
ath
}
.`
);
await
cpp
.
exec
(
`tar -czf
${
tar
P
ath
}
-C
${
source
P
ath
}
.`
);
}
return
Promise
.
resolve
();
}
...
...
@@ -195,9 +212,9 @@ export async function tarAdd(tar_path: string, source_path: string): Promise<voi
*/
export
function
getScriptName
(
fileNamePrefix
:
string
):
string
{
if
(
process
.
platform
===
'
win32
'
)
{
return
fileNamePrefix
+
'
.ps1
'
;
return
String
.
Format
(
'
{0}.ps1
'
,
fileNamePrefix
)
;
}
else
{
return
fileNamePrefix
+
'
.sh
'
;
return
String
.
Format
(
'
{0}.sh
'
,
fileNamePrefix
)
;
}
}
...
...
@@ -206,17 +223,17 @@ export function getScriptName(fileNamePrefix: string): string {
* @param gpuMetricCollectorScriptFolder
*/
export
function
getgpuMetricsCollectorScriptContent
(
gpuMetricCollectorScriptFolder
:
string
):
string
{
if
(
process
.
platform
===
'
win32
'
)
{
if
(
process
.
platform
===
'
win32
'
)
{
return
String
.
Format
(
GPU_INFO_COLLECTOR_FORMAT_WINDOWS
,
gpuMetricCollectorScriptFolder
,
path
.
join
(
gpuMetricCollectorScriptFolder
,
'
pid
'
)
,
path
.
join
(
gpuMetricCollectorScriptFolder
,
'
pid
'
)
);
}
else
{
return
String
.
Format
(
GPU_INFO_COLLECTOR_FORMAT_LINUX
,
gpuMetricCollectorScriptFolder
,
path
.
join
(
gpuMetricCollectorScriptFolder
,
'
pid
'
)
,
path
.
join
(
gpuMetricCollectorScriptFolder
,
'
pid
'
)
);
}
}
src/nni_manager/training_service/kubernetes/azureStorageClientUtils.ts
View file @
ba8dccd6
...
...
@@ -19,12 +19,15 @@
'
use strict
'
;
import
*
as
fs
from
'
fs
'
import
*
as
azureStorage
from
'
azure-storage
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
path
from
'
path
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
String
}
from
'
typescript-string-operations
'
;
import
{
getLogger
}
from
'
../../common/log
'
;
import
{
mkDirP
}
from
'
../../common/utils
'
;
// tslint:disable: no-redundant-jsdoc no-any no-unsafe-any
export
namespace
AzureStorageClientUtility
{
/**
...
...
@@ -32,16 +35,18 @@ export namespace AzureStorageClientUtility {
* @param fileServerClient
* @param azureShare
*/
export
async
function
createShare
(
fileServerClient
:
any
,
azureShare
:
any
):
Promise
<
void
>
{
export
async
function
createShare
(
fileServerClient
:
any
,
azureShare
:
any
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
fileServerClient
.
createShareIfNotExists
(
azureShare
,
function
(
error
:
any
,
result
:
any
,
response
:
any
)
{
if
(
error
){
getLogger
().
error
(
`Create share failed:,
${
error
}
`
);
deferred
.
reject
(
error
)
}
else
{
deferred
.
resolve
()
}
})
fileServerClient
.
createShareIfNotExists
(
azureShare
,
(
error
:
any
,
result
:
any
,
response
:
any
)
=>
{
if
(
error
)
{
getLogger
()
.
error
(
`Create share failed:,
${
error
}
`
);
deferred
.
reject
(
error
);
}
else
{
deferred
.
resolve
();
}
});
return
deferred
.
promise
;
}
...
...
@@ -51,16 +56,18 @@ export namespace AzureStorageClientUtility {
* @param azureFoler
* @param azureShare
*/
export
async
function
createDirectory
(
fileServerClient
:
a
ny
,
azureFoler
:
any
,
azureShare
:
any
):
Promise
<
void
>
{
export
async
function
createDirectory
(
fileServerClient
:
a
zureStorage
.
FileService
,
azureFoler
:
any
,
azureShare
:
any
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
fileServerClient
.
createDirectoryIfNotExists
(
azureShare
,
azureFoler
,
function
(
error
:
any
,
result
:
any
,
response
:
any
)
{
if
(
error
){
getLogger
().
error
(
`Create directory failed:,
${
error
}
`
);
fileServerClient
.
createDirectoryIfNotExists
(
azureShare
,
azureFoler
,
(
error
:
any
,
result
:
any
,
response
:
any
)
=>
{
if
(
error
)
{
getLogger
()
.
error
(
`Create directory failed:,
${
error
}
`
);
deferred
.
reject
(
error
);
}
else
{
}
else
{
deferred
.
resolve
();
}
})
});
return
deferred
.
promise
;
}
...
...
@@ -69,16 +76,18 @@ export namespace AzureStorageClientUtility {
* @param fileServerClient
* @param azureDirectory
*/
export
async
function
createDirectoryRecursive
(
fileServerClient
:
any
,
azureDirectory
:
any
,
azureShare
:
any
):
Promise
<
void
>
{
export
async
function
createDirectoryRecursive
(
fileServerClient
:
azureStorage
.
FileService
,
azureDirectory
:
string
,
azureShare
:
any
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
le
t
directories
=
azureDirectory
.
split
(
"
/
"
);
let
rootDirectory
=
""
for
(
le
t
directory
of
directories
){
cons
t
directories
:
string
[]
=
azureDirectory
.
split
(
'
/
'
);
let
rootDirectory
:
string
=
''
;
for
(
cons
t
directory
of
directories
)
{
rootDirectory
+=
directory
;
await
createDirectory
(
fileServerClient
,
rootDirectory
,
azureShare
);
rootDirectory
+=
'
/
'
;
}
deferred
.
resolve
();
return
deferred
.
promise
;
}
...
...
@@ -90,16 +99,20 @@ export namespace AzureStorageClientUtility {
* @param azureShare
* @param localFilePath
*/
async
function
uploadFileToAzure
(
fileServerClient
:
any
,
azureDirectory
:
any
,
azureFileName
:
any
,
azureShare
:
any
,
localFilePath
:
any
):
Promise
<
void
>
{
async
function
uploadFileToAzure
(
fileServerClient
:
any
,
azureDirectory
:
string
,
azureFileName
:
any
,
azureShare
:
any
,
localFilePath
:
string
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
await
fileServerClient
.
createFileFromLocalFile
(
azureShare
,
azureDirectory
,
azureFileName
,
localFilePath
,
function
(
error
:
any
,
result
:
any
,
response
:
any
)
{
if
(
error
){
getLogger
().
error
(
`Upload file failed:,
${
error
}
`
);
await
fileServerClient
.
createFileFromLocalFile
(
azureShare
,
azureDirectory
,
azureFileName
,
localFilePath
,
(
error
:
any
,
result
:
any
,
response
:
any
)
=>
{
if
(
error
)
{
getLogger
()
.
error
(
`Upload file failed:,
${
error
}
`
);
deferred
.
reject
(
error
);
}
else
{
}
else
{
deferred
.
resolve
();
}
})
});
return
deferred
.
promise
;
}
...
...
@@ -111,16 +124,21 @@ export namespace AzureStorageClientUtility {
* @param azureShare
* @param localFilePath
*/
async
function
downloadFile
(
fileServerClient
:
any
,
azureDirectory
:
any
,
azureFileName
:
any
,
azureShare
:
any
,
localFilePath
:
any
):
Promise
<
void
>
{
async
function
downloadFile
(
fileServerClient
:
any
,
azureDirectory
:
string
,
azureFileName
:
any
,
azureShare
:
any
,
localFilePath
:
string
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
await
fileServerClient
.
getFileToStream
(
azureShare
,
azureDirectory
,
azureFileName
,
fs
.
createWriteStream
(
localFilePath
),
function
(
error
:
any
,
result
:
any
,
response
:
any
)
{
if
(
error
){
getLogger
().
error
(
`Download file failed:,
${
error
}
`
);
// tslint:disable-next-line:non-literal-fs-path
await
fileServerClient
.
getFileToStream
(
azureShare
,
azureDirectory
,
azureFileName
,
fs
.
createWriteStream
(
localFilePath
),
(
error
:
any
,
result
:
any
,
response
:
any
)
=>
{
if
(
error
)
{
getLogger
()
.
error
(
`Download file failed:,
${
error
}
`
);
deferred
.
reject
(
error
);
}
else
{
}
else
{
deferred
.
resolve
();
}
})
});
return
deferred
.
promise
;
}
...
...
@@ -131,26 +149,31 @@ export namespace AzureStorageClientUtility {
* @param azureShare : the azure share used
* @param localDirectory : local directory to be uploaded
*/
export
async
function
uploadDirectory
(
fileServerClient
:
any
,
azureDirectory
:
any
,
azureShare
:
any
,
localDirectory
:
any
):
Promise
<
void
>
{
// tslint:disable:non-literal-fs-path
export
async
function
uploadDirectory
(
fileServerClient
:
azureStorage
.
FileService
,
azureDirectory
:
string
,
azureShare
:
any
,
localDirectory
:
string
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
fileNameArray
:
string
[]
=
fs
.
readdirSync
(
localDirectory
);
await
createDirectoryRecursive
(
fileServerClient
,
azureDirectory
,
azureShare
);
for
(
le
t
fileName
of
fileNameArray
){
for
(
cons
t
fileName
of
fileNameArray
)
{
const
fullFilePath
:
string
=
path
.
join
(
localDirectory
,
fileName
);
try
{
if
(
fs
.
lstatSync
(
fullFilePath
).
isFile
())
{
if
(
fs
.
lstatSync
(
fullFilePath
)
.
isFile
())
{
await
uploadFileToAzure
(
fileServerClient
,
azureDirectory
,
fileName
,
azureShare
,
fullFilePath
);
}
else
{
// If filePath is a directory, recuisively copy it to azure
await
uploadDirectory
(
fileServerClient
,
azureDirectory
+
'
/
'
+
fileName
,
azureShare
,
fullFilePath
);
await
uploadDirectory
(
fileServerClient
,
String
.
Format
(
'
{0}/{1}
'
,
azureDirectory
,
fileName
)
,
azureShare
,
fullFilePath
);
}
}
catch
(
error
)
{
}
catch
(
error
)
{
deferred
.
reject
(
error
);
return
deferred
.
promise
;
}
}
// All files/directories are copied successfully, resolve
deferred
.
resolve
();
return
deferred
.
promise
;
}
...
...
@@ -161,37 +184,44 @@ export namespace AzureStorageClientUtility {
* @param azureShare
* @param localDirectory
*/
export
async
function
downloadDirectory
(
fileServerClient
:
any
,
azureDirectory
:
any
,
azureShare
:
any
,
localDirectory
:
any
):
Promise
<
void
>
{
export
async
function
downloadDirectory
(
fileServerClient
:
any
,
azureDirectory
:
string
,
azureShare
:
any
,
localDirectory
:
string
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
mkDirP
(
localDirectory
);
fileServerClient
.
listFilesAndDirectoriesSegmented
(
azureShare
,
azureDirectory
,
'
null
'
,
function
(
error
:
any
,
result
:
any
,
response
:
any
)
{
if
((
'
entries
'
in
result
)
===
false
){
getLogger
().
error
(
`list files failed, can't get entries in result`
);
await
mkDirP
(
localDirectory
);
fileServerClient
.
listFilesAndDirectoriesSegmented
(
azureShare
,
azureDirectory
,
'
null
'
,
async
(
error
:
any
,
result
:
any
,
response
:
any
)
=>
{
if
((
'
entries
'
in
result
)
===
false
)
{
getLogger
()
.
error
(
`list files failed, can't get entries in result`
);
throw
new
Error
(
`list files failed, can't get entries in result`
);
}
if
((
'
files
'
in
result
[
'
entries
'
])
===
false
){
getLogger
().
error
(
`list files failed, can't get files in result['entries']`
);
if
((
'
files
'
in
result
.
entries
)
===
false
)
{
getLogger
()
.
error
(
`list files failed, can't get files in result['entries']`
);
throw
new
Error
(
`list files failed, can't get files in result['entries']`
);
}
if
((
'
directories
'
in
result
[
'
directories
'
])
===
false
){
getLogger
().
error
(
`list files failed, can't get directories in result['entries']`
);
if
((
'
directories
'
in
result
.
directories
)
===
false
)
{
getLogger
()
.
error
(
`list files failed, can't get directories in result['entries']`
);
throw
new
Error
(
`list files failed, can't get directories in result['entries']`
);
}
for
(
var
fileName
of
result
[
'
entries
'
][
'
files
'
]
){
for
(
const
fileName
of
result
.
entries
.
files
)
{
const
fullFilePath
:
string
=
path
.
join
(
localDirectory
,
fileName
.
name
);
downloadFile
(
fileServerClient
,
azureDirectory
,
fileName
.
name
,
azureShare
,
fullFilePath
)
await
downloadFile
(
fileServerClient
,
azureDirectory
,
fileName
.
name
,
azureShare
,
fullFilePath
)
;
}
for
(
var
directoryName
of
result
[
'
entries
'
][
'
directories
'
]
){
const
fullDirectoryPath
:
string
=
path
.
join
(
localDirectory
,
directoryName
.
name
)
const
fullAzureDirectory
:
string
=
path
.
join
(
azureDirectory
,
directoryName
.
name
)
downloadDirectory
(
fileServerClient
,
fullAzureDirectory
,
azureShare
,
fullDirectoryPath
)
for
(
const
directoryName
of
result
.
entries
.
directories
)
{
const
fullDirectoryPath
:
string
=
path
.
join
(
localDirectory
,
directoryName
.
name
)
;
const
fullAzureDirectory
:
string
=
path
.
join
(
azureDirectory
,
directoryName
.
name
)
;
await
downloadDirectory
(
fileServerClient
,
fullAzureDirectory
,
azureShare
,
fullDirectoryPath
)
;
}
deferred
.
resolve
();
})
});
return
deferred
.
promise
;
}
}
// tslint:enable: no-redundant-jsdoc no-any no-unsafe-any
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerApiClient.ts
View file @
ba8dccd6
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
...
...
@@ -20,21 +21,29 @@
'
use strict
'
;
import
*
as
fs
from
'
fs
'
;
import
{
KubernetesCRDClient
,
GeneralK8s
Client
}
from
'
../kubernetesApiClient
'
;
import
{
GeneralK8sClient
,
KubernetesCRD
Client
}
from
'
../kubernetesApiClient
'
;
abstract
class
FrameworkControllerClient
extends
KubernetesCRDClient
{
/**
* FrameworkController Client
*/
abstract
class
FrameworkControllerClient
extends
KubernetesCRDClient
{
/**
* Factory method to generate operator cliet
* Factory method to generate operator clie
n
t
*/
// tslint:disable-next-line:function-name
public
static
generateFrameworkControllerClient
():
KubernetesCRDClient
{
return
new
FrameworkControllerClientV1
();
}
}
/**
* FrameworkController ClientV1
*/
class
FrameworkControllerClientV1
extends
FrameworkControllerClient
{
/**
* constructor, to initialize frameworkcontroller CRD definition
*/
// tslint:disable: no-unsafe-any no-any
public
constructor
()
{
super
();
this
.
crdSchema
=
JSON
.
parse
(
fs
.
readFileSync
(
'
./config/frameworkcontroller/frameworkcontrollerjob-crd-v1.json
'
,
'
utf8
'
));
...
...
@@ -42,8 +51,9 @@ class FrameworkControllerClientV1 extends FrameworkControllerClient {
}
protected
get
operator
():
any
{
return
this
.
client
.
apis
[
"
frameworkcontroller.microsoft.com
"
].
v1
.
namespaces
(
'
default
'
).
frameworks
;
return
this
.
client
.
apis
[
'
frameworkcontroller.microsoft.com
'
].
v1
.
namespaces
(
'
default
'
).
frameworks
;
}
// tslint:enable: no-unsafe-any no-any
public
get
containerName
():
string
{
return
'
framework
'
;
...
...
@@ -51,4 +61,3 @@ class FrameworkControllerClientV1 extends FrameworkControllerClient {
}
export
{
FrameworkControllerClient
,
GeneralK8sClient
};
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerConfig.ts
View file @
ba8dccd6
...
...
@@ -20,10 +20,11 @@
'
use strict
'
;
import
*
as
assert
from
'
assert
'
;
import
{
Kubernetes
Trial
Config
,
Kubernetes
TrialConfigTemplat
e
,
KubernetesClusterConfig
Azure
,
Kubernetes
ClusterConfigNFS
,
NFSConfig
,
KubernetesStorageKind
,
keyVault
Config
,
Azure
Storage
,
KubernetesCluster
Config
,
StorageConfig
}
from
'
../kubernetesConfig
'
import
{
AzureStorage
,
KeyVaultConfig
,
Kubernetes
Cluster
Config
,
Kubernetes
ClusterConfigAzur
e
,
KubernetesClusterConfig
NFS
,
Kubernetes
StorageKind
,
KubernetesTrialConfig
,
KubernetesTrialConfigTemplate
,
NFS
Config
,
StorageConfig
}
from
'
../kubernetesConfig
'
;
// tslint:disable:completed-docs
export
class
FrameworkAttemptCompletionPolicy
{
public
readonly
minFailedTaskCount
:
number
;
public
readonly
minSucceededTaskCount
:
number
;
...
...
@@ -36,7 +37,7 @@ export class FrameworkAttemptCompletionPolicy {
/**
* Trial job configuration for FrameworkController
*/
export
class
FrameworkControllerTrialConfigTemplate
extends
KubernetesTrialConfigTemplate
{
export
class
FrameworkControllerTrialConfigTemplate
extends
KubernetesTrialConfigTemplate
{
public
readonly
frameworkAttemptCompletionPolicy
:
FrameworkAttemptCompletionPolicy
;
public
readonly
name
:
string
;
public
readonly
taskNum
:
number
;
...
...
@@ -50,7 +51,7 @@ export class FrameworkControllerTrialConfigTemplate extends KubernetesTrialConfi
}
}
export
class
FrameworkControllerTrialConfig
extends
KubernetesTrialConfig
{
export
class
FrameworkControllerTrialConfig
extends
KubernetesTrialConfig
{
public
readonly
taskRoles
:
FrameworkControllerTrialConfigTemplate
[];
public
readonly
codeDir
:
string
;
constructor
(
codeDir
:
string
,
taskRoles
:
FrameworkControllerTrialConfigTemplate
[])
{
...
...
@@ -68,6 +69,7 @@ export class FrameworkControllerClusterConfig extends KubernetesClusterConfig {
}
}
// tslint:disable:function-name
export
class
FrameworkControllerClusterConfigNFS
extends
KubernetesClusterConfigNFS
{
public
readonly
serviceAccountName
:
string
;
constructor
(
...
...
@@ -81,8 +83,9 @@ export class FrameworkControllerClusterConfigNFS extends KubernetesClusterConfig
}
public
static
getInstance
(
jsonObject
:
object
):
FrameworkControllerClusterConfigNFS
{
let
kubeflowClusterConfigObjectNFS
=
<
FrameworkControllerClusterConfigNFS
>
jsonObject
;
assert
(
kubeflowClusterConfigObjectNFS
!==
undefined
)
const
kubeflowClusterConfigObjectNFS
:
FrameworkControllerClusterConfigNFS
=
<
FrameworkControllerClusterConfigNFS
>
jsonObject
;
assert
(
kubeflowClusterConfigObjectNFS
!==
undefined
);
return
new
FrameworkControllerClusterConfigNFS
(
kubeflowClusterConfigObjectNFS
.
serviceAccountName
,
kubeflowClusterConfigObjectNFS
.
apiVersion
,
...
...
@@ -98,16 +101,17 @@ export class FrameworkControllerClusterConfigAzure extends KubernetesClusterConf
constructor
(
serviceAccountName
:
string
,
apiVersion
:
string
,
keyVault
:
k
eyVaultConfig
,
keyVault
:
K
eyVaultConfig
,
azureStorage
:
AzureStorage
,
storage
?:
KubernetesStorageKind
)
{
super
(
apiVersion
,
keyVault
,
azureStorage
,
storage
);
super
(
apiVersion
,
keyVault
,
azureStorage
,
storage
);
this
.
serviceAccountName
=
serviceAccountName
;
}
public
static
getInstance
(
jsonObject
:
object
):
FrameworkControllerClusterConfigAzure
{
let
kubeflowClusterConfigObjectAzure
=
<
FrameworkControllerClusterConfigAzure
>
jsonObject
;
const
kubeflowClusterConfigObjectAzure
:
FrameworkControllerClusterConfigAzure
=
<
FrameworkControllerClusterConfigAzure
>
jsonObject
;
return
new
FrameworkControllerClusterConfigAzure
(
kubeflowClusterConfigObjectAzure
.
serviceAccountName
,
kubeflowClusterConfigObjectAzure
.
apiVersion
,
...
...
@@ -121,11 +125,11 @@ export class FrameworkControllerClusterConfigAzure extends KubernetesClusterConf
export
class
FrameworkControllerClusterConfigFactory
{
public
static
generateFrameworkControllerClusterConfig
(
jsonObject
:
object
):
FrameworkControllerClusterConfig
{
let
s
torageConfig
=
<
StorageConfig
>
jsonObject
;
if
(
!
storageConfig
)
{
throw
new
Error
(
"
Invalid json object as a StorageConfig instance
"
);
const
storageConfig
:
S
torageConfig
=
<
StorageConfig
>
jsonObject
;
if
(
storageConfig
===
undefined
)
{
throw
new
Error
(
'
Invalid json object as a StorageConfig instance
'
);
}
if
(
storageConfig
.
storage
&&
storageConfig
.
storage
===
'
azureStorage
'
)
{
if
(
storageConfig
.
storage
!==
undefined
&&
storageConfig
.
storage
===
'
azureStorage
'
)
{
return
FrameworkControllerClusterConfigAzure
.
getInstance
(
jsonObject
);
}
else
if
(
storageConfig
.
storage
===
undefined
||
storageConfig
.
storage
===
'
nfs
'
)
{
return
FrameworkControllerClusterConfigNFS
.
getInstance
(
jsonObject
);
...
...
@@ -134,6 +138,7 @@ export class FrameworkControllerClusterConfigFactory {
}
}
export
type
FrameworkControllerJobStatus
=
'
AttemptRunning
'
|
'
Completed
'
|
'
AttemptCreationPending
'
|
'
AttemptCreationRequested
'
|
'
AttemptPreparing
'
|
'
AttemptCompleted
'
;
export
type
FrameworkControllerJobStatus
=
'
AttemptRunning
'
|
'
Completed
'
|
'
AttemptCreationPending
'
|
'
AttemptCreationRequested
'
|
'
AttemptPreparing
'
|
'
AttemptCompleted
'
;
export
type
FrameworkControllerJobCompleteStatus
=
'
Succeeded
'
|
'
Failed
'
;
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerJobInfoCollector.ts
View file @
ba8dccd6
...
...
@@ -19,15 +19,15 @@
'
use strict
'
;
import
{
KubernetesTrialJobDetail
}
from
'
../kubernetesData
'
;
import
{
KubernetesCRDClient
}
from
'
../kubernetesApiClient
'
;
import
{
KubernetesTrialJobDetail
}
from
'
../kubernetesData
'
;
import
{
KubernetesJobInfoCollector
}
from
'
../kubernetesJobInfoCollector
'
;
import
{
FrameworkControllerJobStatus
,
FrameworkControllerJob
Complete
Status
}
from
'
./frameworkcontrollerConfig
'
;
import
{
FrameworkControllerJob
Complete
Status
,
FrameworkControllerJobStatus
}
from
'
./frameworkcontrollerConfig
'
;
/**
* Collector frameworkcontroller jobs info from Kubernetes cluster, and update frameworkcontroller job status locally
*/
export
class
FrameworkControllerJobInfoCollector
extends
KubernetesJobInfoCollector
{
export
class
FrameworkControllerJobInfoCollector
extends
KubernetesJobInfoCollector
{
constructor
(
jobMap
:
Map
<
string
,
KubernetesTrialJobDetail
>
)
{
super
(
jobMap
);
}
...
...
@@ -38,47 +38,55 @@ export class FrameworkControllerJobInfoCollector extends KubernetesJobInfoCollec
return
Promise
.
resolve
();
}
if
(
kubernetesCRDClient
===
undefined
)
{
if
(
kubernetesCRDClient
===
undefined
)
{
return
Promise
.
reject
(
'
kubernetesCRDClient is undefined
'
);
}
// tslint:disable-next-line:no-any
let
kubernetesJobInfo
:
any
;
try
{
kubernetesJobInfo
=
await
kubernetesCRDClient
.
getKubernetesJob
(
kubernetesTrialJob
.
kubernetesJobName
);
}
catch
(
error
)
{
}
catch
(
error
)
{
this
.
log
.
error
(
`Get job
${
kubernetesTrialJob
.
kubernetesJobName
}
info failed, error is
${
error
}
`
);
//This is not treat as a error status
return
Promise
.
resolve
();
}
if
(
kubernetesJobInfo
.
status
&&
kubernetesJobInfo
.
status
.
state
)
{
// tslint:disable: no-unsafe-any
if
(
kubernetesJobInfo
.
status
&&
kubernetesJobInfo
.
status
.
state
)
{
const
frameworkJobType
:
FrameworkControllerJobStatus
=
<
FrameworkControllerJobStatus
>
kubernetesJobInfo
.
status
.
state
;
switch
(
frameworkJobType
)
{
case
'
AttemptCreationPending
'
||
'
AttemptCreationRequested
'
||
'
AttemptPreparing
'
:
switch
(
frameworkJobType
)
{
case
'
AttemptCreationPending
'
:
case
'
AttemptCreationRequested
'
:
case
'
AttemptPreparing
'
:
kubernetesTrialJob
.
status
=
'
WAITING
'
;
break
;
case
'
AttemptRunning
'
:
kubernetesTrialJob
.
status
=
'
RUNNING
'
;
if
(
!
kubernetesTrialJob
.
startTime
)
{
if
(
kubernetesTrialJob
.
startTime
===
undefined
)
{
kubernetesTrialJob
.
startTime
=
Date
.
parse
(
<
string
>
kubernetesJobInfo
.
status
.
startTime
);
}
break
;
case
'
Completed
'
:
const
completedJobType
:
FrameworkControllerJobCompleteStatus
=
<
FrameworkControllerJobCompleteStatus
>
kubernetesJobInfo
.
status
.
attemptStatus
.
completionStatus
.
type
.
name
;
switch
(
completedJobType
)
{
const
completedJobType
:
FrameworkControllerJobCompleteStatus
=
<
FrameworkControllerJobCompleteStatus
>
kubernetesJobInfo
.
status
.
attemptStatus
.
completionStatus
.
type
.
name
;
switch
(
completedJobType
)
{
case
'
Succeeded
'
:
kubernetesTrialJob
.
status
=
'
SUCCEEDED
'
;
break
;
case
'
Failed
'
:
kubernetesTrialJob
.
status
=
'
FAILED
'
;
break
;
default
:
}
kubernetesTrialJob
.
endTime
=
Date
.
parse
(
<
string
>
kubernetesJobInfo
.
status
.
completionTime
);
break
;
default
:
break
;
}
}
return
Promise
.
resolve
();
}
// tslint:enable: no-unsafe-any
}
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerJobRestServer.ts
View file @
ba8dccd6
...
...
@@ -20,15 +20,15 @@
'
use strict
'
;
import
*
as
component
from
'
../../../common/component
'
;
import
{
KubernetesJobRestServer
}
from
'
../kubernetesJobRestServer
'
;
import
{
FrameworkControllerTrainingService
}
from
'
./frameworkcontrollerTrainingService
'
;
import
{
KubernetesJobRestServer
}
from
'
../kubernetesJobRestServer
'
/**
* frameworkcontroller Training service Rest server, provides rest API to support frameworkcontroller job metrics update
*
*/
@
component
.
Singleton
export
class
FrameworkControllerJobRestServer
extends
KubernetesJobRestServer
{
export
class
FrameworkControllerJobRestServer
extends
KubernetesJobRestServer
{
constructor
()
{
super
(
component
.
get
(
FrameworkControllerTrainingService
));
}
...
...
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts
View file @
ba8dccd6
...
...
@@ -17,31 +17,29 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'
use strict
'
'
use strict
'
;
import
*
as
component
from
'
../../../common/component
'
;
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
path
from
'
path
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../../common/containerJobData
'
;
import
*
as
component
from
'
../../../common/component
'
;
import
{
getExperimentId
}
from
'
../../../common/experimentStartupInfo
'
;
import
{
TrialConfigMetadataKey
}
from
'
../../common/trialConfigMetadataKey
'
;
import
{
JobApplicationForm
,
TrialJobApplicationForm
,
TrialJobDetail
,
NNIManagerIpConfig
JobApplicationForm
,
NNIManagerIpConfig
,
TrialJobApplicationForm
,
TrialJobDetail
}
from
'
../../../common/trainingService
'
;
import
{
delay
,
generateParamFileName
,
getExperimentRootDir
,
uniqueString
}
from
'
../../../common/utils
'
;
import
{
NFSConfig
}
from
'
../kubernetesConfig
'
import
{
KubernetesTrialJobDetail
}
from
'
../kubernetesData
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../../common/containerJobData
'
;
import
{
TrialConfigMetadataKey
}
from
'
../../common/trialConfigMetadataKey
'
;
import
{
validateCodeDir
}
from
'
../../common/util
'
;
import
{
AzureStorageClientUtility
}
from
'
../azureStorageClientUtils
'
;
import
{
NFSConfig
}
from
'
../kubernetesConfig
'
;
import
{
KubernetesTrialJobDetail
}
from
'
../kubernetesData
'
;
import
{
KubernetesTrainingService
}
from
'
../kubernetesTrainingService
'
;
import
{
FrameworkControllerTrialConfig
,
FrameworkControllerClusterConfig
,
FrameworkControllerClusterConfigAzure
,
FrameworkControllerClusterConfigNFS
,
FrameworkControllerClusterConfigFactory
}
from
'
./frameworkcontrollerConfig
'
;
import
{
FrameworkControllerJobRestServer
}
from
'
./frameworkcontrollerJobRestServer
'
;
import
{
FrameworkControllerClient
}
from
'
./frameworkcontrollerApiClient
'
;
import
{
FrameworkControllerClusterConfig
,
FrameworkControllerClusterConfigAzure
,
FrameworkControllerClusterConfigFactory
,
FrameworkControllerClusterConfigNFS
,
FrameworkControllerTrialConfig
}
from
'
./frameworkcontrollerConfig
'
;
import
{
FrameworkControllerJobInfoCollector
}
from
'
./frameworkcontrollerJobInfoCollector
'
;
import
{
FrameworkControllerJobRestServer
}
from
'
./frameworkcontrollerJobRestServer
'
;
/**
* Training Service implementation for frameworkcontroller
...
...
@@ -49,8 +47,8 @@ import { FrameworkControllerJobInfoCollector } from './frameworkcontrollerJobInf
@
component
.
Singleton
class
FrameworkControllerTrainingService
extends
KubernetesTrainingService
implements
KubernetesTrainingService
{
private
fcTrialConfig
?:
FrameworkControllerTrialConfig
;
// frameworkcontroller trial configuration
private
fcJobInfoCollector
:
FrameworkControllerJobInfoCollector
;
// frameworkcontroller job info collector
private
fcContainerPortMap
=
new
Map
<
string
,
number
>
();
// store frameworkcontroller container port
private
readonly
fcJobInfoCollector
:
FrameworkControllerJobInfoCollector
;
// frameworkcontroller job info collector
private
readonly
fcContainerPortMap
:
Map
<
string
,
number
>
=
new
Map
<
string
,
number
>
();
// store frameworkcontroller container port
private
fcClusterConfig
?:
FrameworkControllerClusterConfig
;
constructor
()
{
...
...
@@ -62,7 +60,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
public
async
run
():
Promise
<
void
>
{
this
.
kubernetesJobRestServer
=
component
.
get
(
FrameworkControllerJobRestServer
);
if
(
!
this
.
kubernetesJobRestServer
)
{
if
(
this
.
kubernetesJobRestServer
===
undefined
)
{
throw
new
Error
(
'
kubernetesJobRestServer not initialized!
'
);
}
await
this
.
kubernetesJobRestServer
.
start
();
...
...
@@ -72,7 +70,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
// collect metrics for frameworkcontroller jobs by interacting with Kubernetes API server
await
delay
(
3000
);
await
this
.
fcJobInfoCollector
.
retrieveTrialStatus
(
this
.
kubernetesCRDClient
);
if
(
this
.
kubernetesJobRestServer
.
getErrorMessage
)
{
if
(
this
.
kubernetesJobRestServer
.
getErrorMessage
!==
undefined
)
{
throw
new
Error
(
this
.
kubernetesJobRestServer
.
getErrorMessage
);
this
.
stopping
=
true
;
}
...
...
@@ -80,14 +78,14 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
}
public
async
submitTrialJob
(
form
:
JobApplicationForm
):
Promise
<
TrialJobDetail
>
{
if
(
!
this
.
fcClusterConfig
)
{
if
(
this
.
fcClusterConfig
===
undefined
)
{
throw
new
Error
(
'
frameworkcontrollerClusterConfig is not initialized
'
);
}
if
(
!
this
.
kubernetesCRDClient
)
{
if
(
this
.
kubernetesCRDClient
===
undefined
)
{
throw
new
Error
(
'
kubernetesCRDClient is undefined
'
);
}
if
(
!
this
.
kubernetesRestServerPort
)
{
if
(
this
.
kubernetesRestServerPort
===
undefined
)
{
const
restServer
:
FrameworkControllerJobRestServer
=
component
.
get
(
FrameworkControllerJobRestServer
);
this
.
kubernetesRestServerPort
=
restServer
.
clusterRestServerPort
;
}
...
...
@@ -97,13 +95,13 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
// Set trial's NFS working folder
const
trialWorkingFolder
:
string
=
path
.
join
(
this
.
CONTAINER_MOUNT_PATH
,
'
nni
'
,
getExperimentId
(),
trialJobId
);
const
trialLocalTempFolder
:
string
=
path
.
join
(
getExperimentRootDir
(),
'
trials-local
'
,
trialJobId
);
const
frameworkcontrollerJobName
=
`nniexp
${
this
.
experimentId
}
trial
${
trialJobId
}
`
.
toLowerCase
();
const
frameworkcontrollerJobName
:
string
=
`nniexp
${
this
.
experimentId
}
trial
${
trialJobId
}
`
.
toLowerCase
();
//Generate the port used for taskRole
this
.
generateContainerPort
();
await
this
.
prepareRunScript
(
trialLocalTempFolder
,
curTrialSequenceId
,
trialJobId
,
trialWorkingFolder
,
form
);
//upload code files
le
t
trialJobOutputUrl
:
string
=
await
this
.
uploadCodeFiles
(
trialJobId
,
trialLocalTempFolder
);
cons
t
trialJobOutputUrl
:
string
=
await
this
.
uploadCodeFiles
(
trialJobId
,
trialLocalTempFolder
);
const
trialJobDetail
:
KubernetesTrialJobDetail
=
new
KubernetesTrialJobDetail
(
trialJobId
,
...
...
@@ -120,7 +118,9 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
this
.
trialJobsMap
.
set
(
trialJobId
,
trialJobDetail
);
// Create frameworkcontroller job based on generated frameworkcontroller job resource config
const
frameworkcontrollerJobConfig
=
await
this
.
prepareFrameworkControllerConfig
(
trialJobId
,
trialWorkingFolder
,
frameworkcontrollerJobName
);
// tslint:disable-next-line:no-any
const
frameworkcontrollerJobConfig
:
any
=
await
this
.
prepareFrameworkControllerConfig
(
trialJobId
,
trialWorkingFolder
,
frameworkcontrollerJobName
);
await
this
.
kubernetesCRDClient
.
createKubernetesJob
(
frameworkcontrollerJobConfig
);
// Set trial job detail until create frameworkcontroller job successfully
...
...
@@ -129,6 +129,67 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
return
Promise
.
resolve
(
trialJobDetail
);
}
// tslint:disable:no-redundant-jsdoc no-any no-unsafe-any
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
switch
(
key
)
{
case
TrialConfigMetadataKey
.
NNI_MANAGER_IP
:
this
.
nniManagerIpConfig
=
<
NNIManagerIpConfig
>
JSON
.
parse
(
value
);
break
;
case
TrialConfigMetadataKey
.
FRAMEWORKCONTROLLER_CLUSTER_CONFIG
:
const
frameworkcontrollerClusterJsonObject
:
any
=
JSON
.
parse
(
value
);
this
.
fcClusterConfig
=
FrameworkControllerClusterConfigFactory
.
generateFrameworkControllerClusterConfig
(
frameworkcontrollerClusterJsonObject
);
if
(
this
.
fcClusterConfig
.
storageType
===
'
azureStorage
'
)
{
const
azureFrameworkControllerClusterConfig
:
FrameworkControllerClusterConfigAzure
=
<
FrameworkControllerClusterConfigAzure
>
this
.
fcClusterConfig
;
this
.
azureStorageAccountName
=
azureFrameworkControllerClusterConfig
.
azureStorage
.
accountName
;
this
.
azureStorageShare
=
azureFrameworkControllerClusterConfig
.
azureStorage
.
azureShare
;
await
this
.
createAzureStorage
(
azureFrameworkControllerClusterConfig
.
keyVault
.
vaultName
,
azureFrameworkControllerClusterConfig
.
keyVault
.
name
,
azureFrameworkControllerClusterConfig
.
azureStorage
.
accountName
,
azureFrameworkControllerClusterConfig
.
azureStorage
.
azureShare
);
}
else
if
(
this
.
fcClusterConfig
.
storageType
===
'
nfs
'
)
{
const
nfsFrameworkControllerClusterConfig
:
FrameworkControllerClusterConfigNFS
=
<
FrameworkControllerClusterConfigNFS
>
this
.
fcClusterConfig
;
await
this
.
createNFSStorage
(
nfsFrameworkControllerClusterConfig
.
nfs
.
server
,
nfsFrameworkControllerClusterConfig
.
nfs
.
path
);
}
this
.
kubernetesCRDClient
=
FrameworkControllerClient
.
generateFrameworkControllerClient
();
break
;
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
const
frameworkcontrollerTrialJsonObjsect
:
any
=
JSON
.
parse
(
value
);
this
.
fcTrialConfig
=
new
FrameworkControllerTrialConfig
(
frameworkcontrollerTrialJsonObjsect
.
codeDir
,
frameworkcontrollerTrialJsonObjsect
.
taskRoles
);
// Validate to make sure codeDir doesn't have too many files
try
{
await
validateCodeDir
(
this
.
fcTrialConfig
.
codeDir
);
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
return
Promise
.
reject
(
new
Error
(
error
));
}
break
;
case
TrialConfigMetadataKey
.
VERSION_CHECK
:
this
.
versionCheck
=
(
value
===
'
true
'
||
value
===
'
True
'
);
break
;
case
TrialConfigMetadataKey
.
LOG_COLLECTION
:
this
.
logCollection
=
value
;
break
;
default
:
}
return
Promise
.
resolve
();
}
// tslint:enable: no-any no-unsafe-any
/**
* upload code files to nfs or azureStroage
* @param trialJobId
...
...
@@ -136,33 +197,40 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
* return: trialJobOutputUrl
*/
private
async
uploadCodeFiles
(
trialJobId
:
string
,
trialLocalTempFolder
:
string
):
Promise
<
string
>
{
if
(
!
this
.
fcClusterConfig
)
{
if
(
this
.
fcClusterConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
}
let
trialJobOutputUrl
:
string
=
''
;
if
(
this
.
fcClusterConfig
.
storageType
===
'
azureStorage
'
)
{
try
{
if
(
this
.
fcClusterConfig
.
storageType
===
'
azureStorage
'
)
{
if
(
this
.
azureStorageClient
===
undefined
)
{
throw
new
Error
(
'
azureStorageClient is not initialized
'
);
}
try
{
//upload local files to azure storage
await
AzureStorageClientUtility
.
uploadDirectory
(
this
.
azureStorageClient
,
`nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
,
this
.
azureStorageShare
,
`
${
trialLocalTempFolder
}
`
);
await
AzureStorageClientUtility
.
uploadDirectory
(
this
.
azureStorageClient
,
`nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
,
this
.
azureStorageShare
,
`
${
trialLocalTempFolder
}
`
);
trialJobOutputUrl
=
`https://
${
this
.
azureStorageAccountName
}
.file.core.windows.net/
${
this
.
azureStorageShare
}
/
${
path
.
join
(
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
}
catch
(
error
){
trialJobOutputUrl
=
`https://
${
this
.
azureStorageAccountName
}
.file.core.windows.net/\
${
this
.
azureStorageShare
}
/
${
path
.
join
(
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
;
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
return
Promise
.
reject
(
error
);
}
}
else
if
(
this
.
fcClusterConfig
.
storageType
===
'
nfs
'
)
{
let
nfsFrameworkControllerClusterConfig
:
FrameworkControllerClusterConfigNFS
=
<
FrameworkControllerClusterConfigNFS
>
this
.
fcClusterConfig
;
}
else
if
(
this
.
fcClusterConfig
.
storageType
===
'
nfs
'
)
{
const
nfsFrameworkControllerClusterConfig
:
FrameworkControllerClusterConfigNFS
=
<
FrameworkControllerClusterConfigNFS
>
this
.
fcClusterConfig
;
// Creat work dir for current trial in NFS directory
await
cpp
.
exec
(
`mkdir -p
${
this
.
trialLocalNFSTempFolder
}
/nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
);
// Copy code files from local dir to NFS mounted dir
await
cpp
.
exec
(
`cp -r
${
trialLocalTempFolder
}
/*
${
this
.
trialLocalNFSTempFolder
}
/nni/
${
getExperimentId
()}
/
${
trialJobId
}
/.`
);
const
nfsConfig
:
NFSConfig
=
nfsFrameworkControllerClusterConfig
.
nfs
;
trialJobOutputUrl
=
`nfs://
${
nfsConfig
.
server
}
:
${
path
.
join
(
nfsConfig
.
path
,
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
trialJobOutputUrl
=
`nfs://
${
nfsConfig
.
server
}
:
${
path
.
join
(
nfsConfig
.
path
,
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
;
}
return
Promise
.
resolve
(
trialJobOutputUrl
);
}
...
...
@@ -172,126 +240,76 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
* @param command
*/
private
generateCommandScript
(
command
:
string
):
string
{
let
portScript
=
''
;
if
(
!
this
.
fcTrialConfig
)
{
let
portScript
:
string
=
''
;
if
(
this
.
fcTrialConfig
===
undefined
)
{
throw
new
Error
(
'
frameworkcontroller trial config is not initialized
'
);
}
for
(
le
t
taskRole
of
this
.
fcTrialConfig
.
taskRoles
)
{
for
(
cons
t
taskRole
of
this
.
fcTrialConfig
.
taskRoles
)
{
portScript
+=
`FB_
${
taskRole
.
name
.
toUpperCase
()}
_PORT=
${
this
.
fcContainerPortMap
.
get
(
taskRole
.
name
)}
`
;
}
return
`
${
portScript
}
. /mnt/frameworkbarrier/injector.sh &&
${
command
}
`
;
}
private
async
prepareRunScript
(
trialLocalTempFolder
:
string
,
curTrialSequenceId
:
number
,
trialJobId
:
string
,
trialWorkingFolder
:
string
,
form
:
JobApplicationForm
):
Promise
<
void
>
{
if
(
!
this
.
fcTrialConfig
)
{
private
async
prepareRunScript
(
trialLocalTempFolder
:
string
,
curTrialSequenceId
:
number
,
trialJobId
:
string
,
trialWorkingFolder
:
string
,
form
:
JobApplicationForm
):
Promise
<
void
>
{
if
(
this
.
fcTrialConfig
===
undefined
)
{
throw
new
Error
(
'
frameworkcontroller trial config is not initialized
'
);
}
await
cpp
.
exec
(
`mkdir -p
${
path
.
dirname
(
trialLocalTempFolder
)}
`
);
await
cpp
.
exec
(
`cp -r
${
this
.
fcTrialConfig
.
codeDir
}
${
trialLocalTempFolder
}
`
);
const
run
ScriptContent
:
string
=
CONTAINER_INSTALL_NNI_SHELL_FORMAT
;
const
install
ScriptContent
:
string
=
CONTAINER_INSTALL_NNI_SHELL_FORMAT
;
// Write NNI installation file to local tmp files
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
install_nni.sh
'
),
run
ScriptContent
,
{
encoding
:
'
utf8
'
});
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
install_nni.sh
'
),
install
ScriptContent
,
{
encoding
:
'
utf8
'
});
// Create tmp trial working folder locally.
await
cpp
.
exec
(
`mkdir -p
${
trialLocalTempFolder
}
`
);
for
(
let
taskRole
of
this
.
fcTrialConfig
.
taskRoles
)
{
const
runScriptContent
:
string
=
await
this
.
generateRunScript
(
'
frameworkcontroller
'
,
trialJobId
,
trialWorkingFolder
,
this
.
generateCommandScript
(
taskRole
.
command
),
curTrialSequenceId
.
toString
(),
taskRole
.
name
,
taskRole
.
gpuNum
);
for
(
const
taskRole
of
this
.
fcTrialConfig
.
taskRoles
)
{
const
runScriptContent
:
string
=
await
this
.
generateRunScript
(
'
frameworkcontroller
'
,
trialJobId
,
trialWorkingFolder
,
this
.
generateCommandScript
(
taskRole
.
command
),
curTrialSequenceId
.
toString
(),
taskRole
.
name
,
taskRole
.
gpuNum
);
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
`run_
${
taskRole
.
name
}
.sh`
),
runScriptContent
,
{
encoding
:
'
utf8
'
});
}
// Write file content ( parameter.cfg ) to local tmp folders
const
trialForm
:
TrialJobApplicationForm
=
(
<
TrialJobApplicationForm
>
form
)
if
(
trialForm
&&
trialForm
.
hyperParameters
)
{
const
trialForm
:
TrialJobApplicationForm
=
(
<
TrialJobApplicationForm
>
form
)
;
if
(
trialForm
!==
undefined
&&
trialForm
.
hyperParameters
!==
undefined
)
{
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
generateParamFileName
(
trialForm
.
hyperParameters
)),
trialForm
.
hyperParameters
.
value
,
{
encoding
:
'
utf8
'
});
}
}
private
async
prepareFrameworkControllerConfig
(
trialJobId
:
string
,
trialWorkingFolder
:
string
,
frameworkcontrollerJobName
:
string
):
Promise
<
any
>
{
// tslint:disable: no-any no-unsafe-any
private
async
prepareFrameworkControllerConfig
(
trialJobId
:
string
,
trialWorkingFolder
:
string
,
frameworkcontrollerJobName
:
string
):
Promise
<
any
>
{
if
(
!
this
.
fcTrialConfig
)
{
if
(
this
.
fcTrialConfig
===
undefined
)
{
throw
new
Error
(
'
frameworkcontroller trial config is not initialized
'
);
}
const
podResources
:
any
=
[];
for
(
le
t
taskRole
of
this
.
fcTrialConfig
.
taskRoles
)
{
le
t
resource
:
any
=
{};
for
(
cons
t
taskRole
of
this
.
fcTrialConfig
.
taskRoles
)
{
cons
t
resource
:
any
=
{};
resource
.
requests
=
this
.
generatePodResource
(
taskRole
.
memoryMB
,
taskRole
.
cpuNum
,
taskRole
.
gpuNum
);
resource
.
limits
=
Object
.
assign
({},
resource
.
requests
)
;
resource
.
limits
=
{...
resource
.
requests
}
;
podResources
.
push
(
resource
);
}
// Generate frameworkcontroller job resource config object
const
frameworkcontrollerJobConfig
:
any
=
this
.
generateFrameworkControllerJobConfig
(
trialJobId
,
trialWorkingFolder
,
frameworkcontrollerJobName
,
podResources
);
const
frameworkcontrollerJobConfig
:
any
=
this
.
generateFrameworkControllerJobConfig
(
trialJobId
,
trialWorkingFolder
,
frameworkcontrollerJobName
,
podResources
);
return
Promise
.
resolve
(
frameworkcontrollerJobConfig
);
}
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
switch
(
key
)
{
case
TrialConfigMetadataKey
.
NNI_MANAGER_IP
:
this
.
nniManagerIpConfig
=
<
NNIManagerIpConfig
>
JSON
.
parse
(
value
);
break
;
case
TrialConfigMetadataKey
.
FRAMEWORKCONTROLLER_CLUSTER_CONFIG
:
let
frameworkcontrollerClusterJsonObject
=
JSON
.
parse
(
value
);
this
.
fcClusterConfig
=
FrameworkControllerClusterConfigFactory
.
generateFrameworkControllerClusterConfig
(
frameworkcontrollerClusterJsonObject
);
if
(
this
.
fcClusterConfig
.
storageType
===
'
azureStorage
'
)
{
let
azureFrameworkControllerClusterConfig
=
<
FrameworkControllerClusterConfigAzure
>
this
.
fcClusterConfig
;
this
.
azureStorageAccountName
=
azureFrameworkControllerClusterConfig
.
azureStorage
.
accountName
;
this
.
azureStorageShare
=
azureFrameworkControllerClusterConfig
.
azureStorage
.
azureShare
;
await
this
.
createAzureStorage
(
azureFrameworkControllerClusterConfig
.
keyVault
.
vaultName
,
azureFrameworkControllerClusterConfig
.
keyVault
.
name
,
azureFrameworkControllerClusterConfig
.
azureStorage
.
accountName
,
azureFrameworkControllerClusterConfig
.
azureStorage
.
azureShare
);
}
else
if
(
this
.
fcClusterConfig
.
storageType
===
'
nfs
'
)
{
let
nfsFrameworkControllerClusterConfig
=
<
FrameworkControllerClusterConfigNFS
>
this
.
fcClusterConfig
;
await
this
.
createNFSStorage
(
nfsFrameworkControllerClusterConfig
.
nfs
.
server
,
nfsFrameworkControllerClusterConfig
.
nfs
.
path
);
}
this
.
kubernetesCRDClient
=
FrameworkControllerClient
.
generateFrameworkControllerClient
();
break
;
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
let
frameworkcontrollerTrialJsonObjsect
=
JSON
.
parse
(
value
);
this
.
fcTrialConfig
=
new
FrameworkControllerTrialConfig
(
frameworkcontrollerTrialJsonObjsect
.
codeDir
,
frameworkcontrollerTrialJsonObjsect
.
taskRoles
);
// Validate to make sure codeDir doesn't have too many files
try
{
await
validateCodeDir
(
this
.
fcTrialConfig
.
codeDir
);
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
return
Promise
.
reject
(
new
Error
(
error
));
}
break
;
case
TrialConfigMetadataKey
.
VERSION_CHECK
:
this
.
versionCheck
=
(
value
===
'
true
'
||
value
===
'
True
'
);
break
;
case
TrialConfigMetadataKey
.
LOG_COLLECTION
:
this
.
logCollection
=
value
;
break
;
default
:
break
;
}
return
Promise
.
resolve
();
}
private
generateContainerPort
()
{
if
(
!
this
.
fcTrialConfig
)
{
private
generateContainerPort
():
void
{
if
(
this
.
fcTrialConfig
===
undefined
)
{
throw
new
Error
(
'
frameworkcontroller trial config is not initialized
'
);
}
let
port
=
4000
;
//The default port used in container
for
(
le
t
index
in
this
.
fcTrialConfig
.
taskRoles
)
{
let
port
:
number
=
4000
;
//The default port used in container
for
(
cons
t
index
of
this
.
fcTrialConfig
.
taskRoles
.
keys
()
)
{
this
.
fcContainerPortMap
.
set
(
this
.
fcTrialConfig
.
taskRoles
[
index
].
name
,
port
);
port
+=
1
;
}
...
...
@@ -304,22 +322,23 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
* @param frameworkcontrollerJobName job name
* @param podResources pod template
*/
private
generateFrameworkControllerJobConfig
(
trialJobId
:
string
,
trialWorkingFolder
:
string
,
frameworkcontrollerJobName
:
string
,
podResources
:
any
)
:
any
{
if
(
!
this
.
fcClusterConfig
)
{
private
generateFrameworkControllerJobConfig
(
trialJobId
:
string
,
trialWorkingFolder
:
string
,
frameworkcontrollerJobName
:
string
,
podResources
:
any
)
:
any
{
if
(
this
.
fcClusterConfig
===
undefined
)
{
throw
new
Error
(
'
frameworkcontroller Cluster config is not initialized
'
);
}
if
(
!
this
.
fcTrialConfig
)
{
if
(
this
.
fcTrialConfig
===
undefined
)
{
throw
new
Error
(
'
frameworkcontroller trial config is not initialized
'
);
}
le
t
taskRoles
=
[];
for
(
le
t
index
in
this
.
fcTrialConfig
.
taskRoles
)
{
le
t
containerPort
=
this
.
fcContainerPortMap
.
get
(
this
.
fcTrialConfig
.
taskRoles
[
index
].
name
);
if
(
!
containerPort
)
{
cons
t
taskRoles
:
any
=
[];
for
(
cons
t
index
of
this
.
fcTrialConfig
.
taskRoles
.
keys
()
)
{
cons
t
containerPort
:
number
|
undefined
=
this
.
fcContainerPortMap
.
get
(
this
.
fcTrialConfig
.
taskRoles
[
index
].
name
);
if
(
containerPort
===
undefined
)
{
throw
new
Error
(
'
Container port is not initialized
'
);
}
le
t
taskRole
=
this
.
generateTaskRoleConfig
(
cons
t
taskRole
:
any
=
this
.
generateTaskRoleConfig
(
trialWorkingFolder
,
this
.
fcTrialConfig
.
taskRoles
[
index
].
image
,
`run_
${
this
.
fcTrialConfig
.
taskRoles
[
index
].
name
}
.sh`
,
...
...
@@ -356,19 +375,18 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
};
}
private
generateTaskRoleConfig
(
trialWorkingFolder
:
string
,
replicaImage
:
string
,
runScriptFile
:
string
,
podResources
:
any
,
containerPort
:
number
):
any
{
if
(
!
this
.
fcClusterConfig
)
{
private
generateTaskRoleConfig
(
trialWorkingFolder
:
string
,
replicaImage
:
string
,
runScriptFile
:
string
,
podResources
:
any
,
containerPort
:
number
):
any
{
if
(
this
.
fcClusterConfig
===
undefined
)
{
throw
new
Error
(
'
frameworkcontroller Cluster config is not initialized
'
);
}
if
(
!
this
.
fcTrialConfig
)
{
if
(
this
.
fcTrialConfig
===
undefined
)
{
throw
new
Error
(
'
frameworkcontroller trial config is not initialized
'
);
}
le
t
volumeSpecMap
=
new
Map
<
string
,
object
>
();
if
(
this
.
fcClusterConfig
.
storageType
===
'
azureStorage
'
){
cons
t
volumeSpecMap
:
Map
<
string
,
object
>
=
new
Map
<
string
,
object
>
();
if
(
this
.
fcClusterConfig
.
storageType
===
'
azureStorage
'
)
{
volumeSpecMap
.
set
(
'
nniVolumes
'
,
[
{
name
:
'
nni-vol
'
,
...
...
@@ -380,9 +398,10 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
},
{
name
:
'
frameworkbarrier-volume
'
,
emptyDir
:
{}
}])
}
else
{
let
frameworkcontrollerClusterConfigNFS
:
FrameworkControllerClusterConfigNFS
=
<
FrameworkControllerClusterConfigNFS
>
this
.
fcClusterConfig
;
}]);
}
else
{
const
frameworkcontrollerClusterConfigNFS
:
FrameworkControllerClusterConfigNFS
=
<
FrameworkControllerClusterConfigNFS
>
this
.
fcClusterConfig
;
volumeSpecMap
.
set
(
'
nniVolumes
'
,
[
{
name
:
'
nni-vol
'
,
...
...
@@ -393,19 +412,19 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
},
{
name
:
'
frameworkbarrier-volume
'
,
emptyDir
:
{}
}])
}])
;
}
le
t
containers
=
[
cons
t
containers
:
any
=
[
{
name
:
'
framework
'
,
image
:
replicaImage
,
command
:
[
"
sh
"
,
`
${
path
.
join
(
trialWorkingFolder
,
runScriptFile
)}
`
],
command
:
[
'
sh
'
,
`
${
path
.
join
(
trialWorkingFolder
,
runScriptFile
)}
`
],
volumeMounts
:
[
{
name
:
'
nni-vol
'
,
mountPath
:
this
.
CONTAINER_MOUNT_PATH
},{
},
{
name
:
'
frameworkbarrier-volume
'
,
mountPath
:
'
/mnt/frameworkbarrier
'
}],
...
...
@@ -413,9 +432,9 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
ports
:
[{
containerPort
:
containerPort
}]
}]
}]
;
le
t
initContainers
=
[
cons
t
initContainers
:
any
=
[
{
name
:
'
frameworkbarrier
'
,
image
:
'
frameworkcontroller/frameworkbarrier
'
,
...
...
@@ -424,24 +443,25 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
name
:
'
frameworkbarrier-volume
'
,
mountPath
:
'
/mnt/frameworkbarrier
'
}]
}]
le
t
spec
:
any
=
{
}]
;
cons
t
spec
:
any
=
{
containers
:
containers
,
initContainers
:
initContainers
,
restartPolicy
:
'
OnFailure
'
,
volumes
:
volumeSpecMap
.
get
(
'
nniVolumes
'
),
hostNetwork
:
false
};
if
(
this
.
fcClusterConfig
.
serviceAccountName
)
{
if
(
this
.
fcClusterConfig
.
serviceAccountName
!==
undefined
)
{
spec
.
serviceAccountName
=
this
.
fcClusterConfig
.
serviceAccountName
;
}
let
taskRole
=
{
return
{
pod
:
{
spec
:
spec
}
};
}
return
taskRole
;
}
// tslint:enable: no-any no-unsafe-any
}
export
{
FrameworkControllerTrainingService
}
export
{
FrameworkControllerTrainingService
}
;
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowApiClient.ts
View file @
ba8dccd6
...
...
@@ -20,18 +20,22 @@
'
use strict
'
;
import
*
as
fs
from
'
fs
'
;
import
{
GeneralK8sClient
,
KubernetesCRDClient
}
from
'
../kubernetesApiClient
'
;
import
{
KubeflowOperator
}
from
'
./kubeflowConfig
'
;
import
{
KubernetesCRDClient
,
GeneralK8sClient
}
from
'
../kubernetesApiClient
'
;
abstract
class
KubeflowOperatorClient
extends
KubernetesCRDClient
{
/**
* KubeflowOperator Client
*/
abstract
class
KubeflowOperatorClient
extends
KubernetesCRDClient
{
/**
* Factory method to generate operator cliet
* Factory method to generate operator clie
n
t
*/
// tslint:disable-next-line:function-name
public
static
generateOperatorClient
(
kubeflowOperator
:
KubeflowOperator
,
operatorApiVersion
:
string
):
KubernetesCRDClient
{
switch
(
kubeflowOperator
)
{
switch
(
kubeflowOperator
)
{
case
'
tf-operator
'
:
{
switch
(
operatorApiVersion
)
{
switch
(
operatorApiVersion
)
{
case
'
v1alpha2
'
:
{
return
new
TFOperatorClientV1Alpha2
();
}
...
...
@@ -41,11 +45,12 @@ abstract class KubeflowOperatorClient extends KubernetesCRDClient{
case
'
v1beta2
'
:
{
return
new
TFOperatorClientV1Beta2
();
}
default
:
throw
new
Error
(
`Invalid tf-operator apiVersion
${
operatorApiVersion
}
`
);
}
break
;
}
case
'
pytorch-operator
'
:
{
switch
(
operatorApiVersion
)
{
switch
(
operatorApiVersion
)
{
case
'
v1alpha2
'
:
{
return
new
PyTorchOperatorClientV1Alpha2
();
}
...
...
@@ -55,13 +60,17 @@ abstract class KubeflowOperatorClient extends KubernetesCRDClient{
case
'
v1beta2
'
:
{
return
new
PyTorchOperatorClientV1Beta2
();
}
default
:
throw
new
Error
(
`Invalid pytorch-operator apiVersion
${
operatorApiVersion
}
`
);
}
}
default
:
throw
new
Error
(
`Invalid operator
${
kubeflowOperator
}
`
);
}
throw
new
Error
(
`Invalid operator
${
kubeflowOperator
}
or apiVersion
${
operatorApiVersion
}
`
);
}
}
// tslint:disable: no-unsafe-any no-any completed-docs
class
TFOperatorClientV1Alpha2
extends
KubeflowOperatorClient
{
/**
* constructor, to initialize tfjob CRD definition
...
...
@@ -73,7 +82,7 @@ class TFOperatorClientV1Alpha2 extends KubeflowOperatorClient {
}
protected
get
operator
():
any
{
return
this
.
client
.
apis
[
"
kubeflow.org
"
].
v1alpha2
.
namespaces
(
'
default
'
).
tfjobs
;
return
this
.
client
.
apis
[
'
kubeflow.org
'
].
v1alpha2
.
namespaces
(
'
default
'
).
tfjobs
;
}
public
get
containerName
():
string
{
...
...
@@ -92,7 +101,7 @@ class TFOperatorClientV1Beta1 extends KubernetesCRDClient {
}
protected
get
operator
():
any
{
return
this
.
client
.
apis
[
"
kubeflow.org
"
].
v1beta1
.
namespaces
(
'
default
'
).
tfjobs
;
return
this
.
client
.
apis
[
'
kubeflow.org
'
].
v1beta1
.
namespaces
(
'
default
'
).
tfjobs
;
}
public
get
containerName
():
string
{
...
...
@@ -111,7 +120,7 @@ class TFOperatorClientV1Beta2 extends KubernetesCRDClient {
}
protected
get
operator
():
any
{
return
this
.
client
.
apis
[
"
kubeflow.org
"
].
v1beta2
.
namespaces
(
'
default
'
).
tfjobs
;
return
this
.
client
.
apis
[
'
kubeflow.org
'
].
v1beta2
.
namespaces
(
'
default
'
).
tfjobs
;
}
public
get
containerName
():
string
{
...
...
@@ -130,7 +139,7 @@ class PyTorchOperatorClientV1Alpha2 extends KubeflowOperatorClient {
}
protected
get
operator
():
any
{
return
this
.
client
.
apis
[
"
kubeflow.org
"
].
v1alpha2
.
namespaces
(
'
default
'
).
pytorchjobs
;
return
this
.
client
.
apis
[
'
kubeflow.org
'
].
v1alpha2
.
namespaces
(
'
default
'
).
pytorchjobs
;
}
public
get
containerName
():
string
{
...
...
@@ -149,7 +158,7 @@ class PyTorchOperatorClientV1Beta1 extends KubernetesCRDClient {
}
protected
get
operator
():
any
{
return
this
.
client
.
apis
[
"
kubeflow.org
"
].
v1beta1
.
namespaces
(
'
default
'
).
pytorchjobs
;
return
this
.
client
.
apis
[
'
kubeflow.org
'
].
v1beta1
.
namespaces
(
'
default
'
).
pytorchjobs
;
}
public
get
containerName
():
string
{
...
...
@@ -168,7 +177,7 @@ class PyTorchOperatorClientV1Beta2 extends KubernetesCRDClient {
}
protected
get
operator
():
any
{
return
this
.
client
.
apis
[
"
kubeflow.org
"
].
v1beta2
.
namespaces
(
'
default
'
).
pytorchjobs
;
return
this
.
client
.
apis
[
'
kubeflow.org
'
].
v1beta2
.
namespaces
(
'
default
'
).
pytorchjobs
;
}
public
get
containerName
():
string
{
...
...
@@ -176,5 +185,5 @@ class PyTorchOperatorClientV1Beta2 extends KubernetesCRDClient {
}
}
// tslint:enable: no-unsafe-any
export
{
KubeflowOperatorClient
,
GeneralK8sClient
};
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowConfig.ts
View file @
ba8dccd6
...
...
@@ -20,16 +20,20 @@
'
use strict
'
;
import
*
as
assert
from
'
assert
'
;
import
{
KubernetesClusterConfigAzure
,
KubernetesClusterConfigNFS
,
KubernetesStorageKind
,
NFSConfig
,
AzureStorage
,
keyVaultConfig
,
KubernetesTrialConfig
,
KubernetesTrialConfigTemplate
,
StorageConfig
,
KubernetesClusterConfig
}
from
'
../kubernetesConfig
'
import
{
MethodNotImplementedError
}
from
'
../../../common/errors
'
;
import
{
AzureStorage
,
KeyVaultConfig
,
KubernetesClusterConfig
,
KubernetesClusterConfigAzure
,
KubernetesClusterConfigNFS
,
KubernetesStorageKind
,
KubernetesTrialConfig
,
KubernetesTrialConfigTemplate
,
NFSConfig
,
StorageConfig
}
from
'
../kubernetesConfig
'
;
/
**
operator types that kubeflow supported
*/
/
/
operator types that kubeflow supported
export
type
KubeflowOperator
=
'
tf-operator
'
|
'
pytorch-operator
'
;
export
type
DistTrainRole
=
'
worker
'
|
'
ps
'
|
'
master
'
;
export
type
KubeflowJobStatus
=
'
Created
'
|
'
Running
'
|
'
Failed
'
|
'
Succeeded
'
;
export
type
OperatorApiVersion
=
'
v1alpha2
'
|
'
v1beta1
'
|
'
v1beta2
'
;
/**
* Kubeflow Cluster Configuration
*/
export
class
KubeflowClusterConfig
extends
KubernetesClusterConfig
{
public
readonly
operator
:
KubeflowOperator
;
constructor
(
apiVersion
:
string
,
operator
:
KubeflowOperator
)
{
...
...
@@ -38,6 +42,7 @@ export class KubeflowClusterConfig extends KubernetesClusterConfig {
}
}
// tslint:disable:completed-docs
export
class
KubeflowClusterConfigNFS
extends
KubernetesClusterConfigNFS
{
public
readonly
operator
:
KubeflowOperator
;
constructor
(
...
...
@@ -54,9 +59,11 @@ export class KubeflowClusterConfigNFS extends KubernetesClusterConfigNFS {
return
'
nfs
'
;
}
// tslint:disable-next-line:function-name
public
static
getInstance
(
jsonObject
:
object
):
KubeflowClusterConfigNFS
{
let
kubeflowClusterConfigObjectNFS
=
<
KubeflowClusterConfigNFS
>
jsonObject
;
assert
(
kubeflowClusterConfigObjectNFS
!==
undefined
)
const
kubeflowClusterConfigObjectNFS
:
KubeflowClusterConfigNFS
=
<
KubeflowClusterConfigNFS
>
jsonObject
;
assert
(
kubeflowClusterConfigObjectNFS
!==
undefined
);
return
new
KubeflowClusterConfigNFS
(
kubeflowClusterConfigObjectNFS
.
operator
,
kubeflowClusterConfigObjectNFS
.
apiVersion
,
...
...
@@ -66,26 +73,28 @@ export class KubeflowClusterConfigNFS extends KubernetesClusterConfigNFS {
}
}
export
class
KubeflowClusterConfigAzure
extends
KubernetesClusterConfigAzure
{
export
class
KubeflowClusterConfigAzure
extends
KubernetesClusterConfigAzure
{
public
readonly
operator
:
KubeflowOperator
;
constructor
(
operator
:
KubeflowOperator
,
apiVersion
:
string
,
keyVault
:
k
eyVaultConfig
,
keyVault
:
K
eyVaultConfig
,
azureStorage
:
AzureStorage
,
storage
?:
KubernetesStorageKind
)
{
super
(
apiVersion
,
keyVault
,
azureStorage
,
storage
);
super
(
apiVersion
,
keyVault
,
azureStorage
,
storage
);
this
.
operator
=
operator
;
}
public
get
storageType
():
KubernetesStorageKind
{
public
get
storageType
():
KubernetesStorageKind
{
return
'
azureStorage
'
;
}
// tslint:disable-next-line:function-name
public
static
getInstance
(
jsonObject
:
object
):
KubeflowClusterConfigAzure
{
let
kubeflowClusterConfigObjectAzure
=
<
KubeflowClusterConfigAzure
>
jsonObject
;
const
kubeflowClusterConfigObjectAzure
:
KubeflowClusterConfigAzure
=
<
KubeflowClusterConfigAzure
>
jsonObject
;
return
new
KubeflowClusterConfigAzure
(
kubeflowClusterConfigObjectAzure
.
operator
,
kubeflowClusterConfigObjectAzure
.
apiVersion
,
...
...
@@ -98,12 +107,13 @@ export class KubeflowClusterConfigAzure extends KubernetesClusterConfigAzure{
export
class
KubeflowClusterConfigFactory
{
// tslint:disable-next-line:function-name
public
static
generateKubeflowClusterConfig
(
jsonObject
:
object
):
KubeflowClusterConfig
{
let
s
torageConfig
=
<
StorageConfig
>
jsonObject
;
if
(
!
storageConfig
)
{
throw
new
Error
(
"
Invalid json object as a StorageConfig instance
"
);
const
storageConfig
:
S
torageConfig
=
<
StorageConfig
>
jsonObject
;
if
(
storageConfig
===
undefined
)
{
throw
new
Error
(
'
Invalid json object as a StorageConfig instance
'
);
}
if
(
storageConfig
.
storage
&&
storageConfig
.
storage
===
'
azureStorage
'
)
{
if
(
storageConfig
.
storage
!==
undefined
&&
storageConfig
.
storage
===
'
azureStorage
'
)
{
return
KubeflowClusterConfigAzure
.
getInstance
(
jsonObject
);
}
else
if
(
storageConfig
.
storage
===
undefined
||
storageConfig
.
storage
===
'
nfs
'
)
{
return
KubeflowClusterConfigNFS
.
getInstance
(
jsonObject
);
...
...
@@ -122,7 +132,7 @@ export class KubeflowTrialConfig extends KubernetesTrialConfig {
}
}
export
class
KubeflowTrialConfigTemplate
extends
KubernetesTrialConfigTemplate
{
export
class
KubeflowTrialConfigTemplate
extends
KubernetesTrialConfigTemplate
{
public
readonly
replicas
:
number
;
constructor
(
replicas
:
number
,
command
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
)
{
...
...
@@ -163,16 +173,19 @@ export class KubeflowTrialConfigPytorch extends KubeflowTrialConfig {
export
class
KubeflowTrialConfigFactory
{
// tslint:disable-next-line:function-name
public
static
generateKubeflowTrialConfig
(
jsonObject
:
object
,
operator
:
KubeflowOperator
):
KubeflowTrialConfig
{
if
(
operator
===
'
tf-operator
'
){
let
kubeflowTrialConfigObject
=
<
KubeflowTrialConfigTensorflow
>
jsonObject
;
if
(
operator
===
'
tf-operator
'
)
{
const
kubeflowTrialConfigObject
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
jsonObject
;
return
new
KubeflowTrialConfigTensorflow
(
kubeflowTrialConfigObject
.
codeDir
,
kubeflowTrialConfigObject
.
worker
,
kubeflowTrialConfigObject
.
ps
);
}
else
if
(
operator
===
'
pytorch-operator
'
){
let
kubeflowTrialConfigObject
=
<
KubeflowTrialConfigPytorch
>
jsonObject
;
}
else
if
(
operator
===
'
pytorch-operator
'
)
{
const
kubeflowTrialConfigObject
:
KubeflowTrialConfigPytorch
=
<
KubeflowTrialConfigPytorch
>
jsonObject
;
return
new
KubeflowTrialConfigPytorch
(
kubeflowTrialConfigObject
.
codeDir
,
kubeflowTrialConfigObject
.
master
,
...
...
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowJobInfoCollector.ts
View file @
ba8dccd6
...
...
@@ -19,15 +19,15 @@
'
use strict
'
;
import
{
KubernetesTrialJobDetail
}
from
'
../kubernetesData
'
;
import
{
KubernetesCRDClient
}
from
'
../kubernetesApiClient
'
;
import
{
KubernetesTrialJobDetail
}
from
'
../kubernetesData
'
;
import
{
KubernetesJobInfoCollector
}
from
'
../kubernetesJobInfoCollector
'
;
import
{
KubeflowJobStatus
}
from
'
./kubeflowConfig
'
;
/**
* Collector Kubeflow jobs info from Kubernetes cluster, and update kubeflow job status locally
*/
export
class
KubeflowJobInfoCollector
extends
KubernetesJobInfoCollector
{
export
class
KubeflowJobInfoCollector
extends
KubernetesJobInfoCollector
{
constructor
(
jobMap
:
Map
<
string
,
KubernetesTrialJobDetail
>
)
{
super
(
jobMap
);
}
...
...
@@ -38,31 +38,33 @@ export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector{
return
Promise
.
resolve
();
}
if
(
kubernetesCRDClient
===
undefined
)
{
if
(
kubernetesCRDClient
===
undefined
)
{
return
Promise
.
reject
(
'
kubernetesCRDClient is undefined
'
);
}
// tslint:disable:no-any no-unsafe-any
let
kubernetesJobInfo
:
any
;
try
{
kubernetesJobInfo
=
await
kubernetesCRDClient
.
getKubernetesJob
(
kubernetesTrialJob
.
kubernetesJobName
);
}
catch
(
error
)
{
}
catch
(
error
)
{
// Notice: it maynot be a 'real' error since cancel trial job can also cause getKubernetesJob failed.
this
.
log
.
error
(
`Get job
${
kubernetesTrialJob
.
kubernetesJobName
}
info failed, error is
${
error
}
`
);
//This is not treat as a error status
return
Promise
.
resolve
();
}
if
(
kubernetesJobInfo
.
status
&&
kubernetesJobInfo
.
status
.
conditions
)
{
const
latestCondition
=
kubernetesJobInfo
.
status
.
conditions
[
kubernetesJobInfo
.
status
.
conditions
.
length
-
1
];
if
(
kubernetesJobInfo
.
status
&&
kubernetesJobInfo
.
status
.
conditions
)
{
const
latestCondition
:
any
=
kubernetesJobInfo
.
status
.
conditions
[
kubernetesJobInfo
.
status
.
conditions
.
length
-
1
];
const
tfJobType
:
KubeflowJobStatus
=
<
KubeflowJobStatus
>
latestCondition
.
type
;
switch
(
tfJobType
)
{
switch
(
tfJobType
)
{
case
'
Created
'
:
kubernetesTrialJob
.
status
=
'
WAITING
'
;
kubernetesTrialJob
.
startTime
=
Date
.
parse
(
<
string
>
latestCondition
.
lastUpdateTime
);
break
;
case
'
Running
'
:
kubernetesTrialJob
.
status
=
'
RUNNING
'
;
if
(
!
kubernetesTrialJob
.
startTime
)
{
if
(
kubernetesTrialJob
.
startTime
===
undefined
)
{
kubernetesTrialJob
.
startTime
=
Date
.
parse
(
<
string
>
latestCondition
.
lastUpdateTime
);
}
break
;
...
...
@@ -75,9 +77,10 @@ export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector{
kubernetesTrialJob
.
endTime
=
Date
.
parse
(
<
string
>
latestCondition
.
lastUpdateTime
);
break
;
default
:
break
;
}
}
// tslint:enable:no-any no-unsafe-any
return
Promise
.
resolve
();
}
}
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowJobRestServer.ts
View file @
ba8dccd6
...
...
@@ -20,15 +20,15 @@
'
use strict
'
;
import
*
as
component
from
'
../../../common/component
'
;
import
{
KubernetesJobRestServer
}
from
'
../kubernetesJobRestServer
'
;
import
{
KubeflowTrainingService
}
from
'
./kubeflowTrainingService
'
;
import
{
KubernetesJobRestServer
}
from
'
../kubernetesJobRestServer
'
/**
* Kubeflow Training service Rest server, provides rest API to support kubeflow job metrics update
*
*/
@
component
.
Singleton
export
class
KubeflowJobRestServer
extends
KubernetesJobRestServer
{
export
class
KubeflowJobRestServer
extends
KubernetesJobRestServer
{
/**
* constructor to provide NNIRestServer's own rest property, e.g. port
*/
...
...
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts
View file @
ba8dccd6
...
...
@@ -17,35 +17,34 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'
use strict
'
'
use strict
'
;
import
*
as
assert
from
'
assert
'
;
import
*
as
component
from
'
../../../common/component
'
;
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
path
from
'
path
'
;
import
*
as
component
from
'
../../../common/component
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../../common/containerJobData
'
;
import
{
getExperimentId
}
from
'
../../../common/experimentStartupInfo
'
;
import
{
TrialConfigMetadataKey
}
from
'
../../common/trialConfigMetadataKey
'
;
import
{
JobApplicationForm
,
TrialJobApplicationForm
,
TrialJobDetail
,
NNIManagerIpConfig
JobApplicationForm
,
NNIManagerIpConfig
,
TrialJobApplicationForm
,
TrialJobDetail
}
from
'
../../../common/trainingService
'
;
import
{
delay
,
generateParamFileName
,
getExperimentRootDir
,
uniqueString
}
from
'
../../../common/utils
'
;
import
{
KubeflowClusterConfigNFS
,
KubeflowClusterConfigAzure
,
KubeflowTrialConfigPytorch
,
KubeflowTrialConfigTensorflow
,
KubeflowClusterConfigFactory
,
KubeflowTrialConfigFactory
,
KubeflowTrialConfig
,
KubeflowClusterConfig
}
from
'
./kubeflowConfig
'
;
import
{
NFSConfig
}
from
'
../kubernetesConfig
'
import
{
KubernetesTrialJobDetail
}
from
'
../kubernetesData
'
;
import
{
KubeflowJobRestServer
}
from
'
./kubeflowJobRestServer
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../../common/containerJobData
'
;
import
{
TrialConfigMetadataKey
}
from
'
../../common/trialConfigMetadataKey
'
;
import
{
validateCodeDir
}
from
'
../../common/util
'
;
import
{
AzureStorageClientUtility
}
from
'
../azureStorageClientUtils
'
;
import
{
NFSConfig
}
from
'
../kubernetesConfig
'
;
import
{
KubernetesTrialJobDetail
}
from
'
../kubernetesData
'
;
import
{
KubernetesTrainingService
}
from
'
../kubernetesTrainingService
'
;
import
{
KubeflowOperatorClient
}
from
'
./kubeflowApiClient
'
;
import
{
KubernetesTrainingService
}
from
'
../kubernetesTrainingService
'
import
{
KubeflowClusterConfig
,
KubeflowClusterConfigAzure
,
KubeflowClusterConfigFactory
,
KubeflowClusterConfigNFS
,
KubeflowTrialConfig
,
KubeflowTrialConfigFactory
,
KubeflowTrialConfigPytorch
,
KubeflowTrialConfigTensorflow
}
from
'
./kubeflowConfig
'
;
import
{
KubeflowJobInfoCollector
}
from
'
./kubeflowJobInfoCollector
'
;
import
{
KubeflowJobRestServer
}
from
'
./kubeflowJobRestServer
'
;
// tslint:disable: no-unsafe-any no-any
/**
* Training Service implementation for Kubeflow
* Refer https://github.com/kubeflow/kubeflow for more info about Kubeflow
...
...
@@ -54,7 +53,7 @@ import { KubeflowJobInfoCollector } from './kubeflowJobInfoCollector';
class
KubeflowTrainingService
extends
KubernetesTrainingService
implements
KubernetesTrainingService
{
private
kubeflowClusterConfig
?:
KubeflowClusterConfig
;
private
kubeflowTrialConfig
?:
KubeflowTrialConfig
;
private
kubeflowJobInfoCollector
:
KubeflowJobInfoCollector
;
private
readonly
kubeflowJobInfoCollector
:
KubeflowJobInfoCollector
;
constructor
()
{
super
();
...
...
@@ -67,7 +66,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
public
async
run
():
Promise
<
void
>
{
this
.
log
.
info
(
'
Run Kubeflow training service.
'
);
this
.
kubernetesJobRestServer
=
component
.
get
(
KubeflowJobRestServer
);
if
(
!
this
.
kubernetesJobRestServer
)
{
if
(
this
.
kubernetesJobRestServer
===
undefined
)
{
throw
new
Error
(
'
kubernetesJobRestServer not initialized!
'
);
}
await
this
.
kubernetesJobRestServer
.
start
();
...
...
@@ -77,7 +76,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
// collect metrics for Kubeflow jobs by interacting with Kubernetes API server
await
delay
(
3000
);
await
this
.
kubeflowJobInfoCollector
.
retrieveTrialStatus
(
this
.
kubernetesCRDClient
);
if
(
this
.
kubernetesJobRestServer
.
getErrorMessage
)
{
if
(
this
.
kubernetesJobRestServer
.
getErrorMessage
!==
undefined
)
{
throw
new
Error
(
this
.
kubernetesJobRestServer
.
getErrorMessage
);
this
.
stopping
=
true
;
}
...
...
@@ -86,17 +85,17 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
}
public
async
submitTrialJob
(
form
:
JobApplicationForm
):
Promise
<
TrialJobDetail
>
{
if
(
!
this
.
kubernetesCRDClient
)
{
if
(
this
.
kubernetesCRDClient
===
undefined
)
{
throw
new
Error
(
'
Kubeflow job operator client is undefined
'
);
}
if
(
!
this
.
kubernetesRestServerPort
)
{
if
(
this
.
kubernetesRestServerPort
===
undefined
)
{
const
restServer
:
KubeflowJobRestServer
=
component
.
get
(
KubeflowJobRestServer
);
this
.
kubernetesRestServerPort
=
restServer
.
clusterRestServerPort
;
}
const
trialJobId
:
string
=
uniqueString
(
5
);
const
trialWorkingFolder
:
string
=
path
.
join
(
this
.
CONTAINER_MOUNT_PATH
,
'
nni
'
,
getExperimentId
(),
trialJobId
);
const
kubeflowJobName
=
`nni-exp-
${
this
.
experimentId
}
-trial-
${
trialJobId
}
`
.
toLowerCase
();
const
kubeflowJobName
:
string
=
`nni-exp-
${
this
.
experimentId
}
-trial-
${
trialJobId
}
`
.
toLowerCase
();
const
curTrialSequenceId
:
number
=
this
.
generateSequenceId
();
const
trialLocalTempFolder
:
string
=
path
.
join
(
getExperimentRootDir
(),
'
trials-local
'
,
trialJobId
);
//prepare the runscript
...
...
@@ -125,6 +124,72 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
return
Promise
.
resolve
(
trialJobDetail
);
}
// tslint:disable:no-redundant-jsdoc
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
switch
(
key
)
{
case
TrialConfigMetadataKey
.
NNI_MANAGER_IP
:
this
.
nniManagerIpConfig
=
<
NNIManagerIpConfig
>
JSON
.
parse
(
value
);
break
;
case
TrialConfigMetadataKey
.
KUBEFLOW_CLUSTER_CONFIG
:
const
kubeflowClusterJsonObject
:
object
=
JSON
.
parse
(
value
);
this
.
kubeflowClusterConfig
=
KubeflowClusterConfigFactory
.
generateKubeflowClusterConfig
(
kubeflowClusterJsonObject
);
if
(
this
.
kubeflowClusterConfig
.
storageType
===
'
azureStorage
'
)
{
const
azureKubeflowClusterConfig
:
KubeflowClusterConfigAzure
=
<
KubeflowClusterConfigAzure
>
this
.
kubeflowClusterConfig
;
this
.
azureStorageAccountName
=
azureKubeflowClusterConfig
.
azureStorage
.
accountName
;
this
.
azureStorageShare
=
azureKubeflowClusterConfig
.
azureStorage
.
azureShare
;
await
this
.
createAzureStorage
(
azureKubeflowClusterConfig
.
keyVault
.
vaultName
,
azureKubeflowClusterConfig
.
keyVault
.
name
,
azureKubeflowClusterConfig
.
azureStorage
.
accountName
,
azureKubeflowClusterConfig
.
azureStorage
.
azureShare
);
}
else
if
(
this
.
kubeflowClusterConfig
.
storageType
===
'
nfs
'
)
{
const
nfsKubeflowClusterConfig
:
KubeflowClusterConfigNFS
=
<
KubeflowClusterConfigNFS
>
this
.
kubeflowClusterConfig
;
await
this
.
createNFSStorage
(
nfsKubeflowClusterConfig
.
nfs
.
server
,
nfsKubeflowClusterConfig
.
nfs
.
path
);
}
this
.
kubernetesCRDClient
=
KubeflowOperatorClient
.
generateOperatorClient
(
this
.
kubeflowClusterConfig
.
operator
,
this
.
kubeflowClusterConfig
.
apiVersion
);
break
;
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
if
(
this
.
kubeflowClusterConfig
===
undefined
)
{
this
.
log
.
error
(
'
kubeflow cluster config is not initialized
'
);
return
Promise
.
reject
(
new
Error
(
'
kubeflow cluster config is not initialized
'
));
}
assert
(
this
.
kubeflowClusterConfig
!==
undefined
);
const
kubeflowTrialJsonObjsect
:
object
=
JSON
.
parse
(
value
);
this
.
kubeflowTrialConfig
=
KubeflowTrialConfigFactory
.
generateKubeflowTrialConfig
(
kubeflowTrialJsonObjsect
,
this
.
kubeflowClusterConfig
.
operator
);
// Validate to make sure codeDir doesn't have too many files
try
{
await
validateCodeDir
(
this
.
kubeflowTrialConfig
.
codeDir
);
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
return
Promise
.
reject
(
new
Error
(
error
));
}
break
;
case
TrialConfigMetadataKey
.
VERSION_CHECK
:
this
.
versionCheck
=
(
value
===
'
true
'
||
value
===
'
True
'
);
break
;
case
TrialConfigMetadataKey
.
LOG_COLLECTION
:
this
.
logCollection
=
value
;
break
;
default
:
}
return
Promise
.
resolve
();
}
/**
* upload code files to nfs or azureStroage
* @param trialJobId
...
...
@@ -132,54 +197,61 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
* return: trialJobOutputUrl
*/
private
async
uploadCodeFiles
(
trialJobId
:
string
,
trialLocalTempFolder
:
string
):
Promise
<
string
>
{
if
(
!
this
.
kubeflowClusterConfig
)
{
if
(
this
.
kubeflowClusterConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
}
let
trialJobOutputUrl
:
string
=
''
;
assert
(
!
this
.
kubeflowClusterConfig
.
storage
assert
(
this
.
kubeflowClusterConfig
.
storage
===
undefined
||
this
.
kubeflowClusterConfig
.
storage
===
'
azureStorage
'
||
this
.
kubeflowClusterConfig
.
storage
===
'
nfs
'
);
if
(
this
.
kubeflowClusterConfig
.
storage
===
'
azureStorage
'
)
{
try
{
if
(
this
.
kubeflowClusterConfig
.
storage
===
'
azureStorage
'
)
{
if
(
this
.
azureStorageClient
===
undefined
)
{
throw
new
Error
(
'
azureStorageClient is not initialized
'
);
}
try
{
//upload local files to azure storage
await
AzureStorageClientUtility
.
uploadDirectory
(
this
.
azureStorageClient
,
`nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
,
this
.
azureStorageShare
,
`
${
trialLocalTempFolder
}
`
);
`nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
,
this
.
azureStorageShare
,
`
${
trialLocalTempFolder
}
`
);
trialJobOutputUrl
=
`https://
${
this
.
azureStorageAccountName
}
.file.core.windows.net/
${
this
.
azureStorageShare
}
/
${
path
.
join
(
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
}
catch
(
error
){
trialJobOutputUrl
=
`https://
${
this
.
azureStorageAccountName
}
.file.core.windows.net/
${
this
.
azureStorageShare
}
\
/
${
path
.
join
(
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
;
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
return
Promise
.
reject
(
error
);
}
}
else
if
(
this
.
kubeflowClusterConfig
.
storage
===
'
nfs
'
||
this
.
kubeflowClusterConfig
.
storage
===
undefined
)
{
le
t
nfsKubeflowClusterConfig
:
KubeflowClusterConfigNFS
=
<
KubeflowClusterConfigNFS
>
this
.
kubeflowClusterConfig
;
}
else
if
(
this
.
kubeflowClusterConfig
.
storage
===
'
nfs
'
||
this
.
kubeflowClusterConfig
.
storage
===
undefined
)
{
cons
t
nfsKubeflowClusterConfig
:
KubeflowClusterConfigNFS
=
<
KubeflowClusterConfigNFS
>
this
.
kubeflowClusterConfig
;
// Creat work dir for current trial in NFS directory
await
cpp
.
exec
(
`mkdir -p
${
this
.
trialLocalNFSTempFolder
}
/nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
);
// Copy code files from local dir to NFS mounted dir
await
cpp
.
exec
(
`cp -r
${
trialLocalTempFolder
}
/*
${
this
.
trialLocalNFSTempFolder
}
/nni/
${
getExperimentId
()}
/
${
trialJobId
}
/.`
);
const
nfsConfig
:
NFSConfig
=
nfsKubeflowClusterConfig
.
nfs
;
trialJobOutputUrl
=
`nfs://
${
nfsConfig
.
server
}
:
${
path
.
join
(
nfsConfig
.
path
,
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
trialJobOutputUrl
=
`nfs://
${
nfsConfig
.
server
}
:
${
path
.
join
(
nfsConfig
.
path
,
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
;
}
return
Promise
.
resolve
(
trialJobOutputUrl
);
}
private
async
prepareRunScript
(
trialLocalTempFolder
:
string
,
trialJobId
:
string
,
trialWorkingFolder
:
string
,
curTrialSequenceId
:
number
,
form
:
JobApplicationForm
):
Promise
<
void
>
{
if
(
!
this
.
kubeflowClusterConfig
)
{
private
async
prepareRunScript
(
trialLocalTempFolder
:
string
,
trialJobId
:
string
,
trialWorkingFolder
:
string
,
curTrialSequenceId
:
number
,
form
:
JobApplicationForm
):
Promise
<
void
>
{
if
(
this
.
kubeflowClusterConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
}
// initialize kubeflow trial config to specific type
let
kubeflowTrialConfig
;
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
let
kubeflowTrialConfig
:
any
;
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
kubeflowTrialConfig
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
}
else
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
pytorch-operator
'
){
}
else
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
pytorch-operator
'
)
{
kubeflowTrialConfig
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
}
else
{
throw
Error
(
`operator
${
this
.
kubeflowClusterConfig
.
operator
}
is invalid`
)
}
else
{
throw
Error
(
`operator
${
this
.
kubeflowClusterConfig
.
operator
}
is invalid`
)
;
}
//create tmp trial working folder locally.
...
...
@@ -192,149 +264,89 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
await
cpp
.
exec
(
`mkdir -p
${
trialLocalTempFolder
}
`
);
// Write worker file content run_worker.sh to local tmp folders
if
(
kubeflowTrialConfig
.
worker
)
{
if
(
kubeflowTrialConfig
.
worker
!==
undefined
)
{
const
workerRunScriptContent
:
string
=
await
this
.
generateRunScript
(
'
kubeflow
'
,
trialJobId
,
trialWorkingFolder
,
kubeflowTrialConfig
.
worker
.
command
,
curTrialSequenceId
.
toString
(),
'
worker
'
,
kubeflowTrialConfig
.
worker
.
gpuNum
);
kubeflowTrialConfig
.
worker
.
command
,
curTrialSequenceId
.
toString
(),
'
worker
'
,
kubeflowTrialConfig
.
worker
.
gpuNum
);
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
run_worker.sh
'
),
workerRunScriptContent
,
{
encoding
:
'
utf8
'
});
}
// Write parameter server file content run_ps.sh to local tmp folders
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
le
t
tensorflowTrialConfig
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
if
(
tensorflowTrialConfig
.
ps
)
{
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
cons
t
tensorflowTrialConfig
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
if
(
tensorflowTrialConfig
.
ps
!==
undefined
)
{
const
psRunScriptContent
:
string
=
await
this
.
generateRunScript
(
'
kubeflow
'
,
trialJobId
,
trialWorkingFolder
,
tensorflowTrialConfig
.
ps
.
command
,
curTrialSequenceId
.
toString
(),
'
ps
'
,
tensorflowTrialConfig
.
ps
.
gpuNum
);
tensorflowTrialConfig
.
ps
.
command
,
curTrialSequenceId
.
toString
(),
'
ps
'
,
tensorflowTrialConfig
.
ps
.
gpuNum
);
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
run_ps.sh
'
),
psRunScriptContent
,
{
encoding
:
'
utf8
'
});
}
}
else
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
pytorch-operator
'
)
{
let
pytorchTrialConfig
:
KubeflowTrialConfigPytorch
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
if
(
pytorchTrialConfig
.
master
){
}
else
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
pytorch-operator
'
)
{
const
pytorchTrialConfig
:
KubeflowTrialConfigPytorch
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
if
(
pytorchTrialConfig
.
master
!==
undefined
)
{
const
masterRunScriptContent
:
string
=
await
this
.
generateRunScript
(
'
kubeflow
'
,
trialJobId
,
trialWorkingFolder
,
pytorchTrialConfig
.
master
.
command
,
curTrialSequenceId
.
toString
(),
'
master
'
,
pytorchTrialConfig
.
master
.
gpuNum
);
pytorchTrialConfig
.
master
.
command
,
curTrialSequenceId
.
toString
(),
'
master
'
,
pytorchTrialConfig
.
master
.
gpuNum
);
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
run_master.sh
'
),
masterRunScriptContent
,
{
encoding
:
'
utf8
'
});
}
}
// Write file content ( parameter.cfg ) to local tmp folders
const
trialForm
:
TrialJobApplicationForm
=
(
<
TrialJobApplicationForm
>
form
)
if
(
trialForm
&&
trialForm
.
hyperParameters
)
{
const
trialForm
:
TrialJobApplicationForm
=
(
<
TrialJobApplicationForm
>
form
)
;
if
(
trialForm
!==
undefined
&&
trialForm
.
hyperParameters
!==
undefined
)
{
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
generateParamFileName
(
trialForm
.
hyperParameters
)),
trialForm
.
hyperParameters
.
value
,
{
encoding
:
'
utf8
'
});
}
}
private
async
prepareKubeflowConfig
(
trialJobId
:
string
,
trialWorkingFolder
:
string
,
kubeflowJobName
:
string
):
Promise
<
any
>
{
if
(
!
this
.
kubeflowClusterConfig
)
{
if
(
this
.
kubeflowClusterConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
}
if
(
!
this
.
kubeflowTrialConfig
)
{
if
(
this
.
kubeflowTrialConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow trial config is not initialized
'
);
}
// initialize kubeflow trial config to specific type
let
kubeflowTrialConfig
;
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
let
kubeflowTrialConfig
:
any
;
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
kubeflowTrialConfig
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
}
else
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
pytorch-operator
'
){
}
else
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
pytorch-operator
'
)
{
kubeflowTrialConfig
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
}
else
{
throw
Error
(
`operator
${
this
.
kubeflowClusterConfig
.
operator
}
is invalid`
)
}
else
{
throw
Error
(
`operator
${
this
.
kubeflowClusterConfig
.
operator
}
is invalid`
)
;
}
const
workerPodResources
:
any
=
{};
if
(
kubeflowTrialConfig
.
worker
)
{
if
(
kubeflowTrialConfig
.
worker
!==
undefined
)
{
workerPodResources
.
requests
=
this
.
generatePodResource
(
kubeflowTrialConfig
.
worker
.
memoryMB
,
kubeflowTrialConfig
.
worker
.
cpuNum
,
kubeflowTrialConfig
.
worker
.
gpuNum
)
kubeflowTrialConfig
.
worker
.
gpuNum
)
;
}
workerPodResources
.
limits
=
Object
.
assign
({},
workerPodResources
.
requests
)
;
workerPodResources
.
limits
=
{...
workerPodResources
.
requests
}
;
le
t
nonWorkerResources
:
any
=
{};
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
le
t
tensorflowTrialConfig
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
if
(
tensorflowTrialConfig
.
ps
)
{
cons
t
nonWorkerResources
:
any
=
{};
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
cons
t
tensorflowTrialConfig
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
if
(
tensorflowTrialConfig
.
ps
!==
undefined
)
{
nonWorkerResources
.
requests
=
this
.
generatePodResource
(
tensorflowTrialConfig
.
ps
.
memoryMB
,
tensorflowTrialConfig
.
ps
.
cpuNum
,
tensorflowTrialConfig
.
ps
.
gpuNum
)
nonWorkerResources
.
limits
=
Object
.
assign
({},
nonWorkerResources
.
requests
);
tensorflowTrialConfig
.
ps
.
gpuNum
)
;
nonWorkerResources
.
limits
=
{...
nonWorkerResources
.
requests
};
}
}
else
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
pytorch-operator
'
){
le
t
pyTorchTrialConfig
:
KubeflowTrialConfigPytorch
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
}
else
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
pytorch-operator
'
)
{
cons
t
pyTorchTrialConfig
:
KubeflowTrialConfigPytorch
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
nonWorkerResources
.
requests
=
this
.
generatePodResource
(
pyTorchTrialConfig
.
master
.
memoryMB
,
pyTorchTrialConfig
.
master
.
cpuNum
,
pyTorchTrialConfig
.
master
.
gpuNum
)
nonWorkerResources
.
limits
=
Object
.
assign
({},
nonWorkerResources
.
requests
);
pyTorchTrialConfig
.
master
.
gpuNum
);
nonWorkerResources
.
limits
=
{...
nonWorkerResources
.
requests
};
}
// Generate kubeflow job resource config object
const
kubeflowJobConfig
:
any
=
this
.
generateKubeflowJobConfig
(
trialJobId
,
trialWorkingFolder
,
kubeflowJobName
,
workerPodResources
,
nonWorkerResources
);
const
kubeflowJobConfig
:
any
=
this
.
generateKubeflowJobConfig
(
trialJobId
,
trialWorkingFolder
,
kubeflowJobName
,
workerPodResources
,
nonWorkerResources
);
return
Promise
.
resolve
(
kubeflowJobConfig
);
}
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
switch
(
key
)
{
case
TrialConfigMetadataKey
.
NNI_MANAGER_IP
:
this
.
nniManagerIpConfig
=
<
NNIManagerIpConfig
>
JSON
.
parse
(
value
);
break
;
case
TrialConfigMetadataKey
.
KUBEFLOW_CLUSTER_CONFIG
:
let
kubeflowClusterJsonObject
=
JSON
.
parse
(
value
);
this
.
kubeflowClusterConfig
=
KubeflowClusterConfigFactory
.
generateKubeflowClusterConfig
(
kubeflowClusterJsonObject
);
if
(
this
.
kubeflowClusterConfig
.
storageType
===
'
azureStorage
'
)
{
let
azureKubeflowClusterConfig
=
<
KubeflowClusterConfigAzure
>
this
.
kubeflowClusterConfig
;
this
.
azureStorageAccountName
=
azureKubeflowClusterConfig
.
azureStorage
.
accountName
;
this
.
azureStorageShare
=
azureKubeflowClusterConfig
.
azureStorage
.
azureShare
;
await
this
.
createAzureStorage
(
azureKubeflowClusterConfig
.
keyVault
.
vaultName
,
azureKubeflowClusterConfig
.
keyVault
.
name
,
azureKubeflowClusterConfig
.
azureStorage
.
accountName
,
azureKubeflowClusterConfig
.
azureStorage
.
azureShare
);
}
else
if
(
this
.
kubeflowClusterConfig
.
storageType
===
'
nfs
'
)
{
let
nfsKubeflowClusterConfig
=
<
KubeflowClusterConfigNFS
>
this
.
kubeflowClusterConfig
;
await
this
.
createNFSStorage
(
nfsKubeflowClusterConfig
.
nfs
.
server
,
nfsKubeflowClusterConfig
.
nfs
.
path
);
}
this
.
kubernetesCRDClient
=
KubeflowOperatorClient
.
generateOperatorClient
(
this
.
kubeflowClusterConfig
.
operator
,
this
.
kubeflowClusterConfig
.
apiVersion
);
break
;
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
if
(
!
this
.
kubeflowClusterConfig
){
this
.
log
.
error
(
'
kubeflow cluster config is not initialized
'
);
return
Promise
.
reject
(
new
Error
(
'
kubeflow cluster config is not initialized
'
));
}
assert
(
this
.
kubeflowClusterConfig
!==
undefined
)
let
kubeflowTrialJsonObjsect
=
JSON
.
parse
(
value
);
this
.
kubeflowTrialConfig
=
KubeflowTrialConfigFactory
.
generateKubeflowTrialConfig
(
kubeflowTrialJsonObjsect
,
this
.
kubeflowClusterConfig
.
operator
);
// Validate to make sure codeDir doesn't have too many files
try
{
await
validateCodeDir
(
this
.
kubeflowTrialConfig
.
codeDir
);
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
return
Promise
.
reject
(
new
Error
(
error
));
}
break
;
case
TrialConfigMetadataKey
.
VERSION_CHECK
:
this
.
versionCheck
=
(
value
===
'
true
'
||
value
===
'
True
'
);
break
;
case
TrialConfigMetadataKey
.
LOG_COLLECTION
:
this
.
logCollection
=
value
;
break
;
default
:
break
;
}
return
Promise
.
resolve
();
}
/**
* Generate kubeflow resource config file
* @param trialJobId trial job id
...
...
@@ -343,43 +355,42 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
* @param workerPodResources worker pod template
* @param nonWorkerPodResources non-worker pod template, like ps or master
*/
private
generateKubeflowJobConfig
(
trialJobId
:
string
,
trialWorkingFolder
:
string
,
kubeflowJobName
:
string
,
workerPodResources
:
any
,
nonWorkerPodResources
?:
any
)
:
any
{
if
(
!
this
.
kubeflowClusterConfig
)
{
private
generateKubeflowJobConfig
(
trialJobId
:
string
,
trialWorkingFolder
:
string
,
kubeflowJobName
:
string
,
workerPodResources
:
any
,
nonWorkerPodResources
?:
any
)
:
any
{
if
(
this
.
kubeflowClusterConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
}
if
(
!
this
.
kubeflowTrialConfig
)
{
if
(
this
.
kubeflowTrialConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow trial config is not initialized
'
);
}
if
(
!
this
.
kubernetesCRDClient
)
{
if
(
this
.
kubernetesCRDClient
===
undefined
)
{
throw
new
Error
(
'
Kubeflow operator client is not initialized
'
);
}
const
replicaSpecsObj
:
any
=
{};
le
t
replicaSpecsObjMap
=
new
Map
<
string
,
object
>
();
cons
t
replicaSpecsObjMap
:
Map
<
string
,
object
>
=
new
Map
<
string
,
object
>
();
if
(
this
.
kubeflowTrialConfig
.
operatorType
===
'
tf-operator
'
)
{
le
t
tensorflowTrialConfig
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
if
(
this
.
kubeflowTrialConfig
.
operatorType
===
'
tf-operator
'
)
{
cons
t
tensorflowTrialConfig
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
replicaSpecsObj
.
Worker
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
tensorflowTrialConfig
.
worker
.
replicas
,
tensorflowTrialConfig
.
worker
.
image
,
'
run_worker.sh
'
,
workerPodResources
);
if
(
tensorflowTrialConfig
.
ps
){
if
(
tensorflowTrialConfig
.
ps
!==
undefined
)
{
replicaSpecsObj
.
Ps
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
tensorflowTrialConfig
.
ps
.
replicas
,
tensorflowTrialConfig
.
ps
.
image
,
'
run_ps.sh
'
,
nonWorkerPodResources
);
}
replicaSpecsObjMap
.
set
(
this
.
kubernetesCRDClient
.
jobKind
,
{
'
tfReplicaSpecs
'
:
replicaSpecsObj
})
}
else
if
(
this
.
kubeflowTrialConfig
.
operatorType
===
'
pytorch-operator
'
)
{
let
pytorchTrialConfig
:
KubeflowTrialConfigPytorch
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
if
(
pytorchTrialConfig
.
worker
)
{
replicaSpecsObjMap
.
set
(
this
.
kubernetesCRDClient
.
jobKind
,
{
tfReplicaSpecs
:
replicaSpecsObj
});
}
else
if
(
this
.
kubeflowTrialConfig
.
operatorType
===
'
pytorch-operator
'
)
{
const
pytorchTrialConfig
:
KubeflowTrialConfigPytorch
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
if
(
pytorchTrialConfig
.
worker
!==
undefined
)
{
replicaSpecsObj
.
Worker
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
pytorchTrialConfig
.
worker
.
replicas
,
pytorchTrialConfig
.
worker
.
image
,
'
run_worker.sh
'
,
workerPodResources
);
}
replicaSpecsObj
.
Master
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
pytorchTrialConfig
.
master
.
replicas
,
pytorchTrialConfig
.
master
.
image
,
'
run_master.sh
'
,
nonWorkerPodResources
);
replicaSpecsObjMap
.
set
(
this
.
kubernetesCRDClient
.
jobKind
,
{
'
pytorchReplicaSpecs
'
:
replicaSpecsObj
})
replicaSpecsObjMap
.
set
(
this
.
kubernetesCRDClient
.
jobKind
,
{
pytorchReplicaSpecs
:
replicaSpecsObj
})
;
}
return
{
...
...
@@ -406,21 +417,22 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
* @param runScriptFile script file name
* @param podResources pod resource config section
*/
private
generateReplicaConfig
(
trialWorkingFolder
:
string
,
replicaNumber
:
number
,
replicaImage
:
string
,
runScriptFile
:
string
,
podResources
:
any
):
any
{
if
(
!
this
.
kubeflowClusterConfig
)
{
private
generateReplicaConfig
(
trialWorkingFolder
:
string
,
replicaNumber
:
number
,
replicaImage
:
string
,
runScriptFile
:
string
,
podResources
:
any
):
any
{
if
(
this
.
kubeflowClusterConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
}
if
(
!
this
.
kubeflowTrialConfig
)
{
if
(
this
.
kubeflowTrialConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow trial config is not initialized
'
);
}
if
(
!
this
.
kubernetesCRDClient
)
{
if
(
this
.
kubernetesCRDClient
===
undefined
)
{
throw
new
Error
(
'
Kubeflow operator client is not initialized
'
);
}
le
t
volumeSpecMap
=
new
Map
<
string
,
object
>
();
if
(
this
.
kubeflowClusterConfig
.
storageType
===
'
azureStorage
'
){
cons
t
volumeSpecMap
:
Map
<
string
,
object
>
=
new
Map
<
string
,
object
>
();
if
(
this
.
kubeflowClusterConfig
.
storageType
===
'
azureStorage
'
)
{
volumeSpecMap
.
set
(
'
nniVolumes
'
,
[
{
name
:
'
nni-vol
'
,
...
...
@@ -429,9 +441,9 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
shareName
:
`
${
this
.
azureStorageShare
}
`
,
readonly
:
false
}
}])
}
else
{
le
t
nfsKubeflowClusterConfig
:
KubeflowClusterConfigNFS
=
<
KubeflowClusterConfigNFS
>
this
.
kubeflowClusterConfig
;
}])
;
}
else
{
cons
t
nfsKubeflowClusterConfig
:
KubeflowClusterConfigNFS
=
<
KubeflowClusterConfigNFS
>
this
.
kubeflowClusterConfig
;
volumeSpecMap
.
set
(
'
nniVolumes
'
,
[
{
name
:
'
nni-vol
'
,
...
...
@@ -439,13 +451,14 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
server
:
`
${
nfsKubeflowClusterConfig
.
nfs
.
server
}
`
,
path
:
`
${
nfsKubeflowClusterConfig
.
nfs
.
path
}
`
}
}])
}])
;
}
return
{
replicas
:
replicaNumber
,
template
:
{
metadata
:
{
// tslint:disable-next-line:no-null-keyword
creationTimestamp
:
null
},
spec
:
{
...
...
@@ -455,7 +468,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
// TODO: change the name based on operator's type
name
:
this
.
kubernetesCRDClient
.
containerName
,
image
:
replicaImage
,
args
:
[
"
sh
"
,
`
${
path
.
join
(
trialWorkingFolder
,
runScriptFile
)}
`
],
args
:
[
'
sh
'
,
`
${
path
.
join
(
trialWorkingFolder
,
runScriptFile
)}
`
],
volumeMounts
:
[
{
name
:
'
nni-vol
'
,
...
...
@@ -470,5 +483,5 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
};
}
}
export
{
KubeflowTrainingService
}
// tslint:enable: no-unsafe-any no-any
export
{
KubeflowTrainingService
}
;
src/nni_manager/training_service/kubernetes/kubernetesApiClient.ts
View file @
ba8dccd6
...
...
@@ -19,44 +19,46 @@
'
use strict
'
;
import
*
as
os
from
'
os
'
import
*
as
path
from
'
path
'
;
import
{
Client1_10
,
config
}
from
'
kubernetes-client
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
var
K8SClient
=
require
(
'
kubernetes-client
'
).
Client
;
var
K8SConfig
=
require
(
'
kubernetes-client
'
).
config
;
/**
* Generict Kubernetes client, target version >= 1.9
*/
// tslint:disable: no-any no-unsafe-any
class
GeneralK8sClient
{
protected
readonly
client
:
any
;
protected
readonly
log
:
Logger
=
getLogger
();
constructor
()
{
this
.
client
=
new
K8S
Client
({
config
:
K8SC
onfig
.
fromKubeconfig
(),
version
:
'
1.9
'
});
this
.
client
=
new
Client
1_10
({
config
:
c
onfig
.
fromKubeconfig
(),
version
:
'
1.9
'
});
this
.
client
.
loadSpec
();
}
public
async
createSecret
(
secretManifest
:
any
):
Promise
<
boolean
>
{
let
result
:
Promise
<
boolean
>
;
const
response
:
any
=
await
this
.
client
.
api
.
v1
.
namespaces
(
'
default
'
).
secrets
.
post
({
body
:
secretManifest
});
if
(
response
.
statusCode
&&
(
response
.
statusCode
>=
200
&&
response
.
statusCode
<=
299
))
{
const
response
:
any
=
await
this
.
client
.
api
.
v1
.
namespaces
(
'
default
'
).
secrets
.
post
({
body
:
secretManifest
});
if
(
response
.
statusCode
&&
(
response
.
statusCode
>=
200
&&
response
.
statusCode
<=
299
))
{
result
=
Promise
.
resolve
(
true
);
}
else
{
result
=
Promise
.
reject
(
`Create secrets failed, statusCode is
${
response
.
statusCode
}
`
);
}
return
result
;
}
}
/**
* Kubernetes CRD client
*/
abstract
class
KubernetesCRDClient
{
protected
readonly
client
:
any
;
protected
readonly
log
:
Logger
=
getLogger
();
protected
crdSchema
:
any
;
constructor
()
{
this
.
client
=
new
K8S
Client
({
config
:
K8SC
onfig
.
fromKubeconfig
()
});
this
.
client
=
new
Client
1_10
({
config
:
c
onfig
.
fromKubeconfig
()
});
this
.
client
.
loadSpec
();
}
...
...
@@ -65,7 +67,7 @@ abstract class KubernetesCRDClient {
public
abstract
get
containerName
():
string
;
public
get
jobKind
():
string
{
if
(
this
.
crdSchema
if
(
this
.
crdSchema
&&
this
.
crdSchema
.
spec
&&
this
.
crdSchema
.
spec
.
names
&&
this
.
crdSchema
.
spec
.
names
.
kind
)
{
...
...
@@ -76,7 +78,7 @@ abstract class KubernetesCRDClient {
}
public
get
apiVersion
():
string
{
if
(
this
.
crdSchema
if
(
this
.
crdSchema
&&
this
.
crdSchema
.
spec
&&
this
.
crdSchema
.
spec
.
version
)
{
return
this
.
crdSchema
.
spec
.
version
;
...
...
@@ -88,43 +90,50 @@ abstract class KubernetesCRDClient {
public
async
createKubernetesJob
(
jobManifest
:
any
):
Promise
<
boolean
>
{
let
result
:
Promise
<
boolean
>
;
const
response
:
any
=
await
this
.
operator
.
post
({
body
:
jobManifest
});
if
(
response
.
statusCode
&&
(
response
.
statusCode
>=
200
&&
response
.
statusCode
<=
299
))
{
if
(
response
.
statusCode
&&
(
response
.
statusCode
>=
200
&&
response
.
statusCode
<=
299
))
{
result
=
Promise
.
resolve
(
true
);
}
else
{
result
=
Promise
.
reject
(
`Create kubernetes job failed, statusCode is
${
response
.
statusCode
}
`
);
}
return
result
;
}
//TODO : replace any
public
async
getKubernetesJob
(
kubeflowJobName
:
string
):
Promise
<
any
>
{
let
result
:
Promise
<
any
>
;
const
response
:
any
=
await
this
.
operator
(
kubeflowJobName
).
get
();
if
(
response
.
statusCode
&&
(
response
.
statusCode
>=
200
&&
response
.
statusCode
<=
299
))
{
const
response
:
any
=
await
this
.
operator
(
kubeflowJobName
)
.
get
();
if
(
response
.
statusCode
&&
(
response
.
statusCode
>=
200
&&
response
.
statusCode
<=
299
))
{
result
=
Promise
.
resolve
(
response
.
body
);
}
else
{
result
=
Promise
.
reject
(
`KubeflowOperatorClient get tfjobs failed, statusCode is
${
response
.
statusCode
}
`
);
}
return
result
;
}
public
async
deleteKubernetesJob
(
labels
:
Map
<
string
,
string
>
):
Promise
<
boolean
>
{
let
result
:
Promise
<
boolean
>
;
// construct match query from labels for deleting tfjob
const
matchQuery
:
string
=
Array
.
from
(
labels
.
keys
()).
map
(
labelKey
=>
`
${
labelKey
}
=
${
labels
.
get
(
labelKey
)}
`
).
join
(
'
,
'
);
const
matchQuery
:
string
=
Array
.
from
(
labels
.
keys
())
.
map
((
labelKey
:
string
)
=>
`
${
labelKey
}
=
${
labels
.
get
(
labelKey
)}
`
)
.
join
(
'
,
'
);
try
{
const
deleteResult
:
any
=
await
this
.
operator
().
delete
({
const
deleteResult
:
any
=
await
this
.
operator
()
.
delete
({
qs
:
{
labelSelector
:
matchQuery
,
propagationPolicy
:
"
Background
"
propagationPolicy
:
'
Background
'
}
});
if
(
deleteResult
.
statusCode
&&
deleteResult
.
statusCode
>=
200
&&
deleteResult
.
statusCode
<=
299
)
{
if
(
deleteResult
.
statusCode
&&
deleteResult
.
statusCode
>=
200
&&
deleteResult
.
statusCode
<=
299
)
{
result
=
Promise
.
resolve
(
true
);
}
else
{
result
=
Promise
.
reject
(
`KubeflowOperatorClient, delete labels
${
matchQuery
}
get wrong statusCode
${
deleteResult
.
statusCode
}
`
);
result
=
Promise
.
reject
(
`KubeflowOperatorClient, delete labels
${
matchQuery
}
get wrong statusCode
${
deleteResult
.
statusCode
}
`
);
}
}
catch
(
err
)
{
}
catch
(
err
)
{
result
=
Promise
.
reject
(
err
);
}
...
...
src/nni_manager/training_service/kubernetes/kubernetesConfig.ts
View file @
ba8dccd6
...
...
@@ -22,6 +22,7 @@
export
type
KubernetesStorageKind
=
'
nfs
'
|
'
azureStorage
'
;
import
{
MethodNotImplementedError
}
from
'
../../common/errors
'
;
// tslint:disable: completed-docs function-name
export
abstract
class
KubernetesClusterConfig
{
public
readonly
storage
?:
KubernetesStorageKind
;
public
readonly
apiVersion
:
string
;
...
...
@@ -31,7 +32,7 @@ export abstract class KubernetesClusterConfig {
this
.
apiVersion
=
apiVersion
;
}
public
get
storageType
():
KubernetesStorageKind
{
public
get
storageType
():
KubernetesStorageKind
{
throw
new
MethodNotImplementedError
();
}
}
...
...
@@ -56,12 +57,13 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig {
this
.
nfs
=
nfs
;
}
public
get
storageType
():
KubernetesStorageKind
{
public
get
storageType
():
KubernetesStorageKind
{
return
'
nfs
'
;
}
public
static
getInstance
(
jsonObject
:
object
):
KubernetesClusterConfigNFS
{
let
kubernetesClusterConfigObjectNFS
=
<
KubernetesClusterConfigNFS
>
jsonObject
;
const
kubernetesClusterConfigObjectNFS
:
KubernetesClusterConfigNFS
=
<
KubernetesClusterConfigNFS
>
jsonObject
;
return
new
KubernetesClusterConfigNFS
(
kubernetesClusterConfigObjectNFS
.
apiVersion
,
kubernetesClusterConfigObjectNFS
.
nfs
,
...
...
@@ -71,12 +73,12 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig {
}
export
class
KubernetesClusterConfigAzure
extends
KubernetesClusterConfig
{
public
readonly
keyVault
:
k
eyVaultConfig
;
public
readonly
keyVault
:
K
eyVaultConfig
;
public
readonly
azureStorage
:
AzureStorage
;
constructor
(
apiVersion
:
string
,
keyVault
:
k
eyVaultConfig
,
keyVault
:
K
eyVaultConfig
,
azureStorage
:
AzureStorage
,
storage
?:
KubernetesStorageKind
)
{
...
...
@@ -85,12 +87,13 @@ export class KubernetesClusterConfigAzure extends KubernetesClusterConfig {
this
.
azureStorage
=
azureStorage
;
}
public
get
storageType
():
KubernetesStorageKind
{
public
get
storageType
():
KubernetesStorageKind
{
return
'
azureStorage
'
;
}
public
static
getInstance
(
jsonObject
:
object
):
KubernetesClusterConfigAzure
{
let
kubernetesClusterConfigObjectAzure
=
<
KubernetesClusterConfigAzure
>
jsonObject
;
const
kubernetesClusterConfigObjectAzure
:
KubernetesClusterConfigAzure
=
<
KubernetesClusterConfigAzure
>
jsonObject
;
return
new
KubernetesClusterConfigAzure
(
kubernetesClusterConfigObjectAzure
.
apiVersion
,
kubernetesClusterConfigObjectAzure
.
keyVault
,
...
...
@@ -100,27 +103,30 @@ export class KubernetesClusterConfigAzure extends KubernetesClusterConfig {
}
}
// tslint:disable-next-line:no-unnecessary-class
export
class
KubernetesClusterConfigFactory
{
public
static
generateKubernetesClusterConfig
(
jsonObject
:
object
):
KubernetesClusterConfig
{
let
s
torageConfig
=
<
StorageConfig
>
jsonObject
;
switch
(
storageConfig
.
storage
)
{
const
storageConfig
:
S
torageConfig
=
<
StorageConfig
>
jsonObject
;
switch
(
storageConfig
.
storage
)
{
case
'
azureStorage
'
:
return
KubernetesClusterConfigAzure
.
getInstance
(
jsonObject
);
case
'
nfs
'
||
undefined
:
case
'
nfs
'
:
case
undefined
:
return
KubernetesClusterConfigNFS
.
getInstance
(
jsonObject
);
}
default
:
throw
new
Error
(
`Invalid json object
${
jsonObject
}
`
);
}
}
}
/**
* NFS configuration to store Kubeflow job related files
*/
export
class
NFSConfig
{
/
**
IP Adress of NFS server
*/
/
/
IP Adress of NFS server
public
readonly
server
:
string
;
/
**
exported NFS path on NFS server
*/
/
/
exported NFS path on NFS server
public
readonly
path
:
string
;
constructor
(
server
:
string
,
path
:
string
)
{
...
...
@@ -133,13 +139,13 @@ export class NFSConfig {
* KeyVault configuration to store the key of Azure Storage Service
* Refer https://docs.microsoft.com/en-us/azure/key-vault/key-vault-manage-with-cli2
*/
export
class
k
eyVaultConfig
{
/
**
The vault-name to specify vault
*/
export
class
K
eyVaultConfig
{
/
/
The vault-name to specify vault
public
readonly
vaultName
:
string
;
/
**
The name to specify private key
*/
/
/
The name to specify private key
public
readonly
name
:
string
;
constructor
(
vaultName
:
string
,
name
:
string
){
constructor
(
vaultName
:
string
,
name
:
string
)
{
this
.
vaultName
=
vaultName
;
this
.
name
=
name
;
}
...
...
@@ -149,12 +155,12 @@ export class keyVaultConfig {
* Azure Storage Service
*/
export
class
AzureStorage
{
/
**
The azure share to storage files
*/
/
/
The azure share to storage files
public
readonly
azureShare
:
string
;
/
**
The account name of sotrage service
*/
/
/
The account name of sotrage service
public
readonly
accountName
:
string
;
constructor
(
azureShare
:
string
,
accountName
:
string
){
constructor
(
azureShare
:
string
,
accountName
:
string
)
{
this
.
azureShare
=
azureShare
;
this
.
accountName
=
accountName
;
}
...
...
@@ -164,19 +170,19 @@ export class AzureStorage {
* Trial job configuration for Kubernetes
*/
export
class
KubernetesTrialConfigTemplate
{
/
**
CPU number
*/
/
/
CPU number
public
readonly
cpuNum
:
number
;
/
**
Memory
*/
/
/
Memory
public
readonly
memoryMB
:
number
;
/
**
Docker image
*/
/
/
Docker image
public
readonly
image
:
string
;
/
**
Trail command
*/
/
/
Trail command
public
readonly
command
:
string
;
/
**
Required GPU number for trial job. The number should be in [0,100]
*/
/
/
Required GPU number for trial job. The number should be in [0,100]
public
readonly
gpuNum
:
number
;
constructor
(
command
:
string
,
gpuNum
:
number
,
...
...
src/nni_manager/training_service/kubernetes/kubernetesData.ts
View file @
ba8dccd6
...
...
@@ -24,7 +24,6 @@ import { JobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../commo
/**
* KubeflowTrialJobDetail
*/
// tslint:disable-next-line:max-classes-per-file
export
class
KubernetesTrialJobDetail
implements
TrialJobDetail
{
public
id
:
string
;
public
status
:
TrialJobStatus
;
...
...
@@ -55,7 +54,7 @@ export class KubernetesTrialJobDetail implements TrialJobDetail {
}
}
export
const
K
ubernetesScriptFormat
=
export
const
k
ubernetesScriptFormat
:
string
=
`#!/bin/bash
export NNI_PLATFORM={0}
export NNI_SYS_DIR=$PWD/nni/{1}
...
...
@@ -71,5 +70,5 @@ mkdir -p $NNI_OUTPUT_DIR
cp -rT $NNI_CODE_DIR $NNI_SYS_DIR
cd $NNI_SYS_DIR
sh install_nni.sh
python3 -m nni_trial_tool.trial_keeper --trial_command '{8}' --nnimanager_ip {9} --nnimanager_port {10}
--nni_manager_version '{11}' --log_collection '{12}'`
+
`
1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr`
python3 -m nni_trial_tool.trial_keeper --trial_command '{8}' --nnimanager_ip {9} --nnimanager_port {10}
\
--nni_manager_version '{11}' --log_collection '{12}'
1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr`
;
src/nni_manager/training_service/kubernetes/kubernetesJobInfoCollector.ts
View file @
ba8dccd6
...
...
@@ -20,11 +20,10 @@
'
use strict
'
;
import
*
as
assert
from
'
assert
'
;
import
{
MethodNotImplementedError
,
NNIError
,
NNIErrorNames
}
from
'
../../common/errors
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
NNIError
,
NNIErrorNames
}
from
'
../../common/errors
'
;
import
{
TrialJobStatus
}
from
'
../../common/trainingService
'
;
import
{
KubernetesCRDClient
}
from
'
./kubernetesApiClient
'
;
import
{
MethodNotImplementedError
}
from
'
../../common/errors
'
;
import
{
KubernetesTrialJobDetail
}
from
'
./kubernetesData
'
;
/**
...
...
@@ -43,15 +42,15 @@ export class KubernetesJobInfoCollector {
public
async
retrieveTrialStatus
(
kubernetesCRDClient
:
KubernetesCRDClient
|
undefined
)
:
Promise
<
void
>
{
assert
(
kubernetesCRDClient
!==
undefined
);
const
updateKubernetesTrialJobs
:
Promise
<
void
>
[]
=
[];
for
(
le
t
[
trialJobId
,
kubernetesTrialJob
]
of
this
.
trialJobsMap
)
{
if
(
!
kubernetesTrialJob
)
{
for
(
cons
t
[
trialJobId
,
kubernetesTrialJob
]
of
this
.
trialJobsMap
)
{
if
(
kubernetesTrialJob
===
undefined
)
{
throw
new
NNIError
(
NNIErrorNames
.
NOT_FOUND
,
`trial job id
${
trialJobId
}
not found`
);
}
// Since Kubeflow needs some delay to schedule jobs, we provide 20 seconds buffer time to check kubeflow job's status
if
(
Date
.
now
()
-
kubernetesTrialJob
.
submitTime
<
20
*
1000
)
{
if
(
Date
.
now
()
-
kubernetesTrialJob
.
submitTime
<
20
*
1000
)
{
return
Promise
.
resolve
();
}
updateKubernetesTrialJobs
.
push
(
this
.
retrieveSingleTrialJobInfo
(
kubernetesCRDClient
,
kubernetesTrialJob
))
updateKubernetesTrialJobs
.
push
(
this
.
retrieveSingleTrialJobInfo
(
kubernetesCRDClient
,
kubernetesTrialJob
))
;
}
await
Promise
.
all
(
updateKubernetesTrialJobs
);
...
...
src/nni_manager/training_service/kubernetes/kubernetesJobRestServer.ts
View file @
ba8dccd6
...
...
@@ -19,19 +19,19 @@
'
use strict
'
;
import
*
as
component
from
'
../../common/component
'
;
import
{
Inject
}
from
'
typescript-ioc
'
;
import
*
as
component
from
'
../../common/component
'
;
import
{
ClusterJobRestServer
}
from
'
../common/clusterJobRestServer
'
;
import
{
KubernetesTrainingService
}
from
'
./kubernetesTrainingService
'
;
import
{
ClusterJobRestServer
}
from
'
../common/clusterJobRestServer
'
/**
* Kubeflow Training service Rest server, provides rest API to support kubeflow job metrics update
*
*/
@
component
.
Singleton
export
class
KubernetesJobRestServer
extends
ClusterJobRestServer
{
export
class
KubernetesJobRestServer
extends
ClusterJobRestServer
{
@
Inject
private
kubernetesTrainingService
?
:
KubernetesTrainingService
;
private
readonly
kubernetesTrainingService
?
:
KubernetesTrainingService
;
/**
* constructor to provide NNIRestServer's own rest property, e.g. port
...
...
@@ -41,8 +41,9 @@ export class KubernetesJobRestServer extends ClusterJobRestServer{
this
.
kubernetesTrainingService
=
kubernetesTrainingService
;
}
// tslint:disable-next-line:no-any
protected
handleTrialMetrics
(
jobId
:
string
,
metrics
:
any
[])
:
void
{
if
(
!
this
.
kubernetesTrainingService
)
{
if
(
this
.
kubernetesTrainingService
===
undefined
)
{
throw
Error
(
'
kubernetesTrainingService not initialized!
'
);
}
// Split metrics array into single metric, then emit
...
...
Prev
1
2
3
4
5
6
7
8
9
10
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment