Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
ba8dccd6
Commit
ba8dccd6
authored
Jun 23, 2019
by
suiguoxin
Browse files
Merge branch 'master' of
https://github.com/microsoft/nni
parents
56a1575b
150ee83a
Changes
208
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
870 additions
and
726 deletions
+870
-726
src/nni_manager/training_service/common/gpuData.ts
src/nni_manager/training_service/common/gpuData.ts
+4
-4
src/nni_manager/training_service/common/jobMetrics.ts
src/nni_manager/training_service/common/jobMetrics.ts
+4
-1
src/nni_manager/training_service/common/trialConfig.ts
src/nni_manager/training_service/common/trialConfig.ts
+4
-4
src/nni_manager/training_service/common/util.ts
src/nni_manager/training_service/common/util.ts
+57
-40
src/nni_manager/training_service/kubernetes/azureStorageClientUtils.ts
...er/training_service/kubernetes/azureStorageClientUtils.ts
+109
-79
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerApiClient.ts
...netes/frameworkcontroller/frameworkcontrollerApiClient.ts
+15
-6
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerConfig.ts
...bernetes/frameworkcontroller/frameworkcontrollerConfig.ts
+30
-25
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerJobInfoCollector.ts
...rameworkcontroller/frameworkcontrollerJobInfoCollector.ts
+26
-18
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerJobRestServer.ts
...s/frameworkcontroller/frameworkcontrollerJobRestServer.ts
+5
-5
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts
...frameworkcontroller/frameworkcontrollerTrainingService.ts
+191
-171
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowApiClient.ts
...training_service/kubernetes/kubeflow/kubeflowApiClient.ts
+30
-21
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowConfig.ts
...er/training_service/kubernetes/kubeflow/kubeflowConfig.ts
+41
-28
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowJobInfoCollector.ts
...g_service/kubernetes/kubeflow/kubeflowJobInfoCollector.ts
+21
-18
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowJobRestServer.ts
...ning_service/kubernetes/kubeflow/kubeflowJobRestServer.ts
+5
-5
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts
...ng_service/kubernetes/kubeflow/kubeflowTrainingService.ts
+232
-219
src/nni_manager/training_service/kubernetes/kubernetesApiClient.ts
...anager/training_service/kubernetes/kubernetesApiClient.ts
+34
-25
src/nni_manager/training_service/kubernetes/kubernetesConfig.ts
...i_manager/training_service/kubernetes/kubernetesConfig.ts
+41
-35
src/nni_manager/training_service/kubernetes/kubernetesData.ts
...nni_manager/training_service/kubernetes/kubernetesData.ts
+4
-5
src/nni_manager/training_service/kubernetes/kubernetesJobInfoCollector.ts
...training_service/kubernetes/kubernetesJobInfoCollector.ts
+8
-9
src/nni_manager/training_service/kubernetes/kubernetesJobRestServer.ts
...er/training_service/kubernetes/kubernetesJobRestServer.ts
+9
-8
No files found.
src/nni_manager/training_service/common/gpuData.ts
View file @
ba8dccd6
...
...
@@ -59,17 +59,17 @@ export class GPUSummary {
}
}
export
const
GPU_INFO_COLLECTOR_FORMAT_LINUX
:
string
=
export
const
GPU_INFO_COLLECTOR_FORMAT_LINUX
:
string
=
`
#!/bin/bash
export METRIC_OUTPUT_DIR={0}
echo $$ >{1}
python3 -m nni_gpu_tool.gpu_metrics_collector
`
`
;
export
const
GPU_INFO_COLLECTOR_FORMAT_WINDOWS
:
string
=
export
const
GPU_INFO_COLLECTOR_FORMAT_WINDOWS
:
string
=
`
$env:METRIC_OUTPUT_DIR="{0}"
$app = Start-Process "python" -ArgumentList "-m nni_gpu_tool.gpu_metrics_collector" -passthru -NoNewWindow
Write $app.ID | Out-File {1} -NoNewline -encoding utf8
`
\ No newline at end of file
`
;
src/nni_manager/training_service/common/jobMetrics.ts
View file @
ba8dccd6
...
...
@@ -21,7 +21,10 @@
import
{
TrialJobStatus
}
from
'
../../common/trainingService
'
;
// tslint:disable-next-line:max-classes-per-file
/**
* Trial job metrics class
* Representing trial job metrics properties
*/
export
class
JobMetrics
{
public
readonly
jobId
:
string
;
public
readonly
metrics
:
string
[];
...
...
src/nni_manager/training_service/common/trialConfig.ts
View file @
ba8dccd6
...
...
@@ -24,13 +24,13 @@
* Representing trial job configurable properties
*/
export
class
TrialConfig
{
/
**
Trail command
*/
/
/
Trail command
public
readonly
command
:
string
;
/
**
Code directory
*/
/
/
Code directory
public
readonly
codeDir
:
string
;
/
**
Required GPU number for trial job. The number should be in [0,100]
*/
/
/
Required GPU number for trial job. The number should be in [0,100]
public
readonly
gpuNum
:
number
;
/**
...
...
@@ -44,4 +44,4 @@ export class TrialConfig {
this
.
codeDir
=
codeDir
;
this
.
gpuNum
=
gpuNum
;
}
}
\ No newline at end of file
}
src/nni_manager/training_service/common/util.ts
View file @
ba8dccd6
import
{
getLogger
}
from
"
common/log
"
;
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
...
...
@@ -21,44 +19,55 @@ import { getLogger } from "common/log";
'
use strict
'
;
import
{
countFilesRecursively
}
from
'
../../common/utils
'
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
cp
from
'
child_process
'
;
import
*
as
os
from
'
os
'
;
import
*
as
fs
from
'
fs
'
;
import
{
getNewLine
}
from
'
../../common/utils
'
;
import
{
GPU_INFO_COLLECTOR_FORMAT_LINUX
,
GPU_INFO_COLLECTOR_FORMAT_WINDOWS
}
from
'
./gpuData
'
;
import
*
as
os
from
'
os
'
;
import
*
as
path
from
'
path
'
;
import
{
String
}
from
'
typescript-string-operations
'
;
import
{
file
}
from
"
../../node_modules/@types/tmp
"
;
import
{
countFilesRecursively
,
getNewLine
,
validateFileNameRecursively
}
from
'
../../common/utils
'
;
import
{
file
}
from
'
../../node_modules/@types/tmp
'
;
import
{
GPU_INFO_COLLECTOR_FORMAT_LINUX
,
GPU_INFO_COLLECTOR_FORMAT_WINDOWS
}
from
'
./gpuData
'
;
/**
* Validate codeDir, calculate file count recursively under codeDir, and throw error if any rule is broken
*
*
* @param codeDir codeDir in nni config file
* @returns file number under codeDir
*/
// tslint:disable: no-redundant-jsdoc
export
async
function
validateCodeDir
(
codeDir
:
string
)
:
Promise
<
number
>
{
let
fileCount
:
number
|
undefined
;
let
fileNameValid
:
boolean
=
true
;
try
{
fileCount
=
await
countFilesRecursively
(
codeDir
);
}
catch
(
error
)
{
}
catch
(
error
)
{
throw
new
Error
(
`Call count file error:
${
error
}
`
);
}
try
{
fileNameValid
=
await
validateFileNameRecursively
(
codeDir
);
}
catch
(
error
)
{
throw
new
Error
(
`Validate file name error:
${
error
}
`
);
}
if
(
fileCount
&&
fileCount
>
1000
)
{
const
errMessage
:
string
=
`Too many files(
${
fileCount
}
found}) in
${
codeDir
}
,`
if
(
fileCount
!==
undefined
&&
fileCount
>
1000
)
{
const
errMessage
:
string
=
`Too many files(
${
fileCount
}
found}) in
${
codeDir
}
,`
+
` please check if it's a valid code dir`
;
throw
new
Error
(
errMessage
);
throw
new
Error
(
errMessage
);
}
if
(
!
fileNameValid
)
{
const
errMessage
:
string
=
`File name in
${
codeDir
}
is not valid, please check file names, only support digit number、alphabet and (.-_) in file name.`
;
throw
new
Error
(
errMessage
);
}
return
fileCount
;
}
/**
* crete a new directory
* @param directory
* @param directory
*/
export
async
function
execMkdir
(
directory
:
string
):
Promise
<
void
>
{
if
(
process
.
platform
===
'
win32
'
)
{
...
...
@@ -66,6 +75,7 @@ export async function execMkdir(directory: string): Promise<void> {
}
else
{
await
cpp
.
exec
(
`mkdir -p
${
directory
}
`
);
}
return
Promise
.
resolve
();
}
...
...
@@ -80,12 +90,13 @@ export async function execCopydir(source: string, destination: string): Promise<
}
else
{
await
cpp
.
exec
(
`cp -r
${
source
}
${
destination
}
`
);
}
return
Promise
.
resolve
();
}
/**
* crete a new file
* @param filename
* @param filename
*/
export
async
function
execNewFile
(
filename
:
string
):
Promise
<
void
>
{
if
(
process
.
platform
===
'
win32
'
)
{
...
...
@@ -93,16 +104,17 @@ export async function execNewFile(filename: string): Promise<void> {
}
else
{
await
cpp
.
exec
(
`touch
${
filename
}
`
);
}
return
Promise
.
resolve
();
}
/**
* run script
* run script
using powershell or bash
* @param filePath
*/
export
function
exec
Script
(
filePath
:
string
):
cp
.
ChildProcess
{
export
function
run
Script
(
filePath
:
string
):
cp
.
ChildProcess
{
if
(
process
.
platform
===
'
win32
'
)
{
return
cp
.
exec
(
`powershell.exe -file
${
filePath
}
`
);
return
cp
.
exec
(
`powershell.exe
-ExecutionPolicy Bypass
-file
${
filePath
}
`
);
}
else
{
return
cp
.
exec
(
`bash
${
filePath
}
`
);
}
...
...
@@ -110,7 +122,7 @@ export function execScript(filePath: string): cp.ChildProcess {
/**
* output the last line of a file
* @param filePath
* @param filePath
*/
export
async
function
execTail
(
filePath
:
string
):
Promise
<
cpp
.
childProcessPromise
.
Result
>
{
let
cmdresult
:
cpp
.
childProcessPromise
.
Result
;
...
...
@@ -119,12 +131,13 @@ export async function execTail(filePath: string): Promise<cpp.childProcessPromis
}
else
{
cmdresult
=
await
cpp
.
exec
(
`tail -n 1
${
filePath
}
`
);
}
return
Promise
.
resolve
(
cmdresult
);
}
/**
* delete a directory
* @param directory
* @param directory
*/
export
async
function
execRemove
(
directory
:
string
):
Promise
<
void
>
{
if
(
process
.
platform
===
'
win32
'
)
{
...
...
@@ -132,12 +145,13 @@ export async function execRemove(directory: string): Promise<void> {
}
else
{
await
cpp
.
exec
(
`rm -rf
${
directory
}
`
);
}
return
Promise
.
resolve
();
}
/**
* kill a process
* @param directory
* @param directory
*/
export
async
function
execKill
(
pid
:
string
):
Promise
<
void
>
{
if
(
process
.
platform
===
'
win32
'
)
{
...
...
@@ -145,37 +159,39 @@ export async function execKill(pid: string): Promise<void> {
}
else
{
await
cpp
.
exec
(
`pkill -P
${
pid
}
`
);
}
return
Promise
.
resolve
();
}
/**
*
set
environment variable
*
get command of setting
environment variable
* @param variable
* @returns command string
* @returns command string
*/
export
function
setEnvironmentVariable
(
variable
:
{
key
:
string
;
value
:
string
}):
string
{
if
(
process
.
platform
===
'
win32
'
)
{
return
`$env:
${
variable
.
key
}
="
${
variable
.
value
}
"`
;
}
else
{
}
else
{
return
`export
${
variable
.
key
}
=
${
variable
.
value
}
`
;
}
}
/**
* Compress files in directory to tar file
* @param source
_p
ath
* @param tar
_p
ath
* @param source
P
ath
* @param tar
P
ath
*/
export
async
function
tarAdd
(
tar
_p
ath
:
string
,
source
_p
ath
:
string
):
Promise
<
void
>
{
export
async
function
tarAdd
(
tar
P
ath
:
string
,
source
P
ath
:
string
):
Promise
<
void
>
{
if
(
process
.
platform
===
'
win32
'
)
{
tar_path
=
tar_path
.
split
(
'
\\
'
).
join
(
'
\\\\
'
);
source_path
=
source_path
.
split
(
'
\\
'
).
join
(
'
\\\\
'
);
let
script
:
string
[]
=
[];
const
tarFilePath
:
string
=
tarPath
.
split
(
'
\\
'
)
.
join
(
'
\\\\
'
);
const
sourceFilePath
:
string
=
sourcePath
.
split
(
'
\\
'
)
.
join
(
'
\\\\
'
);
const
script
:
string
[]
=
[];
script
.
push
(
`import os`
,
`import tarfile`
,
String
.
Format
(
`tar = tarfile.open("{0}","w:gz")\r\nfor root,dir,files in os.walk("{1}"):`
,
tar
_p
ath
,
source
_p
ath
),
String
.
Format
(
`tar = tarfile.open("{0}","w:gz")\r\nfor root,dir,files in os.walk("{1}"):`
,
tar
FileP
ath
,
source
FileP
ath
),
` for file in files:`
,
` fullpath = os.path.join(root,file)`
,
` tar.add(fullpath, arcname=file)`
,
...
...
@@ -184,39 +200,40 @@ export async function tarAdd(tar_path: string, source_path: string): Promise<voi
const
tarScript
:
string
=
path
.
join
(
os
.
tmpdir
(),
'
tar.py
'
);
await
cpp
.
exec
(
`python
${
tarScript
}
`
);
}
else
{
await
cpp
.
exec
(
`tar -czf
${
tar
_p
ath
}
-C
${
source
_p
ath
}
.`
);
await
cpp
.
exec
(
`tar -czf
${
tar
P
ath
}
-C
${
source
P
ath
}
.`
);
}
return
Promise
.
resolve
();
}
/**
* generate script file name
* @param fileNamePrefix
* @param fileNamePrefix
*/
export
function
getScriptName
(
fileNamePrefix
:
string
):
string
{
if
(
process
.
platform
===
'
win32
'
)
{
return
fileNamePrefix
+
'
.ps1
'
;
return
String
.
Format
(
'
{0}.ps1
'
,
fileNamePrefix
)
;
}
else
{
return
fileNamePrefix
+
'
.sh
'
;
return
String
.
Format
(
'
{0}.sh
'
,
fileNamePrefix
)
;
}
}
/**
* generate script file
* @param gpuMetricCollectorScriptFolder
* @param gpuMetricCollectorScriptFolder
*/
export
function
getgpuMetricsCollectorScriptContent
(
gpuMetricCollectorScriptFolder
:
string
):
string
{
if
(
process
.
platform
===
'
win32
'
)
{
if
(
process
.
platform
===
'
win32
'
)
{
return
String
.
Format
(
GPU_INFO_COLLECTOR_FORMAT_WINDOWS
,
gpuMetricCollectorScriptFolder
,
path
.
join
(
gpuMetricCollectorScriptFolder
,
'
pid
'
)
,
path
.
join
(
gpuMetricCollectorScriptFolder
,
'
pid
'
)
);
}
else
{
return
String
.
Format
(
GPU_INFO_COLLECTOR_FORMAT_LINUX
,
gpuMetricCollectorScriptFolder
,
path
.
join
(
gpuMetricCollectorScriptFolder
,
'
pid
'
)
,
path
.
join
(
gpuMetricCollectorScriptFolder
,
'
pid
'
)
);
}
}
src/nni_manager/training_service/kubernetes/azureStorageClientUtils.ts
View file @
ba8dccd6
...
...
@@ -19,108 +19,126 @@
'
use strict
'
;
import
*
as
fs
from
'
fs
'
import
*
as
azureStorage
from
'
azure-storage
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
path
from
'
path
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
String
}
from
'
typescript-string-operations
'
;
import
{
getLogger
}
from
'
../../common/log
'
;
import
{
mkDirP
}
from
'
../../common/utils
'
;
// tslint:disable: no-redundant-jsdoc no-any no-unsafe-any
export
namespace
AzureStorageClientUtility
{
/**
* create azure share
* @param fileServerClient
* @param azureShare
* @param fileServerClient
* @param azureShare
*/
export
async
function
createShare
(
fileServerClient
:
any
,
azureShare
:
any
):
Promise
<
void
>
{
export
async
function
createShare
(
fileServerClient
:
any
,
azureShare
:
any
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
fileServerClient
.
createShareIfNotExists
(
azureShare
,
function
(
error
:
any
,
result
:
any
,
response
:
any
)
{
if
(
error
){
getLogger
().
error
(
`Create share failed:,
${
error
}
`
);
deferred
.
reject
(
error
)
}
else
{
deferred
.
resolve
()
fileServerClient
.
createShareIfNotExists
(
azureShare
,
(
error
:
any
,
result
:
any
,
response
:
any
)
=>
{
if
(
error
)
{
getLogger
()
.
error
(
`Create share failed:,
${
error
}
`
);
deferred
.
reject
(
error
);
}
else
{
deferred
.
resolve
();
}
})
});
return
deferred
.
promise
;
}
/**
* Create a new directory (NOT recursively) in azure file storage.
* @param fileServerClient
* @param azureFoler
* @param azureShare
* @param fileServerClient
* @param azureFoler
* @param azureShare
*/
export
async
function
createDirectory
(
fileServerClient
:
a
ny
,
azureFoler
:
any
,
azureShare
:
any
):
Promise
<
void
>
{
export
async
function
createDirectory
(
fileServerClient
:
a
zureStorage
.
FileService
,
azureFoler
:
any
,
azureShare
:
any
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
fileServerClient
.
createDirectoryIfNotExists
(
azureShare
,
azureFoler
,
function
(
error
:
any
,
result
:
any
,
response
:
any
)
{
if
(
error
){
getLogger
().
error
(
`Create directory failed:,
${
error
}
`
);
fileServerClient
.
createDirectoryIfNotExists
(
azureShare
,
azureFoler
,
(
error
:
any
,
result
:
any
,
response
:
any
)
=>
{
if
(
error
)
{
getLogger
()
.
error
(
`Create directory failed:,
${
error
}
`
);
deferred
.
reject
(
error
);
}
else
{
}
else
{
deferred
.
resolve
();
}
})
});
return
deferred
.
promise
;
}
/**
* Create a new directory recursively in azure file storage
* @param fileServerClient
* @param azureDirectory
* @param azureDirectory
*/
export
async
function
createDirectoryRecursive
(
fileServerClient
:
any
,
azureDirectory
:
any
,
azureShare
:
any
):
Promise
<
void
>
{
export
async
function
createDirectoryRecursive
(
fileServerClient
:
azureStorage
.
FileService
,
azureDirectory
:
string
,
azureShare
:
any
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
le
t
directories
=
azureDirectory
.
split
(
"
/
"
);
let
rootDirectory
=
""
for
(
le
t
directory
of
directories
){
cons
t
directories
:
string
[]
=
azureDirectory
.
split
(
'
/
'
);
let
rootDirectory
:
string
=
''
;
for
(
cons
t
directory
of
directories
)
{
rootDirectory
+=
directory
;
await
createDirectory
(
fileServerClient
,
rootDirectory
,
azureShare
);
rootDirectory
+=
'
/
'
;
}
deferred
.
resolve
();
return
deferred
.
promise
;
}
/**
* upload a file to azure storage
* @param fileServerClient
* @param azureDirectory
* @param azureFileName
* @param azureShare
* @param localFilePath
* @param fileServerClient
* @param azureDirectory
* @param azureFileName
* @param azureShare
* @param localFilePath
*/
async
function
uploadFileToAzure
(
fileServerClient
:
any
,
azureDirectory
:
any
,
azureFileName
:
any
,
azureShare
:
any
,
localFilePath
:
any
):
Promise
<
void
>
{
async
function
uploadFileToAzure
(
fileServerClient
:
any
,
azureDirectory
:
string
,
azureFileName
:
any
,
azureShare
:
any
,
localFilePath
:
string
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
await
fileServerClient
.
createFileFromLocalFile
(
azureShare
,
azureDirectory
,
azureFileName
,
localFilePath
,
function
(
error
:
any
,
result
:
any
,
response
:
any
)
{
if
(
error
){
getLogger
().
error
(
`Upload file failed:,
${
error
}
`
);
await
fileServerClient
.
createFileFromLocalFile
(
azureShare
,
azureDirectory
,
azureFileName
,
localFilePath
,
(
error
:
any
,
result
:
any
,
response
:
any
)
=>
{
if
(
error
)
{
getLogger
()
.
error
(
`Upload file failed:,
${
error
}
`
);
deferred
.
reject
(
error
);
}
else
{
}
else
{
deferred
.
resolve
();
}
})
});
return
deferred
.
promise
;
}
/**
* download a file from azure storage
* @param fileServerClient
* @param azureDirectory
* @param azureFileName
* @param azureShare
* @param localFilePath
* @param fileServerClient
* @param azureDirectory
* @param azureFileName
* @param azureShare
* @param localFilePath
*/
async
function
downloadFile
(
fileServerClient
:
any
,
azureDirectory
:
any
,
azureFileName
:
any
,
azureShare
:
any
,
localFilePath
:
any
):
Promise
<
void
>
{
async
function
downloadFile
(
fileServerClient
:
any
,
azureDirectory
:
string
,
azureFileName
:
any
,
azureShare
:
any
,
localFilePath
:
string
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
await
fileServerClient
.
getFileToStream
(
azureShare
,
azureDirectory
,
azureFileName
,
fs
.
createWriteStream
(
localFilePath
),
function
(
error
:
any
,
result
:
any
,
response
:
any
)
{
if
(
error
){
getLogger
().
error
(
`Download file failed:,
${
error
}
`
);
// tslint:disable-next-line:non-literal-fs-path
await
fileServerClient
.
getFileToStream
(
azureShare
,
azureDirectory
,
azureFileName
,
fs
.
createWriteStream
(
localFilePath
),
(
error
:
any
,
result
:
any
,
response
:
any
)
=>
{
if
(
error
)
{
getLogger
()
.
error
(
`Download file failed:,
${
error
}
`
);
deferred
.
reject
(
error
);
}
else
{
deferred
.
resolve
();
}
else
{
deferred
.
resolve
();
}
})
});
return
deferred
.
promise
;
}
...
...
@@ -131,67 +149,79 @@ export namespace AzureStorageClientUtility {
* @param azureShare : the azure share used
* @param localDirectory : local directory to be uploaded
*/
export
async
function
uploadDirectory
(
fileServerClient
:
any
,
azureDirectory
:
any
,
azureShare
:
any
,
localDirectory
:
any
):
Promise
<
void
>
{
// tslint:disable:non-literal-fs-path
export
async
function
uploadDirectory
(
fileServerClient
:
azureStorage
.
FileService
,
azureDirectory
:
string
,
azureShare
:
any
,
localDirectory
:
string
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
fileNameArray
:
string
[]
=
fs
.
readdirSync
(
localDirectory
);
await
createDirectoryRecursive
(
fileServerClient
,
azureDirectory
,
azureShare
);
for
(
le
t
fileName
of
fileNameArray
){
for
(
cons
t
fileName
of
fileNameArray
)
{
const
fullFilePath
:
string
=
path
.
join
(
localDirectory
,
fileName
);
try
{
if
(
fs
.
lstatSync
(
fullFilePath
).
isFile
())
{
if
(
fs
.
lstatSync
(
fullFilePath
)
.
isFile
())
{
await
uploadFileToAzure
(
fileServerClient
,
azureDirectory
,
fileName
,
azureShare
,
fullFilePath
);
}
else
{
// If filePath is a directory, recuisively copy it to azure
await
uploadDirectory
(
fileServerClient
,
azureDirectory
+
'
/
'
+
fileName
,
azureShare
,
fullFilePath
);
await
uploadDirectory
(
fileServerClient
,
String
.
Format
(
'
{0}/{1}
'
,
azureDirectory
,
fileName
)
,
azureShare
,
fullFilePath
);
}
}
catch
(
error
)
{
}
catch
(
error
)
{
deferred
.
reject
(
error
);
return
deferred
.
promise
;
}
}
// All files/directories are copied successfully, resolve
deferred
.
resolve
();
return
deferred
.
promise
;
}
/**
* downlod a directory from azure
* @param fileServerClient
* @param azureDirectory
* @param azureShare
* @param localDirectory
* @param fileServerClient
* @param azureDirectory
* @param azureShare
* @param localDirectory
*/
export
async
function
downloadDirectory
(
fileServerClient
:
any
,
azureDirectory
:
any
,
azureShare
:
any
,
localDirectory
:
any
):
Promise
<
void
>
{
export
async
function
downloadDirectory
(
fileServerClient
:
any
,
azureDirectory
:
string
,
azureShare
:
any
,
localDirectory
:
string
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
mkDirP
(
localDirectory
);
fileServerClient
.
listFilesAndDirectoriesSegmented
(
azureShare
,
azureDirectory
,
'
null
'
,
function
(
error
:
any
,
result
:
any
,
response
:
any
)
{
if
((
'
entries
'
in
result
)
===
false
){
getLogger
().
error
(
`list files failed, can't get entries in result`
);
await
mkDirP
(
localDirectory
);
fileServerClient
.
listFilesAndDirectoriesSegmented
(
azureShare
,
azureDirectory
,
'
null
'
,
async
(
error
:
any
,
result
:
any
,
response
:
any
)
=>
{
if
((
'
entries
'
in
result
)
===
false
)
{
getLogger
()
.
error
(
`list files failed, can't get entries in result`
);
throw
new
Error
(
`list files failed, can't get entries in result`
);
}
if
((
'
files
'
in
result
[
'
entries
'
])
===
false
){
getLogger
().
error
(
`list files failed, can't get files in result['entries']`
);
if
((
'
files
'
in
result
.
entries
)
===
false
)
{
getLogger
()
.
error
(
`list files failed, can't get files in result['entries']`
);
throw
new
Error
(
`list files failed, can't get files in result['entries']`
);
}
if
((
'
directories
'
in
result
[
'
directories
'
])
===
false
){
getLogger
().
error
(
`list files failed, can't get directories in result['entries']`
);
if
((
'
directories
'
in
result
.
directories
)
===
false
)
{
getLogger
()
.
error
(
`list files failed, can't get directories in result['entries']`
);
throw
new
Error
(
`list files failed, can't get directories in result['entries']`
);
}
for
(
var
fileName
of
result
[
'
entries
'
][
'
files
'
]
){
for
(
const
fileName
of
result
.
entries
.
files
)
{
const
fullFilePath
:
string
=
path
.
join
(
localDirectory
,
fileName
.
name
);
downloadFile
(
fileServerClient
,
azureDirectory
,
fileName
.
name
,
azureShare
,
fullFilePath
)
await
downloadFile
(
fileServerClient
,
azureDirectory
,
fileName
.
name
,
azureShare
,
fullFilePath
)
;
}
for
(
var
directoryName
of
result
[
'
entries
'
][
'
directories
'
]
){
const
fullDirectoryPath
:
string
=
path
.
join
(
localDirectory
,
directoryName
.
name
)
const
fullAzureDirectory
:
string
=
path
.
join
(
azureDirectory
,
directoryName
.
name
)
downloadDirectory
(
fileServerClient
,
fullAzureDirectory
,
azureShare
,
fullDirectoryPath
)
for
(
const
directoryName
of
result
.
entries
.
directories
)
{
const
fullDirectoryPath
:
string
=
path
.
join
(
localDirectory
,
directoryName
.
name
)
;
const
fullAzureDirectory
:
string
=
path
.
join
(
azureDirectory
,
directoryName
.
name
)
;
await
downloadDirectory
(
fileServerClient
,
fullAzureDirectory
,
azureShare
,
fullDirectoryPath
)
;
}
deferred
.
resolve
();
})
});
return
deferred
.
promise
;
}
}
// tslint:enable: no-redundant-jsdoc no-any no-unsafe-any
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerApiClient.ts
View file @
ba8dccd6
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
...
...
@@ -20,21 +21,29 @@
'
use strict
'
;
import
*
as
fs
from
'
fs
'
;
import
{
KubernetesCRDClient
,
GeneralK8s
Client
}
from
'
../kubernetesApiClient
'
;
import
{
GeneralK8sClient
,
KubernetesCRD
Client
}
from
'
../kubernetesApiClient
'
;
abstract
class
FrameworkControllerClient
extends
KubernetesCRDClient
{
/**
* FrameworkController Client
*/
abstract
class
FrameworkControllerClient
extends
KubernetesCRDClient
{
/**
* Factory method to generate operator cliet
* Factory method to generate operator clie
n
t
*/
// tslint:disable-next-line:function-name
public
static
generateFrameworkControllerClient
():
KubernetesCRDClient
{
return
new
FrameworkControllerClientV1
();
}
}
/**
* FrameworkController ClientV1
*/
class
FrameworkControllerClientV1
extends
FrameworkControllerClient
{
/**
* constructor, to initialize frameworkcontroller CRD definition
*/
// tslint:disable: no-unsafe-any no-any
public
constructor
()
{
super
();
this
.
crdSchema
=
JSON
.
parse
(
fs
.
readFileSync
(
'
./config/frameworkcontroller/frameworkcontrollerjob-crd-v1.json
'
,
'
utf8
'
));
...
...
@@ -42,13 +51,13 @@ class FrameworkControllerClientV1 extends FrameworkControllerClient {
}
protected
get
operator
():
any
{
return
this
.
client
.
apis
[
"
frameworkcontroller.microsoft.com
"
].
v1
.
namespaces
(
'
default
'
).
frameworks
;
return
this
.
client
.
apis
[
'
frameworkcontroller.microsoft.com
'
].
v1
.
namespaces
(
'
default
'
).
frameworks
;
}
// tslint:enable: no-unsafe-any no-any
public
get
containerName
():
string
{
return
'
framework
'
;
}
}
}
export
{
FrameworkControllerClient
,
GeneralK8sClient
};
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerConfig.ts
View file @
ba8dccd6
...
...
@@ -20,10 +20,11 @@
'
use strict
'
;
import
*
as
assert
from
'
assert
'
;
import
{
Kubernetes
Trial
Config
,
Kubernetes
TrialConfigTemplat
e
,
KubernetesClusterConfig
Azure
,
Kubernetes
ClusterConfigNFS
,
NFSConfig
,
KubernetesStorageKind
,
keyVault
Config
,
Azure
Storage
,
KubernetesCluster
Config
,
StorageConfig
}
from
'
../kubernetesConfig
'
import
{
AzureStorage
,
KeyVaultConfig
,
Kubernetes
Cluster
Config
,
Kubernetes
ClusterConfigAzur
e
,
KubernetesClusterConfig
NFS
,
Kubernetes
StorageKind
,
KubernetesTrialConfig
,
KubernetesTrialConfigTemplate
,
NFS
Config
,
StorageConfig
}
from
'
../kubernetesConfig
'
;
// tslint:disable:completed-docs
export
class
FrameworkAttemptCompletionPolicy
{
public
readonly
minFailedTaskCount
:
number
;
public
readonly
minSucceededTaskCount
:
number
;
...
...
@@ -36,13 +37,13 @@ export class FrameworkAttemptCompletionPolicy {
/**
* Trial job configuration for FrameworkController
*/
export
class
FrameworkControllerTrialConfigTemplate
extends
KubernetesTrialConfigTemplate
{
export
class
FrameworkControllerTrialConfigTemplate
extends
KubernetesTrialConfigTemplate
{
public
readonly
frameworkAttemptCompletionPolicy
:
FrameworkAttemptCompletionPolicy
;
public
readonly
name
:
string
;
public
readonly
taskNum
:
number
;
constructor
(
taskNum
:
number
,
command
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
,
frameworkAttemptCompletionPolicy
:
FrameworkAttemptCompletionPolicy
)
{
constructor
(
taskNum
:
number
,
command
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
,
frameworkAttemptCompletionPolicy
:
FrameworkAttemptCompletionPolicy
)
{
super
(
command
,
gpuNum
,
cpuNum
,
memoryMB
,
image
);
this
.
frameworkAttemptCompletionPolicy
=
frameworkAttemptCompletionPolicy
;
this
.
name
=
name
;
...
...
@@ -50,7 +51,7 @@ export class FrameworkControllerTrialConfigTemplate extends KubernetesTrialConfi
}
}
export
class
FrameworkControllerTrialConfig
extends
KubernetesTrialConfig
{
export
class
FrameworkControllerTrialConfig
extends
KubernetesTrialConfig
{
public
readonly
taskRoles
:
FrameworkControllerTrialConfigTemplate
[];
public
readonly
codeDir
:
string
;
constructor
(
codeDir
:
string
,
taskRoles
:
FrameworkControllerTrialConfigTemplate
[])
{
...
...
@@ -68,11 +69,12 @@ export class FrameworkControllerClusterConfig extends KubernetesClusterConfig {
}
}
// tslint:disable:function-name
export
class
FrameworkControllerClusterConfigNFS
extends
KubernetesClusterConfigNFS
{
public
readonly
serviceAccountName
:
string
;
constructor
(
serviceAccountName
:
string
,
apiVersion
:
string
,
serviceAccountName
:
string
,
apiVersion
:
string
,
nfs
:
NFSConfig
,
storage
?:
KubernetesStorageKind
)
{
...
...
@@ -81,8 +83,9 @@ export class FrameworkControllerClusterConfigNFS extends KubernetesClusterConfig
}
public
static
getInstance
(
jsonObject
:
object
):
FrameworkControllerClusterConfigNFS
{
let
kubeflowClusterConfigObjectNFS
=
<
FrameworkControllerClusterConfigNFS
>
jsonObject
;
assert
(
kubeflowClusterConfigObjectNFS
!==
undefined
)
const
kubeflowClusterConfigObjectNFS
:
FrameworkControllerClusterConfigNFS
=
<
FrameworkControllerClusterConfigNFS
>
jsonObject
;
assert
(
kubeflowClusterConfigObjectNFS
!==
undefined
);
return
new
FrameworkControllerClusterConfigNFS
(
kubeflowClusterConfigObjectNFS
.
serviceAccountName
,
kubeflowClusterConfigObjectNFS
.
apiVersion
,
...
...
@@ -94,20 +97,21 @@ export class FrameworkControllerClusterConfigNFS extends KubernetesClusterConfig
export
class
FrameworkControllerClusterConfigAzure
extends
KubernetesClusterConfigAzure
{
public
readonly
serviceAccountName
:
string
;
constructor
(
serviceAccountName
:
string
,
apiVersion
:
string
,
keyVault
:
k
eyVaultConfig
,
azureStorage
:
AzureStorage
,
serviceAccountName
:
string
,
apiVersion
:
string
,
keyVault
:
K
eyVaultConfig
,
azureStorage
:
AzureStorage
,
storage
?:
KubernetesStorageKind
)
{
super
(
apiVersion
,
keyVault
,
azureStorage
,
storage
);
super
(
apiVersion
,
keyVault
,
azureStorage
,
storage
);
this
.
serviceAccountName
=
serviceAccountName
;
}
public
static
getInstance
(
jsonObject
:
object
):
FrameworkControllerClusterConfigAzure
{
let
kubeflowClusterConfigObjectAzure
=
<
FrameworkControllerClusterConfigAzure
>
jsonObject
;
const
kubeflowClusterConfigObjectAzure
:
FrameworkControllerClusterConfigAzure
=
<
FrameworkControllerClusterConfigAzure
>
jsonObject
;
return
new
FrameworkControllerClusterConfigAzure
(
kubeflowClusterConfigObjectAzure
.
serviceAccountName
,
kubeflowClusterConfigObjectAzure
.
apiVersion
,
...
...
@@ -121,11 +125,11 @@ export class FrameworkControllerClusterConfigAzure extends KubernetesClusterConf
export
class
FrameworkControllerClusterConfigFactory
{
public
static
generateFrameworkControllerClusterConfig
(
jsonObject
:
object
):
FrameworkControllerClusterConfig
{
let
s
torageConfig
=
<
StorageConfig
>
jsonObject
;
if
(
!
storageConfig
)
{
throw
new
Error
(
"
Invalid json object as a StorageConfig instance
"
);
const
storageConfig
:
S
torageConfig
=
<
StorageConfig
>
jsonObject
;
if
(
storageConfig
===
undefined
)
{
throw
new
Error
(
'
Invalid json object as a StorageConfig instance
'
);
}
if
(
storageConfig
.
storage
&&
storageConfig
.
storage
===
'
azureStorage
'
)
{
if
(
storageConfig
.
storage
!==
undefined
&&
storageConfig
.
storage
===
'
azureStorage
'
)
{
return
FrameworkControllerClusterConfigAzure
.
getInstance
(
jsonObject
);
}
else
if
(
storageConfig
.
storage
===
undefined
||
storageConfig
.
storage
===
'
nfs
'
)
{
return
FrameworkControllerClusterConfigNFS
.
getInstance
(
jsonObject
);
...
...
@@ -134,6 +138,7 @@ export class FrameworkControllerClusterConfigFactory {
}
}
export
type
FrameworkControllerJobStatus
=
'
AttemptRunning
'
|
'
Completed
'
|
'
AttemptCreationPending
'
|
'
AttemptCreationRequested
'
|
'
AttemptPreparing
'
|
'
AttemptCompleted
'
;
export
type
FrameworkControllerJobStatus
=
'
AttemptRunning
'
|
'
Completed
'
|
'
AttemptCreationPending
'
|
'
AttemptCreationRequested
'
|
'
AttemptPreparing
'
|
'
AttemptCompleted
'
;
export
type
FrameworkControllerJobCompleteStatus
=
'
Succeeded
'
|
'
Failed
'
;
\ No newline at end of file
export
type
FrameworkControllerJobCompleteStatus
=
'
Succeeded
'
|
'
Failed
'
;
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerJobInfoCollector.ts
View file @
ba8dccd6
...
...
@@ -19,66 +19,74 @@
'
use strict
'
;
import
{
KubernetesTrialJobDetail
}
from
'
../kubernetesData
'
;
import
{
KubernetesCRDClient
}
from
'
../kubernetesApiClient
'
;
import
{
KubernetesTrialJobDetail
}
from
'
../kubernetesData
'
;
import
{
KubernetesJobInfoCollector
}
from
'
../kubernetesJobInfoCollector
'
;
import
{
FrameworkControllerJobStatus
,
FrameworkControllerJob
Complete
Status
}
from
'
./frameworkcontrollerConfig
'
;
import
{
FrameworkControllerJob
Complete
Status
,
FrameworkControllerJobStatus
}
from
'
./frameworkcontrollerConfig
'
;
/**
* Collector frameworkcontroller jobs info from Kubernetes cluster, and update frameworkcontroller job status locally
*/
export
class
FrameworkControllerJobInfoCollector
extends
KubernetesJobInfoCollector
{
export
class
FrameworkControllerJobInfoCollector
extends
KubernetesJobInfoCollector
{
constructor
(
jobMap
:
Map
<
string
,
KubernetesTrialJobDetail
>
)
{
super
(
jobMap
);
}
protected
async
retrieveSingleTrialJobInfo
(
kubernetesCRDClient
:
KubernetesCRDClient
|
undefined
,
kubernetesTrialJob
:
KubernetesTrialJobDetail
)
:
Promise
<
void
>
{
protected
async
retrieveSingleTrialJobInfo
(
kubernetesCRDClient
:
KubernetesCRDClient
|
undefined
,
kubernetesTrialJob
:
KubernetesTrialJobDetail
)
:
Promise
<
void
>
{
if
(
!
this
.
statusesNeedToCheck
.
includes
(
kubernetesTrialJob
.
status
))
{
return
Promise
.
resolve
();
}
if
(
kubernetesCRDClient
===
undefined
)
{
if
(
kubernetesCRDClient
===
undefined
)
{
return
Promise
.
reject
(
'
kubernetesCRDClient is undefined
'
);
}
// tslint:disable-next-line:no-any
let
kubernetesJobInfo
:
any
;
try
{
kubernetesJobInfo
=
await
kubernetesCRDClient
.
getKubernetesJob
(
kubernetesTrialJob
.
kubernetesJobName
);
}
catch
(
error
)
{
kubernetesJobInfo
=
await
kubernetesCRDClient
.
getKubernetesJob
(
kubernetesTrialJob
.
kubernetesJobName
);
}
catch
(
error
)
{
this
.
log
.
error
(
`Get job
${
kubernetesTrialJob
.
kubernetesJobName
}
info failed, error is
${
error
}
`
);
//This is not treat as a error status
return
Promise
.
resolve
();
}
if
(
kubernetesJobInfo
.
status
&&
kubernetesJobInfo
.
status
.
state
)
{
// tslint:disable: no-unsafe-any
if
(
kubernetesJobInfo
.
status
&&
kubernetesJobInfo
.
status
.
state
)
{
const
frameworkJobType
:
FrameworkControllerJobStatus
=
<
FrameworkControllerJobStatus
>
kubernetesJobInfo
.
status
.
state
;
switch
(
frameworkJobType
)
{
case
'
AttemptCreationPending
'
||
'
AttemptCreationRequested
'
||
'
AttemptPreparing
'
:
switch
(
frameworkJobType
)
{
case
'
AttemptCreationPending
'
:
case
'
AttemptCreationRequested
'
:
case
'
AttemptPreparing
'
:
kubernetesTrialJob
.
status
=
'
WAITING
'
;
break
;
case
'
AttemptRunning
'
:
kubernetesTrialJob
.
status
=
'
RUNNING
'
;
if
(
!
kubernetesTrialJob
.
startTime
)
{
if
(
kubernetesTrialJob
.
startTime
===
undefined
)
{
kubernetesTrialJob
.
startTime
=
Date
.
parse
(
<
string
>
kubernetesJobInfo
.
status
.
startTime
);
}
break
;
case
'
Completed
'
:
const
completedJobType
:
FrameworkControllerJobCompleteStatus
=
<
FrameworkControllerJobCompleteStatus
>
kubernetesJobInfo
.
status
.
attemptStatus
.
completionStatus
.
type
.
name
;
switch
(
completedJobType
)
{
const
completedJobType
:
FrameworkControllerJobCompleteStatus
=
<
FrameworkControllerJobCompleteStatus
>
kubernetesJobInfo
.
status
.
attemptStatus
.
completionStatus
.
type
.
name
;
switch
(
completedJobType
)
{
case
'
Succeeded
'
:
kubernetesTrialJob
.
status
=
'
SUCCEEDED
'
;
break
;
case
'
Failed
'
:
kubernetesTrialJob
.
status
=
'
FAILED
'
;
break
;
break
;
default
:
}
kubernetesTrialJob
.
endTime
=
Date
.
parse
(
<
string
>
kubernetesJobInfo
.
status
.
completionTime
);
kubernetesTrialJob
.
endTime
=
Date
.
parse
(
<
string
>
kubernetesJobInfo
.
status
.
completionTime
);
break
;
default
:
break
;
}
}
return
Promise
.
resolve
();
}
}
\ No newline at end of file
// tslint:enable: no-unsafe-any
}
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerJobRestServer.ts
View file @
ba8dccd6
...
...
@@ -20,16 +20,16 @@
'
use strict
'
;
import
*
as
component
from
'
../../../common/component
'
;
import
{
KubernetesJobRestServer
}
from
'
../kubernetesJobRestServer
'
;
import
{
FrameworkControllerTrainingService
}
from
'
./frameworkcontrollerTrainingService
'
;
import
{
KubernetesJobRestServer
}
from
'
../kubernetesJobRestServer
'
/**
* frameworkcontroller Training service Rest server, provides rest API to support frameworkcontroller job metrics update
*
*
*/
@
component
.
Singleton
export
class
FrameworkControllerJobRestServer
extends
KubernetesJobRestServer
{
export
class
FrameworkControllerJobRestServer
extends
KubernetesJobRestServer
{
constructor
()
{
super
(
component
.
get
(
FrameworkControllerTrainingService
));
}
}
\ No newline at end of file
}
}
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts
View file @
ba8dccd6
...
...
@@ -17,31 +17,29 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'
use strict
'
'
use strict
'
;
import
*
as
component
from
'
../../../common/component
'
;
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
path
from
'
path
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../../common/containerJobData
'
;
import
*
as
component
from
'
../../../common/component
'
;
import
{
getExperimentId
}
from
'
../../../common/experimentStartupInfo
'
;
import
{
TrialConfigMetadataKey
}
from
'
../../common/trialConfigMetadataKey
'
;
import
{
JobApplicationForm
,
TrialJobApplicationForm
,
TrialJobDetail
,
NNIManagerIpConfig
JobApplicationForm
,
NNIManagerIpConfig
,
TrialJobApplicationForm
,
TrialJobDetail
}
from
'
../../../common/trainingService
'
;
import
{
delay
,
generateParamFileName
,
getExperimentRootDir
,
uniqueString
}
from
'
../../../common/utils
'
;
import
{
NFSConfig
}
from
'
../kubernetesConfig
'
import
{
KubernetesTrialJobDetail
}
from
'
../kubernetesData
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../../common/containerJobData
'
;
import
{
TrialConfigMetadataKey
}
from
'
../../common/trialConfigMetadataKey
'
;
import
{
validateCodeDir
}
from
'
../../common/util
'
;
import
{
AzureStorageClientUtility
}
from
'
../azureStorageClientUtils
'
;
import
{
NFSConfig
}
from
'
../kubernetesConfig
'
;
import
{
KubernetesTrialJobDetail
}
from
'
../kubernetesData
'
;
import
{
KubernetesTrainingService
}
from
'
../kubernetesTrainingService
'
;
import
{
FrameworkControllerTrialConfig
,
FrameworkControllerClusterConfig
,
FrameworkControllerClusterConfigAzure
,
FrameworkControllerClusterConfigNFS
,
FrameworkControllerClusterConfigFactory
}
from
'
./frameworkcontrollerConfig
'
;
import
{
FrameworkControllerJobRestServer
}
from
'
./frameworkcontrollerJobRestServer
'
;
import
{
FrameworkControllerClient
}
from
'
./frameworkcontrollerApiClient
'
;
import
{
FrameworkControllerClusterConfig
,
FrameworkControllerClusterConfigAzure
,
FrameworkControllerClusterConfigFactory
,
FrameworkControllerClusterConfigNFS
,
FrameworkControllerTrialConfig
}
from
'
./frameworkcontrollerConfig
'
;
import
{
FrameworkControllerJobInfoCollector
}
from
'
./frameworkcontrollerJobInfoCollector
'
;
import
{
FrameworkControllerJobRestServer
}
from
'
./frameworkcontrollerJobRestServer
'
;
/**
* Training Service implementation for frameworkcontroller
...
...
@@ -49,30 +47,30 @@ import { FrameworkControllerJobInfoCollector } from './frameworkcontrollerJobInf
@
component
.
Singleton
class
FrameworkControllerTrainingService
extends
KubernetesTrainingService
implements
KubernetesTrainingService
{
private
fcTrialConfig
?:
FrameworkControllerTrialConfig
;
// frameworkcontroller trial configuration
private
fcJobInfoCollector
:
FrameworkControllerJobInfoCollector
;
// frameworkcontroller job info collector
private
fcContainerPortMap
=
new
Map
<
string
,
number
>
();
// store frameworkcontroller container port
private
readonly
fcJobInfoCollector
:
FrameworkControllerJobInfoCollector
;
// frameworkcontroller job info collector
private
readonly
fcContainerPortMap
:
Map
<
string
,
number
>
=
new
Map
<
string
,
number
>
();
// store frameworkcontroller container port
private
fcClusterConfig
?:
FrameworkControllerClusterConfig
;
constructor
()
{
super
();
this
.
fcJobInfoCollector
=
new
FrameworkControllerJobInfoCollector
(
this
.
trialJobsMap
);
this
.
experimentId
=
getExperimentId
();
this
.
experimentId
=
getExperimentId
();
this
.
nextTrialSequenceId
=
-
1
;
}
public
async
run
():
Promise
<
void
>
{
this
.
kubernetesJobRestServer
=
component
.
get
(
FrameworkControllerJobRestServer
);
if
(
!
this
.
kubernetesJobRestServer
)
{
if
(
this
.
kubernetesJobRestServer
===
undefined
)
{
throw
new
Error
(
'
kubernetesJobRestServer not initialized!
'
);
}
await
this
.
kubernetesJobRestServer
.
start
();
this
.
kubernetesJobRestServer
.
setEnableVersionCheck
=
this
.
versionCheck
;
this
.
log
.
info
(
`frameworkcontroller Training service rest server listening on:
${
this
.
kubernetesJobRestServer
.
endPoint
}
`
);
while
(
!
this
.
stopping
)
{
// collect metrics for frameworkcontroller jobs by interacting with Kubernetes API server
// collect metrics for frameworkcontroller jobs by interacting with Kubernetes API server
await
delay
(
3000
);
await
this
.
fcJobInfoCollector
.
retrieveTrialStatus
(
this
.
kubernetesCRDClient
);
if
(
this
.
kubernetesJobRestServer
.
getErrorMessage
)
{
if
(
this
.
kubernetesJobRestServer
.
getErrorMessage
!==
undefined
)
{
throw
new
Error
(
this
.
kubernetesJobRestServer
.
getErrorMessage
);
this
.
stopping
=
true
;
}
...
...
@@ -80,14 +78,14 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
}
public
async
submitTrialJob
(
form
:
JobApplicationForm
):
Promise
<
TrialJobDetail
>
{
if
(
!
this
.
fcClusterConfig
)
{
if
(
this
.
fcClusterConfig
===
undefined
)
{
throw
new
Error
(
'
frameworkcontrollerClusterConfig is not initialized
'
);
}
if
(
!
this
.
kubernetesCRDClient
)
{
if
(
this
.
kubernetesCRDClient
===
undefined
)
{
throw
new
Error
(
'
kubernetesCRDClient is undefined
'
);
}
if
(
!
this
.
kubernetesRestServerPort
)
{
if
(
this
.
kubernetesRestServerPort
===
undefined
)
{
const
restServer
:
FrameworkControllerJobRestServer
=
component
.
get
(
FrameworkControllerJobRestServer
);
this
.
kubernetesRestServerPort
=
restServer
.
clusterRestServerPort
;
}
...
...
@@ -97,14 +95,14 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
// Set trial's NFS working folder
const
trialWorkingFolder
:
string
=
path
.
join
(
this
.
CONTAINER_MOUNT_PATH
,
'
nni
'
,
getExperimentId
(),
trialJobId
);
const
trialLocalTempFolder
:
string
=
path
.
join
(
getExperimentRootDir
(),
'
trials-local
'
,
trialJobId
);
const
frameworkcontrollerJobName
=
`nniexp
${
this
.
experimentId
}
trial
${
trialJobId
}
`
.
toLowerCase
();
const
frameworkcontrollerJobName
:
string
=
`nniexp
${
this
.
experimentId
}
trial
${
trialJobId
}
`
.
toLowerCase
();
//Generate the port used for taskRole
this
.
generateContainerPort
();
await
this
.
prepareRunScript
(
trialLocalTempFolder
,
curTrialSequenceId
,
trialJobId
,
trialWorkingFolder
,
form
);
//upload code files
le
t
trialJobOutputUrl
:
string
=
await
this
.
uploadCodeFiles
(
trialJobId
,
trialLocalTempFolder
);
cons
t
trialJobOutputUrl
:
string
=
await
this
.
uploadCodeFiles
(
trialJobId
,
trialLocalTempFolder
);
const
trialJobDetail
:
KubernetesTrialJobDetail
=
new
KubernetesTrialJobDetail
(
trialJobId
,
'
WAITING
'
,
...
...
@@ -116,182 +114,202 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
trialJobOutputUrl
);
// Set trial job detail until create frameworkcontroller job successfully
// Set trial job detail until create frameworkcontroller job successfully
this
.
trialJobsMap
.
set
(
trialJobId
,
trialJobDetail
);
// Create frameworkcontroller job based on generated frameworkcontroller job resource config
const
frameworkcontrollerJobConfig
=
await
this
.
prepareFrameworkControllerConfig
(
trialJobId
,
trialWorkingFolder
,
frameworkcontrollerJobName
);
// tslint:disable-next-line:no-any
const
frameworkcontrollerJobConfig
:
any
=
await
this
.
prepareFrameworkControllerConfig
(
trialJobId
,
trialWorkingFolder
,
frameworkcontrollerJobName
);
await
this
.
kubernetesCRDClient
.
createKubernetesJob
(
frameworkcontrollerJobConfig
);
// Set trial job detail until create frameworkcontroller job successfully
// Set trial job detail until create frameworkcontroller job successfully
this
.
trialJobsMap
.
set
(
trialJobId
,
trialJobDetail
);
return
Promise
.
resolve
(
trialJobDetail
);
}
// tslint:disable:no-redundant-jsdoc no-any no-unsafe-any
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
switch
(
key
)
{
case
TrialConfigMetadataKey
.
NNI_MANAGER_IP
:
this
.
nniManagerIpConfig
=
<
NNIManagerIpConfig
>
JSON
.
parse
(
value
);
break
;
case
TrialConfigMetadataKey
.
FRAMEWORKCONTROLLER_CLUSTER_CONFIG
:
const
frameworkcontrollerClusterJsonObject
:
any
=
JSON
.
parse
(
value
);
this
.
fcClusterConfig
=
FrameworkControllerClusterConfigFactory
.
generateFrameworkControllerClusterConfig
(
frameworkcontrollerClusterJsonObject
);
if
(
this
.
fcClusterConfig
.
storageType
===
'
azureStorage
'
)
{
const
azureFrameworkControllerClusterConfig
:
FrameworkControllerClusterConfigAzure
=
<
FrameworkControllerClusterConfigAzure
>
this
.
fcClusterConfig
;
this
.
azureStorageAccountName
=
azureFrameworkControllerClusterConfig
.
azureStorage
.
accountName
;
this
.
azureStorageShare
=
azureFrameworkControllerClusterConfig
.
azureStorage
.
azureShare
;
await
this
.
createAzureStorage
(
azureFrameworkControllerClusterConfig
.
keyVault
.
vaultName
,
azureFrameworkControllerClusterConfig
.
keyVault
.
name
,
azureFrameworkControllerClusterConfig
.
azureStorage
.
accountName
,
azureFrameworkControllerClusterConfig
.
azureStorage
.
azureShare
);
}
else
if
(
this
.
fcClusterConfig
.
storageType
===
'
nfs
'
)
{
const
nfsFrameworkControllerClusterConfig
:
FrameworkControllerClusterConfigNFS
=
<
FrameworkControllerClusterConfigNFS
>
this
.
fcClusterConfig
;
await
this
.
createNFSStorage
(
nfsFrameworkControllerClusterConfig
.
nfs
.
server
,
nfsFrameworkControllerClusterConfig
.
nfs
.
path
);
}
this
.
kubernetesCRDClient
=
FrameworkControllerClient
.
generateFrameworkControllerClient
();
break
;
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
const
frameworkcontrollerTrialJsonObjsect
:
any
=
JSON
.
parse
(
value
);
this
.
fcTrialConfig
=
new
FrameworkControllerTrialConfig
(
frameworkcontrollerTrialJsonObjsect
.
codeDir
,
frameworkcontrollerTrialJsonObjsect
.
taskRoles
);
// Validate to make sure codeDir doesn't have too many files
try
{
await
validateCodeDir
(
this
.
fcTrialConfig
.
codeDir
);
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
return
Promise
.
reject
(
new
Error
(
error
));
}
break
;
case
TrialConfigMetadataKey
.
VERSION_CHECK
:
this
.
versionCheck
=
(
value
===
'
true
'
||
value
===
'
True
'
);
break
;
case
TrialConfigMetadataKey
.
LOG_COLLECTION
:
this
.
logCollection
=
value
;
break
;
default
:
}
return
Promise
.
resolve
();
}
// tslint:enable: no-any no-unsafe-any
/**
* upload code files to nfs or azureStroage
* @param trialJobId
* @param trialLocalTempFolder
* @param trialJobId
* @param trialLocalTempFolder
* return: trialJobOutputUrl
*/
private
async
uploadCodeFiles
(
trialJobId
:
string
,
trialLocalTempFolder
:
string
):
Promise
<
string
>
{
if
(
!
this
.
fcClusterConfig
)
{
if
(
this
.
fcClusterConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
}
let
trialJobOutputUrl
:
string
=
''
;
if
(
this
.
fcClusterConfig
.
storageType
===
'
azureStorage
'
)
{
try
{
if
(
this
.
fcClusterConfig
.
storageType
===
'
azureStorage
'
)
{
if
(
this
.
azureStorageClient
===
undefined
)
{
throw
new
Error
(
'
azureStorageClient is not initialized
'
);
}
try
{
//upload local files to azure storage
await
AzureStorageClientUtility
.
uploadDirectory
(
this
.
azureStorageClient
,
`nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
,
this
.
azureStorageShare
,
`
${
trialLocalTempFolder
}
`
);
await
AzureStorageClientUtility
.
uploadDirectory
(
this
.
azureStorageClient
,
`nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
,
this
.
azureStorageShare
,
`
${
trialLocalTempFolder
}
`
);
trialJobOutputUrl
=
`https://
${
this
.
azureStorageAccountName
}
.file.core.windows.net/
${
this
.
azureStorageShare
}
/
${
path
.
join
(
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
}
catch
(
error
){
trialJobOutputUrl
=
`https://
${
this
.
azureStorageAccountName
}
.file.core.windows.net/\
${
this
.
azureStorageShare
}
/
${
path
.
join
(
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
;
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
return
Promise
.
reject
(
error
);
}
}
else
if
(
this
.
fcClusterConfig
.
storageType
===
'
nfs
'
)
{
let
nfsFrameworkControllerClusterConfig
:
FrameworkControllerClusterConfigNFS
=
<
FrameworkControllerClusterConfigNFS
>
this
.
fcClusterConfig
;
// Creat work dir for current trial in NFS directory
}
else
if
(
this
.
fcClusterConfig
.
storageType
===
'
nfs
'
)
{
const
nfsFrameworkControllerClusterConfig
:
FrameworkControllerClusterConfigNFS
=
<
FrameworkControllerClusterConfigNFS
>
this
.
fcClusterConfig
;
// Creat work dir for current trial in NFS directory
await
cpp
.
exec
(
`mkdir -p
${
this
.
trialLocalNFSTempFolder
}
/nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
);
// Copy code files from local dir to NFS mounted dir
await
cpp
.
exec
(
`cp -r
${
trialLocalTempFolder
}
/*
${
this
.
trialLocalNFSTempFolder
}
/nni/
${
getExperimentId
()}
/
${
trialJobId
}
/.`
);
const
nfsConfig
:
NFSConfig
=
nfsFrameworkControllerClusterConfig
.
nfs
;
trialJobOutputUrl
=
`nfs://
${
nfsConfig
.
server
}
:
${
path
.
join
(
nfsConfig
.
path
,
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
trialJobOutputUrl
=
`nfs://
${
nfsConfig
.
server
}
:
${
path
.
join
(
nfsConfig
.
path
,
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
;
}
return
Promise
.
resolve
(
trialJobOutputUrl
);
}
/**
* generate trial's command for frameworkcontroller
* expose port and execute injector.sh before executing user's command
* @param command
* @param command
*/
private
generateCommandScript
(
command
:
string
):
string
{
let
portScript
=
''
;
if
(
!
this
.
fcTrialConfig
)
{
let
portScript
:
string
=
''
;
if
(
this
.
fcTrialConfig
===
undefined
)
{
throw
new
Error
(
'
frameworkcontroller trial config is not initialized
'
);
}
for
(
le
t
taskRole
of
this
.
fcTrialConfig
.
taskRoles
)
{
for
(
cons
t
taskRole
of
this
.
fcTrialConfig
.
taskRoles
)
{
portScript
+=
`FB_
${
taskRole
.
name
.
toUpperCase
()}
_PORT=
${
this
.
fcContainerPortMap
.
get
(
taskRole
.
name
)}
`
;
}
return
`
${
portScript
}
. /mnt/frameworkbarrier/injector.sh &&
${
command
}
`
;
}
private
async
prepareRunScript
(
trialLocalTempFolder
:
string
,
curTrialSequenceId
:
number
,
trialJobId
:
string
,
trialWorkingFolder
:
string
,
form
:
JobApplicationForm
):
Promise
<
void
>
{
if
(
!
this
.
fcTrialConfig
)
{
private
async
prepareRunScript
(
trialLocalTempFolder
:
string
,
curTrialSequenceId
:
number
,
trialJobId
:
string
,
trialWorkingFolder
:
string
,
form
:
JobApplicationForm
):
Promise
<
void
>
{
if
(
this
.
fcTrialConfig
===
undefined
)
{
throw
new
Error
(
'
frameworkcontroller trial config is not initialized
'
);
}
await
cpp
.
exec
(
`mkdir -p
${
path
.
dirname
(
trialLocalTempFolder
)}
`
);
await
cpp
.
exec
(
`cp -r
${
this
.
fcTrialConfig
.
codeDir
}
${
trialLocalTempFolder
}
`
);
const
run
ScriptContent
:
string
=
CONTAINER_INSTALL_NNI_SHELL_FORMAT
;
const
install
ScriptContent
:
string
=
CONTAINER_INSTALL_NNI_SHELL_FORMAT
;
// Write NNI installation file to local tmp files
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
install_nni.sh
'
),
run
ScriptContent
,
{
encoding
:
'
utf8
'
});
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
install_nni.sh
'
),
install
ScriptContent
,
{
encoding
:
'
utf8
'
});
// Create tmp trial working folder locally.
await
cpp
.
exec
(
`mkdir -p
${
trialLocalTempFolder
}
`
);
for
(
let
taskRole
of
this
.
fcTrialConfig
.
taskRoles
)
{
const
runScriptContent
:
string
=
await
this
.
generateRunScript
(
'
frameworkcontroller
'
,
trialJobId
,
trialWorkingFolder
,
this
.
generateCommandScript
(
taskRole
.
command
),
curTrialSequenceId
.
toString
(),
taskRole
.
name
,
taskRole
.
gpuNum
);
for
(
const
taskRole
of
this
.
fcTrialConfig
.
taskRoles
)
{
const
runScriptContent
:
string
=
await
this
.
generateRunScript
(
'
frameworkcontroller
'
,
trialJobId
,
trialWorkingFolder
,
this
.
generateCommandScript
(
taskRole
.
command
),
curTrialSequenceId
.
toString
(),
taskRole
.
name
,
taskRole
.
gpuNum
);
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
`run_
${
taskRole
.
name
}
.sh`
),
runScriptContent
,
{
encoding
:
'
utf8
'
});
}
// Write file content ( parameter.cfg ) to local tmp folders
const
trialForm
:
TrialJobApplicationForm
=
(
<
TrialJobApplicationForm
>
form
)
if
(
trialForm
&&
trialForm
.
hyperParameters
)
{
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
generateParamFileName
(
trialForm
.
hyperParameters
)),
trialForm
.
hyperParameters
.
value
,
{
encoding
:
'
utf8
'
});
const
trialForm
:
TrialJobApplicationForm
=
(
<
TrialJobApplicationForm
>
form
)
;
if
(
trialForm
!==
undefined
&&
trialForm
.
hyperParameters
!==
undefined
)
{
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
generateParamFileName
(
trialForm
.
hyperParameters
)),
trialForm
.
hyperParameters
.
value
,
{
encoding
:
'
utf8
'
});
}
}
private
async
prepareFrameworkControllerConfig
(
trialJobId
:
string
,
trialWorkingFolder
:
string
,
frameworkcontrollerJobName
:
string
):
Promise
<
any
>
{
if
(
!
this
.
fcTrialConfig
)
{
// tslint:disable: no-any no-unsafe-any
private
async
prepareFrameworkControllerConfig
(
trialJobId
:
string
,
trialWorkingFolder
:
string
,
frameworkcontrollerJobName
:
string
):
Promise
<
any
>
{
if
(
this
.
fcTrialConfig
===
undefined
)
{
throw
new
Error
(
'
frameworkcontroller trial config is not initialized
'
);
}
const
podResources
:
any
=
[];
for
(
le
t
taskRole
of
this
.
fcTrialConfig
.
taskRoles
)
{
le
t
resource
:
any
=
{};
for
(
cons
t
taskRole
of
this
.
fcTrialConfig
.
taskRoles
)
{
cons
t
resource
:
any
=
{};
resource
.
requests
=
this
.
generatePodResource
(
taskRole
.
memoryMB
,
taskRole
.
cpuNum
,
taskRole
.
gpuNum
);
resource
.
limits
=
Object
.
assign
({},
resource
.
requests
)
;
resource
.
limits
=
{...
resource
.
requests
}
;
podResources
.
push
(
resource
);
}
// Generate frameworkcontroller job resource config object
const
frameworkcontrollerJobConfig
:
any
=
this
.
generateFrameworkControllerJobConfig
(
trialJobId
,
trialWorkingFolder
,
frameworkcontrollerJobName
,
podResources
);
// Generate frameworkcontroller job resource config object
const
frameworkcontrollerJobConfig
:
any
=
this
.
generateFrameworkControllerJobConfig
(
trialJobId
,
trialWorkingFolder
,
frameworkcontrollerJobName
,
podResources
);
return
Promise
.
resolve
(
frameworkcontrollerJobConfig
);
}
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
switch
(
key
)
{
case
TrialConfigMetadataKey
.
NNI_MANAGER_IP
:
this
.
nniManagerIpConfig
=
<
NNIManagerIpConfig
>
JSON
.
parse
(
value
);
break
;
case
TrialConfigMetadataKey
.
FRAMEWORKCONTROLLER_CLUSTER_CONFIG
:
let
frameworkcontrollerClusterJsonObject
=
JSON
.
parse
(
value
);
this
.
fcClusterConfig
=
FrameworkControllerClusterConfigFactory
.
generateFrameworkControllerClusterConfig
(
frameworkcontrollerClusterJsonObject
);
if
(
this
.
fcClusterConfig
.
storageType
===
'
azureStorage
'
)
{
let
azureFrameworkControllerClusterConfig
=
<
FrameworkControllerClusterConfigAzure
>
this
.
fcClusterConfig
;
this
.
azureStorageAccountName
=
azureFrameworkControllerClusterConfig
.
azureStorage
.
accountName
;
this
.
azureStorageShare
=
azureFrameworkControllerClusterConfig
.
azureStorage
.
azureShare
;
await
this
.
createAzureStorage
(
azureFrameworkControllerClusterConfig
.
keyVault
.
vaultName
,
azureFrameworkControllerClusterConfig
.
keyVault
.
name
,
azureFrameworkControllerClusterConfig
.
azureStorage
.
accountName
,
azureFrameworkControllerClusterConfig
.
azureStorage
.
azureShare
);
}
else
if
(
this
.
fcClusterConfig
.
storageType
===
'
nfs
'
)
{
let
nfsFrameworkControllerClusterConfig
=
<
FrameworkControllerClusterConfigNFS
>
this
.
fcClusterConfig
;
await
this
.
createNFSStorage
(
nfsFrameworkControllerClusterConfig
.
nfs
.
server
,
nfsFrameworkControllerClusterConfig
.
nfs
.
path
);
}
this
.
kubernetesCRDClient
=
FrameworkControllerClient
.
generateFrameworkControllerClient
();
break
;
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
let
frameworkcontrollerTrialJsonObjsect
=
JSON
.
parse
(
value
);
this
.
fcTrialConfig
=
new
FrameworkControllerTrialConfig
(
frameworkcontrollerTrialJsonObjsect
.
codeDir
,
frameworkcontrollerTrialJsonObjsect
.
taskRoles
);
// Validate to make sure codeDir doesn't have too many files
try
{
await
validateCodeDir
(
this
.
fcTrialConfig
.
codeDir
);
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
return
Promise
.
reject
(
new
Error
(
error
));
}
break
;
case
TrialConfigMetadataKey
.
VERSION_CHECK
:
this
.
versionCheck
=
(
value
===
'
true
'
||
value
===
'
True
'
);
break
;
case
TrialConfigMetadataKey
.
LOG_COLLECTION
:
this
.
logCollection
=
value
;
break
;
default
:
break
;
}
return
Promise
.
resolve
();
}
private
generateContainerPort
()
{
if
(
!
this
.
fcTrialConfig
)
{
private
generateContainerPort
()
:
void
{
if
(
this
.
fcTrialConfig
===
undefined
)
{
throw
new
Error
(
'
frameworkcontroller trial config is not initialized
'
);
}
let
port
=
4000
;
//The default port used in container
for
(
le
t
index
in
this
.
fcTrialConfig
.
taskRoles
)
{
let
port
:
number
=
4000
;
//The default port used in container
for
(
cons
t
index
of
this
.
fcTrialConfig
.
taskRoles
.
keys
()
)
{
this
.
fcContainerPortMap
.
set
(
this
.
fcTrialConfig
.
taskRoles
[
index
].
name
,
port
);
port
+=
1
;
}
...
...
@@ -304,24 +322,25 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
* @param frameworkcontrollerJobName job name
* @param podResources pod template
*/
private
generateFrameworkControllerJobConfig
(
trialJobId
:
string
,
trialWorkingFolder
:
string
,
frameworkcontrollerJobName
:
string
,
podResources
:
any
)
:
any
{
if
(
!
this
.
fcClusterConfig
)
{
private
generateFrameworkControllerJobConfig
(
trialJobId
:
string
,
trialWorkingFolder
:
string
,
frameworkcontrollerJobName
:
string
,
podResources
:
any
)
:
any
{
if
(
this
.
fcClusterConfig
===
undefined
)
{
throw
new
Error
(
'
frameworkcontroller Cluster config is not initialized
'
);
}
if
(
!
this
.
fcTrialConfig
)
{
if
(
this
.
fcTrialConfig
===
undefined
)
{
throw
new
Error
(
'
frameworkcontroller trial config is not initialized
'
);
}
le
t
taskRoles
=
[];
for
(
le
t
index
in
this
.
fcTrialConfig
.
taskRoles
)
{
le
t
containerPort
=
this
.
fcContainerPortMap
.
get
(
this
.
fcTrialConfig
.
taskRoles
[
index
].
name
);
if
(
!
containerPort
)
{
cons
t
taskRoles
:
any
=
[];
for
(
cons
t
index
of
this
.
fcTrialConfig
.
taskRoles
.
keys
()
)
{
cons
t
containerPort
:
number
|
undefined
=
this
.
fcContainerPortMap
.
get
(
this
.
fcTrialConfig
.
taskRoles
[
index
].
name
);
if
(
containerPort
===
undefined
)
{
throw
new
Error
(
'
Container port is not initialized
'
);
}
le
t
taskRole
=
this
.
generateTaskRoleConfig
(
trialWorkingFolder
,
this
.
fcTrialConfig
.
taskRoles
[
index
].
image
,
cons
t
taskRole
:
any
=
this
.
generateTaskRoleConfig
(
trialWorkingFolder
,
this
.
fcTrialConfig
.
taskRoles
[
index
].
image
,
`run_
${
this
.
fcTrialConfig
.
taskRoles
[
index
].
name
}
.sh`
,
podResources
[
index
],
containerPort
...
...
@@ -330,17 +349,17 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
name
:
this
.
fcTrialConfig
.
taskRoles
[
index
].
name
,
taskNumber
:
this
.
fcTrialConfig
.
taskRoles
[
index
].
taskNum
,
frameworkAttemptCompletionPolicy
:
{
minFailedTaskCount
:
this
.
fcTrialConfig
.
taskRoles
[
index
].
frameworkAttemptCompletionPolicy
.
minFailedTaskCount
,
minFailedTaskCount
:
this
.
fcTrialConfig
.
taskRoles
[
index
].
frameworkAttemptCompletionPolicy
.
minFailedTaskCount
,
minSucceededTaskCount
:
this
.
fcTrialConfig
.
taskRoles
[
index
].
frameworkAttemptCompletionPolicy
.
minSucceededTaskCount
},
task
:
taskRole
});
}
return
{
apiVersion
:
`frameworkcontroller.microsoft.com/v1`
,
kind
:
'
Framework
'
,
metadata
:
{
metadata
:
{
name
:
frameworkcontrollerJobName
,
namespace
:
'
default
'
,
labels
:
{
...
...
@@ -356,19 +375,18 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
};
}
private
generateTaskRoleConfig
(
trialWorkingFolder
:
string
,
replicaImage
:
string
,
runScriptFile
:
string
,
podResources
:
any
,
containerPort
:
number
):
any
{
if
(
!
this
.
fcClusterConfig
)
{
private
generateTaskRoleConfig
(
trialWorkingFolder
:
string
,
replicaImage
:
string
,
runScriptFile
:
string
,
podResources
:
any
,
containerPort
:
number
):
any
{
if
(
this
.
fcClusterConfig
===
undefined
)
{
throw
new
Error
(
'
frameworkcontroller Cluster config is not initialized
'
);
}
if
(
!
this
.
fcTrialConfig
)
{
if
(
this
.
fcTrialConfig
===
undefined
)
{
throw
new
Error
(
'
frameworkcontroller trial config is not initialized
'
);
}
le
t
volumeSpecMap
=
new
Map
<
string
,
object
>
();
if
(
this
.
fcClusterConfig
.
storageType
===
'
azureStorage
'
){
cons
t
volumeSpecMap
:
Map
<
string
,
object
>
=
new
Map
<
string
,
object
>
();
if
(
this
.
fcClusterConfig
.
storageType
===
'
azureStorage
'
)
{
volumeSpecMap
.
set
(
'
nniVolumes
'
,
[
{
name
:
'
nni-vol
'
,
...
...
@@ -380,9 +398,10 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
},
{
name
:
'
frameworkbarrier-volume
'
,
emptyDir
:
{}
}])
}
else
{
let
frameworkcontrollerClusterConfigNFS
:
FrameworkControllerClusterConfigNFS
=
<
FrameworkControllerClusterConfigNFS
>
this
.
fcClusterConfig
;
}]);
}
else
{
const
frameworkcontrollerClusterConfigNFS
:
FrameworkControllerClusterConfigNFS
=
<
FrameworkControllerClusterConfigNFS
>
this
.
fcClusterConfig
;
volumeSpecMap
.
set
(
'
nniVolumes
'
,
[
{
name
:
'
nni-vol
'
,
...
...
@@ -393,19 +412,19 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
},
{
name
:
'
frameworkbarrier-volume
'
,
emptyDir
:
{}
}])
}])
;
}
le
t
containers
=
[
cons
t
containers
:
any
=
[
{
name
:
'
framework
'
,
image
:
replicaImage
,
command
:
[
"
sh
"
,
`
${
path
.
join
(
trialWorkingFolder
,
runScriptFile
)}
`
],
command
:
[
'
sh
'
,
`
${
path
.
join
(
trialWorkingFolder
,
runScriptFile
)}
`
],
volumeMounts
:
[
{
name
:
'
nni-vol
'
,
mountPath
:
this
.
CONTAINER_MOUNT_PATH
},{
},
{
name
:
'
frameworkbarrier-volume
'
,
mountPath
:
'
/mnt/frameworkbarrier
'
}],
...
...
@@ -413,35 +432,36 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
ports
:
[{
containerPort
:
containerPort
}]
}]
}]
;
le
t
initContainers
=
[
cons
t
initContainers
:
any
=
[
{
name
:
'
frameworkbarrier
'
,
image
:
'
frameworkcontroller/frameworkbarrier
'
,
volumeMounts
:
[
{
{
name
:
'
frameworkbarrier-volume
'
,
mountPath
:
'
/mnt/frameworkbarrier
'
}]
}]
le
t
spec
:
any
=
{
}]
;
cons
t
spec
:
any
=
{
containers
:
containers
,
initContainers
:
initContainers
,
restartPolicy
:
'
OnFailure
'
,
volumes
:
volumeSpecMap
.
get
(
'
nniVolumes
'
),
hostNetwork
:
false
};
if
(
this
.
fcClusterConfig
.
serviceAccountName
)
{
spec
.
serviceAccountName
=
this
.
fcClusterConfig
.
serviceAccountName
;
if
(
this
.
fcClusterConfig
.
serviceAccountName
!==
undefined
)
{
spec
.
serviceAccountName
=
this
.
fcClusterConfig
.
serviceAccountName
;
}
let
taskRole
=
{
return
{
pod
:
{
spec
:
spec
}
}
return
taskRole
;
};
}
// tslint:enable: no-any no-unsafe-any
}
export
{
FrameworkControllerTrainingService
}
export
{
FrameworkControllerTrainingService
}
;
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowApiClient.ts
View file @
ba8dccd6
...
...
@@ -20,18 +20,22 @@
'
use strict
'
;
import
*
as
fs
from
'
fs
'
;
import
{
GeneralK8sClient
,
KubernetesCRDClient
}
from
'
../kubernetesApiClient
'
;
import
{
KubeflowOperator
}
from
'
./kubeflowConfig
'
;
import
{
KubernetesCRDClient
,
GeneralK8sClient
}
from
'
../kubernetesApiClient
'
;
abstract
class
KubeflowOperatorClient
extends
KubernetesCRDClient
{
/**
* KubeflowOperator Client
*/
abstract
class
KubeflowOperatorClient
extends
KubernetesCRDClient
{
/**
* Factory method to generate operator cliet
* Factory method to generate operator clie
n
t
*/
public
static
generateOperatorClient
(
kubeflowOperator
:
KubeflowOperator
,
operatorApiVersion
:
string
):
KubernetesCRDClient
{
switch
(
kubeflowOperator
)
{
// tslint:disable-next-line:function-name
public
static
generateOperatorClient
(
kubeflowOperator
:
KubeflowOperator
,
operatorApiVersion
:
string
):
KubernetesCRDClient
{
switch
(
kubeflowOperator
)
{
case
'
tf-operator
'
:
{
switch
(
operatorApiVersion
)
{
switch
(
operatorApiVersion
)
{
case
'
v1alpha2
'
:
{
return
new
TFOperatorClientV1Alpha2
();
}
...
...
@@ -41,11 +45,12 @@ abstract class KubeflowOperatorClient extends KubernetesCRDClient{
case
'
v1beta2
'
:
{
return
new
TFOperatorClientV1Beta2
();
}
default
:
throw
new
Error
(
`Invalid tf-operator apiVersion
${
operatorApiVersion
}
`
);
}
break
;
}
case
'
pytorch-operator
'
:
{
switch
(
operatorApiVersion
)
{
switch
(
operatorApiVersion
)
{
case
'
v1alpha2
'
:
{
return
new
PyTorchOperatorClientV1Alpha2
();
}
...
...
@@ -55,13 +60,17 @@ abstract class KubeflowOperatorClient extends KubernetesCRDClient{
case
'
v1beta2
'
:
{
return
new
PyTorchOperatorClientV1Beta2
();
}
default
:
throw
new
Error
(
`Invalid pytorch-operator apiVersion
${
operatorApiVersion
}
`
);
}
}
}
default
:
throw
new
Error
(
`Invalid operator
${
kubeflowOperator
}
`
);
}
throw
new
Error
(
`Invalid operator
${
kubeflowOperator
}
or apiVersion
${
operatorApiVersion
}
`
);
}
}
// tslint:disable: no-unsafe-any no-any completed-docs
class
TFOperatorClientV1Alpha2
extends
KubeflowOperatorClient
{
/**
* constructor, to initialize tfjob CRD definition
...
...
@@ -73,12 +82,12 @@ class TFOperatorClientV1Alpha2 extends KubeflowOperatorClient {
}
protected
get
operator
():
any
{
return
this
.
client
.
apis
[
"
kubeflow.org
"
].
v1alpha2
.
namespaces
(
'
default
'
).
tfjobs
;
return
this
.
client
.
apis
[
'
kubeflow.org
'
].
v1alpha2
.
namespaces
(
'
default
'
).
tfjobs
;
}
public
get
containerName
():
string
{
return
'
tensorflow
'
;
}
}
}
class
TFOperatorClientV1Beta1
extends
KubernetesCRDClient
{
...
...
@@ -92,12 +101,12 @@ class TFOperatorClientV1Beta1 extends KubernetesCRDClient {
}
protected
get
operator
():
any
{
return
this
.
client
.
apis
[
"
kubeflow.org
"
].
v1beta1
.
namespaces
(
'
default
'
).
tfjobs
;
return
this
.
client
.
apis
[
'
kubeflow.org
'
].
v1beta1
.
namespaces
(
'
default
'
).
tfjobs
;
}
public
get
containerName
():
string
{
return
'
tensorflow
'
;
}
}
}
class
TFOperatorClientV1Beta2
extends
KubernetesCRDClient
{
...
...
@@ -111,12 +120,12 @@ class TFOperatorClientV1Beta2 extends KubernetesCRDClient {
}
protected
get
operator
():
any
{
return
this
.
client
.
apis
[
"
kubeflow.org
"
].
v1beta2
.
namespaces
(
'
default
'
).
tfjobs
;
return
this
.
client
.
apis
[
'
kubeflow.org
'
].
v1beta2
.
namespaces
(
'
default
'
).
tfjobs
;
}
public
get
containerName
():
string
{
return
'
tensorflow
'
;
}
}
}
class
PyTorchOperatorClientV1Alpha2
extends
KubeflowOperatorClient
{
...
...
@@ -130,7 +139,7 @@ class PyTorchOperatorClientV1Alpha2 extends KubeflowOperatorClient {
}
protected
get
operator
():
any
{
return
this
.
client
.
apis
[
"
kubeflow.org
"
].
v1alpha2
.
namespaces
(
'
default
'
).
pytorchjobs
;
return
this
.
client
.
apis
[
'
kubeflow.org
'
].
v1alpha2
.
namespaces
(
'
default
'
).
pytorchjobs
;
}
public
get
containerName
():
string
{
...
...
@@ -149,7 +158,7 @@ class PyTorchOperatorClientV1Beta1 extends KubernetesCRDClient {
}
protected
get
operator
():
any
{
return
this
.
client
.
apis
[
"
kubeflow.org
"
].
v1beta1
.
namespaces
(
'
default
'
).
pytorchjobs
;
return
this
.
client
.
apis
[
'
kubeflow.org
'
].
v1beta1
.
namespaces
(
'
default
'
).
pytorchjobs
;
}
public
get
containerName
():
string
{
...
...
@@ -168,7 +177,7 @@ class PyTorchOperatorClientV1Beta2 extends KubernetesCRDClient {
}
protected
get
operator
():
any
{
return
this
.
client
.
apis
[
"
kubeflow.org
"
].
v1beta2
.
namespaces
(
'
default
'
).
pytorchjobs
;
return
this
.
client
.
apis
[
'
kubeflow.org
'
].
v1beta2
.
namespaces
(
'
default
'
).
pytorchjobs
;
}
public
get
containerName
():
string
{
...
...
@@ -176,5 +185,5 @@ class PyTorchOperatorClientV1Beta2 extends KubernetesCRDClient {
}
}
// tslint:enable: no-unsafe-any
export
{
KubeflowOperatorClient
,
GeneralK8sClient
};
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowConfig.ts
View file @
ba8dccd6
...
...
@@ -20,16 +20,20 @@
'
use strict
'
;
import
*
as
assert
from
'
assert
'
;
import
{
KubernetesClusterConfigAzure
,
KubernetesClusterConfigNFS
,
KubernetesStorageKind
,
NFSConfig
,
AzureStorage
,
keyVaultConfig
,
KubernetesTrialConfig
,
KubernetesTrialConfigTemplate
,
StorageConfig
,
KubernetesClusterConfig
}
from
'
../kubernetesConfig
'
import
{
MethodNotImplementedError
}
from
'
../../../common/errors
'
;
import
{
AzureStorage
,
KeyVaultConfig
,
KubernetesClusterConfig
,
KubernetesClusterConfigAzure
,
KubernetesClusterConfigNFS
,
KubernetesStorageKind
,
KubernetesTrialConfig
,
KubernetesTrialConfigTemplate
,
NFSConfig
,
StorageConfig
}
from
'
../kubernetesConfig
'
;
/
**
operator types that kubeflow supported
*/
/
/
operator types that kubeflow supported
export
type
KubeflowOperator
=
'
tf-operator
'
|
'
pytorch-operator
'
;
export
type
DistTrainRole
=
'
worker
'
|
'
ps
'
|
'
master
'
;
export
type
KubeflowJobStatus
=
'
Created
'
|
'
Running
'
|
'
Failed
'
|
'
Succeeded
'
;
export
type
OperatorApiVersion
=
'
v1alpha2
'
|
'
v1beta1
'
|
'
v1beta2
'
;
/**
* Kubeflow Cluster Configuration
*/
export
class
KubeflowClusterConfig
extends
KubernetesClusterConfig
{
public
readonly
operator
:
KubeflowOperator
;
constructor
(
apiVersion
:
string
,
operator
:
KubeflowOperator
)
{
...
...
@@ -38,11 +42,12 @@ export class KubeflowClusterConfig extends KubernetesClusterConfig {
}
}
// tslint:disable:completed-docs
export
class
KubeflowClusterConfigNFS
extends
KubernetesClusterConfigNFS
{
public
readonly
operator
:
KubeflowOperator
;
constructor
(
operator
:
KubeflowOperator
,
apiVersion
:
string
,
operator
:
KubeflowOperator
,
apiVersion
:
string
,
nfs
:
NFSConfig
,
storage
?:
KubernetesStorageKind
)
{
...
...
@@ -54,9 +59,11 @@ export class KubeflowClusterConfigNFS extends KubernetesClusterConfigNFS {
return
'
nfs
'
;
}
// tslint:disable-next-line:function-name
public
static
getInstance
(
jsonObject
:
object
):
KubeflowClusterConfigNFS
{
let
kubeflowClusterConfigObjectNFS
=
<
KubeflowClusterConfigNFS
>
jsonObject
;
assert
(
kubeflowClusterConfigObjectNFS
!==
undefined
)
const
kubeflowClusterConfigObjectNFS
:
KubeflowClusterConfigNFS
=
<
KubeflowClusterConfigNFS
>
jsonObject
;
assert
(
kubeflowClusterConfigObjectNFS
!==
undefined
);
return
new
KubeflowClusterConfigNFS
(
kubeflowClusterConfigObjectNFS
.
operator
,
kubeflowClusterConfigObjectNFS
.
apiVersion
,
...
...
@@ -66,26 +73,28 @@ export class KubeflowClusterConfigNFS extends KubernetesClusterConfigNFS {
}
}
export
class
KubeflowClusterConfigAzure
extends
KubernetesClusterConfigAzure
{
export
class
KubeflowClusterConfigAzure
extends
KubernetesClusterConfigAzure
{
public
readonly
operator
:
KubeflowOperator
;
constructor
(
operator
:
KubeflowOperator
,
apiVersion
:
string
,
keyVault
:
k
eyVaultConfig
,
azureStorage
:
AzureStorage
,
operator
:
KubeflowOperator
,
apiVersion
:
string
,
keyVault
:
K
eyVaultConfig
,
azureStorage
:
AzureStorage
,
storage
?:
KubernetesStorageKind
)
{
super
(
apiVersion
,
keyVault
,
azureStorage
,
storage
);
super
(
apiVersion
,
keyVault
,
azureStorage
,
storage
);
this
.
operator
=
operator
;
}
public
get
storageType
():
KubernetesStorageKind
{
public
get
storageType
():
KubernetesStorageKind
{
return
'
azureStorage
'
;
}
// tslint:disable-next-line:function-name
public
static
getInstance
(
jsonObject
:
object
):
KubeflowClusterConfigAzure
{
let
kubeflowClusterConfigObjectAzure
=
<
KubeflowClusterConfigAzure
>
jsonObject
;
const
kubeflowClusterConfigObjectAzure
:
KubeflowClusterConfigAzure
=
<
KubeflowClusterConfigAzure
>
jsonObject
;
return
new
KubeflowClusterConfigAzure
(
kubeflowClusterConfigObjectAzure
.
operator
,
kubeflowClusterConfigObjectAzure
.
apiVersion
,
...
...
@@ -98,12 +107,13 @@ export class KubeflowClusterConfigAzure extends KubernetesClusterConfigAzure{
export
class
KubeflowClusterConfigFactory
{
// tslint:disable-next-line:function-name
public
static
generateKubeflowClusterConfig
(
jsonObject
:
object
):
KubeflowClusterConfig
{
let
s
torageConfig
=
<
StorageConfig
>
jsonObject
;
if
(
!
storageConfig
)
{
throw
new
Error
(
"
Invalid json object as a StorageConfig instance
"
);
const
storageConfig
:
S
torageConfig
=
<
StorageConfig
>
jsonObject
;
if
(
storageConfig
===
undefined
)
{
throw
new
Error
(
'
Invalid json object as a StorageConfig instance
'
);
}
if
(
storageConfig
.
storage
&&
storageConfig
.
storage
===
'
azureStorage
'
)
{
if
(
storageConfig
.
storage
!==
undefined
&&
storageConfig
.
storage
===
'
azureStorage
'
)
{
return
KubeflowClusterConfigAzure
.
getInstance
(
jsonObject
);
}
else
if
(
storageConfig
.
storage
===
undefined
||
storageConfig
.
storage
===
'
nfs
'
)
{
return
KubeflowClusterConfigNFS
.
getInstance
(
jsonObject
);
...
...
@@ -122,10 +132,10 @@ export class KubeflowTrialConfig extends KubernetesTrialConfig {
}
}
export
class
KubeflowTrialConfigTemplate
extends
KubernetesTrialConfigTemplate
{
export
class
KubeflowTrialConfigTemplate
extends
KubernetesTrialConfigTemplate
{
public
readonly
replicas
:
number
;
constructor
(
replicas
:
number
,
command
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
)
{
constructor
(
replicas
:
number
,
command
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
)
{
super
(
command
,
gpuNum
,
cpuNum
,
memoryMB
,
image
);
this
.
replicas
=
replicas
;
}
...
...
@@ -163,22 +173,25 @@ export class KubeflowTrialConfigPytorch extends KubeflowTrialConfig {
export
class
KubeflowTrialConfigFactory
{
// tslint:disable-next-line:function-name
public
static
generateKubeflowTrialConfig
(
jsonObject
:
object
,
operator
:
KubeflowOperator
):
KubeflowTrialConfig
{
if
(
operator
===
'
tf-operator
'
){
let
kubeflowTrialConfigObject
=
<
KubeflowTrialConfigTensorflow
>
jsonObject
;
if
(
operator
===
'
tf-operator
'
)
{
const
kubeflowTrialConfigObject
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
jsonObject
;
return
new
KubeflowTrialConfigTensorflow
(
kubeflowTrialConfigObject
.
codeDir
,
kubeflowTrialConfigObject
.
worker
,
kubeflowTrialConfigObject
.
ps
);
}
else
if
(
operator
===
'
pytorch-operator
'
){
let
kubeflowTrialConfigObject
=
<
KubeflowTrialConfigPytorch
>
jsonObject
;
}
else
if
(
operator
===
'
pytorch-operator
'
)
{
const
kubeflowTrialConfigObject
:
KubeflowTrialConfigPytorch
=
<
KubeflowTrialConfigPytorch
>
jsonObject
;
return
new
KubeflowTrialConfigPytorch
(
kubeflowTrialConfigObject
.
codeDir
,
kubeflowTrialConfigObject
.
master
,
kubeflowTrialConfigObject
.
worker
);
}
throw
new
Error
(
`Invalid json object
${
jsonObject
}
`
);
throw
new
Error
(
`Invalid json object
${
jsonObject
}
`
);
}
}
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowJobInfoCollector.ts
View file @
ba8dccd6
...
...
@@ -19,65 +19,68 @@
'
use strict
'
;
import
{
KubernetesTrialJobDetail
}
from
'
../kubernetesData
'
;
import
{
KubernetesCRDClient
}
from
'
../kubernetesApiClient
'
;
import
{
KubernetesTrialJobDetail
}
from
'
../kubernetesData
'
;
import
{
KubernetesJobInfoCollector
}
from
'
../kubernetesJobInfoCollector
'
;
import
{
KubeflowJobStatus
}
from
'
./kubeflowConfig
'
;
/**
* Collector Kubeflow jobs info from Kubernetes cluster, and update kubeflow job status locally
*/
export
class
KubeflowJobInfoCollector
extends
KubernetesJobInfoCollector
{
export
class
KubeflowJobInfoCollector
extends
KubernetesJobInfoCollector
{
constructor
(
jobMap
:
Map
<
string
,
KubernetesTrialJobDetail
>
)
{
super
(
jobMap
);
}
protected
async
retrieveSingleTrialJobInfo
(
kubernetesCRDClient
:
KubernetesCRDClient
|
undefined
,
kubernetesTrialJob
:
KubernetesTrialJobDetail
)
:
Promise
<
void
>
{
protected
async
retrieveSingleTrialJobInfo
(
kubernetesCRDClient
:
KubernetesCRDClient
|
undefined
,
kubernetesTrialJob
:
KubernetesTrialJobDetail
)
:
Promise
<
void
>
{
if
(
!
this
.
statusesNeedToCheck
.
includes
(
kubernetesTrialJob
.
status
))
{
return
Promise
.
resolve
();
}
if
(
kubernetesCRDClient
===
undefined
)
{
if
(
kubernetesCRDClient
===
undefined
)
{
return
Promise
.
reject
(
'
kubernetesCRDClient is undefined
'
);
}
// tslint:disable:no-any no-unsafe-any
let
kubernetesJobInfo
:
any
;
try
{
kubernetesJobInfo
=
await
kubernetesCRDClient
.
getKubernetesJob
(
kubernetesTrialJob
.
kubernetesJobName
);
}
catch
(
error
)
{
// Notice: it maynot be a 'real' error since cancel trial job can also cause getKubernetesJob failed.
kubernetesJobInfo
=
await
kubernetesCRDClient
.
getKubernetesJob
(
kubernetesTrialJob
.
kubernetesJobName
);
}
catch
(
error
)
{
// Notice: it maynot be a 'real' error since cancel trial job can also cause getKubernetesJob failed.
this
.
log
.
error
(
`Get job
${
kubernetesTrialJob
.
kubernetesJobName
}
info failed, error is
${
error
}
`
);
//This is not treat as a error status
return
Promise
.
resolve
();
}
if
(
kubernetesJobInfo
.
status
&&
kubernetesJobInfo
.
status
.
conditions
)
{
const
latestCondition
=
kubernetesJobInfo
.
status
.
conditions
[
kubernetesJobInfo
.
status
.
conditions
.
length
-
1
];
if
(
kubernetesJobInfo
.
status
&&
kubernetesJobInfo
.
status
.
conditions
)
{
const
latestCondition
:
any
=
kubernetesJobInfo
.
status
.
conditions
[
kubernetesJobInfo
.
status
.
conditions
.
length
-
1
];
const
tfJobType
:
KubeflowJobStatus
=
<
KubeflowJobStatus
>
latestCondition
.
type
;
switch
(
tfJobType
)
{
switch
(
tfJobType
)
{
case
'
Created
'
:
kubernetesTrialJob
.
status
=
'
WAITING
'
;
kubernetesTrialJob
.
startTime
=
Date
.
parse
(
<
string
>
latestCondition
.
lastUpdateTime
);
break
;
kubernetesTrialJob
.
startTime
=
Date
.
parse
(
<
string
>
latestCondition
.
lastUpdateTime
);
break
;
case
'
Running
'
:
kubernetesTrialJob
.
status
=
'
RUNNING
'
;
if
(
!
kubernetesTrialJob
.
startTime
)
{
if
(
kubernetesTrialJob
.
startTime
===
undefined
)
{
kubernetesTrialJob
.
startTime
=
Date
.
parse
(
<
string
>
latestCondition
.
lastUpdateTime
);
}
break
;
case
'
Failed
'
:
kubernetesTrialJob
.
status
=
'
FAILED
'
;
kubernetesTrialJob
.
endTime
=
Date
.
parse
(
<
string
>
latestCondition
.
lastUpdateTime
);
kubernetesTrialJob
.
endTime
=
Date
.
parse
(
<
string
>
latestCondition
.
lastUpdateTime
);
break
;
case
'
Succeeded
'
:
kubernetesTrialJob
.
status
=
'
SUCCEEDED
'
;
kubernetesTrialJob
.
endTime
=
Date
.
parse
(
<
string
>
latestCondition
.
lastUpdateTime
);
kubernetesTrialJob
.
endTime
=
Date
.
parse
(
<
string
>
latestCondition
.
lastUpdateTime
);
break
;
default
:
break
;
}
}
// tslint:enable:no-any no-unsafe-any
return
Promise
.
resolve
();
}
}
\ No newline at end of file
}
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowJobRestServer.ts
View file @
ba8dccd6
...
...
@@ -20,19 +20,19 @@
'
use strict
'
;
import
*
as
component
from
'
../../../common/component
'
;
import
{
KubernetesJobRestServer
}
from
'
../kubernetesJobRestServer
'
;
import
{
KubeflowTrainingService
}
from
'
./kubeflowTrainingService
'
;
import
{
KubernetesJobRestServer
}
from
'
../kubernetesJobRestServer
'
/**
* Kubeflow Training service Rest server, provides rest API to support kubeflow job metrics update
*
*
*/
@
component
.
Singleton
export
class
KubeflowJobRestServer
extends
KubernetesJobRestServer
{
export
class
KubeflowJobRestServer
extends
KubernetesJobRestServer
{
/**
* constructor to provide NNIRestServer's own rest property, e.g. port
*/
constructor
()
{
super
(
component
.
get
(
KubeflowTrainingService
));
}
}
\ No newline at end of file
}
}
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts
View file @
ba8dccd6
...
...
@@ -17,35 +17,34 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'
use strict
'
'
use strict
'
;
import
*
as
assert
from
'
assert
'
;
import
*
as
component
from
'
../../../common/component
'
;
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
path
from
'
path
'
;
import
*
as
component
from
'
../../../common/component
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../../common/containerJobData
'
;
import
{
getExperimentId
}
from
'
../../../common/experimentStartupInfo
'
;
import
{
TrialConfigMetadataKey
}
from
'
../../common/trialConfigMetadataKey
'
;
import
{
JobApplicationForm
,
TrialJobApplicationForm
,
TrialJobDetail
,
NNIManagerIpConfig
JobApplicationForm
,
NNIManagerIpConfig
,
TrialJobApplicationForm
,
TrialJobDetail
}
from
'
../../../common/trainingService
'
;
import
{
delay
,
generateParamFileName
,
getExperimentRootDir
,
uniqueString
}
from
'
../../../common/utils
'
;
import
{
KubeflowClusterConfigNFS
,
KubeflowClusterConfigAzure
,
KubeflowTrialConfigPytorch
,
KubeflowTrialConfigTensorflow
,
KubeflowClusterConfigFactory
,
KubeflowTrialConfigFactory
,
KubeflowTrialConfig
,
KubeflowClusterConfig
}
from
'
./kubeflowConfig
'
;
import
{
NFSConfig
}
from
'
../kubernetesConfig
'
import
{
KubernetesTrialJobDetail
}
from
'
../kubernetesData
'
;
import
{
KubeflowJobRestServer
}
from
'
./kubeflowJobRestServer
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../../common/containerJobData
'
;
import
{
TrialConfigMetadataKey
}
from
'
../../common/trialConfigMetadataKey
'
;
import
{
validateCodeDir
}
from
'
../../common/util
'
;
import
{
AzureStorageClientUtility
}
from
'
../azureStorageClientUtils
'
;
import
{
NFSConfig
}
from
'
../kubernetesConfig
'
;
import
{
KubernetesTrialJobDetail
}
from
'
../kubernetesData
'
;
import
{
KubernetesTrainingService
}
from
'
../kubernetesTrainingService
'
;
import
{
KubeflowOperatorClient
}
from
'
./kubeflowApiClient
'
;
import
{
KubernetesTrainingService
}
from
'
../kubernetesTrainingService
'
import
{
KubeflowClusterConfig
,
KubeflowClusterConfigAzure
,
KubeflowClusterConfigFactory
,
KubeflowClusterConfigNFS
,
KubeflowTrialConfig
,
KubeflowTrialConfigFactory
,
KubeflowTrialConfigPytorch
,
KubeflowTrialConfigTensorflow
}
from
'
./kubeflowConfig
'
;
import
{
KubeflowJobInfoCollector
}
from
'
./kubeflowJobInfoCollector
'
;
import
{
KubeflowJobRestServer
}
from
'
./kubeflowJobRestServer
'
;
// tslint:disable: no-unsafe-any no-any
/**
* Training Service implementation for Kubeflow
* Refer https://github.com/kubeflow/kubeflow for more info about Kubeflow
...
...
@@ -54,12 +53,12 @@ import { KubeflowJobInfoCollector } from './kubeflowJobInfoCollector';
class
KubeflowTrainingService
extends
KubernetesTrainingService
implements
KubernetesTrainingService
{
private
kubeflowClusterConfig
?:
KubeflowClusterConfig
;
private
kubeflowTrialConfig
?:
KubeflowTrialConfig
;
private
kubeflowJobInfoCollector
:
KubeflowJobInfoCollector
;
private
readonly
kubeflowJobInfoCollector
:
KubeflowJobInfoCollector
;
constructor
()
{
super
();
super
();
this
.
kubeflowJobInfoCollector
=
new
KubeflowJobInfoCollector
(
this
.
trialJobsMap
);
this
.
experimentId
=
getExperimentId
();
this
.
experimentId
=
getExperimentId
();
this
.
nextTrialSequenceId
=
-
1
;
this
.
log
.
info
(
'
Construct Kubeflow training service.
'
);
}
...
...
@@ -67,17 +66,17 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
public
async
run
():
Promise
<
void
>
{
this
.
log
.
info
(
'
Run Kubeflow training service.
'
);
this
.
kubernetesJobRestServer
=
component
.
get
(
KubeflowJobRestServer
);
if
(
!
this
.
kubernetesJobRestServer
)
{
if
(
this
.
kubernetesJobRestServer
===
undefined
)
{
throw
new
Error
(
'
kubernetesJobRestServer not initialized!
'
);
}
await
this
.
kubernetesJobRestServer
.
start
();
this
.
kubernetesJobRestServer
.
setEnableVersionCheck
=
this
.
versionCheck
;
this
.
log
.
info
(
`Kubeflow Training service rest server listening on:
${
this
.
kubernetesJobRestServer
.
endPoint
}
`
);
while
(
!
this
.
stopping
)
{
// collect metrics for Kubeflow jobs by interacting with Kubernetes API server
// collect metrics for Kubeflow jobs by interacting with Kubernetes API server
await
delay
(
3000
);
await
this
.
kubeflowJobInfoCollector
.
retrieveTrialStatus
(
this
.
kubernetesCRDClient
);
if
(
this
.
kubernetesJobRestServer
.
getErrorMessage
)
{
if
(
this
.
kubernetesJobRestServer
.
getErrorMessage
!==
undefined
)
{
throw
new
Error
(
this
.
kubernetesJobRestServer
.
getErrorMessage
);
this
.
stopping
=
true
;
}
...
...
@@ -86,17 +85,17 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
}
public
async
submitTrialJob
(
form
:
JobApplicationForm
):
Promise
<
TrialJobDetail
>
{
if
(
!
this
.
kubernetesCRDClient
)
{
if
(
this
.
kubernetesCRDClient
===
undefined
)
{
throw
new
Error
(
'
Kubeflow job operator client is undefined
'
);
}
if
(
!
this
.
kubernetesRestServerPort
)
{
if
(
this
.
kubernetesRestServerPort
===
undefined
)
{
const
restServer
:
KubeflowJobRestServer
=
component
.
get
(
KubeflowJobRestServer
);
this
.
kubernetesRestServerPort
=
restServer
.
clusterRestServerPort
;
}
const
trialJobId
:
string
=
uniqueString
(
5
);
const
trialWorkingFolder
:
string
=
path
.
join
(
this
.
CONTAINER_MOUNT_PATH
,
'
nni
'
,
getExperimentId
(),
trialJobId
);
const
kubeflowJobName
=
`nni-exp-
${
this
.
experimentId
}
-trial-
${
trialJobId
}
`
.
toLowerCase
();
const
kubeflowJobName
:
string
=
`nni-exp-
${
this
.
experimentId
}
-trial-
${
trialJobId
}
`
.
toLowerCase
();
const
curTrialSequenceId
:
number
=
this
.
generateSequenceId
();
const
trialLocalTempFolder
:
string
=
path
.
join
(
getExperimentRootDir
(),
'
trials-local
'
,
trialJobId
);
//prepare the runscript
...
...
@@ -113,226 +112,239 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
curTrialSequenceId
,
trialJobOutputUrl
);
// Generate kubeflow job resource config object
// Generate kubeflow job resource config object
const
kubeflowJobConfig
:
any
=
await
this
.
prepareKubeflowConfig
(
trialJobId
,
trialWorkingFolder
,
kubeflowJobName
);
// Create kubeflow job based on generated kubeflow job resource config
await
this
.
kubernetesCRDClient
.
createKubernetesJob
(
kubeflowJobConfig
);
// Set trial job detail until create Kubeflow job successfully
// Set trial job detail until create Kubeflow job successfully
this
.
trialJobsMap
.
set
(
trialJobId
,
trialJobDetail
);
return
Promise
.
resolve
(
trialJobDetail
);
}
// tslint:disable:no-redundant-jsdoc
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
switch
(
key
)
{
case
TrialConfigMetadataKey
.
NNI_MANAGER_IP
:
this
.
nniManagerIpConfig
=
<
NNIManagerIpConfig
>
JSON
.
parse
(
value
);
break
;
case
TrialConfigMetadataKey
.
KUBEFLOW_CLUSTER_CONFIG
:
const
kubeflowClusterJsonObject
:
object
=
JSON
.
parse
(
value
);
this
.
kubeflowClusterConfig
=
KubeflowClusterConfigFactory
.
generateKubeflowClusterConfig
(
kubeflowClusterJsonObject
);
if
(
this
.
kubeflowClusterConfig
.
storageType
===
'
azureStorage
'
)
{
const
azureKubeflowClusterConfig
:
KubeflowClusterConfigAzure
=
<
KubeflowClusterConfigAzure
>
this
.
kubeflowClusterConfig
;
this
.
azureStorageAccountName
=
azureKubeflowClusterConfig
.
azureStorage
.
accountName
;
this
.
azureStorageShare
=
azureKubeflowClusterConfig
.
azureStorage
.
azureShare
;
await
this
.
createAzureStorage
(
azureKubeflowClusterConfig
.
keyVault
.
vaultName
,
azureKubeflowClusterConfig
.
keyVault
.
name
,
azureKubeflowClusterConfig
.
azureStorage
.
accountName
,
azureKubeflowClusterConfig
.
azureStorage
.
azureShare
);
}
else
if
(
this
.
kubeflowClusterConfig
.
storageType
===
'
nfs
'
)
{
const
nfsKubeflowClusterConfig
:
KubeflowClusterConfigNFS
=
<
KubeflowClusterConfigNFS
>
this
.
kubeflowClusterConfig
;
await
this
.
createNFSStorage
(
nfsKubeflowClusterConfig
.
nfs
.
server
,
nfsKubeflowClusterConfig
.
nfs
.
path
);
}
this
.
kubernetesCRDClient
=
KubeflowOperatorClient
.
generateOperatorClient
(
this
.
kubeflowClusterConfig
.
operator
,
this
.
kubeflowClusterConfig
.
apiVersion
);
break
;
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
if
(
this
.
kubeflowClusterConfig
===
undefined
)
{
this
.
log
.
error
(
'
kubeflow cluster config is not initialized
'
);
return
Promise
.
reject
(
new
Error
(
'
kubeflow cluster config is not initialized
'
));
}
assert
(
this
.
kubeflowClusterConfig
!==
undefined
);
const
kubeflowTrialJsonObjsect
:
object
=
JSON
.
parse
(
value
);
this
.
kubeflowTrialConfig
=
KubeflowTrialConfigFactory
.
generateKubeflowTrialConfig
(
kubeflowTrialJsonObjsect
,
this
.
kubeflowClusterConfig
.
operator
);
// Validate to make sure codeDir doesn't have too many files
try
{
await
validateCodeDir
(
this
.
kubeflowTrialConfig
.
codeDir
);
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
return
Promise
.
reject
(
new
Error
(
error
));
}
break
;
case
TrialConfigMetadataKey
.
VERSION_CHECK
:
this
.
versionCheck
=
(
value
===
'
true
'
||
value
===
'
True
'
);
break
;
case
TrialConfigMetadataKey
.
LOG_COLLECTION
:
this
.
logCollection
=
value
;
break
;
default
:
}
return
Promise
.
resolve
();
}
/**
* upload code files to nfs or azureStroage
* @param trialJobId
* @param trialLocalTempFolder
* @param trialJobId
* @param trialLocalTempFolder
* return: trialJobOutputUrl
*/
private
async
uploadCodeFiles
(
trialJobId
:
string
,
trialLocalTempFolder
:
string
):
Promise
<
string
>
{
if
(
!
this
.
kubeflowClusterConfig
)
{
if
(
this
.
kubeflowClusterConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
}
let
trialJobOutputUrl
:
string
=
''
;
assert
(
!
this
.
kubeflowClusterConfig
.
storage
||
this
.
kubeflowClusterConfig
.
storage
===
'
azureStorage
'
assert
(
this
.
kubeflowClusterConfig
.
storage
===
undefined
||
this
.
kubeflowClusterConfig
.
storage
===
'
azureStorage
'
||
this
.
kubeflowClusterConfig
.
storage
===
'
nfs
'
);
if
(
this
.
kubeflowClusterConfig
.
storage
===
'
azureStorage
'
)
{
try
{
if
(
this
.
kubeflowClusterConfig
.
storage
===
'
azureStorage
'
)
{
if
(
this
.
azureStorageClient
===
undefined
)
{
throw
new
Error
(
'
azureStorageClient is not initialized
'
);
}
try
{
//upload local files to azure storage
await
AzureStorageClientUtility
.
uploadDirectory
(
this
.
azureStorageClient
,
`nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
,
this
.
azureStorageShare
,
`
${
trialLocalTempFolder
}
`
);
await
AzureStorageClientUtility
.
uploadDirectory
(
this
.
azureStorageClient
,
`nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
,
this
.
azureStorageShare
,
`
${
trialLocalTempFolder
}
`
);
trialJobOutputUrl
=
`https://
${
this
.
azureStorageAccountName
}
.file.core.windows.net/
${
this
.
azureStorageShare
}
/
${
path
.
join
(
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
}
catch
(
error
){
trialJobOutputUrl
=
`https://
${
this
.
azureStorageAccountName
}
.file.core.windows.net/
${
this
.
azureStorageShare
}
\
/
${
path
.
join
(
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
;
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
return
Promise
.
reject
(
error
);
}
}
else
if
(
this
.
kubeflowClusterConfig
.
storage
===
'
nfs
'
||
this
.
kubeflowClusterConfig
.
storage
===
undefined
)
{
le
t
nfsKubeflowClusterConfig
:
KubeflowClusterConfigNFS
=
<
KubeflowClusterConfigNFS
>
this
.
kubeflowClusterConfig
;
// Creat work dir for current trial in NFS directory
}
else
if
(
this
.
kubeflowClusterConfig
.
storage
===
'
nfs
'
||
this
.
kubeflowClusterConfig
.
storage
===
undefined
)
{
cons
t
nfsKubeflowClusterConfig
:
KubeflowClusterConfigNFS
=
<
KubeflowClusterConfigNFS
>
this
.
kubeflowClusterConfig
;
// Creat work dir for current trial in NFS directory
await
cpp
.
exec
(
`mkdir -p
${
this
.
trialLocalNFSTempFolder
}
/nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
);
// Copy code files from local dir to NFS mounted dir
await
cpp
.
exec
(
`cp -r
${
trialLocalTempFolder
}
/*
${
this
.
trialLocalNFSTempFolder
}
/nni/
${
getExperimentId
()}
/
${
trialJobId
}
/.`
);
const
nfsConfig
:
NFSConfig
=
nfsKubeflowClusterConfig
.
nfs
;
trialJobOutputUrl
=
`nfs://
${
nfsConfig
.
server
}
:
${
path
.
join
(
nfsConfig
.
path
,
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
trialJobOutputUrl
=
`nfs://
${
nfsConfig
.
server
}
:
${
path
.
join
(
nfsConfig
.
path
,
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
;
}
return
Promise
.
resolve
(
trialJobOutputUrl
);
}
private
async
prepareRunScript
(
trialLocalTempFolder
:
string
,
trialJobId
:
string
,
trialWorkingFolder
:
string
,
curTrialSequenceId
:
number
,
form
:
JobApplicationForm
):
Promise
<
void
>
{
if
(
!
this
.
kubeflowClusterConfig
)
{
private
async
prepareRunScript
(
trialLocalTempFolder
:
string
,
trialJobId
:
string
,
trialWorkingFolder
:
string
,
curTrialSequenceId
:
number
,
form
:
JobApplicationForm
):
Promise
<
void
>
{
if
(
this
.
kubeflowClusterConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
}
// initialize kubeflow trial config to specific type
let
kubeflowTrialConfig
;
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
let
kubeflowTrialConfig
:
any
;
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
kubeflowTrialConfig
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
}
else
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
pytorch-operator
'
){
}
else
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
pytorch-operator
'
)
{
kubeflowTrialConfig
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
}
else
{
throw
Error
(
`operator
${
this
.
kubeflowClusterConfig
.
operator
}
is invalid`
)
}
else
{
throw
Error
(
`operator
${
this
.
kubeflowClusterConfig
.
operator
}
is invalid`
)
;
}
//create tmp trial working folder locally.
await
cpp
.
exec
(
`mkdir -p
${
path
.
dirname
(
trialLocalTempFolder
)}
`
);
await
cpp
.
exec
(
`cp -r
${
kubeflowTrialConfig
.
codeDir
}
${
trialLocalTempFolder
}
`
);
const
runScriptContent
:
string
=
CONTAINER_INSTALL_NNI_SHELL_FORMAT
;
// Write NNI installation file to local tmp files
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
install_nni.sh
'
),
runScriptContent
,
{
encoding
:
'
utf8
'
});
// Create tmp trial working folder locally.
await
cpp
.
exec
(
`mkdir -p
${
trialLocalTempFolder
}
`
);
// Write worker file content run_worker.sh to local tmp folders
if
(
kubeflowTrialConfig
.
worker
)
{
const
workerRunScriptContent
:
string
=
await
this
.
generateRunScript
(
'
kubeflow
'
,
trialJobId
,
trialWorkingFolder
,
kubeflowTrialConfig
.
worker
.
command
,
curTrialSequenceId
.
toString
(),
'
worker
'
,
kubeflowTrialConfig
.
worker
.
gpuNum
);
//create tmp trial working folder locally.
await
cpp
.
exec
(
`mkdir -p
${
path
.
dirname
(
trialLocalTempFolder
)}
`
);
await
cpp
.
exec
(
`cp -r
${
kubeflowTrialConfig
.
codeDir
}
${
trialLocalTempFolder
}
`
);
const
runScriptContent
:
string
=
CONTAINER_INSTALL_NNI_SHELL_FORMAT
;
// Write NNI installation file to local tmp files
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
install_nni.sh
'
),
runScriptContent
,
{
encoding
:
'
utf8
'
});
// Create tmp trial working folder locally.
await
cpp
.
exec
(
`mkdir -p
${
trialLocalTempFolder
}
`
);
// Write worker file content run_worker.sh to local tmp folders
if
(
kubeflowTrialConfig
.
worker
!==
undefined
)
{
const
workerRunScriptContent
:
string
=
await
this
.
generateRunScript
(
'
kubeflow
'
,
trialJobId
,
trialWorkingFolder
,
kubeflowTrialConfig
.
worker
.
command
,
curTrialSequenceId
.
toString
(),
'
worker
'
,
kubeflowTrialConfig
.
worker
.
gpuNum
);
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
run_worker.sh
'
),
workerRunScriptContent
,
{
encoding
:
'
utf8
'
});
}
// Write parameter server file content run_ps.sh to local tmp folders
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
let
tensorflowTrialConfig
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
if
(
tensorflowTrialConfig
.
ps
){
const
psRunScriptContent
:
string
=
await
this
.
generateRunScript
(
'
kubeflow
'
,
trialJobId
,
trialWorkingFolder
,
tensorflowTrialConfig
.
ps
.
command
,
curTrialSequenceId
.
toString
(),
'
ps
'
,
tensorflowTrialConfig
.
ps
.
gpuNum
);
}
// Write parameter server file content run_ps.sh to local tmp folders
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
const
tensorflowTrialConfig
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
if
(
tensorflowTrialConfig
.
ps
!==
undefined
)
{
const
psRunScriptContent
:
string
=
await
this
.
generateRunScript
(
'
kubeflow
'
,
trialJobId
,
trialWorkingFolder
,
tensorflowTrialConfig
.
ps
.
command
,
curTrialSequenceId
.
toString
(),
'
ps
'
,
tensorflowTrialConfig
.
ps
.
gpuNum
);
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
run_ps.sh
'
),
psRunScriptContent
,
{
encoding
:
'
utf8
'
});
}
}
else
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
pytorch-operator
'
)
{
let
pytorchTrialConfig
:
KubeflowTrialConfigPytorch
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
if
(
pytorchTrialConfig
.
master
){
const
masterRunScriptContent
:
string
=
await
this
.
generateRunScript
(
'
kubeflow
'
,
trialJobId
,
trialWorkingFolder
,
pytorchTrialConfig
.
master
.
command
,
curTrialSequenceId
.
toString
(),
'
master
'
,
pytorchTrialConfig
.
master
.
gpuNum
);
}
else
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
pytorch-operator
'
)
{
const
pytorchTrialConfig
:
KubeflowTrialConfigPytorch
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
if
(
pytorchTrialConfig
.
master
!==
undefined
)
{
const
masterRunScriptContent
:
string
=
await
this
.
generateRunScript
(
'
kubeflow
'
,
trialJobId
,
trialWorkingFolder
,
pytorchTrialConfig
.
master
.
command
,
curTrialSequenceId
.
toString
(),
'
master
'
,
pytorchTrialConfig
.
master
.
gpuNum
);
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
run_master.sh
'
),
masterRunScriptContent
,
{
encoding
:
'
utf8
'
});
}
}
// Write file content ( parameter.cfg ) to local tmp folders
const
trialForm
:
TrialJobApplicationForm
=
(
<
TrialJobApplicationForm
>
form
)
if
(
trialForm
&&
trialForm
.
hyperParameters
)
{
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
generateParamFileName
(
trialForm
.
hyperParameters
)),
trialForm
.
hyperParameters
.
value
,
{
encoding
:
'
utf8
'
});
}
}
// Write file content ( parameter.cfg ) to local tmp folders
const
trialForm
:
TrialJobApplicationForm
=
(
<
TrialJobApplicationForm
>
form
)
;
if
(
trialForm
!==
undefined
&&
trialForm
.
hyperParameters
!==
undefined
)
{
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
generateParamFileName
(
trialForm
.
hyperParameters
)),
trialForm
.
hyperParameters
.
value
,
{
encoding
:
'
utf8
'
});
}
}
private
async
prepareKubeflowConfig
(
trialJobId
:
string
,
trialWorkingFolder
:
string
,
kubeflowJobName
:
string
):
Promise
<
any
>
{
if
(
!
this
.
kubeflowClusterConfig
)
{
if
(
this
.
kubeflowClusterConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
}
if
(
!
this
.
kubeflowTrialConfig
)
{
if
(
this
.
kubeflowTrialConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow trial config is not initialized
'
);
}
// initialize kubeflow trial config to specific type
let
kubeflowTrialConfig
;
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
let
kubeflowTrialConfig
:
any
;
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
kubeflowTrialConfig
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
}
else
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
pytorch-operator
'
){
}
else
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
pytorch-operator
'
)
{
kubeflowTrialConfig
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
}
else
{
throw
Error
(
`operator
${
this
.
kubeflowClusterConfig
.
operator
}
is invalid`
)
}
else
{
throw
Error
(
`operator
${
this
.
kubeflowClusterConfig
.
operator
}
is invalid`
)
;
}
const
workerPodResources
:
any
=
{};
if
(
kubeflowTrialConfig
.
worker
)
{
workerPodResources
.
requests
=
this
.
generatePodResource
(
kubeflowTrialConfig
.
worker
.
memoryMB
,
kubeflowTrialConfig
.
worker
.
cpuNum
,
kubeflowTrialConfig
.
worker
.
gpuNum
)
if
(
kubeflowTrialConfig
.
worker
!==
undefined
)
{
workerPodResources
.
requests
=
this
.
generatePodResource
(
kubeflowTrialConfig
.
worker
.
memoryMB
,
kubeflowTrialConfig
.
worker
.
cpuNum
,
kubeflowTrialConfig
.
worker
.
gpuNum
)
;
}
workerPodResources
.
limits
=
Object
.
assign
({},
workerPodResources
.
requests
)
;
le
t
nonWorkerResources
:
any
=
{};
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
le
t
tensorflowTrialConfig
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
if
(
tensorflowTrialConfig
.
ps
)
{
nonWorkerResources
.
requests
=
this
.
generatePodResource
(
tensorflowTrialConfig
.
ps
.
memoryMB
,
tensorflowTrialConfig
.
ps
.
cpuNum
,
tensorflowTrialConfig
.
ps
.
gpuNum
)
nonWorkerResources
.
limits
=
Object
.
assign
({},
nonWorkerResources
.
requests
);
workerPodResources
.
limits
=
{...
workerPodResources
.
requests
}
;
cons
t
nonWorkerResources
:
any
=
{};
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
cons
t
tensorflowTrialConfig
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
if
(
tensorflowTrialConfig
.
ps
!==
undefined
)
{
nonWorkerResources
.
requests
=
this
.
generatePodResource
(
tensorflowTrialConfig
.
ps
.
memoryMB
,
tensorflowTrialConfig
.
ps
.
cpuNum
,
tensorflowTrialConfig
.
ps
.
gpuNum
)
;
nonWorkerResources
.
limits
=
{...
nonWorkerResources
.
requests
};
}
}
else
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
pytorch-operator
'
){
let
pyTorchTrialConfig
:
KubeflowTrialConfigPytorch
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
nonWorkerResources
.
requests
=
this
.
generatePodResource
(
pyTorchTrialConfig
.
master
.
memoryMB
,
pyTorchTrialConfig
.
master
.
cpuNum
,
pyTorchTrialConfig
.
master
.
gpuNum
)
nonWorkerResources
.
limits
=
Object
.
assign
({},
nonWorkerResources
.
requests
);
}
}
else
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
pytorch-operator
'
)
{
const
pyTorchTrialConfig
:
KubeflowTrialConfigPytorch
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
nonWorkerResources
.
requests
=
this
.
generatePodResource
(
pyTorchTrialConfig
.
master
.
memoryMB
,
pyTorchTrialConfig
.
master
.
cpuNum
,
pyTorchTrialConfig
.
master
.
gpuNum
);
nonWorkerResources
.
limits
=
{...
nonWorkerResources
.
requests
};
}
// Generate kubeflow job resource config object
const
kubeflowJobConfig
:
any
=
this
.
generateKubeflowJobConfig
(
trialJobId
,
trialWorkingFolder
,
kubeflowJobName
,
workerPodResources
,
nonWorkerResources
);
// Generate kubeflow job resource config object
const
kubeflowJobConfig
:
any
=
this
.
generateKubeflowJobConfig
(
trialJobId
,
trialWorkingFolder
,
kubeflowJobName
,
workerPodResources
,
nonWorkerResources
);
return
Promise
.
resolve
(
kubeflowJobConfig
);
}
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
switch
(
key
)
{
case
TrialConfigMetadataKey
.
NNI_MANAGER_IP
:
this
.
nniManagerIpConfig
=
<
NNIManagerIpConfig
>
JSON
.
parse
(
value
);
break
;
case
TrialConfigMetadataKey
.
KUBEFLOW_CLUSTER_CONFIG
:
let
kubeflowClusterJsonObject
=
JSON
.
parse
(
value
);
this
.
kubeflowClusterConfig
=
KubeflowClusterConfigFactory
.
generateKubeflowClusterConfig
(
kubeflowClusterJsonObject
);
if
(
this
.
kubeflowClusterConfig
.
storageType
===
'
azureStorage
'
)
{
let
azureKubeflowClusterConfig
=
<
KubeflowClusterConfigAzure
>
this
.
kubeflowClusterConfig
;
this
.
azureStorageAccountName
=
azureKubeflowClusterConfig
.
azureStorage
.
accountName
;
this
.
azureStorageShare
=
azureKubeflowClusterConfig
.
azureStorage
.
azureShare
;
await
this
.
createAzureStorage
(
azureKubeflowClusterConfig
.
keyVault
.
vaultName
,
azureKubeflowClusterConfig
.
keyVault
.
name
,
azureKubeflowClusterConfig
.
azureStorage
.
accountName
,
azureKubeflowClusterConfig
.
azureStorage
.
azureShare
);
}
else
if
(
this
.
kubeflowClusterConfig
.
storageType
===
'
nfs
'
)
{
let
nfsKubeflowClusterConfig
=
<
KubeflowClusterConfigNFS
>
this
.
kubeflowClusterConfig
;
await
this
.
createNFSStorage
(
nfsKubeflowClusterConfig
.
nfs
.
server
,
nfsKubeflowClusterConfig
.
nfs
.
path
);
}
this
.
kubernetesCRDClient
=
KubeflowOperatorClient
.
generateOperatorClient
(
this
.
kubeflowClusterConfig
.
operator
,
this
.
kubeflowClusterConfig
.
apiVersion
);
break
;
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
if
(
!
this
.
kubeflowClusterConfig
){
this
.
log
.
error
(
'
kubeflow cluster config is not initialized
'
);
return
Promise
.
reject
(
new
Error
(
'
kubeflow cluster config is not initialized
'
));
}
assert
(
this
.
kubeflowClusterConfig
!==
undefined
)
let
kubeflowTrialJsonObjsect
=
JSON
.
parse
(
value
);
this
.
kubeflowTrialConfig
=
KubeflowTrialConfigFactory
.
generateKubeflowTrialConfig
(
kubeflowTrialJsonObjsect
,
this
.
kubeflowClusterConfig
.
operator
);
// Validate to make sure codeDir doesn't have too many files
try
{
await
validateCodeDir
(
this
.
kubeflowTrialConfig
.
codeDir
);
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
return
Promise
.
reject
(
new
Error
(
error
));
}
break
;
case
TrialConfigMetadataKey
.
VERSION_CHECK
:
this
.
versionCheck
=
(
value
===
'
true
'
||
value
===
'
True
'
);
break
;
case
TrialConfigMetadataKey
.
LOG_COLLECTION
:
this
.
logCollection
=
value
;
break
;
default
:
break
;
}
return
Promise
.
resolve
();
}
/**
...
...
@@ -343,49 +355,48 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
* @param workerPodResources worker pod template
* @param nonWorkerPodResources non-worker pod template, like ps or master
*/
private
generateKubeflowJobConfig
(
trialJobId
:
string
,
trialWorkingFolder
:
string
,
kubeflowJobName
:
string
,
workerPodResources
:
any
,
nonWorkerPodResources
?:
any
)
:
any
{
if
(
!
this
.
kubeflowClusterConfig
)
{
private
generateKubeflowJobConfig
(
trialJobId
:
string
,
trialWorkingFolder
:
string
,
kubeflowJobName
:
string
,
workerPodResources
:
any
,
nonWorkerPodResources
?:
any
)
:
any
{
if
(
this
.
kubeflowClusterConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
}
if
(
!
this
.
kubeflowTrialConfig
)
{
if
(
this
.
kubeflowTrialConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow trial config is not initialized
'
);
}
if
(
!
this
.
kubernetesCRDClient
)
{
if
(
this
.
kubernetesCRDClient
===
undefined
)
{
throw
new
Error
(
'
Kubeflow operator client is not initialized
'
);
}
const
replicaSpecsObj
:
any
=
{};
let
replicaSpecsObjMap
=
new
Map
<
string
,
object
>
();
if
(
this
.
kubeflowTrialConfig
.
operatorType
===
'
tf-operator
'
)
{
let
tensorflowTrialConfig
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
replicaSpecsObj
.
Worker
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
tensorflowTrialConfig
.
worker
.
replicas
,
tensorflowTrialConfig
.
worker
.
image
,
'
run_worker.sh
'
,
workerPodResources
);
if
(
tensorflowTrialConfig
.
ps
){
replicaSpecsObj
.
Ps
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
tensorflowTrialConfig
.
ps
.
replicas
,
tensorflowTrialConfig
.
ps
.
image
,
'
run_ps.sh
'
,
nonWorkerPodResources
);
const
replicaSpecsObjMap
:
Map
<
string
,
object
>
=
new
Map
<
string
,
object
>
();
if
(
this
.
kubeflowTrialConfig
.
operatorType
===
'
tf-operator
'
)
{
const
tensorflowTrialConfig
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
replicaSpecsObj
.
Worker
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
tensorflowTrialConfig
.
worker
.
replicas
,
tensorflowTrialConfig
.
worker
.
image
,
'
run_worker.sh
'
,
workerPodResources
);
if
(
tensorflowTrialConfig
.
ps
!==
undefined
)
{
replicaSpecsObj
.
Ps
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
tensorflowTrialConfig
.
ps
.
replicas
,
tensorflowTrialConfig
.
ps
.
image
,
'
run_ps.sh
'
,
nonWorkerPodResources
);
}
replicaSpecsObjMap
.
set
(
this
.
kubernetesCRDClient
.
jobKind
,
{
'
tfReplicaSpecs
'
:
replicaSpecsObj
})
}
else
if
(
this
.
kubeflowTrialConfig
.
operatorType
===
'
pytorch-operator
'
)
{
let
pytorchTrialConfig
:
KubeflowTrialConfigPytorch
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
if
(
pytorchTrialConfig
.
worker
)
{
replicaSpecsObj
.
Worker
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
pytorchTrialConfig
.
worker
.
replicas
,
pytorchTrialConfig
.
worker
.
image
,
'
run_worker.sh
'
,
workerPodResources
);
replicaSpecsObjMap
.
set
(
this
.
kubernetesCRDClient
.
jobKind
,
{
tfReplicaSpecs
:
replicaSpecsObj
});
}
else
if
(
this
.
kubeflowTrialConfig
.
operatorType
===
'
pytorch-operator
'
)
{
const
pytorchTrialConfig
:
KubeflowTrialConfigPytorch
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
if
(
pytorchTrialConfig
.
worker
!==
undefined
)
{
replicaSpecsObj
.
Worker
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
pytorchTrialConfig
.
worker
.
replicas
,
pytorchTrialConfig
.
worker
.
image
,
'
run_worker.sh
'
,
workerPodResources
);
}
replicaSpecsObj
.
Master
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
pytorchTrialConfig
.
master
.
replicas
,
pytorchTrialConfig
.
master
.
image
,
'
run_master.sh
'
,
nonWorkerPodResources
);
replicaSpecsObjMap
.
set
(
this
.
kubernetesCRDClient
.
jobKind
,
{
'
pytorchReplicaSpecs
'
:
replicaSpecsObj
})
replicaSpecsObj
.
Master
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
pytorchTrialConfig
.
master
.
replicas
,
pytorchTrialConfig
.
master
.
image
,
'
run_master.sh
'
,
nonWorkerPodResources
);
replicaSpecsObjMap
.
set
(
this
.
kubernetesCRDClient
.
jobKind
,
{
pytorchReplicaSpecs
:
replicaSpecsObj
})
;
}
return
{
apiVersion
:
`kubeflow.org/
${
this
.
kubernetesCRDClient
.
apiVersion
}
`
,
kind
:
this
.
kubernetesCRDClient
.
jobKind
,
metadata
:
{
metadata
:
{
name
:
kubeflowJobName
,
namespace
:
'
default
'
,
labels
:
{
...
...
@@ -395,7 +406,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
}
},
spec
:
replicaSpecsObjMap
.
get
(
this
.
kubernetesCRDClient
.
jobKind
)
};
};
}
/**
...
...
@@ -406,21 +417,22 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
* @param runScriptFile script file name
* @param podResources pod resource config section
*/
private
generateReplicaConfig
(
trialWorkingFolder
:
string
,
replicaNumber
:
number
,
replicaImage
:
string
,
runScriptFile
:
string
,
podResources
:
any
):
any
{
if
(
!
this
.
kubeflowClusterConfig
)
{
private
generateReplicaConfig
(
trialWorkingFolder
:
string
,
replicaNumber
:
number
,
replicaImage
:
string
,
runScriptFile
:
string
,
podResources
:
any
):
any
{
if
(
this
.
kubeflowClusterConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
}
if
(
!
this
.
kubeflowTrialConfig
)
{
if
(
this
.
kubeflowTrialConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow trial config is not initialized
'
);
}
if
(
!
this
.
kubernetesCRDClient
)
{
if
(
this
.
kubernetesCRDClient
===
undefined
)
{
throw
new
Error
(
'
Kubeflow operator client is not initialized
'
);
}
le
t
volumeSpecMap
=
new
Map
<
string
,
object
>
();
if
(
this
.
kubeflowClusterConfig
.
storageType
===
'
azureStorage
'
){
cons
t
volumeSpecMap
:
Map
<
string
,
object
>
=
new
Map
<
string
,
object
>
();
if
(
this
.
kubeflowClusterConfig
.
storageType
===
'
azureStorage
'
)
{
volumeSpecMap
.
set
(
'
nniVolumes
'
,
[
{
name
:
'
nni-vol
'
,
...
...
@@ -429,9 +441,9 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
shareName
:
`
${
this
.
azureStorageShare
}
`
,
readonly
:
false
}
}])
}
else
{
le
t
nfsKubeflowClusterConfig
:
KubeflowClusterConfigNFS
=
<
KubeflowClusterConfigNFS
>
this
.
kubeflowClusterConfig
;
}])
;
}
else
{
cons
t
nfsKubeflowClusterConfig
:
KubeflowClusterConfigNFS
=
<
KubeflowClusterConfigNFS
>
this
.
kubeflowClusterConfig
;
volumeSpecMap
.
set
(
'
nniVolumes
'
,
[
{
name
:
'
nni-vol
'
,
...
...
@@ -439,13 +451,14 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
server
:
`
${
nfsKubeflowClusterConfig
.
nfs
.
server
}
`
,
path
:
`
${
nfsKubeflowClusterConfig
.
nfs
.
path
}
`
}
}])
}])
;
}
return
{
replicas
:
replicaNumber
,
template
:
{
metadata
:
{
// tslint:disable-next-line:no-null-keyword
creationTimestamp
:
null
},
spec
:
{
...
...
@@ -455,7 +468,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
// TODO: change the name based on operator's type
name
:
this
.
kubernetesCRDClient
.
containerName
,
image
:
replicaImage
,
args
:
[
"
sh
"
,
`
${
path
.
join
(
trialWorkingFolder
,
runScriptFile
)}
`
],
args
:
[
'
sh
'
,
`
${
path
.
join
(
trialWorkingFolder
,
runScriptFile
)}
`
],
volumeMounts
:
[
{
name
:
'
nni-vol
'
,
...
...
@@ -470,5 +483,5 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
};
}
}
export
{
KubeflowTrainingService
}
// tslint:enable: no-unsafe-any no-any
export
{
KubeflowTrainingService
}
;
src/nni_manager/training_service/kubernetes/kubernetesApiClient.ts
View file @
ba8dccd6
...
...
@@ -19,44 +19,46 @@
'
use strict
'
;
import
*
as
os
from
'
os
'
import
*
as
path
from
'
path
'
;
import
{
Client1_10
,
config
}
from
'
kubernetes-client
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
var
K8SClient
=
require
(
'
kubernetes-client
'
).
Client
;
var
K8SConfig
=
require
(
'
kubernetes-client
'
).
config
;
/**
* Generict Kubernetes client, target version >= 1.9
*/
// tslint:disable: no-any no-unsafe-any
class
GeneralK8sClient
{
protected
readonly
client
:
any
;
protected
readonly
log
:
Logger
=
getLogger
();
constructor
()
{
this
.
client
=
new
K8S
Client
({
config
:
K8SC
onfig
.
fromKubeconfig
(),
version
:
'
1.9
'
});
this
.
client
=
new
Client
1_10
({
config
:
c
onfig
.
fromKubeconfig
(),
version
:
'
1.9
'
});
this
.
client
.
loadSpec
();
}
public
async
createSecret
(
secretManifest
:
any
):
Promise
<
boolean
>
{
let
result
:
Promise
<
boolean
>
;
const
response
:
any
=
await
this
.
client
.
api
.
v1
.
namespaces
(
'
default
'
).
secrets
.
post
({
body
:
secretManifest
});
if
(
response
.
statusCode
&&
(
response
.
statusCode
>=
200
&&
response
.
statusCode
<=
299
))
{
let
result
:
Promise
<
boolean
>
;
const
response
:
any
=
await
this
.
client
.
api
.
v1
.
namespaces
(
'
default
'
).
secrets
.
post
({
body
:
secretManifest
});
if
(
response
.
statusCode
&&
(
response
.
statusCode
>=
200
&&
response
.
statusCode
<=
299
))
{
result
=
Promise
.
resolve
(
true
);
}
else
{
result
=
Promise
.
reject
(
`Create secrets failed, statusCode is
${
response
.
statusCode
}
`
);
}
return
result
;
}
}
/**
* Kubernetes CRD client
*/
abstract
class
KubernetesCRDClient
{
protected
readonly
client
:
any
;
protected
readonly
log
:
Logger
=
getLogger
();
protected
crdSchema
:
any
;
constructor
()
{
this
.
client
=
new
K8S
Client
({
config
:
K8SC
onfig
.
fromKubeconfig
()
});
this
.
client
=
new
Client
1_10
({
config
:
c
onfig
.
fromKubeconfig
()
});
this
.
client
.
loadSpec
();
}
...
...
@@ -65,8 +67,8 @@ abstract class KubernetesCRDClient {
public
abstract
get
containerName
():
string
;
public
get
jobKind
():
string
{
if
(
this
.
crdSchema
&&
this
.
crdSchema
.
spec
if
(
this
.
crdSchema
&&
this
.
crdSchema
.
spec
&&
this
.
crdSchema
.
spec
.
names
&&
this
.
crdSchema
.
spec
.
names
.
kind
)
{
return
this
.
crdSchema
.
spec
.
names
.
kind
;
...
...
@@ -76,55 +78,62 @@ abstract class KubernetesCRDClient {
}
public
get
apiVersion
():
string
{
if
(
this
.
crdSchema
&&
this
.
crdSchema
.
spec
if
(
this
.
crdSchema
&&
this
.
crdSchema
.
spec
&&
this
.
crdSchema
.
spec
.
version
)
{
return
this
.
crdSchema
.
spec
.
version
;
}
else
{
throw
new
Error
(
'
KubeflowOperatorClient: get apiVersion failed, version is undefined in crd schema!
'
);
}
}
public
async
createKubernetesJob
(
jobManifest
:
any
):
Promise
<
boolean
>
{
let
result
:
Promise
<
boolean
>
;
const
response
:
any
=
await
this
.
operator
.
post
({
body
:
jobManifest
});
if
(
response
.
statusCode
&&
(
response
.
statusCode
>=
200
&&
response
.
statusCode
<=
299
))
{
if
(
response
.
statusCode
&&
(
response
.
statusCode
>=
200
&&
response
.
statusCode
<=
299
))
{
result
=
Promise
.
resolve
(
true
);
}
else
{
result
=
Promise
.
reject
(
`Create kubernetes job failed, statusCode is
${
response
.
statusCode
}
`
);
}
return
result
;
}
//TODO : replace any
public
async
getKubernetesJob
(
kubeflowJobName
:
string
):
Promise
<
any
>
{
let
result
:
Promise
<
any
>
;
const
response
:
any
=
await
this
.
operator
(
kubeflowJobName
).
get
();
if
(
response
.
statusCode
&&
(
response
.
statusCode
>=
200
&&
response
.
statusCode
<=
299
))
{
const
response
:
any
=
await
this
.
operator
(
kubeflowJobName
)
.
get
();
if
(
response
.
statusCode
&&
(
response
.
statusCode
>=
200
&&
response
.
statusCode
<=
299
))
{
result
=
Promise
.
resolve
(
response
.
body
);
}
else
{
result
=
Promise
.
reject
(
`KubeflowOperatorClient get tfjobs failed, statusCode is
${
response
.
statusCode
}
`
);
}
return
result
;
}
public
async
deleteKubernetesJob
(
labels
:
Map
<
string
,
string
>
):
Promise
<
boolean
>
{
let
result
:
Promise
<
boolean
>
;
// construct match query from labels for deleting tfjob
const
matchQuery
:
string
=
Array
.
from
(
labels
.
keys
()).
map
(
labelKey
=>
`
${
labelKey
}
=
${
labels
.
get
(
labelKey
)}
`
).
join
(
'
,
'
);
const
matchQuery
:
string
=
Array
.
from
(
labels
.
keys
())
.
map
((
labelKey
:
string
)
=>
`
${
labelKey
}
=
${
labels
.
get
(
labelKey
)}
`
)
.
join
(
'
,
'
);
try
{
const
deleteResult
:
any
=
await
this
.
operator
().
delete
({
const
deleteResult
:
any
=
await
this
.
operator
()
.
delete
({
qs
:
{
labelSelector
:
matchQuery
,
propagationPolicy
:
"
Background
"
}
propagationPolicy
:
'
Background
'
}
});
if
(
deleteResult
.
statusCode
&&
deleteResult
.
statusCode
>=
200
&&
deleteResult
.
statusCode
<=
299
)
{
if
(
deleteResult
.
statusCode
&&
deleteResult
.
statusCode
>=
200
&&
deleteResult
.
statusCode
<=
299
)
{
result
=
Promise
.
resolve
(
true
);
}
else
{
result
=
Promise
.
reject
(
`KubeflowOperatorClient, delete labels
${
matchQuery
}
get wrong statusCode
${
deleteResult
.
statusCode
}
`
);
result
=
Promise
.
reject
(
`KubeflowOperatorClient, delete labels
${
matchQuery
}
get wrong statusCode
${
deleteResult
.
statusCode
}
`
);
}
}
catch
(
err
)
{
}
catch
(
err
)
{
result
=
Promise
.
reject
(
err
);
}
...
...
src/nni_manager/training_service/kubernetes/kubernetesConfig.ts
View file @
ba8dccd6
...
...
@@ -22,16 +22,17 @@
export
type
KubernetesStorageKind
=
'
nfs
'
|
'
azureStorage
'
;
import
{
MethodNotImplementedError
}
from
'
../../common/errors
'
;
// tslint:disable: completed-docs function-name
export
abstract
class
KubernetesClusterConfig
{
public
readonly
storage
?:
KubernetesStorageKind
;
public
readonly
apiVersion
:
string
;
constructor
(
apiVersion
:
string
,
storage
?:
KubernetesStorageKind
)
{
this
.
storage
=
storage
;
this
.
apiVersion
=
apiVersion
;
}
public
get
storageType
():
KubernetesStorageKind
{
public
get
storageType
():
KubernetesStorageKind
{
throw
new
MethodNotImplementedError
();
}
}
...
...
@@ -48,7 +49,7 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig {
public
readonly
nfs
:
NFSConfig
;
constructor
(
apiVersion
:
string
,
apiVersion
:
string
,
nfs
:
NFSConfig
,
storage
?:
KubernetesStorageKind
)
{
...
...
@@ -56,12 +57,13 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig {
this
.
nfs
=
nfs
;
}
public
get
storageType
():
KubernetesStorageKind
{
public
get
storageType
():
KubernetesStorageKind
{
return
'
nfs
'
;
}
public
static
getInstance
(
jsonObject
:
object
):
KubernetesClusterConfigNFS
{
let
kubernetesClusterConfigObjectNFS
=
<
KubernetesClusterConfigNFS
>
jsonObject
;
const
kubernetesClusterConfigObjectNFS
:
KubernetesClusterConfigNFS
=
<
KubernetesClusterConfigNFS
>
jsonObject
;
return
new
KubernetesClusterConfigNFS
(
kubernetesClusterConfigObjectNFS
.
apiVersion
,
kubernetesClusterConfigObjectNFS
.
nfs
,
...
...
@@ -71,13 +73,13 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig {
}
export
class
KubernetesClusterConfigAzure
extends
KubernetesClusterConfig
{
public
readonly
keyVault
:
k
eyVaultConfig
;
public
readonly
keyVault
:
K
eyVaultConfig
;
public
readonly
azureStorage
:
AzureStorage
;
constructor
(
apiVersion
:
string
,
keyVault
:
k
eyVaultConfig
,
azureStorage
:
AzureStorage
,
apiVersion
:
string
,
keyVault
:
K
eyVaultConfig
,
azureStorage
:
AzureStorage
,
storage
?:
KubernetesStorageKind
)
{
super
(
apiVersion
,
storage
);
...
...
@@ -85,12 +87,13 @@ export class KubernetesClusterConfigAzure extends KubernetesClusterConfig {
this
.
azureStorage
=
azureStorage
;
}
public
get
storageType
():
KubernetesStorageKind
{
public
get
storageType
():
KubernetesStorageKind
{
return
'
azureStorage
'
;
}
public
static
getInstance
(
jsonObject
:
object
):
KubernetesClusterConfigAzure
{
let
kubernetesClusterConfigObjectAzure
=
<
KubernetesClusterConfigAzure
>
jsonObject
;
const
kubernetesClusterConfigObjectAzure
:
KubernetesClusterConfigAzure
=
<
KubernetesClusterConfigAzure
>
jsonObject
;
return
new
KubernetesClusterConfigAzure
(
kubernetesClusterConfigObjectAzure
.
apiVersion
,
kubernetesClusterConfigObjectAzure
.
keyVault
,
...
...
@@ -100,17 +103,20 @@ export class KubernetesClusterConfigAzure extends KubernetesClusterConfig {
}
}
// tslint:disable-next-line:no-unnecessary-class
export
class
KubernetesClusterConfigFactory
{
public
static
generateKubernetesClusterConfig
(
jsonObject
:
object
):
KubernetesClusterConfig
{
let
s
torageConfig
=
<
StorageConfig
>
jsonObject
;
switch
(
storageConfig
.
storage
)
{
const
storageConfig
:
S
torageConfig
=
<
StorageConfig
>
jsonObject
;
switch
(
storageConfig
.
storage
)
{
case
'
azureStorage
'
:
return
KubernetesClusterConfigAzure
.
getInstance
(
jsonObject
);
case
'
nfs
'
||
undefined
:
case
'
nfs
'
:
case
undefined
:
return
KubernetesClusterConfigNFS
.
getInstance
(
jsonObject
);
default
:
throw
new
Error
(
`Invalid json object
${
jsonObject
}
`
);
}
throw
new
Error
(
`Invalid json object
${
jsonObject
}
`
);
}
}
...
...
@@ -118,9 +124,9 @@ export class KubernetesClusterConfigFactory {
* NFS configuration to store Kubeflow job related files
*/
export
class
NFSConfig
{
/
**
IP Adress of NFS server
*/
/
/
IP Adress of NFS server
public
readonly
server
:
string
;
/
**
exported NFS path on NFS server
*/
/
/
exported NFS path on NFS server
public
readonly
path
:
string
;
constructor
(
server
:
string
,
path
:
string
)
{
...
...
@@ -133,13 +139,13 @@ export class NFSConfig {
* KeyVault configuration to store the key of Azure Storage Service
* Refer https://docs.microsoft.com/en-us/azure/key-vault/key-vault-manage-with-cli2
*/
export
class
k
eyVaultConfig
{
/
**
The vault-name to specify vault
*/
export
class
K
eyVaultConfig
{
/
/
The vault-name to specify vault
public
readonly
vaultName
:
string
;
/
**
The name to specify private key
*/
/
/
The name to specify private key
public
readonly
name
:
string
;
constructor
(
vaultName
:
string
,
name
:
string
){
constructor
(
vaultName
:
string
,
name
:
string
)
{
this
.
vaultName
=
vaultName
;
this
.
name
=
name
;
}
...
...
@@ -149,12 +155,12 @@ export class keyVaultConfig {
* Azure Storage Service
*/
export
class
AzureStorage
{
/
**
The azure share to storage files
*/
/
/
The azure share to storage files
public
readonly
azureShare
:
string
;
/
**
The account name of sotrage service
*/
/
/
The account name of sotrage service
public
readonly
accountName
:
string
;
constructor
(
azureShare
:
string
,
accountName
:
string
){
constructor
(
azureShare
:
string
,
accountName
:
string
)
{
this
.
azureShare
=
azureShare
;
this
.
accountName
=
accountName
;
}
...
...
@@ -164,23 +170,23 @@ export class AzureStorage {
* Trial job configuration for Kubernetes
*/
export
class
KubernetesTrialConfigTemplate
{
/
**
CPU number
*/
/
/
CPU number
public
readonly
cpuNum
:
number
;
/
**
Memory
*/
/
/
Memory
public
readonly
memoryMB
:
number
;
/
**
Docker image
*/
/
/
Docker image
public
readonly
image
:
string
;
/
**
Trail command
*/
/
/
Trail command
public
readonly
command
:
string
;
/
**
Required GPU number for trial job. The number should be in [0,100]
*/
/
/
Required GPU number for trial job. The number should be in [0,100]
public
readonly
gpuNum
:
number
;
constructor
(
command
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
)
{
constructor
(
command
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
)
{
this
.
command
=
command
;
this
.
gpuNum
=
gpuNum
;
this
.
cpuNum
=
cpuNum
;
...
...
@@ -195,4 +201,4 @@ export class KubernetesTrialConfig {
constructor
(
codeDir
:
string
)
{
this
.
codeDir
=
codeDir
;
}
}
\ No newline at end of file
}
src/nni_manager/training_service/kubernetes/kubernetesData.ts
View file @
ba8dccd6
...
...
@@ -24,7 +24,6 @@ import { JobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../commo
/**
* KubeflowTrialJobDetail
*/
// tslint:disable-next-line:max-classes-per-file
export
class
KubernetesTrialJobDetail
implements
TrialJobDetail
{
public
id
:
string
;
public
status
:
TrialJobStatus
;
...
...
@@ -40,7 +39,7 @@ export class KubernetesTrialJobDetail implements TrialJobDetail {
public
queryJobFailedCount
:
number
;
constructor
(
id
:
string
,
status
:
TrialJobStatus
,
submitTime
:
number
,
workingDirectory
:
string
,
form
:
JobApplicationForm
,
workingDirectory
:
string
,
form
:
JobApplicationForm
,
kubernetesJobName
:
string
,
sequenceId
:
number
,
url
:
string
)
{
this
.
id
=
id
;
this
.
status
=
status
;
...
...
@@ -55,7 +54,7 @@ export class KubernetesTrialJobDetail implements TrialJobDetail {
}
}
export
const
K
ubernetesScriptFormat
=
export
const
k
ubernetesScriptFormat
:
string
=
`#!/bin/bash
export NNI_PLATFORM={0}
export NNI_SYS_DIR=$PWD/nni/{1}
...
...
@@ -71,5 +70,5 @@ mkdir -p $NNI_OUTPUT_DIR
cp -rT $NNI_CODE_DIR $NNI_SYS_DIR
cd $NNI_SYS_DIR
sh install_nni.sh
python3 -m nni_trial_tool.trial_keeper --trial_command '{8}' --nnimanager_ip {9} --nnimanager_port {10}
--nni_manager_version '{11}' --log_collection '{12}'`
+
`
1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr`
python3 -m nni_trial_tool.trial_keeper --trial_command '{8}' --nnimanager_ip {9} --nnimanager_port {10}
\
--nni_manager_version '{11}' --log_collection '{12}'
1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr`
;
src/nni_manager/training_service/kubernetes/kubernetesJobInfoCollector.ts
View file @
ba8dccd6
...
...
@@ -20,11 +20,10 @@
'
use strict
'
;
import
*
as
assert
from
'
assert
'
;
import
{
MethodNotImplementedError
,
NNIError
,
NNIErrorNames
}
from
'
../../common/errors
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
NNIError
,
NNIErrorNames
}
from
'
../../common/errors
'
;
import
{
TrialJobStatus
}
from
'
../../common/trainingService
'
;
import
{
KubernetesCRDClient
}
from
'
./kubernetesApiClient
'
;
import
{
MethodNotImplementedError
}
from
'
../../common/errors
'
;
import
{
KubernetesTrialJobDetail
}
from
'
./kubernetesData
'
;
/**
...
...
@@ -43,22 +42,22 @@ export class KubernetesJobInfoCollector {
public
async
retrieveTrialStatus
(
kubernetesCRDClient
:
KubernetesCRDClient
|
undefined
)
:
Promise
<
void
>
{
assert
(
kubernetesCRDClient
!==
undefined
);
const
updateKubernetesTrialJobs
:
Promise
<
void
>
[]
=
[];
for
(
le
t
[
trialJobId
,
kubernetesTrialJob
]
of
this
.
trialJobsMap
)
{
if
(
!
kubernetesTrialJob
)
{
for
(
cons
t
[
trialJobId
,
kubernetesTrialJob
]
of
this
.
trialJobsMap
)
{
if
(
kubernetesTrialJob
===
undefined
)
{
throw
new
NNIError
(
NNIErrorNames
.
NOT_FOUND
,
`trial job id
${
trialJobId
}
not found`
);
}
// Since Kubeflow needs some delay to schedule jobs, we provide 20 seconds buffer time to check kubeflow job's status
if
(
Date
.
now
()
-
kubernetesTrialJob
.
submitTime
<
20
*
1000
)
{
if
(
Date
.
now
()
-
kubernetesTrialJob
.
submitTime
<
20
*
1000
)
{
return
Promise
.
resolve
();
}
updateKubernetesTrialJobs
.
push
(
this
.
retrieveSingleTrialJobInfo
(
kubernetesCRDClient
,
kubernetesTrialJob
))
updateKubernetesTrialJobs
.
push
(
this
.
retrieveSingleTrialJobInfo
(
kubernetesCRDClient
,
kubernetesTrialJob
))
;
}
await
Promise
.
all
(
updateKubernetesTrialJobs
);
}
protected
async
retrieveSingleTrialJobInfo
(
kubernetesCRDClient
:
KubernetesCRDClient
|
undefined
,
kubernetesTrialJob
:
KubernetesTrialJobDetail
)
:
Promise
<
void
>
{
protected
async
retrieveSingleTrialJobInfo
(
kubernetesCRDClient
:
KubernetesCRDClient
|
undefined
,
kubernetesTrialJob
:
KubernetesTrialJobDetail
)
:
Promise
<
void
>
{
throw
new
MethodNotImplementedError
();
}
}
\ No newline at end of file
}
src/nni_manager/training_service/kubernetes/kubernetesJobRestServer.ts
View file @
ba8dccd6
...
...
@@ -19,19 +19,19 @@
'
use strict
'
;
import
*
as
component
from
'
../../common/component
'
;
import
{
Inject
}
from
'
typescript-ioc
'
;
import
*
as
component
from
'
../../common/component
'
;
import
{
ClusterJobRestServer
}
from
'
../common/clusterJobRestServer
'
;
import
{
KubernetesTrainingService
}
from
'
./kubernetesTrainingService
'
;
import
{
ClusterJobRestServer
}
from
'
../common/clusterJobRestServer
'
/**
* Kubeflow Training service Rest server, provides rest API to support kubeflow job metrics update
*
*
*/
@
component
.
Singleton
export
class
KubernetesJobRestServer
extends
ClusterJobRestServer
{
export
class
KubernetesJobRestServer
extends
ClusterJobRestServer
{
@
Inject
private
kubernetesTrainingService
?
:
KubernetesTrainingService
;
private
readonly
kubernetesTrainingService
?
:
KubernetesTrainingService
;
/**
* constructor to provide NNIRestServer's own rest property, e.g. port
...
...
@@ -41,8 +41,9 @@ export class KubernetesJobRestServer extends ClusterJobRestServer{
this
.
kubernetesTrainingService
=
kubernetesTrainingService
;
}
// tslint:disable-next-line:no-any
protected
handleTrialMetrics
(
jobId
:
string
,
metrics
:
any
[])
:
void
{
if
(
!
this
.
kubernetesTrainingService
)
{
if
(
this
.
kubernetesTrainingService
===
undefined
)
{
throw
Error
(
'
kubernetesTrainingService not initialized!
'
);
}
// Split metrics array into single metric, then emit
...
...
@@ -53,5 +54,5 @@ export class KubernetesJobRestServer extends ClusterJobRestServer{
data
:
singleMetric
});
}
}
}
\ No newline at end of file
}
}
Prev
1
2
3
4
5
6
7
8
9
10
11
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment