Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
ba8dccd6
Commit
ba8dccd6
authored
Jun 23, 2019
by
suiguoxin
Browse files
Merge branch 'master' of
https://github.com/microsoft/nni
parents
56a1575b
150ee83a
Changes
208
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
870 additions
and
726 deletions
+870
-726
src/nni_manager/training_service/common/gpuData.ts
src/nni_manager/training_service/common/gpuData.ts
+4
-4
src/nni_manager/training_service/common/jobMetrics.ts
src/nni_manager/training_service/common/jobMetrics.ts
+4
-1
src/nni_manager/training_service/common/trialConfig.ts
src/nni_manager/training_service/common/trialConfig.ts
+4
-4
src/nni_manager/training_service/common/util.ts
src/nni_manager/training_service/common/util.ts
+57
-40
src/nni_manager/training_service/kubernetes/azureStorageClientUtils.ts
...er/training_service/kubernetes/azureStorageClientUtils.ts
+109
-79
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerApiClient.ts
...netes/frameworkcontroller/frameworkcontrollerApiClient.ts
+15
-6
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerConfig.ts
...bernetes/frameworkcontroller/frameworkcontrollerConfig.ts
+30
-25
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerJobInfoCollector.ts
...rameworkcontroller/frameworkcontrollerJobInfoCollector.ts
+26
-18
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerJobRestServer.ts
...s/frameworkcontroller/frameworkcontrollerJobRestServer.ts
+5
-5
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts
...frameworkcontroller/frameworkcontrollerTrainingService.ts
+191
-171
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowApiClient.ts
...training_service/kubernetes/kubeflow/kubeflowApiClient.ts
+30
-21
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowConfig.ts
...er/training_service/kubernetes/kubeflow/kubeflowConfig.ts
+41
-28
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowJobInfoCollector.ts
...g_service/kubernetes/kubeflow/kubeflowJobInfoCollector.ts
+21
-18
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowJobRestServer.ts
...ning_service/kubernetes/kubeflow/kubeflowJobRestServer.ts
+5
-5
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts
...ng_service/kubernetes/kubeflow/kubeflowTrainingService.ts
+232
-219
src/nni_manager/training_service/kubernetes/kubernetesApiClient.ts
...anager/training_service/kubernetes/kubernetesApiClient.ts
+34
-25
src/nni_manager/training_service/kubernetes/kubernetesConfig.ts
...i_manager/training_service/kubernetes/kubernetesConfig.ts
+41
-35
src/nni_manager/training_service/kubernetes/kubernetesData.ts
...nni_manager/training_service/kubernetes/kubernetesData.ts
+4
-5
src/nni_manager/training_service/kubernetes/kubernetesJobInfoCollector.ts
...training_service/kubernetes/kubernetesJobInfoCollector.ts
+8
-9
src/nni_manager/training_service/kubernetes/kubernetesJobRestServer.ts
...er/training_service/kubernetes/kubernetesJobRestServer.ts
+9
-8
No files found.
src/nni_manager/training_service/common/gpuData.ts
View file @
ba8dccd6
...
@@ -59,17 +59,17 @@ export class GPUSummary {
...
@@ -59,17 +59,17 @@ export class GPUSummary {
}
}
}
}
export
const
GPU_INFO_COLLECTOR_FORMAT_LINUX
:
string
=
export
const
GPU_INFO_COLLECTOR_FORMAT_LINUX
:
string
=
`
`
#!/bin/bash
#!/bin/bash
export METRIC_OUTPUT_DIR={0}
export METRIC_OUTPUT_DIR={0}
echo $$ >{1}
echo $$ >{1}
python3 -m nni_gpu_tool.gpu_metrics_collector
python3 -m nni_gpu_tool.gpu_metrics_collector
`
`
;
export
const
GPU_INFO_COLLECTOR_FORMAT_WINDOWS
:
string
=
export
const
GPU_INFO_COLLECTOR_FORMAT_WINDOWS
:
string
=
`
`
$env:METRIC_OUTPUT_DIR="{0}"
$env:METRIC_OUTPUT_DIR="{0}"
$app = Start-Process "python" -ArgumentList "-m nni_gpu_tool.gpu_metrics_collector" -passthru -NoNewWindow
$app = Start-Process "python" -ArgumentList "-m nni_gpu_tool.gpu_metrics_collector" -passthru -NoNewWindow
Write $app.ID | Out-File {1} -NoNewline -encoding utf8
Write $app.ID | Out-File {1} -NoNewline -encoding utf8
`
`
;
\ No newline at end of file
src/nni_manager/training_service/common/jobMetrics.ts
View file @
ba8dccd6
...
@@ -21,7 +21,10 @@
...
@@ -21,7 +21,10 @@
import
{
TrialJobStatus
}
from
'
../../common/trainingService
'
;
import
{
TrialJobStatus
}
from
'
../../common/trainingService
'
;
// tslint:disable-next-line:max-classes-per-file
/**
* Trial job metrics class
* Representing trial job metrics properties
*/
export
class
JobMetrics
{
export
class
JobMetrics
{
public
readonly
jobId
:
string
;
public
readonly
jobId
:
string
;
public
readonly
metrics
:
string
[];
public
readonly
metrics
:
string
[];
...
...
src/nni_manager/training_service/common/trialConfig.ts
View file @
ba8dccd6
...
@@ -24,13 +24,13 @@
...
@@ -24,13 +24,13 @@
* Representing trial job configurable properties
* Representing trial job configurable properties
*/
*/
export
class
TrialConfig
{
export
class
TrialConfig
{
/
**
Trail command
*/
/
/
Trail command
public
readonly
command
:
string
;
public
readonly
command
:
string
;
/
**
Code directory
*/
/
/
Code directory
public
readonly
codeDir
:
string
;
public
readonly
codeDir
:
string
;
/
**
Required GPU number for trial job. The number should be in [0,100]
*/
/
/
Required GPU number for trial job. The number should be in [0,100]
public
readonly
gpuNum
:
number
;
public
readonly
gpuNum
:
number
;
/**
/**
...
@@ -44,4 +44,4 @@ export class TrialConfig {
...
@@ -44,4 +44,4 @@ export class TrialConfig {
this
.
codeDir
=
codeDir
;
this
.
codeDir
=
codeDir
;
this
.
gpuNum
=
gpuNum
;
this
.
gpuNum
=
gpuNum
;
}
}
}
}
\ No newline at end of file
src/nni_manager/training_service/common/util.ts
View file @
ba8dccd6
import
{
getLogger
}
from
"
common/log
"
;
/**
/**
* Copyright (c) Microsoft Corporation
* Copyright (c) Microsoft Corporation
* All rights reserved.
* All rights reserved.
...
@@ -21,44 +19,55 @@ import { getLogger } from "common/log";
...
@@ -21,44 +19,55 @@ import { getLogger } from "common/log";
'
use strict
'
;
'
use strict
'
;
import
{
countFilesRecursively
}
from
'
../../common/utils
'
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
cp
from
'
child_process
'
;
import
*
as
cp
from
'
child_process
'
;
import
*
as
os
from
'
os
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
fs
from
'
fs
'
;
import
{
getNewLine
}
from
'
../../common/utils
'
;
import
*
as
os
from
'
os
'
;
import
{
GPU_INFO_COLLECTOR_FORMAT_LINUX
,
GPU_INFO_COLLECTOR_FORMAT_WINDOWS
}
from
'
./gpuData
'
;
import
*
as
path
from
'
path
'
;
import
*
as
path
from
'
path
'
;
import
{
String
}
from
'
typescript-string-operations
'
;
import
{
String
}
from
'
typescript-string-operations
'
;
import
{
file
}
from
"
../../node_modules/@types/tmp
"
;
import
{
countFilesRecursively
,
getNewLine
,
validateFileNameRecursively
}
from
'
../../common/utils
'
;
import
{
file
}
from
'
../../node_modules/@types/tmp
'
;
import
{
GPU_INFO_COLLECTOR_FORMAT_LINUX
,
GPU_INFO_COLLECTOR_FORMAT_WINDOWS
}
from
'
./gpuData
'
;
/**
/**
* Validate codeDir, calculate file count recursively under codeDir, and throw error if any rule is broken
* Validate codeDir, calculate file count recursively under codeDir, and throw error if any rule is broken
*
*
* @param codeDir codeDir in nni config file
* @param codeDir codeDir in nni config file
* @returns file number under codeDir
* @returns file number under codeDir
*/
*/
// tslint:disable: no-redundant-jsdoc
export
async
function
validateCodeDir
(
codeDir
:
string
)
:
Promise
<
number
>
{
export
async
function
validateCodeDir
(
codeDir
:
string
)
:
Promise
<
number
>
{
let
fileCount
:
number
|
undefined
;
let
fileCount
:
number
|
undefined
;
let
fileNameValid
:
boolean
=
true
;
try
{
try
{
fileCount
=
await
countFilesRecursively
(
codeDir
);
fileCount
=
await
countFilesRecursively
(
codeDir
);
}
catch
(
error
)
{
}
catch
(
error
)
{
throw
new
Error
(
`Call count file error:
${
error
}
`
);
throw
new
Error
(
`Call count file error:
${
error
}
`
);
}
}
try
{
fileNameValid
=
await
validateFileNameRecursively
(
codeDir
);
}
catch
(
error
)
{
throw
new
Error
(
`Validate file name error:
${
error
}
`
);
}
if
(
fileCount
&&
fileCount
>
1000
)
{
if
(
fileCount
!==
undefined
&&
fileCount
>
1000
)
{
const
errMessage
:
string
=
`Too many files(
${
fileCount
}
found}) in
${
codeDir
}
,`
const
errMessage
:
string
=
`Too many files(
${
fileCount
}
found}) in
${
codeDir
}
,`
+
` please check if it's a valid code dir`
;
+
` please check if it's a valid code dir`
;
throw
new
Error
(
errMessage
);
throw
new
Error
(
errMessage
);
}
if
(
!
fileNameValid
)
{
const
errMessage
:
string
=
`File name in
${
codeDir
}
is not valid, please check file names, only support digit number、alphabet and (.-_) in file name.`
;
throw
new
Error
(
errMessage
);
}
}
return
fileCount
;
return
fileCount
;
}
}
/**
/**
* crete a new directory
* crete a new directory
* @param directory
* @param directory
*/
*/
export
async
function
execMkdir
(
directory
:
string
):
Promise
<
void
>
{
export
async
function
execMkdir
(
directory
:
string
):
Promise
<
void
>
{
if
(
process
.
platform
===
'
win32
'
)
{
if
(
process
.
platform
===
'
win32
'
)
{
...
@@ -66,6 +75,7 @@ export async function execMkdir(directory: string): Promise<void> {
...
@@ -66,6 +75,7 @@ export async function execMkdir(directory: string): Promise<void> {
}
else
{
}
else
{
await
cpp
.
exec
(
`mkdir -p
${
directory
}
`
);
await
cpp
.
exec
(
`mkdir -p
${
directory
}
`
);
}
}
return
Promise
.
resolve
();
return
Promise
.
resolve
();
}
}
...
@@ -80,12 +90,13 @@ export async function execCopydir(source: string, destination: string): Promise<
...
@@ -80,12 +90,13 @@ export async function execCopydir(source: string, destination: string): Promise<
}
else
{
}
else
{
await
cpp
.
exec
(
`cp -r
${
source
}
${
destination
}
`
);
await
cpp
.
exec
(
`cp -r
${
source
}
${
destination
}
`
);
}
}
return
Promise
.
resolve
();
return
Promise
.
resolve
();
}
}
/**
/**
* crete a new file
* crete a new file
* @param filename
* @param filename
*/
*/
export
async
function
execNewFile
(
filename
:
string
):
Promise
<
void
>
{
export
async
function
execNewFile
(
filename
:
string
):
Promise
<
void
>
{
if
(
process
.
platform
===
'
win32
'
)
{
if
(
process
.
platform
===
'
win32
'
)
{
...
@@ -93,16 +104,17 @@ export async function execNewFile(filename: string): Promise<void> {
...
@@ -93,16 +104,17 @@ export async function execNewFile(filename: string): Promise<void> {
}
else
{
}
else
{
await
cpp
.
exec
(
`touch
${
filename
}
`
);
await
cpp
.
exec
(
`touch
${
filename
}
`
);
}
}
return
Promise
.
resolve
();
return
Promise
.
resolve
();
}
}
/**
/**
* run script
* run script
using powershell or bash
* @param filePath
* @param filePath
*/
*/
export
function
exec
Script
(
filePath
:
string
):
cp
.
ChildProcess
{
export
function
run
Script
(
filePath
:
string
):
cp
.
ChildProcess
{
if
(
process
.
platform
===
'
win32
'
)
{
if
(
process
.
platform
===
'
win32
'
)
{
return
cp
.
exec
(
`powershell.exe -file
${
filePath
}
`
);
return
cp
.
exec
(
`powershell.exe
-ExecutionPolicy Bypass
-file
${
filePath
}
`
);
}
else
{
}
else
{
return
cp
.
exec
(
`bash
${
filePath
}
`
);
return
cp
.
exec
(
`bash
${
filePath
}
`
);
}
}
...
@@ -110,7 +122,7 @@ export function execScript(filePath: string): cp.ChildProcess {
...
@@ -110,7 +122,7 @@ export function execScript(filePath: string): cp.ChildProcess {
/**
/**
* output the last line of a file
* output the last line of a file
* @param filePath
* @param filePath
*/
*/
export
async
function
execTail
(
filePath
:
string
):
Promise
<
cpp
.
childProcessPromise
.
Result
>
{
export
async
function
execTail
(
filePath
:
string
):
Promise
<
cpp
.
childProcessPromise
.
Result
>
{
let
cmdresult
:
cpp
.
childProcessPromise
.
Result
;
let
cmdresult
:
cpp
.
childProcessPromise
.
Result
;
...
@@ -119,12 +131,13 @@ export async function execTail(filePath: string): Promise<cpp.childProcessPromis
...
@@ -119,12 +131,13 @@ export async function execTail(filePath: string): Promise<cpp.childProcessPromis
}
else
{
}
else
{
cmdresult
=
await
cpp
.
exec
(
`tail -n 1
${
filePath
}
`
);
cmdresult
=
await
cpp
.
exec
(
`tail -n 1
${
filePath
}
`
);
}
}
return
Promise
.
resolve
(
cmdresult
);
return
Promise
.
resolve
(
cmdresult
);
}
}
/**
/**
* delete a directory
* delete a directory
* @param directory
* @param directory
*/
*/
export
async
function
execRemove
(
directory
:
string
):
Promise
<
void
>
{
export
async
function
execRemove
(
directory
:
string
):
Promise
<
void
>
{
if
(
process
.
platform
===
'
win32
'
)
{
if
(
process
.
platform
===
'
win32
'
)
{
...
@@ -132,12 +145,13 @@ export async function execRemove(directory: string): Promise<void> {
...
@@ -132,12 +145,13 @@ export async function execRemove(directory: string): Promise<void> {
}
else
{
}
else
{
await
cpp
.
exec
(
`rm -rf
${
directory
}
`
);
await
cpp
.
exec
(
`rm -rf
${
directory
}
`
);
}
}
return
Promise
.
resolve
();
return
Promise
.
resolve
();
}
}
/**
/**
* kill a process
* kill a process
* @param directory
* @param directory
*/
*/
export
async
function
execKill
(
pid
:
string
):
Promise
<
void
>
{
export
async
function
execKill
(
pid
:
string
):
Promise
<
void
>
{
if
(
process
.
platform
===
'
win32
'
)
{
if
(
process
.
platform
===
'
win32
'
)
{
...
@@ -145,37 +159,39 @@ export async function execKill(pid: string): Promise<void> {
...
@@ -145,37 +159,39 @@ export async function execKill(pid: string): Promise<void> {
}
else
{
}
else
{
await
cpp
.
exec
(
`pkill -P
${
pid
}
`
);
await
cpp
.
exec
(
`pkill -P
${
pid
}
`
);
}
}
return
Promise
.
resolve
();
return
Promise
.
resolve
();
}
}
/**
/**
*
set
environment variable
*
get command of setting
environment variable
* @param variable
* @param variable
* @returns command string
* @returns command string
*/
*/
export
function
setEnvironmentVariable
(
variable
:
{
key
:
string
;
value
:
string
}):
string
{
export
function
setEnvironmentVariable
(
variable
:
{
key
:
string
;
value
:
string
}):
string
{
if
(
process
.
platform
===
'
win32
'
)
{
if
(
process
.
platform
===
'
win32
'
)
{
return
`$env:
${
variable
.
key
}
="
${
variable
.
value
}
"`
;
return
`$env:
${
variable
.
key
}
="
${
variable
.
value
}
"`
;
}
}
else
{
else
{
return
`export
${
variable
.
key
}
=
${
variable
.
value
}
`
;
return
`export
${
variable
.
key
}
=
${
variable
.
value
}
`
;
}
}
}
}
/**
/**
* Compress files in directory to tar file
* Compress files in directory to tar file
* @param source
_p
ath
* @param source
P
ath
* @param tar
_p
ath
* @param tar
P
ath
*/
*/
export
async
function
tarAdd
(
tar
_p
ath
:
string
,
source
_p
ath
:
string
):
Promise
<
void
>
{
export
async
function
tarAdd
(
tar
P
ath
:
string
,
source
P
ath
:
string
):
Promise
<
void
>
{
if
(
process
.
platform
===
'
win32
'
)
{
if
(
process
.
platform
===
'
win32
'
)
{
tar_path
=
tar_path
.
split
(
'
\\
'
).
join
(
'
\\\\
'
);
const
tarFilePath
:
string
=
tarPath
.
split
(
'
\\
'
)
source_path
=
source_path
.
split
(
'
\\
'
).
join
(
'
\\\\
'
);
.
join
(
'
\\\\
'
);
let
script
:
string
[]
=
[];
const
sourceFilePath
:
string
=
sourcePath
.
split
(
'
\\
'
)
.
join
(
'
\\\\
'
);
const
script
:
string
[]
=
[];
script
.
push
(
script
.
push
(
`import os`
,
`import os`
,
`import tarfile`
,
`import tarfile`
,
String
.
Format
(
`tar = tarfile.open("{0}","w:gz")\r\nfor root,dir,files in os.walk("{1}"):`
,
tar
_p
ath
,
source
_p
ath
),
String
.
Format
(
`tar = tarfile.open("{0}","w:gz")\r\nfor root,dir,files in os.walk("{1}"):`
,
tar
FileP
ath
,
source
FileP
ath
),
` for file in files:`
,
` for file in files:`
,
` fullpath = os.path.join(root,file)`
,
` fullpath = os.path.join(root,file)`
,
` tar.add(fullpath, arcname=file)`
,
` tar.add(fullpath, arcname=file)`
,
...
@@ -184,39 +200,40 @@ export async function tarAdd(tar_path: string, source_path: string): Promise<voi
...
@@ -184,39 +200,40 @@ export async function tarAdd(tar_path: string, source_path: string): Promise<voi
const
tarScript
:
string
=
path
.
join
(
os
.
tmpdir
(),
'
tar.py
'
);
const
tarScript
:
string
=
path
.
join
(
os
.
tmpdir
(),
'
tar.py
'
);
await
cpp
.
exec
(
`python
${
tarScript
}
`
);
await
cpp
.
exec
(
`python
${
tarScript
}
`
);
}
else
{
}
else
{
await
cpp
.
exec
(
`tar -czf
${
tar
_p
ath
}
-C
${
source
_p
ath
}
.`
);
await
cpp
.
exec
(
`tar -czf
${
tar
P
ath
}
-C
${
source
P
ath
}
.`
);
}
}
return
Promise
.
resolve
();
return
Promise
.
resolve
();
}
}
/**
/**
* generate script file name
* generate script file name
* @param fileNamePrefix
* @param fileNamePrefix
*/
*/
export
function
getScriptName
(
fileNamePrefix
:
string
):
string
{
export
function
getScriptName
(
fileNamePrefix
:
string
):
string
{
if
(
process
.
platform
===
'
win32
'
)
{
if
(
process
.
platform
===
'
win32
'
)
{
return
fileNamePrefix
+
'
.ps1
'
;
return
String
.
Format
(
'
{0}.ps1
'
,
fileNamePrefix
)
;
}
else
{
}
else
{
return
fileNamePrefix
+
'
.sh
'
;
return
String
.
Format
(
'
{0}.sh
'
,
fileNamePrefix
)
;
}
}
}
}
/**
/**
* generate script file
* generate script file
* @param gpuMetricCollectorScriptFolder
* @param gpuMetricCollectorScriptFolder
*/
*/
export
function
getgpuMetricsCollectorScriptContent
(
gpuMetricCollectorScriptFolder
:
string
):
string
{
export
function
getgpuMetricsCollectorScriptContent
(
gpuMetricCollectorScriptFolder
:
string
):
string
{
if
(
process
.
platform
===
'
win32
'
)
{
if
(
process
.
platform
===
'
win32
'
)
{
return
String
.
Format
(
return
String
.
Format
(
GPU_INFO_COLLECTOR_FORMAT_WINDOWS
,
GPU_INFO_COLLECTOR_FORMAT_WINDOWS
,
gpuMetricCollectorScriptFolder
,
gpuMetricCollectorScriptFolder
,
path
.
join
(
gpuMetricCollectorScriptFolder
,
'
pid
'
)
,
path
.
join
(
gpuMetricCollectorScriptFolder
,
'
pid
'
)
);
);
}
else
{
}
else
{
return
String
.
Format
(
return
String
.
Format
(
GPU_INFO_COLLECTOR_FORMAT_LINUX
,
GPU_INFO_COLLECTOR_FORMAT_LINUX
,
gpuMetricCollectorScriptFolder
,
gpuMetricCollectorScriptFolder
,
path
.
join
(
gpuMetricCollectorScriptFolder
,
'
pid
'
)
,
path
.
join
(
gpuMetricCollectorScriptFolder
,
'
pid
'
)
);
);
}
}
}
}
src/nni_manager/training_service/kubernetes/azureStorageClientUtils.ts
View file @
ba8dccd6
...
@@ -19,108 +19,126 @@
...
@@ -19,108 +19,126 @@
'
use strict
'
;
'
use strict
'
;
import
*
as
fs
from
'
fs
'
import
*
as
azureStorage
from
'
azure-storage
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
path
from
'
path
'
;
import
*
as
path
from
'
path
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
String
}
from
'
typescript-string-operations
'
;
import
{
getLogger
}
from
'
../../common/log
'
;
import
{
getLogger
}
from
'
../../common/log
'
;
import
{
mkDirP
}
from
'
../../common/utils
'
;
import
{
mkDirP
}
from
'
../../common/utils
'
;
// tslint:disable: no-redundant-jsdoc no-any no-unsafe-any
export
namespace
AzureStorageClientUtility
{
export
namespace
AzureStorageClientUtility
{
/**
/**
* create azure share
* create azure share
* @param fileServerClient
* @param fileServerClient
* @param azureShare
* @param azureShare
*/
*/
export
async
function
createShare
(
fileServerClient
:
any
,
azureShare
:
any
):
Promise
<
void
>
{
export
async
function
createShare
(
fileServerClient
:
any
,
azureShare
:
any
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
fileServerClient
.
createShareIfNotExists
(
azureShare
,
function
(
error
:
any
,
result
:
any
,
response
:
any
)
{
fileServerClient
.
createShareIfNotExists
(
azureShare
,
(
error
:
any
,
result
:
any
,
response
:
any
)
=>
{
if
(
error
){
if
(
error
)
{
getLogger
().
error
(
`Create share failed:,
${
error
}
`
);
getLogger
()
deferred
.
reject
(
error
)
.
error
(
`Create share failed:,
${
error
}
`
);
}
else
{
deferred
.
reject
(
error
);
deferred
.
resolve
()
}
else
{
deferred
.
resolve
();
}
}
})
});
return
deferred
.
promise
;
return
deferred
.
promise
;
}
}
/**
/**
* Create a new directory (NOT recursively) in azure file storage.
* Create a new directory (NOT recursively) in azure file storage.
* @param fileServerClient
* @param fileServerClient
* @param azureFoler
* @param azureFoler
* @param azureShare
* @param azureShare
*/
*/
export
async
function
createDirectory
(
fileServerClient
:
a
ny
,
azureFoler
:
any
,
azureShare
:
any
):
Promise
<
void
>
{
export
async
function
createDirectory
(
fileServerClient
:
a
zureStorage
.
FileService
,
azureFoler
:
any
,
azureShare
:
any
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
fileServerClient
.
createDirectoryIfNotExists
(
azureShare
,
azureFoler
,
function
(
error
:
any
,
result
:
any
,
response
:
any
)
{
fileServerClient
.
createDirectoryIfNotExists
(
azureShare
,
azureFoler
,
(
error
:
any
,
result
:
any
,
response
:
any
)
=>
{
if
(
error
){
if
(
error
)
{
getLogger
().
error
(
`Create directory failed:,
${
error
}
`
);
getLogger
()
.
error
(
`Create directory failed:,
${
error
}
`
);
deferred
.
reject
(
error
);
deferred
.
reject
(
error
);
}
else
{
}
else
{
deferred
.
resolve
();
deferred
.
resolve
();
}
}
})
});
return
deferred
.
promise
;
return
deferred
.
promise
;
}
}
/**
/**
* Create a new directory recursively in azure file storage
* Create a new directory recursively in azure file storage
* @param fileServerClient
* @param fileServerClient
* @param azureDirectory
* @param azureDirectory
*/
*/
export
async
function
createDirectoryRecursive
(
fileServerClient
:
any
,
azureDirectory
:
any
,
azureShare
:
any
):
Promise
<
void
>
{
export
async
function
createDirectoryRecursive
(
fileServerClient
:
azureStorage
.
FileService
,
azureDirectory
:
string
,
azureShare
:
any
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
le
t
directories
=
azureDirectory
.
split
(
"
/
"
);
cons
t
directories
:
string
[]
=
azureDirectory
.
split
(
'
/
'
);
let
rootDirectory
=
""
let
rootDirectory
:
string
=
''
;
for
(
le
t
directory
of
directories
){
for
(
cons
t
directory
of
directories
)
{
rootDirectory
+=
directory
;
rootDirectory
+=
directory
;
await
createDirectory
(
fileServerClient
,
rootDirectory
,
azureShare
);
await
createDirectory
(
fileServerClient
,
rootDirectory
,
azureShare
);
rootDirectory
+=
'
/
'
;
rootDirectory
+=
'
/
'
;
}
}
deferred
.
resolve
();
deferred
.
resolve
();
return
deferred
.
promise
;
return
deferred
.
promise
;
}
}
/**
/**
* upload a file to azure storage
* upload a file to azure storage
* @param fileServerClient
* @param fileServerClient
* @param azureDirectory
* @param azureDirectory
* @param azureFileName
* @param azureFileName
* @param azureShare
* @param azureShare
* @param localFilePath
* @param localFilePath
*/
*/
async
function
uploadFileToAzure
(
fileServerClient
:
any
,
azureDirectory
:
any
,
azureFileName
:
any
,
azureShare
:
any
,
localFilePath
:
any
):
Promise
<
void
>
{
async
function
uploadFileToAzure
(
fileServerClient
:
any
,
azureDirectory
:
string
,
azureFileName
:
any
,
azureShare
:
any
,
localFilePath
:
string
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
await
fileServerClient
.
createFileFromLocalFile
(
azureShare
,
azureDirectory
,
azureFileName
,
localFilePath
,
function
(
error
:
any
,
result
:
any
,
response
:
any
)
{
await
fileServerClient
.
createFileFromLocalFile
(
azureShare
,
azureDirectory
,
azureFileName
,
localFilePath
,
if
(
error
){
(
error
:
any
,
result
:
any
,
response
:
any
)
=>
{
getLogger
().
error
(
`Upload file failed:,
${
error
}
`
);
if
(
error
)
{
getLogger
()
.
error
(
`Upload file failed:,
${
error
}
`
);
deferred
.
reject
(
error
);
deferred
.
reject
(
error
);
}
else
{
}
else
{
deferred
.
resolve
();
deferred
.
resolve
();
}
}
})
});
return
deferred
.
promise
;
return
deferred
.
promise
;
}
}
/**
/**
* download a file from azure storage
* download a file from azure storage
* @param fileServerClient
* @param fileServerClient
* @param azureDirectory
* @param azureDirectory
* @param azureFileName
* @param azureFileName
* @param azureShare
* @param azureShare
* @param localFilePath
* @param localFilePath
*/
*/
async
function
downloadFile
(
fileServerClient
:
any
,
azureDirectory
:
any
,
azureFileName
:
any
,
azureShare
:
any
,
localFilePath
:
any
):
Promise
<
void
>
{
async
function
downloadFile
(
fileServerClient
:
any
,
azureDirectory
:
string
,
azureFileName
:
any
,
azureShare
:
any
,
localFilePath
:
string
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
await
fileServerClient
.
getFileToStream
(
azureShare
,
azureDirectory
,
azureFileName
,
fs
.
createWriteStream
(
localFilePath
),
function
(
error
:
any
,
result
:
any
,
response
:
any
)
{
// tslint:disable-next-line:non-literal-fs-path
if
(
error
){
await
fileServerClient
.
getFileToStream
(
azureShare
,
azureDirectory
,
azureFileName
,
fs
.
createWriteStream
(
localFilePath
),
getLogger
().
error
(
`Download file failed:,
${
error
}
`
);
(
error
:
any
,
result
:
any
,
response
:
any
)
=>
{
if
(
error
)
{
getLogger
()
.
error
(
`Download file failed:,
${
error
}
`
);
deferred
.
reject
(
error
);
deferred
.
reject
(
error
);
}
else
{
}
else
{
deferred
.
resolve
();
deferred
.
resolve
();
}
}
})
});
return
deferred
.
promise
;
return
deferred
.
promise
;
}
}
...
@@ -131,67 +149,79 @@ export namespace AzureStorageClientUtility {
...
@@ -131,67 +149,79 @@ export namespace AzureStorageClientUtility {
* @param azureShare : the azure share used
* @param azureShare : the azure share used
* @param localDirectory : local directory to be uploaded
* @param localDirectory : local directory to be uploaded
*/
*/
export
async
function
uploadDirectory
(
fileServerClient
:
any
,
azureDirectory
:
any
,
azureShare
:
any
,
localDirectory
:
any
):
Promise
<
void
>
{
// tslint:disable:non-literal-fs-path
export
async
function
uploadDirectory
(
fileServerClient
:
azureStorage
.
FileService
,
azureDirectory
:
string
,
azureShare
:
any
,
localDirectory
:
string
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
fileNameArray
:
string
[]
=
fs
.
readdirSync
(
localDirectory
);
const
fileNameArray
:
string
[]
=
fs
.
readdirSync
(
localDirectory
);
await
createDirectoryRecursive
(
fileServerClient
,
azureDirectory
,
azureShare
);
await
createDirectoryRecursive
(
fileServerClient
,
azureDirectory
,
azureShare
);
for
(
le
t
fileName
of
fileNameArray
){
for
(
cons
t
fileName
of
fileNameArray
)
{
const
fullFilePath
:
string
=
path
.
join
(
localDirectory
,
fileName
);
const
fullFilePath
:
string
=
path
.
join
(
localDirectory
,
fileName
);
try
{
try
{
if
(
fs
.
lstatSync
(
fullFilePath
).
isFile
())
{
if
(
fs
.
lstatSync
(
fullFilePath
)
.
isFile
())
{
await
uploadFileToAzure
(
fileServerClient
,
azureDirectory
,
fileName
,
azureShare
,
fullFilePath
);
await
uploadFileToAzure
(
fileServerClient
,
azureDirectory
,
fileName
,
azureShare
,
fullFilePath
);
}
else
{
}
else
{
// If filePath is a directory, recuisively copy it to azure
// If filePath is a directory, recuisively copy it to azure
await
uploadDirectory
(
fileServerClient
,
azureDirectory
+
'
/
'
+
fileName
,
azureShare
,
fullFilePath
);
await
uploadDirectory
(
fileServerClient
,
String
.
Format
(
'
{0}/{1}
'
,
azureDirectory
,
fileName
)
,
azureShare
,
fullFilePath
);
}
}
}
catch
(
error
)
{
}
catch
(
error
)
{
deferred
.
reject
(
error
);
deferred
.
reject
(
error
);
return
deferred
.
promise
;
return
deferred
.
promise
;
}
}
}
}
// All files/directories are copied successfully, resolve
// All files/directories are copied successfully, resolve
deferred
.
resolve
();
deferred
.
resolve
();
return
deferred
.
promise
;
return
deferred
.
promise
;
}
}
/**
/**
* downlod a directory from azure
* downlod a directory from azure
* @param fileServerClient
* @param fileServerClient
* @param azureDirectory
* @param azureDirectory
* @param azureShare
* @param azureShare
* @param localDirectory
* @param localDirectory
*/
*/
export
async
function
downloadDirectory
(
fileServerClient
:
any
,
azureDirectory
:
any
,
azureShare
:
any
,
localDirectory
:
any
):
Promise
<
void
>
{
export
async
function
downloadDirectory
(
fileServerClient
:
any
,
azureDirectory
:
string
,
azureShare
:
any
,
localDirectory
:
string
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
mkDirP
(
localDirectory
);
await
mkDirP
(
localDirectory
);
fileServerClient
.
listFilesAndDirectoriesSegmented
(
azureShare
,
azureDirectory
,
'
null
'
,
function
(
error
:
any
,
result
:
any
,
response
:
any
)
{
fileServerClient
.
listFilesAndDirectoriesSegmented
(
azureShare
,
azureDirectory
,
'
null
'
,
if
((
'
entries
'
in
result
)
===
false
){
async
(
error
:
any
,
result
:
any
,
response
:
any
)
=>
{
getLogger
().
error
(
`list files failed, can't get entries in result`
);
if
((
'
entries
'
in
result
)
===
false
)
{
getLogger
()
.
error
(
`list files failed, can't get entries in result`
);
throw
new
Error
(
`list files failed, can't get entries in result`
);
throw
new
Error
(
`list files failed, can't get entries in result`
);
}
}
if
((
'
files
'
in
result
[
'
entries
'
])
===
false
){
if
((
'
files
'
in
result
.
entries
)
===
false
)
{
getLogger
().
error
(
`list files failed, can't get files in result['entries']`
);
getLogger
()
.
error
(
`list files failed, can't get files in result['entries']`
);
throw
new
Error
(
`list files failed, can't get files in result['entries']`
);
throw
new
Error
(
`list files failed, can't get files in result['entries']`
);
}
}
if
((
'
directories
'
in
result
[
'
directories
'
])
===
false
){
if
((
'
directories
'
in
result
.
directories
)
===
false
)
{
getLogger
().
error
(
`list files failed, can't get directories in result['entries']`
);
getLogger
()
.
error
(
`list files failed, can't get directories in result['entries']`
);
throw
new
Error
(
`list files failed, can't get directories in result['entries']`
);
throw
new
Error
(
`list files failed, can't get directories in result['entries']`
);
}
}
for
(
var
fileName
of
result
[
'
entries
'
][
'
files
'
]
){
for
(
const
fileName
of
result
.
entries
.
files
)
{
const
fullFilePath
:
string
=
path
.
join
(
localDirectory
,
fileName
.
name
);
const
fullFilePath
:
string
=
path
.
join
(
localDirectory
,
fileName
.
name
);
downloadFile
(
fileServerClient
,
azureDirectory
,
fileName
.
name
,
azureShare
,
fullFilePath
)
await
downloadFile
(
fileServerClient
,
azureDirectory
,
fileName
.
name
,
azureShare
,
fullFilePath
)
;
}
}
for
(
var
directoryName
of
result
[
'
entries
'
][
'
directories
'
]
){
for
(
const
directoryName
of
result
.
entries
.
directories
)
{
const
fullDirectoryPath
:
string
=
path
.
join
(
localDirectory
,
directoryName
.
name
)
const
fullDirectoryPath
:
string
=
path
.
join
(
localDirectory
,
directoryName
.
name
)
;
const
fullAzureDirectory
:
string
=
path
.
join
(
azureDirectory
,
directoryName
.
name
)
const
fullAzureDirectory
:
string
=
path
.
join
(
azureDirectory
,
directoryName
.
name
)
;
downloadDirectory
(
fileServerClient
,
fullAzureDirectory
,
azureShare
,
fullDirectoryPath
)
await
downloadDirectory
(
fileServerClient
,
fullAzureDirectory
,
azureShare
,
fullDirectoryPath
)
;
}
}
deferred
.
resolve
();
deferred
.
resolve
();
})
});
return
deferred
.
promise
;
return
deferred
.
promise
;
}
}
}
}
// tslint:enable: no-redundant-jsdoc no-any no-unsafe-any
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerApiClient.ts
View file @
ba8dccd6
/**
/**
* Copyright (c) Microsoft Corporation
* Copyright (c) Microsoft Corporation
* All rights reserved.
* All rights reserved.
...
@@ -20,21 +21,29 @@
...
@@ -20,21 +21,29 @@
'
use strict
'
;
'
use strict
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
fs
from
'
fs
'
;
import
{
KubernetesCRDClient
,
GeneralK8s
Client
}
from
'
../kubernetesApiClient
'
;
import
{
GeneralK8sClient
,
KubernetesCRD
Client
}
from
'
../kubernetesApiClient
'
;
abstract
class
FrameworkControllerClient
extends
KubernetesCRDClient
{
/**
* FrameworkController Client
*/
abstract
class
FrameworkControllerClient
extends
KubernetesCRDClient
{
/**
/**
* Factory method to generate operator cliet
* Factory method to generate operator clie
n
t
*/
*/
// tslint:disable-next-line:function-name
public
static
generateFrameworkControllerClient
():
KubernetesCRDClient
{
public
static
generateFrameworkControllerClient
():
KubernetesCRDClient
{
return
new
FrameworkControllerClientV1
();
return
new
FrameworkControllerClientV1
();
}
}
}
}
/**
* FrameworkController ClientV1
*/
class
FrameworkControllerClientV1
extends
FrameworkControllerClient
{
class
FrameworkControllerClientV1
extends
FrameworkControllerClient
{
/**
/**
* constructor, to initialize frameworkcontroller CRD definition
* constructor, to initialize frameworkcontroller CRD definition
*/
*/
// tslint:disable: no-unsafe-any no-any
public
constructor
()
{
public
constructor
()
{
super
();
super
();
this
.
crdSchema
=
JSON
.
parse
(
fs
.
readFileSync
(
'
./config/frameworkcontroller/frameworkcontrollerjob-crd-v1.json
'
,
'
utf8
'
));
this
.
crdSchema
=
JSON
.
parse
(
fs
.
readFileSync
(
'
./config/frameworkcontroller/frameworkcontrollerjob-crd-v1.json
'
,
'
utf8
'
));
...
@@ -42,13 +51,13 @@ class FrameworkControllerClientV1 extends FrameworkControllerClient {
...
@@ -42,13 +51,13 @@ class FrameworkControllerClientV1 extends FrameworkControllerClient {
}
}
protected
get
operator
():
any
{
protected
get
operator
():
any
{
return
this
.
client
.
apis
[
"
frameworkcontroller.microsoft.com
"
].
v1
.
namespaces
(
'
default
'
).
frameworks
;
return
this
.
client
.
apis
[
'
frameworkcontroller.microsoft.com
'
].
v1
.
namespaces
(
'
default
'
).
frameworks
;
}
}
// tslint:enable: no-unsafe-any no-any
public
get
containerName
():
string
{
public
get
containerName
():
string
{
return
'
framework
'
;
return
'
framework
'
;
}
}
}
}
export
{
FrameworkControllerClient
,
GeneralK8sClient
};
export
{
FrameworkControllerClient
,
GeneralK8sClient
};
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerConfig.ts
View file @
ba8dccd6
...
@@ -20,10 +20,11 @@
...
@@ -20,10 +20,11 @@
'
use strict
'
;
'
use strict
'
;
import
*
as
assert
from
'
assert
'
;
import
*
as
assert
from
'
assert
'
;
import
{
Kubernetes
Trial
Config
,
Kubernetes
TrialConfigTemplat
e
,
KubernetesClusterConfig
Azure
,
import
{
AzureStorage
,
KeyVaultConfig
,
Kubernetes
Cluster
Config
,
Kubernetes
ClusterConfigAzur
e
,
KubernetesClusterConfig
NFS
,
Kubernetes
ClusterConfigNFS
,
NFSConfig
,
KubernetesStorageKind
,
keyVault
Config
,
Azure
Storage
,
KubernetesCluster
Config
,
Kubernetes
StorageKind
,
KubernetesTrialConfig
,
KubernetesTrialConfigTemplate
,
NFS
Config
,
StorageConfig
StorageConfig
}
from
'
../kubernetesConfig
'
}
from
'
../kubernetesConfig
'
;
// tslint:disable:completed-docs
export
class
FrameworkAttemptCompletionPolicy
{
export
class
FrameworkAttemptCompletionPolicy
{
public
readonly
minFailedTaskCount
:
number
;
public
readonly
minFailedTaskCount
:
number
;
public
readonly
minSucceededTaskCount
:
number
;
public
readonly
minSucceededTaskCount
:
number
;
...
@@ -36,13 +37,13 @@ export class FrameworkAttemptCompletionPolicy {
...
@@ -36,13 +37,13 @@ export class FrameworkAttemptCompletionPolicy {
/**
/**
* Trial job configuration for FrameworkController
* Trial job configuration for FrameworkController
*/
*/
export
class
FrameworkControllerTrialConfigTemplate
extends
KubernetesTrialConfigTemplate
{
export
class
FrameworkControllerTrialConfigTemplate
extends
KubernetesTrialConfigTemplate
{
public
readonly
frameworkAttemptCompletionPolicy
:
FrameworkAttemptCompletionPolicy
;
public
readonly
frameworkAttemptCompletionPolicy
:
FrameworkAttemptCompletionPolicy
;
public
readonly
name
:
string
;
public
readonly
name
:
string
;
public
readonly
taskNum
:
number
;
public
readonly
taskNum
:
number
;
constructor
(
taskNum
:
number
,
command
:
string
,
gpuNum
:
number
,
constructor
(
taskNum
:
number
,
command
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
,
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
,
frameworkAttemptCompletionPolicy
:
FrameworkAttemptCompletionPolicy
)
{
frameworkAttemptCompletionPolicy
:
FrameworkAttemptCompletionPolicy
)
{
super
(
command
,
gpuNum
,
cpuNum
,
memoryMB
,
image
);
super
(
command
,
gpuNum
,
cpuNum
,
memoryMB
,
image
);
this
.
frameworkAttemptCompletionPolicy
=
frameworkAttemptCompletionPolicy
;
this
.
frameworkAttemptCompletionPolicy
=
frameworkAttemptCompletionPolicy
;
this
.
name
=
name
;
this
.
name
=
name
;
...
@@ -50,7 +51,7 @@ export class FrameworkControllerTrialConfigTemplate extends KubernetesTrialConfi
...
@@ -50,7 +51,7 @@ export class FrameworkControllerTrialConfigTemplate extends KubernetesTrialConfi
}
}
}
}
export
class
FrameworkControllerTrialConfig
extends
KubernetesTrialConfig
{
export
class
FrameworkControllerTrialConfig
extends
KubernetesTrialConfig
{
public
readonly
taskRoles
:
FrameworkControllerTrialConfigTemplate
[];
public
readonly
taskRoles
:
FrameworkControllerTrialConfigTemplate
[];
public
readonly
codeDir
:
string
;
public
readonly
codeDir
:
string
;
constructor
(
codeDir
:
string
,
taskRoles
:
FrameworkControllerTrialConfigTemplate
[])
{
constructor
(
codeDir
:
string
,
taskRoles
:
FrameworkControllerTrialConfigTemplate
[])
{
...
@@ -68,11 +69,12 @@ export class FrameworkControllerClusterConfig extends KubernetesClusterConfig {
...
@@ -68,11 +69,12 @@ export class FrameworkControllerClusterConfig extends KubernetesClusterConfig {
}
}
}
}
// tslint:disable:function-name
export
class
FrameworkControllerClusterConfigNFS
extends
KubernetesClusterConfigNFS
{
export
class
FrameworkControllerClusterConfigNFS
extends
KubernetesClusterConfigNFS
{
public
readonly
serviceAccountName
:
string
;
public
readonly
serviceAccountName
:
string
;
constructor
(
constructor
(
serviceAccountName
:
string
,
serviceAccountName
:
string
,
apiVersion
:
string
,
apiVersion
:
string
,
nfs
:
NFSConfig
,
nfs
:
NFSConfig
,
storage
?:
KubernetesStorageKind
storage
?:
KubernetesStorageKind
)
{
)
{
...
@@ -81,8 +83,9 @@ export class FrameworkControllerClusterConfigNFS extends KubernetesClusterConfig
...
@@ -81,8 +83,9 @@ export class FrameworkControllerClusterConfigNFS extends KubernetesClusterConfig
}
}
public
static
getInstance
(
jsonObject
:
object
):
FrameworkControllerClusterConfigNFS
{
public
static
getInstance
(
jsonObject
:
object
):
FrameworkControllerClusterConfigNFS
{
let
kubeflowClusterConfigObjectNFS
=
<
FrameworkControllerClusterConfigNFS
>
jsonObject
;
const
kubeflowClusterConfigObjectNFS
:
FrameworkControllerClusterConfigNFS
=
<
FrameworkControllerClusterConfigNFS
>
jsonObject
;
assert
(
kubeflowClusterConfigObjectNFS
!==
undefined
)
assert
(
kubeflowClusterConfigObjectNFS
!==
undefined
);
return
new
FrameworkControllerClusterConfigNFS
(
return
new
FrameworkControllerClusterConfigNFS
(
kubeflowClusterConfigObjectNFS
.
serviceAccountName
,
kubeflowClusterConfigObjectNFS
.
serviceAccountName
,
kubeflowClusterConfigObjectNFS
.
apiVersion
,
kubeflowClusterConfigObjectNFS
.
apiVersion
,
...
@@ -94,20 +97,21 @@ export class FrameworkControllerClusterConfigNFS extends KubernetesClusterConfig
...
@@ -94,20 +97,21 @@ export class FrameworkControllerClusterConfigNFS extends KubernetesClusterConfig
export
class
FrameworkControllerClusterConfigAzure
extends
KubernetesClusterConfigAzure
{
export
class
FrameworkControllerClusterConfigAzure
extends
KubernetesClusterConfigAzure
{
public
readonly
serviceAccountName
:
string
;
public
readonly
serviceAccountName
:
string
;
constructor
(
constructor
(
serviceAccountName
:
string
,
serviceAccountName
:
string
,
apiVersion
:
string
,
apiVersion
:
string
,
keyVault
:
k
eyVaultConfig
,
keyVault
:
K
eyVaultConfig
,
azureStorage
:
AzureStorage
,
azureStorage
:
AzureStorage
,
storage
?:
KubernetesStorageKind
storage
?:
KubernetesStorageKind
)
{
)
{
super
(
apiVersion
,
keyVault
,
azureStorage
,
storage
);
super
(
apiVersion
,
keyVault
,
azureStorage
,
storage
);
this
.
serviceAccountName
=
serviceAccountName
;
this
.
serviceAccountName
=
serviceAccountName
;
}
}
public
static
getInstance
(
jsonObject
:
object
):
FrameworkControllerClusterConfigAzure
{
public
static
getInstance
(
jsonObject
:
object
):
FrameworkControllerClusterConfigAzure
{
let
kubeflowClusterConfigObjectAzure
=
<
FrameworkControllerClusterConfigAzure
>
jsonObject
;
const
kubeflowClusterConfigObjectAzure
:
FrameworkControllerClusterConfigAzure
=
<
FrameworkControllerClusterConfigAzure
>
jsonObject
;
return
new
FrameworkControllerClusterConfigAzure
(
return
new
FrameworkControllerClusterConfigAzure
(
kubeflowClusterConfigObjectAzure
.
serviceAccountName
,
kubeflowClusterConfigObjectAzure
.
serviceAccountName
,
kubeflowClusterConfigObjectAzure
.
apiVersion
,
kubeflowClusterConfigObjectAzure
.
apiVersion
,
...
@@ -121,11 +125,11 @@ export class FrameworkControllerClusterConfigAzure extends KubernetesClusterConf
...
@@ -121,11 +125,11 @@ export class FrameworkControllerClusterConfigAzure extends KubernetesClusterConf
export
class
FrameworkControllerClusterConfigFactory
{
export
class
FrameworkControllerClusterConfigFactory
{
public
static
generateFrameworkControllerClusterConfig
(
jsonObject
:
object
):
FrameworkControllerClusterConfig
{
public
static
generateFrameworkControllerClusterConfig
(
jsonObject
:
object
):
FrameworkControllerClusterConfig
{
let
s
torageConfig
=
<
StorageConfig
>
jsonObject
;
const
storageConfig
:
S
torageConfig
=
<
StorageConfig
>
jsonObject
;
if
(
!
storageConfig
)
{
if
(
storageConfig
===
undefined
)
{
throw
new
Error
(
"
Invalid json object as a StorageConfig instance
"
);
throw
new
Error
(
'
Invalid json object as a StorageConfig instance
'
);
}
}
if
(
storageConfig
.
storage
&&
storageConfig
.
storage
===
'
azureStorage
'
)
{
if
(
storageConfig
.
storage
!==
undefined
&&
storageConfig
.
storage
===
'
azureStorage
'
)
{
return
FrameworkControllerClusterConfigAzure
.
getInstance
(
jsonObject
);
return
FrameworkControllerClusterConfigAzure
.
getInstance
(
jsonObject
);
}
else
if
(
storageConfig
.
storage
===
undefined
||
storageConfig
.
storage
===
'
nfs
'
)
{
}
else
if
(
storageConfig
.
storage
===
undefined
||
storageConfig
.
storage
===
'
nfs
'
)
{
return
FrameworkControllerClusterConfigNFS
.
getInstance
(
jsonObject
);
return
FrameworkControllerClusterConfigNFS
.
getInstance
(
jsonObject
);
...
@@ -134,6 +138,7 @@ export class FrameworkControllerClusterConfigFactory {
...
@@ -134,6 +138,7 @@ export class FrameworkControllerClusterConfigFactory {
}
}
}
}
export
type
FrameworkControllerJobStatus
=
'
AttemptRunning
'
|
'
Completed
'
|
'
AttemptCreationPending
'
|
'
AttemptCreationRequested
'
|
'
AttemptPreparing
'
|
'
AttemptCompleted
'
;
export
type
FrameworkControllerJobStatus
=
'
AttemptRunning
'
|
'
Completed
'
|
'
AttemptCreationPending
'
|
'
AttemptCreationRequested
'
|
'
AttemptPreparing
'
|
'
AttemptCompleted
'
;
export
type
FrameworkControllerJobCompleteStatus
=
'
Succeeded
'
|
'
Failed
'
;
export
type
FrameworkControllerJobCompleteStatus
=
'
Succeeded
'
|
'
Failed
'
;
\ No newline at end of file
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerJobInfoCollector.ts
View file @
ba8dccd6
...
@@ -19,66 +19,74 @@
...
@@ -19,66 +19,74 @@
'
use strict
'
;
'
use strict
'
;
import
{
KubernetesTrialJobDetail
}
from
'
../kubernetesData
'
;
import
{
KubernetesCRDClient
}
from
'
../kubernetesApiClient
'
;
import
{
KubernetesCRDClient
}
from
'
../kubernetesApiClient
'
;
import
{
KubernetesTrialJobDetail
}
from
'
../kubernetesData
'
;
import
{
KubernetesJobInfoCollector
}
from
'
../kubernetesJobInfoCollector
'
;
import
{
KubernetesJobInfoCollector
}
from
'
../kubernetesJobInfoCollector
'
;
import
{
FrameworkControllerJobStatus
,
FrameworkControllerJob
Complete
Status
}
from
'
./frameworkcontrollerConfig
'
;
import
{
FrameworkControllerJob
Complete
Status
,
FrameworkControllerJobStatus
}
from
'
./frameworkcontrollerConfig
'
;
/**
/**
* Collector frameworkcontroller jobs info from Kubernetes cluster, and update frameworkcontroller job status locally
* Collector frameworkcontroller jobs info from Kubernetes cluster, and update frameworkcontroller job status locally
*/
*/
export
class
FrameworkControllerJobInfoCollector
extends
KubernetesJobInfoCollector
{
export
class
FrameworkControllerJobInfoCollector
extends
KubernetesJobInfoCollector
{
constructor
(
jobMap
:
Map
<
string
,
KubernetesTrialJobDetail
>
)
{
constructor
(
jobMap
:
Map
<
string
,
KubernetesTrialJobDetail
>
)
{
super
(
jobMap
);
super
(
jobMap
);
}
}
protected
async
retrieveSingleTrialJobInfo
(
kubernetesCRDClient
:
KubernetesCRDClient
|
undefined
,
protected
async
retrieveSingleTrialJobInfo
(
kubernetesCRDClient
:
KubernetesCRDClient
|
undefined
,
kubernetesTrialJob
:
KubernetesTrialJobDetail
)
:
Promise
<
void
>
{
kubernetesTrialJob
:
KubernetesTrialJobDetail
)
:
Promise
<
void
>
{
if
(
!
this
.
statusesNeedToCheck
.
includes
(
kubernetesTrialJob
.
status
))
{
if
(
!
this
.
statusesNeedToCheck
.
includes
(
kubernetesTrialJob
.
status
))
{
return
Promise
.
resolve
();
return
Promise
.
resolve
();
}
}
if
(
kubernetesCRDClient
===
undefined
)
{
if
(
kubernetesCRDClient
===
undefined
)
{
return
Promise
.
reject
(
'
kubernetesCRDClient is undefined
'
);
return
Promise
.
reject
(
'
kubernetesCRDClient is undefined
'
);
}
}
// tslint:disable-next-line:no-any
let
kubernetesJobInfo
:
any
;
let
kubernetesJobInfo
:
any
;
try
{
try
{
kubernetesJobInfo
=
await
kubernetesCRDClient
.
getKubernetesJob
(
kubernetesTrialJob
.
kubernetesJobName
);
kubernetesJobInfo
=
await
kubernetesCRDClient
.
getKubernetesJob
(
kubernetesTrialJob
.
kubernetesJobName
);
}
catch
(
error
)
{
}
catch
(
error
)
{
this
.
log
.
error
(
`Get job
${
kubernetesTrialJob
.
kubernetesJobName
}
info failed, error is
${
error
}
`
);
this
.
log
.
error
(
`Get job
${
kubernetesTrialJob
.
kubernetesJobName
}
info failed, error is
${
error
}
`
);
//This is not treat as a error status
//This is not treat as a error status
return
Promise
.
resolve
();
return
Promise
.
resolve
();
}
}
if
(
kubernetesJobInfo
.
status
&&
kubernetesJobInfo
.
status
.
state
)
{
// tslint:disable: no-unsafe-any
if
(
kubernetesJobInfo
.
status
&&
kubernetesJobInfo
.
status
.
state
)
{
const
frameworkJobType
:
FrameworkControllerJobStatus
=
<
FrameworkControllerJobStatus
>
kubernetesJobInfo
.
status
.
state
;
const
frameworkJobType
:
FrameworkControllerJobStatus
=
<
FrameworkControllerJobStatus
>
kubernetesJobInfo
.
status
.
state
;
switch
(
frameworkJobType
)
{
switch
(
frameworkJobType
)
{
case
'
AttemptCreationPending
'
||
'
AttemptCreationRequested
'
||
'
AttemptPreparing
'
:
case
'
AttemptCreationPending
'
:
case
'
AttemptCreationRequested
'
:
case
'
AttemptPreparing
'
:
kubernetesTrialJob
.
status
=
'
WAITING
'
;
kubernetesTrialJob
.
status
=
'
WAITING
'
;
break
;
break
;
case
'
AttemptRunning
'
:
case
'
AttemptRunning
'
:
kubernetesTrialJob
.
status
=
'
RUNNING
'
;
kubernetesTrialJob
.
status
=
'
RUNNING
'
;
if
(
!
kubernetesTrialJob
.
startTime
)
{
if
(
kubernetesTrialJob
.
startTime
===
undefined
)
{
kubernetesTrialJob
.
startTime
=
Date
.
parse
(
<
string
>
kubernetesJobInfo
.
status
.
startTime
);
kubernetesTrialJob
.
startTime
=
Date
.
parse
(
<
string
>
kubernetesJobInfo
.
status
.
startTime
);
}
}
break
;
break
;
case
'
Completed
'
:
case
'
Completed
'
:
const
completedJobType
:
FrameworkControllerJobCompleteStatus
=
<
FrameworkControllerJobCompleteStatus
>
kubernetesJobInfo
.
status
.
attemptStatus
.
completionStatus
.
type
.
name
;
const
completedJobType
:
FrameworkControllerJobCompleteStatus
=
switch
(
completedJobType
)
{
<
FrameworkControllerJobCompleteStatus
>
kubernetesJobInfo
.
status
.
attemptStatus
.
completionStatus
.
type
.
name
;
switch
(
completedJobType
)
{
case
'
Succeeded
'
:
case
'
Succeeded
'
:
kubernetesTrialJob
.
status
=
'
SUCCEEDED
'
;
kubernetesTrialJob
.
status
=
'
SUCCEEDED
'
;
break
;
break
;
case
'
Failed
'
:
case
'
Failed
'
:
kubernetesTrialJob
.
status
=
'
FAILED
'
;
kubernetesTrialJob
.
status
=
'
FAILED
'
;
break
;
break
;
default
:
}
}
kubernetesTrialJob
.
endTime
=
Date
.
parse
(
<
string
>
kubernetesJobInfo
.
status
.
completionTime
);
kubernetesTrialJob
.
endTime
=
Date
.
parse
(
<
string
>
kubernetesJobInfo
.
status
.
completionTime
);
break
;
break
;
default
:
default
:
break
;
}
}
}
}
return
Promise
.
resolve
();
return
Promise
.
resolve
();
}
}
}
// tslint:enable: no-unsafe-any
\ No newline at end of file
}
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerJobRestServer.ts
View file @
ba8dccd6
...
@@ -20,16 +20,16 @@
...
@@ -20,16 +20,16 @@
'
use strict
'
;
'
use strict
'
;
import
*
as
component
from
'
../../../common/component
'
;
import
*
as
component
from
'
../../../common/component
'
;
import
{
KubernetesJobRestServer
}
from
'
../kubernetesJobRestServer
'
;
import
{
FrameworkControllerTrainingService
}
from
'
./frameworkcontrollerTrainingService
'
;
import
{
FrameworkControllerTrainingService
}
from
'
./frameworkcontrollerTrainingService
'
;
import
{
KubernetesJobRestServer
}
from
'
../kubernetesJobRestServer
'
/**
/**
* frameworkcontroller Training service Rest server, provides rest API to support frameworkcontroller job metrics update
* frameworkcontroller Training service Rest server, provides rest API to support frameworkcontroller job metrics update
*
*
*/
*/
@
component
.
Singleton
@
component
.
Singleton
export
class
FrameworkControllerJobRestServer
extends
KubernetesJobRestServer
{
export
class
FrameworkControllerJobRestServer
extends
KubernetesJobRestServer
{
constructor
()
{
constructor
()
{
super
(
component
.
get
(
FrameworkControllerTrainingService
));
super
(
component
.
get
(
FrameworkControllerTrainingService
));
}
}
}
}
\ No newline at end of file
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts
View file @
ba8dccd6
...
@@ -17,31 +17,29 @@
...
@@ -17,31 +17,29 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
*/
'
use strict
'
'
use strict
'
;
import
*
as
component
from
'
../../../common/component
'
;
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
path
from
'
path
'
;
import
*
as
path
from
'
path
'
;
import
*
as
component
from
'
../../../common/component
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../../common/containerJobData
'
;
import
{
getExperimentId
}
from
'
../../../common/experimentStartupInfo
'
;
import
{
getExperimentId
}
from
'
../../../common/experimentStartupInfo
'
;
import
{
TrialConfigMetadataKey
}
from
'
../../common/trialConfigMetadataKey
'
;
import
{
import
{
JobApplicationForm
,
TrialJobApplicationForm
,
JobApplicationForm
,
NNIManagerIpConfig
,
TrialJobApplicationForm
,
TrialJobDetail
TrialJobDetail
,
NNIManagerIpConfig
}
from
'
../../../common/trainingService
'
;
}
from
'
../../../common/trainingService
'
;
import
{
delay
,
generateParamFileName
,
getExperimentRootDir
,
uniqueString
}
from
'
../../../common/utils
'
;
import
{
delay
,
generateParamFileName
,
getExperimentRootDir
,
uniqueString
}
from
'
../../../common/utils
'
;
import
{
NFSConfig
}
from
'
../kubernetesConfig
'
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../../common/containerJobData
'
;
import
{
KubernetesTrialJobDetail
}
from
'
../kubernetesData
'
;
import
{
TrialConfigMetadataKey
}
from
'
../../common/trialConfigMetadataKey
'
;
import
{
validateCodeDir
}
from
'
../../common/util
'
;
import
{
validateCodeDir
}
from
'
../../common/util
'
;
import
{
AzureStorageClientUtility
}
from
'
../azureStorageClientUtils
'
;
import
{
AzureStorageClientUtility
}
from
'
../azureStorageClientUtils
'
;
import
{
NFSConfig
}
from
'
../kubernetesConfig
'
;
import
{
KubernetesTrialJobDetail
}
from
'
../kubernetesData
'
;
import
{
KubernetesTrainingService
}
from
'
../kubernetesTrainingService
'
;
import
{
KubernetesTrainingService
}
from
'
../kubernetesTrainingService
'
;
import
{
FrameworkControllerTrialConfig
,
FrameworkControllerClusterConfig
,
FrameworkControllerClusterConfigAzure
,
FrameworkControllerClusterConfigNFS
,
FrameworkControllerClusterConfigFactory
}
from
'
./frameworkcontrollerConfig
'
;
import
{
FrameworkControllerJobRestServer
}
from
'
./frameworkcontrollerJobRestServer
'
;
import
{
FrameworkControllerClient
}
from
'
./frameworkcontrollerApiClient
'
;
import
{
FrameworkControllerClient
}
from
'
./frameworkcontrollerApiClient
'
;
import
{
FrameworkControllerClusterConfig
,
FrameworkControllerClusterConfigAzure
,
FrameworkControllerClusterConfigFactory
,
FrameworkControllerClusterConfigNFS
,
FrameworkControllerTrialConfig
}
from
'
./frameworkcontrollerConfig
'
;
import
{
FrameworkControllerJobInfoCollector
}
from
'
./frameworkcontrollerJobInfoCollector
'
;
import
{
FrameworkControllerJobInfoCollector
}
from
'
./frameworkcontrollerJobInfoCollector
'
;
import
{
FrameworkControllerJobRestServer
}
from
'
./frameworkcontrollerJobRestServer
'
;
/**
/**
* Training Service implementation for frameworkcontroller
* Training Service implementation for frameworkcontroller
...
@@ -49,30 +47,30 @@ import { FrameworkControllerJobInfoCollector } from './frameworkcontrollerJobInf
...
@@ -49,30 +47,30 @@ import { FrameworkControllerJobInfoCollector } from './frameworkcontrollerJobInf
@
component
.
Singleton
@
component
.
Singleton
class
FrameworkControllerTrainingService
extends
KubernetesTrainingService
implements
KubernetesTrainingService
{
class
FrameworkControllerTrainingService
extends
KubernetesTrainingService
implements
KubernetesTrainingService
{
private
fcTrialConfig
?:
FrameworkControllerTrialConfig
;
// frameworkcontroller trial configuration
private
fcTrialConfig
?:
FrameworkControllerTrialConfig
;
// frameworkcontroller trial configuration
private
fcJobInfoCollector
:
FrameworkControllerJobInfoCollector
;
// frameworkcontroller job info collector
private
readonly
fcJobInfoCollector
:
FrameworkControllerJobInfoCollector
;
// frameworkcontroller job info collector
private
fcContainerPortMap
=
new
Map
<
string
,
number
>
();
// store frameworkcontroller container port
private
readonly
fcContainerPortMap
:
Map
<
string
,
number
>
=
new
Map
<
string
,
number
>
();
// store frameworkcontroller container port
private
fcClusterConfig
?:
FrameworkControllerClusterConfig
;
private
fcClusterConfig
?:
FrameworkControllerClusterConfig
;
constructor
()
{
constructor
()
{
super
();
super
();
this
.
fcJobInfoCollector
=
new
FrameworkControllerJobInfoCollector
(
this
.
trialJobsMap
);
this
.
fcJobInfoCollector
=
new
FrameworkControllerJobInfoCollector
(
this
.
trialJobsMap
);
this
.
experimentId
=
getExperimentId
();
this
.
experimentId
=
getExperimentId
();
this
.
nextTrialSequenceId
=
-
1
;
this
.
nextTrialSequenceId
=
-
1
;
}
}
public
async
run
():
Promise
<
void
>
{
public
async
run
():
Promise
<
void
>
{
this
.
kubernetesJobRestServer
=
component
.
get
(
FrameworkControllerJobRestServer
);
this
.
kubernetesJobRestServer
=
component
.
get
(
FrameworkControllerJobRestServer
);
if
(
!
this
.
kubernetesJobRestServer
)
{
if
(
this
.
kubernetesJobRestServer
===
undefined
)
{
throw
new
Error
(
'
kubernetesJobRestServer not initialized!
'
);
throw
new
Error
(
'
kubernetesJobRestServer not initialized!
'
);
}
}
await
this
.
kubernetesJobRestServer
.
start
();
await
this
.
kubernetesJobRestServer
.
start
();
this
.
kubernetesJobRestServer
.
setEnableVersionCheck
=
this
.
versionCheck
;
this
.
kubernetesJobRestServer
.
setEnableVersionCheck
=
this
.
versionCheck
;
this
.
log
.
info
(
`frameworkcontroller Training service rest server listening on:
${
this
.
kubernetesJobRestServer
.
endPoint
}
`
);
this
.
log
.
info
(
`frameworkcontroller Training service rest server listening on:
${
this
.
kubernetesJobRestServer
.
endPoint
}
`
);
while
(
!
this
.
stopping
)
{
while
(
!
this
.
stopping
)
{
// collect metrics for frameworkcontroller jobs by interacting with Kubernetes API server
// collect metrics for frameworkcontroller jobs by interacting with Kubernetes API server
await
delay
(
3000
);
await
delay
(
3000
);
await
this
.
fcJobInfoCollector
.
retrieveTrialStatus
(
this
.
kubernetesCRDClient
);
await
this
.
fcJobInfoCollector
.
retrieveTrialStatus
(
this
.
kubernetesCRDClient
);
if
(
this
.
kubernetesJobRestServer
.
getErrorMessage
)
{
if
(
this
.
kubernetesJobRestServer
.
getErrorMessage
!==
undefined
)
{
throw
new
Error
(
this
.
kubernetesJobRestServer
.
getErrorMessage
);
throw
new
Error
(
this
.
kubernetesJobRestServer
.
getErrorMessage
);
this
.
stopping
=
true
;
this
.
stopping
=
true
;
}
}
...
@@ -80,14 +78,14 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
...
@@ -80,14 +78,14 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
}
}
public
async
submitTrialJob
(
form
:
JobApplicationForm
):
Promise
<
TrialJobDetail
>
{
public
async
submitTrialJob
(
form
:
JobApplicationForm
):
Promise
<
TrialJobDetail
>
{
if
(
!
this
.
fcClusterConfig
)
{
if
(
this
.
fcClusterConfig
===
undefined
)
{
throw
new
Error
(
'
frameworkcontrollerClusterConfig is not initialized
'
);
throw
new
Error
(
'
frameworkcontrollerClusterConfig is not initialized
'
);
}
}
if
(
!
this
.
kubernetesCRDClient
)
{
if
(
this
.
kubernetesCRDClient
===
undefined
)
{
throw
new
Error
(
'
kubernetesCRDClient is undefined
'
);
throw
new
Error
(
'
kubernetesCRDClient is undefined
'
);
}
}
if
(
!
this
.
kubernetesRestServerPort
)
{
if
(
this
.
kubernetesRestServerPort
===
undefined
)
{
const
restServer
:
FrameworkControllerJobRestServer
=
component
.
get
(
FrameworkControllerJobRestServer
);
const
restServer
:
FrameworkControllerJobRestServer
=
component
.
get
(
FrameworkControllerJobRestServer
);
this
.
kubernetesRestServerPort
=
restServer
.
clusterRestServerPort
;
this
.
kubernetesRestServerPort
=
restServer
.
clusterRestServerPort
;
}
}
...
@@ -97,14 +95,14 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
...
@@ -97,14 +95,14 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
// Set trial's NFS working folder
// Set trial's NFS working folder
const
trialWorkingFolder
:
string
=
path
.
join
(
this
.
CONTAINER_MOUNT_PATH
,
'
nni
'
,
getExperimentId
(),
trialJobId
);
const
trialWorkingFolder
:
string
=
path
.
join
(
this
.
CONTAINER_MOUNT_PATH
,
'
nni
'
,
getExperimentId
(),
trialJobId
);
const
trialLocalTempFolder
:
string
=
path
.
join
(
getExperimentRootDir
(),
'
trials-local
'
,
trialJobId
);
const
trialLocalTempFolder
:
string
=
path
.
join
(
getExperimentRootDir
(),
'
trials-local
'
,
trialJobId
);
const
frameworkcontrollerJobName
=
`nniexp
${
this
.
experimentId
}
trial
${
trialJobId
}
`
.
toLowerCase
();
const
frameworkcontrollerJobName
:
string
=
`nniexp
${
this
.
experimentId
}
trial
${
trialJobId
}
`
.
toLowerCase
();
//Generate the port used for taskRole
//Generate the port used for taskRole
this
.
generateContainerPort
();
this
.
generateContainerPort
();
await
this
.
prepareRunScript
(
trialLocalTempFolder
,
curTrialSequenceId
,
trialJobId
,
trialWorkingFolder
,
form
);
await
this
.
prepareRunScript
(
trialLocalTempFolder
,
curTrialSequenceId
,
trialJobId
,
trialWorkingFolder
,
form
);
//upload code files
//upload code files
le
t
trialJobOutputUrl
:
string
=
await
this
.
uploadCodeFiles
(
trialJobId
,
trialLocalTempFolder
);
cons
t
trialJobOutputUrl
:
string
=
await
this
.
uploadCodeFiles
(
trialJobId
,
trialLocalTempFolder
);
const
trialJobDetail
:
KubernetesTrialJobDetail
=
new
KubernetesTrialJobDetail
(
const
trialJobDetail
:
KubernetesTrialJobDetail
=
new
KubernetesTrialJobDetail
(
trialJobId
,
trialJobId
,
'
WAITING
'
,
'
WAITING
'
,
...
@@ -116,182 +114,202 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
...
@@ -116,182 +114,202 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
trialJobOutputUrl
trialJobOutputUrl
);
);
// Set trial job detail until create frameworkcontroller job successfully
// Set trial job detail until create frameworkcontroller job successfully
this
.
trialJobsMap
.
set
(
trialJobId
,
trialJobDetail
);
this
.
trialJobsMap
.
set
(
trialJobId
,
trialJobDetail
);
// Create frameworkcontroller job based on generated frameworkcontroller job resource config
// Create frameworkcontroller job based on generated frameworkcontroller job resource config
const
frameworkcontrollerJobConfig
=
await
this
.
prepareFrameworkControllerConfig
(
trialJobId
,
trialWorkingFolder
,
frameworkcontrollerJobName
);
// tslint:disable-next-line:no-any
const
frameworkcontrollerJobConfig
:
any
=
await
this
.
prepareFrameworkControllerConfig
(
trialJobId
,
trialWorkingFolder
,
frameworkcontrollerJobName
);
await
this
.
kubernetesCRDClient
.
createKubernetesJob
(
frameworkcontrollerJobConfig
);
await
this
.
kubernetesCRDClient
.
createKubernetesJob
(
frameworkcontrollerJobConfig
);
// Set trial job detail until create frameworkcontroller job successfully
// Set trial job detail until create frameworkcontroller job successfully
this
.
trialJobsMap
.
set
(
trialJobId
,
trialJobDetail
);
this
.
trialJobsMap
.
set
(
trialJobId
,
trialJobDetail
);
return
Promise
.
resolve
(
trialJobDetail
);
return
Promise
.
resolve
(
trialJobDetail
);
}
}
// tslint:disable:no-redundant-jsdoc no-any no-unsafe-any
/**
 * Apply a single cluster metadata entry to this training service.
 *
 * Handling per key (compared against `TrialConfigMetadataKey`):
 *  - NNI_MANAGER_IP: `value` is parsed as JSON and stored as the NNI manager IP config.
 *  - FRAMEWORKCONTROLLER_CLUSTER_CONFIG: `value` is parsed as JSON and turned into a cluster
 *    config by the factory; the matching storage backend (azureStorage or nfs) is then
 *    set up and the FrameworkController CRD client is created.
 *  - TRIAL_CONFIG: `value` is parsed as JSON, the trial config is built from its `codeDir`
 *    and `taskRoles`, and `codeDir` is validated.
 *  - VERSION_CHECK: enabled only when `value` is exactly 'true' or 'True'.
 *  - LOG_COLLECTION: `value` is stored unchanged.
 *  - any other key is ignored.
 *
 * @param key metadata key
 * @param value metadata value (a JSON string for the config-bearing keys)
 * @returns a promise that resolves once the entry has been applied; it rejects with an
 *          Error when code directory validation fails
 */
public async setClusterMetadata(key: string, value: string): Promise<void> {
    switch (key) {
        case TrialConfigMetadataKey.NNI_MANAGER_IP:
            this.nniManagerIpConfig = <NNIManagerIpConfig>JSON.parse(value);
            break;

        // Braces give each case its own scope so the `const` declarations below
        // are not visible from (or hoisted into) the other cases.
        case TrialConfigMetadataKey.FRAMEWORKCONTROLLER_CLUSTER_CONFIG: {
            const frameworkcontrollerClusterJsonObject: any = JSON.parse(value);
            this.fcClusterConfig = FrameworkControllerClusterConfigFactory
                .generateFrameworkControllerClusterConfig(frameworkcontrollerClusterJsonObject);
            if (this.fcClusterConfig.storageType === 'azureStorage') {
                const azureFrameworkControllerClusterConfig: FrameworkControllerClusterConfigAzure =
                    <FrameworkControllerClusterConfigAzure>this.fcClusterConfig;
                // Remember account and share on the instance before creating the storage.
                this.azureStorageAccountName = azureFrameworkControllerClusterConfig.azureStorage.accountName;
                this.azureStorageShare = azureFrameworkControllerClusterConfig.azureStorage.azureShare;
                await this.createAzureStorage(
                    azureFrameworkControllerClusterConfig.keyVault.vaultName,
                    azureFrameworkControllerClusterConfig.keyVault.name,
                    azureFrameworkControllerClusterConfig.azureStorage.accountName,
                    azureFrameworkControllerClusterConfig.azureStorage.azureShare
                );
            } else if (this.fcClusterConfig.storageType === 'nfs') {
                const nfsFrameworkControllerClusterConfig: FrameworkControllerClusterConfigNFS =
                    <FrameworkControllerClusterConfigNFS>this.fcClusterConfig;
                await this.createNFSStorage(
                    nfsFrameworkControllerClusterConfig.nfs.server,
                    nfsFrameworkControllerClusterConfig.nfs.path
                );
            }
            this.kubernetesCRDClient = FrameworkControllerClient.generateFrameworkControllerClient();
            break;
        }

        case TrialConfigMetadataKey.TRIAL_CONFIG: {
            const frameworkcontrollerTrialJsonObject: any = JSON.parse(value);
            this.fcTrialConfig = new FrameworkControllerTrialConfig(
                frameworkcontrollerTrialJsonObject.codeDir,
                frameworkcontrollerTrialJsonObject.taskRoles
            );

            // Validate to make sure codeDir doesn't have too many files
            try {
                await validateCodeDir(this.fcTrialConfig.codeDir);
            } catch (error) {
                this.log.error(error);

                // Pass an existing Error through untouched: wrapping it in `new Error(...)`
                // would stringify it ("Error: ...") and discard the original stack trace.
                return Promise.reject(error instanceof Error ? error : new Error(String(error)));
            }
            break;
        }

        case TrialConfigMetadataKey.VERSION_CHECK:
            this.versionCheck = (value === 'true' || value === 'True');
            break;

        case TrialConfigMetadataKey.LOG_COLLECTION:
            this.logCollection = value;
            break;

        default:
    }

    return Promise.resolve();
}
// tslint:enable: no-any no-unsafe-any
/**
/**
* upload code files to nfs or azureStorage
* upload code files to nfs or azureStorage
* @param trialJobId
* @param trialJobId
* @param trialLocalTempFolder
* @param trialLocalTempFolder
* return: trialJobOutputUrl
* return: trialJobOutputUrl
*/
*/
private
async
uploadCodeFiles
(
trialJobId
:
string
,
trialLocalTempFolder
:
string
):
Promise
<
string
>
{
private
async
uploadCodeFiles
(
trialJobId
:
string
,
trialLocalTempFolder
:
string
):
Promise
<
string
>
{
if
(
!
this
.
fcClusterConfig
)
{
if
(
this
.
fcClusterConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
}
}
let
trialJobOutputUrl
:
string
=
''
;
let
trialJobOutputUrl
:
string
=
''
;
if
(
this
.
fcClusterConfig
.
storageType
===
'
azureStorage
'
)
{
if
(
this
.
fcClusterConfig
.
storageType
===
'
azureStorage
'
)
{
try
{
if
(
this
.
azureStorageClient
===
undefined
)
{
throw
new
Error
(
'
azureStorageClient is not initialized
'
);
}
try
{
//upload local files to azure storage
//upload local files to azure storage
await
AzureStorageClientUtility
.
uploadDirectory
(
this
.
azureStorageClient
,
await
AzureStorageClientUtility
.
uploadDirectory
(
`nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
,
this
.
azureStorageShare
,
`
${
trialLocalTempFolder
}
`
);
this
.
azureStorageClient
,
`nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
,
this
.
azureStorageShare
,
`
${
trialLocalTempFolder
}
`
);
trialJobOutputUrl
=
`https://
${
this
.
azureStorageAccountName
}
.file.core.windows.net/
${
this
.
azureStorageShare
}
/
${
path
.
join
(
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
trialJobOutputUrl
=
`https://
${
this
.
azureStorageAccountName
}
.file.core.windows.net/\
}
catch
(
error
){
${
this
.
azureStorageShare
}
/
${
path
.
join
(
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
;
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
this
.
log
.
error
(
error
);
return
Promise
.
reject
(
error
);
return
Promise
.
reject
(
error
);
}
}
}
else
if
(
this
.
fcClusterConfig
.
storageType
===
'
nfs
'
)
{
}
else
if
(
this
.
fcClusterConfig
.
storageType
===
'
nfs
'
)
{
let
nfsFrameworkControllerClusterConfig
:
FrameworkControllerClusterConfigNFS
=
<
FrameworkControllerClusterConfigNFS
>
this
.
fcClusterConfig
;
const
nfsFrameworkControllerClusterConfig
:
FrameworkControllerClusterConfigNFS
=
// Create work dir for current trial in NFS directory
<
FrameworkControllerClusterConfigNFS
>
this
.
fcClusterConfig
;
// Create work dir for current trial in NFS directory
await
cpp
.
exec
(
`mkdir -p
${
this
.
trialLocalNFSTempFolder
}
/nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
);
await
cpp
.
exec
(
`mkdir -p
${
this
.
trialLocalNFSTempFolder
}
/nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
);
// Copy code files from local dir to NFS mounted dir
// Copy code files from local dir to NFS mounted dir
await
cpp
.
exec
(
`cp -r
${
trialLocalTempFolder
}
/*
${
this
.
trialLocalNFSTempFolder
}
/nni/
${
getExperimentId
()}
/
${
trialJobId
}
/.`
);
await
cpp
.
exec
(
`cp -r
${
trialLocalTempFolder
}
/*
${
this
.
trialLocalNFSTempFolder
}
/nni/
${
getExperimentId
()}
/
${
trialJobId
}
/.`
);
const
nfsConfig
:
NFSConfig
=
nfsFrameworkControllerClusterConfig
.
nfs
;
const
nfsConfig
:
NFSConfig
=
nfsFrameworkControllerClusterConfig
.
nfs
;
trialJobOutputUrl
=
`nfs://
${
nfsConfig
.
server
}
:
${
path
.
join
(
nfsConfig
.
path
,
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
trialJobOutputUrl
=
`nfs://
${
nfsConfig
.
server
}
:
${
path
.
join
(
nfsConfig
.
path
,
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
;
}
}
return
Promise
.
resolve
(
trialJobOutputUrl
);
return
Promise
.
resolve
(
trialJobOutputUrl
);
}
}
/**
/**
* generate trial's command for frameworkcontroller
* generate trial's command for frameworkcontroller
* expose port and execute injector.sh before executing user's command
* expose port and execute injector.sh before executing user's command
* @param command
* @param command
*/
*/
private
generateCommandScript
(
command
:
string
):
string
{
private
generateCommandScript
(
command
:
string
):
string
{
let
portScript
=
''
;
let
portScript
:
string
=
''
;
if
(
!
this
.
fcTrialConfig
)
{
if
(
this
.
fcTrialConfig
===
undefined
)
{
throw
new
Error
(
'
frameworkcontroller trial config is not initialized
'
);
throw
new
Error
(
'
frameworkcontroller trial config is not initialized
'
);
}
}
for
(
le
t
taskRole
of
this
.
fcTrialConfig
.
taskRoles
)
{
for
(
cons
t
taskRole
of
this
.
fcTrialConfig
.
taskRoles
)
{
portScript
+=
`FB_
${
taskRole
.
name
.
toUpperCase
()}
_PORT=
${
this
.
fcContainerPortMap
.
get
(
taskRole
.
name
)}
`
;
portScript
+=
`FB_
${
taskRole
.
name
.
toUpperCase
()}
_PORT=
${
this
.
fcContainerPortMap
.
get
(
taskRole
.
name
)}
`
;
}
}
return
`
${
portScript
}
. /mnt/frameworkbarrier/injector.sh &&
${
command
}
`
;
return
`
${
portScript
}
. /mnt/frameworkbarrier/injector.sh &&
${
command
}
`
;
}
}
private
async
prepareRunScript
(
trialLocalTempFolder
:
string
,
curTrialSequenceId
:
number
,
trialJobId
:
string
,
trialWorkingFolder
:
string
,
form
:
JobApplicationForm
):
Promise
<
void
>
{
private
async
prepareRunScript
(
trialLocalTempFolder
:
string
,
curTrialSequenceId
:
number
,
trialJobId
:
string
,
if
(
!
this
.
fcTrialConfig
)
{
trialWorkingFolder
:
string
,
form
:
JobApplicationForm
):
Promise
<
void
>
{
if
(
this
.
fcTrialConfig
===
undefined
)
{
throw
new
Error
(
'
frameworkcontroller trial config is not initialized
'
);
throw
new
Error
(
'
frameworkcontroller trial config is not initialized
'
);
}
}
await
cpp
.
exec
(
`mkdir -p
${
path
.
dirname
(
trialLocalTempFolder
)}
`
);
await
cpp
.
exec
(
`mkdir -p
${
path
.
dirname
(
trialLocalTempFolder
)}
`
);
await
cpp
.
exec
(
`cp -r
${
this
.
fcTrialConfig
.
codeDir
}
${
trialLocalTempFolder
}
`
);
await
cpp
.
exec
(
`cp -r
${
this
.
fcTrialConfig
.
codeDir
}
${
trialLocalTempFolder
}
`
);
const
run
ScriptContent
:
string
=
CONTAINER_INSTALL_NNI_SHELL_FORMAT
;
const
install
ScriptContent
:
string
=
CONTAINER_INSTALL_NNI_SHELL_FORMAT
;
// Write NNI installation file to local tmp files
// Write NNI installation file to local tmp files
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
install_nni.sh
'
),
run
ScriptContent
,
{
encoding
:
'
utf8
'
});
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
install_nni.sh
'
),
install
ScriptContent
,
{
encoding
:
'
utf8
'
});
// Create tmp trial working folder locally.
// Create tmp trial working folder locally.
await
cpp
.
exec
(
`mkdir -p
${
trialLocalTempFolder
}
`
);
await
cpp
.
exec
(
`mkdir -p
${
trialLocalTempFolder
}
`
);
for
(
let
taskRole
of
this
.
fcTrialConfig
.
taskRoles
)
{
for
(
const
taskRole
of
this
.
fcTrialConfig
.
taskRoles
)
{
const
runScriptContent
:
string
=
await
this
.
generateRunScript
(
'
frameworkcontroller
'
,
trialJobId
,
trialWorkingFolder
,
const
runScriptContent
:
string
=
this
.
generateCommandScript
(
taskRole
.
command
),
curTrialSequenceId
.
toString
(),
taskRole
.
name
,
taskRole
.
gpuNum
);
await
this
.
generateRunScript
(
'
frameworkcontroller
'
,
trialJobId
,
trialWorkingFolder
,
this
.
generateCommandScript
(
taskRole
.
command
),
curTrialSequenceId
.
toString
(),
taskRole
.
name
,
taskRole
.
gpuNum
);
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
`run_
${
taskRole
.
name
}
.sh`
),
runScriptContent
,
{
encoding
:
'
utf8
'
});
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
`run_
${
taskRole
.
name
}
.sh`
),
runScriptContent
,
{
encoding
:
'
utf8
'
});
}
}
// Write file content ( parameter.cfg ) to local tmp folders
// Write file content ( parameter.cfg ) to local tmp folders
const
trialForm
:
TrialJobApplicationForm
=
(
<
TrialJobApplicationForm
>
form
)
const
trialForm
:
TrialJobApplicationForm
=
(
<
TrialJobApplicationForm
>
form
)
;
if
(
trialForm
&&
trialForm
.
hyperParameters
)
{
if
(
trialForm
!==
undefined
&&
trialForm
.
hyperParameters
!==
undefined
)
{
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
generateParamFileName
(
trialForm
.
hyperParameters
)),
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
generateParamFileName
(
trialForm
.
hyperParameters
)),
trialForm
.
hyperParameters
.
value
,
{
encoding
:
'
utf8
'
});
trialForm
.
hyperParameters
.
value
,
{
encoding
:
'
utf8
'
});
}
}
}
}
private
async
prepareFrameworkControllerConfig
(
trialJobId
:
string
,
trialWorkingFolder
:
string
,
frameworkcontrollerJobName
:
string
):
Promise
<
any
>
{
if
(
!
this
.
fcTrialConfig
)
{
// tslint:disable: no-any no-unsafe-any
private
async
prepareFrameworkControllerConfig
(
trialJobId
:
string
,
trialWorkingFolder
:
string
,
frameworkcontrollerJobName
:
string
):
Promise
<
any
>
{
if
(
this
.
fcTrialConfig
===
undefined
)
{
throw
new
Error
(
'
frameworkcontroller trial config is not initialized
'
);
throw
new
Error
(
'
frameworkcontroller trial config is not initialized
'
);
}
}
const
podResources
:
any
=
[];
const
podResources
:
any
=
[];
for
(
le
t
taskRole
of
this
.
fcTrialConfig
.
taskRoles
)
{
for
(
cons
t
taskRole
of
this
.
fcTrialConfig
.
taskRoles
)
{
le
t
resource
:
any
=
{};
cons
t
resource
:
any
=
{};
resource
.
requests
=
this
.
generatePodResource
(
taskRole
.
memoryMB
,
taskRole
.
cpuNum
,
taskRole
.
gpuNum
);
resource
.
requests
=
this
.
generatePodResource
(
taskRole
.
memoryMB
,
taskRole
.
cpuNum
,
taskRole
.
gpuNum
);
resource
.
limits
=
Object
.
assign
({},
resource
.
requests
)
;
resource
.
limits
=
{...
resource
.
requests
}
;
podResources
.
push
(
resource
);
podResources
.
push
(
resource
);
}
}
// Generate frameworkcontroller job resource config object
// Generate frameworkcontroller job resource config object
const
frameworkcontrollerJobConfig
:
any
=
this
.
generateFrameworkControllerJobConfig
(
trialJobId
,
trialWorkingFolder
,
frameworkcontrollerJobName
,
podResources
);
const
frameworkcontrollerJobConfig
:
any
=
this
.
generateFrameworkControllerJobConfig
(
trialJobId
,
trialWorkingFolder
,
frameworkcontrollerJobName
,
podResources
);
return
Promise
.
resolve
(
frameworkcontrollerJobConfig
);
return
Promise
.
resolve
(
frameworkcontrollerJobConfig
);
}
/**
 * Store one piece of cluster metadata on this training service, keyed by `key`.
 *
 * Keys are matched against `TrialConfigMetadataKey`. The cluster-config key also sets up
 * the configured storage backend (azureStorage or nfs) and creates the FrameworkController
 * CRD client; the trial-config key also validates the trial's code directory. Keys that
 * match no case are ignored.
 *
 * @param key metadata key
 * @param value metadata value; parsed as JSON for NNI_MANAGER_IP,
 *              FRAMEWORKCONTROLLER_CLUSTER_CONFIG and TRIAL_CONFIG
 * @returns a promise resolved after the metadata has been applied, or rejected with an
 *          Error if validating the code directory fails
 */
public async setClusterMetadata(key: string, value: string): Promise<void> {
    switch (key) {
        case TrialConfigMetadataKey.NNI_MANAGER_IP:
            this.nniManagerIpConfig = <NNIManagerIpConfig>JSON.parse(value);
            break;
        case TrialConfigMetadataKey.FRAMEWORKCONTROLLER_CLUSTER_CONFIG: {
            // Block-scoped so these bindings do not leak into the sibling cases;
            // none of them is reassigned, hence `const`.
            const clusterJson = JSON.parse(value);
            this.fcClusterConfig =
                FrameworkControllerClusterConfigFactory.generateFrameworkControllerClusterConfig(clusterJson);
            if (this.fcClusterConfig.storageType === 'azureStorage') {
                const azureConfig = <FrameworkControllerClusterConfigAzure>this.fcClusterConfig;
                this.azureStorageAccountName = azureConfig.azureStorage.accountName;
                this.azureStorageShare = azureConfig.azureStorage.azureShare;
                await this.createAzureStorage(
                    azureConfig.keyVault.vaultName,
                    azureConfig.keyVault.name,
                    azureConfig.azureStorage.accountName,
                    azureConfig.azureStorage.azureShare
                );
            } else if (this.fcClusterConfig.storageType === 'nfs') {
                const nfsConfig = <FrameworkControllerClusterConfigNFS>this.fcClusterConfig;
                await this.createNFSStorage(nfsConfig.nfs.server, nfsConfig.nfs.path);
            }
            // The CRD client is created for every storage type.
            this.kubernetesCRDClient = FrameworkControllerClient.generateFrameworkControllerClient();
            break;
        }
        case TrialConfigMetadataKey.TRIAL_CONFIG: {
            const trialJson = JSON.parse(value);
            this.fcTrialConfig = new FrameworkControllerTrialConfig(trialJson.codeDir, trialJson.taskRoles);

            // Validate to make sure codeDir doesn't have too many files
            try {
                await validateCodeDir(this.fcTrialConfig.codeDir);
            } catch (error) {
                this.log.error(error);

                // Reject with the caught Error itself when it already is one; `new Error(error)`
                // would turn it into a string and lose the original stack.
                return Promise.reject(error instanceof Error ? error : new Error(String(error)));
            }
            break;
        }
        case TrialConfigMetadataKey.VERSION_CHECK:
            // Only the exact strings 'true' and 'True' enable the version check.
            this.versionCheck = (value === 'true' || value === 'True');
            break;
        case TrialConfigMetadataKey.LOG_COLLECTION:
            this.logCollection = value;
            break;
        default:
            break;
    }

    return Promise.resolve();
}
}
private
generateContainerPort
()
{
private
generateContainerPort
()
:
void
{
if
(
!
this
.
fcTrialConfig
)
{
if
(
this
.
fcTrialConfig
===
undefined
)
{
throw
new
Error
(
'
frameworkcontroller trial config is not initialized
'
);
throw
new
Error
(
'
frameworkcontroller trial config is not initialized
'
);
}
}
let
port
=
4000
;
//The default port used in container
let
port
:
number
=
4000
;
//The default port used in container
for
(
le
t
index
in
this
.
fcTrialConfig
.
taskRoles
)
{
for
(
cons
t
index
of
this
.
fcTrialConfig
.
taskRoles
.
keys
()
)
{
this
.
fcContainerPortMap
.
set
(
this
.
fcTrialConfig
.
taskRoles
[
index
].
name
,
port
);
this
.
fcContainerPortMap
.
set
(
this
.
fcTrialConfig
.
taskRoles
[
index
].
name
,
port
);
port
+=
1
;
port
+=
1
;
}
}
...
@@ -304,24 +322,25 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
...
@@ -304,24 +322,25 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
* @param frameworkcontrollerJobName job name
* @param frameworkcontrollerJobName job name
* @param podResources pod template
* @param podResources pod template
*/
*/
private
generateFrameworkControllerJobConfig
(
trialJobId
:
string
,
trialWorkingFolder
:
string
,
frameworkcontrollerJobName
:
string
,
podResources
:
any
)
:
any
{
private
generateFrameworkControllerJobConfig
(
trialJobId
:
string
,
trialWorkingFolder
:
string
,
if
(
!
this
.
fcClusterConfig
)
{
frameworkcontrollerJobName
:
string
,
podResources
:
any
)
:
any
{
if
(
this
.
fcClusterConfig
===
undefined
)
{
throw
new
Error
(
'
frameworkcontroller Cluster config is not initialized
'
);
throw
new
Error
(
'
frameworkcontroller Cluster config is not initialized
'
);
}
}
if
(
!
this
.
fcTrialConfig
)
{
if
(
this
.
fcTrialConfig
===
undefined
)
{
throw
new
Error
(
'
frameworkcontroller trial config is not initialized
'
);
throw
new
Error
(
'
frameworkcontroller trial config is not initialized
'
);
}
}
le
t
taskRoles
=
[];
cons
t
taskRoles
:
any
=
[];
for
(
le
t
index
in
this
.
fcTrialConfig
.
taskRoles
)
{
for
(
cons
t
index
of
this
.
fcTrialConfig
.
taskRoles
.
keys
()
)
{
le
t
containerPort
=
this
.
fcContainerPortMap
.
get
(
this
.
fcTrialConfig
.
taskRoles
[
index
].
name
);
cons
t
containerPort
:
number
|
undefined
=
this
.
fcContainerPortMap
.
get
(
this
.
fcTrialConfig
.
taskRoles
[
index
].
name
);
if
(
!
containerPort
)
{
if
(
containerPort
===
undefined
)
{
throw
new
Error
(
'
Container port is not initialized
'
);
throw
new
Error
(
'
Container port is not initialized
'
);
}
}
le
t
taskRole
=
this
.
generateTaskRoleConfig
(
cons
t
taskRole
:
any
=
this
.
generateTaskRoleConfig
(
trialWorkingFolder
,
trialWorkingFolder
,
this
.
fcTrialConfig
.
taskRoles
[
index
].
image
,
this
.
fcTrialConfig
.
taskRoles
[
index
].
image
,
`run_
${
this
.
fcTrialConfig
.
taskRoles
[
index
].
name
}
.sh`
,
`run_
${
this
.
fcTrialConfig
.
taskRoles
[
index
].
name
}
.sh`
,
podResources
[
index
],
podResources
[
index
],
containerPort
containerPort
...
@@ -330,17 +349,17 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
...
@@ -330,17 +349,17 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
name
:
this
.
fcTrialConfig
.
taskRoles
[
index
].
name
,
name
:
this
.
fcTrialConfig
.
taskRoles
[
index
].
name
,
taskNumber
:
this
.
fcTrialConfig
.
taskRoles
[
index
].
taskNum
,
taskNumber
:
this
.
fcTrialConfig
.
taskRoles
[
index
].
taskNum
,
frameworkAttemptCompletionPolicy
:
{
frameworkAttemptCompletionPolicy
:
{
minFailedTaskCount
:
this
.
fcTrialConfig
.
taskRoles
[
index
].
frameworkAttemptCompletionPolicy
.
minFailedTaskCount
,
minFailedTaskCount
:
this
.
fcTrialConfig
.
taskRoles
[
index
].
frameworkAttemptCompletionPolicy
.
minFailedTaskCount
,
minSucceededTaskCount
:
this
.
fcTrialConfig
.
taskRoles
[
index
].
frameworkAttemptCompletionPolicy
.
minSucceededTaskCount
minSucceededTaskCount
:
this
.
fcTrialConfig
.
taskRoles
[
index
].
frameworkAttemptCompletionPolicy
.
minSucceededTaskCount
},
},
task
:
taskRole
task
:
taskRole
});
});
}
}
return
{
return
{
apiVersion
:
`frameworkcontroller.microsoft.com/v1`
,
apiVersion
:
`frameworkcontroller.microsoft.com/v1`
,
kind
:
'
Framework
'
,
kind
:
'
Framework
'
,
metadata
:
{
metadata
:
{
name
:
frameworkcontrollerJobName
,
name
:
frameworkcontrollerJobName
,
namespace
:
'
default
'
,
namespace
:
'
default
'
,
labels
:
{
labels
:
{
...
@@ -356,19 +375,18 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
...
@@ -356,19 +375,18 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
};
};
}
}
private
generateTaskRoleConfig
(
trialWorkingFolder
:
string
,
replicaImage
:
string
,
runScriptFile
:
string
,
podResources
:
any
,
containerPort
:
number
):
any
{
private
generateTaskRoleConfig
(
trialWorkingFolder
:
string
,
replicaImage
:
string
,
runScriptFile
:
string
,
podResources
:
any
,
containerPort
:
number
):
any
{
if
(
this
.
fcClusterConfig
===
undefined
)
{
if
(
!
this
.
fcClusterConfig
)
{
throw
new
Error
(
'
frameworkcontroller Cluster config is not initialized
'
);
throw
new
Error
(
'
frameworkcontroller Cluster config is not initialized
'
);
}
}
if
(
!
this
.
fcTrialConfig
)
{
if
(
this
.
fcTrialConfig
===
undefined
)
{
throw
new
Error
(
'
frameworkcontroller trial config is not initialized
'
);
throw
new
Error
(
'
frameworkcontroller trial config is not initialized
'
);
}
}
le
t
volumeSpecMap
=
new
Map
<
string
,
object
>
();
cons
t
volumeSpecMap
:
Map
<
string
,
object
>
=
new
Map
<
string
,
object
>
();
if
(
this
.
fcClusterConfig
.
storageType
===
'
azureStorage
'
){
if
(
this
.
fcClusterConfig
.
storageType
===
'
azureStorage
'
)
{
volumeSpecMap
.
set
(
'
nniVolumes
'
,
[
volumeSpecMap
.
set
(
'
nniVolumes
'
,
[
{
{
name
:
'
nni-vol
'
,
name
:
'
nni-vol
'
,
...
@@ -380,9 +398,10 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
...
@@ -380,9 +398,10 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
},
{
},
{
name
:
'
frameworkbarrier-volume
'
,
name
:
'
frameworkbarrier-volume
'
,
emptyDir
:
{}
emptyDir
:
{}
}])
}]);
}
else
{
}
else
{
let
frameworkcontrollerClusterConfigNFS
:
FrameworkControllerClusterConfigNFS
=
<
FrameworkControllerClusterConfigNFS
>
this
.
fcClusterConfig
;
const
frameworkcontrollerClusterConfigNFS
:
FrameworkControllerClusterConfigNFS
=
<
FrameworkControllerClusterConfigNFS
>
this
.
fcClusterConfig
;
volumeSpecMap
.
set
(
'
nniVolumes
'
,
[
volumeSpecMap
.
set
(
'
nniVolumes
'
,
[
{
{
name
:
'
nni-vol
'
,
name
:
'
nni-vol
'
,
...
@@ -393,19 +412,19 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
...
@@ -393,19 +412,19 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
},
{
},
{
name
:
'
frameworkbarrier-volume
'
,
name
:
'
frameworkbarrier-volume
'
,
emptyDir
:
{}
emptyDir
:
{}
}])
}])
;
}
}
le
t
containers
=
[
cons
t
containers
:
any
=
[
{
{
name
:
'
framework
'
,
name
:
'
framework
'
,
image
:
replicaImage
,
image
:
replicaImage
,
command
:
[
"
sh
"
,
`
${
path
.
join
(
trialWorkingFolder
,
runScriptFile
)}
`
],
command
:
[
'
sh
'
,
`
${
path
.
join
(
trialWorkingFolder
,
runScriptFile
)}
`
],
volumeMounts
:
[
volumeMounts
:
[
{
{
name
:
'
nni-vol
'
,
name
:
'
nni-vol
'
,
mountPath
:
this
.
CONTAINER_MOUNT_PATH
mountPath
:
this
.
CONTAINER_MOUNT_PATH
},{
},
{
name
:
'
frameworkbarrier-volume
'
,
name
:
'
frameworkbarrier-volume
'
,
mountPath
:
'
/mnt/frameworkbarrier
'
mountPath
:
'
/mnt/frameworkbarrier
'
}],
}],
...
@@ -413,35 +432,36 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
...
@@ -413,35 +432,36 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
ports
:
[{
ports
:
[{
containerPort
:
containerPort
containerPort
:
containerPort
}]
}]
}]
}]
;
le
t
initContainers
=
[
cons
t
initContainers
:
any
=
[
{
{
name
:
'
frameworkbarrier
'
,
name
:
'
frameworkbarrier
'
,
image
:
'
frameworkcontroller/frameworkbarrier
'
,
image
:
'
frameworkcontroller/frameworkbarrier
'
,
volumeMounts
:
[
volumeMounts
:
[
{
{
name
:
'
frameworkbarrier-volume
'
,
name
:
'
frameworkbarrier-volume
'
,
mountPath
:
'
/mnt/frameworkbarrier
'
mountPath
:
'
/mnt/frameworkbarrier
'
}]
}]
}]
}]
;
le
t
spec
:
any
=
{
cons
t
spec
:
any
=
{
containers
:
containers
,
containers
:
containers
,
initContainers
:
initContainers
,
initContainers
:
initContainers
,
restartPolicy
:
'
OnFailure
'
,
restartPolicy
:
'
OnFailure
'
,
volumes
:
volumeSpecMap
.
get
(
'
nniVolumes
'
),
volumes
:
volumeSpecMap
.
get
(
'
nniVolumes
'
),
hostNetwork
:
false
hostNetwork
:
false
};
};
if
(
this
.
fcClusterConfig
.
serviceAccountName
)
{
if
(
this
.
fcClusterConfig
.
serviceAccountName
!==
undefined
)
{
spec
.
serviceAccountName
=
this
.
fcClusterConfig
.
serviceAccountName
;
spec
.
serviceAccountName
=
this
.
fcClusterConfig
.
serviceAccountName
;
}
}
let
taskRole
=
{
return
{
pod
:
{
pod
:
{
spec
:
spec
spec
:
spec
}
}
}
};
return
taskRole
;
}
}
// tslint:enable: no-any no-unsafe-any
}
}
export
{
FrameworkControllerTrainingService
}
export
{
FrameworkControllerTrainingService
}
;
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowApiClient.ts
View file @
ba8dccd6
...
@@ -20,18 +20,22 @@
...
@@ -20,18 +20,22 @@
'
use strict
'
;
'
use strict
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
fs
from
'
fs
'
;
import
{
GeneralK8sClient
,
KubernetesCRDClient
}
from
'
../kubernetesApiClient
'
;
import
{
KubeflowOperator
}
from
'
./kubeflowConfig
'
;
import
{
KubeflowOperator
}
from
'
./kubeflowConfig
'
;
import
{
KubernetesCRDClient
,
GeneralK8sClient
}
from
'
../kubernetesApiClient
'
;
abstract
class
KubeflowOperatorClient
extends
KubernetesCRDClient
{
/**
* KubeflowOperator Client
*/
abstract
class
KubeflowOperatorClient
extends
KubernetesCRDClient
{
/**
/**
* Factory method to generate operator cliet
* Factory method to generate operator clie
n
t
*/
*/
public
static
generateOperatorClient
(
kubeflowOperator
:
KubeflowOperator
,
// tslint:disable-next-line:function-name
operatorApiVersion
:
string
):
KubernetesCRDClient
{
public
static
generateOperatorClient
(
kubeflowOperator
:
KubeflowOperator
,
switch
(
kubeflowOperator
)
{
operatorApiVersion
:
string
):
KubernetesCRDClient
{
switch
(
kubeflowOperator
)
{
case
'
tf-operator
'
:
{
case
'
tf-operator
'
:
{
switch
(
operatorApiVersion
)
{
switch
(
operatorApiVersion
)
{
case
'
v1alpha2
'
:
{
case
'
v1alpha2
'
:
{
return
new
TFOperatorClientV1Alpha2
();
return
new
TFOperatorClientV1Alpha2
();
}
}
...
@@ -41,11 +45,12 @@ abstract class KubeflowOperatorClient extends KubernetesCRDClient{
...
@@ -41,11 +45,12 @@ abstract class KubeflowOperatorClient extends KubernetesCRDClient{
case
'
v1beta2
'
:
{
case
'
v1beta2
'
:
{
return
new
TFOperatorClientV1Beta2
();
return
new
TFOperatorClientV1Beta2
();
}
}
default
:
throw
new
Error
(
`Invalid tf-operator apiVersion
${
operatorApiVersion
}
`
);
}
}
break
;
}
}
case
'
pytorch-operator
'
:
{
case
'
pytorch-operator
'
:
{
switch
(
operatorApiVersion
)
{
switch
(
operatorApiVersion
)
{
case
'
v1alpha2
'
:
{
case
'
v1alpha2
'
:
{
return
new
PyTorchOperatorClientV1Alpha2
();
return
new
PyTorchOperatorClientV1Alpha2
();
}
}
...
@@ -55,13 +60,17 @@ abstract class KubeflowOperatorClient extends KubernetesCRDClient{
...
@@ -55,13 +60,17 @@ abstract class KubeflowOperatorClient extends KubernetesCRDClient{
case
'
v1beta2
'
:
{
case
'
v1beta2
'
:
{
return
new
PyTorchOperatorClientV1Beta2
();
return
new
PyTorchOperatorClientV1Beta2
();
}
}
default
:
throw
new
Error
(
`Invalid pytorch-operator apiVersion
${
operatorApiVersion
}
`
);
}
}
}
}
default
:
throw
new
Error
(
`Invalid operator
${
kubeflowOperator
}
`
);
}
}
throw
new
Error
(
`Invalid operator
${
kubeflowOperator
}
or apiVersion
${
operatorApiVersion
}
`
);
}
}
}
}
// tslint:disable: no-unsafe-any no-any completed-docs
class
TFOperatorClientV1Alpha2
extends
KubeflowOperatorClient
{
class
TFOperatorClientV1Alpha2
extends
KubeflowOperatorClient
{
/**
/**
* constructor, to initialize tfjob CRD definition
* constructor, to initialize tfjob CRD definition
...
@@ -73,12 +82,12 @@ class TFOperatorClientV1Alpha2 extends KubeflowOperatorClient {
...
@@ -73,12 +82,12 @@ class TFOperatorClientV1Alpha2 extends KubeflowOperatorClient {
}
}
protected
get
operator
():
any
{
protected
get
operator
():
any
{
return
this
.
client
.
apis
[
"
kubeflow.org
"
].
v1alpha2
.
namespaces
(
'
default
'
).
tfjobs
;
return
this
.
client
.
apis
[
'
kubeflow.org
'
].
v1alpha2
.
namespaces
(
'
default
'
).
tfjobs
;
}
}
public
get
containerName
():
string
{
public
get
containerName
():
string
{
return
'
tensorflow
'
;
return
'
tensorflow
'
;
}
}
}
}
class
TFOperatorClientV1Beta1
extends
KubernetesCRDClient
{
class
TFOperatorClientV1Beta1
extends
KubernetesCRDClient
{
...
@@ -92,12 +101,12 @@ class TFOperatorClientV1Beta1 extends KubernetesCRDClient {
...
@@ -92,12 +101,12 @@ class TFOperatorClientV1Beta1 extends KubernetesCRDClient {
}
}
protected
get
operator
():
any
{
protected
get
operator
():
any
{
return
this
.
client
.
apis
[
"
kubeflow.org
"
].
v1beta1
.
namespaces
(
'
default
'
).
tfjobs
;
return
this
.
client
.
apis
[
'
kubeflow.org
'
].
v1beta1
.
namespaces
(
'
default
'
).
tfjobs
;
}
}
public
get
containerName
():
string
{
public
get
containerName
():
string
{
return
'
tensorflow
'
;
return
'
tensorflow
'
;
}
}
}
}
class
TFOperatorClientV1Beta2
extends
KubernetesCRDClient
{
class
TFOperatorClientV1Beta2
extends
KubernetesCRDClient
{
...
@@ -111,12 +120,12 @@ class TFOperatorClientV1Beta2 extends KubernetesCRDClient {
...
@@ -111,12 +120,12 @@ class TFOperatorClientV1Beta2 extends KubernetesCRDClient {
}
}
protected
get
operator
():
any
{
protected
get
operator
():
any
{
return
this
.
client
.
apis
[
"
kubeflow.org
"
].
v1beta2
.
namespaces
(
'
default
'
).
tfjobs
;
return
this
.
client
.
apis
[
'
kubeflow.org
'
].
v1beta2
.
namespaces
(
'
default
'
).
tfjobs
;
}
}
public
get
containerName
():
string
{
public
get
containerName
():
string
{
return
'
tensorflow
'
;
return
'
tensorflow
'
;
}
}
}
}
class
PyTorchOperatorClientV1Alpha2
extends
KubeflowOperatorClient
{
class
PyTorchOperatorClientV1Alpha2
extends
KubeflowOperatorClient
{
...
@@ -130,7 +139,7 @@ class PyTorchOperatorClientV1Alpha2 extends KubeflowOperatorClient {
...
@@ -130,7 +139,7 @@ class PyTorchOperatorClientV1Alpha2 extends KubeflowOperatorClient {
}
}
protected
get
operator
():
any
{
protected
get
operator
():
any
{
return
this
.
client
.
apis
[
"
kubeflow.org
"
].
v1alpha2
.
namespaces
(
'
default
'
).
pytorchjobs
;
return
this
.
client
.
apis
[
'
kubeflow.org
'
].
v1alpha2
.
namespaces
(
'
default
'
).
pytorchjobs
;
}
}
public
get
containerName
():
string
{
public
get
containerName
():
string
{
...
@@ -149,7 +158,7 @@ class PyTorchOperatorClientV1Beta1 extends KubernetesCRDClient {
...
@@ -149,7 +158,7 @@ class PyTorchOperatorClientV1Beta1 extends KubernetesCRDClient {
}
}
protected
get
operator
():
any
{
protected
get
operator
():
any
{
return
this
.
client
.
apis
[
"
kubeflow.org
"
].
v1beta1
.
namespaces
(
'
default
'
).
pytorchjobs
;
return
this
.
client
.
apis
[
'
kubeflow.org
'
].
v1beta1
.
namespaces
(
'
default
'
).
pytorchjobs
;
}
}
public
get
containerName
():
string
{
public
get
containerName
():
string
{
...
@@ -168,7 +177,7 @@ class PyTorchOperatorClientV1Beta2 extends KubernetesCRDClient {
...
@@ -168,7 +177,7 @@ class PyTorchOperatorClientV1Beta2 extends KubernetesCRDClient {
}
}
protected
get
operator
():
any
{
protected
get
operator
():
any
{
return
this
.
client
.
apis
[
"
kubeflow.org
"
].
v1beta2
.
namespaces
(
'
default
'
).
pytorchjobs
;
return
this
.
client
.
apis
[
'
kubeflow.org
'
].
v1beta2
.
namespaces
(
'
default
'
).
pytorchjobs
;
}
}
public
get
containerName
():
string
{
public
get
containerName
():
string
{
...
@@ -176,5 +185,5 @@ class PyTorchOperatorClientV1Beta2 extends KubernetesCRDClient {
...
@@ -176,5 +185,5 @@ class PyTorchOperatorClientV1Beta2 extends KubernetesCRDClient {
}
}
}
}
// tslint:enable: no-unsafe-any
export
{
KubeflowOperatorClient
,
GeneralK8sClient
};
export
{
KubeflowOperatorClient
,
GeneralK8sClient
};
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowConfig.ts
View file @
ba8dccd6
...
@@ -20,16 +20,20 @@
...
@@ -20,16 +20,20 @@
'
use strict
'
;
'
use strict
'
;
import
*
as
assert
from
'
assert
'
;
import
*
as
assert
from
'
assert
'
;
import
{
KubernetesClusterConfigAzure
,
KubernetesClusterConfigNFS
,
KubernetesStorageKind
,
NFSConfig
,
AzureStorage
,
keyVaultConfig
,
KubernetesTrialConfig
,
KubernetesTrialConfigTemplate
,
StorageConfig
,
KubernetesClusterConfig
}
from
'
../kubernetesConfig
'
import
{
MethodNotImplementedError
}
from
'
../../../common/errors
'
;
import
{
MethodNotImplementedError
}
from
'
../../../common/errors
'
;
import
{
AzureStorage
,
KeyVaultConfig
,
KubernetesClusterConfig
,
KubernetesClusterConfigAzure
,
KubernetesClusterConfigNFS
,
KubernetesStorageKind
,
KubernetesTrialConfig
,
KubernetesTrialConfigTemplate
,
NFSConfig
,
StorageConfig
}
from
'
../kubernetesConfig
'
;
/
**
operator types that kubeflow supported
*/
/
/
operator types that kubeflow supported
export
type
KubeflowOperator
=
'
tf-operator
'
|
'
pytorch-operator
'
;
export
type
KubeflowOperator
=
'
tf-operator
'
|
'
pytorch-operator
'
;
export
type
DistTrainRole
=
'
worker
'
|
'
ps
'
|
'
master
'
;
export
type
DistTrainRole
=
'
worker
'
|
'
ps
'
|
'
master
'
;
export
type
KubeflowJobStatus
=
'
Created
'
|
'
Running
'
|
'
Failed
'
|
'
Succeeded
'
;
export
type
KubeflowJobStatus
=
'
Created
'
|
'
Running
'
|
'
Failed
'
|
'
Succeeded
'
;
export
type
OperatorApiVersion
=
'
v1alpha2
'
|
'
v1beta1
'
|
'
v1beta2
'
;
export
type
OperatorApiVersion
=
'
v1alpha2
'
|
'
v1beta1
'
|
'
v1beta2
'
;
/**
* Kubeflow Cluster Configuration
*/
export
class
KubeflowClusterConfig
extends
KubernetesClusterConfig
{
export
class
KubeflowClusterConfig
extends
KubernetesClusterConfig
{
public
readonly
operator
:
KubeflowOperator
;
public
readonly
operator
:
KubeflowOperator
;
constructor
(
apiVersion
:
string
,
operator
:
KubeflowOperator
)
{
constructor
(
apiVersion
:
string
,
operator
:
KubeflowOperator
)
{
...
@@ -38,11 +42,12 @@ export class KubeflowClusterConfig extends KubernetesClusterConfig {
...
@@ -38,11 +42,12 @@ export class KubeflowClusterConfig extends KubernetesClusterConfig {
}
}
}
}
// tslint:disable:completed-docs
export
class
KubeflowClusterConfigNFS
extends
KubernetesClusterConfigNFS
{
export
class
KubeflowClusterConfigNFS
extends
KubernetesClusterConfigNFS
{
public
readonly
operator
:
KubeflowOperator
;
public
readonly
operator
:
KubeflowOperator
;
constructor
(
constructor
(
operator
:
KubeflowOperator
,
operator
:
KubeflowOperator
,
apiVersion
:
string
,
apiVersion
:
string
,
nfs
:
NFSConfig
,
nfs
:
NFSConfig
,
storage
?:
KubernetesStorageKind
storage
?:
KubernetesStorageKind
)
{
)
{
...
@@ -54,9 +59,11 @@ export class KubeflowClusterConfigNFS extends KubernetesClusterConfigNFS {
...
@@ -54,9 +59,11 @@ export class KubeflowClusterConfigNFS extends KubernetesClusterConfigNFS {
return
'
nfs
'
;
return
'
nfs
'
;
}
}
// tslint:disable-next-line:function-name
public
static
getInstance
(
jsonObject
:
object
):
KubeflowClusterConfigNFS
{
public
static
getInstance
(
jsonObject
:
object
):
KubeflowClusterConfigNFS
{
let
kubeflowClusterConfigObjectNFS
=
<
KubeflowClusterConfigNFS
>
jsonObject
;
const
kubeflowClusterConfigObjectNFS
:
KubeflowClusterConfigNFS
=
<
KubeflowClusterConfigNFS
>
jsonObject
;
assert
(
kubeflowClusterConfigObjectNFS
!==
undefined
)
assert
(
kubeflowClusterConfigObjectNFS
!==
undefined
);
return
new
KubeflowClusterConfigNFS
(
return
new
KubeflowClusterConfigNFS
(
kubeflowClusterConfigObjectNFS
.
operator
,
kubeflowClusterConfigObjectNFS
.
operator
,
kubeflowClusterConfigObjectNFS
.
apiVersion
,
kubeflowClusterConfigObjectNFS
.
apiVersion
,
...
@@ -66,26 +73,28 @@ export class KubeflowClusterConfigNFS extends KubernetesClusterConfigNFS {
...
@@ -66,26 +73,28 @@ export class KubeflowClusterConfigNFS extends KubernetesClusterConfigNFS {
}
}
}
}
export
class
KubeflowClusterConfigAzure
extends
KubernetesClusterConfigAzure
{
export
class
KubeflowClusterConfigAzure
extends
KubernetesClusterConfigAzure
{
public
readonly
operator
:
KubeflowOperator
;
public
readonly
operator
:
KubeflowOperator
;
/**
 * @param operator Kubeflow operator this cluster config targets
 * @param apiVersion operator API version, forwarded to the base class
 * @param keyVault key vault settings, forwarded to the base class
 * @param azureStorage Azure storage settings, forwarded to the base class
 * @param storage optional storage kind, forwarded to the base class
 */
constructor(
    operator: KubeflowOperator,
    apiVersion: string,
    keyVault: KeyVaultConfig,
    azureStorage: AzureStorage,
    storage?: KubernetesStorageKind
) {
    super(apiVersion, keyVault, azureStorage, storage);
    this.operator = operator;
}
/** Storage kind of this config; always 'azureStorage' for the Azure variant. */
public get storageType(): KubernetesStorageKind {
    return 'azureStorage';
}
// tslint:disable-next-line:function-name
public
static
getInstance
(
jsonObject
:
object
):
KubeflowClusterConfigAzure
{
public
static
getInstance
(
jsonObject
:
object
):
KubeflowClusterConfigAzure
{
let
kubeflowClusterConfigObjectAzure
=
<
KubeflowClusterConfigAzure
>
jsonObject
;
const
kubeflowClusterConfigObjectAzure
:
KubeflowClusterConfigAzure
=
<
KubeflowClusterConfigAzure
>
jsonObject
;
return
new
KubeflowClusterConfigAzure
(
return
new
KubeflowClusterConfigAzure
(
kubeflowClusterConfigObjectAzure
.
operator
,
kubeflowClusterConfigObjectAzure
.
operator
,
kubeflowClusterConfigObjectAzure
.
apiVersion
,
kubeflowClusterConfigObjectAzure
.
apiVersion
,
...
@@ -98,12 +107,13 @@ export class KubeflowClusterConfigAzure extends KubernetesClusterConfigAzure{
...
@@ -98,12 +107,13 @@ export class KubeflowClusterConfigAzure extends KubernetesClusterConfigAzure{
export
class
KubeflowClusterConfigFactory
{
export
class
KubeflowClusterConfigFactory
{
// tslint:disable-next-line:function-name
public
static
generateKubeflowClusterConfig
(
jsonObject
:
object
):
KubeflowClusterConfig
{
public
static
generateKubeflowClusterConfig
(
jsonObject
:
object
):
KubeflowClusterConfig
{
let
s
torageConfig
=
<
StorageConfig
>
jsonObject
;
const
storageConfig
:
S
torageConfig
=
<
StorageConfig
>
jsonObject
;
if
(
!
storageConfig
)
{
if
(
storageConfig
===
undefined
)
{
throw
new
Error
(
"
Invalid json object as a StorageConfig instance
"
);
throw
new
Error
(
'
Invalid json object as a StorageConfig instance
'
);
}
}
if
(
storageConfig
.
storage
&&
storageConfig
.
storage
===
'
azureStorage
'
)
{
if
(
storageConfig
.
storage
!==
undefined
&&
storageConfig
.
storage
===
'
azureStorage
'
)
{
return
KubeflowClusterConfigAzure
.
getInstance
(
jsonObject
);
return
KubeflowClusterConfigAzure
.
getInstance
(
jsonObject
);
}
else
if
(
storageConfig
.
storage
===
undefined
||
storageConfig
.
storage
===
'
nfs
'
)
{
}
else
if
(
storageConfig
.
storage
===
undefined
||
storageConfig
.
storage
===
'
nfs
'
)
{
return
KubeflowClusterConfigNFS
.
getInstance
(
jsonObject
);
return
KubeflowClusterConfigNFS
.
getInstance
(
jsonObject
);
...
@@ -122,10 +132,10 @@ export class KubeflowTrialConfig extends KubernetesTrialConfig {
...
@@ -122,10 +132,10 @@ export class KubeflowTrialConfig extends KubernetesTrialConfig {
}
}
}
}
export
class
KubeflowTrialConfigTemplate
extends
KubernetesTrialConfigTemplate
{
export
class
KubeflowTrialConfigTemplate
extends
KubernetesTrialConfigTemplate
{
public
readonly
replicas
:
number
;
public
readonly
replicas
:
number
;
/**
 * @param replicas replica count stored on this template
 * @param command forwarded to the base trial config template
 * @param gpuNum forwarded to the base trial config template
 * @param cpuNum forwarded to the base trial config template
 * @param memoryMB forwarded to the base trial config template
 * @param image forwarded to the base trial config template
 */
constructor(replicas: number, command: string, gpuNum: number,
            cpuNum: number, memoryMB: number, image: string) {
    super(command, gpuNum, cpuNum, memoryMB, image);
    this.replicas = replicas;
}
...
@@ -163,22 +173,25 @@ export class KubeflowTrialConfigPytorch extends KubeflowTrialConfig {
...
@@ -163,22 +173,25 @@ export class KubeflowTrialConfigPytorch extends KubeflowTrialConfig {
/**
 * Builds the operator-specific Kubeflow trial config from a parsed JSON object.
 */
export class KubeflowTrialConfigFactory {
    /**
     * Create a trial config instance matching the given operator.
     *
     * @param jsonObject parsed trial configuration; its fields are read without validation
     * @param operator Kubeflow operator that selects the concrete config class
     * @returns a KubeflowTrialConfigTensorflow for 'tf-operator',
     *          a KubeflowTrialConfigPytorch for 'pytorch-operator'
     * @throws Error when the operator is neither of the supported values
     */
    // tslint:disable-next-line:function-name
    public static generateKubeflowTrialConfig(jsonObject: object, operator: KubeflowOperator): KubeflowTrialConfig {
        if (operator === 'tf-operator') {
            const tensorflowConfig: KubeflowTrialConfigTensorflow = <KubeflowTrialConfigTensorflow>jsonObject;

            return new KubeflowTrialConfigTensorflow(
                tensorflowConfig.codeDir,
                tensorflowConfig.worker,
                tensorflowConfig.ps
            );
        }

        if (operator === 'pytorch-operator') {
            const pytorchConfig: KubeflowTrialConfigPytorch = <KubeflowTrialConfigPytorch>jsonObject;

            return new KubeflowTrialConfigPytorch(
                pytorchConfig.codeDir,
                pytorchConfig.master,
                pytorchConfig.worker
            );
        }

        // Serialize the object explicitly: interpolating it directly prints "[object Object]".
        throw new Error(`Invalid json object ${JSON.stringify(jsonObject)} for operator ${String(operator)}`);
    }
}
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowJobInfoCollector.ts
View file @
ba8dccd6
...
@@ -19,65 +19,68 @@
...
@@ -19,65 +19,68 @@
'
use strict
'
;
'
use strict
'
;
import
{
KubernetesTrialJobDetail
}
from
'
../kubernetesData
'
;
import
{
KubernetesCRDClient
}
from
'
../kubernetesApiClient
'
;
import
{
KubernetesCRDClient
}
from
'
../kubernetesApiClient
'
;
import
{
KubernetesTrialJobDetail
}
from
'
../kubernetesData
'
;
import
{
KubernetesJobInfoCollector
}
from
'
../kubernetesJobInfoCollector
'
;
import
{
KubernetesJobInfoCollector
}
from
'
../kubernetesJobInfoCollector
'
;
import
{
KubeflowJobStatus
}
from
'
./kubeflowConfig
'
;
import
{
KubeflowJobStatus
}
from
'
./kubeflowConfig
'
;
/**
 * Collector Kubeflow jobs info from Kubernetes cluster, and update kubeflow job status locally
 */
export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector {
    constructor(jobMap: Map<string, KubernetesTrialJobDetail>) {
        super(jobMap);
    }

    /**
     * Query one Kubeflow job and copy its latest condition onto the local trial job record.
     *
     * Trial jobs whose status is not in `statusesNeedToCheck` are left untouched.
     * A failed query is logged and ignored. A job that reports no conditions is left unchanged.
     *
     * @param kubernetesCRDClient client used to fetch the job; the returned promise rejects
     *        with the string 'kubernetesCRDClient is undefined' when it is missing
     * @param kubernetesTrialJob trial job record whose status/startTime/endTime are updated in place
     */
    protected async retrieveSingleTrialJobInfo(kubernetesCRDClient: KubernetesCRDClient | undefined,
                                               kubernetesTrialJob: KubernetesTrialJobDetail): Promise<void> {
        if (!this.statusesNeedToCheck.includes(kubernetesTrialJob.status)) {
            return;
        }

        if (kubernetesCRDClient === undefined) {
            // Rejection value kept as a plain string so existing callers observe the same reason.
            return Promise.reject('kubernetesCRDClient is undefined');
        }

        // tslint:disable:no-any no-unsafe-any
        let kubernetesJobInfo: any;
        try {
            kubernetesJobInfo = await kubernetesCRDClient.getKubernetesJob(kubernetesTrialJob.kubernetesJobName);
        } catch (error) {
            // Notice: it may not be a 'real' error since cancelling a trial job can also cause getKubernetesJob to fail.
            this.log.error(`Get job ${kubernetesTrialJob.kubernetesJobName} info failed, error is ${error}`);

            // This is not treated as an error status
            return;
        }

        const jobStatus: any = (kubernetesJobInfo === undefined || kubernetesJobInfo === null)
            ? undefined
            : kubernetesJobInfo.status;
        const conditions: any = (jobStatus === undefined || jobStatus === null) ? undefined : jobStatus.conditions;

        // An empty (or non-array) condition list has no "latest" entry to read; indexing it
        // would yield undefined and crash on `.type`.
        if (!Array.isArray(conditions) || conditions.length === 0) {
            return;
        }

        const latestCondition: any = conditions[conditions.length - 1];
        const tfJobType: KubeflowJobStatus = <KubeflowJobStatus>latestCondition.type;
        switch (tfJobType) {
            case 'Created':
                kubernetesTrialJob.status = 'WAITING';
                kubernetesTrialJob.startTime = Date.parse(<string>latestCondition.lastUpdateTime);
                break;
            case 'Running':
                kubernetesTrialJob.status = 'RUNNING';
                // Keep the first recorded start time; only fill it in when none was set yet.
                if (kubernetesTrialJob.startTime === undefined) {
                    kubernetesTrialJob.startTime = Date.parse(<string>latestCondition.lastUpdateTime);
                }
                break;
            case 'Failed':
                kubernetesTrialJob.status = 'FAILED';
                kubernetesTrialJob.endTime = Date.parse(<string>latestCondition.lastUpdateTime);
                break;
            case 'Succeeded':
                kubernetesTrialJob.status = 'SUCCEEDED';
                kubernetesTrialJob.endTime = Date.parse(<string>latestCondition.lastUpdateTime);
                break;
            default:
        }
        // tslint:enable:no-any no-unsafe-any
    }
}
\ No newline at end of file
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowJobRestServer.ts
View file @
ba8dccd6
...
@@ -20,19 +20,19 @@
...
@@ -20,19 +20,19 @@
'
use strict
'
;
'
use strict
'
;
import
*
as
component
from
'
../../../common/component
'
;
import
*
as
component
from
'
../../../common/component
'
;
import
{
KubernetesJobRestServer
}
from
'
../kubernetesJobRestServer
'
;
import
{
KubeflowTrainingService
}
from
'
./kubeflowTrainingService
'
;
import
{
KubeflowTrainingService
}
from
'
./kubeflowTrainingService
'
;
import
{
KubernetesJobRestServer
}
from
'
../kubernetesJobRestServer
'
/**
 * Kubeflow Training service Rest server, provides rest API to support kubeflow job metrics update
 */
@component.Singleton
export class KubeflowJobRestServer extends KubernetesJobRestServer {
    /**
     * Passes the KubeflowTrainingService instance obtained from the component
     * registry to the base Kubernetes job rest server.
     */
    constructor() {
        super(component.get(KubeflowTrainingService));
    }
}
\ No newline at end of file
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts
View file @
ba8dccd6
...
@@ -17,35 +17,34 @@
...
@@ -17,35 +17,34 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
*/
'
use strict
'
'
use strict
'
;
import
*
as
assert
from
'
assert
'
;
import
*
as
assert
from
'
assert
'
;
import
*
as
component
from
'
../../../common/component
'
;
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
path
from
'
path
'
;
import
*
as
path
from
'
path
'
;
import
*
as
component
from
'
../../../common/component
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../../common/containerJobData
'
;
import
{
getExperimentId
}
from
'
../../../common/experimentStartupInfo
'
;
import
{
getExperimentId
}
from
'
../../../common/experimentStartupInfo
'
;
import
{
TrialConfigMetadataKey
}
from
'
../../common/trialConfigMetadataKey
'
;
import
{
import
{
JobApplicationForm
,
TrialJobApplicationForm
,
JobApplicationForm
,
NNIManagerIpConfig
,
TrialJobApplicationForm
,
TrialJobDetail
TrialJobDetail
,
NNIManagerIpConfig
}
from
'
../../../common/trainingService
'
;
}
from
'
../../../common/trainingService
'
;
import
{
delay
,
generateParamFileName
,
getExperimentRootDir
,
uniqueString
}
from
'
../../../common/utils
'
;
import
{
delay
,
generateParamFileName
,
getExperimentRootDir
,
uniqueString
}
from
'
../../../common/utils
'
;
import
{
KubeflowClusterConfigNFS
,
KubeflowClusterConfigAzure
,
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../../common/containerJobData
'
;
KubeflowTrialConfigPytorch
,
KubeflowTrialConfigTensorflow
,
KubeflowClusterConfigFactory
,
KubeflowTrialConfigFactory
,
import
{
TrialConfigMetadataKey
}
from
'
../../common/trialConfigMetadataKey
'
;
KubeflowTrialConfig
,
KubeflowClusterConfig
}
from
'
./kubeflowConfig
'
;
import
{
NFSConfig
}
from
'
../kubernetesConfig
'
import
{
KubernetesTrialJobDetail
}
from
'
../kubernetesData
'
;
import
{
KubeflowJobRestServer
}
from
'
./kubeflowJobRestServer
'
;
import
{
validateCodeDir
}
from
'
../../common/util
'
;
import
{
validateCodeDir
}
from
'
../../common/util
'
;
import
{
AzureStorageClientUtility
}
from
'
../azureStorageClientUtils
'
;
import
{
AzureStorageClientUtility
}
from
'
../azureStorageClientUtils
'
;
import
{
NFSConfig
}
from
'
../kubernetesConfig
'
;
import
{
KubernetesTrialJobDetail
}
from
'
../kubernetesData
'
;
import
{
KubernetesTrainingService
}
from
'
../kubernetesTrainingService
'
;
import
{
KubeflowOperatorClient
}
from
'
./kubeflowApiClient
'
;
import
{
KubeflowOperatorClient
}
from
'
./kubeflowApiClient
'
;
import
{
KubernetesTrainingService
}
from
'
../kubernetesTrainingService
'
import
{
KubeflowClusterConfig
,
KubeflowClusterConfigAzure
,
KubeflowClusterConfigFactory
,
KubeflowClusterConfigNFS
,
KubeflowTrialConfig
,
KubeflowTrialConfigFactory
,
KubeflowTrialConfigPytorch
,
KubeflowTrialConfigTensorflow
}
from
'
./kubeflowConfig
'
;
import
{
KubeflowJobInfoCollector
}
from
'
./kubeflowJobInfoCollector
'
;
import
{
KubeflowJobInfoCollector
}
from
'
./kubeflowJobInfoCollector
'
;
import
{
KubeflowJobRestServer
}
from
'
./kubeflowJobRestServer
'
;
// tslint:disable: no-unsafe-any no-any
/**
/**
* Training Service implementation for Kubeflow
* Training Service implementation for Kubeflow
* Refer https://github.com/kubeflow/kubeflow for more info about Kubeflow
* Refer https://github.com/kubeflow/kubeflow for more info about Kubeflow
...
@@ -54,12 +53,12 @@ import { KubeflowJobInfoCollector } from './kubeflowJobInfoCollector';
...
@@ -54,12 +53,12 @@ import { KubeflowJobInfoCollector } from './kubeflowJobInfoCollector';
class
KubeflowTrainingService
extends
KubernetesTrainingService
implements
KubernetesTrainingService
{
class
KubeflowTrainingService
extends
KubernetesTrainingService
implements
KubernetesTrainingService
{
private
kubeflowClusterConfig
?:
KubeflowClusterConfig
;
private
kubeflowClusterConfig
?:
KubeflowClusterConfig
;
private
kubeflowTrialConfig
?:
KubeflowTrialConfig
;
private
kubeflowTrialConfig
?:
KubeflowTrialConfig
;
private
kubeflowJobInfoCollector
:
KubeflowJobInfoCollector
;
private
readonly
kubeflowJobInfoCollector
:
KubeflowJobInfoCollector
;
/**
 * Creates the job info collector over this service's trial job map, records the
 * experiment id, and resets the trial sequence counter to -1.
 */
constructor() {
    super();
    this.kubeflowJobInfoCollector = new KubeflowJobInfoCollector(this.trialJobsMap);
    this.experimentId = getExperimentId();
    this.nextTrialSequenceId = -1;
    this.log.info('Construct Kubeflow training service.');
}
...
@@ -67,17 +66,17 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -67,17 +66,17 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
public
async
run
():
Promise
<
void
>
{
public
async
run
():
Promise
<
void
>
{
this
.
log
.
info
(
'
Run Kubeflow training service.
'
);
this
.
log
.
info
(
'
Run Kubeflow training service.
'
);
this
.
kubernetesJobRestServer
=
component
.
get
(
KubeflowJobRestServer
);
this
.
kubernetesJobRestServer
=
component
.
get
(
KubeflowJobRestServer
);
if
(
!
this
.
kubernetesJobRestServer
)
{
if
(
this
.
kubernetesJobRestServer
===
undefined
)
{
throw
new
Error
(
'
kubernetesJobRestServer not initialized!
'
);
throw
new
Error
(
'
kubernetesJobRestServer not initialized!
'
);
}
}
await
this
.
kubernetesJobRestServer
.
start
();
await
this
.
kubernetesJobRestServer
.
start
();
this
.
kubernetesJobRestServer
.
setEnableVersionCheck
=
this
.
versionCheck
;
this
.
kubernetesJobRestServer
.
setEnableVersionCheck
=
this
.
versionCheck
;
this
.
log
.
info
(
`Kubeflow Training service rest server listening on:
${
this
.
kubernetesJobRestServer
.
endPoint
}
`
);
this
.
log
.
info
(
`Kubeflow Training service rest server listening on:
${
this
.
kubernetesJobRestServer
.
endPoint
}
`
);
while
(
!
this
.
stopping
)
{
while
(
!
this
.
stopping
)
{
// collect metrics for Kubeflow jobs by interacting with Kubernetes API server
// collect metrics for Kubeflow jobs by interacting with Kubernetes API server
await
delay
(
3000
);
await
delay
(
3000
);
await
this
.
kubeflowJobInfoCollector
.
retrieveTrialStatus
(
this
.
kubernetesCRDClient
);
await
this
.
kubeflowJobInfoCollector
.
retrieveTrialStatus
(
this
.
kubernetesCRDClient
);
if
(
this
.
kubernetesJobRestServer
.
getErrorMessage
)
{
if
(
this
.
kubernetesJobRestServer
.
getErrorMessage
!==
undefined
)
{
throw
new
Error
(
this
.
kubernetesJobRestServer
.
getErrorMessage
);
throw
new
Error
(
this
.
kubernetesJobRestServer
.
getErrorMessage
);
this
.
stopping
=
true
;
this
.
stopping
=
true
;
}
}
...
@@ -86,17 +85,17 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -86,17 +85,17 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
}
}
public
async
submitTrialJob
(
form
:
JobApplicationForm
):
Promise
<
TrialJobDetail
>
{
public
async
submitTrialJob
(
form
:
JobApplicationForm
):
Promise
<
TrialJobDetail
>
{
if
(
!
this
.
kubernetesCRDClient
)
{
if
(
this
.
kubernetesCRDClient
===
undefined
)
{
throw
new
Error
(
'
Kubeflow job operator client is undefined
'
);
throw
new
Error
(
'
Kubeflow job operator client is undefined
'
);
}
}
if
(
!
this
.
kubernetesRestServerPort
)
{
if
(
this
.
kubernetesRestServerPort
===
undefined
)
{
const
restServer
:
KubeflowJobRestServer
=
component
.
get
(
KubeflowJobRestServer
);
const
restServer
:
KubeflowJobRestServer
=
component
.
get
(
KubeflowJobRestServer
);
this
.
kubernetesRestServerPort
=
restServer
.
clusterRestServerPort
;
this
.
kubernetesRestServerPort
=
restServer
.
clusterRestServerPort
;
}
}
const
trialJobId
:
string
=
uniqueString
(
5
);
const
trialJobId
:
string
=
uniqueString
(
5
);
const
trialWorkingFolder
:
string
=
path
.
join
(
this
.
CONTAINER_MOUNT_PATH
,
'
nni
'
,
getExperimentId
(),
trialJobId
);
const
trialWorkingFolder
:
string
=
path
.
join
(
this
.
CONTAINER_MOUNT_PATH
,
'
nni
'
,
getExperimentId
(),
trialJobId
);
const
kubeflowJobName
=
`nni-exp-
${
this
.
experimentId
}
-trial-
${
trialJobId
}
`
.
toLowerCase
();
const
kubeflowJobName
:
string
=
`nni-exp-
${
this
.
experimentId
}
-trial-
${
trialJobId
}
`
.
toLowerCase
();
const
curTrialSequenceId
:
number
=
this
.
generateSequenceId
();
const
curTrialSequenceId
:
number
=
this
.
generateSequenceId
();
const
trialLocalTempFolder
:
string
=
path
.
join
(
getExperimentRootDir
(),
'
trials-local
'
,
trialJobId
);
const
trialLocalTempFolder
:
string
=
path
.
join
(
getExperimentRootDir
(),
'
trials-local
'
,
trialJobId
);
//prepare the runscript
//prepare the runscript
...
@@ -113,226 +112,239 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -113,226 +112,239 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
curTrialSequenceId
,
curTrialSequenceId
,
trialJobOutputUrl
trialJobOutputUrl
);
);
// Generate kubeflow job resource config object
// Generate kubeflow job resource config object
const
kubeflowJobConfig
:
any
=
await
this
.
prepareKubeflowConfig
(
trialJobId
,
trialWorkingFolder
,
kubeflowJobName
);
const
kubeflowJobConfig
:
any
=
await
this
.
prepareKubeflowConfig
(
trialJobId
,
trialWorkingFolder
,
kubeflowJobName
);
// Create kubeflow job based on generated kubeflow job resource config
// Create kubeflow job based on generated kubeflow job resource config
await
this
.
kubernetesCRDClient
.
createKubernetesJob
(
kubeflowJobConfig
);
await
this
.
kubernetesCRDClient
.
createKubernetesJob
(
kubeflowJobConfig
);
// Set trial job detail until create Kubeflow job successfully
// Set trial job detail until create Kubeflow job successfully
this
.
trialJobsMap
.
set
(
trialJobId
,
trialJobDetail
);
this
.
trialJobsMap
.
set
(
trialJobId
,
trialJobDetail
);
return
Promise
.
resolve
(
trialJobDetail
);
return
Promise
.
resolve
(
trialJobDetail
);
}
}
// tslint:disable:no-redundant-jsdoc
/**
 * Apply one piece of cluster/trial metadata to this training service.
 *
 * Keys handled: NNI_MANAGER_IP, KUBEFLOW_CLUSTER_CONFIG, TRIAL_CONFIG, VERSION_CHECK and
 * LOG_COLLECTION. Any other key is ignored.
 *
 * @param key metadata key, compared against TrialConfigMetadataKey values
 * @param value metadata value; parsed as JSON for the IP, cluster and trial config keys
 * @throws Error (as a rejected promise) when TRIAL_CONFIG arrives before the cluster config,
 *         or when code directory validation fails
 */
public async setClusterMetadata(key: string, value: string): Promise<void> {
    switch (key) {
        case TrialConfigMetadataKey.NNI_MANAGER_IP:
            this.nniManagerIpConfig = <NNIManagerIpConfig>JSON.parse(value);
            break;

        // Braces give each case its own scope so its `const` declarations do not leak into sibling cases.
        case TrialConfigMetadataKey.KUBEFLOW_CLUSTER_CONFIG: {
            const kubeflowClusterJsonObject: object = JSON.parse(value);
            this.kubeflowClusterConfig = KubeflowClusterConfigFactory.generateKubeflowClusterConfig(kubeflowClusterJsonObject);

            if (this.kubeflowClusterConfig.storageType === 'azureStorage') {
                const azureKubeflowClusterConfig: KubeflowClusterConfigAzure =
                    <KubeflowClusterConfigAzure>this.kubeflowClusterConfig;
                this.azureStorageAccountName = azureKubeflowClusterConfig.azureStorage.accountName;
                this.azureStorageShare = azureKubeflowClusterConfig.azureStorage.azureShare;
                await this.createAzureStorage(
                    azureKubeflowClusterConfig.keyVault.vaultName,
                    azureKubeflowClusterConfig.keyVault.name,
                    azureKubeflowClusterConfig.azureStorage.accountName,
                    azureKubeflowClusterConfig.azureStorage.azureShare
                );
            } else if (this.kubeflowClusterConfig.storageType === 'nfs') {
                const nfsKubeflowClusterConfig: KubeflowClusterConfigNFS =
                    <KubeflowClusterConfigNFS>this.kubeflowClusterConfig;
                await this.createNFSStorage(
                    nfsKubeflowClusterConfig.nfs.server,
                    nfsKubeflowClusterConfig.nfs.path
                );
            }
            this.kubernetesCRDClient = KubeflowOperatorClient.generateOperatorClient(
                this.kubeflowClusterConfig.operator,
                this.kubeflowClusterConfig.apiVersion
            );
            break;
        }

        case TrialConfigMetadataKey.TRIAL_CONFIG: {
            // The trial config is built from the cluster config's operator, so the cluster config must come first.
            if (this.kubeflowClusterConfig === undefined) {
                this.log.error('kubeflow cluster config is not initialized');
                throw new Error('kubeflow cluster config is not initialized');
            }

            const kubeflowTrialJsonObject: object = JSON.parse(value);
            this.kubeflowTrialConfig = KubeflowTrialConfigFactory.generateKubeflowTrialConfig(
                kubeflowTrialJsonObject,
                this.kubeflowClusterConfig.operator
            );

            // Validate to make sure codeDir doesn't have too many files
            try {
                await validateCodeDir(this.kubeflowTrialConfig.codeDir);
            } catch (error) {
                this.log.error(error);
                // Re-throw the original Error so its stack and message survive; wrap only non-Error values.
                throw error instanceof Error ? error : new Error(String(error));
            }
            break;
        }

        case TrialConfigMetadataKey.VERSION_CHECK:
            this.versionCheck = (value === 'true' || value === 'True');
            break;

        case TrialConfigMetadataKey.LOG_COLLECTION:
            this.logCollection = value;
            break;

        default:
    }
}
/**
 * Upload code files to NFS or Azure storage.
 *
 * For 'azureStorage' the local folder is uploaded through AzureStorageClientUtility.
 * For 'nfs' (or when no storage kind is set) the files are copied into the locally
 * mounted NFS folder with shell `mkdir`/`cp`.
 *
 * @param trialJobId id of the trial whose files are uploaded
 * @param trialLocalTempFolder local folder holding the trial's files
 * @returns the trial job output URL (https://... for Azure, nfs://... for NFS)
 * @throws Error when the cluster config or, for Azure, the storage client is not initialized
 */
private async uploadCodeFiles(trialJobId: string, trialLocalTempFolder: string): Promise<string> {
    if (this.kubeflowClusterConfig === undefined) {
        throw new Error('Kubeflow Cluster config is not initialized');
    }

    const storageKind: KubernetesStorageKind | undefined = this.kubeflowClusterConfig.storage;
    assert(storageKind === undefined || storageKind === 'azureStorage' || storageKind === 'nfs');

    const experimentId: string = getExperimentId();
    let trialJobOutputUrl: string = '';

    if (storageKind === 'azureStorage') {
        if (this.azureStorageClient === undefined) {
            throw new Error('azureStorageClient is not initialized');
        }
        try {
            // Upload local files to azure storage
            await AzureStorageClientUtility.uploadDirectory(
                this.azureStorageClient,
                `nni/${experimentId}/${trialJobId}`,
                this.azureStorageShare,
                `${trialLocalTempFolder}`
            );

            // URLs always use '/', so join with the POSIX flavour regardless of the host OS.
            const outputPath: string = path.posix.join('nni', experimentId, trialJobId, 'output');
            trialJobOutputUrl =
                `https://${this.azureStorageAccountName}.file.core.windows.net/${this.azureStorageShare}/${outputPath}`;
        } catch (error) {
            this.log.error(error);
            throw error;
        }
    } else if (storageKind === 'nfs' || storageKind === undefined) {
        const nfsKubeflowClusterConfig: KubeflowClusterConfigNFS = <KubeflowClusterConfigNFS>this.kubeflowClusterConfig;
        const nfsTrialFolder: string = `${this.trialLocalNFSTempFolder}/nni/${experimentId}/${trialJobId}`;

        // Create work dir for current trial in NFS directory
        await cpp.exec(`mkdir -p ${nfsTrialFolder}`);
        // Copy code files from local dir to NFS mounted dir
        await cpp.exec(`cp -r ${trialLocalTempFolder}/* ${nfsTrialFolder}/.`);

        const nfsConfig: NFSConfig = nfsKubeflowClusterConfig.nfs;
        const outputPath: string = path.posix.join(nfsConfig.path, 'nni', experimentId, trialJobId, 'output');
        trialJobOutputUrl = `nfs://${nfsConfig.server}:${outputPath}`;
    }

    return trialJobOutputUrl;
}
private
async
prepareRunScript
(
trialLocalTempFolder
:
string
,
trialJobId
:
string
,
trialWorkingFolder
:
string
,
curTrialSequenceId
:
number
,
form
:
JobApplicationForm
):
Promise
<
void
>
{
private
async
prepareRunScript
(
trialLocalTempFolder
:
string
,
trialJobId
:
string
,
trialWorkingFolder
:
string
,
curTrialSequenceId
:
number
,
if
(
!
this
.
kubeflowClusterConfig
)
{
form
:
JobApplicationForm
):
Promise
<
void
>
{
if
(
this
.
kubeflowClusterConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
}
}
// initialize kubeflow trial config to specific type
// initialize kubeflow trial config to specific type
let
kubeflowTrialConfig
;
let
kubeflowTrialConfig
:
any
;
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
kubeflowTrialConfig
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
kubeflowTrialConfig
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
}
else
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
pytorch-operator
'
){
}
else
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
pytorch-operator
'
)
{
kubeflowTrialConfig
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
kubeflowTrialConfig
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
}
else
{
}
else
{
throw
Error
(
`operator
${
this
.
kubeflowClusterConfig
.
operator
}
is invalid`
)
throw
Error
(
`operator
${
this
.
kubeflowClusterConfig
.
operator
}
is invalid`
)
;
}
}
//create tmp trial working folder locally.
await
cpp
.
exec
(
`mkdir -p
${
path
.
dirname
(
trialLocalTempFolder
)}
`
);
await
cpp
.
exec
(
`cp -r
${
kubeflowTrialConfig
.
codeDir
}
${
trialLocalTempFolder
}
`
);
const
runScriptContent
:
string
=
CONTAINER_INSTALL_NNI_SHELL_FORMAT
;
// Write NNI installation file to local tmp files
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
install_nni.sh
'
),
runScriptContent
,
{
encoding
:
'
utf8
'
});
// Create tmp trial working folder locally.
await
cpp
.
exec
(
`mkdir -p
${
trialLocalTempFolder
}
`
);
// Write worker file content run_worker.sh to local tmp folders
if
(
kubeflowTrialConfig
.
worker
)
{
const
workerRunScriptContent
:
string
=
await
this
.
generateRunScript
(
'
kubeflow
'
,
trialJobId
,
trialWorkingFolder
,
kubeflowTrialConfig
.
worker
.
command
,
curTrialSequenceId
.
toString
(),
'
worker
'
,
kubeflowTrialConfig
.
worker
.
gpuNum
);
//create tmp trial working folder locally.
await
cpp
.
exec
(
`mkdir -p
${
path
.
dirname
(
trialLocalTempFolder
)}
`
);
await
cpp
.
exec
(
`cp -r
${
kubeflowTrialConfig
.
codeDir
}
${
trialLocalTempFolder
}
`
);
const
runScriptContent
:
string
=
CONTAINER_INSTALL_NNI_SHELL_FORMAT
;
// Write NNI installation file to local tmp files
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
install_nni.sh
'
),
runScriptContent
,
{
encoding
:
'
utf8
'
});
// Create tmp trial working folder locally.
await
cpp
.
exec
(
`mkdir -p
${
trialLocalTempFolder
}
`
);
// Write worker file content run_worker.sh to local tmp folders
if
(
kubeflowTrialConfig
.
worker
!==
undefined
)
{
const
workerRunScriptContent
:
string
=
await
this
.
generateRunScript
(
'
kubeflow
'
,
trialJobId
,
trialWorkingFolder
,
kubeflowTrialConfig
.
worker
.
command
,
curTrialSequenceId
.
toString
(),
'
worker
'
,
kubeflowTrialConfig
.
worker
.
gpuNum
);
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
run_worker.sh
'
),
workerRunScriptContent
,
{
encoding
:
'
utf8
'
});
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
run_worker.sh
'
),
workerRunScriptContent
,
{
encoding
:
'
utf8
'
});
}
}
// Write parameter server file content run_ps.sh to local tmp folders
// Write parameter server file content run_ps.sh to local tmp folders
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
let
tensorflowTrialConfig
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
const
tensorflowTrialConfig
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
if
(
tensorflowTrialConfig
.
ps
){
if
(
tensorflowTrialConfig
.
ps
!==
undefined
)
{
const
psRunScriptContent
:
string
=
await
this
.
generateRunScript
(
'
kubeflow
'
,
trialJobId
,
trialWorkingFolder
,
const
psRunScriptContent
:
string
=
await
this
.
generateRunScript
(
'
kubeflow
'
,
trialJobId
,
trialWorkingFolder
,
tensorflowTrialConfig
.
ps
.
command
,
curTrialSequenceId
.
toString
(),
'
ps
'
,
tensorflowTrialConfig
.
ps
.
gpuNum
);
tensorflowTrialConfig
.
ps
.
command
,
curTrialSequenceId
.
toString
(),
'
ps
'
,
tensorflowTrialConfig
.
ps
.
gpuNum
);
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
run_ps.sh
'
),
psRunScriptContent
,
{
encoding
:
'
utf8
'
});
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
run_ps.sh
'
),
psRunScriptContent
,
{
encoding
:
'
utf8
'
});
}
}
}
}
else
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
pytorch-operator
'
)
{
else
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
pytorch-operator
'
)
{
const
pytorchTrialConfig
:
KubeflowTrialConfigPytorch
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
let
pytorchTrialConfig
:
KubeflowTrialConfigPytorch
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
if
(
pytorchTrialConfig
.
master
!==
undefined
)
{
if
(
pytorchTrialConfig
.
master
){
const
masterRunScriptContent
:
string
=
await
this
.
generateRunScript
(
'
kubeflow
'
,
trialJobId
,
trialWorkingFolder
,
const
masterRunScriptContent
:
string
=
await
this
.
generateRunScript
(
'
kubeflow
'
,
trialJobId
,
trialWorkingFolder
,
pytorchTrialConfig
.
master
.
command
,
pytorchTrialConfig
.
master
.
command
,
curTrialSequenceId
.
toString
(),
'
master
'
,
pytorchTrialConfig
.
master
.
gpuNum
);
curTrialSequenceId
.
toString
(),
'
master
'
,
pytorchTrialConfig
.
master
.
gpuNum
);
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
run_master.sh
'
),
masterRunScriptContent
,
{
encoding
:
'
utf8
'
});
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
run_master.sh
'
),
masterRunScriptContent
,
{
encoding
:
'
utf8
'
});
}
}
}
}
// Write file content ( parameter.cfg ) to local tmp folders
// Write file content ( parameter.cfg ) to local tmp folders
const
trialForm
:
TrialJobApplicationForm
=
(
<
TrialJobApplicationForm
>
form
)
const
trialForm
:
TrialJobApplicationForm
=
(
<
TrialJobApplicationForm
>
form
)
;
if
(
trialForm
&&
trialForm
.
hyperParameters
)
{
if
(
trialForm
!==
undefined
&&
trialForm
.
hyperParameters
!==
undefined
)
{
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
generateParamFileName
(
trialForm
.
hyperParameters
)),
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
generateParamFileName
(
trialForm
.
hyperParameters
)),
trialForm
.
hyperParameters
.
value
,
{
encoding
:
'
utf8
'
});
trialForm
.
hyperParameters
.
value
,
{
encoding
:
'
utf8
'
});
}
}
}
}
private
async
prepareKubeflowConfig
(
trialJobId
:
string
,
trialWorkingFolder
:
string
,
kubeflowJobName
:
string
):
Promise
<
any
>
{
private
async
prepareKubeflowConfig
(
trialJobId
:
string
,
trialWorkingFolder
:
string
,
kubeflowJobName
:
string
):
Promise
<
any
>
{
if
(
!
this
.
kubeflowClusterConfig
)
{
if
(
this
.
kubeflowClusterConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
}
}
if
(
!
this
.
kubeflowTrialConfig
)
{
if
(
this
.
kubeflowTrialConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow trial config is not initialized
'
);
throw
new
Error
(
'
Kubeflow trial config is not initialized
'
);
}
}
// initialize kubeflow trial config to specific type
// initialize kubeflow trial config to specific type
let
kubeflowTrialConfig
;
let
kubeflowTrialConfig
:
any
;
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
kubeflowTrialConfig
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
kubeflowTrialConfig
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
}
else
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
pytorch-operator
'
){
}
else
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
pytorch-operator
'
)
{
kubeflowTrialConfig
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
kubeflowTrialConfig
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
}
else
{
}
else
{
throw
Error
(
`operator
${
this
.
kubeflowClusterConfig
.
operator
}
is invalid`
)
throw
Error
(
`operator
${
this
.
kubeflowClusterConfig
.
operator
}
is invalid`
)
;
}
}
const
workerPodResources
:
any
=
{};
const
workerPodResources
:
any
=
{};
if
(
kubeflowTrialConfig
.
worker
)
{
if
(
kubeflowTrialConfig
.
worker
!==
undefined
)
{
workerPodResources
.
requests
=
this
.
generatePodResource
(
kubeflowTrialConfig
.
worker
.
memoryMB
,
kubeflowTrialConfig
.
worker
.
cpuNum
,
workerPodResources
.
requests
=
this
.
generatePodResource
(
kubeflowTrialConfig
.
worker
.
memoryMB
,
kubeflowTrialConfig
.
worker
.
cpuNum
,
kubeflowTrialConfig
.
worker
.
gpuNum
)
kubeflowTrialConfig
.
worker
.
gpuNum
)
;
}
}
workerPodResources
.
limits
=
Object
.
assign
({},
workerPodResources
.
requests
)
;
workerPodResources
.
limits
=
{...
workerPodResources
.
requests
}
;
le
t
nonWorkerResources
:
any
=
{};
cons
t
nonWorkerResources
:
any
=
{};
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
le
t
tensorflowTrialConfig
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
cons
t
tensorflowTrialConfig
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
if
(
tensorflowTrialConfig
.
ps
)
{
if
(
tensorflowTrialConfig
.
ps
!==
undefined
)
{
nonWorkerResources
.
requests
=
this
.
generatePodResource
(
tensorflowTrialConfig
.
ps
.
memoryMB
,
tensorflowTrialConfig
.
ps
.
cpuNum
,
nonWorkerResources
.
requests
=
this
.
generatePodResource
(
tensorflowTrialConfig
.
ps
.
memoryMB
,
tensorflowTrialConfig
.
ps
.
cpuNum
,
tensorflowTrialConfig
.
ps
.
gpuNum
)
tensorflowTrialConfig
.
ps
.
gpuNum
)
;
nonWorkerResources
.
limits
=
Object
.
assign
({},
nonWorkerResources
.
requests
);
nonWorkerResources
.
limits
=
{...
nonWorkerResources
.
requests
};
}
}
}
else
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
pytorch-operator
'
){
}
else
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
pytorch-operator
'
)
{
let
pyTorchTrialConfig
:
KubeflowTrialConfigPytorch
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
const
pyTorchTrialConfig
:
KubeflowTrialConfigPytorch
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
nonWorkerResources
.
requests
=
this
.
generatePodResource
(
pyTorchTrialConfig
.
master
.
memoryMB
,
pyTorchTrialConfig
.
master
.
cpuNum
,
nonWorkerResources
.
requests
=
this
.
generatePodResource
(
pyTorchTrialConfig
.
master
.
memoryMB
,
pyTorchTrialConfig
.
master
.
cpuNum
,
pyTorchTrialConfig
.
master
.
gpuNum
)
pyTorchTrialConfig
.
master
.
gpuNum
);
nonWorkerResources
.
limits
=
Object
.
assign
({},
nonWorkerResources
.
requests
);
nonWorkerResources
.
limits
=
{...
nonWorkerResources
.
requests
};
}
}
// Generate kubeflow job resource config object
// Generate kubeflow job resource config object
const
kubeflowJobConfig
:
any
=
this
.
generateKubeflowJobConfig
(
trialJobId
,
trialWorkingFolder
,
kubeflowJobName
,
workerPodResources
,
nonWorkerResources
);
const
kubeflowJobConfig
:
any
=
this
.
generateKubeflowJobConfig
(
trialJobId
,
trialWorkingFolder
,
kubeflowJobName
,
workerPodResources
,
nonWorkerResources
);
return
Promise
.
resolve
(
kubeflowJobConfig
);
return
Promise
.
resolve
(
kubeflowJobConfig
);
}
/**
 * Apply one metadata entry to this training service.
 *
 * Handled keys:
 *  - NNI_MANAGER_IP: parses `value` as JSON and stores it as the NNI manager IP config.
 *  - KUBEFLOW_CLUSTER_CONFIG: parses `value`, builds the cluster config through
 *    KubeflowClusterConfigFactory, prepares the configured storage (Azure storage or NFS)
 *    and creates the operator client for the configured operator / apiVersion.
 *  - TRIAL_CONFIG: parses `value`, builds the trial config for the cluster's operator and
 *    validates its codeDir. The cluster config must have been set beforehand.
 *  - VERSION_CHECK: enabled only when `value` is exactly 'true' or 'True'.
 *  - LOG_COLLECTION: stores `value` unchanged.
 * Any other key is ignored.
 *
 * @param key metadata key, compared against TrialConfigMetadataKey members
 * @param value metadata payload; a JSON string for the config keys
 * @throws Error (as a rejected promise) when TRIAL_CONFIG arrives before the cluster config,
 *         or when codeDir validation fails
 */
public async setClusterMetadata(key: string, value: string): Promise<void> {
    switch (key) {
        case TrialConfigMetadataKey.NNI_MANAGER_IP:
            this.nniManagerIpConfig = <NNIManagerIpConfig>JSON.parse(value);
            break;

        // Braces give each case its own lexical scope for the declarations below.
        case TrialConfigMetadataKey.KUBEFLOW_CLUSTER_CONFIG: {
            const kubeflowClusterJsonObject = JSON.parse(value);
            this.kubeflowClusterConfig = KubeflowClusterConfigFactory.generateKubeflowClusterConfig(kubeflowClusterJsonObject);
            if (this.kubeflowClusterConfig.storageType === 'azureStorage') {
                const azureKubeflowClusterConfig = <KubeflowClusterConfigAzure>this.kubeflowClusterConfig;
                this.azureStorageAccountName = azureKubeflowClusterConfig.azureStorage.accountName;
                this.azureStorageShare = azureKubeflowClusterConfig.azureStorage.azureShare;
                await this.createAzureStorage(
                    azureKubeflowClusterConfig.keyVault.vaultName,
                    azureKubeflowClusterConfig.keyVault.name,
                    azureKubeflowClusterConfig.azureStorage.accountName,
                    azureKubeflowClusterConfig.azureStorage.azureShare
                );
            } else if (this.kubeflowClusterConfig.storageType === 'nfs') {
                const nfsKubeflowClusterConfig = <KubeflowClusterConfigNFS>this.kubeflowClusterConfig;
                await this.createNFSStorage(
                    nfsKubeflowClusterConfig.nfs.server,
                    nfsKubeflowClusterConfig.nfs.path
                );
            }
            this.kubernetesCRDClient = KubeflowOperatorClient.generateOperatorClient(
                this.kubeflowClusterConfig.operator,
                this.kubeflowClusterConfig.apiVersion
            );
            break;
        }

        case TrialConfigMetadataKey.TRIAL_CONFIG: {
            // The trial config depends on the operator type declared in the cluster config.
            if (!this.kubeflowClusterConfig) {
                this.log.error('kubeflow cluster config is not initialized');
                throw new Error('kubeflow cluster config is not initialized');
            }

            const kubeflowTrialJsonObject = JSON.parse(value);
            this.kubeflowTrialConfig = KubeflowTrialConfigFactory.generateKubeflowTrialConfig(
                kubeflowTrialJsonObject,
                this.kubeflowClusterConfig.operator
            );

            // Validate to make sure codeDir doesn't have too many files
            try {
                await validateCodeDir(this.kubeflowTrialConfig.codeDir);
            } catch (error) {
                this.log.error(error);
                // Re-throw Error instances untouched so message and stack are preserved;
                // only non-Error rejections are wrapped.
                throw error instanceof Error ? error : new Error(String(error));
            }
            break;
        }

        case TrialConfigMetadataKey.VERSION_CHECK:
            this.versionCheck = (value === 'true' || value === 'True');
            break;

        case TrialConfigMetadataKey.LOG_COLLECTION:
            this.logCollection = value;
            break;

        default:
            break;
    }
}
}
/**
/**
...
@@ -343,49 +355,48 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -343,49 +355,48 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
* @param workerPodResources worker pod template
* @param workerPodResources worker pod template
* @param nonWorkerPodResources non-worker pod template, like ps or master
* @param nonWorkerPodResources non-worker pod template, like ps or master
*/
*/
private
generateKubeflowJobConfig
(
trialJobId
:
string
,
trialWorkingFolder
:
string
,
kubeflowJobName
:
string
,
workerPodResources
:
any
,
nonWorkerPodResources
?:
any
)
:
any
{
private
generateKubeflowJobConfig
(
trialJobId
:
string
,
trialWorkingFolder
:
string
,
kubeflowJobName
:
string
,
workerPodResources
:
any
,
if
(
!
this
.
kubeflowClusterConfig
)
{
nonWorkerPodResources
?:
any
)
:
any
{
if
(
this
.
kubeflowClusterConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
}
}
if
(
!
this
.
kubeflowTrialConfig
)
{
if
(
this
.
kubeflowTrialConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow trial config is not initialized
'
);
throw
new
Error
(
'
Kubeflow trial config is not initialized
'
);
}
}
if
(
!
this
.
kubernetesCRDClient
)
{
if
(
this
.
kubernetesCRDClient
===
undefined
)
{
throw
new
Error
(
'
Kubeflow operator client is not initialized
'
);
throw
new
Error
(
'
Kubeflow operator client is not initialized
'
);
}
}
const
replicaSpecsObj
:
any
=
{};
const
replicaSpecsObj
:
any
=
{};
let
replicaSpecsObjMap
=
new
Map
<
string
,
object
>
();
const
replicaSpecsObjMap
:
Map
<
string
,
object
>
=
new
Map
<
string
,
object
>
();
if
(
this
.
kubeflowTrialConfig
.
operatorType
===
'
tf-operator
'
)
{
if
(
this
.
kubeflowTrialConfig
.
operatorType
===
'
tf-operator
'
)
{
let
tensorflowTrialConfig
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
const
tensorflowTrialConfig
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
replicaSpecsObj
.
Worker
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
tensorflowTrialConfig
.
worker
.
replicas
,
replicaSpecsObj
.
Worker
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
tensorflowTrialConfig
.
worker
.
replicas
,
tensorflowTrialConfig
.
worker
.
image
,
'
run_worker.sh
'
,
workerPodResources
);
tensorflowTrialConfig
.
worker
.
image
,
'
run_worker.sh
'
,
workerPodResources
);
if
(
tensorflowTrialConfig
.
ps
!==
undefined
)
{
if
(
tensorflowTrialConfig
.
ps
){
replicaSpecsObj
.
Ps
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
tensorflowTrialConfig
.
ps
.
replicas
,
replicaSpecsObj
.
Ps
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
tensorflowTrialConfig
.
ps
.
replicas
,
tensorflowTrialConfig
.
ps
.
image
,
'
run_ps.sh
'
,
nonWorkerPodResources
);
tensorflowTrialConfig
.
ps
.
image
,
'
run_ps.sh
'
,
nonWorkerPodResources
);
}
}
replicaSpecsObjMap
.
set
(
this
.
kubernetesCRDClient
.
jobKind
,
{
'
tfReplicaSpecs
'
:
replicaSpecsObj
})
replicaSpecsObjMap
.
set
(
this
.
kubernetesCRDClient
.
jobKind
,
{
tfReplicaSpecs
:
replicaSpecsObj
});
}
}
else
if
(
this
.
kubeflowTrialConfig
.
operatorType
===
'
pytorch-operator
'
)
{
else
if
(
this
.
kubeflowTrialConfig
.
operatorType
===
'
pytorch-operator
'
)
{
const
pytorchTrialConfig
:
KubeflowTrialConfigPytorch
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
let
pytorchTrialConfig
:
KubeflowTrialConfigPytorch
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
if
(
pytorchTrialConfig
.
worker
!==
undefined
)
{
if
(
pytorchTrialConfig
.
worker
)
{
replicaSpecsObj
.
Worker
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
pytorchTrialConfig
.
worker
.
replicas
,
replicaSpecsObj
.
Worker
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
pytorchTrialConfig
.
worker
.
replicas
,
pytorchTrialConfig
.
worker
.
image
,
'
run_worker.sh
'
,
workerPodResources
);
pytorchTrialConfig
.
worker
.
image
,
'
run_worker.sh
'
,
workerPodResources
);
}
}
replicaSpecsObj
.
Master
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
pytorchTrialConfig
.
master
.
replicas
,
replicaSpecsObj
.
Master
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
pytorchTrialConfig
.
master
.
replicas
,
pytorchTrialConfig
.
master
.
image
,
'
run_master.sh
'
,
nonWorkerPodResources
);
pytorchTrialConfig
.
master
.
image
,
'
run_master.sh
'
,
nonWorkerPodResources
);
replicaSpecsObjMap
.
set
(
this
.
kubernetesCRDClient
.
jobKind
,
{
'
pytorchReplicaSpecs
'
:
replicaSpecsObj
})
replicaSpecsObjMap
.
set
(
this
.
kubernetesCRDClient
.
jobKind
,
{
pytorchReplicaSpecs
:
replicaSpecsObj
})
;
}
}
return
{
return
{
apiVersion
:
`kubeflow.org/
${
this
.
kubernetesCRDClient
.
apiVersion
}
`
,
apiVersion
:
`kubeflow.org/
${
this
.
kubernetesCRDClient
.
apiVersion
}
`
,
kind
:
this
.
kubernetesCRDClient
.
jobKind
,
kind
:
this
.
kubernetesCRDClient
.
jobKind
,
metadata
:
{
metadata
:
{
name
:
kubeflowJobName
,
name
:
kubeflowJobName
,
namespace
:
'
default
'
,
namespace
:
'
default
'
,
labels
:
{
labels
:
{
...
@@ -395,7 +406,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -395,7 +406,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
}
}
},
},
spec
:
replicaSpecsObjMap
.
get
(
this
.
kubernetesCRDClient
.
jobKind
)
spec
:
replicaSpecsObjMap
.
get
(
this
.
kubernetesCRDClient
.
jobKind
)
};
};
}
}
/**
/**
...
@@ -406,21 +417,22 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -406,21 +417,22 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
* @param runScriptFile script file name
* @param runScriptFile script file name
* @param podResources pod resource config section
* @param podResources pod resource config section
*/
*/
private
generateReplicaConfig
(
trialWorkingFolder
:
string
,
replicaNumber
:
number
,
replicaImage
:
string
,
runScriptFile
:
string
,
podResources
:
any
):
any
{
private
generateReplicaConfig
(
trialWorkingFolder
:
string
,
replicaNumber
:
number
,
replicaImage
:
string
,
runScriptFile
:
string
,
if
(
!
this
.
kubeflowClusterConfig
)
{
podResources
:
any
):
any
{
if
(
this
.
kubeflowClusterConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
}
}
if
(
!
this
.
kubeflowTrialConfig
)
{
if
(
this
.
kubeflowTrialConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow trial config is not initialized
'
);
throw
new
Error
(
'
Kubeflow trial config is not initialized
'
);
}
}
if
(
!
this
.
kubernetesCRDClient
)
{
if
(
this
.
kubernetesCRDClient
===
undefined
)
{
throw
new
Error
(
'
Kubeflow operator client is not initialized
'
);
throw
new
Error
(
'
Kubeflow operator client is not initialized
'
);
}
}
le
t
volumeSpecMap
=
new
Map
<
string
,
object
>
();
cons
t
volumeSpecMap
:
Map
<
string
,
object
>
=
new
Map
<
string
,
object
>
();
if
(
this
.
kubeflowClusterConfig
.
storageType
===
'
azureStorage
'
){
if
(
this
.
kubeflowClusterConfig
.
storageType
===
'
azureStorage
'
)
{
volumeSpecMap
.
set
(
'
nniVolumes
'
,
[
volumeSpecMap
.
set
(
'
nniVolumes
'
,
[
{
{
name
:
'
nni-vol
'
,
name
:
'
nni-vol
'
,
...
@@ -429,9 +441,9 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -429,9 +441,9 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
shareName
:
`
${
this
.
azureStorageShare
}
`
,
shareName
:
`
${
this
.
azureStorageShare
}
`
,
readonly
:
false
readonly
:
false
}
}
}])
}])
;
}
else
{
}
else
{
le
t
nfsKubeflowClusterConfig
:
KubeflowClusterConfigNFS
=
<
KubeflowClusterConfigNFS
>
this
.
kubeflowClusterConfig
;
cons
t
nfsKubeflowClusterConfig
:
KubeflowClusterConfigNFS
=
<
KubeflowClusterConfigNFS
>
this
.
kubeflowClusterConfig
;
volumeSpecMap
.
set
(
'
nniVolumes
'
,
[
volumeSpecMap
.
set
(
'
nniVolumes
'
,
[
{
{
name
:
'
nni-vol
'
,
name
:
'
nni-vol
'
,
...
@@ -439,13 +451,14 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -439,13 +451,14 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
server
:
`
${
nfsKubeflowClusterConfig
.
nfs
.
server
}
`
,
server
:
`
${
nfsKubeflowClusterConfig
.
nfs
.
server
}
`
,
path
:
`
${
nfsKubeflowClusterConfig
.
nfs
.
path
}
`
path
:
`
${
nfsKubeflowClusterConfig
.
nfs
.
path
}
`
}
}
}])
}])
;
}
}
return
{
return
{
replicas
:
replicaNumber
,
replicas
:
replicaNumber
,
template
:
{
template
:
{
metadata
:
{
metadata
:
{
// tslint:disable-next-line:no-null-keyword
creationTimestamp
:
null
creationTimestamp
:
null
},
},
spec
:
{
spec
:
{
...
@@ -455,7 +468,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -455,7 +468,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
// TODO: change the name based on operator's type
// TODO: change the name based on operator's type
name
:
this
.
kubernetesCRDClient
.
containerName
,
name
:
this
.
kubernetesCRDClient
.
containerName
,
image
:
replicaImage
,
image
:
replicaImage
,
args
:
[
"
sh
"
,
`
${
path
.
join
(
trialWorkingFolder
,
runScriptFile
)}
`
],
args
:
[
'
sh
'
,
`
${
path
.
join
(
trialWorkingFolder
,
runScriptFile
)}
`
],
volumeMounts
:
[
volumeMounts
:
[
{
{
name
:
'
nni-vol
'
,
name
:
'
nni-vol
'
,
...
@@ -470,5 +483,5 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -470,5 +483,5 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
};
};
}
}
}
}
// tslint:enable: no-unsafe-any no-any
export
{
KubeflowTrainingService
}
export
{
KubeflowTrainingService
}
;
src/nni_manager/training_service/kubernetes/kubernetesApiClient.ts
View file @
ba8dccd6
...
@@ -19,44 +19,46 @@
...
@@ -19,44 +19,46 @@
'
use strict
'
;
'
use strict
'
;
import
*
as
os
from
'
os
'
import
{
Client1_10
,
config
}
from
'
kubernetes-client
'
;
import
*
as
path
from
'
path
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
var
K8SClient
=
require
(
'
kubernetes-client
'
).
Client
;
var
K8SConfig
=
require
(
'
kubernetes-client
'
).
config
;
/**
/**
* Generic Kubernetes client, target version >= 1.9
* Generic Kubernetes client, target version >= 1.9
*/
*/
// tslint:disable: no-any no-unsafe-any
class
GeneralK8sClient
{
class
GeneralK8sClient
{
protected
readonly
client
:
any
;
protected
readonly
client
:
any
;
protected
readonly
log
:
Logger
=
getLogger
();
protected
readonly
log
:
Logger
=
getLogger
();
constructor
()
{
constructor
()
{
this
.
client
=
new
K8S
Client
({
config
:
K8SC
onfig
.
fromKubeconfig
(),
version
:
'
1.9
'
});
this
.
client
=
new
Client
1_10
({
config
:
c
onfig
.
fromKubeconfig
(),
version
:
'
1.9
'
});
this
.
client
.
loadSpec
();
this
.
client
.
loadSpec
();
}
}
public
async
createSecret
(
secretManifest
:
any
):
Promise
<
boolean
>
{
public
async
createSecret
(
secretManifest
:
any
):
Promise
<
boolean
>
{
let
result
:
Promise
<
boolean
>
;
let
result
:
Promise
<
boolean
>
;
const
response
:
any
=
await
this
.
client
.
api
.
v1
.
namespaces
(
'
default
'
).
secrets
.
post
({
body
:
secretManifest
});
const
response
:
any
=
await
this
.
client
.
api
.
v1
.
namespaces
(
'
default
'
).
secrets
if
(
response
.
statusCode
&&
(
response
.
statusCode
>=
200
&&
response
.
statusCode
<=
299
))
{
.
post
({
body
:
secretManifest
});
if
(
response
.
statusCode
&&
(
response
.
statusCode
>=
200
&&
response
.
statusCode
<=
299
))
{
result
=
Promise
.
resolve
(
true
);
result
=
Promise
.
resolve
(
true
);
}
else
{
}
else
{
result
=
Promise
.
reject
(
`Create secrets failed, statusCode is
${
response
.
statusCode
}
`
);
result
=
Promise
.
reject
(
`Create secrets failed, statusCode is
${
response
.
statusCode
}
`
);
}
}
return
result
;
return
result
;
}
}
}
}
/**
* Kubernetes CRD client
*/
abstract
class
KubernetesCRDClient
{
abstract
class
KubernetesCRDClient
{
protected
readonly
client
:
any
;
protected
readonly
client
:
any
;
protected
readonly
log
:
Logger
=
getLogger
();
protected
readonly
log
:
Logger
=
getLogger
();
protected
crdSchema
:
any
;
protected
crdSchema
:
any
;
constructor
()
{
constructor
()
{
this
.
client
=
new
K8S
Client
({
config
:
K8SC
onfig
.
fromKubeconfig
()
});
this
.
client
=
new
Client
1_10
({
config
:
c
onfig
.
fromKubeconfig
()
});
this
.
client
.
loadSpec
();
this
.
client
.
loadSpec
();
}
}
...
@@ -65,8 +67,8 @@ abstract class KubernetesCRDClient {
...
@@ -65,8 +67,8 @@ abstract class KubernetesCRDClient {
public
abstract
get
containerName
():
string
;
public
abstract
get
containerName
():
string
;
public
get
jobKind
():
string
{
public
get
jobKind
():
string
{
if
(
this
.
crdSchema
if
(
this
.
crdSchema
&&
this
.
crdSchema
.
spec
&&
this
.
crdSchema
.
spec
&&
this
.
crdSchema
.
spec
.
names
&&
this
.
crdSchema
.
spec
.
names
&&
this
.
crdSchema
.
spec
.
names
.
kind
)
{
&&
this
.
crdSchema
.
spec
.
names
.
kind
)
{
return
this
.
crdSchema
.
spec
.
names
.
kind
;
return
this
.
crdSchema
.
spec
.
names
.
kind
;
...
@@ -76,55 +78,62 @@ abstract class KubernetesCRDClient {
...
@@ -76,55 +78,62 @@ abstract class KubernetesCRDClient {
}
}
public
get
apiVersion
():
string
{
public
get
apiVersion
():
string
{
if
(
this
.
crdSchema
if
(
this
.
crdSchema
&&
this
.
crdSchema
.
spec
&&
this
.
crdSchema
.
spec
&&
this
.
crdSchema
.
spec
.
version
)
{
&&
this
.
crdSchema
.
spec
.
version
)
{
return
this
.
crdSchema
.
spec
.
version
;
return
this
.
crdSchema
.
spec
.
version
;
}
else
{
}
else
{
throw
new
Error
(
'
KubeflowOperatorClient: get apiVersion failed, version is undefined in crd schema!
'
);
throw
new
Error
(
'
KubeflowOperatorClient: get apiVersion failed, version is undefined in crd schema!
'
);
}
}
}
}
public
async
createKubernetesJob
(
jobManifest
:
any
):
Promise
<
boolean
>
{
public
async
createKubernetesJob
(
jobManifest
:
any
):
Promise
<
boolean
>
{
let
result
:
Promise
<
boolean
>
;
let
result
:
Promise
<
boolean
>
;
const
response
:
any
=
await
this
.
operator
.
post
({
body
:
jobManifest
});
const
response
:
any
=
await
this
.
operator
.
post
({
body
:
jobManifest
});
if
(
response
.
statusCode
&&
(
response
.
statusCode
>=
200
&&
response
.
statusCode
<=
299
))
{
if
(
response
.
statusCode
&&
(
response
.
statusCode
>=
200
&&
response
.
statusCode
<=
299
))
{
result
=
Promise
.
resolve
(
true
);
result
=
Promise
.
resolve
(
true
);
}
else
{
}
else
{
result
=
Promise
.
reject
(
`Create kubernetes job failed, statusCode is
${
response
.
statusCode
}
`
);
result
=
Promise
.
reject
(
`Create kubernetes job failed, statusCode is
${
response
.
statusCode
}
`
);
}
}
return
result
;
return
result
;
}
}
//TODO : replace any
//TODO : replace any
public
async
getKubernetesJob
(
kubeflowJobName
:
string
):
Promise
<
any
>
{
public
async
getKubernetesJob
(
kubeflowJobName
:
string
):
Promise
<
any
>
{
let
result
:
Promise
<
any
>
;
let
result
:
Promise
<
any
>
;
const
response
:
any
=
await
this
.
operator
(
kubeflowJobName
).
get
();
const
response
:
any
=
await
this
.
operator
(
kubeflowJobName
)
if
(
response
.
statusCode
&&
(
response
.
statusCode
>=
200
&&
response
.
statusCode
<=
299
))
{
.
get
();
if
(
response
.
statusCode
&&
(
response
.
statusCode
>=
200
&&
response
.
statusCode
<=
299
))
{
result
=
Promise
.
resolve
(
response
.
body
);
result
=
Promise
.
resolve
(
response
.
body
);
}
else
{
}
else
{
result
=
Promise
.
reject
(
`KubeflowOperatorClient get tfjobs failed, statusCode is
${
response
.
statusCode
}
`
);
result
=
Promise
.
reject
(
`KubeflowOperatorClient get tfjobs failed, statusCode is
${
response
.
statusCode
}
`
);
}
}
return
result
;
return
result
;
}
}
public
async
deleteKubernetesJob
(
labels
:
Map
<
string
,
string
>
):
Promise
<
boolean
>
{
public
async
deleteKubernetesJob
(
labels
:
Map
<
string
,
string
>
):
Promise
<
boolean
>
{
let
result
:
Promise
<
boolean
>
;
let
result
:
Promise
<
boolean
>
;
// construct match query from labels for deleting tfjob
// construct match query from labels for deleting tfjob
const
matchQuery
:
string
=
Array
.
from
(
labels
.
keys
()).
map
(
labelKey
=>
`
${
labelKey
}
=
${
labels
.
get
(
labelKey
)}
`
).
join
(
'
,
'
);
const
matchQuery
:
string
=
Array
.
from
(
labels
.
keys
())
.
map
((
labelKey
:
string
)
=>
`
${
labelKey
}
=
${
labels
.
get
(
labelKey
)}
`
)
.
join
(
'
,
'
);
try
{
try
{
const
deleteResult
:
any
=
await
this
.
operator
().
delete
({
const
deleteResult
:
any
=
await
this
.
operator
()
.
delete
({
qs
:
{
qs
:
{
labelSelector
:
matchQuery
,
labelSelector
:
matchQuery
,
propagationPolicy
:
"
Background
"
propagationPolicy
:
'
Background
'
}
}
});
});
if
(
deleteResult
.
statusCode
&&
deleteResult
.
statusCode
>=
200
&&
deleteResult
.
statusCode
<=
299
)
{
if
(
deleteResult
.
statusCode
&&
deleteResult
.
statusCode
>=
200
&&
deleteResult
.
statusCode
<=
299
)
{
result
=
Promise
.
resolve
(
true
);
result
=
Promise
.
resolve
(
true
);
}
else
{
}
else
{
result
=
Promise
.
reject
(
`KubeflowOperatorClient, delete labels
${
matchQuery
}
get wrong statusCode
${
deleteResult
.
statusCode
}
`
);
result
=
Promise
.
reject
(
`KubeflowOperatorClient, delete labels
${
matchQuery
}
get wrong statusCode
${
deleteResult
.
statusCode
}
`
);
}
}
}
catch
(
err
)
{
}
catch
(
err
)
{
result
=
Promise
.
reject
(
err
);
result
=
Promise
.
reject
(
err
);
}
}
...
...
src/nni_manager/training_service/kubernetes/kubernetesConfig.ts
View file @
ba8dccd6
...
@@ -22,16 +22,17 @@
...
@@ -22,16 +22,17 @@
export
type
KubernetesStorageKind
=
'
nfs
'
|
'
azureStorage
'
;
export
type
KubernetesStorageKind
=
'
nfs
'
|
'
azureStorage
'
;
import
{
MethodNotImplementedError
}
from
'
../../common/errors
'
;
import
{
MethodNotImplementedError
}
from
'
../../common/errors
'
;
// tslint:disable: completed-docs function-name
export
abstract
class
KubernetesClusterConfig
{
export
abstract
class
KubernetesClusterConfig
{
public
readonly
storage
?:
KubernetesStorageKind
;
public
readonly
storage
?:
KubernetesStorageKind
;
public
readonly
apiVersion
:
string
;
public
readonly
apiVersion
:
string
;
constructor
(
apiVersion
:
string
,
storage
?:
KubernetesStorageKind
)
{
constructor
(
apiVersion
:
string
,
storage
?:
KubernetesStorageKind
)
{
this
.
storage
=
storage
;
this
.
storage
=
storage
;
this
.
apiVersion
=
apiVersion
;
this
.
apiVersion
=
apiVersion
;
}
}
public
get
storageType
():
KubernetesStorageKind
{
public
get
storageType
():
KubernetesStorageKind
{
throw
new
MethodNotImplementedError
();
throw
new
MethodNotImplementedError
();
}
}
}
}
...
@@ -48,7 +49,7 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig {
...
@@ -48,7 +49,7 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig {
public
readonly
nfs
:
NFSConfig
;
public
readonly
nfs
:
NFSConfig
;
constructor
(
constructor
(
apiVersion
:
string
,
apiVersion
:
string
,
nfs
:
NFSConfig
,
nfs
:
NFSConfig
,
storage
?:
KubernetesStorageKind
storage
?:
KubernetesStorageKind
)
{
)
{
...
@@ -56,12 +57,13 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig {
...
@@ -56,12 +57,13 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig {
this
.
nfs
=
nfs
;
this
.
nfs
=
nfs
;
}
}
public
get
storageType
():
KubernetesStorageKind
{
public
get
storageType
():
KubernetesStorageKind
{
return
'
nfs
'
;
return
'
nfs
'
;
}
}
public
static
getInstance
(
jsonObject
:
object
):
KubernetesClusterConfigNFS
{
public
static
getInstance
(
jsonObject
:
object
):
KubernetesClusterConfigNFS
{
let
kubernetesClusterConfigObjectNFS
=
<
KubernetesClusterConfigNFS
>
jsonObject
;
const
kubernetesClusterConfigObjectNFS
:
KubernetesClusterConfigNFS
=
<
KubernetesClusterConfigNFS
>
jsonObject
;
return
new
KubernetesClusterConfigNFS
(
return
new
KubernetesClusterConfigNFS
(
kubernetesClusterConfigObjectNFS
.
apiVersion
,
kubernetesClusterConfigObjectNFS
.
apiVersion
,
kubernetesClusterConfigObjectNFS
.
nfs
,
kubernetesClusterConfigObjectNFS
.
nfs
,
...
@@ -71,13 +73,13 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig {
...
@@ -71,13 +73,13 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig {
}
}
export
class
KubernetesClusterConfigAzure
extends
KubernetesClusterConfig
{
export
class
KubernetesClusterConfigAzure
extends
KubernetesClusterConfig
{
public
readonly
keyVault
:
k
eyVaultConfig
;
public
readonly
keyVault
:
K
eyVaultConfig
;
public
readonly
azureStorage
:
AzureStorage
;
public
readonly
azureStorage
:
AzureStorage
;
constructor
(
constructor
(
apiVersion
:
string
,
apiVersion
:
string
,
keyVault
:
k
eyVaultConfig
,
keyVault
:
K
eyVaultConfig
,
azureStorage
:
AzureStorage
,
azureStorage
:
AzureStorage
,
storage
?:
KubernetesStorageKind
storage
?:
KubernetesStorageKind
)
{
)
{
super
(
apiVersion
,
storage
);
super
(
apiVersion
,
storage
);
...
@@ -85,12 +87,13 @@ export class KubernetesClusterConfigAzure extends KubernetesClusterConfig {
...
@@ -85,12 +87,13 @@ export class KubernetesClusterConfigAzure extends KubernetesClusterConfig {
this
.
azureStorage
=
azureStorage
;
this
.
azureStorage
=
azureStorage
;
}
}
public
get
storageType
():
KubernetesStorageKind
{
public
get
storageType
():
KubernetesStorageKind
{
return
'
azureStorage
'
;
return
'
azureStorage
'
;
}
}
public
static
getInstance
(
jsonObject
:
object
):
KubernetesClusterConfigAzure
{
public
static
getInstance
(
jsonObject
:
object
):
KubernetesClusterConfigAzure
{
let
kubernetesClusterConfigObjectAzure
=
<
KubernetesClusterConfigAzure
>
jsonObject
;
const
kubernetesClusterConfigObjectAzure
:
KubernetesClusterConfigAzure
=
<
KubernetesClusterConfigAzure
>
jsonObject
;
return
new
KubernetesClusterConfigAzure
(
return
new
KubernetesClusterConfigAzure
(
kubernetesClusterConfigObjectAzure
.
apiVersion
,
kubernetesClusterConfigObjectAzure
.
apiVersion
,
kubernetesClusterConfigObjectAzure
.
keyVault
,
kubernetesClusterConfigObjectAzure
.
keyVault
,
...
@@ -100,17 +103,20 @@ export class KubernetesClusterConfigAzure extends KubernetesClusterConfig {
...
@@ -100,17 +103,20 @@ export class KubernetesClusterConfigAzure extends KubernetesClusterConfig {
}
}
}
}
// tslint:disable-next-line:no-unnecessary-class
export
class
KubernetesClusterConfigFactory
{
export
class
KubernetesClusterConfigFactory
{
public
static
generateKubernetesClusterConfig
(
jsonObject
:
object
):
KubernetesClusterConfig
{
public
static
generateKubernetesClusterConfig
(
jsonObject
:
object
):
KubernetesClusterConfig
{
let
s
torageConfig
=
<
StorageConfig
>
jsonObject
;
const
storageConfig
:
S
torageConfig
=
<
StorageConfig
>
jsonObject
;
switch
(
storageConfig
.
storage
)
{
switch
(
storageConfig
.
storage
)
{
case
'
azureStorage
'
:
case
'
azureStorage
'
:
return
KubernetesClusterConfigAzure
.
getInstance
(
jsonObject
);
return
KubernetesClusterConfigAzure
.
getInstance
(
jsonObject
);
case
'
nfs
'
||
undefined
:
case
'
nfs
'
:
case
undefined
:
return
KubernetesClusterConfigNFS
.
getInstance
(
jsonObject
);
return
KubernetesClusterConfigNFS
.
getInstance
(
jsonObject
);
default
:
throw
new
Error
(
`Invalid json object
${
jsonObject
}
`
);
}
}
throw
new
Error
(
`Invalid json object
${
jsonObject
}
`
);
}
}
}
}
...
@@ -118,9 +124,9 @@ export class KubernetesClusterConfigFactory {
...
@@ -118,9 +124,9 @@ export class KubernetesClusterConfigFactory {
* NFS configuration to store Kubeflow job related files
* NFS configuration to store Kubeflow job related files
*/
*/
export
class
NFSConfig
{
export
class
NFSConfig
{
/
**
IP Adress of NFS server
*/
/
/
IP Adress of NFS server
public
readonly
server
:
string
;
public
readonly
server
:
string
;
/
**
exported NFS path on NFS server
*/
/
/
exported NFS path on NFS server
public
readonly
path
:
string
;
public
readonly
path
:
string
;
constructor
(
server
:
string
,
path
:
string
)
{
constructor
(
server
:
string
,
path
:
string
)
{
...
@@ -133,13 +139,13 @@ export class NFSConfig {
...
@@ -133,13 +139,13 @@ export class NFSConfig {
* KeyVault configuration to store the key of Azure Storage Service
* KeyVault configuration to store the key of Azure Storage Service
* Refer https://docs.microsoft.com/en-us/azure/key-vault/key-vault-manage-with-cli2
* Refer https://docs.microsoft.com/en-us/azure/key-vault/key-vault-manage-with-cli2
*/
*/
export
class
k
eyVaultConfig
{
export
class
K
eyVaultConfig
{
/
**
The vault-name to specify vault
*/
/
/
The vault-name to specify vault
public
readonly
vaultName
:
string
;
public
readonly
vaultName
:
string
;
/
**
The name to specify private key
*/
/
/
The name to specify private key
public
readonly
name
:
string
;
public
readonly
name
:
string
;
constructor
(
vaultName
:
string
,
name
:
string
){
constructor
(
vaultName
:
string
,
name
:
string
)
{
this
.
vaultName
=
vaultName
;
this
.
vaultName
=
vaultName
;
this
.
name
=
name
;
this
.
name
=
name
;
}
}
...
@@ -149,12 +155,12 @@ export class keyVaultConfig {
...
@@ -149,12 +155,12 @@ export class keyVaultConfig {
* Azure Storage Service
* Azure Storage Service
*/
*/
export
class
AzureStorage
{
export
class
AzureStorage
{
/
**
The azure share to storage files
*/
/
/
The azure share to storage files
public
readonly
azureShare
:
string
;
public
readonly
azureShare
:
string
;
/
**
The account name of sotrage service
*/
/
/
The account name of sotrage service
public
readonly
accountName
:
string
;
public
readonly
accountName
:
string
;
constructor
(
azureShare
:
string
,
accountName
:
string
){
constructor
(
azureShare
:
string
,
accountName
:
string
)
{
this
.
azureShare
=
azureShare
;
this
.
azureShare
=
azureShare
;
this
.
accountName
=
accountName
;
this
.
accountName
=
accountName
;
}
}
...
@@ -164,23 +170,23 @@ export class AzureStorage {
...
@@ -164,23 +170,23 @@ export class AzureStorage {
* Trial job configuration for Kubernetes
* Trial job configuration for Kubernetes
*/
*/
export
class
KubernetesTrialConfigTemplate
{
export
class
KubernetesTrialConfigTemplate
{
/
**
CPU number
*/
/
/
CPU number
public
readonly
cpuNum
:
number
;
public
readonly
cpuNum
:
number
;
/
**
Memory
*/
/
/
Memory
public
readonly
memoryMB
:
number
;
public
readonly
memoryMB
:
number
;
/
**
Docker image
*/
/
/
Docker image
public
readonly
image
:
string
;
public
readonly
image
:
string
;
/
**
Trail command
*/
/
/
Trail command
public
readonly
command
:
string
;
public
readonly
command
:
string
;
/
**
Required GPU number for trial job. The number should be in [0,100]
*/
/
/
Required GPU number for trial job. The number should be in [0,100]
public
readonly
gpuNum
:
number
;
public
readonly
gpuNum
:
number
;
constructor
(
command
:
string
,
gpuNum
:
number
,
constructor
(
command
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
)
{
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
)
{
this
.
command
=
command
;
this
.
command
=
command
;
this
.
gpuNum
=
gpuNum
;
this
.
gpuNum
=
gpuNum
;
this
.
cpuNum
=
cpuNum
;
this
.
cpuNum
=
cpuNum
;
...
@@ -195,4 +201,4 @@ export class KubernetesTrialConfig {
...
@@ -195,4 +201,4 @@ export class KubernetesTrialConfig {
constructor
(
codeDir
:
string
)
{
constructor
(
codeDir
:
string
)
{
this
.
codeDir
=
codeDir
;
this
.
codeDir
=
codeDir
;
}
}
}
}
\ No newline at end of file
src/nni_manager/training_service/kubernetes/kubernetesData.ts
View file @
ba8dccd6
...
@@ -24,7 +24,6 @@ import { JobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../commo
...
@@ -24,7 +24,6 @@ import { JobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../commo
/**
/**
* KubeflowTrialJobDetail
* KubeflowTrialJobDetail
*/
*/
// tslint:disable-next-line:max-classes-per-file
export
class
KubernetesTrialJobDetail
implements
TrialJobDetail
{
export
class
KubernetesTrialJobDetail
implements
TrialJobDetail
{
public
id
:
string
;
public
id
:
string
;
public
status
:
TrialJobStatus
;
public
status
:
TrialJobStatus
;
...
@@ -40,7 +39,7 @@ export class KubernetesTrialJobDetail implements TrialJobDetail {
...
@@ -40,7 +39,7 @@ export class KubernetesTrialJobDetail implements TrialJobDetail {
public
queryJobFailedCount
:
number
;
public
queryJobFailedCount
:
number
;
constructor
(
id
:
string
,
status
:
TrialJobStatus
,
submitTime
:
number
,
constructor
(
id
:
string
,
status
:
TrialJobStatus
,
submitTime
:
number
,
workingDirectory
:
string
,
form
:
JobApplicationForm
,
workingDirectory
:
string
,
form
:
JobApplicationForm
,
kubernetesJobName
:
string
,
sequenceId
:
number
,
url
:
string
)
{
kubernetesJobName
:
string
,
sequenceId
:
number
,
url
:
string
)
{
this
.
id
=
id
;
this
.
id
=
id
;
this
.
status
=
status
;
this
.
status
=
status
;
...
@@ -55,7 +54,7 @@ export class KubernetesTrialJobDetail implements TrialJobDetail {
...
@@ -55,7 +54,7 @@ export class KubernetesTrialJobDetail implements TrialJobDetail {
}
}
}
}
export
const
K
ubernetesScriptFormat
=
export
const
k
ubernetesScriptFormat
:
string
=
`#!/bin/bash
`#!/bin/bash
export NNI_PLATFORM={0}
export NNI_PLATFORM={0}
export NNI_SYS_DIR=$PWD/nni/{1}
export NNI_SYS_DIR=$PWD/nni/{1}
...
@@ -71,5 +70,5 @@ mkdir -p $NNI_OUTPUT_DIR
...
@@ -71,5 +70,5 @@ mkdir -p $NNI_OUTPUT_DIR
cp -rT $NNI_CODE_DIR $NNI_SYS_DIR
cp -rT $NNI_CODE_DIR $NNI_SYS_DIR
cd $NNI_SYS_DIR
cd $NNI_SYS_DIR
sh install_nni.sh
sh install_nni.sh
python3 -m nni_trial_tool.trial_keeper --trial_command '{8}' --nnimanager_ip {9} --nnimanager_port {10}
--nni_manager_version '{11}' --log_collection '{12}'`
python3 -m nni_trial_tool.trial_keeper --trial_command '{8}' --nnimanager_ip {9} --nnimanager_port {10}
\
+
`
1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr`
--nni_manager_version '{11}' --log_collection '{12}'
1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr`
;
src/nni_manager/training_service/kubernetes/kubernetesJobInfoCollector.ts
View file @
ba8dccd6
...
@@ -20,11 +20,10 @@
...
@@ -20,11 +20,10 @@
'
use strict
'
;
'
use strict
'
;
import
*
as
assert
from
'
assert
'
;
import
*
as
assert
from
'
assert
'
;
import
{
MethodNotImplementedError
,
NNIError
,
NNIErrorNames
}
from
'
../../common/errors
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
NNIError
,
NNIErrorNames
}
from
'
../../common/errors
'
;
import
{
TrialJobStatus
}
from
'
../../common/trainingService
'
;
import
{
TrialJobStatus
}
from
'
../../common/trainingService
'
;
import
{
KubernetesCRDClient
}
from
'
./kubernetesApiClient
'
;
import
{
KubernetesCRDClient
}
from
'
./kubernetesApiClient
'
;
import
{
MethodNotImplementedError
}
from
'
../../common/errors
'
;
import
{
KubernetesTrialJobDetail
}
from
'
./kubernetesData
'
;
import
{
KubernetesTrialJobDetail
}
from
'
./kubernetesData
'
;
/**
/**
...
@@ -43,22 +42,22 @@ export class KubernetesJobInfoCollector {
...
@@ -43,22 +42,22 @@ export class KubernetesJobInfoCollector {
public
async
retrieveTrialStatus
(
kubernetesCRDClient
:
KubernetesCRDClient
|
undefined
)
:
Promise
<
void
>
{
public
async
retrieveTrialStatus
(
kubernetesCRDClient
:
KubernetesCRDClient
|
undefined
)
:
Promise
<
void
>
{
assert
(
kubernetesCRDClient
!==
undefined
);
assert
(
kubernetesCRDClient
!==
undefined
);
const
updateKubernetesTrialJobs
:
Promise
<
void
>
[]
=
[];
const
updateKubernetesTrialJobs
:
Promise
<
void
>
[]
=
[];
for
(
le
t
[
trialJobId
,
kubernetesTrialJob
]
of
this
.
trialJobsMap
)
{
for
(
cons
t
[
trialJobId
,
kubernetesTrialJob
]
of
this
.
trialJobsMap
)
{
if
(
!
kubernetesTrialJob
)
{
if
(
kubernetesTrialJob
===
undefined
)
{
throw
new
NNIError
(
NNIErrorNames
.
NOT_FOUND
,
`trial job id
${
trialJobId
}
not found`
);
throw
new
NNIError
(
NNIErrorNames
.
NOT_FOUND
,
`trial job id
${
trialJobId
}
not found`
);
}
}
// Since Kubeflow needs some delay to schedule jobs, we provide 20 seconds buffer time to check kubeflow job's status
// Since Kubeflow needs some delay to schedule jobs, we provide 20 seconds buffer time to check kubeflow job's status
if
(
Date
.
now
()
-
kubernetesTrialJob
.
submitTime
<
20
*
1000
)
{
if
(
Date
.
now
()
-
kubernetesTrialJob
.
submitTime
<
20
*
1000
)
{
return
Promise
.
resolve
();
return
Promise
.
resolve
();
}
}
updateKubernetesTrialJobs
.
push
(
this
.
retrieveSingleTrialJobInfo
(
kubernetesCRDClient
,
kubernetesTrialJob
))
updateKubernetesTrialJobs
.
push
(
this
.
retrieveSingleTrialJobInfo
(
kubernetesCRDClient
,
kubernetesTrialJob
))
;
}
}
await
Promise
.
all
(
updateKubernetesTrialJobs
);
await
Promise
.
all
(
updateKubernetesTrialJobs
);
}
}
protected
async
retrieveSingleTrialJobInfo
(
kubernetesCRDClient
:
KubernetesCRDClient
|
undefined
,
protected
async
retrieveSingleTrialJobInfo
(
kubernetesCRDClient
:
KubernetesCRDClient
|
undefined
,
kubernetesTrialJob
:
KubernetesTrialJobDetail
)
:
Promise
<
void
>
{
kubernetesTrialJob
:
KubernetesTrialJobDetail
)
:
Promise
<
void
>
{
throw
new
MethodNotImplementedError
();
throw
new
MethodNotImplementedError
();
}
}
}
}
\ No newline at end of file
src/nni_manager/training_service/kubernetes/kubernetesJobRestServer.ts
View file @
ba8dccd6
...
@@ -19,19 +19,19 @@
...
@@ -19,19 +19,19 @@
'
use strict
'
;
'
use strict
'
;
import
*
as
component
from
'
../../common/component
'
;
import
{
Inject
}
from
'
typescript-ioc
'
;
import
{
Inject
}
from
'
typescript-ioc
'
;
import
*
as
component
from
'
../../common/component
'
;
import
{
ClusterJobRestServer
}
from
'
../common/clusterJobRestServer
'
;
import
{
KubernetesTrainingService
}
from
'
./kubernetesTrainingService
'
;
import
{
KubernetesTrainingService
}
from
'
./kubernetesTrainingService
'
;
import
{
ClusterJobRestServer
}
from
'
../common/clusterJobRestServer
'
/**
/**
* Kubeflow Training service Rest server, provides rest API to support kubeflow job metrics update
* Kubeflow Training service Rest server, provides rest API to support kubeflow job metrics update
*
*
*/
*/
@
component
.
Singleton
@
component
.
Singleton
export
class
KubernetesJobRestServer
extends
ClusterJobRestServer
{
export
class
KubernetesJobRestServer
extends
ClusterJobRestServer
{
@
Inject
@
Inject
private
kubernetesTrainingService
?
:
KubernetesTrainingService
;
private
readonly
kubernetesTrainingService
?
:
KubernetesTrainingService
;
/**
/**
* constructor to provide NNIRestServer's own rest property, e.g. port
* constructor to provide NNIRestServer's own rest property, e.g. port
...
@@ -41,8 +41,9 @@ export class KubernetesJobRestServer extends ClusterJobRestServer{
...
@@ -41,8 +41,9 @@ export class KubernetesJobRestServer extends ClusterJobRestServer{
this
.
kubernetesTrainingService
=
kubernetesTrainingService
;
this
.
kubernetesTrainingService
=
kubernetesTrainingService
;
}
}
// tslint:disable-next-line:no-any
protected
handleTrialMetrics
(
jobId
:
string
,
metrics
:
any
[])
:
void
{
protected
handleTrialMetrics
(
jobId
:
string
,
metrics
:
any
[])
:
void
{
if
(
!
this
.
kubernetesTrainingService
)
{
if
(
this
.
kubernetesTrainingService
===
undefined
)
{
throw
Error
(
'
kubernetesTrainingService not initialized!
'
);
throw
Error
(
'
kubernetesTrainingService not initialized!
'
);
}
}
// Split metrics array into single metric, then emit
// Split metrics array into single metric, then emit
...
@@ -53,5 +54,5 @@ export class KubernetesJobRestServer extends ClusterJobRestServer{
...
@@ -53,5 +54,5 @@ export class KubernetesJobRestServer extends ClusterJobRestServer{
data
:
singleMetric
data
:
singleMetric
});
});
}
}
}
}
}
}
\ No newline at end of file
Prev
1
2
3
4
5
6
7
8
9
10
11
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment