Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
ae7a72bc
Commit
ae7a72bc
authored
Jun 19, 2019
by
Hongarc
Committed by
Chi Song
Jun 19, 2019
Browse files
Remove all whitespace at end of line (#1162)
parent
14c1b31c
Changes
176
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
185 additions
and
185 deletions
+185
-185
src/nni_manager/training_service/common/clusterJobRestServer.ts
...i_manager/training_service/common/clusterJobRestServer.ts
+5
-5
src/nni_manager/training_service/common/containerJobData.ts
src/nni_manager/training_service/common/containerJobData.ts
+2
-2
src/nni_manager/training_service/common/gpuData.ts
src/nni_manager/training_service/common/gpuData.ts
+2
-2
src/nni_manager/training_service/common/util.ts
src/nni_manager/training_service/common/util.ts
+11
-11
src/nni_manager/training_service/kubernetes/azureStorageClientUtils.ts
...er/training_service/kubernetes/azureStorageClientUtils.ts
+28
-28
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerApiClient.ts
...netes/frameworkcontroller/frameworkcontrollerApiClient.ts
+1
-1
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerConfig.ts
...bernetes/frameworkcontroller/frameworkcontrollerConfig.ts
+9
-9
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerJobInfoCollector.ts
...rameworkcontroller/frameworkcontrollerJobInfoCollector.ts
+4
-4
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerJobRestServer.ts
...s/frameworkcontroller/frameworkcontrollerJobRestServer.ts
+2
-2
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts
...frameworkcontroller/frameworkcontrollerTrainingService.ts
+37
-37
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowApiClient.ts
...training_service/kubernetes/kubeflow/kubeflowApiClient.ts
+3
-3
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowConfig.ts
...er/training_service/kubernetes/kubeflow/kubeflowConfig.ts
+8
-8
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowJobInfoCollector.ts
...g_service/kubernetes/kubeflow/kubeflowJobInfoCollector.ts
+7
-7
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowJobRestServer.ts
...ning_service/kubernetes/kubeflow/kubeflowJobRestServer.ts
+2
-2
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts
...ng_service/kubernetes/kubeflow/kubeflowTrainingService.ts
+44
-44
src/nni_manager/training_service/kubernetes/kubernetesApiClient.ts
...anager/training_service/kubernetes/kubernetesApiClient.ts
+7
-7
src/nni_manager/training_service/kubernetes/kubernetesConfig.ts
...i_manager/training_service/kubernetes/kubernetesConfig.ts
+9
-9
src/nni_manager/training_service/kubernetes/kubernetesData.ts
...nni_manager/training_service/kubernetes/kubernetesData.ts
+1
-1
src/nni_manager/training_service/kubernetes/kubernetesJobInfoCollector.ts
...training_service/kubernetes/kubernetesJobInfoCollector.ts
+1
-1
src/nni_manager/training_service/kubernetes/kubernetesJobRestServer.ts
...er/training_service/kubernetes/kubernetesJobRestServer.ts
+2
-2
No files found.
src/nni_manager/training_service/common/clusterJobRestServer.ts
View file @
ae7a72bc
...
@@ -32,7 +32,7 @@ import { Writable } from 'stream';
...
@@ -32,7 +32,7 @@ import { Writable } from 'stream';
/**
/**
* Cluster Job Training service Rest server, provides rest API to support Cluster job metrics update
* Cluster Job Training service Rest server, provides rest API to support Cluster job metrics update
*
*
*/
*/
@
component
.
Singleton
@
component
.
Singleton
export
abstract
class
ClusterJobRestServer
extends
RestServer
{
export
abstract
class
ClusterJobRestServer
extends
RestServer
{
...
@@ -52,8 +52,8 @@ export abstract class ClusterJobRestServer extends RestServer{
...
@@ -52,8 +52,8 @@ export abstract class ClusterJobRestServer extends RestServer{
super
();
super
();
const
basePort
:
number
=
getBasePort
();
const
basePort
:
number
=
getBasePort
();
assert
(
basePort
&&
basePort
>
1024
);
assert
(
basePort
&&
basePort
>
1024
);
this
.
port
=
basePort
+
1
;
this
.
port
=
basePort
+
1
;
}
}
public
get
clusterRestServerPort
():
number
{
public
get
clusterRestServerPort
():
number
{
...
@@ -62,11 +62,11 @@ export abstract class ClusterJobRestServer extends RestServer{
...
@@ -62,11 +62,11 @@ export abstract class ClusterJobRestServer extends RestServer{
}
}
return
this
.
port
;
return
this
.
port
;
}
}
public
get
getErrorMessage
():
string
|
undefined
{
public
get
getErrorMessage
():
string
|
undefined
{
return
this
.
errorMessage
;
return
this
.
errorMessage
;
}
}
public
set
setEnableVersionCheck
(
versionCheck
:
boolean
)
{
public
set
setEnableVersionCheck
(
versionCheck
:
boolean
)
{
this
.
enableVersionCheck
=
versionCheck
;
this
.
enableVersionCheck
=
versionCheck
;
}
}
...
...
src/nni_manager/training_service/common/containerJobData.ts
View file @
ae7a72bc
...
@@ -19,12 +19,12 @@
...
@@ -19,12 +19,12 @@
'
use strict
'
;
'
use strict
'
;
export
const
CONTAINER_INSTALL_NNI_SHELL_FORMAT
:
string
=
export
const
CONTAINER_INSTALL_NNI_SHELL_FORMAT
:
string
=
`#!/bin/bash
`#!/bin/bash
if python3 -c 'import nni' > /dev/null 2>&1; then
if python3 -c 'import nni' > /dev/null 2>&1; then
# nni module is already installed, skip
# nni module is already installed, skip
return
return
else
else
# Install nni
# Install nni
python3 -m pip install --user --upgrade nni
python3 -m pip install --user --upgrade nni
fi`
;
fi`
;
\ No newline at end of file
src/nni_manager/training_service/common/gpuData.ts
View file @
ae7a72bc
...
@@ -59,7 +59,7 @@ export class GPUSummary {
...
@@ -59,7 +59,7 @@ export class GPUSummary {
}
}
}
}
export
const
GPU_INFO_COLLECTOR_FORMAT_LINUX
:
string
=
export
const
GPU_INFO_COLLECTOR_FORMAT_LINUX
:
string
=
`
`
#!/bin/bash
#!/bin/bash
export METRIC_OUTPUT_DIR={0}
export METRIC_OUTPUT_DIR={0}
...
@@ -67,7 +67,7 @@ echo $$ >{1}
...
@@ -67,7 +67,7 @@ echo $$ >{1}
python3 -m nni_gpu_tool.gpu_metrics_collector
python3 -m nni_gpu_tool.gpu_metrics_collector
`
`
export
const
GPU_INFO_COLLECTOR_FORMAT_WINDOWS
:
string
=
export
const
GPU_INFO_COLLECTOR_FORMAT_WINDOWS
:
string
=
`
`
$env:METRIC_OUTPUT_DIR="{0}"
$env:METRIC_OUTPUT_DIR="{0}"
$app = Start-Process "python" -ArgumentList "-m nni_gpu_tool.gpu_metrics_collector" -passthru -NoNewWindow
$app = Start-Process "python" -ArgumentList "-m nni_gpu_tool.gpu_metrics_collector" -passthru -NoNewWindow
...
...
src/nni_manager/training_service/common/util.ts
View file @
ae7a72bc
...
@@ -34,7 +34,7 @@ import { file } from "../../node_modules/@types/tmp";
...
@@ -34,7 +34,7 @@ import { file } from "../../node_modules/@types/tmp";
/**
/**
* Validate codeDir, calculate file count recursively under codeDir, and throw error if any rule is broken
* Validate codeDir, calculate file count recursively under codeDir, and throw error if any rule is broken
*
*
* @param codeDir codeDir in nni config file
* @param codeDir codeDir in nni config file
* @returns file number under codeDir
* @returns file number under codeDir
*/
*/
...
@@ -48,9 +48,9 @@ export async function validateCodeDir(codeDir: string) : Promise<number> {
...
@@ -48,9 +48,9 @@ export async function validateCodeDir(codeDir: string) : Promise<number> {
}
}
if
(
fileCount
&&
fileCount
>
1000
)
{
if
(
fileCount
&&
fileCount
>
1000
)
{
const
errMessage
:
string
=
`Too many files(
${
fileCount
}
found}) in
${
codeDir
}
,`
const
errMessage
:
string
=
`Too many files(
${
fileCount
}
found}) in
${
codeDir
}
,`
+
` please check if it's a valid code dir`
;
+
` please check if it's a valid code dir`
;
throw
new
Error
(
errMessage
);
throw
new
Error
(
errMessage
);
}
}
return
fileCount
;
return
fileCount
;
...
@@ -58,7 +58,7 @@ export async function validateCodeDir(codeDir: string) : Promise<number> {
...
@@ -58,7 +58,7 @@ export async function validateCodeDir(codeDir: string) : Promise<number> {
/**
/**
* crete a new directory
* crete a new directory
* @param directory
* @param directory
*/
*/
export
async
function
execMkdir
(
directory
:
string
):
Promise
<
void
>
{
export
async
function
execMkdir
(
directory
:
string
):
Promise
<
void
>
{
if
(
process
.
platform
===
'
win32
'
)
{
if
(
process
.
platform
===
'
win32
'
)
{
...
@@ -85,7 +85,7 @@ export async function execCopydir(source: string, destination: string): Promise<
...
@@ -85,7 +85,7 @@ export async function execCopydir(source: string, destination: string): Promise<
/**
/**
* crete a new file
* crete a new file
* @param filename
* @param filename
*/
*/
export
async
function
execNewFile
(
filename
:
string
):
Promise
<
void
>
{
export
async
function
execNewFile
(
filename
:
string
):
Promise
<
void
>
{
if
(
process
.
platform
===
'
win32
'
)
{
if
(
process
.
platform
===
'
win32
'
)
{
...
@@ -110,7 +110,7 @@ export function execScript(filePath: string): cp.ChildProcess {
...
@@ -110,7 +110,7 @@ export function execScript(filePath: string): cp.ChildProcess {
/**
/**
* output the last line of a file
* output the last line of a file
* @param filePath
* @param filePath
*/
*/
export
async
function
execTail
(
filePath
:
string
):
Promise
<
cpp
.
childProcessPromise
.
Result
>
{
export
async
function
execTail
(
filePath
:
string
):
Promise
<
cpp
.
childProcessPromise
.
Result
>
{
let
cmdresult
:
cpp
.
childProcessPromise
.
Result
;
let
cmdresult
:
cpp
.
childProcessPromise
.
Result
;
...
@@ -124,7 +124,7 @@ export async function execTail(filePath: string): Promise<cpp.childProcessPromis
...
@@ -124,7 +124,7 @@ export async function execTail(filePath: string): Promise<cpp.childProcessPromis
/**
/**
* delete a directory
* delete a directory
* @param directory
* @param directory
*/
*/
export
async
function
execRemove
(
directory
:
string
):
Promise
<
void
>
{
export
async
function
execRemove
(
directory
:
string
):
Promise
<
void
>
{
if
(
process
.
platform
===
'
win32
'
)
{
if
(
process
.
platform
===
'
win32
'
)
{
...
@@ -137,7 +137,7 @@ export async function execRemove(directory: string): Promise<void> {
...
@@ -137,7 +137,7 @@ export async function execRemove(directory: string): Promise<void> {
/**
/**
* kill a process
* kill a process
* @param directory
* @param directory
*/
*/
export
async
function
execKill
(
pid
:
string
):
Promise
<
void
>
{
export
async
function
execKill
(
pid
:
string
):
Promise
<
void
>
{
if
(
process
.
platform
===
'
win32
'
)
{
if
(
process
.
platform
===
'
win32
'
)
{
...
@@ -151,7 +151,7 @@ export async function execKill(pid: string): Promise<void> {
...
@@ -151,7 +151,7 @@ export async function execKill(pid: string): Promise<void> {
/**
/**
* set environment variable
* set environment variable
* @param variable
* @param variable
* @returns command string
* @returns command string
*/
*/
export
function
setEnvironmentVariable
(
variable
:
{
key
:
string
;
value
:
string
}):
string
{
export
function
setEnvironmentVariable
(
variable
:
{
key
:
string
;
value
:
string
}):
string
{
if
(
process
.
platform
===
'
win32
'
)
{
if
(
process
.
platform
===
'
win32
'
)
{
...
@@ -191,7 +191,7 @@ export async function tarAdd(tar_path: string, source_path: string): Promise<voi
...
@@ -191,7 +191,7 @@ export async function tarAdd(tar_path: string, source_path: string): Promise<voi
/**
/**
* generate script file name
* generate script file name
* @param fileNamePrefix
* @param fileNamePrefix
*/
*/
export
function
getScriptName
(
fileNamePrefix
:
string
):
string
{
export
function
getScriptName
(
fileNamePrefix
:
string
):
string
{
if
(
process
.
platform
===
'
win32
'
)
{
if
(
process
.
platform
===
'
win32
'
)
{
...
@@ -203,7 +203,7 @@ export function getScriptName(fileNamePrefix: string): string {
...
@@ -203,7 +203,7 @@ export function getScriptName(fileNamePrefix: string): string {
/**
/**
* generate script file
* generate script file
* @param gpuMetricCollectorScriptFolder
* @param gpuMetricCollectorScriptFolder
*/
*/
export
function
getgpuMetricsCollectorScriptContent
(
gpuMetricCollectorScriptFolder
:
string
):
string
{
export
function
getgpuMetricsCollectorScriptContent
(
gpuMetricCollectorScriptFolder
:
string
):
string
{
if
(
process
.
platform
===
'
win32
'
)
{
if
(
process
.
platform
===
'
win32
'
)
{
...
...
src/nni_manager/training_service/kubernetes/azureStorageClientUtils.ts
View file @
ae7a72bc
...
@@ -26,11 +26,11 @@ import { getLogger } from '../../common/log';
...
@@ -26,11 +26,11 @@ import { getLogger } from '../../common/log';
import
{
mkDirP
}
from
'
../../common/utils
'
;
import
{
mkDirP
}
from
'
../../common/utils
'
;
export
namespace
AzureStorageClientUtility
{
export
namespace
AzureStorageClientUtility
{
/**
/**
* create azure share
* create azure share
* @param fileServerClient
* @param fileServerClient
* @param azureShare
* @param azureShare
*/
*/
export
async
function
createShare
(
fileServerClient
:
any
,
azureShare
:
any
):
Promise
<
void
>
{
export
async
function
createShare
(
fileServerClient
:
any
,
azureShare
:
any
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
...
@@ -44,12 +44,12 @@ export namespace AzureStorageClientUtility {
...
@@ -44,12 +44,12 @@ export namespace AzureStorageClientUtility {
})
})
return
deferred
.
promise
;
return
deferred
.
promise
;
}
}
/**
/**
* Create a new directory (NOT recursively) in azure file storage.
* Create a new directory (NOT recursively) in azure file storage.
* @param fileServerClient
* @param fileServerClient
* @param azureFoler
* @param azureFoler
* @param azureShare
* @param azureShare
*/
*/
export
async
function
createDirectory
(
fileServerClient
:
any
,
azureFoler
:
any
,
azureShare
:
any
):
Promise
<
void
>
{
export
async
function
createDirectory
(
fileServerClient
:
any
,
azureFoler
:
any
,
azureShare
:
any
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
...
@@ -67,7 +67,7 @@ export namespace AzureStorageClientUtility {
...
@@ -67,7 +67,7 @@ export namespace AzureStorageClientUtility {
/**
/**
* Create a new directory recursively in azure file storage
* Create a new directory recursively in azure file storage
* @param fileServerClient
* @param fileServerClient
* @param azureDirectory
* @param azureDirectory
*/
*/
export
async
function
createDirectoryRecursive
(
fileServerClient
:
any
,
azureDirectory
:
any
,
azureShare
:
any
):
Promise
<
void
>
{
export
async
function
createDirectoryRecursive
(
fileServerClient
:
any
,
azureDirectory
:
any
,
azureShare
:
any
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
...
@@ -81,14 +81,14 @@ export namespace AzureStorageClientUtility {
...
@@ -81,14 +81,14 @@ export namespace AzureStorageClientUtility {
deferred
.
resolve
();
deferred
.
resolve
();
return
deferred
.
promise
;
return
deferred
.
promise
;
}
}
/**
/**
* upload a file to azure storage
* upload a file to azure storage
* @param fileServerClient
* @param fileServerClient
* @param azureDirectory
* @param azureDirectory
* @param azureFileName
* @param azureFileName
* @param azureShare
* @param azureShare
* @param localFilePath
* @param localFilePath
*/
*/
async
function
uploadFileToAzure
(
fileServerClient
:
any
,
azureDirectory
:
any
,
azureFileName
:
any
,
azureShare
:
any
,
localFilePath
:
any
):
Promise
<
void
>
{
async
function
uploadFileToAzure
(
fileServerClient
:
any
,
azureDirectory
:
any
,
azureFileName
:
any
,
azureShare
:
any
,
localFilePath
:
any
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
...
@@ -96,20 +96,20 @@ export namespace AzureStorageClientUtility {
...
@@ -96,20 +96,20 @@ export namespace AzureStorageClientUtility {
if
(
error
){
if
(
error
){
getLogger
().
error
(
`Upload file failed:,
${
error
}
`
);
getLogger
().
error
(
`Upload file failed:,
${
error
}
`
);
deferred
.
reject
(
error
);
deferred
.
reject
(
error
);
}
else
{
}
else
{
deferred
.
resolve
();
deferred
.
resolve
();
}
}
})
})
return
deferred
.
promise
;
return
deferred
.
promise
;
}
}
/**
/**
* download a file from azure storage
* download a file from azure storage
* @param fileServerClient
* @param fileServerClient
* @param azureDirectory
* @param azureDirectory
* @param azureFileName
* @param azureFileName
* @param azureShare
* @param azureShare
* @param localFilePath
* @param localFilePath
*/
*/
async
function
downloadFile
(
fileServerClient
:
any
,
azureDirectory
:
any
,
azureFileName
:
any
,
azureShare
:
any
,
localFilePath
:
any
):
Promise
<
void
>
{
async
function
downloadFile
(
fileServerClient
:
any
,
azureDirectory
:
any
,
azureFileName
:
any
,
azureShare
:
any
,
localFilePath
:
any
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
...
@@ -118,7 +118,7 @@ export namespace AzureStorageClientUtility {
...
@@ -118,7 +118,7 @@ export namespace AzureStorageClientUtility {
getLogger
().
error
(
`Download file failed:,
${
error
}
`
);
getLogger
().
error
(
`Download file failed:,
${
error
}
`
);
deferred
.
reject
(
error
);
deferred
.
reject
(
error
);
}
else
{
}
else
{
deferred
.
resolve
();
deferred
.
resolve
();
}
}
})
})
return
deferred
.
promise
;
return
deferred
.
promise
;
...
@@ -153,13 +153,13 @@ export namespace AzureStorageClientUtility {
...
@@ -153,13 +153,13 @@ export namespace AzureStorageClientUtility {
deferred
.
resolve
();
deferred
.
resolve
();
return
deferred
.
promise
;
return
deferred
.
promise
;
}
}
/**
/**
* downlod a directory from azure
* downlod a directory from azure
* @param fileServerClient
* @param fileServerClient
* @param azureDirectory
* @param azureDirectory
* @param azureShare
* @param azureShare
* @param localDirectory
* @param localDirectory
*/
*/
export
async
function
downloadDirectory
(
fileServerClient
:
any
,
azureDirectory
:
any
,
azureShare
:
any
,
localDirectory
:
any
):
Promise
<
void
>
{
export
async
function
downloadDirectory
(
fileServerClient
:
any
,
azureDirectory
:
any
,
azureShare
:
any
,
localDirectory
:
any
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
...
@@ -184,7 +184,7 @@ export namespace AzureStorageClientUtility {
...
@@ -184,7 +184,7 @@ export namespace AzureStorageClientUtility {
const
fullFilePath
:
string
=
path
.
join
(
localDirectory
,
fileName
.
name
);
const
fullFilePath
:
string
=
path
.
join
(
localDirectory
,
fileName
.
name
);
downloadFile
(
fileServerClient
,
azureDirectory
,
fileName
.
name
,
azureShare
,
fullFilePath
)
downloadFile
(
fileServerClient
,
azureDirectory
,
fileName
.
name
,
azureShare
,
fullFilePath
)
}
}
for
(
var
directoryName
of
result
[
'
entries
'
][
'
directories
'
]){
for
(
var
directoryName
of
result
[
'
entries
'
][
'
directories
'
]){
const
fullDirectoryPath
:
string
=
path
.
join
(
localDirectory
,
directoryName
.
name
)
const
fullDirectoryPath
:
string
=
path
.
join
(
localDirectory
,
directoryName
.
name
)
const
fullAzureDirectory
:
string
=
path
.
join
(
azureDirectory
,
directoryName
.
name
)
const
fullAzureDirectory
:
string
=
path
.
join
(
azureDirectory
,
directoryName
.
name
)
...
...
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerApiClient.ts
View file @
ae7a72bc
...
@@ -47,7 +47,7 @@ class FrameworkControllerClientV1 extends FrameworkControllerClient {
...
@@ -47,7 +47,7 @@ class FrameworkControllerClientV1 extends FrameworkControllerClient {
public
get
containerName
():
string
{
public
get
containerName
():
string
{
return
'
framework
'
;
return
'
framework
'
;
}
}
}
}
export
{
FrameworkControllerClient
,
GeneralK8sClient
};
export
{
FrameworkControllerClient
,
GeneralK8sClient
};
...
...
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerConfig.ts
View file @
ae7a72bc
...
@@ -40,8 +40,8 @@ export class FrameworkControllerTrialConfigTemplate extends KubernetesTrialConfi
...
@@ -40,8 +40,8 @@ export class FrameworkControllerTrialConfigTemplate extends KubernetesTrialConfi
public
readonly
frameworkAttemptCompletionPolicy
:
FrameworkAttemptCompletionPolicy
;
public
readonly
frameworkAttemptCompletionPolicy
:
FrameworkAttemptCompletionPolicy
;
public
readonly
name
:
string
;
public
readonly
name
:
string
;
public
readonly
taskNum
:
number
;
public
readonly
taskNum
:
number
;
constructor
(
taskNum
:
number
,
command
:
string
,
gpuNum
:
number
,
constructor
(
taskNum
:
number
,
command
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
,
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
,
frameworkAttemptCompletionPolicy
:
FrameworkAttemptCompletionPolicy
)
{
frameworkAttemptCompletionPolicy
:
FrameworkAttemptCompletionPolicy
)
{
super
(
command
,
gpuNum
,
cpuNum
,
memoryMB
,
image
);
super
(
command
,
gpuNum
,
cpuNum
,
memoryMB
,
image
);
this
.
frameworkAttemptCompletionPolicy
=
frameworkAttemptCompletionPolicy
;
this
.
frameworkAttemptCompletionPolicy
=
frameworkAttemptCompletionPolicy
;
...
@@ -71,8 +71,8 @@ export class FrameworkControllerClusterConfig extends KubernetesClusterConfig {
...
@@ -71,8 +71,8 @@ export class FrameworkControllerClusterConfig extends KubernetesClusterConfig {
export
class
FrameworkControllerClusterConfigNFS
extends
KubernetesClusterConfigNFS
{
export
class
FrameworkControllerClusterConfigNFS
extends
KubernetesClusterConfigNFS
{
public
readonly
serviceAccountName
:
string
;
public
readonly
serviceAccountName
:
string
;
constructor
(
constructor
(
serviceAccountName
:
string
,
serviceAccountName
:
string
,
apiVersion
:
string
,
apiVersion
:
string
,
nfs
:
NFSConfig
,
nfs
:
NFSConfig
,
storage
?:
KubernetesStorageKind
storage
?:
KubernetesStorageKind
)
{
)
{
...
@@ -94,12 +94,12 @@ export class FrameworkControllerClusterConfigNFS extends KubernetesClusterConfig
...
@@ -94,12 +94,12 @@ export class FrameworkControllerClusterConfigNFS extends KubernetesClusterConfig
export
class
FrameworkControllerClusterConfigAzure
extends
KubernetesClusterConfigAzure
{
export
class
FrameworkControllerClusterConfigAzure
extends
KubernetesClusterConfigAzure
{
public
readonly
serviceAccountName
:
string
;
public
readonly
serviceAccountName
:
string
;
constructor
(
constructor
(
serviceAccountName
:
string
,
serviceAccountName
:
string
,
apiVersion
:
string
,
apiVersion
:
string
,
keyVault
:
keyVaultConfig
,
keyVault
:
keyVaultConfig
,
azureStorage
:
AzureStorage
,
azureStorage
:
AzureStorage
,
storage
?:
KubernetesStorageKind
storage
?:
KubernetesStorageKind
)
{
)
{
super
(
apiVersion
,
keyVault
,
azureStorage
,
storage
);
super
(
apiVersion
,
keyVault
,
azureStorage
,
storage
);
...
...
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerJobInfoCollector.ts
View file @
ae7a72bc
...
@@ -32,7 +32,7 @@ export class FrameworkControllerJobInfoCollector extends KubernetesJobInfoCollec
...
@@ -32,7 +32,7 @@ export class FrameworkControllerJobInfoCollector extends KubernetesJobInfoCollec
super
(
jobMap
);
super
(
jobMap
);
}
}
protected
async
retrieveSingleTrialJobInfo
(
kubernetesCRDClient
:
KubernetesCRDClient
|
undefined
,
protected
async
retrieveSingleTrialJobInfo
(
kubernetesCRDClient
:
KubernetesCRDClient
|
undefined
,
kubernetesTrialJob
:
KubernetesTrialJobDetail
)
:
Promise
<
void
>
{
kubernetesTrialJob
:
KubernetesTrialJobDetail
)
:
Promise
<
void
>
{
if
(
!
this
.
statusesNeedToCheck
.
includes
(
kubernetesTrialJob
.
status
))
{
if
(
!
this
.
statusesNeedToCheck
.
includes
(
kubernetesTrialJob
.
status
))
{
return
Promise
.
resolve
();
return
Promise
.
resolve
();
...
@@ -44,7 +44,7 @@ export class FrameworkControllerJobInfoCollector extends KubernetesJobInfoCollec
...
@@ -44,7 +44,7 @@ export class FrameworkControllerJobInfoCollector extends KubernetesJobInfoCollec
let
kubernetesJobInfo
:
any
;
let
kubernetesJobInfo
:
any
;
try
{
try
{
kubernetesJobInfo
=
await
kubernetesCRDClient
.
getKubernetesJob
(
kubernetesTrialJob
.
kubernetesJobName
);
kubernetesJobInfo
=
await
kubernetesCRDClient
.
getKubernetesJob
(
kubernetesTrialJob
.
kubernetesJobName
);
}
catch
(
error
)
{
}
catch
(
error
)
{
this
.
log
.
error
(
`Get job
${
kubernetesTrialJob
.
kubernetesJobName
}
info failed, error is
${
error
}
`
);
this
.
log
.
error
(
`Get job
${
kubernetesTrialJob
.
kubernetesJobName
}
info failed, error is
${
error
}
`
);
//This is not treat as a error status
//This is not treat as a error status
...
@@ -71,9 +71,9 @@ export class FrameworkControllerJobInfoCollector extends KubernetesJobInfoCollec
...
@@ -71,9 +71,9 @@ export class FrameworkControllerJobInfoCollector extends KubernetesJobInfoCollec
break
;
break
;
case
'
Failed
'
:
case
'
Failed
'
:
kubernetesTrialJob
.
status
=
'
FAILED
'
;
kubernetesTrialJob
.
status
=
'
FAILED
'
;
break
;
break
;
}
}
kubernetesTrialJob
.
endTime
=
Date
.
parse
(
<
string
>
kubernetesJobInfo
.
status
.
completionTime
);
kubernetesTrialJob
.
endTime
=
Date
.
parse
(
<
string
>
kubernetesJobInfo
.
status
.
completionTime
);
break
;
break
;
default
:
default
:
break
;
break
;
...
...
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerJobRestServer.ts
View file @
ae7a72bc
...
@@ -25,11 +25,11 @@ import { KubernetesJobRestServer } from '../kubernetesJobRestServer'
...
@@ -25,11 +25,11 @@ import { KubernetesJobRestServer } from '../kubernetesJobRestServer'
/**
/**
* frameworkcontroller Training service Rest server, provides rest API to support frameworkcontroller job metrics update
* frameworkcontroller Training service Rest server, provides rest API to support frameworkcontroller job metrics update
*
*
*/
*/
@
component
.
Singleton
@
component
.
Singleton
export
class
FrameworkControllerJobRestServer
extends
KubernetesJobRestServer
{
export
class
FrameworkControllerJobRestServer
extends
KubernetesJobRestServer
{
constructor
()
{
constructor
()
{
super
(
component
.
get
(
FrameworkControllerTrainingService
));
super
(
component
.
get
(
FrameworkControllerTrainingService
));
}
}
}
}
\ No newline at end of file
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts
View file @
ae7a72bc
...
@@ -37,7 +37,7 @@ import { KubernetesTrialJobDetail } from '../kubernetesData';
...
@@ -37,7 +37,7 @@ import { KubernetesTrialJobDetail } from '../kubernetesData';
import
{
validateCodeDir
}
from
'
../../common/util
'
;
import
{
validateCodeDir
}
from
'
../../common/util
'
;
import
{
AzureStorageClientUtility
}
from
'
../azureStorageClientUtils
'
;
import
{
AzureStorageClientUtility
}
from
'
../azureStorageClientUtils
'
;
import
{
KubernetesTrainingService
}
from
'
../kubernetesTrainingService
'
;
import
{
KubernetesTrainingService
}
from
'
../kubernetesTrainingService
'
;
import
{
FrameworkControllerTrialConfig
,
FrameworkControllerClusterConfig
,
FrameworkControllerClusterConfigAzure
,
FrameworkControllerClusterConfigNFS
,
import
{
FrameworkControllerTrialConfig
,
FrameworkControllerClusterConfig
,
FrameworkControllerClusterConfigAzure
,
FrameworkControllerClusterConfigNFS
,
FrameworkControllerClusterConfigFactory
}
from
'
./frameworkcontrollerConfig
'
;
FrameworkControllerClusterConfigFactory
}
from
'
./frameworkcontrollerConfig
'
;
import
{
FrameworkControllerJobRestServer
}
from
'
./frameworkcontrollerJobRestServer
'
;
import
{
FrameworkControllerJobRestServer
}
from
'
./frameworkcontrollerJobRestServer
'
;
import
{
FrameworkControllerClient
}
from
'
./frameworkcontrollerApiClient
'
;
import
{
FrameworkControllerClient
}
from
'
./frameworkcontrollerApiClient
'
;
...
@@ -56,7 +56,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
...
@@ -56,7 +56,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
constructor
()
{
constructor
()
{
super
();
super
();
this
.
fcJobInfoCollector
=
new
FrameworkControllerJobInfoCollector
(
this
.
trialJobsMap
);
this
.
fcJobInfoCollector
=
new
FrameworkControllerJobInfoCollector
(
this
.
trialJobsMap
);
this
.
experimentId
=
getExperimentId
();
this
.
experimentId
=
getExperimentId
();
this
.
nextTrialSequenceId
=
-
1
;
this
.
nextTrialSequenceId
=
-
1
;
}
}
...
@@ -69,7 +69,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
...
@@ -69,7 +69,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
this
.
kubernetesJobRestServer
.
setEnableVersionCheck
=
this
.
versionCheck
;
this
.
kubernetesJobRestServer
.
setEnableVersionCheck
=
this
.
versionCheck
;
this
.
log
.
info
(
`frameworkcontroller Training service rest server listening on:
${
this
.
kubernetesJobRestServer
.
endPoint
}
`
);
this
.
log
.
info
(
`frameworkcontroller Training service rest server listening on:
${
this
.
kubernetesJobRestServer
.
endPoint
}
`
);
while
(
!
this
.
stopping
)
{
while
(
!
this
.
stopping
)
{
// collect metrics for frameworkcontroller jobs by interacting with Kubernetes API server
// collect metrics for frameworkcontroller jobs by interacting with Kubernetes API server
await
delay
(
3000
);
await
delay
(
3000
);
await
this
.
fcJobInfoCollector
.
retrieveTrialStatus
(
this
.
kubernetesCRDClient
);
await
this
.
fcJobInfoCollector
.
retrieveTrialStatus
(
this
.
kubernetesCRDClient
);
if
(
this
.
kubernetesJobRestServer
.
getErrorMessage
)
{
if
(
this
.
kubernetesJobRestServer
.
getErrorMessage
)
{
...
@@ -101,10 +101,10 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
...
@@ -101,10 +101,10 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
//Generate the port used for taskRole
//Generate the port used for taskRole
this
.
generateContainerPort
();
this
.
generateContainerPort
();
await
this
.
prepareRunScript
(
trialLocalTempFolder
,
curTrialSequenceId
,
trialJobId
,
trialWorkingFolder
,
form
);
await
this
.
prepareRunScript
(
trialLocalTempFolder
,
curTrialSequenceId
,
trialJobId
,
trialWorkingFolder
,
form
);
//upload code files
//upload code files
let
trialJobOutputUrl
:
string
=
await
this
.
uploadCodeFiles
(
trialJobId
,
trialLocalTempFolder
);
let
trialJobOutputUrl
:
string
=
await
this
.
uploadCodeFiles
(
trialJobId
,
trialLocalTempFolder
);
const
trialJobDetail
:
KubernetesTrialJobDetail
=
new
KubernetesTrialJobDetail
(
const
trialJobDetail
:
KubernetesTrialJobDetail
=
new
KubernetesTrialJobDetail
(
trialJobId
,
trialJobId
,
'
WAITING
'
,
'
WAITING
'
,
...
@@ -116,14 +116,14 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
...
@@ -116,14 +116,14 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
trialJobOutputUrl
trialJobOutputUrl
);
);
// Set trial job detail until create frameworkcontroller job successfully
// Set trial job detail until create frameworkcontroller job successfully
this
.
trialJobsMap
.
set
(
trialJobId
,
trialJobDetail
);
this
.
trialJobsMap
.
set
(
trialJobId
,
trialJobDetail
);
// Create frameworkcontroller job based on generated frameworkcontroller job resource config
// Create frameworkcontroller job based on generated frameworkcontroller job resource config
const
frameworkcontrollerJobConfig
=
await
this
.
prepareFrameworkControllerConfig
(
trialJobId
,
trialWorkingFolder
,
frameworkcontrollerJobName
);
const
frameworkcontrollerJobConfig
=
await
this
.
prepareFrameworkControllerConfig
(
trialJobId
,
trialWorkingFolder
,
frameworkcontrollerJobName
);
await
this
.
kubernetesCRDClient
.
createKubernetesJob
(
frameworkcontrollerJobConfig
);
await
this
.
kubernetesCRDClient
.
createKubernetesJob
(
frameworkcontrollerJobConfig
);
// Set trial job detail until create frameworkcontroller job successfully
// Set trial job detail until create frameworkcontroller job successfully
this
.
trialJobsMap
.
set
(
trialJobId
,
trialJobDetail
);
this
.
trialJobsMap
.
set
(
trialJobId
,
trialJobDetail
);
return
Promise
.
resolve
(
trialJobDetail
);
return
Promise
.
resolve
(
trialJobDetail
);
...
@@ -131,8 +131,8 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
...
@@ -131,8 +131,8 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
/**
/**
* upload code files to nfs or azureStroage
* upload code files to nfs or azureStroage
* @param trialJobId
* @param trialJobId
* @param trialLocalTempFolder
* @param trialLocalTempFolder
* return: trialJobOutputUrl
* return: trialJobOutputUrl
*/
*/
private
async
uploadCodeFiles
(
trialJobId
:
string
,
trialLocalTempFolder
:
string
):
Promise
<
string
>
{
private
async
uploadCodeFiles
(
trialJobId
:
string
,
trialLocalTempFolder
:
string
):
Promise
<
string
>
{
...
@@ -145,7 +145,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
...
@@ -145,7 +145,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
if
(
this
.
fcClusterConfig
.
storageType
===
'
azureStorage
'
)
{
if
(
this
.
fcClusterConfig
.
storageType
===
'
azureStorage
'
)
{
try
{
try
{
//upload local files to azure storage
//upload local files to azure storage
await
AzureStorageClientUtility
.
uploadDirectory
(
this
.
azureStorageClient
,
await
AzureStorageClientUtility
.
uploadDirectory
(
this
.
azureStorageClient
,
`nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
,
this
.
azureStorageShare
,
`
${
trialLocalTempFolder
}
`
);
`nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
,
this
.
azureStorageShare
,
`
${
trialLocalTempFolder
}
`
);
trialJobOutputUrl
=
`https://
${
this
.
azureStorageAccountName
}
.file.core.windows.net/
${
this
.
azureStorageShare
}
/
${
path
.
join
(
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
trialJobOutputUrl
=
`https://
${
this
.
azureStorageAccountName
}
.file.core.windows.net/
${
this
.
azureStorageShare
}
/
${
path
.
join
(
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
...
@@ -155,21 +155,21 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
...
@@ -155,21 +155,21 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
}
}
}
else
if
(
this
.
fcClusterConfig
.
storageType
===
'
nfs
'
)
{
}
else
if
(
this
.
fcClusterConfig
.
storageType
===
'
nfs
'
)
{
let
nfsFrameworkControllerClusterConfig
:
FrameworkControllerClusterConfigNFS
=
<
FrameworkControllerClusterConfigNFS
>
this
.
fcClusterConfig
;
let
nfsFrameworkControllerClusterConfig
:
FrameworkControllerClusterConfigNFS
=
<
FrameworkControllerClusterConfigNFS
>
this
.
fcClusterConfig
;
// Creat work dir for current trial in NFS directory
// Creat work dir for current trial in NFS directory
await
cpp
.
exec
(
`mkdir -p
${
this
.
trialLocalNFSTempFolder
}
/nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
);
await
cpp
.
exec
(
`mkdir -p
${
this
.
trialLocalNFSTempFolder
}
/nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
);
// Copy code files from local dir to NFS mounted dir
// Copy code files from local dir to NFS mounted dir
await
cpp
.
exec
(
`cp -r
${
trialLocalTempFolder
}
/*
${
this
.
trialLocalNFSTempFolder
}
/nni/
${
getExperimentId
()}
/
${
trialJobId
}
/.`
);
await
cpp
.
exec
(
`cp -r
${
trialLocalTempFolder
}
/*
${
this
.
trialLocalNFSTempFolder
}
/nni/
${
getExperimentId
()}
/
${
trialJobId
}
/.`
);
const
nfsConfig
:
NFSConfig
=
nfsFrameworkControllerClusterConfig
.
nfs
;
const
nfsConfig
:
NFSConfig
=
nfsFrameworkControllerClusterConfig
.
nfs
;
trialJobOutputUrl
=
`nfs://
${
nfsConfig
.
server
}
:
${
path
.
join
(
nfsConfig
.
path
,
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
trialJobOutputUrl
=
`nfs://
${
nfsConfig
.
server
}
:
${
path
.
join
(
nfsConfig
.
path
,
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
}
}
return
Promise
.
resolve
(
trialJobOutputUrl
);
return
Promise
.
resolve
(
trialJobOutputUrl
);
}
}
/**
/**
* generate trial's command for frameworkcontroller
* generate trial's command for frameworkcontroller
* expose port and execute injector.sh before executing user's command
* expose port and execute injector.sh before executing user's command
* @param command
* @param command
*/
*/
private
generateCommandScript
(
command
:
string
):
string
{
private
generateCommandScript
(
command
:
string
):
string
{
let
portScript
=
''
;
let
portScript
=
''
;
...
@@ -181,7 +181,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
...
@@ -181,7 +181,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
}
}
return
`
${
portScript
}
. /mnt/frameworkbarrier/injector.sh &&
${
command
}
`
;
return
`
${
portScript
}
. /mnt/frameworkbarrier/injector.sh &&
${
command
}
`
;
}
}
private
async
prepareRunScript
(
trialLocalTempFolder
:
string
,
curTrialSequenceId
:
number
,
trialJobId
:
string
,
trialWorkingFolder
:
string
,
form
:
JobApplicationForm
):
Promise
<
void
>
{
private
async
prepareRunScript
(
trialLocalTempFolder
:
string
,
curTrialSequenceId
:
number
,
trialJobId
:
string
,
trialWorkingFolder
:
string
,
form
:
JobApplicationForm
):
Promise
<
void
>
{
if
(
!
this
.
fcTrialConfig
)
{
if
(
!
this
.
fcTrialConfig
)
{
throw
new
Error
(
'
frameworkcontroller trial config is not initialized
'
);
throw
new
Error
(
'
frameworkcontroller trial config is not initialized
'
);
...
@@ -196,7 +196,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
...
@@ -196,7 +196,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
await
cpp
.
exec
(
`mkdir -p
${
trialLocalTempFolder
}
`
);
await
cpp
.
exec
(
`mkdir -p
${
trialLocalTempFolder
}
`
);
for
(
let
taskRole
of
this
.
fcTrialConfig
.
taskRoles
)
{
for
(
let
taskRole
of
this
.
fcTrialConfig
.
taskRoles
)
{
const
runScriptContent
:
string
=
await
this
.
generateRunScript
(
'
frameworkcontroller
'
,
trialJobId
,
trialWorkingFolder
,
const
runScriptContent
:
string
=
await
this
.
generateRunScript
(
'
frameworkcontroller
'
,
trialJobId
,
trialWorkingFolder
,
this
.
generateCommandScript
(
taskRole
.
command
),
curTrialSequenceId
.
toString
(),
taskRole
.
name
,
taskRole
.
gpuNum
);
this
.
generateCommandScript
(
taskRole
.
command
),
curTrialSequenceId
.
toString
(),
taskRole
.
name
,
taskRole
.
gpuNum
);
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
`run_
${
taskRole
.
name
}
.sh`
),
runScriptContent
,
{
encoding
:
'
utf8
'
});
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
`run_
${
taskRole
.
name
}
.sh`
),
runScriptContent
,
{
encoding
:
'
utf8
'
});
}
}
...
@@ -204,11 +204,11 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
...
@@ -204,11 +204,11 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
// Write file content ( parameter.cfg ) to local tmp folders
// Write file content ( parameter.cfg ) to local tmp folders
const
trialForm
:
TrialJobApplicationForm
=
(
<
TrialJobApplicationForm
>
form
)
const
trialForm
:
TrialJobApplicationForm
=
(
<
TrialJobApplicationForm
>
form
)
if
(
trialForm
&&
trialForm
.
hyperParameters
)
{
if
(
trialForm
&&
trialForm
.
hyperParameters
)
{
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
generateParamFileName
(
trialForm
.
hyperParameters
)),
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
generateParamFileName
(
trialForm
.
hyperParameters
)),
trialForm
.
hyperParameters
.
value
,
{
encoding
:
'
utf8
'
});
trialForm
.
hyperParameters
.
value
,
{
encoding
:
'
utf8
'
});
}
}
}
}
private
async
prepareFrameworkControllerConfig
(
trialJobId
:
string
,
trialWorkingFolder
:
string
,
frameworkcontrollerJobName
:
string
):
Promise
<
any
>
{
private
async
prepareFrameworkControllerConfig
(
trialJobId
:
string
,
trialWorkingFolder
:
string
,
frameworkcontrollerJobName
:
string
):
Promise
<
any
>
{
if
(
!
this
.
fcTrialConfig
)
{
if
(
!
this
.
fcTrialConfig
)
{
...
@@ -222,18 +222,18 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
...
@@ -222,18 +222,18 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
resource
.
limits
=
Object
.
assign
({},
resource
.
requests
);
resource
.
limits
=
Object
.
assign
({},
resource
.
requests
);
podResources
.
push
(
resource
);
podResources
.
push
(
resource
);
}
}
// Generate frameworkcontroller job resource config object
// Generate frameworkcontroller job resource config object
const
frameworkcontrollerJobConfig
:
any
=
this
.
generateFrameworkControllerJobConfig
(
trialJobId
,
trialWorkingFolder
,
frameworkcontrollerJobName
,
podResources
);
const
frameworkcontrollerJobConfig
:
any
=
this
.
generateFrameworkControllerJobConfig
(
trialJobId
,
trialWorkingFolder
,
frameworkcontrollerJobName
,
podResources
);
return
Promise
.
resolve
(
frameworkcontrollerJobConfig
);
return
Promise
.
resolve
(
frameworkcontrollerJobConfig
);
}
}
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
switch
(
key
)
{
switch
(
key
)
{
case
TrialConfigMetadataKey
.
NNI_MANAGER_IP
:
case
TrialConfigMetadataKey
.
NNI_MANAGER_IP
:
this
.
nniManagerIpConfig
=
<
NNIManagerIpConfig
>
JSON
.
parse
(
value
);
this
.
nniManagerIpConfig
=
<
NNIManagerIpConfig
>
JSON
.
parse
(
value
);
break
;
break
;
case
TrialConfigMetadataKey
.
FRAMEWORKCONTROLLER_CLUSTER_CONFIG
:
case
TrialConfigMetadataKey
.
FRAMEWORKCONTROLLER_CLUSTER_CONFIG
:
let
frameworkcontrollerClusterJsonObject
=
JSON
.
parse
(
value
);
let
frameworkcontrollerClusterJsonObject
=
JSON
.
parse
(
value
);
this
.
fcClusterConfig
=
FrameworkControllerClusterConfigFactory
.
generateFrameworkControllerClusterConfig
(
frameworkcontrollerClusterJsonObject
);
this
.
fcClusterConfig
=
FrameworkControllerClusterConfigFactory
.
generateFrameworkControllerClusterConfig
(
frameworkcontrollerClusterJsonObject
);
...
@@ -253,7 +253,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
...
@@ -253,7 +253,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
nfsFrameworkControllerClusterConfig
.
nfs
.
server
,
nfsFrameworkControllerClusterConfig
.
nfs
.
server
,
nfsFrameworkControllerClusterConfig
.
nfs
.
path
nfsFrameworkControllerClusterConfig
.
nfs
.
path
);
);
}
}
this
.
kubernetesCRDClient
=
FrameworkControllerClient
.
generateFrameworkControllerClient
();
this
.
kubernetesCRDClient
=
FrameworkControllerClient
.
generateFrameworkControllerClient
();
break
;
break
;
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
...
@@ -269,7 +269,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
...
@@ -269,7 +269,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
await
validateCodeDir
(
this
.
fcTrialConfig
.
codeDir
);
await
validateCodeDir
(
this
.
fcTrialConfig
.
codeDir
);
}
catch
(
error
)
{
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
this
.
log
.
error
(
error
);
return
Promise
.
reject
(
new
Error
(
error
));
return
Promise
.
reject
(
new
Error
(
error
));
}
}
break
;
break
;
case
TrialConfigMetadataKey
.
VERSION_CHECK
:
case
TrialConfigMetadataKey
.
VERSION_CHECK
:
...
@@ -284,7 +284,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
...
@@ -284,7 +284,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
return
Promise
.
resolve
();
return
Promise
.
resolve
();
}
}
private
generateContainerPort
()
{
private
generateContainerPort
()
{
if
(
!
this
.
fcTrialConfig
)
{
if
(
!
this
.
fcTrialConfig
)
{
throw
new
Error
(
'
frameworkcontroller trial config is not initialized
'
);
throw
new
Error
(
'
frameworkcontroller trial config is not initialized
'
);
...
@@ -312,7 +312,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
...
@@ -312,7 +312,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
if
(
!
this
.
fcTrialConfig
)
{
if
(
!
this
.
fcTrialConfig
)
{
throw
new
Error
(
'
frameworkcontroller trial config is not initialized
'
);
throw
new
Error
(
'
frameworkcontroller trial config is not initialized
'
);
}
}
let
taskRoles
=
[];
let
taskRoles
=
[];
for
(
let
index
in
this
.
fcTrialConfig
.
taskRoles
)
{
for
(
let
index
in
this
.
fcTrialConfig
.
taskRoles
)
{
let
containerPort
=
this
.
fcContainerPortMap
.
get
(
this
.
fcTrialConfig
.
taskRoles
[
index
].
name
);
let
containerPort
=
this
.
fcContainerPortMap
.
get
(
this
.
fcTrialConfig
.
taskRoles
[
index
].
name
);
...
@@ -320,8 +320,8 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
...
@@ -320,8 +320,8 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
throw
new
Error
(
'
Container port is not initialized
'
);
throw
new
Error
(
'
Container port is not initialized
'
);
}
}
let
taskRole
=
this
.
generateTaskRoleConfig
(
let
taskRole
=
this
.
generateTaskRoleConfig
(
trialWorkingFolder
,
trialWorkingFolder
,
this
.
fcTrialConfig
.
taskRoles
[
index
].
image
,
this
.
fcTrialConfig
.
taskRoles
[
index
].
image
,
`run_
${
this
.
fcTrialConfig
.
taskRoles
[
index
].
name
}
.sh`
,
`run_
${
this
.
fcTrialConfig
.
taskRoles
[
index
].
name
}
.sh`
,
podResources
[
index
],
podResources
[
index
],
containerPort
containerPort
...
@@ -330,17 +330,17 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
...
@@ -330,17 +330,17 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
name
:
this
.
fcTrialConfig
.
taskRoles
[
index
].
name
,
name
:
this
.
fcTrialConfig
.
taskRoles
[
index
].
name
,
taskNumber
:
this
.
fcTrialConfig
.
taskRoles
[
index
].
taskNum
,
taskNumber
:
this
.
fcTrialConfig
.
taskRoles
[
index
].
taskNum
,
frameworkAttemptCompletionPolicy
:
{
frameworkAttemptCompletionPolicy
:
{
minFailedTaskCount
:
this
.
fcTrialConfig
.
taskRoles
[
index
].
frameworkAttemptCompletionPolicy
.
minFailedTaskCount
,
minFailedTaskCount
:
this
.
fcTrialConfig
.
taskRoles
[
index
].
frameworkAttemptCompletionPolicy
.
minFailedTaskCount
,
minSucceededTaskCount
:
this
.
fcTrialConfig
.
taskRoles
[
index
].
frameworkAttemptCompletionPolicy
.
minSucceededTaskCount
minSucceededTaskCount
:
this
.
fcTrialConfig
.
taskRoles
[
index
].
frameworkAttemptCompletionPolicy
.
minSucceededTaskCount
},
},
task
:
taskRole
task
:
taskRole
});
});
}
}
return
{
return
{
apiVersion
:
`frameworkcontroller.microsoft.com/v1`
,
apiVersion
:
`frameworkcontroller.microsoft.com/v1`
,
kind
:
'
Framework
'
,
kind
:
'
Framework
'
,
metadata
:
{
metadata
:
{
name
:
frameworkcontrollerJobName
,
name
:
frameworkcontrollerJobName
,
namespace
:
'
default
'
,
namespace
:
'
default
'
,
labels
:
{
labels
:
{
...
@@ -356,7 +356,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
...
@@ -356,7 +356,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
};
};
}
}
private
generateTaskRoleConfig
(
trialWorkingFolder
:
string
,
replicaImage
:
string
,
runScriptFile
:
string
,
podResources
:
any
,
containerPort
:
number
):
any
{
private
generateTaskRoleConfig
(
trialWorkingFolder
:
string
,
replicaImage
:
string
,
runScriptFile
:
string
,
podResources
:
any
,
containerPort
:
number
):
any
{
if
(
!
this
.
fcClusterConfig
)
{
if
(
!
this
.
fcClusterConfig
)
{
...
@@ -366,7 +366,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
...
@@ -366,7 +366,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
if
(
!
this
.
fcTrialConfig
)
{
if
(
!
this
.
fcTrialConfig
)
{
throw
new
Error
(
'
frameworkcontroller trial config is not initialized
'
);
throw
new
Error
(
'
frameworkcontroller trial config is not initialized
'
);
}
}
let
volumeSpecMap
=
new
Map
<
string
,
object
>
();
let
volumeSpecMap
=
new
Map
<
string
,
object
>
();
if
(
this
.
fcClusterConfig
.
storageType
===
'
azureStorage
'
){
if
(
this
.
fcClusterConfig
.
storageType
===
'
azureStorage
'
){
volumeSpecMap
.
set
(
'
nniVolumes
'
,
[
volumeSpecMap
.
set
(
'
nniVolumes
'
,
[
...
@@ -395,7 +395,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
...
@@ -395,7 +395,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
emptyDir
:
{}
emptyDir
:
{}
}])
}])
}
}
let
containers
=
[
let
containers
=
[
{
{
name
:
'
framework
'
,
name
:
'
framework
'
,
...
@@ -420,7 +420,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
...
@@ -420,7 +420,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
name
:
'
frameworkbarrier
'
,
name
:
'
frameworkbarrier
'
,
image
:
'
frameworkcontroller/frameworkbarrier
'
,
image
:
'
frameworkcontroller/frameworkbarrier
'
,
volumeMounts
:
[
volumeMounts
:
[
{
{
name
:
'
frameworkbarrier-volume
'
,
name
:
'
frameworkbarrier-volume
'
,
mountPath
:
'
/mnt/frameworkbarrier
'
mountPath
:
'
/mnt/frameworkbarrier
'
}]
}]
...
@@ -432,8 +432,8 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
...
@@ -432,8 +432,8 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
volumes
:
volumeSpecMap
.
get
(
'
nniVolumes
'
),
volumes
:
volumeSpecMap
.
get
(
'
nniVolumes
'
),
hostNetwork
:
false
hostNetwork
:
false
};
};
if
(
this
.
fcClusterConfig
.
serviceAccountName
)
{
if
(
this
.
fcClusterConfig
.
serviceAccountName
)
{
spec
.
serviceAccountName
=
this
.
fcClusterConfig
.
serviceAccountName
;
spec
.
serviceAccountName
=
this
.
fcClusterConfig
.
serviceAccountName
;
}
}
let
taskRole
=
{
let
taskRole
=
{
pod
:
{
pod
:
{
...
...
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowApiClient.ts
View file @
ae7a72bc
...
@@ -27,7 +27,7 @@ abstract class KubeflowOperatorClient extends KubernetesCRDClient{
...
@@ -27,7 +27,7 @@ abstract class KubeflowOperatorClient extends KubernetesCRDClient{
/**
/**
* Factory method to generate operator cliet
* Factory method to generate operator cliet
*/
*/
public
static
generateOperatorClient
(
kubeflowOperator
:
KubeflowOperator
,
public
static
generateOperatorClient
(
kubeflowOperator
:
KubeflowOperator
,
operatorApiVersion
:
string
):
KubernetesCRDClient
{
operatorApiVersion
:
string
):
KubernetesCRDClient
{
switch
(
kubeflowOperator
)
{
switch
(
kubeflowOperator
)
{
case
'
tf-operator
'
:
{
case
'
tf-operator
'
:
{
...
@@ -78,7 +78,7 @@ class TFOperatorClientV1Alpha2 extends KubeflowOperatorClient {
...
@@ -78,7 +78,7 @@ class TFOperatorClientV1Alpha2 extends KubeflowOperatorClient {
public
get
containerName
():
string
{
public
get
containerName
():
string
{
return
'
tensorflow
'
;
return
'
tensorflow
'
;
}
}
}
}
class
TFOperatorClientV1Beta1
extends
KubernetesCRDClient
{
class
TFOperatorClientV1Beta1
extends
KubernetesCRDClient
{
...
@@ -97,7 +97,7 @@ class TFOperatorClientV1Beta1 extends KubernetesCRDClient {
...
@@ -97,7 +97,7 @@ class TFOperatorClientV1Beta1 extends KubernetesCRDClient {
public
get
containerName
():
string
{
public
get
containerName
():
string
{
return
'
tensorflow
'
;
return
'
tensorflow
'
;
}
}
}
}
class
TFOperatorClientV1Beta2
extends
KubernetesCRDClient
{
class
TFOperatorClientV1Beta2
extends
KubernetesCRDClient
{
...
...
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowConfig.ts
View file @
ae7a72bc
...
@@ -41,8 +41,8 @@ export class KubeflowClusterConfig extends KubernetesClusterConfig {
...
@@ -41,8 +41,8 @@ export class KubeflowClusterConfig extends KubernetesClusterConfig {
export
class
KubeflowClusterConfigNFS
extends
KubernetesClusterConfigNFS
{
export
class
KubeflowClusterConfigNFS
extends
KubernetesClusterConfigNFS
{
public
readonly
operator
:
KubeflowOperator
;
public
readonly
operator
:
KubeflowOperator
;
constructor
(
constructor
(
operator
:
KubeflowOperator
,
operator
:
KubeflowOperator
,
apiVersion
:
string
,
apiVersion
:
string
,
nfs
:
NFSConfig
,
nfs
:
NFSConfig
,
storage
?:
KubernetesStorageKind
storage
?:
KubernetesStorageKind
)
{
)
{
...
@@ -68,12 +68,12 @@ export class KubeflowClusterConfigNFS extends KubernetesClusterConfigNFS {
...
@@ -68,12 +68,12 @@ export class KubeflowClusterConfigNFS extends KubernetesClusterConfigNFS {
export
class
KubeflowClusterConfigAzure
extends
KubernetesClusterConfigAzure
{
export
class
KubeflowClusterConfigAzure
extends
KubernetesClusterConfigAzure
{
public
readonly
operator
:
KubeflowOperator
;
public
readonly
operator
:
KubeflowOperator
;
constructor
(
constructor
(
operator
:
KubeflowOperator
,
operator
:
KubeflowOperator
,
apiVersion
:
string
,
apiVersion
:
string
,
keyVault
:
keyVaultConfig
,
keyVault
:
keyVaultConfig
,
azureStorage
:
AzureStorage
,
azureStorage
:
AzureStorage
,
storage
?:
KubernetesStorageKind
storage
?:
KubernetesStorageKind
)
{
)
{
super
(
apiVersion
,
keyVault
,
azureStorage
,
storage
);
super
(
apiVersion
,
keyVault
,
azureStorage
,
storage
);
...
@@ -124,7 +124,7 @@ export class KubeflowTrialConfig extends KubernetesTrialConfig {
...
@@ -124,7 +124,7 @@ export class KubeflowTrialConfig extends KubernetesTrialConfig {
export
class
KubeflowTrialConfigTemplate
extends
KubernetesTrialConfigTemplate
{
export
class
KubeflowTrialConfigTemplate
extends
KubernetesTrialConfigTemplate
{
public
readonly
replicas
:
number
;
public
readonly
replicas
:
number
;
constructor
(
replicas
:
number
,
command
:
string
,
gpuNum
:
number
,
constructor
(
replicas
:
number
,
command
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
)
{
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
)
{
super
(
command
,
gpuNum
,
cpuNum
,
memoryMB
,
image
);
super
(
command
,
gpuNum
,
cpuNum
,
memoryMB
,
image
);
this
.
replicas
=
replicas
;
this
.
replicas
=
replicas
;
...
...
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowJobInfoCollector.ts
View file @
ae7a72bc
...
@@ -32,7 +32,7 @@ export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector{
...
@@ -32,7 +32,7 @@ export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector{
super
(
jobMap
);
super
(
jobMap
);
}
}
protected
async
retrieveSingleTrialJobInfo
(
kubernetesCRDClient
:
KubernetesCRDClient
|
undefined
,
protected
async
retrieveSingleTrialJobInfo
(
kubernetesCRDClient
:
KubernetesCRDClient
|
undefined
,
kubernetesTrialJob
:
KubernetesTrialJobDetail
)
:
Promise
<
void
>
{
kubernetesTrialJob
:
KubernetesTrialJobDetail
)
:
Promise
<
void
>
{
if
(
!
this
.
statusesNeedToCheck
.
includes
(
kubernetesTrialJob
.
status
))
{
if
(
!
this
.
statusesNeedToCheck
.
includes
(
kubernetesTrialJob
.
status
))
{
return
Promise
.
resolve
();
return
Promise
.
resolve
();
...
@@ -44,9 +44,9 @@ export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector{
...
@@ -44,9 +44,9 @@ export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector{
let
kubernetesJobInfo
:
any
;
let
kubernetesJobInfo
:
any
;
try
{
try
{
kubernetesJobInfo
=
await
kubernetesCRDClient
.
getKubernetesJob
(
kubernetesTrialJob
.
kubernetesJobName
);
kubernetesJobInfo
=
await
kubernetesCRDClient
.
getKubernetesJob
(
kubernetesTrialJob
.
kubernetesJobName
);
}
catch
(
error
)
{
}
catch
(
error
)
{
// Notice: it maynot be a 'real' error since cancel trial job can also cause getKubernetesJob failed.
// Notice: it maynot be a 'real' error since cancel trial job can also cause getKubernetesJob failed.
this
.
log
.
error
(
`Get job
${
kubernetesTrialJob
.
kubernetesJobName
}
info failed, error is
${
error
}
`
);
this
.
log
.
error
(
`Get job
${
kubernetesTrialJob
.
kubernetesJobName
}
info failed, error is
${
error
}
`
);
//This is not treat as a error status
//This is not treat as a error status
return
Promise
.
resolve
();
return
Promise
.
resolve
();
...
@@ -58,8 +58,8 @@ export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector{
...
@@ -58,8 +58,8 @@ export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector{
switch
(
tfJobType
)
{
switch
(
tfJobType
)
{
case
'
Created
'
:
case
'
Created
'
:
kubernetesTrialJob
.
status
=
'
WAITING
'
;
kubernetesTrialJob
.
status
=
'
WAITING
'
;
kubernetesTrialJob
.
startTime
=
Date
.
parse
(
<
string
>
latestCondition
.
lastUpdateTime
);
kubernetesTrialJob
.
startTime
=
Date
.
parse
(
<
string
>
latestCondition
.
lastUpdateTime
);
break
;
break
;
case
'
Running
'
:
case
'
Running
'
:
kubernetesTrialJob
.
status
=
'
RUNNING
'
;
kubernetesTrialJob
.
status
=
'
RUNNING
'
;
if
(
!
kubernetesTrialJob
.
startTime
)
{
if
(
!
kubernetesTrialJob
.
startTime
)
{
...
@@ -68,11 +68,11 @@ export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector{
...
@@ -68,11 +68,11 @@ export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector{
break
;
break
;
case
'
Failed
'
:
case
'
Failed
'
:
kubernetesTrialJob
.
status
=
'
FAILED
'
;
kubernetesTrialJob
.
status
=
'
FAILED
'
;
kubernetesTrialJob
.
endTime
=
Date
.
parse
(
<
string
>
latestCondition
.
lastUpdateTime
);
kubernetesTrialJob
.
endTime
=
Date
.
parse
(
<
string
>
latestCondition
.
lastUpdateTime
);
break
;
break
;
case
'
Succeeded
'
:
case
'
Succeeded
'
:
kubernetesTrialJob
.
status
=
'
SUCCEEDED
'
;
kubernetesTrialJob
.
status
=
'
SUCCEEDED
'
;
kubernetesTrialJob
.
endTime
=
Date
.
parse
(
<
string
>
latestCondition
.
lastUpdateTime
);
kubernetesTrialJob
.
endTime
=
Date
.
parse
(
<
string
>
latestCondition
.
lastUpdateTime
);
break
;
break
;
default
:
default
:
break
;
break
;
...
...
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowJobRestServer.ts
View file @
ae7a72bc
...
@@ -25,7 +25,7 @@ import { KubernetesJobRestServer } from '../kubernetesJobRestServer'
...
@@ -25,7 +25,7 @@ import { KubernetesJobRestServer } from '../kubernetesJobRestServer'
/**
/**
* Kubeflow Training service Rest server, provides rest API to support kubeflow job metrics update
* Kubeflow Training service Rest server, provides rest API to support kubeflow job metrics update
*
*
*/
*/
@
component
.
Singleton
@
component
.
Singleton
export
class
KubeflowJobRestServer
extends
KubernetesJobRestServer
{
export
class
KubeflowJobRestServer
extends
KubernetesJobRestServer
{
...
@@ -34,5 +34,5 @@ export class KubeflowJobRestServer extends KubernetesJobRestServer{
...
@@ -34,5 +34,5 @@ export class KubeflowJobRestServer extends KubernetesJobRestServer{
*/
*/
constructor
()
{
constructor
()
{
super
(
component
.
get
(
KubeflowTrainingService
));
super
(
component
.
get
(
KubeflowTrainingService
));
}
}
}
}
\ No newline at end of file
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts
View file @
ae7a72bc
...
@@ -57,9 +57,9 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -57,9 +57,9 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
private
kubeflowJobInfoCollector
:
KubeflowJobInfoCollector
;
private
kubeflowJobInfoCollector
:
KubeflowJobInfoCollector
;
constructor
()
{
constructor
()
{
super
();
super
();
this
.
kubeflowJobInfoCollector
=
new
KubeflowJobInfoCollector
(
this
.
trialJobsMap
);
this
.
kubeflowJobInfoCollector
=
new
KubeflowJobInfoCollector
(
this
.
trialJobsMap
);
this
.
experimentId
=
getExperimentId
();
this
.
experimentId
=
getExperimentId
();
this
.
nextTrialSequenceId
=
-
1
;
this
.
nextTrialSequenceId
=
-
1
;
this
.
log
.
info
(
'
Construct Kubeflow training service.
'
);
this
.
log
.
info
(
'
Construct Kubeflow training service.
'
);
}
}
...
@@ -74,7 +74,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -74,7 +74,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
this
.
kubernetesJobRestServer
.
setEnableVersionCheck
=
this
.
versionCheck
;
this
.
kubernetesJobRestServer
.
setEnableVersionCheck
=
this
.
versionCheck
;
this
.
log
.
info
(
`Kubeflow Training service rest server listening on:
${
this
.
kubernetesJobRestServer
.
endPoint
}
`
);
this
.
log
.
info
(
`Kubeflow Training service rest server listening on:
${
this
.
kubernetesJobRestServer
.
endPoint
}
`
);
while
(
!
this
.
stopping
)
{
while
(
!
this
.
stopping
)
{
// collect metrics for Kubeflow jobs by interacting with Kubernetes API server
// collect metrics for Kubeflow jobs by interacting with Kubernetes API server
await
delay
(
3000
);
await
delay
(
3000
);
await
this
.
kubeflowJobInfoCollector
.
retrieveTrialStatus
(
this
.
kubernetesCRDClient
);
await
this
.
kubeflowJobInfoCollector
.
retrieveTrialStatus
(
this
.
kubernetesCRDClient
);
if
(
this
.
kubernetesJobRestServer
.
getErrorMessage
)
{
if
(
this
.
kubernetesJobRestServer
.
getErrorMessage
)
{
...
@@ -113,22 +113,22 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -113,22 +113,22 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
curTrialSequenceId
,
curTrialSequenceId
,
trialJobOutputUrl
trialJobOutputUrl
);
);
// Generate kubeflow job resource config object
// Generate kubeflow job resource config object
const
kubeflowJobConfig
:
any
=
await
this
.
prepareKubeflowConfig
(
trialJobId
,
trialWorkingFolder
,
kubeflowJobName
);
const
kubeflowJobConfig
:
any
=
await
this
.
prepareKubeflowConfig
(
trialJobId
,
trialWorkingFolder
,
kubeflowJobName
);
// Create kubeflow job based on generated kubeflow job resource config
// Create kubeflow job based on generated kubeflow job resource config
await
this
.
kubernetesCRDClient
.
createKubernetesJob
(
kubeflowJobConfig
);
await
this
.
kubernetesCRDClient
.
createKubernetesJob
(
kubeflowJobConfig
);
// Set trial job detail until create Kubeflow job successfully
// Set trial job detail until create Kubeflow job successfully
this
.
trialJobsMap
.
set
(
trialJobId
,
trialJobDetail
);
this
.
trialJobsMap
.
set
(
trialJobId
,
trialJobDetail
);
return
Promise
.
resolve
(
trialJobDetail
);
return
Promise
.
resolve
(
trialJobDetail
);
}
}
/**
/**
* upload code files to nfs or azureStroage
* upload code files to nfs or azureStroage
* @param trialJobId
* @param trialJobId
* @param trialLocalTempFolder
* @param trialLocalTempFolder
* return: trialJobOutputUrl
* return: trialJobOutputUrl
*/
*/
private
async
uploadCodeFiles
(
trialJobId
:
string
,
trialLocalTempFolder
:
string
):
Promise
<
string
>
{
private
async
uploadCodeFiles
(
trialJobId
:
string
,
trialLocalTempFolder
:
string
):
Promise
<
string
>
{
...
@@ -138,14 +138,14 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -138,14 +138,14 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
let
trialJobOutputUrl
:
string
=
''
;
let
trialJobOutputUrl
:
string
=
''
;
assert
(
!
this
.
kubeflowClusterConfig
.
storage
assert
(
!
this
.
kubeflowClusterConfig
.
storage
||
this
.
kubeflowClusterConfig
.
storage
===
'
azureStorage
'
||
this
.
kubeflowClusterConfig
.
storage
===
'
azureStorage
'
||
this
.
kubeflowClusterConfig
.
storage
===
'
nfs
'
);
||
this
.
kubeflowClusterConfig
.
storage
===
'
nfs
'
);
if
(
this
.
kubeflowClusterConfig
.
storage
===
'
azureStorage
'
)
{
if
(
this
.
kubeflowClusterConfig
.
storage
===
'
azureStorage
'
)
{
try
{
try
{
//upload local files to azure storage
//upload local files to azure storage
await
AzureStorageClientUtility
.
uploadDirectory
(
this
.
azureStorageClient
,
await
AzureStorageClientUtility
.
uploadDirectory
(
this
.
azureStorageClient
,
`nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
,
this
.
azureStorageShare
,
`
${
trialLocalTempFolder
}
`
);
`nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
,
this
.
azureStorageShare
,
`
${
trialLocalTempFolder
}
`
);
trialJobOutputUrl
=
`https://
${
this
.
azureStorageAccountName
}
.file.core.windows.net/
${
this
.
azureStorageShare
}
/
${
path
.
join
(
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
trialJobOutputUrl
=
`https://
${
this
.
azureStorageAccountName
}
.file.core.windows.net/
${
this
.
azureStorageShare
}
/
${
path
.
join
(
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
...
@@ -155,18 +155,18 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -155,18 +155,18 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
}
}
}
else
if
(
this
.
kubeflowClusterConfig
.
storage
===
'
nfs
'
||
this
.
kubeflowClusterConfig
.
storage
===
undefined
)
{
}
else
if
(
this
.
kubeflowClusterConfig
.
storage
===
'
nfs
'
||
this
.
kubeflowClusterConfig
.
storage
===
undefined
)
{
let
nfsKubeflowClusterConfig
:
KubeflowClusterConfigNFS
=
<
KubeflowClusterConfigNFS
>
this
.
kubeflowClusterConfig
;
let
nfsKubeflowClusterConfig
:
KubeflowClusterConfigNFS
=
<
KubeflowClusterConfigNFS
>
this
.
kubeflowClusterConfig
;
// Creat work dir for current trial in NFS directory
// Creat work dir for current trial in NFS directory
await
cpp
.
exec
(
`mkdir -p
${
this
.
trialLocalNFSTempFolder
}
/nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
);
await
cpp
.
exec
(
`mkdir -p
${
this
.
trialLocalNFSTempFolder
}
/nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
);
// Copy code files from local dir to NFS mounted dir
// Copy code files from local dir to NFS mounted dir
await
cpp
.
exec
(
`cp -r
${
trialLocalTempFolder
}
/*
${
this
.
trialLocalNFSTempFolder
}
/nni/
${
getExperimentId
()}
/
${
trialJobId
}
/.`
);
await
cpp
.
exec
(
`cp -r
${
trialLocalTempFolder
}
/*
${
this
.
trialLocalNFSTempFolder
}
/nni/
${
getExperimentId
()}
/
${
trialJobId
}
/.`
);
const
nfsConfig
:
NFSConfig
=
nfsKubeflowClusterConfig
.
nfs
;
const
nfsConfig
:
NFSConfig
=
nfsKubeflowClusterConfig
.
nfs
;
trialJobOutputUrl
=
`nfs://
${
nfsConfig
.
server
}
:
${
path
.
join
(
nfsConfig
.
path
,
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
trialJobOutputUrl
=
`nfs://
${
nfsConfig
.
server
}
:
${
path
.
join
(
nfsConfig
.
path
,
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
}
}
return
Promise
.
resolve
(
trialJobOutputUrl
);
return
Promise
.
resolve
(
trialJobOutputUrl
);
}
}
private
async
prepareRunScript
(
trialLocalTempFolder
:
string
,
trialJobId
:
string
,
trialWorkingFolder
:
string
,
curTrialSequenceId
:
number
,
form
:
JobApplicationForm
):
Promise
<
void
>
{
private
async
prepareRunScript
(
trialLocalTempFolder
:
string
,
trialJobId
:
string
,
trialWorkingFolder
:
string
,
curTrialSequenceId
:
number
,
form
:
JobApplicationForm
):
Promise
<
void
>
{
if
(
!
this
.
kubeflowClusterConfig
)
{
if
(
!
this
.
kubeflowClusterConfig
)
{
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
...
@@ -181,7 +181,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -181,7 +181,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
}
else
{
}
else
{
throw
Error
(
`operator
${
this
.
kubeflowClusterConfig
.
operator
}
is invalid`
)
throw
Error
(
`operator
${
this
.
kubeflowClusterConfig
.
operator
}
is invalid`
)
}
}
//create tmp trial working folder locally.
//create tmp trial working folder locally.
await
cpp
.
exec
(
`mkdir -p
${
path
.
dirname
(
trialLocalTempFolder
)}
`
);
await
cpp
.
exec
(
`mkdir -p
${
path
.
dirname
(
trialLocalTempFolder
)}
`
);
await
cpp
.
exec
(
`cp -r
${
kubeflowTrialConfig
.
codeDir
}
${
trialLocalTempFolder
}
`
);
await
cpp
.
exec
(
`cp -r
${
kubeflowTrialConfig
.
codeDir
}
${
trialLocalTempFolder
}
`
);
...
@@ -193,7 +193,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -193,7 +193,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
// Write worker file content run_worker.sh to local tmp folders
// Write worker file content run_worker.sh to local tmp folders
if
(
kubeflowTrialConfig
.
worker
)
{
if
(
kubeflowTrialConfig
.
worker
)
{
const
workerRunScriptContent
:
string
=
await
this
.
generateRunScript
(
'
kubeflow
'
,
trialJobId
,
trialWorkingFolder
,
const
workerRunScriptContent
:
string
=
await
this
.
generateRunScript
(
'
kubeflow
'
,
trialJobId
,
trialWorkingFolder
,
kubeflowTrialConfig
.
worker
.
command
,
curTrialSequenceId
.
toString
(),
'
worker
'
,
kubeflowTrialConfig
.
worker
.
gpuNum
);
kubeflowTrialConfig
.
worker
.
command
,
curTrialSequenceId
.
toString
(),
'
worker
'
,
kubeflowTrialConfig
.
worker
.
gpuNum
);
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
run_worker.sh
'
),
workerRunScriptContent
,
{
encoding
:
'
utf8
'
});
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
run_worker.sh
'
),
workerRunScriptContent
,
{
encoding
:
'
utf8
'
});
...
@@ -202,7 +202,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -202,7 +202,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
let
tensorflowTrialConfig
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
let
tensorflowTrialConfig
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
if
(
tensorflowTrialConfig
.
ps
){
if
(
tensorflowTrialConfig
.
ps
){
const
psRunScriptContent
:
string
=
await
this
.
generateRunScript
(
'
kubeflow
'
,
trialJobId
,
trialWorkingFolder
,
const
psRunScriptContent
:
string
=
await
this
.
generateRunScript
(
'
kubeflow
'
,
trialJobId
,
trialWorkingFolder
,
tensorflowTrialConfig
.
ps
.
command
,
curTrialSequenceId
.
toString
(),
'
ps
'
,
tensorflowTrialConfig
.
ps
.
gpuNum
);
tensorflowTrialConfig
.
ps
.
command
,
curTrialSequenceId
.
toString
(),
'
ps
'
,
tensorflowTrialConfig
.
ps
.
gpuNum
);
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
run_ps.sh
'
),
psRunScriptContent
,
{
encoding
:
'
utf8
'
});
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
run_ps.sh
'
),
psRunScriptContent
,
{
encoding
:
'
utf8
'
});
}
}
...
@@ -210,7 +210,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -210,7 +210,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
else
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
pytorch-operator
'
)
{
else
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
pytorch-operator
'
)
{
let
pytorchTrialConfig
:
KubeflowTrialConfigPytorch
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
let
pytorchTrialConfig
:
KubeflowTrialConfigPytorch
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
if
(
pytorchTrialConfig
.
master
){
if
(
pytorchTrialConfig
.
master
){
const
masterRunScriptContent
:
string
=
await
this
.
generateRunScript
(
'
kubeflow
'
,
trialJobId
,
trialWorkingFolder
,
const
masterRunScriptContent
:
string
=
await
this
.
generateRunScript
(
'
kubeflow
'
,
trialJobId
,
trialWorkingFolder
,
pytorchTrialConfig
.
master
.
command
,
curTrialSequenceId
.
toString
(),
'
master
'
,
pytorchTrialConfig
.
master
.
gpuNum
);
pytorchTrialConfig
.
master
.
command
,
curTrialSequenceId
.
toString
(),
'
master
'
,
pytorchTrialConfig
.
master
.
gpuNum
);
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
run_master.sh
'
),
masterRunScriptContent
,
{
encoding
:
'
utf8
'
});
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
'
run_master.sh
'
),
masterRunScriptContent
,
{
encoding
:
'
utf8
'
});
}
}
...
@@ -218,11 +218,11 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -218,11 +218,11 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
// Write file content ( parameter.cfg ) to local tmp folders
// Write file content ( parameter.cfg ) to local tmp folders
const
trialForm
:
TrialJobApplicationForm
=
(
<
TrialJobApplicationForm
>
form
)
const
trialForm
:
TrialJobApplicationForm
=
(
<
TrialJobApplicationForm
>
form
)
if
(
trialForm
&&
trialForm
.
hyperParameters
)
{
if
(
trialForm
&&
trialForm
.
hyperParameters
)
{
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
generateParamFileName
(
trialForm
.
hyperParameters
)),
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
generateParamFileName
(
trialForm
.
hyperParameters
)),
trialForm
.
hyperParameters
.
value
,
{
encoding
:
'
utf8
'
});
trialForm
.
hyperParameters
.
value
,
{
encoding
:
'
utf8
'
});
}
}
}
}
private
async
prepareKubeflowConfig
(
trialJobId
:
string
,
trialWorkingFolder
:
string
,
kubeflowJobName
:
string
):
Promise
<
any
>
{
private
async
prepareKubeflowConfig
(
trialJobId
:
string
,
trialWorkingFolder
:
string
,
kubeflowJobName
:
string
):
Promise
<
any
>
{
if
(
!
this
.
kubeflowClusterConfig
)
{
if
(
!
this
.
kubeflowClusterConfig
)
{
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
...
@@ -241,10 +241,10 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -241,10 +241,10 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
}
else
{
}
else
{
throw
Error
(
`operator
${
this
.
kubeflowClusterConfig
.
operator
}
is invalid`
)
throw
Error
(
`operator
${
this
.
kubeflowClusterConfig
.
operator
}
is invalid`
)
}
}
const
workerPodResources
:
any
=
{};
const
workerPodResources
:
any
=
{};
if
(
kubeflowTrialConfig
.
worker
)
{
if
(
kubeflowTrialConfig
.
worker
)
{
workerPodResources
.
requests
=
this
.
generatePodResource
(
kubeflowTrialConfig
.
worker
.
memoryMB
,
kubeflowTrialConfig
.
worker
.
cpuNum
,
workerPodResources
.
requests
=
this
.
generatePodResource
(
kubeflowTrialConfig
.
worker
.
memoryMB
,
kubeflowTrialConfig
.
worker
.
cpuNum
,
kubeflowTrialConfig
.
worker
.
gpuNum
)
kubeflowTrialConfig
.
worker
.
gpuNum
)
}
}
workerPodResources
.
limits
=
Object
.
assign
({},
workerPodResources
.
requests
);
workerPodResources
.
limits
=
Object
.
assign
({},
workerPodResources
.
requests
);
...
@@ -253,30 +253,30 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -253,30 +253,30 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
tf-operator
'
)
{
let
tensorflowTrialConfig
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
let
tensorflowTrialConfig
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
if
(
tensorflowTrialConfig
.
ps
)
{
if
(
tensorflowTrialConfig
.
ps
)
{
nonWorkerResources
.
requests
=
this
.
generatePodResource
(
tensorflowTrialConfig
.
ps
.
memoryMB
,
tensorflowTrialConfig
.
ps
.
cpuNum
,
nonWorkerResources
.
requests
=
this
.
generatePodResource
(
tensorflowTrialConfig
.
ps
.
memoryMB
,
tensorflowTrialConfig
.
ps
.
cpuNum
,
tensorflowTrialConfig
.
ps
.
gpuNum
)
tensorflowTrialConfig
.
ps
.
gpuNum
)
nonWorkerResources
.
limits
=
Object
.
assign
({},
nonWorkerResources
.
requests
);
nonWorkerResources
.
limits
=
Object
.
assign
({},
nonWorkerResources
.
requests
);
}
}
}
else
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
pytorch-operator
'
){
}
else
if
(
this
.
kubeflowClusterConfig
.
operator
===
'
pytorch-operator
'
){
let
pyTorchTrialConfig
:
KubeflowTrialConfigPytorch
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
let
pyTorchTrialConfig
:
KubeflowTrialConfigPytorch
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
nonWorkerResources
.
requests
=
this
.
generatePodResource
(
pyTorchTrialConfig
.
master
.
memoryMB
,
pyTorchTrialConfig
.
master
.
cpuNum
,
nonWorkerResources
.
requests
=
this
.
generatePodResource
(
pyTorchTrialConfig
.
master
.
memoryMB
,
pyTorchTrialConfig
.
master
.
cpuNum
,
pyTorchTrialConfig
.
master
.
gpuNum
)
pyTorchTrialConfig
.
master
.
gpuNum
)
nonWorkerResources
.
limits
=
Object
.
assign
({},
nonWorkerResources
.
requests
);
nonWorkerResources
.
limits
=
Object
.
assign
({},
nonWorkerResources
.
requests
);
}
// Generate kubeflow job resource config object
}
// Generate kubeflow job resource config object
const
kubeflowJobConfig
:
any
=
this
.
generateKubeflowJobConfig
(
trialJobId
,
trialWorkingFolder
,
kubeflowJobName
,
workerPodResources
,
nonWorkerResources
);
const
kubeflowJobConfig
:
any
=
this
.
generateKubeflowJobConfig
(
trialJobId
,
trialWorkingFolder
,
kubeflowJobName
,
workerPodResources
,
nonWorkerResources
);
return
Promise
.
resolve
(
kubeflowJobConfig
);
return
Promise
.
resolve
(
kubeflowJobConfig
);
}
}
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
switch
(
key
)
{
switch
(
key
)
{
case
TrialConfigMetadataKey
.
NNI_MANAGER_IP
:
case
TrialConfigMetadataKey
.
NNI_MANAGER_IP
:
this
.
nniManagerIpConfig
=
<
NNIManagerIpConfig
>
JSON
.
parse
(
value
);
this
.
nniManagerIpConfig
=
<
NNIManagerIpConfig
>
JSON
.
parse
(
value
);
break
;
break
;
case
TrialConfigMetadataKey
.
KUBEFLOW_CLUSTER_CONFIG
:
case
TrialConfigMetadataKey
.
KUBEFLOW_CLUSTER_CONFIG
:
let
kubeflowClusterJsonObject
=
JSON
.
parse
(
value
);
let
kubeflowClusterJsonObject
=
JSON
.
parse
(
value
);
this
.
kubeflowClusterConfig
=
KubeflowClusterConfigFactory
.
generateKubeflowClusterConfig
(
kubeflowClusterJsonObject
);
this
.
kubeflowClusterConfig
=
KubeflowClusterConfigFactory
.
generateKubeflowClusterConfig
(
kubeflowClusterJsonObject
);
...
@@ -296,7 +296,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -296,7 +296,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
nfsKubeflowClusterConfig
.
nfs
.
server
,
nfsKubeflowClusterConfig
.
nfs
.
server
,
nfsKubeflowClusterConfig
.
nfs
.
path
nfsKubeflowClusterConfig
.
nfs
.
path
);
);
}
}
this
.
kubernetesCRDClient
=
KubeflowOperatorClient
.
generateOperatorClient
(
this
.
kubeflowClusterConfig
.
operator
,
this
.
kubernetesCRDClient
=
KubeflowOperatorClient
.
generateOperatorClient
(
this
.
kubeflowClusterConfig
.
operator
,
this
.
kubeflowClusterConfig
.
apiVersion
);
this
.
kubeflowClusterConfig
.
apiVersion
);
break
;
break
;
...
@@ -304,13 +304,13 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -304,13 +304,13 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
if
(
!
this
.
kubeflowClusterConfig
){
if
(
!
this
.
kubeflowClusterConfig
){
this
.
log
.
error
(
'
kubeflow cluster config is not initialized
'
);
this
.
log
.
error
(
'
kubeflow cluster config is not initialized
'
);
return
Promise
.
reject
(
new
Error
(
'
kubeflow cluster config is not initialized
'
));
return
Promise
.
reject
(
new
Error
(
'
kubeflow cluster config is not initialized
'
));
}
}
assert
(
this
.
kubeflowClusterConfig
!==
undefined
)
assert
(
this
.
kubeflowClusterConfig
!==
undefined
)
let
kubeflowTrialJsonObjsect
=
JSON
.
parse
(
value
);
let
kubeflowTrialJsonObjsect
=
JSON
.
parse
(
value
);
this
.
kubeflowTrialConfig
=
KubeflowTrialConfigFactory
.
generateKubeflowTrialConfig
(
this
.
kubeflowTrialConfig
=
KubeflowTrialConfigFactory
.
generateKubeflowTrialConfig
(
kubeflowTrialJsonObjsect
,
kubeflowTrialJsonObjsect
,
this
.
kubeflowClusterConfig
.
operator
this
.
kubeflowClusterConfig
.
operator
);
);
...
@@ -319,7 +319,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -319,7 +319,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
await
validateCodeDir
(
this
.
kubeflowTrialConfig
.
codeDir
);
await
validateCodeDir
(
this
.
kubeflowTrialConfig
.
codeDir
);
}
catch
(
error
)
{
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
this
.
log
.
error
(
error
);
return
Promise
.
reject
(
new
Error
(
error
));
return
Promise
.
reject
(
new
Error
(
error
));
}
}
break
;
break
;
case
TrialConfigMetadataKey
.
VERSION_CHECK
:
case
TrialConfigMetadataKey
.
VERSION_CHECK
:
...
@@ -361,11 +361,11 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -361,11 +361,11 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
if
(
this
.
kubeflowTrialConfig
.
operatorType
===
'
tf-operator
'
)
{
if
(
this
.
kubeflowTrialConfig
.
operatorType
===
'
tf-operator
'
)
{
let
tensorflowTrialConfig
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
let
tensorflowTrialConfig
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
replicaSpecsObj
.
Worker
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
tensorflowTrialConfig
.
worker
.
replicas
,
replicaSpecsObj
.
Worker
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
tensorflowTrialConfig
.
worker
.
replicas
,
tensorflowTrialConfig
.
worker
.
image
,
'
run_worker.sh
'
,
workerPodResources
);
tensorflowTrialConfig
.
worker
.
image
,
'
run_worker.sh
'
,
workerPodResources
);
if
(
tensorflowTrialConfig
.
ps
){
if
(
tensorflowTrialConfig
.
ps
){
replicaSpecsObj
.
Ps
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
tensorflowTrialConfig
.
ps
.
replicas
,
replicaSpecsObj
.
Ps
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
tensorflowTrialConfig
.
ps
.
replicas
,
tensorflowTrialConfig
.
ps
.
image
,
'
run_ps.sh
'
,
nonWorkerPodResources
);
tensorflowTrialConfig
.
ps
.
image
,
'
run_ps.sh
'
,
nonWorkerPodResources
);
}
}
replicaSpecsObjMap
.
set
(
this
.
kubernetesCRDClient
.
jobKind
,
{
'
tfReplicaSpecs
'
:
replicaSpecsObj
})
replicaSpecsObjMap
.
set
(
this
.
kubernetesCRDClient
.
jobKind
,
{
'
tfReplicaSpecs
'
:
replicaSpecsObj
})
...
@@ -373,19 +373,19 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -373,19 +373,19 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
else
if
(
this
.
kubeflowTrialConfig
.
operatorType
===
'
pytorch-operator
'
)
{
else
if
(
this
.
kubeflowTrialConfig
.
operatorType
===
'
pytorch-operator
'
)
{
let
pytorchTrialConfig
:
KubeflowTrialConfigPytorch
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
let
pytorchTrialConfig
:
KubeflowTrialConfigPytorch
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
if
(
pytorchTrialConfig
.
worker
)
{
if
(
pytorchTrialConfig
.
worker
)
{
replicaSpecsObj
.
Worker
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
pytorchTrialConfig
.
worker
.
replicas
,
replicaSpecsObj
.
Worker
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
pytorchTrialConfig
.
worker
.
replicas
,
pytorchTrialConfig
.
worker
.
image
,
'
run_worker.sh
'
,
workerPodResources
);
pytorchTrialConfig
.
worker
.
image
,
'
run_worker.sh
'
,
workerPodResources
);
}
}
replicaSpecsObj
.
Master
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
pytorchTrialConfig
.
master
.
replicas
,
replicaSpecsObj
.
Master
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
pytorchTrialConfig
.
master
.
replicas
,
pytorchTrialConfig
.
master
.
image
,
'
run_master.sh
'
,
nonWorkerPodResources
);
pytorchTrialConfig
.
master
.
image
,
'
run_master.sh
'
,
nonWorkerPodResources
);
replicaSpecsObjMap
.
set
(
this
.
kubernetesCRDClient
.
jobKind
,
{
'
pytorchReplicaSpecs
'
:
replicaSpecsObj
})
replicaSpecsObjMap
.
set
(
this
.
kubernetesCRDClient
.
jobKind
,
{
'
pytorchReplicaSpecs
'
:
replicaSpecsObj
})
}
}
return
{
return
{
apiVersion
:
`kubeflow.org/
${
this
.
kubernetesCRDClient
.
apiVersion
}
`
,
apiVersion
:
`kubeflow.org/
${
this
.
kubernetesCRDClient
.
apiVersion
}
`
,
kind
:
this
.
kubernetesCRDClient
.
jobKind
,
kind
:
this
.
kubernetesCRDClient
.
jobKind
,
metadata
:
{
metadata
:
{
name
:
kubeflowJobName
,
name
:
kubeflowJobName
,
namespace
:
'
default
'
,
namespace
:
'
default
'
,
labels
:
{
labels
:
{
...
@@ -395,7 +395,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -395,7 +395,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
}
}
},
},
spec
:
replicaSpecsObjMap
.
get
(
this
.
kubernetesCRDClient
.
jobKind
)
spec
:
replicaSpecsObjMap
.
get
(
this
.
kubernetesCRDClient
.
jobKind
)
};
};
}
}
/**
/**
...
...
src/nni_manager/training_service/kubernetes/kubernetesApiClient.ts
View file @
ae7a72bc
...
@@ -39,7 +39,7 @@ class GeneralK8sClient {
...
@@ -39,7 +39,7 @@ class GeneralK8sClient {
}
}
public
async
createSecret
(
secretManifest
:
any
):
Promise
<
boolean
>
{
public
async
createSecret
(
secretManifest
:
any
):
Promise
<
boolean
>
{
let
result
:
Promise
<
boolean
>
;
let
result
:
Promise
<
boolean
>
;
const
response
:
any
=
await
this
.
client
.
api
.
v1
.
namespaces
(
'
default
'
).
secrets
.
post
({
body
:
secretManifest
});
const
response
:
any
=
await
this
.
client
.
api
.
v1
.
namespaces
(
'
default
'
).
secrets
.
post
({
body
:
secretManifest
});
if
(
response
.
statusCode
&&
(
response
.
statusCode
>=
200
&&
response
.
statusCode
<=
299
))
{
if
(
response
.
statusCode
&&
(
response
.
statusCode
>=
200
&&
response
.
statusCode
<=
299
))
{
result
=
Promise
.
resolve
(
true
);
result
=
Promise
.
resolve
(
true
);
...
@@ -65,8 +65,8 @@ abstract class KubernetesCRDClient {
...
@@ -65,8 +65,8 @@ abstract class KubernetesCRDClient {
public
abstract
get
containerName
():
string
;
public
abstract
get
containerName
():
string
;
public
get
jobKind
():
string
{
public
get
jobKind
():
string
{
if
(
this
.
crdSchema
if
(
this
.
crdSchema
&&
this
.
crdSchema
.
spec
&&
this
.
crdSchema
.
spec
&&
this
.
crdSchema
.
spec
.
names
&&
this
.
crdSchema
.
spec
.
names
&&
this
.
crdSchema
.
spec
.
names
.
kind
)
{
&&
this
.
crdSchema
.
spec
.
names
.
kind
)
{
return
this
.
crdSchema
.
spec
.
names
.
kind
;
return
this
.
crdSchema
.
spec
.
names
.
kind
;
...
@@ -76,15 +76,15 @@ abstract class KubernetesCRDClient {
...
@@ -76,15 +76,15 @@ abstract class KubernetesCRDClient {
}
}
public
get
apiVersion
():
string
{
public
get
apiVersion
():
string
{
if
(
this
.
crdSchema
if
(
this
.
crdSchema
&&
this
.
crdSchema
.
spec
&&
this
.
crdSchema
.
spec
&&
this
.
crdSchema
.
spec
.
version
)
{
&&
this
.
crdSchema
.
spec
.
version
)
{
return
this
.
crdSchema
.
spec
.
version
;
return
this
.
crdSchema
.
spec
.
version
;
}
else
{
}
else
{
throw
new
Error
(
'
KubeflowOperatorClient: get apiVersion failed, version is undefined in crd schema!
'
);
throw
new
Error
(
'
KubeflowOperatorClient: get apiVersion failed, version is undefined in crd schema!
'
);
}
}
}
}
public
async
createKubernetesJob
(
jobManifest
:
any
):
Promise
<
boolean
>
{
public
async
createKubernetesJob
(
jobManifest
:
any
):
Promise
<
boolean
>
{
let
result
:
Promise
<
boolean
>
;
let
result
:
Promise
<
boolean
>
;
const
response
:
any
=
await
this
.
operator
.
post
({
body
:
jobManifest
});
const
response
:
any
=
await
this
.
operator
.
post
({
body
:
jobManifest
});
...
@@ -117,7 +117,7 @@ abstract class KubernetesCRDClient {
...
@@ -117,7 +117,7 @@ abstract class KubernetesCRDClient {
qs
:
{
qs
:
{
labelSelector
:
matchQuery
,
labelSelector
:
matchQuery
,
propagationPolicy
:
"
Background
"
propagationPolicy
:
"
Background
"
}
}
});
});
if
(
deleteResult
.
statusCode
&&
deleteResult
.
statusCode
>=
200
&&
deleteResult
.
statusCode
<=
299
)
{
if
(
deleteResult
.
statusCode
&&
deleteResult
.
statusCode
>=
200
&&
deleteResult
.
statusCode
<=
299
)
{
result
=
Promise
.
resolve
(
true
);
result
=
Promise
.
resolve
(
true
);
...
...
src/nni_manager/training_service/kubernetes/kubernetesConfig.ts
View file @
ae7a72bc
...
@@ -25,7 +25,7 @@ import { MethodNotImplementedError } from '../../common/errors';
...
@@ -25,7 +25,7 @@ import { MethodNotImplementedError } from '../../common/errors';
export
abstract
class
KubernetesClusterConfig
{
export
abstract
class
KubernetesClusterConfig
{
public
readonly
storage
?:
KubernetesStorageKind
;
public
readonly
storage
?:
KubernetesStorageKind
;
public
readonly
apiVersion
:
string
;
public
readonly
apiVersion
:
string
;
constructor
(
apiVersion
:
string
,
storage
?:
KubernetesStorageKind
)
{
constructor
(
apiVersion
:
string
,
storage
?:
KubernetesStorageKind
)
{
this
.
storage
=
storage
;
this
.
storage
=
storage
;
this
.
apiVersion
=
apiVersion
;
this
.
apiVersion
=
apiVersion
;
...
@@ -48,7 +48,7 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig {
...
@@ -48,7 +48,7 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig {
public
readonly
nfs
:
NFSConfig
;
public
readonly
nfs
:
NFSConfig
;
constructor
(
constructor
(
apiVersion
:
string
,
apiVersion
:
string
,
nfs
:
NFSConfig
,
nfs
:
NFSConfig
,
storage
?:
KubernetesStorageKind
storage
?:
KubernetesStorageKind
)
{
)
{
...
@@ -73,11 +73,11 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig {
...
@@ -73,11 +73,11 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig {
export
class
KubernetesClusterConfigAzure
extends
KubernetesClusterConfig
{
export
class
KubernetesClusterConfigAzure
extends
KubernetesClusterConfig
{
public
readonly
keyVault
:
keyVaultConfig
;
public
readonly
keyVault
:
keyVaultConfig
;
public
readonly
azureStorage
:
AzureStorage
;
public
readonly
azureStorage
:
AzureStorage
;
constructor
(
constructor
(
apiVersion
:
string
,
apiVersion
:
string
,
keyVault
:
keyVaultConfig
,
keyVault
:
keyVaultConfig
,
azureStorage
:
AzureStorage
,
azureStorage
:
AzureStorage
,
storage
?:
KubernetesStorageKind
storage
?:
KubernetesStorageKind
)
{
)
{
super
(
apiVersion
,
storage
);
super
(
apiVersion
,
storage
);
...
@@ -151,7 +151,7 @@ export class keyVaultConfig {
...
@@ -151,7 +151,7 @@ export class keyVaultConfig {
export
class
AzureStorage
{
export
class
AzureStorage
{
/**The azure share to storage files */
/**The azure share to storage files */
public
readonly
azureShare
:
string
;
public
readonly
azureShare
:
string
;
/**The account name of sotrage service */
/**The account name of sotrage service */
public
readonly
accountName
:
string
;
public
readonly
accountName
:
string
;
constructor
(
azureShare
:
string
,
accountName
:
string
){
constructor
(
azureShare
:
string
,
accountName
:
string
){
...
@@ -178,8 +178,8 @@ export class KubernetesTrialConfigTemplate {
...
@@ -178,8 +178,8 @@ export class KubernetesTrialConfigTemplate {
/** Required GPU number for trial job. The number should be in [0,100] */
/** Required GPU number for trial job. The number should be in [0,100] */
public
readonly
gpuNum
:
number
;
public
readonly
gpuNum
:
number
;
constructor
(
command
:
string
,
gpuNum
:
number
,
constructor
(
command
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
)
{
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
)
{
this
.
command
=
command
;
this
.
command
=
command
;
this
.
gpuNum
=
gpuNum
;
this
.
gpuNum
=
gpuNum
;
...
...
src/nni_manager/training_service/kubernetes/kubernetesData.ts
View file @
ae7a72bc
...
@@ -40,7 +40,7 @@ export class KubernetesTrialJobDetail implements TrialJobDetail {
...
@@ -40,7 +40,7 @@ export class KubernetesTrialJobDetail implements TrialJobDetail {
public
queryJobFailedCount
:
number
;
public
queryJobFailedCount
:
number
;
constructor
(
id
:
string
,
status
:
TrialJobStatus
,
submitTime
:
number
,
constructor
(
id
:
string
,
status
:
TrialJobStatus
,
submitTime
:
number
,
workingDirectory
:
string
,
form
:
JobApplicationForm
,
workingDirectory
:
string
,
form
:
JobApplicationForm
,
kubernetesJobName
:
string
,
sequenceId
:
number
,
url
:
string
)
{
kubernetesJobName
:
string
,
sequenceId
:
number
,
url
:
string
)
{
this
.
id
=
id
;
this
.
id
=
id
;
this
.
status
=
status
;
this
.
status
=
status
;
...
...
src/nni_manager/training_service/kubernetes/kubernetesJobInfoCollector.ts
View file @
ae7a72bc
...
@@ -57,7 +57,7 @@ export class KubernetesJobInfoCollector {
...
@@ -57,7 +57,7 @@ export class KubernetesJobInfoCollector {
await
Promise
.
all
(
updateKubernetesTrialJobs
);
await
Promise
.
all
(
updateKubernetesTrialJobs
);
}
}
protected
async
retrieveSingleTrialJobInfo
(
kubernetesCRDClient
:
KubernetesCRDClient
|
undefined
,
protected
async
retrieveSingleTrialJobInfo
(
kubernetesCRDClient
:
KubernetesCRDClient
|
undefined
,
kubernetesTrialJob
:
KubernetesTrialJobDetail
)
:
Promise
<
void
>
{
kubernetesTrialJob
:
KubernetesTrialJobDetail
)
:
Promise
<
void
>
{
throw
new
MethodNotImplementedError
();
throw
new
MethodNotImplementedError
();
}
}
...
...
src/nni_manager/training_service/kubernetes/kubernetesJobRestServer.ts
View file @
ae7a72bc
...
@@ -26,7 +26,7 @@ import { ClusterJobRestServer } from '../common/clusterJobRestServer'
...
@@ -26,7 +26,7 @@ import { ClusterJobRestServer } from '../common/clusterJobRestServer'
/**
/**
* Kubeflow Training service Rest server, provides rest API to support kubeflow job metrics update
* Kubeflow Training service Rest server, provides rest API to support kubeflow job metrics update
*
*
*/
*/
@
component
.
Singleton
@
component
.
Singleton
export
class
KubernetesJobRestServer
extends
ClusterJobRestServer
{
export
class
KubernetesJobRestServer
extends
ClusterJobRestServer
{
...
@@ -53,5 +53,5 @@ export class KubernetesJobRestServer extends ClusterJobRestServer{
...
@@ -53,5 +53,5 @@ export class KubernetesJobRestServer extends ClusterJobRestServer{
data
:
singleMetric
data
:
singleMetric
});
});
}
}
}
}
}
}
\ No newline at end of file
Prev
1
2
3
4
5
6
7
8
9
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment