Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
2d252c9e
Unverified
Commit
2d252c9e
authored
Aug 29, 2019
by
SparkSnail
Committed by
GitHub
Aug 29, 2019
Browse files
Add retry policy for azureStorage (#1480)
parent
a224f4f2
Changes
8
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
125 additions
and
75 deletions
+125
-75
docs/en_US/Tutorial/ExperimentConfig.md
docs/en_US/Tutorial/ExperimentConfig.md
+4
-0
src/nni_manager/rest_server/restValidationSchemas.ts
src/nni_manager/rest_server/restValidationSchemas.ts
+4
-2
src/nni_manager/training_service/kubernetes/azureStorageClientUtils.ts
...er/training_service/kubernetes/azureStorageClientUtils.ts
+41
-28
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts
...frameworkcontroller/frameworkcontrollerTrainingService.ts
+10
-21
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts
...ng_service/kubernetes/kubeflow/kubeflowTrainingService.ts
+8
-19
src/nni_manager/training_service/kubernetes/kubernetesConfig.ts
...i_manager/training_service/kubernetes/kubernetesConfig.ts
+6
-2
src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts
.../training_service/kubernetes/kubernetesTrainingService.ts
+48
-1
tools/nni_cmd/config_schema.py
tools/nni_cmd/config_schema.py
+4
-2
No files found.
docs/en_US/Tutorial/ExperimentConfig.md
View file @
2d252c9e
...
@@ -519,6 +519,10 @@ machineList:
...
@@ -519,6 +519,10 @@ machineList:
__azureShare__ is the share of the azure file storage.
__azureShare__ is the share of the azure file storage.
*
__uploadRetryCount__
If uploading files to Azure storage fails, NNI will retry the upload; this field specifies the number of attempts to re-upload the files.
*
__paiConfig__
*
__paiConfig__
*
__userName__
*
__userName__
...
...
src/nni_manager/rest_server/restValidationSchemas.ts
View file @
2d252c9e
...
@@ -125,7 +125,8 @@ export namespace ValidationSchemas {
...
@@ -125,7 +125,8 @@ export namespace ValidationSchemas {
azureStorage
:
joi
.
object
({
azureStorage
:
joi
.
object
({
accountName
:
joi
.
string
().
regex
(
/^
([
0-9
]
|
[
a-z
]
|
[
A-Z
]
|-
){3,31}
$/
),
accountName
:
joi
.
string
().
regex
(
/^
([
0-9
]
|
[
a-z
]
|
[
A-Z
]
|-
){3,31}
$/
),
azureShare
:
joi
.
string
().
regex
(
/^
([
0-9
]
|
[
a-z
]
|
[
A-Z
]
|-
){3,63}
$/
)
azureShare
:
joi
.
string
().
regex
(
/^
([
0-9
]
|
[
a-z
]
|
[
A-Z
]
|-
){3,63}
$/
)
})
}),
uploadRetryCount
:
joi
.
number
().
min
(
1
)
}),
}),
frameworkcontroller_config
:
joi
.
object
({
frameworkcontroller_config
:
joi
.
object
({
storage
:
joi
.
string
().
min
(
1
),
storage
:
joi
.
string
().
min
(
1
),
...
@@ -141,7 +142,8 @@ export namespace ValidationSchemas {
...
@@ -141,7 +142,8 @@ export namespace ValidationSchemas {
azureStorage
:
joi
.
object
({
azureStorage
:
joi
.
object
({
accountName
:
joi
.
string
().
regex
(
/^
([
0-9
]
|
[
a-z
]
|
[
A-Z
]
|-
){3,31}
$/
),
accountName
:
joi
.
string
().
regex
(
/^
([
0-9
]
|
[
a-z
]
|
[
A-Z
]
|-
){3,31}
$/
),
azureShare
:
joi
.
string
().
regex
(
/^
([
0-9
]
|
[
a-z
]
|
[
A-Z
]
|-
){3,63}
$/
)
azureShare
:
joi
.
string
().
regex
(
/^
([
0-9
]
|
[
a-z
]
|
[
A-Z
]
|-
){3,63}
$/
)
})
}),
uploadRetryCount
:
joi
.
number
().
min
(
1
)
}),
}),
nni_manager_ip
:
joi
.
object
({
nni_manager_ip
:
joi
.
object
({
nniManagerIp
:
joi
.
string
().
min
(
1
)
nniManagerIp
:
joi
.
string
().
min
(
1
)
...
...
src/nni_manager/training_service/kubernetes/azureStorageClientUtils.ts
View file @
2d252c9e
...
@@ -35,15 +35,15 @@ export namespace AzureStorageClientUtility {
...
@@ -35,15 +35,15 @@ export namespace AzureStorageClientUtility {
* @param fileServerClient
* @param fileServerClient
* @param azureShare
* @param azureShare
*/
*/
export
async
function
createShare
(
fileServerClient
:
any
,
azureShare
:
any
):
Promise
<
void
>
{
export
async
function
createShare
(
fileServerClient
:
any
,
azureShare
:
any
):
Promise
<
boolean
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
fileServerClient
.
createShareIfNotExists
(
azureShare
,
(
error
:
any
,
result
:
any
,
response
:
any
)
=>
{
fileServerClient
.
createShareIfNotExists
(
azureShare
,
(
error
:
any
,
result
:
any
,
response
:
any
)
=>
{
if
(
error
)
{
if
(
error
)
{
getLogger
()
getLogger
()
.
error
(
`Create share failed:,
${
error
}
`
);
.
error
(
`Create share failed:,
${
error
}
`
);
deferred
.
re
ject
(
error
);
deferred
.
re
solve
(
false
);
}
else
{
}
else
{
deferred
.
resolve
();
deferred
.
resolve
(
true
);
}
}
});
});
...
@@ -56,18 +56,17 @@ export namespace AzureStorageClientUtility {
...
@@ -56,18 +56,17 @@ export namespace AzureStorageClientUtility {
* @param azureFoler
* @param azureFoler
* @param azureShare
* @param azureShare
*/
*/
export
async
function
createDirectory
(
fileServerClient
:
azureStorage
.
FileService
,
azureFoler
:
any
,
azureShare
:
any
):
Promise
<
void
>
{
export
async
function
createDirectory
(
fileServerClient
:
azureStorage
.
FileService
,
azureFoler
:
any
,
azureShare
:
any
):
Promise
<
boolean
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
fileServerClient
.
createDirectoryIfNotExists
(
azureShare
,
azureFoler
,
(
error
:
any
,
result
:
any
,
response
:
any
)
=>
{
fileServerClient
.
createDirectoryIfNotExists
(
azureShare
,
azureFoler
,
(
error
:
any
,
result
:
any
,
response
:
any
)
=>
{
if
(
error
)
{
if
(
error
)
{
getLogger
()
getLogger
()
.
error
(
`Create directory failed:,
${
error
}
`
);
.
error
(
`Create directory failed:,
${
error
}
`
);
deferred
.
re
ject
(
error
);
deferred
.
re
solve
(
false
);
}
else
{
}
else
{
deferred
.
resolve
();
deferred
.
resolve
(
true
);
}
}
});
});
return
deferred
.
promise
;
return
deferred
.
promise
;
}
}
...
@@ -77,16 +76,20 @@ export namespace AzureStorageClientUtility {
...
@@ -77,16 +76,20 @@ export namespace AzureStorageClientUtility {
* @param azureDirectory
* @param azureDirectory
*/
*/
export
async
function
createDirectoryRecursive
(
fileServerClient
:
azureStorage
.
FileService
,
azureDirectory
:
string
,
export
async
function
createDirectoryRecursive
(
fileServerClient
:
azureStorage
.
FileService
,
azureDirectory
:
string
,
azureShare
:
any
):
Promise
<
void
>
{
azureShare
:
any
):
Promise
<
boolean
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
const
directories
:
string
[]
=
azureDirectory
.
split
(
'
/
'
);
const
directories
:
string
[]
=
azureDirectory
.
split
(
'
/
'
);
let
rootDirectory
:
string
=
''
;
let
rootDirectory
:
string
=
''
;
for
(
const
directory
of
directories
)
{
for
(
const
directory
of
directories
)
{
rootDirectory
+=
directory
;
rootDirectory
+=
directory
;
await
createDirectory
(
fileServerClient
,
rootDirectory
,
azureShare
);
let
result
:
boolean
=
await
createDirectory
(
fileServerClient
,
rootDirectory
,
azureShare
);
if
(
!
result
)
{
deferred
.
resolve
(
false
);
return
deferred
.
promise
;
}
rootDirectory
+=
'
/
'
;
rootDirectory
+=
'
/
'
;
}
}
deferred
.
resolve
();
deferred
.
resolve
(
true
);
return
deferred
.
promise
;
return
deferred
.
promise
;
}
}
...
@@ -100,16 +103,16 @@ export namespace AzureStorageClientUtility {
...
@@ -100,16 +103,16 @@ export namespace AzureStorageClientUtility {
* @param localFilePath
* @param localFilePath
*/
*/
async
function
uploadFileToAzure
(
fileServerClient
:
any
,
azureDirectory
:
string
,
azureFileName
:
any
,
azureShare
:
any
,
async
function
uploadFileToAzure
(
fileServerClient
:
any
,
azureDirectory
:
string
,
azureFileName
:
any
,
azureShare
:
any
,
localFilePath
:
string
):
Promise
<
void
>
{
localFilePath
:
string
):
Promise
<
boolean
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
await
fileServerClient
.
createFileFromLocalFile
(
azureShare
,
azureDirectory
,
azureFileName
,
localFilePath
,
await
fileServerClient
.
createFileFromLocalFile
(
azureShare
,
azureDirectory
,
azureFileName
,
localFilePath
,
(
error
:
any
,
result
:
any
,
response
:
any
)
=>
{
(
error
:
any
,
result
:
any
,
response
:
any
)
=>
{
if
(
error
)
{
if
(
error
)
{
getLogger
()
getLogger
()
.
error
(
`Upload file failed:,
${
error
}
`
);
.
error
(
`Upload file failed:,
${
error
}
`
);
deferred
.
re
ject
(
error
);
deferred
.
re
solve
(
false
);
}
else
{
}
else
{
deferred
.
resolve
();
deferred
.
resolve
(
true
);
}
}
});
});
...
@@ -125,17 +128,17 @@ export namespace AzureStorageClientUtility {
...
@@ -125,17 +128,17 @@ export namespace AzureStorageClientUtility {
* @param localFilePath
* @param localFilePath
*/
*/
async
function
downloadFile
(
fileServerClient
:
any
,
azureDirectory
:
string
,
azureFileName
:
any
,
azureShare
:
any
,
async
function
downloadFile
(
fileServerClient
:
any
,
azureDirectory
:
string
,
azureFileName
:
any
,
azureShare
:
any
,
localFilePath
:
string
):
Promise
<
void
>
{
localFilePath
:
string
):
Promise
<
boolean
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
// tslint:disable-next-line:non-literal-fs-path
// tslint:disable-next-line:non-literal-fs-path
await
fileServerClient
.
getFileToStream
(
azureShare
,
azureDirectory
,
azureFileName
,
fs
.
createWriteStream
(
localFilePath
),
await
fileServerClient
.
getFileToStream
(
azureShare
,
azureDirectory
,
azureFileName
,
fs
.
createWriteStream
(
localFilePath
),
(
error
:
any
,
result
:
any
,
response
:
any
)
=>
{
(
error
:
any
,
result
:
any
,
response
:
any
)
=>
{
if
(
error
)
{
if
(
error
)
{
getLogger
()
getLogger
()
.
error
(
`Download file failed:,
${
error
}
`
);
.
error
(
`Download file failed:,
${
error
}
`
);
deferred
.
re
ject
(
error
);
deferred
.
re
solve
(
false
);
}
else
{
}
else
{
deferred
.
resolve
();
deferred
.
resolve
(
true
);
}
}
});
});
...
@@ -151,28 +154,38 @@ export namespace AzureStorageClientUtility {
...
@@ -151,28 +154,38 @@ export namespace AzureStorageClientUtility {
*/
*/
// tslint:disable:non-literal-fs-path
// tslint:disable:non-literal-fs-path
export
async
function
uploadDirectory
(
fileServerClient
:
azureStorage
.
FileService
,
azureDirectory
:
string
,
azureShare
:
any
,
export
async
function
uploadDirectory
(
fileServerClient
:
azureStorage
.
FileService
,
azureDirectory
:
string
,
azureShare
:
any
,
localDirectory
:
string
):
Promise
<
void
>
{
localDirectory
:
string
):
Promise
<
boolean
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
const
fileNameArray
:
string
[]
=
fs
.
readdirSync
(
localDirectory
);
const
fileNameArray
:
string
[]
=
fs
.
readdirSync
(
localDirectory
);
await
createDirectoryRecursive
(
fileServerClient
,
azureDirectory
,
azureShare
);
let
result
:
boolean
=
await
createDirectoryRecursive
(
fileServerClient
,
azureDirectory
,
azureShare
);
if
(
!
result
)
{
deferred
.
resolve
(
false
);
return
deferred
.
promise
;
}
for
(
const
fileName
of
fileNameArray
)
{
for
(
const
fileName
of
fileNameArray
)
{
const
fullFilePath
:
string
=
path
.
join
(
localDirectory
,
fileName
);
const
fullFilePath
:
string
=
path
.
join
(
localDirectory
,
fileName
);
try
{
try
{
let
resultUploadFile
:
boolean
=
true
;
let
resultUploadDir
:
boolean
=
true
;
if
(
fs
.
lstatSync
(
fullFilePath
)
if
(
fs
.
lstatSync
(
fullFilePath
)
.
isFile
())
{
.
isFile
())
{
await
uploadFileToAzure
(
fileServerClient
,
azureDirectory
,
fileName
,
azureShare
,
fullFilePath
);
resultUploadFile
=
await
uploadFileToAzure
(
fileServerClient
,
azureDirectory
,
fileName
,
azureShare
,
fullFilePath
);
}
else
{
}
else
{
// If filePath is a directory, recuisively copy it to azure
// If filePath is a directory, recuisively copy it to azure
await
uploadDirectory
(
fileServerClient
,
String
.
Format
(
'
{0}/{1}
'
,
azureDirectory
,
fileName
),
azureShare
,
fullFilePath
);
resultUploadDir
=
await
uploadDirectory
(
fileServerClient
,
String
.
Format
(
'
{0}/{1}
'
,
azureDirectory
,
fileName
),
azureShare
,
fullFilePath
);
}
if
(
!
(
resultUploadFile
&&
resultUploadDir
))
{
deferred
.
resolve
(
false
);
return
deferred
.
promise
;
}
}
}
catch
(
error
)
{
}
catch
(
error
)
{
deferred
.
re
ject
(
error
);
deferred
.
re
solve
(
false
);
return
deferred
.
promise
;
return
deferred
.
promise
;
}
}
}
}
// All files/directories are copied successfully, resolve
// All files/directories are copied successfully, resolve
deferred
.
resolve
();
deferred
.
resolve
(
true
);
return
deferred
.
promise
;
return
deferred
.
promise
;
}
}
...
...
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts
View file @
2d252c9e
...
@@ -25,7 +25,7 @@ import * as path from 'path';
...
@@ -25,7 +25,7 @@ import * as path from 'path';
import
*
as
component
from
'
../../../common/component
'
;
import
*
as
component
from
'
../../../common/component
'
;
import
{
getExperimentId
}
from
'
../../../common/experimentStartupInfo
'
;
import
{
getExperimentId
}
from
'
../../../common/experimentStartupInfo
'
;
import
{
import
{
JobApplicationForm
,
NNIManagerIpConfig
,
TrialJobApplicationForm
,
TrialJobDetail
JobApplicationForm
,
NNIManagerIpConfig
,
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobStatus
}
from
'
../../../common/trainingService
'
;
}
from
'
../../../common/trainingService
'
;
import
{
delay
,
generateParamFileName
,
getExperimentRootDir
,
uniqueString
}
from
'
../../../common/utils
'
;
import
{
delay
,
generateParamFileName
,
getExperimentRootDir
,
uniqueString
}
from
'
../../../common/utils
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../../common/containerJobData
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../../common/containerJobData
'
;
...
@@ -102,10 +102,13 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
...
@@ -102,10 +102,13 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
//upload code files
//upload code files
const
trialJobOutputUrl
:
string
=
await
this
.
uploadCodeFiles
(
trialJobId
,
trialLocalTempFolder
);
const
trialJobOutputUrl
:
string
=
await
this
.
uploadCodeFiles
(
trialJobId
,
trialLocalTempFolder
);
let
initStatus
:
TrialJobStatus
=
'
WAITING
'
;
if
(
!
trialJobOutputUrl
)
{
initStatus
=
'
FAILED
'
;
}
const
trialJobDetail
:
KubernetesTrialJobDetail
=
new
KubernetesTrialJobDetail
(
const
trialJobDetail
:
KubernetesTrialJobDetail
=
new
KubernetesTrialJobDetail
(
trialJobId
,
trialJobId
,
'
WAITING
'
,
initStatus
,
Date
.
now
(),
Date
.
now
(),
trialWorkingFolder
,
trialWorkingFolder
,
form
,
form
,
...
@@ -208,24 +211,10 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
...
@@ -208,24 +211,10 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
let
trialJobOutputUrl
:
string
=
''
;
let
trialJobOutputUrl
:
string
=
''
;
if
(
this
.
fcClusterConfig
.
storageType
===
'
azureStorage
'
)
{
if
(
this
.
fcClusterConfig
.
storageType
===
'
azureStorage
'
)
{
if
(
this
.
azureStorageClient
===
undefined
)
{
const
azureFrameworkControllerClusterConfig
:
FrameworkControllerClusterConfigAzure
=
throw
new
Error
(
'
azureStorageClient is not initialized
'
);
<
FrameworkControllerClusterConfigAzure
>
this
.
fcClusterConfig
;
}
trialJobOutputUrl
=
await
this
.
uploadFilesToAzureStorage
(
trialJobId
,
trialLocalTempFolder
,
this
.
fcTrialConfig
.
codeDir
,
try
{
azureFrameworkControllerClusterConfig
.
uploadRetryCount
);
//upload local files, including scripts for running the trial and configuration (e.g., hyperparameters) for the trial, to azure storage
await
AzureStorageClientUtility
.
uploadDirectory
(
this
.
azureStorageClient
,
`nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
,
this
.
azureStorageShare
,
`
${
trialLocalTempFolder
}
`
);
//upload code files to azure storage
await
AzureStorageClientUtility
.
uploadDirectory
(
this
.
azureStorageClient
,
`nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
,
this
.
azureStorageShare
,
`
${
this
.
fcTrialConfig
.
codeDir
}
`
);
trialJobOutputUrl
=
`https://
${
this
.
azureStorageAccountName
}
.file.core.windows.net/`
+
`
${
this
.
azureStorageShare
}
/
${
path
.
join
(
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
;
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
return
Promise
.
reject
(
error
);
}
}
else
if
(
this
.
fcClusterConfig
.
storageType
===
'
nfs
'
)
{
}
else
if
(
this
.
fcClusterConfig
.
storageType
===
'
nfs
'
)
{
const
nfsFrameworkControllerClusterConfig
:
FrameworkControllerClusterConfigNFS
=
const
nfsFrameworkControllerClusterConfig
:
FrameworkControllerClusterConfigNFS
=
<
FrameworkControllerClusterConfigNFS
>
this
.
fcClusterConfig
;
<
FrameworkControllerClusterConfigNFS
>
this
.
fcClusterConfig
;
...
...
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts
View file @
2d252c9e
...
@@ -27,7 +27,7 @@ import * as component from '../../../common/component';
...
@@ -27,7 +27,7 @@ import * as component from '../../../common/component';
import
{
getExperimentId
}
from
'
../../../common/experimentStartupInfo
'
;
import
{
getExperimentId
}
from
'
../../../common/experimentStartupInfo
'
;
import
{
import
{
JobApplicationForm
,
NNIManagerIpConfig
,
TrialJobApplicationForm
,
TrialJobDetail
JobApplicationForm
,
NNIManagerIpConfig
,
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobStatus
}
from
'
../../../common/trainingService
'
;
}
from
'
../../../common/trainingService
'
;
import
{
delay
,
generateParamFileName
,
getExperimentRootDir
,
uniqueString
}
from
'
../../../common/utils
'
;
import
{
delay
,
generateParamFileName
,
getExperimentRootDir
,
uniqueString
}
from
'
../../../common/utils
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../../common/containerJobData
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../../common/containerJobData
'
;
...
@@ -102,9 +102,13 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -102,9 +102,13 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
await
this
.
prepareRunScript
(
trialLocalTempFolder
,
trialJobId
,
trialWorkingFolder
,
curTrialSequenceId
,
form
);
await
this
.
prepareRunScript
(
trialLocalTempFolder
,
trialJobId
,
trialWorkingFolder
,
curTrialSequenceId
,
form
);
//upload files to sotrage
//upload files to sotrage
const
trialJobOutputUrl
:
string
=
await
this
.
uploadCodeFiles
(
trialJobId
,
trialLocalTempFolder
);
const
trialJobOutputUrl
:
string
=
await
this
.
uploadCodeFiles
(
trialJobId
,
trialLocalTempFolder
);
let
initStatus
:
TrialJobStatus
=
'
WAITING
'
;
if
(
!
trialJobOutputUrl
)
{
initStatus
=
'
FAILED
'
;
}
const
trialJobDetail
:
KubernetesTrialJobDetail
=
new
KubernetesTrialJobDetail
(
const
trialJobDetail
:
KubernetesTrialJobDetail
=
new
KubernetesTrialJobDetail
(
trialJobId
,
trialJobId
,
'
WAITING
'
,
initStatus
,
Date
.
now
(),
Date
.
now
(),
trialWorkingFolder
,
trialWorkingFolder
,
form
,
form
,
...
@@ -215,23 +219,8 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -215,23 +219,8 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
if
(
this
.
azureStorageClient
===
undefined
)
{
if
(
this
.
azureStorageClient
===
undefined
)
{
throw
new
Error
(
'
azureStorageClient is not initialized
'
);
throw
new
Error
(
'
azureStorageClient is not initialized
'
);
}
}
try
{
const
azureKubeflowClusterConfig
:
KubeflowClusterConfigAzure
=
<
KubeflowClusterConfigAzure
>
this
.
kubeflowClusterConfig
;
//upload local files, including scripts for running the trial and configuration (e.g., hyperparameters) for the trial, to azure storage
trialJobOutputUrl
=
await
this
.
uploadFilesToAzureStorage
(
trialJobId
,
trialLocalTempFolder
,
this
.
kubeflowTrialConfig
.
codeDir
,
azureKubeflowClusterConfig
.
uploadRetryCount
);
await
AzureStorageClientUtility
.
uploadDirectory
(
this
.
azureStorageClient
,
`nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
,
this
.
azureStorageShare
,
`
${
trialLocalTempFolder
}
`
);
//upload code files to azure storage
await
AzureStorageClientUtility
.
uploadDirectory
(
this
.
azureStorageClient
,
`nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
,
this
.
azureStorageShare
,
`
${
this
.
kubeflowTrialConfig
.
codeDir
}
`
);
trialJobOutputUrl
=
`https://
${
this
.
azureStorageAccountName
}
.file.core.windows.net/
${
this
.
azureStorageShare
}
`
+
`/
${
path
.
join
(
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
;
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
return
Promise
.
reject
(
error
);
}
}
else
if
(
this
.
kubeflowClusterConfig
.
storage
===
'
nfs
'
||
this
.
kubeflowClusterConfig
.
storage
===
undefined
)
{
}
else
if
(
this
.
kubeflowClusterConfig
.
storage
===
'
nfs
'
||
this
.
kubeflowClusterConfig
.
storage
===
undefined
)
{
const
nfsKubeflowClusterConfig
:
KubeflowClusterConfigNFS
=
<
KubeflowClusterConfigNFS
>
this
.
kubeflowClusterConfig
;
const
nfsKubeflowClusterConfig
:
KubeflowClusterConfigNFS
=
<
KubeflowClusterConfigNFS
>
this
.
kubeflowClusterConfig
;
// Creat work dir for current trial in NFS directory
// Creat work dir for current trial in NFS directory
...
...
src/nni_manager/training_service/kubernetes/kubernetesConfig.ts
View file @
2d252c9e
...
@@ -75,16 +75,19 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig {
...
@@ -75,16 +75,19 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig {
export
class
KubernetesClusterConfigAzure
extends
KubernetesClusterConfig
{
export
class
KubernetesClusterConfigAzure
extends
KubernetesClusterConfig
{
public
readonly
keyVault
:
KeyVaultConfig
;
public
readonly
keyVault
:
KeyVaultConfig
;
public
readonly
azureStorage
:
AzureStorage
;
public
readonly
azureStorage
:
AzureStorage
;
public
readonly
uploadRetryCount
:
number
|
undefined
;
constructor
(
constructor
(
apiVersion
:
string
,
apiVersion
:
string
,
keyVault
:
KeyVaultConfig
,
keyVault
:
KeyVaultConfig
,
azureStorage
:
AzureStorage
,
azureStorage
:
AzureStorage
,
storage
?:
KubernetesStorageKind
storage
?:
KubernetesStorageKind
,
uploadRetryCount
?:
number
)
{
)
{
super
(
apiVersion
,
storage
);
super
(
apiVersion
,
storage
);
this
.
keyVault
=
keyVault
;
this
.
keyVault
=
keyVault
;
this
.
azureStorage
=
azureStorage
;
this
.
azureStorage
=
azureStorage
;
this
.
uploadRetryCount
=
uploadRetryCount
;
}
}
public
get
storageType
():
KubernetesStorageKind
{
public
get
storageType
():
KubernetesStorageKind
{
...
@@ -98,7 +101,8 @@ export class KubernetesClusterConfigAzure extends KubernetesClusterConfig {
...
@@ -98,7 +101,8 @@ export class KubernetesClusterConfigAzure extends KubernetesClusterConfig {
kubernetesClusterConfigObjectAzure
.
apiVersion
,
kubernetesClusterConfigObjectAzure
.
apiVersion
,
kubernetesClusterConfigObjectAzure
.
keyVault
,
kubernetesClusterConfigObjectAzure
.
keyVault
,
kubernetesClusterConfigObjectAzure
.
azureStorage
,
kubernetesClusterConfigObjectAzure
.
azureStorage
,
kubernetesClusterConfigObjectAzure
.
storage
kubernetesClusterConfigObjectAzure
.
storage
,
kubernetesClusterConfigObjectAzure
.
uploadRetryCount
);
);
}
}
}
}
...
...
src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts
View file @
2d252c9e
...
@@ -31,13 +31,14 @@ import { getLogger, Logger } from '../../common/log';
...
@@ -31,13 +31,14 @@ import { getLogger, Logger } from '../../common/log';
import
{
import
{
NNIManagerIpConfig
,
TrialJobDetail
,
TrialJobMetric
NNIManagerIpConfig
,
TrialJobDetail
,
TrialJobMetric
}
from
'
../../common/trainingService
'
;
}
from
'
../../common/trainingService
'
;
import
{
getExperimentRootDir
,
getIPV4Address
,
getJobCancelStatus
,
getVersion
,
uniqueString
}
from
'
../../common/utils
'
;
import
{
delay
,
getExperimentRootDir
,
getIPV4Address
,
getJobCancelStatus
,
getVersion
,
uniqueString
}
from
'
../../common/utils
'
;
import
{
AzureStorageClientUtility
}
from
'
./azureStorageClientUtils
'
;
import
{
AzureStorageClientUtility
}
from
'
./azureStorageClientUtils
'
;
import
{
GeneralK8sClient
,
KubernetesCRDClient
}
from
'
./kubernetesApiClient
'
;
import
{
GeneralK8sClient
,
KubernetesCRDClient
}
from
'
./kubernetesApiClient
'
;
import
{
KubernetesClusterConfig
}
from
'
./kubernetesConfig
'
;
import
{
KubernetesClusterConfig
}
from
'
./kubernetesConfig
'
;
import
{
kubernetesScriptFormat
,
KubernetesTrialJobDetail
}
from
'
./kubernetesData
'
;
import
{
kubernetesScriptFormat
,
KubernetesTrialJobDetail
}
from
'
./kubernetesData
'
;
import
{
KubernetesJobRestServer
}
from
'
./kubernetesJobRestServer
'
;
import
{
KubernetesJobRestServer
}
from
'
./kubernetesJobRestServer
'
;
var
yaml
=
require
(
'
js-yaml
'
);
var
fs
=
require
(
'
fs
'
);
var
fs
=
require
(
'
fs
'
);
/**
/**
...
@@ -358,5 +359,51 @@ abstract class KubernetesTrainingService {
...
@@ -358,5 +359,51 @@ abstract class KubernetesTrainingService {
return
registrySecretName
;
return
registrySecretName
;
}
}
protected
async
uploadFilesToAzureStorage
(
trialJobId
:
string
,
trialLocalTempFolder
:
String
,
codeDir
:
String
,
uploadRetryCount
:
number
|
undefined
):
Promise
<
string
>
{
if
(
this
.
azureStorageClient
===
undefined
)
{
throw
new
Error
(
'
azureStorageClient is not initialized
'
);
}
let
trialJobOutputUrl
:
string
=
''
;
let
retryCount
:
number
=
1
;
if
(
uploadRetryCount
)
{
retryCount
=
uploadRetryCount
;
}
let
resultUploadNNIScript
:
boolean
=
false
;
let
resultUploadCodeFile
:
boolean
=
false
;
try
{
do
{
//upload local files, including scripts for running the trial and configuration (e.g., hyperparameters) for the trial, to azure storage
if
(
!
resultUploadNNIScript
)
{
resultUploadNNIScript
=
await
AzureStorageClientUtility
.
uploadDirectory
(
this
.
azureStorageClient
,
`nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
,
this
.
azureStorageShare
,
`
${
trialLocalTempFolder
}
`
);
}
//upload code files to azure storage
if
(
!
resultUploadCodeFile
)
{
resultUploadCodeFile
=
await
AzureStorageClientUtility
.
uploadDirectory
(
this
.
azureStorageClient
,
`nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
,
this
.
azureStorageShare
,
`
${
codeDir
}
`
);
}
if
(
resultUploadNNIScript
&&
resultUploadCodeFile
)
{
trialJobOutputUrl
=
`https://
${
this
.
azureStorageAccountName
}
.file.core.windows.net/
${
this
.
azureStorageShare
}
`
+
`/
${
path
.
join
(
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
;
break
;
}
else
{
//wait for 5 seconds to re-upload files
await
delay
(
5000
);
this
.
log
.
info
(
'
Upload failed, Retry: upload files to azure-storage
'
);
}
}
while
(
retryCount
--
>=
0
)
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
//return a empty url when got error
return
Promise
.
resolve
(
""
);
}
if
(
!
trialJobOutputUrl
)
{
this
.
log
.
info
(
`Retry-count is used up, upload files to azureStorage for trial
${
trialJobId
}
failed!`
);
}
return
Promise
.
resolve
(
trialJobOutputUrl
);
}
}
}
export
{
KubernetesTrainingService
};
export
{
KubernetesTrainingService
};
tools/nni_cmd/config_schema.py
View file @
2d252c9e
...
@@ -315,7 +315,8 @@ kubeflow_config_schema = {
...
@@ -315,7 +315,8 @@ kubeflow_config_schema = {
error
=
'ERROR: accountName format error, accountName support using (0-9|a-z|A-Z|-)'
),
error
=
'ERROR: accountName format error, accountName support using (0-9|a-z|A-Z|-)'
),
'azureShare'
:
And
(
Regex
(
'([0-9]|[a-z]|[A-Z]|-){3,63}'
),
\
'azureShare'
:
And
(
Regex
(
'([0-9]|[a-z]|[A-Z]|-){3,63}'
),
\
error
=
'ERROR: azureShare format error, azureShare support using (0-9|a-z|A-Z|-)'
)
error
=
'ERROR: azureShare format error, azureShare support using (0-9|a-z|A-Z|-)'
)
}
},
Optional
(
'uploadRetryCount'
):
setNumberRange
(
'uploadRetryCount'
,
int
,
1
,
99999
)
})
})
}
}
...
@@ -361,7 +362,8 @@ frameworkcontroller_config_schema = {
...
@@ -361,7 +362,8 @@ frameworkcontroller_config_schema = {
error
=
'ERROR: accountName format error, accountName support using (0-9|a-z|A-Z|-)'
),
error
=
'ERROR: accountName format error, accountName support using (0-9|a-z|A-Z|-)'
),
'azureShare'
:
And
(
Regex
(
'([0-9]|[a-z]|[A-Z]|-){3,63}'
),
\
'azureShare'
:
And
(
Regex
(
'([0-9]|[a-z]|[A-Z]|-){3,63}'
),
\
error
=
'ERROR: azureShare format error, azureShare support using (0-9|a-z|A-Z|-)'
)
error
=
'ERROR: azureShare format error, azureShare support using (0-9|a-z|A-Z|-)'
)
}
},
Optional
(
'uploadRetryCount'
):
setNumberRange
(
'uploadRetryCount'
,
int
,
1
,
99999
)
})
})
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment