Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
2d252c9e
"src/include/gridwise_direct_convolution_1.hip.hpp" did not exist on "39775d484c4d15a5b895edfc9d2323f05ab2d3d4"
Unverified
Commit
2d252c9e
authored
Aug 29, 2019
by
SparkSnail
Committed by
GitHub
Aug 29, 2019
Browse files
Add retry policy for azureStorage (#1480)
parent
a224f4f2
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
125 additions
and
75 deletions
+125
-75
docs/en_US/Tutorial/ExperimentConfig.md
docs/en_US/Tutorial/ExperimentConfig.md
+4
-0
src/nni_manager/rest_server/restValidationSchemas.ts
src/nni_manager/rest_server/restValidationSchemas.ts
+4
-2
src/nni_manager/training_service/kubernetes/azureStorageClientUtils.ts
...er/training_service/kubernetes/azureStorageClientUtils.ts
+41
-28
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts
...frameworkcontroller/frameworkcontrollerTrainingService.ts
+10
-21
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts
...ng_service/kubernetes/kubeflow/kubeflowTrainingService.ts
+8
-19
src/nni_manager/training_service/kubernetes/kubernetesConfig.ts
...i_manager/training_service/kubernetes/kubernetesConfig.ts
+6
-2
src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts
.../training_service/kubernetes/kubernetesTrainingService.ts
+48
-1
tools/nni_cmd/config_schema.py
tools/nni_cmd/config_schema.py
+4
-2
No files found.
docs/en_US/Tutorial/ExperimentConfig.md
View file @
2d252c9e
...
@@ -519,6 +519,10 @@ machineList:
...
@@ -519,6 +519,10 @@ machineList:
__azureShare__ is the share of the azure file storage.
__azureShare__ is the share of the azure file storage.
*
__uploadRetryCount__
If uploading files to Azure storage fails, NNI will retry the upload; this field specifies the number of attempts to re-upload the files.
*
__paiConfig__
*
__paiConfig__
*
__userName__
*
__userName__
...
...
src/nni_manager/rest_server/restValidationSchemas.ts
View file @
2d252c9e
...
@@ -125,7 +125,8 @@ export namespace ValidationSchemas {
...
@@ -125,7 +125,8 @@ export namespace ValidationSchemas {
azureStorage
:
joi
.
object
({
azureStorage
:
joi
.
object
({
accountName
:
joi
.
string
().
regex
(
/^
([
0-9
]
|
[
a-z
]
|
[
A-Z
]
|-
){3,31}
$/
),
accountName
:
joi
.
string
().
regex
(
/^
([
0-9
]
|
[
a-z
]
|
[
A-Z
]
|-
){3,31}
$/
),
azureShare
:
joi
.
string
().
regex
(
/^
([
0-9
]
|
[
a-z
]
|
[
A-Z
]
|-
){3,63}
$/
)
azureShare
:
joi
.
string
().
regex
(
/^
([
0-9
]
|
[
a-z
]
|
[
A-Z
]
|-
){3,63}
$/
)
})
}),
uploadRetryCount
:
joi
.
number
().
min
(
1
)
}),
}),
frameworkcontroller_config
:
joi
.
object
({
frameworkcontroller_config
:
joi
.
object
({
storage
:
joi
.
string
().
min
(
1
),
storage
:
joi
.
string
().
min
(
1
),
...
@@ -141,7 +142,8 @@ export namespace ValidationSchemas {
...
@@ -141,7 +142,8 @@ export namespace ValidationSchemas {
azureStorage
:
joi
.
object
({
azureStorage
:
joi
.
object
({
accountName
:
joi
.
string
().
regex
(
/^
([
0-9
]
|
[
a-z
]
|
[
A-Z
]
|-
){3,31}
$/
),
accountName
:
joi
.
string
().
regex
(
/^
([
0-9
]
|
[
a-z
]
|
[
A-Z
]
|-
){3,31}
$/
),
azureShare
:
joi
.
string
().
regex
(
/^
([
0-9
]
|
[
a-z
]
|
[
A-Z
]
|-
){3,63}
$/
)
azureShare
:
joi
.
string
().
regex
(
/^
([
0-9
]
|
[
a-z
]
|
[
A-Z
]
|-
){3,63}
$/
)
})
}),
uploadRetryCount
:
joi
.
number
().
min
(
1
)
}),
}),
nni_manager_ip
:
joi
.
object
({
nni_manager_ip
:
joi
.
object
({
nniManagerIp
:
joi
.
string
().
min
(
1
)
nniManagerIp
:
joi
.
string
().
min
(
1
)
...
...
src/nni_manager/training_service/kubernetes/azureStorageClientUtils.ts
View file @
2d252c9e
...
@@ -35,15 +35,15 @@ export namespace AzureStorageClientUtility {
...
@@ -35,15 +35,15 @@ export namespace AzureStorageClientUtility {
* @param fileServerClient
* @param fileServerClient
* @param azureShare
* @param azureShare
*/
*/
export
async
function
createShare
(
fileServerClient
:
any
,
azureShare
:
any
):
Promise
<
void
>
{
export
async
function
createShare
(
fileServerClient
:
any
,
azureShare
:
any
):
Promise
<
boolean
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
fileServerClient
.
createShareIfNotExists
(
azureShare
,
(
error
:
any
,
result
:
any
,
response
:
any
)
=>
{
fileServerClient
.
createShareIfNotExists
(
azureShare
,
(
error
:
any
,
result
:
any
,
response
:
any
)
=>
{
if
(
error
)
{
if
(
error
)
{
getLogger
()
getLogger
()
.
error
(
`Create share failed:,
${
error
}
`
);
.
error
(
`Create share failed:,
${
error
}
`
);
deferred
.
re
ject
(
error
);
deferred
.
re
solve
(
false
);
}
else
{
}
else
{
deferred
.
resolve
();
deferred
.
resolve
(
true
);
}
}
});
});
...
@@ -56,18 +56,17 @@ export namespace AzureStorageClientUtility {
...
@@ -56,18 +56,17 @@ export namespace AzureStorageClientUtility {
* @param azureFoler
* @param azureFoler
* @param azureShare
* @param azureShare
*/
*/
export
async
function
createDirectory
(
fileServerClient
:
azureStorage
.
FileService
,
azureFoler
:
any
,
azureShare
:
any
):
Promise
<
void
>
{
export
async
function
createDirectory
(
fileServerClient
:
azureStorage
.
FileService
,
azureFoler
:
any
,
azureShare
:
any
):
Promise
<
boolean
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
fileServerClient
.
createDirectoryIfNotExists
(
azureShare
,
azureFoler
,
(
error
:
any
,
result
:
any
,
response
:
any
)
=>
{
fileServerClient
.
createDirectoryIfNotExists
(
azureShare
,
azureFoler
,
(
error
:
any
,
result
:
any
,
response
:
any
)
=>
{
if
(
error
)
{
if
(
error
)
{
getLogger
()
getLogger
()
.
error
(
`Create directory failed:,
${
error
}
`
);
.
error
(
`Create directory failed:,
${
error
}
`
);
deferred
.
re
ject
(
error
);
deferred
.
re
solve
(
false
);
}
else
{
}
else
{
deferred
.
resolve
();
deferred
.
resolve
(
true
);
}
}
});
});
return
deferred
.
promise
;
return
deferred
.
promise
;
}
}
...
@@ -77,16 +76,20 @@ export namespace AzureStorageClientUtility {
...
@@ -77,16 +76,20 @@ export namespace AzureStorageClientUtility {
* @param azureDirectory
* @param azureDirectory
*/
*/
export
async
function
createDirectoryRecursive
(
fileServerClient
:
azureStorage
.
FileService
,
azureDirectory
:
string
,
export
async
function
createDirectoryRecursive
(
fileServerClient
:
azureStorage
.
FileService
,
azureDirectory
:
string
,
azureShare
:
any
):
Promise
<
void
>
{
azureShare
:
any
):
Promise
<
boolean
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
const
directories
:
string
[]
=
azureDirectory
.
split
(
'
/
'
);
const
directories
:
string
[]
=
azureDirectory
.
split
(
'
/
'
);
let
rootDirectory
:
string
=
''
;
let
rootDirectory
:
string
=
''
;
for
(
const
directory
of
directories
)
{
for
(
const
directory
of
directories
)
{
rootDirectory
+=
directory
;
rootDirectory
+=
directory
;
await
createDirectory
(
fileServerClient
,
rootDirectory
,
azureShare
);
let
result
:
boolean
=
await
createDirectory
(
fileServerClient
,
rootDirectory
,
azureShare
);
if
(
!
result
)
{
deferred
.
resolve
(
false
);
return
deferred
.
promise
;
}
rootDirectory
+=
'
/
'
;
rootDirectory
+=
'
/
'
;
}
}
deferred
.
resolve
();
deferred
.
resolve
(
true
);
return
deferred
.
promise
;
return
deferred
.
promise
;
}
}
...
@@ -100,16 +103,16 @@ export namespace AzureStorageClientUtility {
...
@@ -100,16 +103,16 @@ export namespace AzureStorageClientUtility {
* @param localFilePath
* @param localFilePath
*/
*/
async
function
uploadFileToAzure
(
fileServerClient
:
any
,
azureDirectory
:
string
,
azureFileName
:
any
,
azureShare
:
any
,
async
function
uploadFileToAzure
(
fileServerClient
:
any
,
azureDirectory
:
string
,
azureFileName
:
any
,
azureShare
:
any
,
localFilePath
:
string
):
Promise
<
void
>
{
localFilePath
:
string
):
Promise
<
boolean
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
await
fileServerClient
.
createFileFromLocalFile
(
azureShare
,
azureDirectory
,
azureFileName
,
localFilePath
,
await
fileServerClient
.
createFileFromLocalFile
(
azureShare
,
azureDirectory
,
azureFileName
,
localFilePath
,
(
error
:
any
,
result
:
any
,
response
:
any
)
=>
{
(
error
:
any
,
result
:
any
,
response
:
any
)
=>
{
if
(
error
)
{
if
(
error
)
{
getLogger
()
getLogger
()
.
error
(
`Upload file failed:,
${
error
}
`
);
.
error
(
`Upload file failed:,
${
error
}
`
);
deferred
.
re
ject
(
error
);
deferred
.
re
solve
(
false
);
}
else
{
}
else
{
deferred
.
resolve
();
deferred
.
resolve
(
true
);
}
}
});
});
...
@@ -125,17 +128,17 @@ export namespace AzureStorageClientUtility {
...
@@ -125,17 +128,17 @@ export namespace AzureStorageClientUtility {
* @param localFilePath
* @param localFilePath
*/
*/
async
function
downloadFile
(
fileServerClient
:
any
,
azureDirectory
:
string
,
azureFileName
:
any
,
azureShare
:
any
,
async
function
downloadFile
(
fileServerClient
:
any
,
azureDirectory
:
string
,
azureFileName
:
any
,
azureShare
:
any
,
localFilePath
:
string
):
Promise
<
void
>
{
localFilePath
:
string
):
Promise
<
boolean
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
// tslint:disable-next-line:non-literal-fs-path
// tslint:disable-next-line:non-literal-fs-path
await
fileServerClient
.
getFileToStream
(
azureShare
,
azureDirectory
,
azureFileName
,
fs
.
createWriteStream
(
localFilePath
),
await
fileServerClient
.
getFileToStream
(
azureShare
,
azureDirectory
,
azureFileName
,
fs
.
createWriteStream
(
localFilePath
),
(
error
:
any
,
result
:
any
,
response
:
any
)
=>
{
(
error
:
any
,
result
:
any
,
response
:
any
)
=>
{
if
(
error
)
{
if
(
error
)
{
getLogger
()
getLogger
()
.
error
(
`Download file failed:,
${
error
}
`
);
.
error
(
`Download file failed:,
${
error
}
`
);
deferred
.
re
ject
(
error
);
deferred
.
re
solve
(
false
);
}
else
{
}
else
{
deferred
.
resolve
();
deferred
.
resolve
(
true
);
}
}
});
});
...
@@ -151,28 +154,38 @@ export namespace AzureStorageClientUtility {
...
@@ -151,28 +154,38 @@ export namespace AzureStorageClientUtility {
*/
*/
// tslint:disable:non-literal-fs-path
// tslint:disable:non-literal-fs-path
export
async
function
uploadDirectory
(
fileServerClient
:
azureStorage
.
FileService
,
azureDirectory
:
string
,
azureShare
:
any
,
export
async
function
uploadDirectory
(
fileServerClient
:
azureStorage
.
FileService
,
azureDirectory
:
string
,
azureShare
:
any
,
localDirectory
:
string
):
Promise
<
void
>
{
localDirectory
:
string
):
Promise
<
boolean
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
const
fileNameArray
:
string
[]
=
fs
.
readdirSync
(
localDirectory
);
const
fileNameArray
:
string
[]
=
fs
.
readdirSync
(
localDirectory
);
await
createDirectoryRecursive
(
fileServerClient
,
azureDirectory
,
azureShare
);
let
result
:
boolean
=
await
createDirectoryRecursive
(
fileServerClient
,
azureDirectory
,
azureShare
);
if
(
!
result
)
{
deferred
.
resolve
(
false
);
return
deferred
.
promise
;
}
for
(
const
fileName
of
fileNameArray
)
{
for
(
const
fileName
of
fileNameArray
)
{
const
fullFilePath
:
string
=
path
.
join
(
localDirectory
,
fileName
);
const
fullFilePath
:
string
=
path
.
join
(
localDirectory
,
fileName
);
try
{
try
{
let
resultUploadFile
:
boolean
=
true
;
let
resultUploadDir
:
boolean
=
true
;
if
(
fs
.
lstatSync
(
fullFilePath
)
if
(
fs
.
lstatSync
(
fullFilePath
)
.
isFile
())
{
.
isFile
())
{
await
uploadFileToAzure
(
fileServerClient
,
azureDirectory
,
fileName
,
azureShare
,
fullFilePath
);
resultUploadFile
=
await
uploadFileToAzure
(
fileServerClient
,
azureDirectory
,
fileName
,
azureShare
,
fullFilePath
);
}
else
{
}
else
{
// If filePath is a directory, recuisively copy it to azure
// If filePath is a directory, recuisively copy it to azure
await
uploadDirectory
(
fileServerClient
,
String
.
Format
(
'
{0}/{1}
'
,
azureDirectory
,
fileName
),
azureShare
,
fullFilePath
);
resultUploadDir
=
await
uploadDirectory
(
fileServerClient
,
String
.
Format
(
'
{0}/{1}
'
,
azureDirectory
,
fileName
),
azureShare
,
fullFilePath
);
}
if
(
!
(
resultUploadFile
&&
resultUploadDir
))
{
deferred
.
resolve
(
false
);
return
deferred
.
promise
;
}
}
}
catch
(
error
)
{
}
catch
(
error
)
{
deferred
.
re
ject
(
error
);
deferred
.
re
solve
(
false
);
return
deferred
.
promise
;
return
deferred
.
promise
;
}
}
}
}
// All files/directories are copied successfully, resolve
// All files/directories are copied successfully, resolve
deferred
.
resolve
();
deferred
.
resolve
(
true
);
return
deferred
.
promise
;
return
deferred
.
promise
;
}
}
...
...
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts
View file @
2d252c9e
...
@@ -25,7 +25,7 @@ import * as path from 'path';
...
@@ -25,7 +25,7 @@ import * as path from 'path';
import
*
as
component
from
'
../../../common/component
'
;
import
*
as
component
from
'
../../../common/component
'
;
import
{
getExperimentId
}
from
'
../../../common/experimentStartupInfo
'
;
import
{
getExperimentId
}
from
'
../../../common/experimentStartupInfo
'
;
import
{
import
{
JobApplicationForm
,
NNIManagerIpConfig
,
TrialJobApplicationForm
,
TrialJobDetail
JobApplicationForm
,
NNIManagerIpConfig
,
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobStatus
}
from
'
../../../common/trainingService
'
;
}
from
'
../../../common/trainingService
'
;
import
{
delay
,
generateParamFileName
,
getExperimentRootDir
,
uniqueString
}
from
'
../../../common/utils
'
;
import
{
delay
,
generateParamFileName
,
getExperimentRootDir
,
uniqueString
}
from
'
../../../common/utils
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../../common/containerJobData
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../../common/containerJobData
'
;
...
@@ -102,10 +102,13 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
...
@@ -102,10 +102,13 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
//upload code files
//upload code files
const
trialJobOutputUrl
:
string
=
await
this
.
uploadCodeFiles
(
trialJobId
,
trialLocalTempFolder
);
const
trialJobOutputUrl
:
string
=
await
this
.
uploadCodeFiles
(
trialJobId
,
trialLocalTempFolder
);
let
initStatus
:
TrialJobStatus
=
'
WAITING
'
;
if
(
!
trialJobOutputUrl
)
{
initStatus
=
'
FAILED
'
;
}
const
trialJobDetail
:
KubernetesTrialJobDetail
=
new
KubernetesTrialJobDetail
(
const
trialJobDetail
:
KubernetesTrialJobDetail
=
new
KubernetesTrialJobDetail
(
trialJobId
,
trialJobId
,
'
WAITING
'
,
initStatus
,
Date
.
now
(),
Date
.
now
(),
trialWorkingFolder
,
trialWorkingFolder
,
form
,
form
,
...
@@ -208,24 +211,10 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
...
@@ -208,24 +211,10 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
let
trialJobOutputUrl
:
string
=
''
;
let
trialJobOutputUrl
:
string
=
''
;
if
(
this
.
fcClusterConfig
.
storageType
===
'
azureStorage
'
)
{
if
(
this
.
fcClusterConfig
.
storageType
===
'
azureStorage
'
)
{
if
(
this
.
azureStorageClient
===
undefined
)
{
const
azureFrameworkControllerClusterConfig
:
FrameworkControllerClusterConfigAzure
=
throw
new
Error
(
'
azureStorageClient is not initialized
'
);
<
FrameworkControllerClusterConfigAzure
>
this
.
fcClusterConfig
;
}
trialJobOutputUrl
=
await
this
.
uploadFilesToAzureStorage
(
trialJobId
,
trialLocalTempFolder
,
this
.
fcTrialConfig
.
codeDir
,
try
{
azureFrameworkControllerClusterConfig
.
uploadRetryCount
);
//upload local files, including scripts for running the trial and configuration (e.g., hyperparameters) for the trial, to azure storage
await
AzureStorageClientUtility
.
uploadDirectory
(
this
.
azureStorageClient
,
`nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
,
this
.
azureStorageShare
,
`
${
trialLocalTempFolder
}
`
);
//upload code files to azure storage
await
AzureStorageClientUtility
.
uploadDirectory
(
this
.
azureStorageClient
,
`nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
,
this
.
azureStorageShare
,
`
${
this
.
fcTrialConfig
.
codeDir
}
`
);
trialJobOutputUrl
=
`https://
${
this
.
azureStorageAccountName
}
.file.core.windows.net/`
+
`
${
this
.
azureStorageShare
}
/
${
path
.
join
(
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
;
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
return
Promise
.
reject
(
error
);
}
}
else
if
(
this
.
fcClusterConfig
.
storageType
===
'
nfs
'
)
{
}
else
if
(
this
.
fcClusterConfig
.
storageType
===
'
nfs
'
)
{
const
nfsFrameworkControllerClusterConfig
:
FrameworkControllerClusterConfigNFS
=
const
nfsFrameworkControllerClusterConfig
:
FrameworkControllerClusterConfigNFS
=
<
FrameworkControllerClusterConfigNFS
>
this
.
fcClusterConfig
;
<
FrameworkControllerClusterConfigNFS
>
this
.
fcClusterConfig
;
...
...
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts
View file @
2d252c9e
...
@@ -27,7 +27,7 @@ import * as component from '../../../common/component';
...
@@ -27,7 +27,7 @@ import * as component from '../../../common/component';
import
{
getExperimentId
}
from
'
../../../common/experimentStartupInfo
'
;
import
{
getExperimentId
}
from
'
../../../common/experimentStartupInfo
'
;
import
{
import
{
JobApplicationForm
,
NNIManagerIpConfig
,
TrialJobApplicationForm
,
TrialJobDetail
JobApplicationForm
,
NNIManagerIpConfig
,
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobStatus
}
from
'
../../../common/trainingService
'
;
}
from
'
../../../common/trainingService
'
;
import
{
delay
,
generateParamFileName
,
getExperimentRootDir
,
uniqueString
}
from
'
../../../common/utils
'
;
import
{
delay
,
generateParamFileName
,
getExperimentRootDir
,
uniqueString
}
from
'
../../../common/utils
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../../common/containerJobData
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../../common/containerJobData
'
;
...
@@ -102,9 +102,13 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -102,9 +102,13 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
await
this
.
prepareRunScript
(
trialLocalTempFolder
,
trialJobId
,
trialWorkingFolder
,
curTrialSequenceId
,
form
);
await
this
.
prepareRunScript
(
trialLocalTempFolder
,
trialJobId
,
trialWorkingFolder
,
curTrialSequenceId
,
form
);
//upload files to sotrage
//upload files to sotrage
const
trialJobOutputUrl
:
string
=
await
this
.
uploadCodeFiles
(
trialJobId
,
trialLocalTempFolder
);
const
trialJobOutputUrl
:
string
=
await
this
.
uploadCodeFiles
(
trialJobId
,
trialLocalTempFolder
);
let
initStatus
:
TrialJobStatus
=
'
WAITING
'
;
if
(
!
trialJobOutputUrl
)
{
initStatus
=
'
FAILED
'
;
}
const
trialJobDetail
:
KubernetesTrialJobDetail
=
new
KubernetesTrialJobDetail
(
const
trialJobDetail
:
KubernetesTrialJobDetail
=
new
KubernetesTrialJobDetail
(
trialJobId
,
trialJobId
,
'
WAITING
'
,
initStatus
,
Date
.
now
(),
Date
.
now
(),
trialWorkingFolder
,
trialWorkingFolder
,
form
,
form
,
...
@@ -215,23 +219,8 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -215,23 +219,8 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
if
(
this
.
azureStorageClient
===
undefined
)
{
if
(
this
.
azureStorageClient
===
undefined
)
{
throw
new
Error
(
'
azureStorageClient is not initialized
'
);
throw
new
Error
(
'
azureStorageClient is not initialized
'
);
}
}
try
{
const
azureKubeflowClusterConfig
:
KubeflowClusterConfigAzure
=
<
KubeflowClusterConfigAzure
>
this
.
kubeflowClusterConfig
;
//upload local files, including scripts for running the trial and configuration (e.g., hyperparameters) for the trial, to azure storage
trialJobOutputUrl
=
await
this
.
uploadFilesToAzureStorage
(
trialJobId
,
trialLocalTempFolder
,
this
.
kubeflowTrialConfig
.
codeDir
,
azureKubeflowClusterConfig
.
uploadRetryCount
);
await
AzureStorageClientUtility
.
uploadDirectory
(
this
.
azureStorageClient
,
`nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
,
this
.
azureStorageShare
,
`
${
trialLocalTempFolder
}
`
);
//upload code files to azure storage
await
AzureStorageClientUtility
.
uploadDirectory
(
this
.
azureStorageClient
,
`nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
,
this
.
azureStorageShare
,
`
${
this
.
kubeflowTrialConfig
.
codeDir
}
`
);
trialJobOutputUrl
=
`https://
${
this
.
azureStorageAccountName
}
.file.core.windows.net/
${
this
.
azureStorageShare
}
`
+
`/
${
path
.
join
(
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
;
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
return
Promise
.
reject
(
error
);
}
}
else
if
(
this
.
kubeflowClusterConfig
.
storage
===
'
nfs
'
||
this
.
kubeflowClusterConfig
.
storage
===
undefined
)
{
}
else
if
(
this
.
kubeflowClusterConfig
.
storage
===
'
nfs
'
||
this
.
kubeflowClusterConfig
.
storage
===
undefined
)
{
const
nfsKubeflowClusterConfig
:
KubeflowClusterConfigNFS
=
<
KubeflowClusterConfigNFS
>
this
.
kubeflowClusterConfig
;
const
nfsKubeflowClusterConfig
:
KubeflowClusterConfigNFS
=
<
KubeflowClusterConfigNFS
>
this
.
kubeflowClusterConfig
;
// Creat work dir for current trial in NFS directory
// Creat work dir for current trial in NFS directory
...
...
src/nni_manager/training_service/kubernetes/kubernetesConfig.ts
View file @
2d252c9e
...
@@ -75,16 +75,19 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig {
...
@@ -75,16 +75,19 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig {
export
class
KubernetesClusterConfigAzure
extends
KubernetesClusterConfig
{
export
class
KubernetesClusterConfigAzure
extends
KubernetesClusterConfig
{
public
readonly
keyVault
:
KeyVaultConfig
;
public
readonly
keyVault
:
KeyVaultConfig
;
public
readonly
azureStorage
:
AzureStorage
;
public
readonly
azureStorage
:
AzureStorage
;
public
readonly
uploadRetryCount
:
number
|
undefined
;
constructor
(
constructor
(
apiVersion
:
string
,
apiVersion
:
string
,
keyVault
:
KeyVaultConfig
,
keyVault
:
KeyVaultConfig
,
azureStorage
:
AzureStorage
,
azureStorage
:
AzureStorage
,
storage
?:
KubernetesStorageKind
storage
?:
KubernetesStorageKind
,
uploadRetryCount
?:
number
)
{
)
{
super
(
apiVersion
,
storage
);
super
(
apiVersion
,
storage
);
this
.
keyVault
=
keyVault
;
this
.
keyVault
=
keyVault
;
this
.
azureStorage
=
azureStorage
;
this
.
azureStorage
=
azureStorage
;
this
.
uploadRetryCount
=
uploadRetryCount
;
}
}
public
get
storageType
():
KubernetesStorageKind
{
public
get
storageType
():
KubernetesStorageKind
{
...
@@ -98,7 +101,8 @@ export class KubernetesClusterConfigAzure extends KubernetesClusterConfig {
...
@@ -98,7 +101,8 @@ export class KubernetesClusterConfigAzure extends KubernetesClusterConfig {
kubernetesClusterConfigObjectAzure
.
apiVersion
,
kubernetesClusterConfigObjectAzure
.
apiVersion
,
kubernetesClusterConfigObjectAzure
.
keyVault
,
kubernetesClusterConfigObjectAzure
.
keyVault
,
kubernetesClusterConfigObjectAzure
.
azureStorage
,
kubernetesClusterConfigObjectAzure
.
azureStorage
,
kubernetesClusterConfigObjectAzure
.
storage
kubernetesClusterConfigObjectAzure
.
storage
,
kubernetesClusterConfigObjectAzure
.
uploadRetryCount
);
);
}
}
}
}
...
...
src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts
View file @
2d252c9e
...
@@ -31,13 +31,14 @@ import { getLogger, Logger } from '../../common/log';
...
@@ -31,13 +31,14 @@ import { getLogger, Logger } from '../../common/log';
import
{
import
{
NNIManagerIpConfig
,
TrialJobDetail
,
TrialJobMetric
NNIManagerIpConfig
,
TrialJobDetail
,
TrialJobMetric
}
from
'
../../common/trainingService
'
;
}
from
'
../../common/trainingService
'
;
import
{
getExperimentRootDir
,
getIPV4Address
,
getJobCancelStatus
,
getVersion
,
uniqueString
}
from
'
../../common/utils
'
;
import
{
delay
,
getExperimentRootDir
,
getIPV4Address
,
getJobCancelStatus
,
getVersion
,
uniqueString
}
from
'
../../common/utils
'
;
import
{
AzureStorageClientUtility
}
from
'
./azureStorageClientUtils
'
;
import
{
AzureStorageClientUtility
}
from
'
./azureStorageClientUtils
'
;
import
{
GeneralK8sClient
,
KubernetesCRDClient
}
from
'
./kubernetesApiClient
'
;
import
{
GeneralK8sClient
,
KubernetesCRDClient
}
from
'
./kubernetesApiClient
'
;
import
{
KubernetesClusterConfig
}
from
'
./kubernetesConfig
'
;
import
{
KubernetesClusterConfig
}
from
'
./kubernetesConfig
'
;
import
{
kubernetesScriptFormat
,
KubernetesTrialJobDetail
}
from
'
./kubernetesData
'
;
import
{
kubernetesScriptFormat
,
KubernetesTrialJobDetail
}
from
'
./kubernetesData
'
;
import
{
KubernetesJobRestServer
}
from
'
./kubernetesJobRestServer
'
;
import
{
KubernetesJobRestServer
}
from
'
./kubernetesJobRestServer
'
;
var
yaml
=
require
(
'
js-yaml
'
);
var
fs
=
require
(
'
fs
'
);
var
fs
=
require
(
'
fs
'
);
/**
/**
...
@@ -357,6 +358,52 @@ abstract class KubernetesTrainingService {
...
@@ -357,6 +358,52 @@ abstract class KubernetesTrainingService {
);
);
return
registrySecretName
;
return
registrySecretName
;
}
}
/**
 * Upload a trial's files to Azure file storage, retrying failed uploads.
 *
 * Two directories are uploaded into `nni/<experimentId>/<trialJobId>` on the
 * configured share: the trial's local temp folder (run scripts and trial
 * configuration such as hyperparameters) and the user's code directory.
 * A directory that has already been uploaded successfully is not uploaded
 * again on a later attempt.
 *
 * @param trialJobId id of the trial; used in the remote directory and in the returned URL
 * @param trialLocalTempFolder local folder holding the generated trial scripts/configuration
 * @param codeDir local directory holding the trial code
 * @param uploadRetryCount number of re-upload attempts made after a failed upload;
 *        a falsy value (undefined or 0) falls back to 1
 * @returns the URL of the trial's output directory on success, or an empty string
 *          when the upload still fails after all retries or an error is thrown
 * @throws Error when `azureStorageClient` has not been initialized
 */
protected async uploadFilesToAzureStorage(trialJobId: string, trialLocalTempFolder: String, codeDir: String, uploadRetryCount: number | undefined): Promise<string> {
    if (this.azureStorageClient === undefined) {
        throw new Error('azureStorageClient is not initialized');
    }
    const maxRetries: number = uploadRetryCount ? uploadRetryCount : 1;
    const remoteDirectory: string = `nni/${getExperimentId()}/${trialJobId}`;
    let resultUploadNNIScript: boolean = false;
    let resultUploadCodeFile: boolean = false;
    try {
        // One initial attempt plus at most `maxRetries` re-uploads. The previous
        // `do { } while (retryCount-- >= 0)` loop ran one attempt too many and
        // also slept after the final failure, when no retry followed.
        for (let attempt: number = 0; attempt <= maxRetries; attempt++) {
            if (attempt > 0) {
                this.log.info('Upload failed, Retry: upload files to azure-storage');
                //wait for 5 seconds to re-upload files
                await delay(5000);
            }
            //upload local files, including scripts for running the trial and configuration (e.g., hyperparameters) for the trial, to azure storage
            if (!resultUploadNNIScript) {
                resultUploadNNIScript = await AzureStorageClientUtility.uploadDirectory(
                    this.azureStorageClient, remoteDirectory, this.azureStorageShare, `${trialLocalTempFolder}`);
            }
            //upload code files to azure storage
            if (!resultUploadCodeFile) {
                resultUploadCodeFile = await AzureStorageClientUtility.uploadDirectory(
                    this.azureStorageClient, remoteDirectory, this.azureStorageShare, `${codeDir}`);
            }
            if (resultUploadNNIScript && resultUploadCodeFile) {
                return `https://${this.azureStorageAccountName}.file.core.windows.net/${this.azureStorageShare}`
                    + `/${path.join('nni', getExperimentId(), trialJobId, 'output')}`;
            }
        }
    } catch (error) {
        this.log.error(error);

        //return a empty url when got error
        return '';
    }
    // Callers treat an empty URL as "upload failed" and mark the trial FAILED.
    this.log.info(`Retry-count is used up, upload files to azureStorage for trial ${trialJobId} failed!`);

    return '';
}
}
}
export
{
KubernetesTrainingService
};
export
{
KubernetesTrainingService
};
tools/nni_cmd/config_schema.py
View file @
2d252c9e
...
@@ -315,7 +315,8 @@ kubeflow_config_schema = {
...
@@ -315,7 +315,8 @@ kubeflow_config_schema = {
error
=
'ERROR: accountName format error, accountName support using (0-9|a-z|A-Z|-)'
),
error
=
'ERROR: accountName format error, accountName support using (0-9|a-z|A-Z|-)'
),
'azureShare'
:
And
(
Regex
(
'([0-9]|[a-z]|[A-Z]|-){3,63}'
),
\
'azureShare'
:
And
(
Regex
(
'([0-9]|[a-z]|[A-Z]|-){3,63}'
),
\
error
=
'ERROR: azureShare format error, azureShare support using (0-9|a-z|A-Z|-)'
)
error
=
'ERROR: azureShare format error, azureShare support using (0-9|a-z|A-Z|-)'
)
}
},
Optional
(
'uploadRetryCount'
):
setNumberRange
(
'uploadRetryCount'
,
int
,
1
,
99999
)
})
})
}
}
...
@@ -361,7 +362,8 @@ frameworkcontroller_config_schema = {
...
@@ -361,7 +362,8 @@ frameworkcontroller_config_schema = {
error
=
'ERROR: accountName format error, accountName support using (0-9|a-z|A-Z|-)'
),
error
=
'ERROR: accountName format error, accountName support using (0-9|a-z|A-Z|-)'
),
'azureShare'
:
And
(
Regex
(
'([0-9]|[a-z]|[A-Z]|-){3,63}'
),
\
'azureShare'
:
And
(
Regex
(
'([0-9]|[a-z]|[A-Z]|-){3,63}'
),
\
error
=
'ERROR: azureShare format error, azureShare support using (0-9|a-z|A-Z|-)'
)
error
=
'ERROR: azureShare format error, azureShare support using (0-9|a-z|A-Z|-)'
)
}
},
Optional
(
'uploadRetryCount'
):
setNumberRange
(
'uploadRetryCount'
,
int
,
1
,
99999
)
})
})
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment