Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
871f031d
Unverified
Commit
871f031d
authored
Sep 04, 2019
by
Guoxin
Committed by
GitHub
Sep 04, 2019
Browse files
Merge pull request #1520 from suiguoxin/v1.0-conf-resolve
merge v1.0 back to master (conflicts resolved)
parents
8f71479e
b75a2914
Changes
36
Hide whitespace changes
Inline
Side-by-side
Showing
16 changed files
with
167 additions
and
97 deletions
+167
-97
src/nni_manager/rest_server/restValidationSchemas.ts
src/nni_manager/rest_server/restValidationSchemas.ts
+9
-2
src/nni_manager/training_service/kubernetes/azureStorageClientUtils.ts
...er/training_service/kubernetes/azureStorageClientUtils.ts
+41
-28
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts
...frameworkcontroller/frameworkcontrollerTrainingService.ts
+10
-21
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts
...ng_service/kubernetes/kubeflow/kubeflowTrainingService.ts
+8
-19
src/nni_manager/training_service/kubernetes/kubernetesConfig.ts
...i_manager/training_service/kubernetes/kubernetesConfig.ts
+6
-2
src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts
.../training_service/kubernetes/kubernetesTrainingService.ts
+48
-1
src/nni_manager/training_service/pai/paiConfig.ts
src/nni_manager/training_service/pai/paiConfig.ts
+18
-2
src/nni_manager/training_service/pai/paiTrainingService.ts
src/nni_manager/training_service/pai/paiTrainingService.ts
+3
-0
src/nni_manager/yarn.lock
src/nni_manager/yarn.lock
+1
-8
src/sdk/pynni/nni/nas_utils.py
src/sdk/pynni/nni/nas_utils.py
+2
-2
src/sdk/pynni/nni/smartparam.py
src/sdk/pynni/nni/smartparam.py
+1
-1
src/sdk/pynni/nni/trial.py
src/sdk/pynni/nni/trial.py
+3
-1
src/webui/src/components/trial-detail/DefaultMetricPoint.tsx
src/webui/src/components/trial-detail/DefaultMetricPoint.tsx
+3
-1
src/webui/src/static/style/table.scss
src/webui/src/static/style/table.scss
+3
-2
tools/nni_cmd/config_schema.py
tools/nni_cmd/config_schema.py
+10
-3
tools/nni_cmd/launcher_utils.py
tools/nni_cmd/launcher_utils.py
+1
-4
No files found.
src/nni_manager/rest_server/restValidationSchemas.ts
View file @
871f031d
...
...
@@ -53,6 +53,11 @@ export namespace ValidationSchemas {
shmMB
:
joi
.
number
(),
authFile
:
joi
.
string
(),
nasMode
:
joi
.
string
().
valid
(
'
classic_mode
'
,
'
enas_mode
'
,
'
oneshot_mode
'
,
'
darts_mode
'
),
portList
:
joi
.
array
().
items
(
joi
.
object
({
label
:
joi
.
string
().
required
(),
beginAt
:
joi
.
number
().
required
(),
portNumber
:
joi
.
number
().
required
(),
})),
worker
:
joi
.
object
({
replicas
:
joi
.
number
().
min
(
1
).
required
(),
image
:
joi
.
string
().
min
(
1
),
...
...
@@ -120,7 +125,8 @@ export namespace ValidationSchemas {
azureStorage
:
joi
.
object
({
accountName
:
joi
.
string
().
regex
(
/^
([
0-9
]
|
[
a-z
]
|
[
A-Z
]
|-
){3,31}
$/
),
azureShare
:
joi
.
string
().
regex
(
/^
([
0-9
]
|
[
a-z
]
|
[
A-Z
]
|-
){3,63}
$/
)
})
}),
uploadRetryCount
:
joi
.
number
().
min
(
1
)
}),
frameworkcontroller_config
:
joi
.
object
({
storage
:
joi
.
string
().
min
(
1
),
...
...
@@ -136,7 +142,8 @@ export namespace ValidationSchemas {
azureStorage
:
joi
.
object
({
accountName
:
joi
.
string
().
regex
(
/^
([
0-9
]
|
[
a-z
]
|
[
A-Z
]
|-
){3,31}
$/
),
azureShare
:
joi
.
string
().
regex
(
/^
([
0-9
]
|
[
a-z
]
|
[
A-Z
]
|-
){3,63}
$/
)
})
}),
uploadRetryCount
:
joi
.
number
().
min
(
1
)
}),
nni_manager_ip
:
joi
.
object
({
nniManagerIp
:
joi
.
string
().
min
(
1
)
...
...
src/nni_manager/training_service/kubernetes/azureStorageClientUtils.ts
View file @
871f031d
...
...
@@ -35,15 +35,15 @@ export namespace AzureStorageClientUtility {
* @param fileServerClient
* @param azureShare
*/
export
async
function
createShare
(
fileServerClient
:
any
,
azureShare
:
any
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
export
async
function
createShare
(
fileServerClient
:
any
,
azureShare
:
any
):
Promise
<
boolean
>
{
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
fileServerClient
.
createShareIfNotExists
(
azureShare
,
(
error
:
any
,
result
:
any
,
response
:
any
)
=>
{
if
(
error
)
{
getLogger
()
.
error
(
`Create share failed:,
${
error
}
`
);
deferred
.
re
ject
(
error
);
deferred
.
re
solve
(
false
);
}
else
{
deferred
.
resolve
();
deferred
.
resolve
(
true
);
}
});
...
...
@@ -56,18 +56,17 @@ export namespace AzureStorageClientUtility {
* @param azureFoler
* @param azureShare
*/
export
async
function
createDirectory
(
fileServerClient
:
azureStorage
.
FileService
,
azureFoler
:
any
,
azureShare
:
any
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
export
async
function
createDirectory
(
fileServerClient
:
azureStorage
.
FileService
,
azureFoler
:
any
,
azureShare
:
any
):
Promise
<
boolean
>
{
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
fileServerClient
.
createDirectoryIfNotExists
(
azureShare
,
azureFoler
,
(
error
:
any
,
result
:
any
,
response
:
any
)
=>
{
if
(
error
)
{
getLogger
()
.
error
(
`Create directory failed:,
${
error
}
`
);
deferred
.
re
ject
(
error
);
deferred
.
re
solve
(
false
);
}
else
{
deferred
.
resolve
();
deferred
.
resolve
(
true
);
}
});
return
deferred
.
promise
;
}
...
...
@@ -77,16 +76,20 @@ export namespace AzureStorageClientUtility {
* @param azureDirectory
*/
export
async
function
createDirectoryRecursive
(
fileServerClient
:
azureStorage
.
FileService
,
azureDirectory
:
string
,
azureShare
:
any
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
azureShare
:
any
):
Promise
<
boolean
>
{
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
const
directories
:
string
[]
=
azureDirectory
.
split
(
'
/
'
);
let
rootDirectory
:
string
=
''
;
for
(
const
directory
of
directories
)
{
rootDirectory
+=
directory
;
await
createDirectory
(
fileServerClient
,
rootDirectory
,
azureShare
);
let
result
:
boolean
=
await
createDirectory
(
fileServerClient
,
rootDirectory
,
azureShare
);
if
(
!
result
)
{
deferred
.
resolve
(
false
);
return
deferred
.
promise
;
}
rootDirectory
+=
'
/
'
;
}
deferred
.
resolve
();
deferred
.
resolve
(
true
);
return
deferred
.
promise
;
}
...
...
@@ -100,16 +103,16 @@ export namespace AzureStorageClientUtility {
* @param localFilePath
*/
async
function
uploadFileToAzure
(
fileServerClient
:
any
,
azureDirectory
:
string
,
azureFileName
:
any
,
azureShare
:
any
,
localFilePath
:
string
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
localFilePath
:
string
):
Promise
<
boolean
>
{
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
await
fileServerClient
.
createFileFromLocalFile
(
azureShare
,
azureDirectory
,
azureFileName
,
localFilePath
,
(
error
:
any
,
result
:
any
,
response
:
any
)
=>
{
if
(
error
)
{
getLogger
()
.
error
(
`Upload file failed:,
${
error
}
`
);
deferred
.
re
ject
(
error
);
deferred
.
re
solve
(
false
);
}
else
{
deferred
.
resolve
();
deferred
.
resolve
(
true
);
}
});
...
...
@@ -125,17 +128,17 @@ export namespace AzureStorageClientUtility {
* @param localFilePath
*/
async
function
downloadFile
(
fileServerClient
:
any
,
azureDirectory
:
string
,
azureFileName
:
any
,
azureShare
:
any
,
localFilePath
:
string
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
localFilePath
:
string
):
Promise
<
boolean
>
{
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
// tslint:disable-next-line:non-literal-fs-path
await
fileServerClient
.
getFileToStream
(
azureShare
,
azureDirectory
,
azureFileName
,
fs
.
createWriteStream
(
localFilePath
),
(
error
:
any
,
result
:
any
,
response
:
any
)
=>
{
if
(
error
)
{
getLogger
()
.
error
(
`Download file failed:,
${
error
}
`
);
deferred
.
re
ject
(
error
);
deferred
.
re
solve
(
false
);
}
else
{
deferred
.
resolve
();
deferred
.
resolve
(
true
);
}
});
...
...
@@ -151,28 +154,38 @@ export namespace AzureStorageClientUtility {
*/
// tslint:disable:non-literal-fs-path
export
async
function
uploadDirectory
(
fileServerClient
:
azureStorage
.
FileService
,
azureDirectory
:
string
,
azureShare
:
any
,
localDirectory
:
string
):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
localDirectory
:
string
):
Promise
<
boolean
>
{
const
deferred
:
Deferred
<
boolean
>
=
new
Deferred
<
boolean
>
();
const
fileNameArray
:
string
[]
=
fs
.
readdirSync
(
localDirectory
);
await
createDirectoryRecursive
(
fileServerClient
,
azureDirectory
,
azureShare
);
let
result
:
boolean
=
await
createDirectoryRecursive
(
fileServerClient
,
azureDirectory
,
azureShare
);
if
(
!
result
)
{
deferred
.
resolve
(
false
);
return
deferred
.
promise
;
}
for
(
const
fileName
of
fileNameArray
)
{
const
fullFilePath
:
string
=
path
.
join
(
localDirectory
,
fileName
);
try
{
let
resultUploadFile
:
boolean
=
true
;
let
resultUploadDir
:
boolean
=
true
;
if
(
fs
.
lstatSync
(
fullFilePath
)
.
isFile
())
{
await
uploadFileToAzure
(
fileServerClient
,
azureDirectory
,
fileName
,
azureShare
,
fullFilePath
);
resultUploadFile
=
await
uploadFileToAzure
(
fileServerClient
,
azureDirectory
,
fileName
,
azureShare
,
fullFilePath
);
}
else
{
// If filePath is a directory, recuisively copy it to azure
await
uploadDirectory
(
fileServerClient
,
String
.
Format
(
'
{0}/{1}
'
,
azureDirectory
,
fileName
),
azureShare
,
fullFilePath
);
resultUploadDir
=
await
uploadDirectory
(
fileServerClient
,
String
.
Format
(
'
{0}/{1}
'
,
azureDirectory
,
fileName
),
azureShare
,
fullFilePath
);
}
if
(
!
(
resultUploadFile
&&
resultUploadDir
))
{
deferred
.
resolve
(
false
);
return
deferred
.
promise
;
}
}
catch
(
error
)
{
deferred
.
re
ject
(
error
);
deferred
.
re
solve
(
false
);
return
deferred
.
promise
;
}
}
// All files/directories are copied successfully, resolve
deferred
.
resolve
();
deferred
.
resolve
(
true
);
return
deferred
.
promise
;
}
...
...
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts
View file @
871f031d
...
...
@@ -25,7 +25,7 @@ import * as path from 'path';
import
*
as
component
from
'
../../../common/component
'
;
import
{
getExperimentId
}
from
'
../../../common/experimentStartupInfo
'
;
import
{
JobApplicationForm
,
NNIManagerIpConfig
,
TrialJobApplicationForm
,
TrialJobDetail
JobApplicationForm
,
NNIManagerIpConfig
,
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobStatus
}
from
'
../../../common/trainingService
'
;
import
{
delay
,
generateParamFileName
,
getExperimentRootDir
,
uniqueString
}
from
'
../../../common/utils
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../../common/containerJobData
'
;
...
...
@@ -102,10 +102,13 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
//upload code files
const
trialJobOutputUrl
:
string
=
await
this
.
uploadCodeFiles
(
trialJobId
,
trialLocalTempFolder
);
let
initStatus
:
TrialJobStatus
=
'
WAITING
'
;
if
(
!
trialJobOutputUrl
)
{
initStatus
=
'
FAILED
'
;
}
const
trialJobDetail
:
KubernetesTrialJobDetail
=
new
KubernetesTrialJobDetail
(
trialJobId
,
'
WAITING
'
,
initStatus
,
Date
.
now
(),
trialWorkingFolder
,
form
,
...
...
@@ -208,24 +211,10 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
let
trialJobOutputUrl
:
string
=
''
;
if
(
this
.
fcClusterConfig
.
storageType
===
'
azureStorage
'
)
{
if
(
this
.
azureStorageClient
===
undefined
)
{
throw
new
Error
(
'
azureStorageClient is not initialized
'
);
}
try
{
//upload local files, including scripts for running the trial and configuration (e.g., hyperparameters) for the trial, to azure storage
await
AzureStorageClientUtility
.
uploadDirectory
(
this
.
azureStorageClient
,
`nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
,
this
.
azureStorageShare
,
`
${
trialLocalTempFolder
}
`
);
//upload code files to azure storage
await
AzureStorageClientUtility
.
uploadDirectory
(
this
.
azureStorageClient
,
`nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
,
this
.
azureStorageShare
,
`
${
this
.
fcTrialConfig
.
codeDir
}
`
);
trialJobOutputUrl
=
`https://
${
this
.
azureStorageAccountName
}
.file.core.windows.net/`
+
`
${
this
.
azureStorageShare
}
/
${
path
.
join
(
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
;
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
return
Promise
.
reject
(
error
);
}
const
azureFrameworkControllerClusterConfig
:
FrameworkControllerClusterConfigAzure
=
<
FrameworkControllerClusterConfigAzure
>
this
.
fcClusterConfig
;
trialJobOutputUrl
=
await
this
.
uploadFilesToAzureStorage
(
trialJobId
,
trialLocalTempFolder
,
this
.
fcTrialConfig
.
codeDir
,
azureFrameworkControllerClusterConfig
.
uploadRetryCount
);
}
else
if
(
this
.
fcClusterConfig
.
storageType
===
'
nfs
'
)
{
const
nfsFrameworkControllerClusterConfig
:
FrameworkControllerClusterConfigNFS
=
<
FrameworkControllerClusterConfigNFS
>
this
.
fcClusterConfig
;
...
...
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts
View file @
871f031d
...
...
@@ -27,7 +27,7 @@ import * as component from '../../../common/component';
import
{
getExperimentId
}
from
'
../../../common/experimentStartupInfo
'
;
import
{
JobApplicationForm
,
NNIManagerIpConfig
,
TrialJobApplicationForm
,
TrialJobDetail
JobApplicationForm
,
NNIManagerIpConfig
,
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobStatus
}
from
'
../../../common/trainingService
'
;
import
{
delay
,
generateParamFileName
,
getExperimentRootDir
,
uniqueString
}
from
'
../../../common/utils
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../../common/containerJobData
'
;
...
...
@@ -102,9 +102,13 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
await
this
.
prepareRunScript
(
trialLocalTempFolder
,
trialJobId
,
trialWorkingFolder
,
curTrialSequenceId
,
form
);
//upload files to sotrage
const
trialJobOutputUrl
:
string
=
await
this
.
uploadCodeFiles
(
trialJobId
,
trialLocalTempFolder
);
let
initStatus
:
TrialJobStatus
=
'
WAITING
'
;
if
(
!
trialJobOutputUrl
)
{
initStatus
=
'
FAILED
'
;
}
const
trialJobDetail
:
KubernetesTrialJobDetail
=
new
KubernetesTrialJobDetail
(
trialJobId
,
'
WAITING
'
,
initStatus
,
Date
.
now
(),
trialWorkingFolder
,
form
,
...
...
@@ -215,23 +219,8 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
if
(
this
.
azureStorageClient
===
undefined
)
{
throw
new
Error
(
'
azureStorageClient is not initialized
'
);
}
try
{
//upload local files, including scripts for running the trial and configuration (e.g., hyperparameters) for the trial, to azure storage
await
AzureStorageClientUtility
.
uploadDirectory
(
this
.
azureStorageClient
,
`nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
,
this
.
azureStorageShare
,
`
${
trialLocalTempFolder
}
`
);
//upload code files to azure storage
await
AzureStorageClientUtility
.
uploadDirectory
(
this
.
azureStorageClient
,
`nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
,
this
.
azureStorageShare
,
`
${
this
.
kubeflowTrialConfig
.
codeDir
}
`
);
trialJobOutputUrl
=
`https://
${
this
.
azureStorageAccountName
}
.file.core.windows.net/
${
this
.
azureStorageShare
}
`
+
`/
${
path
.
join
(
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
;
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
return
Promise
.
reject
(
error
);
}
const
azureKubeflowClusterConfig
:
KubeflowClusterConfigAzure
=
<
KubeflowClusterConfigAzure
>
this
.
kubeflowClusterConfig
;
trialJobOutputUrl
=
await
this
.
uploadFilesToAzureStorage
(
trialJobId
,
trialLocalTempFolder
,
this
.
kubeflowTrialConfig
.
codeDir
,
azureKubeflowClusterConfig
.
uploadRetryCount
);
}
else
if
(
this
.
kubeflowClusterConfig
.
storage
===
'
nfs
'
||
this
.
kubeflowClusterConfig
.
storage
===
undefined
)
{
const
nfsKubeflowClusterConfig
:
KubeflowClusterConfigNFS
=
<
KubeflowClusterConfigNFS
>
this
.
kubeflowClusterConfig
;
// Creat work dir for current trial in NFS directory
...
...
src/nni_manager/training_service/kubernetes/kubernetesConfig.ts
View file @
871f031d
...
...
@@ -75,16 +75,19 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig {
export
class
KubernetesClusterConfigAzure
extends
KubernetesClusterConfig
{
public
readonly
keyVault
:
KeyVaultConfig
;
public
readonly
azureStorage
:
AzureStorage
;
public
readonly
uploadRetryCount
:
number
|
undefined
;
constructor
(
apiVersion
:
string
,
keyVault
:
KeyVaultConfig
,
azureStorage
:
AzureStorage
,
storage
?:
KubernetesStorageKind
storage
?:
KubernetesStorageKind
,
uploadRetryCount
?:
number
)
{
super
(
apiVersion
,
storage
);
this
.
keyVault
=
keyVault
;
this
.
azureStorage
=
azureStorage
;
this
.
uploadRetryCount
=
uploadRetryCount
;
}
public
get
storageType
():
KubernetesStorageKind
{
...
...
@@ -98,7 +101,8 @@ export class KubernetesClusterConfigAzure extends KubernetesClusterConfig {
kubernetesClusterConfigObjectAzure
.
apiVersion
,
kubernetesClusterConfigObjectAzure
.
keyVault
,
kubernetesClusterConfigObjectAzure
.
azureStorage
,
kubernetesClusterConfigObjectAzure
.
storage
kubernetesClusterConfigObjectAzure
.
storage
,
kubernetesClusterConfigObjectAzure
.
uploadRetryCount
);
}
}
...
...
src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts
View file @
871f031d
...
...
@@ -31,13 +31,14 @@ import { getLogger, Logger } from '../../common/log';
import
{
NNIManagerIpConfig
,
TrialJobDetail
,
TrialJobMetric
}
from
'
../../common/trainingService
'
;
import
{
getExperimentRootDir
,
getIPV4Address
,
getJobCancelStatus
,
getVersion
,
uniqueString
}
from
'
../../common/utils
'
;
import
{
delay
,
getExperimentRootDir
,
getIPV4Address
,
getJobCancelStatus
,
getVersion
,
uniqueString
}
from
'
../../common/utils
'
;
import
{
AzureStorageClientUtility
}
from
'
./azureStorageClientUtils
'
;
import
{
GeneralK8sClient
,
KubernetesCRDClient
}
from
'
./kubernetesApiClient
'
;
import
{
KubernetesClusterConfig
}
from
'
./kubernetesConfig
'
;
import
{
kubernetesScriptFormat
,
KubernetesTrialJobDetail
}
from
'
./kubernetesData
'
;
import
{
KubernetesJobRestServer
}
from
'
./kubernetesJobRestServer
'
;
var
yaml
=
require
(
'
js-yaml
'
);
var
fs
=
require
(
'
fs
'
);
/**
...
...
@@ -357,6 +358,52 @@ abstract class KubernetesTrainingService {
);
return
registrySecretName
;
}
protected
async
uploadFilesToAzureStorage
(
trialJobId
:
string
,
trialLocalTempFolder
:
String
,
codeDir
:
String
,
uploadRetryCount
:
number
|
undefined
):
Promise
<
string
>
{
if
(
this
.
azureStorageClient
===
undefined
)
{
throw
new
Error
(
'
azureStorageClient is not initialized
'
);
}
let
trialJobOutputUrl
:
string
=
''
;
let
retryCount
:
number
=
1
;
if
(
uploadRetryCount
)
{
retryCount
=
uploadRetryCount
;
}
let
resultUploadNNIScript
:
boolean
=
false
;
let
resultUploadCodeFile
:
boolean
=
false
;
try
{
do
{
//upload local files, including scripts for running the trial and configuration (e.g., hyperparameters) for the trial, to azure storage
if
(
!
resultUploadNNIScript
)
{
resultUploadNNIScript
=
await
AzureStorageClientUtility
.
uploadDirectory
(
this
.
azureStorageClient
,
`nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
,
this
.
azureStorageShare
,
`
${
trialLocalTempFolder
}
`
);
}
//upload code files to azure storage
if
(
!
resultUploadCodeFile
)
{
resultUploadCodeFile
=
await
AzureStorageClientUtility
.
uploadDirectory
(
this
.
azureStorageClient
,
`nni/
${
getExperimentId
()}
/
${
trialJobId
}
`
,
this
.
azureStorageShare
,
`
${
codeDir
}
`
);
}
if
(
resultUploadNNIScript
&&
resultUploadCodeFile
)
{
trialJobOutputUrl
=
`https://
${
this
.
azureStorageAccountName
}
.file.core.windows.net/
${
this
.
azureStorageShare
}
`
+
`/
${
path
.
join
(
'
nni
'
,
getExperimentId
(),
trialJobId
,
'
output
'
)}
`
;
break
;
}
else
{
//wait for 5 seconds to re-upload files
await
delay
(
5000
);
this
.
log
.
info
(
'
Upload failed, Retry: upload files to azure-storage
'
);
}
}
while
(
retryCount
--
>=
0
)
}
catch
(
error
)
{
this
.
log
.
error
(
error
);
//return a empty url when got error
return
Promise
.
resolve
(
""
);
}
if
(
!
trialJobOutputUrl
)
{
this
.
log
.
info
(
`Retry-count is used up, upload files to azureStorage for trial
${
trialJobId
}
failed!`
);
}
return
Promise
.
resolve
(
trialJobOutputUrl
);
}
}
export
{
KubernetesTrainingService
};
src/nni_manager/training_service/pai/paiConfig.ts
View file @
871f031d
...
...
@@ -39,6 +39,8 @@ export class PAITaskRole {
public
readonly
command
:
string
;
//Shared memory for one task in the task role
public
readonly
shmMB
?:
number
;
//portList to specify the port used in container
public
portList
?:
portListMetaData
[];
/**
* Constructor
...
...
@@ -50,7 +52,7 @@ export class PAITaskRole {
* @param command Executable command for tasks in the task role, can not be empty
*/
constructor
(
name
:
string
,
taskNumber
:
number
,
cpuNumber
:
number
,
memoryMB
:
number
,
gpuNumber
:
number
,
command
:
string
,
shmMB
?:
number
)
{
command
:
string
,
shmMB
?:
number
,
portList
?:
portListMetaData
[]
)
{
this
.
name
=
name
;
this
.
taskNumber
=
taskNumber
;
this
.
cpuNumber
=
cpuNumber
;
...
...
@@ -58,6 +60,7 @@ export class PAITaskRole {
this
.
gpuNumber
=
gpuNumber
;
this
.
command
=
command
;
this
.
shmMB
=
shmMB
;
this
.
portList
=
portList
;
}
}
...
...
@@ -120,6 +123,16 @@ export class PAIClusterConfig {
}
}
/**
* portList data structure used in PAI taskRole
*/
export
class
portListMetaData
{
public
readonly
label
:
string
=
''
;
public
readonly
beginAt
:
number
=
0
;
public
readonly
portNumber
:
number
=
0
;
}
/**
* PAI trial configuration
*/
...
...
@@ -134,9 +147,11 @@ export class NNIPAITrialConfig extends TrialConfig {
public
shmMB
?:
number
;
//authentication file used for private Docker registry
public
authFile
?:
string
;
//portList to specify the port used in container
public
portList
?:
portListMetaData
[];
constructor
(
command
:
string
,
codeDir
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
,
virtualCluster
?:
string
,
shmMB
?:
number
,
authFile
?:
string
)
{
image
:
string
,
virtualCluster
?:
string
,
shmMB
?:
number
,
authFile
?:
string
,
portList
?:
portListMetaData
[]
)
{
super
(
command
,
codeDir
,
gpuNum
);
this
.
cpuNum
=
cpuNum
;
this
.
memoryMB
=
memoryMB
;
...
...
@@ -144,5 +159,6 @@ export class NNIPAITrialConfig extends TrialConfig {
this
.
virtualCluster
=
virtualCluster
;
this
.
shmMB
=
shmMB
;
this
.
authFile
=
authFile
;
this
.
portList
=
portList
;
}
}
src/nni_manager/training_service/pai/paiTrainingService.ts
View file @
871f031d
...
...
@@ -79,6 +79,7 @@ class PAITrainingService implements TrainingService {
private
logCollection
:
string
;
private
isMultiPhase
:
boolean
=
false
;
private
authFileHdfsPath
:
string
|
undefined
=
undefined
;
private
portList
?:
string
|
undefined
;
constructor
()
{
this
.
log
=
getLogger
();
...
...
@@ -446,6 +447,8 @@ class PAITrainingService implements TrainingService {
nniPaiTrialCommand
,
// Task shared memory
this
.
paiTrialConfig
.
shmMB
,
// Task portList
this
.
paiTrialConfig
.
portList
)
];
...
...
src/nni_manager/yarn.lock
View file @
871f031d
...
...
@@ -1410,14 +1410,7 @@ js-tokens@^4.0.0:
version "4.0.0"
resolved "https://registry.yarnpkg.com/js-tokens/-/js-tokens-4.0.0.tgz#19203fb59991df98e3a287050d4647cdeaf32499"
js-yaml@^3.10.0:
version "3.12.0"
resolved "https://registry.yarnpkg.com/js-yaml/-/js-yaml-3.12.0.tgz#eaed656ec8344f10f527c6bfa1b6e2244de167d1"
dependencies:
argparse "^1.0.7"
esprima "^4.0.0"
js-yaml@^3.13.1:
js-yaml@^3.10.0, js-yaml@^3.13.1:
version "3.13.1"
resolved "https://registry.yarnpkg.com/js-yaml/-/js-yaml-3.13.1.tgz#aff151b30bfdfa8e49e05da22e7415e9dfa37847"
dependencies:
...
...
src/sdk/pynni/nni/nas_utils.py
View file @
871f031d
...
...
@@ -32,7 +32,7 @@ def classic_mode(
'''Execute the chosen function and inputs directly.
In this mode, the trial code is only running the chosen subgraph (i.e., the chosen ops and inputs),
without touching the full model graph.'''
if
trial
.
_params
is
None
:
if
trial
.
get_current_parameter
()
is
None
:
trial
.
get_next_parameter
()
mutable_block
=
trial
.
get_current_parameter
(
mutable_id
)
chosen_layer
=
mutable_block
[
mutable_layer_id
][
"chosen_layer"
]
...
...
@@ -118,7 +118,7 @@ def oneshot_mode(
The difference is that oneshot mode does not receive subgraph.
Instead, it uses dropout to randomly dropout inputs and ops.'''
# NNI requires to get_next_parameter before report a result. But the parameter will not be used in this mode
if
trial
.
_params
is
None
:
if
trial
.
get_current_parameter
()
is
None
:
trial
.
get_next_parameter
()
optional_inputs
=
list
(
optional_inputs
.
values
())
inputs_num
=
len
(
optional_inputs
)
...
...
src/sdk/pynni/nni/smartparam.py
View file @
871f031d
...
...
@@ -189,6 +189,6 @@ else:
raise
RuntimeError
(
'Unrecognized mode: %s'
%
mode
)
def
_get_param
(
key
):
if
trial
.
_params
is
None
:
if
trial
.
get_current_parameter
()
is
None
:
trial
.
get_next_parameter
()
return
trial
.
get_current_parameter
(
key
)
src/sdk/pynni/nni/trial.py
View file @
871f031d
...
...
@@ -50,10 +50,12 @@ def get_next_parameter():
return
None
return
_params
[
'parameters'
]
def
get_current_parameter
(
tag
):
def
get_current_parameter
(
tag
=
None
):
global
_params
if
_params
is
None
:
return
None
if
tag
is
None
:
return
_params
[
'parameters'
]
return
_params
[
'parameters'
][
tag
]
def
get_experiment_id
():
...
...
src/webui/src/components/trial-detail/DefaultMetricPoint.tsx
View file @
871f031d
...
...
@@ -85,7 +85,9 @@ class DefaultPoint extends React.Component<DefaultPointProps, DefaultPointState>
});
// deal with best metric line
const
bestCurve
:
Array
<
number
|
object
>
[]
=
[];
// best curve data source
bestCurve
.
push
([
lineListDefault
[
0
][
0
],
lineListDefault
[
0
][
1
],
accSource
[
0
].
searchSpace
]);
if
(
lineListDefault
[
0
]
!==
undefined
)
{
bestCurve
.
push
([
lineListDefault
[
0
][
0
],
lineListDefault
[
0
][
1
],
accSource
[
0
].
searchSpace
]);
}
if
(
optimize
===
'
maximize
'
)
{
for
(
let
i
=
1
;
i
<
lineListDefault
.
length
;
i
++
)
{
const
val
=
lineListDefault
[
i
][
1
];
...
...
src/webui/src/static/style/table.scss
View file @
871f031d
...
...
@@ -115,6 +115,7 @@
}
#detail-button
{
margin
:
2px
0
;
.common-style
,
.common-style
:visited
,
.common-style
:focus
{
height
:
26px
;
border
:
none
;
...
...
@@ -131,7 +132,7 @@
.common-style
:disabled
{
background-color
:
#f4f4f4
;
}
.special
,
.special
:visited
,
.special
:focus
{
.special
,
.special
:visited
,
.special
:focus
,
.special
button
{
height
:
26px
;
border
:
none
;
border-radius
:
0
;
...
...
@@ -146,7 +147,7 @@
background-color
:
#c8c8c8
;
outline
:
0
;
}
.special
:disabled
{
.special
:disabled
,
.special
button
:disabled
{
background-color
:
#f4f4f4
;
color
:
#d9d9d9
;
}
...
...
tools/nni_cmd/config_schema.py
View file @
871f031d
...
...
@@ -240,7 +240,12 @@ pai_trial_schema = {
Optional
(
'outputDir'
):
And
(
Regex
(
r
'hdfs://(([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(/.*)?'
),
\
error
=
'ERROR: outputDir format error, outputDir format is hdfs://xxx.xxx.xxx.xxx:xxx'
),
Optional
(
'virtualCluster'
):
setType
(
'virtualCluster'
,
str
),
Optional
(
'nasMode'
):
setChoice
(
'nasMode'
,
'classic_mode'
,
'enas_mode'
,
'oneshot_mode'
,
'darts_mode'
)
Optional
(
'nasMode'
):
setChoice
(
'nasMode'
,
'classic_mode'
,
'enas_mode'
,
'oneshot_mode'
,
'darts_mode'
),
Optional
(
'portList'
):
[{
"label"
:
setType
(
'label'
,
str
),
"beginAt"
:
setType
(
'beginAt'
,
int
),
"portNumber"
:
setType
(
'portNumber'
,
int
)
}]
}
}
...
...
@@ -310,7 +315,8 @@ kubeflow_config_schema = {
error
=
'ERROR: accountName format error, accountName support using (0-9|a-z|A-Z|-)'
),
'azureShare'
:
And
(
Regex
(
'([0-9]|[a-z]|[A-Z]|-){3,63}'
),
\
error
=
'ERROR: azureShare format error, azureShare support using (0-9|a-z|A-Z|-)'
)
}
},
Optional
(
'uploadRetryCount'
):
setNumberRange
(
'uploadRetryCount'
,
int
,
1
,
99999
)
})
}
...
...
@@ -356,7 +362,8 @@ frameworkcontroller_config_schema = {
error
=
'ERROR: accountName format error, accountName support using (0-9|a-z|A-Z|-)'
),
'azureShare'
:
And
(
Regex
(
'([0-9]|[a-z]|[A-Z]|-){3,63}'
),
\
error
=
'ERROR: azureShare format error, azureShare support using (0-9|a-z|A-Z|-)'
)
}
},
Optional
(
'uploadRetryCount'
):
setNumberRange
(
'uploadRetryCount'
,
int
,
1
,
99999
)
})
}
...
...
tools/nni_cmd/launcher_utils.py
View file @
871f031d
...
...
@@ -198,10 +198,7 @@ def validate_common_content(experiment_config):
Schema
({
**
separate_schema_dict
[
separate_key
][
'customized'
]}).
validate
(
experiment_config
[
separate_key
])
except
SchemaError
as
error
:
print_error
(
'Your config file is not correct, please check your config file content!'
)
if
error
.
__str__
().
__contains__
(
'Wrong key'
):
print_error
(
' '
.
join
(
error
.
__str__
().
split
()[:
3
]))
else
:
print_error
(
error
)
print_error
(
error
.
code
)
exit
(
1
)
#set default value
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment