Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
88ef6c04
Unverified
Commit
88ef6c04
authored
Aug 01, 2019
by
SparkSnail
Committed by
GitHub
Aug 01, 2019
Browse files
Merge pull request #197 from microsoft/master
merge master
parents
5f3c5ffd
555334de
Changes
48
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
951 additions
and
61 deletions
+951
-61
docs/img/parallel_tpe_search_result.PNG
docs/img/parallel_tpe_search_result.PNG
+0
-0
docs/img/parallel_tpe_search_tpe.PNG
docs/img/parallel_tpe_search_tpe.PNG
+0
-0
examples/notebooks/retrieve_nni_info_with_python.ipynb
examples/notebooks/retrieve_nni_info_with_python.ipynb
+497
-0
examples/trials/auto-feature-engineering/README.md
examples/trials/auto-feature-engineering/README.md
+8
-0
examples/tuners/random_nas_tuner/random_nas_tuner.py
examples/tuners/random_nas_tuner/random_nas_tuner.py
+3
-1
setup.py
setup.py
+2
-1
src/nni_manager/rest_server/restValidationSchemas.ts
src/nni_manager/rest_server/restValidationSchemas.ts
+5
-0
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerConfig.ts
...bernetes/frameworkcontroller/frameworkcontrollerConfig.ts
+2
-2
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts
...frameworkcontroller/frameworkcontrollerTrainingService.ts
+25
-14
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowConfig.ts
...er/training_service/kubernetes/kubeflow/kubeflowConfig.ts
+2
-2
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts
...ng_service/kubernetes/kubeflow/kubeflowTrainingService.ts
+45
-32
src/nni_manager/training_service/kubernetes/kubernetesConfig.ts
...i_manager/training_service/kubernetes/kubernetesConfig.ts
+5
-1
src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts
.../training_service/kubernetes/kubernetesTrainingService.ts
+31
-0
src/nni_manager/training_service/pai/paiConfig.ts
src/nni_manager/training_service/pai/paiConfig.ts
+8
-2
src/nni_manager/training_service/pai/paiTrainingService.ts
src/nni_manager/training_service/pai/paiTrainingService.ts
+4
-2
src/sdk/pycli/nnicli/__init__.py
src/sdk/pycli/nnicli/__init__.py
+21
-0
src/sdk/pycli/nnicli/nni_client.py
src/sdk/pycli/nnicli/nni_client.py
+156
-0
src/sdk/pycli/setup.py
src/sdk/pycli/setup.py
+18
-0
src/sdk/pynni/nni/hyperopt_tuner/hyperopt_tuner.py
src/sdk/pynni/nni/hyperopt_tuner/hyperopt_tuner.py
+63
-4
test/cli_test.py
test/cli_test.py
+56
-0
No files found.
docs/img/parallel_tpe_search_result.PNG
0 → 100644
View file @
88ef6c04
363 KB
docs/img/parallel_tpe_search_tpe.PNG
0 → 100644
View file @
88ef6c04
9.5 KB
examples/notebooks/retrieve_nni_info_with_python.ipynb
0 → 100644
View file @
88ef6c04
This diff is collapsed.
Click to expand it.
examples/trials/auto-feature-engineering/README.md
0 → 100644
View file @
88ef6c04
**Automatic Feature Engineering in nni**
===
Now we have an
[
example
](
https://github.com/SpongebBob/tabular_automl_NNI
)
, which can automatically do feature engineering in nni.
This code comes from our contributors. Many thanks to our lovely contributors!
More and more people are welcome to join us!
examples/tuners/random_nas_tuner/random_nas_tuner.py
View file @
88ef6c04
...
...
@@ -7,7 +7,9 @@ def random_archi_generator(nas_ss, random_state):
'''
chosen_archi
=
{}
print
(
"zql: nas search space: "
,
nas_ss
)
for
block_name
,
block
in
nas_ss
.
items
():
for
block_name
,
block_value
in
nas_ss
.
items
():
assert
block_value
[
'_type'
]
==
"mutable_layer"
,
"Random NAS Tuner only receives NAS search space whose _type is 'mutable_layer'"
block
=
block_value
[
'_value'
]
tmp_block
=
{}
for
layer_name
,
layer
in
block
.
items
():
tmp_layer
=
{}
...
...
setup.py
View file @
88ef6c04
...
...
@@ -35,9 +35,10 @@ setup(
license
=
'MIT'
,
url
=
'https://github.com/Microsoft/nni'
,
packages
=
find_packages
(
'src/sdk/pynni'
,
exclude
=
[
'tests'
])
+
find_packages
(
'tools'
),
packages
=
find_packages
(
'src/sdk/pynni'
,
exclude
=
[
'tests'
])
+
find_packages
(
'src/sdk/pycli'
)
+
find_packages
(
'tools'
),
package_dir
=
{
'nni'
:
'src/sdk/pynni/nni'
,
'nnicli'
:
'src/sdk/pycli/nnicli'
,
'nni_annotation'
:
'tools/nni_annotation'
,
'nni_cmd'
:
'tools/nni_cmd'
,
'nni_trial_tool'
:
'tools/nni_trial_tool'
,
...
...
src/nni_manager/rest_server/restValidationSchemas.ts
View file @
88ef6c04
...
...
@@ -51,10 +51,12 @@ export namespace ValidationSchemas {
command
:
joi
.
string
().
min
(
1
),
virtualCluster
:
joi
.
string
(),
shmMB
:
joi
.
number
(),
authFile
:
joi
.
string
(),
nasMode
:
joi
.
string
().
valid
(
'
classic_mode
'
,
'
enas_mode
'
,
'
oneshot_mode
'
),
worker
:
joi
.
object
({
replicas
:
joi
.
number
().
min
(
1
).
required
(),
image
:
joi
.
string
().
min
(
1
),
privateRegistryAuthPath
:
joi
.
string
().
min
(
1
),
outputDir
:
joi
.
string
(),
cpuNum
:
joi
.
number
().
min
(
1
),
memoryMB
:
joi
.
number
().
min
(
100
),
...
...
@@ -64,6 +66,7 @@ export namespace ValidationSchemas {
ps
:
joi
.
object
({
replicas
:
joi
.
number
().
min
(
1
).
required
(),
image
:
joi
.
string
().
min
(
1
),
privateRegistryAuthPath
:
joi
.
string
().
min
(
1
),
outputDir
:
joi
.
string
(),
cpuNum
:
joi
.
number
().
min
(
1
),
memoryMB
:
joi
.
number
().
min
(
100
),
...
...
@@ -73,6 +76,7 @@ export namespace ValidationSchemas {
master
:
joi
.
object
({
replicas
:
joi
.
number
().
min
(
1
).
required
(),
image
:
joi
.
string
().
min
(
1
),
privateRegistryAuthPath
:
joi
.
string
().
min
(
1
),
outputDir
:
joi
.
string
(),
cpuNum
:
joi
.
number
().
min
(
1
),
memoryMB
:
joi
.
number
().
min
(
100
),
...
...
@@ -83,6 +87,7 @@ export namespace ValidationSchemas {
name
:
joi
.
string
().
min
(
1
),
taskNum
:
joi
.
number
().
min
(
1
).
required
(),
image
:
joi
.
string
().
min
(
1
),
privateRegistryAuthPath
:
joi
.
string
().
min
(
1
),
outputDir
:
joi
.
string
(),
cpuNum
:
joi
.
number
().
min
(
1
),
memoryMB
:
joi
.
number
().
min
(
100
),
...
...
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerConfig.ts
View file @
88ef6c04
...
...
@@ -43,8 +43,8 @@ export class FrameworkControllerTrialConfigTemplate extends KubernetesTrialConfi
public
readonly
taskNum
:
number
;
constructor
(
taskNum
:
number
,
command
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
,
frameworkAttemptCompletionPolicy
:
FrameworkAttemptCompletionPolicy
)
{
super
(
command
,
gpuNum
,
cpuNum
,
memoryMB
,
image
);
frameworkAttemptCompletionPolicy
:
FrameworkAttemptCompletionPolicy
,
privateRegistryFilePath
?:
string
|
undefined
)
{
super
(
command
,
gpuNum
,
cpuNum
,
memoryMB
,
image
,
privateRegistryFilePath
);
this
.
frameworkAttemptCompletionPolicy
=
frameworkAttemptCompletionPolicy
;
this
.
name
=
name
;
this
.
taskNum
=
taskNum
;
...
...
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts
View file @
88ef6c04
...
...
@@ -305,7 +305,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
}
// Generate frameworkcontroller job resource config object
const
frameworkcontrollerJobConfig
:
any
=
this
.
generateFrameworkControllerJobConfig
(
trialJobId
,
trialWorkingFolder
,
frameworkcontrollerJobName
,
podResources
);
await
this
.
generateFrameworkControllerJobConfig
(
trialJobId
,
trialWorkingFolder
,
frameworkcontrollerJobName
,
podResources
);
return
Promise
.
resolve
(
frameworkcontrollerJobConfig
);
}
...
...
@@ -329,8 +329,8 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
* @param frameworkcontrollerJobName job name
* @param podResources pod template
*/
private
generateFrameworkControllerJobConfig
(
trialJobId
:
string
,
trialWorkingFolder
:
string
,
frameworkcontrollerJobName
:
string
,
podResources
:
any
)
:
any
{
private
async
generateFrameworkControllerJobConfig
(
trialJobId
:
string
,
trialWorkingFolder
:
string
,
frameworkcontrollerJobName
:
string
,
podResources
:
any
)
:
Promise
<
any
>
{
if
(
this
.
fcClusterConfig
===
undefined
)
{
throw
new
Error
(
'
frameworkcontroller Cluster config is not initialized
'
);
}
...
...
@@ -345,12 +345,14 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
if
(
containerPort
===
undefined
)
{
throw
new
Error
(
'
Container port is not initialized
'
);
}
const
taskRole
:
any
=
this
.
generateTaskRoleConfig
(
trialWorkingFolder
,
this
.
fcTrialConfig
.
taskRoles
[
index
].
image
,
`run_
${
this
.
fcTrialConfig
.
taskRoles
[
index
].
name
}
.sh`
,
podResources
[
index
],
containerPort
containerPort
,
await
this
.
createRegistrySecret
(
this
.
fcTrialConfig
.
taskRoles
[
index
].
privateRegistryAuthPath
)
);
taskRoles
.
push
({
name
:
this
.
fcTrialConfig
.
taskRoles
[
index
].
name
,
...
...
@@ -363,7 +365,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
});
}
return
{
return
Promise
.
resolve
(
{
apiVersion
:
`frameworkcontroller.microsoft.com/v1`
,
kind
:
'
Framework
'
,
metadata
:
{
...
...
@@ -379,11 +381,11 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
executionType
:
'
Start
'
,
taskRoles
:
taskRoles
}
};
}
)
;
}
private
generateTaskRoleConfig
(
trialWorkingFolder
:
string
,
replicaImage
:
string
,
runScriptFile
:
string
,
podResources
:
any
,
containerPort
:
number
):
any
{
private
generateTaskRoleConfig
(
trialWorkingFolder
:
string
,
replicaImage
:
string
,
runScriptFile
:
string
,
podResources
:
any
,
containerPort
:
number
,
privateRegistrySecretName
:
string
|
undefined
):
any
{
if
(
this
.
fcClusterConfig
===
undefined
)
{
throw
new
Error
(
'
frameworkcontroller Cluster config is not initialized
'
);
}
...
...
@@ -451,13 +453,22 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
mountPath
:
'
/mnt/frameworkbarrier
'
}]
}];
const
spec
:
any
=
{
containers
:
containers
,
initContainers
:
initContainers
,
restartPolicy
:
'
OnFailure
'
,
volumes
:
volumeSpecMap
.
get
(
'
nniVolumes
'
),
hostNetwork
:
false
let
spec
:
any
=
{
containers
:
containers
,
initContainers
:
initContainers
,
restartPolicy
:
'
OnFailure
'
,
volumes
:
volumeSpecMap
.
get
(
'
nniVolumes
'
),
hostNetwork
:
false
};
if
(
privateRegistrySecretName
)
{
spec
.
imagePullSecrets
=
[
{
name
:
privateRegistrySecretName
}
]
}
if
(
this
.
fcClusterConfig
.
serviceAccountName
!==
undefined
)
{
spec
.
serviceAccountName
=
this
.
fcClusterConfig
.
serviceAccountName
;
}
...
...
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowConfig.ts
View file @
88ef6c04
...
...
@@ -135,8 +135,8 @@ export class KubeflowTrialConfig extends KubernetesTrialConfig {
export
class
KubeflowTrialConfigTemplate
extends
KubernetesTrialConfigTemplate
{
public
readonly
replicas
:
number
;
constructor
(
replicas
:
number
,
command
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
)
{
super
(
command
,
gpuNum
,
cpuNum
,
memoryMB
,
image
);
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
,
privateRegistryAuthPath
?:
string
)
{
super
(
command
,
gpuNum
,
cpuNum
,
memoryMB
,
image
,
privateRegistryAuthPath
);
this
.
replicas
=
replicas
;
}
}
...
...
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts
View file @
88ef6c04
...
...
@@ -347,7 +347,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
}
// Generate kubeflow job resource config object
const
kubeflowJobConfig
:
any
=
this
.
generateKubeflowJobConfig
(
trialJobId
,
trialWorkingFolder
,
kubeflowJobName
,
workerPodResources
,
const
kubeflowJobConfig
:
any
=
await
this
.
generateKubeflowJobConfig
(
trialJobId
,
trialWorkingFolder
,
kubeflowJobName
,
workerPodResources
,
nonWorkerResources
);
return
Promise
.
resolve
(
kubeflowJobConfig
);
...
...
@@ -361,8 +361,8 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
* @param workerPodResources worker pod template
* @param nonWorkerPodResources non-worker pod template, like ps or master
*/
private
generateKubeflowJobConfig
(
trialJobId
:
string
,
trialWorkingFolder
:
string
,
kubeflowJobName
:
string
,
workerPodResources
:
any
,
nonWorkerPodResources
?:
any
)
:
any
{
private
async
generateKubeflowJobConfig
(
trialJobId
:
string
,
trialWorkingFolder
:
string
,
kubeflowJobName
:
string
,
workerPodResources
:
any
,
nonWorkerPodResources
?:
any
)
:
Promise
<
any
>
{
if
(
this
.
kubeflowClusterConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
}
...
...
@@ -377,29 +377,32 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
const
replicaSpecsObj
:
any
=
{};
const
replicaSpecsObjMap
:
Map
<
string
,
object
>
=
new
Map
<
string
,
object
>
();
if
(
this
.
kubeflowTrialConfig
.
operatorType
===
'
tf-operator
'
)
{
const
tensorflowTrialConfig
:
KubeflowTrialConfigTensorflow
=
<
KubeflowTrialConfigTensorflow
>
this
.
kubeflowTrialConfig
;
let
privateRegistrySecretName
=
await
this
.
createRegistrySecret
(
tensorflowTrialConfig
.
worker
.
privateRegistryAuthPath
);
replicaSpecsObj
.
Worker
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
tensorflowTrialConfig
.
worker
.
replicas
,
tensorflowTrialConfig
.
worker
.
image
,
'
run_worker.sh
'
,
workerPodResources
);
tensorflowTrialConfig
.
worker
.
image
,
'
run_worker.sh
'
,
workerPodResources
,
privateRegistrySecretName
);
if
(
tensorflowTrialConfig
.
ps
!==
undefined
)
{
let
privateRegistrySecretName
:
string
|
undefined
=
await
this
.
createRegistrySecret
(
tensorflowTrialConfig
.
ps
.
privateRegistryAuthPath
);
replicaSpecsObj
.
Ps
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
tensorflowTrialConfig
.
ps
.
replicas
,
tensorflowTrialConfig
.
ps
.
image
,
'
run_ps.sh
'
,
nonWorkerPodResources
);
tensorflowTrialConfig
.
ps
.
image
,
'
run_ps.sh
'
,
nonWorkerPodResources
,
privateRegistrySecretName
);
}
replicaSpecsObjMap
.
set
(
this
.
kubernetesCRDClient
.
jobKind
,
{
tfReplicaSpecs
:
replicaSpecsObj
});
}
else
if
(
this
.
kubeflowTrialConfig
.
operatorType
===
'
pytorch-operator
'
)
{
const
pytorchTrialConfig
:
KubeflowTrialConfigPytorch
=
<
KubeflowTrialConfigPytorch
>
this
.
kubeflowTrialConfig
;
if
(
pytorchTrialConfig
.
worker
!==
undefined
)
{
let
privateRegistrySecretName
:
string
|
undefined
=
await
this
.
createRegistrySecret
(
pytorchTrialConfig
.
worker
.
privateRegistryAuthPath
);
replicaSpecsObj
.
Worker
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
pytorchTrialConfig
.
worker
.
replicas
,
pytorchTrialConfig
.
worker
.
image
,
'
run_worker.sh
'
,
workerPodResources
);
pytorchTrialConfig
.
worker
.
image
,
'
run_worker.sh
'
,
workerPodResources
,
privateRegistrySecretName
);
}
let
privateRegistrySecretName
:
string
|
undefined
=
await
this
.
createRegistrySecret
(
pytorchTrialConfig
.
master
.
privateRegistryAuthPath
);
replicaSpecsObj
.
Master
=
this
.
generateReplicaConfig
(
trialWorkingFolder
,
pytorchTrialConfig
.
master
.
replicas
,
pytorchTrialConfig
.
master
.
image
,
'
run_master.sh
'
,
nonWorkerPodResources
);
pytorchTrialConfig
.
master
.
image
,
'
run_master.sh
'
,
nonWorkerPodResources
,
privateRegistrySecretName
);
replicaSpecsObjMap
.
set
(
this
.
kubernetesCRDClient
.
jobKind
,
{
pytorchReplicaSpecs
:
replicaSpecsObj
});
}
return
{
return
Promise
.
resolve
(
{
apiVersion
:
`kubeflow.org/
${
this
.
kubernetesCRDClient
.
apiVersion
}
`
,
kind
:
this
.
kubernetesCRDClient
.
jobKind
,
metadata
:
{
...
...
@@ -412,7 +415,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
}
},
spec
:
replicaSpecsObjMap
.
get
(
this
.
kubernetesCRDClient
.
jobKind
)
};
}
)
;
}
/**
...
...
@@ -424,7 +427,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
* @param podResources pod resource config section
*/
private
generateReplicaConfig
(
trialWorkingFolder
:
string
,
replicaNumber
:
number
,
replicaImage
:
string
,
runScriptFile
:
string
,
podResources
:
any
):
any
{
podResources
:
any
,
privateRegistrySecretName
:
string
|
undefined
):
any
{
if
(
this
.
kubeflowClusterConfig
===
undefined
)
{
throw
new
Error
(
'
Kubeflow Cluster config is not initialized
'
);
}
...
...
@@ -436,7 +439,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
if
(
this
.
kubernetesCRDClient
===
undefined
)
{
throw
new
Error
(
'
Kubeflow operator client is not initialized
'
);
}
// The config spec for volume field
const
volumeSpecMap
:
Map
<
string
,
object
>
=
new
Map
<
string
,
object
>
();
if
(
this
.
kubeflowClusterConfig
.
storageType
===
'
azureStorage
'
)
{
volumeSpecMap
.
set
(
'
nniVolumes
'
,
[
...
...
@@ -459,7 +462,34 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
}
}]);
}
// The config spec for container field
const
containersSpecMap
:
Map
<
string
,
object
>
=
new
Map
<
string
,
object
>
();
containersSpecMap
.
set
(
'
containers
'
,
[
{
// Kubeflow tensorflow operator requires that containers' name must be tensorflow
// TODO: change the name based on operator's type
name
:
this
.
kubernetesCRDClient
.
containerName
,
image
:
replicaImage
,
args
:
[
'
sh
'
,
`
${
path
.
join
(
trialWorkingFolder
,
runScriptFile
)}
`
],
volumeMounts
:
[
{
name
:
'
nni-vol
'
,
mountPath
:
this
.
CONTAINER_MOUNT_PATH
}],
resources
:
podResources
}
]);
let
spec
:
any
=
{
containers
:
containersSpecMap
.
get
(
'
containers
'
),
restartPolicy
:
'
ExitCode
'
,
volumes
:
volumeSpecMap
.
get
(
'
nniVolumes
'
)
}
if
(
privateRegistrySecretName
)
{
spec
.
imagePullSecrets
=
[
{
name
:
privateRegistrySecretName
}]
}
return
{
replicas
:
replicaNumber
,
template
:
{
...
...
@@ -467,26 +497,9 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
// tslint:disable-next-line:no-null-keyword
creationTimestamp
:
null
},
spec
:
{
containers
:
[
{
// Kubeflow tensorflow operator requires that containers' name must be tensorflow
// TODO: change the name based on operator's type
name
:
this
.
kubernetesCRDClient
.
containerName
,
image
:
replicaImage
,
args
:
[
'
sh
'
,
`
${
path
.
join
(
trialWorkingFolder
,
runScriptFile
)}
`
],
volumeMounts
:
[
{
name
:
'
nni-vol
'
,
mountPath
:
this
.
CONTAINER_MOUNT_PATH
}],
resources
:
podResources
}],
restartPolicy
:
'
ExitCode
'
,
volumes
:
volumeSpecMap
.
get
(
'
nniVolumes
'
)
}
spec
:
spec
}
}
;
}
}
}
// tslint:enable: no-unsafe-any no-any
...
...
src/nni_manager/training_service/kubernetes/kubernetesConfig.ts
View file @
88ef6c04
...
...
@@ -179,6 +179,9 @@ export class KubernetesTrialConfigTemplate {
// Docker image
public
readonly
image
:
string
;
// Private registry config file path used to download the docker image
public
readonly
privateRegistryAuthPath
?:
string
;
// Trial command
public
readonly
command
:
string
;
...
...
@@ -186,12 +189,13 @@ export class KubernetesTrialConfigTemplate {
public
readonly
gpuNum
:
number
;
constructor
(
command
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
)
{
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
,
privateRegistryAuthPath
?:
string
)
{
this
.
command
=
command
;
this
.
gpuNum
=
gpuNum
;
this
.
cpuNum
=
cpuNum
;
this
.
memoryMB
=
memoryMB
;
this
.
image
=
image
;
this
.
privateRegistryAuthPath
=
privateRegistryAuthPath
;
}
}
...
...
src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts
View file @
88ef6c04
...
...
@@ -38,6 +38,8 @@ import { KubernetesClusterConfig } from './kubernetesConfig';
import
{
kubernetesScriptFormat
,
KubernetesTrialJobDetail
}
from
'
./kubernetesData
'
;
import
{
KubernetesJobRestServer
}
from
'
./kubernetesJobRestServer
'
;
var
fs
=
require
(
'
fs
'
);
/**
* Training Service implementation for Kubernetes
*/
...
...
@@ -327,5 +329,34 @@ abstract class KubernetesTrainingService {
return
Promise
.
resolve
();
}
protected
async
createRegistrySecret
(
filePath
:
string
|
undefined
):
Promise
<
string
|
undefined
>
{
if
(
filePath
===
undefined
||
filePath
===
''
)
{
return
undefined
;
}
let
body
=
fs
.
readFileSync
(
filePath
).
toString
(
'
base64
'
);
let
registrySecretName
=
String
.
Format
(
'
nni-secret-{0}
'
,
uniqueString
(
8
)
.
toLowerCase
());
await
this
.
genericK8sClient
.
createSecret
(
{
apiVersion
:
'
v1
'
,
kind
:
'
Secret
'
,
metadata
:
{
name
:
registrySecretName
,
namespace
:
'
default
'
,
labels
:
{
app
:
this
.
NNI_KUBERNETES_TRIAL_LABEL
,
expId
:
getExperimentId
()
}
},
type
:
'
kubernetes.io/dockerconfigjson
'
,
data
:
{
'
.dockerconfigjson
'
:
body
}
}
);
return
registrySecretName
;
}
}
export
{
KubernetesTrainingService
};
src/nni_manager/training_service/pai/paiConfig.ts
View file @
88ef6c04
...
...
@@ -71,6 +71,8 @@ export class PAIJobConfig {
public
readonly
image
:
string
;
// Code directory on HDFS
public
readonly
codeDir
:
string
;
//authentication file used for private Docker registry
public
readonly
authFile
?:
string
;
// List of taskRole, one task role at least
public
taskRoles
:
PAITaskRole
[];
...
...
@@ -87,12 +89,13 @@ export class PAIJobConfig {
* @param taskRoles List of taskRole, one task role at least
*/
constructor
(
jobName
:
string
,
image
:
string
,
codeDir
:
string
,
taskRoles
:
PAITaskRole
[],
virtualCluster
:
string
)
{
taskRoles
:
PAITaskRole
[],
virtualCluster
:
string
,
authFile
?:
string
)
{
this
.
jobName
=
jobName
;
this
.
image
=
image
;
this
.
codeDir
=
codeDir
;
this
.
taskRoles
=
taskRoles
;
this
.
virtualCluster
=
virtualCluster
;
this
.
authFile
=
authFile
;
}
}
...
...
@@ -129,14 +132,17 @@ export class NNIPAITrialConfig extends TrialConfig {
public
virtualCluster
?:
string
;
//Shared memory for one task in the task role
public
shmMB
?:
number
;
//authentication file used for private Docker registry
public
authFile
?:
string
;
constructor
(
command
:
string
,
codeDir
:
string
,
gpuNum
:
number
,
cpuNum
:
number
,
memoryMB
:
number
,
image
:
string
,
virtualCluster
?:
string
,
shmMB
?:
number
)
{
image
:
string
,
virtualCluster
?:
string
,
shmMB
?:
number
,
authFile
?:
string
)
{
super
(
command
,
codeDir
,
gpuNum
);
this
.
cpuNum
=
cpuNum
;
this
.
memoryMB
=
memoryMB
;
this
.
image
=
image
;
this
.
virtualCluster
=
virtualCluster
;
this
.
shmMB
=
shmMB
;
this
.
authFile
=
authFile
;
}
}
src/nni_manager/training_service/pai/paiTrainingService.ts
View file @
88ef6c04
...
...
@@ -442,7 +442,7 @@ class PAITrainingService implements TrainingService {
// Task command
nniPaiTrialCommand
,
// Task shared memory
this
.
paiTrialConfig
.
shmMB
this
.
paiTrialConfig
.
shmMB
,
)
];
...
...
@@ -456,7 +456,9 @@ class PAITrainingService implements TrainingService {
// PAI Task roles
paiTaskRoles
,
// Add Virtual Cluster
this
.
paiTrialConfig
.
virtualCluster
===
undefined
?
'
default
'
:
this
.
paiTrialConfig
.
virtualCluster
.
toString
()
this
.
paiTrialConfig
.
virtualCluster
===
undefined
?
'
default
'
:
this
.
paiTrialConfig
.
virtualCluster
.
toString
(),
//Task auth File
this
.
paiTrialConfig
.
authFile
);
// Step 2. Upload code files in codeDir onto HDFS
...
...
src/sdk/pycli/nnicli/__init__.py
0 → 100644
View file @
88ef6c04
# Copyright (c) Microsoft Corporation. All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
# associated documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish, distribute,
# sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or
# substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
# NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT
# OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# ==================================================================================================
from
.nni_client
import
*
src/sdk/pycli/nnicli/nni_client.py
0 → 100644
View file @
88ef6c04
# Copyright (c) Microsoft Corporation. All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
# associated documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish, distribute,
# sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or
# substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
# NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT
# OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# ==================================================================================================
""" A python wrapper for nni rest api
Example:
import nnicli as nc
nc.start_nni('../../../../examples/trials/mnist/config.yml')
nc.set_endpoint('http://localhost:8080')
print(nc.version())
print(nc.get_experiment_status())
print(nc.get_job_statistics())
print(nc.list_trial_jobs())
nc.stop_nni()
"""
import
sys
import
os
import
subprocess
import
requests
__all__
=
[
'start_nni'
,
'stop_nni'
,
'set_endpoint'
,
'version'
,
'get_experiment_status'
,
'get_experiment_profile'
,
'get_trial_job'
,
'list_trial_jobs'
,
'get_job_statistics'
,
'get_job_metrics'
,
'export_data'
]
# Prefix placed between the endpoint and every route name below.
API_ROOT_PATH = 'api/v1/nni'

# Route names of the nni rest server used by the helpers in this module.
VERSION_PATH = 'version'
STATUS_PATH = 'check-status'
EXPERIMENT_PATH = 'experiment'
TRIAL_JOBS_PATH = 'trial-jobs'
JOB_STATISTICS_PATH = 'job-statistics'
METRICS_PATH = 'metric-data'
EXPORT_DATA_PATH = 'export-data'

# Endpoint of the rest server; stays None until set_endpoint() is called.
_api_endpoint = None
def set_endpoint(endpoint):
    """Store the endpoint of the nni rest server used by nnicli.

    The value is kept in the module-level ``_api_endpoint`` variable,
    for example: http://localhost:8080
    """
    global _api_endpoint
    _api_endpoint = endpoint
def _check_endpoint():
    """Raise AssertionError when no endpoint has been stored yet."""
    endpoint_missing = _api_endpoint is None
    if endpoint_missing:
        raise AssertionError("Please call set_endpoint to specify nni endpoint")
def _nni_rest_get(api_path, response_type='json'):
    """Issue a GET request against the nni rest server.

    The request URL is ``<endpoint>/<API_ROOT_PATH>/<api_path>``.

    Parameters
    ----------
    api_path : str
        Route below API_ROOT_PATH.
    response_type : str
        'json' to return the decoded JSON body, 'text' to return the raw text.

    Returns
    -------
    The decoded body, or None when the HTTP status is not a 2xx code.

    Raises
    ------
    AssertionError
        If no endpoint was set, or if the request succeeded but
        response_type is neither 'json' nor 'text'.
    """
    _check_endpoint()
    response = requests.get('{}/{}/{}'.format(_api_endpoint, API_ROOT_PATH, api_path))

    # Failed requests are reported as None rather than as an exception.
    if not _http_succeed(response.status_code):
        return None

    if response_type == 'json':
        return response.json()
    if response_type == 'text':
        return response.text
    raise AssertionError('Incorrect response_type')
def _http_succeed(status_code):
    """Return True when status_code is in the 2xx class."""
    status_class = status_code // 100
    return status_class == 2
def _create_process(cmd):
    """Run cmd, echo its stdout line by line, and return its exit code.

    Parameters
    ----------
    cmd : list
        Argument vector handed to subprocess.Popen.

    Returns
    -------
    int
        The return code of the finished process.
    """
    popen_kwargs = {'stdout': subprocess.PIPE}
    if sys.platform == 'win32':
        # CREATE_NEW_PROCESS_GROUP only exists on Windows, so it is looked up
        # inside this branch only.
        popen_kwargs['creationflags'] = subprocess.CREATE_NEW_PROCESS_GROUP
    process = subprocess.Popen(cmd, **popen_kwargs)

    # NOTE(review): reading stops as soon as poll() reports an exit code, so
    # output still sitting in the pipe at that moment is not printed.
    while True:
        if process.poll() is not None:
            break
        line = process.stdout.readline()
        if line:
            print(line.decode('utf-8').strip())
    return process.returncode
def start_nni(config_file):
    """Start an nni experiment with the specified configuration file.

    Runs ``nnictl create --config <config_file>``.

    Parameters
    ----------
    config_file : str
        Path of the experiment configuration file.

    Raises
    ------
    RuntimeError
        If the command finishes with a non-zero return code.
    """
    # Build the argument vector directly instead of splitting a formatted
    # string on spaces, so a path containing spaces stays a single argument.
    cmd = ['nnictl', 'create', '--config', str(config_file)]
    if _create_process(cmd) != 0:
        raise RuntimeError('Failed to start nni.')
def stop_nni():
    """Stop the nni experiment by running ``nnictl stop``.

    Raises RuntimeError when the command finishes with a non-zero return code.
    """
    return_code = _create_process(['nnictl', 'stop'])
    if return_code != 0:
        raise RuntimeError('Failed to stop nni.')
def version():
    """Return the version of nni, requested as text from the rest server."""
    nni_version = _nni_rest_get(VERSION_PATH, 'text')
    return nni_version
def get_experiment_status():
    """Return the experiment status as a dict."""
    status = _nni_rest_get(STATUS_PATH)
    return status
def get_experiment_profile():
    """Return the experiment profile as a dict."""
    profile = _nni_rest_get(EXPERIMENT_PATH)
    return profile
def get_trial_job(trial_job_id):
    """Return trial job information as a dict.

    Parameters
    ----------
    trial_job_id : str
        Id of the trial job to query; must not be None.

    Returns
    -------
    Whatever ``_nni_rest_get`` returns for ``trial-jobs/<trial_job_id>``.

    Raises
    ------
    AssertionError
        If trial_job_id is None.
    """
    # Raise explicitly: a bare ``assert`` is stripped when Python runs with -O.
    if trial_job_id is None:
        raise AssertionError('trial_job_id must not be None')
    # URL segments are always joined with '/'; os.path.join would insert a
    # backslash on Windows and produce an invalid request path.
    return _nni_rest_get('{}/{}'.format(TRIAL_JOBS_PATH, trial_job_id))
def list_trial_jobs():
    """Return information for all trial jobs as a list."""
    trial_jobs = _nni_rest_get(TRIAL_JOBS_PATH)
    return trial_jobs
def get_job_statistics():
    """Return trial job statistics information as a dict."""
    statistics = _nni_rest_get(JOB_STATISTICS_PATH)
    return statistics
def get_job_metrics(trial_job_id=None):
    """Return trial job metrics.

    Parameters
    ----------
    trial_job_id : str, optional
        When given, only ``metric-data/<trial_job_id>`` is requested;
        when None, the plain ``metric-data`` route is requested.

    Returns
    -------
    Whatever ``_nni_rest_get`` returns for the selected route.
    """
    if trial_job_id is None:
        api_path = METRICS_PATH
    else:
        # URL segments are always joined with '/'; os.path.join would insert
        # a backslash on Windows and produce an invalid request path.
        api_path = '{}/{}'.format(METRICS_PATH, trial_job_id)
    return _nni_rest_get(api_path)
def export_data():
    """Return exported information for all trial jobs."""
    exported = _nni_rest_get(EXPORT_DATA_PATH)
    return exported
src/sdk/pycli/setup.py
0 → 100644
View file @
88ef6c04
import
setuptools
# Packaging metadata for the nnicli distribution.
setuptools.setup(
    # Identity of the distribution.
    name='nnicli',
    version='999.0.0-developing',
    description='nnicli for Neural Network Intelligence project',
    url='https://github.com/Microsoft/nni',
    license='MIT',

    # Maintainer contact.
    author='Microsoft NNI Team',
    author_email='nni@microsoft.com',

    # Contents and requirements.
    packages=setuptools.find_packages(),
    python_requires='>=3.5',
    install_requires=[
        'requests',
    ],
)
src/sdk/pynni/nni/hyperopt_tuner/hyperopt_tuner.py
View file @
88ef6c04
...
...
@@ -190,13 +190,19 @@ class HyperoptTuner(Tuner):
HyperoptTuner is a tuner which using hyperopt algorithm.
"""
def
__init__
(
self
,
algorithm_name
,
optimize_mode
=
'minimize'
):
def
__init__
(
self
,
algorithm_name
,
optimize_mode
=
'minimize'
,
parallel_optimize
=
False
,
constant_liar_type
=
'min'
):
"""
Parameters
----------
algorithm_name : str
algorithm_name includes "tpe", "random_search" and "anneal".
optimize_mode : str
parallel_optimize : bool
More detail could reference: docs/en_US/Tuner/HyperoptTuner.md
constant_liar_type : str
constant_liar_type including "min", "max" and "mean"
More detail could reference: docs/en_US/Tuner/HyperoptTuner.md
"""
self
.
algorithm_name
=
algorithm_name
self
.
optimize_mode
=
OptimizeMode
(
optimize_mode
)
...
...
@@ -205,6 +211,13 @@ class HyperoptTuner(Tuner):
self
.
rval
=
None
self
.
supplement_data_num
=
0
self
.
parallel
=
parallel_optimize
if
self
.
parallel
:
self
.
CL_rval
=
None
self
.
constant_liar_type
=
constant_liar_type
self
.
running_data
=
[]
self
.
optimal_y
=
None
def
_choose_tuner
(
self
,
algorithm_name
):
"""
Parameters
...
...
@@ -266,6 +279,10 @@ class HyperoptTuner(Tuner):
# but it can rarely cause a duplicate parameter
total_params
=
self
.
get_suggestion
(
random_search
=
True
)
self
.
total_data
[
parameter_id
]
=
total_params
if
self
.
parallel
:
self
.
running_data
.
append
(
parameter_id
)
params
=
split_index
(
total_params
)
return
params
...
...
@@ -287,10 +304,39 @@ class HyperoptTuner(Tuner):
raise
RuntimeError
(
'Received parameter_id not in total_data.'
)
params
=
self
.
total_data
[
parameter_id
]
# code for parallel
if
self
.
parallel
:
constant_liar
=
kwargs
.
get
(
'constant_liar'
,
False
)
if
constant_liar
:
rval
=
self
.
CL_rval
else
:
rval
=
self
.
rval
self
.
running_data
.
remove
(
parameter_id
)
# update the reward of optimal_y
if
self
.
optimal_y
is
None
:
if
self
.
constant_liar_type
==
'mean'
:
self
.
optimal_y
=
[
reward
,
1
]
else
:
self
.
optimal_y
=
reward
else
:
if
self
.
constant_liar_type
==
'mean'
:
_sum
=
self
.
optimal_y
[
0
]
+
reward
_number
=
self
.
optimal_y
[
1
]
+
1
self
.
optimal_y
=
[
_sum
,
_number
]
elif
self
.
constant_liar_type
==
'min'
:
self
.
optimal_y
=
min
(
self
.
optimal_y
,
reward
)
elif
self
.
constant_liar_type
==
'max'
:
self
.
optimal_y
=
max
(
self
.
optimal_y
,
reward
)
logger
.
debug
(
"Update optimal_y with reward, optimal_y = %s"
,
self
.
optimal_y
)
else
:
rval
=
self
.
rval
if
self
.
optimize_mode
is
OptimizeMode
.
Maximize
:
reward
=
-
reward
rval
=
self
.
rval
domain
=
rval
.
domain
trials
=
rval
.
trials
...
...
@@ -375,13 +421,26 @@ class HyperoptTuner(Tuner):
total_params : dict
parameter suggestion
"""
if
self
.
parallel
and
len
(
self
.
total_data
)
>
20
and
len
(
self
.
running_data
)
and
self
.
optimal_y
is
not
None
:
self
.
CL_rval
=
copy
.
deepcopy
(
self
.
rval
)
if
self
.
constant_liar_type
==
'mean'
:
_constant_liar_y
=
self
.
optimal_y
[
0
]
/
self
.
optimal_y
[
1
]
else
:
_constant_liar_y
=
self
.
optimal_y
for
_parameter_id
in
self
.
running_data
:
self
.
receive_trial_result
(
parameter_id
=
_parameter_id
,
parameters
=
None
,
value
=
_constant_liar_y
,
constant_liar
=
True
)
rval
=
self
.
CL_rval
rval
=
self
.
rval
random_state
=
np
.
random
.
randint
(
2
**
31
-
1
)
else
:
rval
=
self
.
rval
random_state
=
rval
.
rstate
.
randint
(
2
**
31
-
1
)
trials
=
rval
.
trials
algorithm
=
rval
.
algo
new_ids
=
rval
.
trials
.
new_trial_ids
(
1
)
rval
.
trials
.
refresh
()
random_state
=
rval
.
rstate
.
randint
(
2
**
31
-
1
)
if
random_search
:
new_trials
=
hp
.
rand
.
suggest
(
new_ids
,
rval
.
domain
,
trials
,
random_state
)
...
...
test/cli_test.py
0 → 100644
View file @
88ef6c04
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge,
# to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import
sys
import
time
import
traceback
from
utils
import
GREEN
,
RED
,
CLEAR
,
setup_experiment
def test_nni_cli():
    """Start an experiment through nnicli, query it, and always stop it.

    Prints a coloured PASS/FAIL line; on failure the exception is printed,
    its traceback is dumped, and it is raised again.
    """
    import nnicli as nc

    config_file = 'config_test/examples/mnist.test.yml'

    try:
        # Sleep here to make sure previous stopped exp has enough time to exit to avoid port conflict
        time.sleep(6)
        print(GREEN + 'Testing nnicli:' + config_file + CLEAR)
        nc.start_nni(config_file)
        time.sleep(3)
        nc.set_endpoint('http://localhost:8080')

        # These three results are printed; list_trial_jobs is only called.
        for query in (nc.version, nc.get_job_statistics, nc.get_experiment_status):
            print(query())
        nc.list_trial_jobs()

        pass_message = 'Test nnicli {}: TEST PASS'.format(config_file)
        print(GREEN + pass_message + CLEAR)
    except Exception as error:
        fail_message = 'Test nnicli {}: TEST FAIL'.format(config_file)
        print(RED + fail_message + CLEAR)
        print('%r' % error)
        traceback.print_exc()
        raise error
    finally:
        # Runs on both the success and the failure path.
        nc.stop_nni()
if __name__ == '__main__':
    # setup_experiment receives True unless the last command-line argument
    # is '--preinstall'.
    setup_experiment(sys.argv[-1] != '--preinstall')
    test_nni_cli()
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment