Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
f5b89bb6
Unverified
Commit
f5b89bb6
authored
Apr 19, 2022
by
J-shang
Committed by
GitHub
Apr 19, 2022
Browse files
Merge pull request #4776 from microsoft/v2.7
parents
7aa44612
1546962f
Changes
104
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
83 additions
and
48 deletions
+83
-48
nni/retiarii/nn/pytorch/mutator.py
nni/retiarii/nn/pytorch/mutator.py
+11
-8
nni/retiarii/nn/pytorch/nasbench101.py
nni/retiarii/nn/pytorch/nasbench101.py
+1
-1
nni/retiarii/utils.py
nni/retiarii/utils.py
+1
-1
nni/tools/nnictl/config_schema.py
nni/tools/nnictl/config_schema.py
+2
-0
pipelines/fast-test.yml
pipelines/fast-test.yml
+5
-0
test/config/examples/cifar10_search_space.json
test/config/examples/cifar10_search_space.json
+1
-1
test/config/training_service.yml
test/config/training_service.yml
+3
-1
test/config/training_service_v2.yml
test/config/training_service_v2.yml
+2
-0
test/nni_test/nnitest/utils.py
test/nni_test/nnitest/utils.py
+10
-4
test/ut/retiarii/test_highlevel_apis.py
test/ut/retiarii/test_highlevel_apis.py
+5
-0
ts/nni_manager/common/experimentConfig.ts
ts/nni_manager/common/experimentConfig.ts
+2
-1
ts/nni_manager/config/aml/amlUtil.py
ts/nni_manager/config/aml/amlUtil.py
+2
-1
ts/nni_manager/training_service/kubernetes/adl/adlApiClient.ts
...i_manager/training_service/kubernetes/adl/adlApiClient.ts
+1
-1
ts/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts
...frameworkcontroller/frameworkcontrollerTrainingService.ts
+6
-5
ts/nni_manager/training_service/kubernetes/kubeflow/kubeflowApiClient.ts
...training_service/kubernetes/kubeflow/kubeflowApiClient.ts
+8
-8
ts/nni_manager/training_service/kubernetes/kubeflow/kubeflowConfig.ts
...er/training_service/kubernetes/kubeflow/kubeflowConfig.ts
+12
-8
ts/nni_manager/training_service/kubernetes/kubeflow/kubeflowJobRestServer.ts
...ning_service/kubernetes/kubeflow/kubeflowJobRestServer.ts
+2
-2
ts/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts
...ng_service/kubernetes/kubeflow/kubeflowTrainingService.ts
+6
-4
ts/nni_manager/training_service/kubernetes/kubernetesApiClient.ts
...anager/training_service/kubernetes/kubernetesApiClient.ts
+1
-0
ts/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts
.../training_service/kubernetes/kubernetesTrainingService.ts
+2
-2
No files found.
nni/retiarii/nn/pytorch/mutator.py
View file @
f5b89bb6
...
...
@@ -6,7 +6,7 @@ from typing import Any, List, Optional, Tuple, Dict, Iterator
import
torch.nn
as
nn
from
nni.common.serializer
import
is_traceable
from
nni.common.serializer
import
is_traceable
,
is_wrapped_with_trace
from
nni.retiarii.graph
import
Cell
,
Graph
,
Model
,
ModelStatus
,
Node
,
Evaluator
from
nni.retiarii.mutator
import
Mutator
from
nni.retiarii.serializer
import
is_basic_unit
,
is_model_wrapped
...
...
@@ -361,7 +361,7 @@ class EvaluatorValueChoiceMutator(Mutator):
# we only need one such mutator for one model/evaluator
def
_mutate_traceable_object
(
self
,
obj
:
Any
,
value_choice_decisions
:
Dict
[
str
,
Any
])
->
Any
:
if
not
is_traceable
(
obj
):
if
not
_
is_traceable
_object
(
obj
):
return
obj
updates
=
{}
...
...
@@ -400,7 +400,7 @@ class EvaluatorValueChoiceMutator(Mutator):
def
process_evaluator_mutations
(
evaluator
:
Evaluator
,
existing_mutators
:
List
[
Mutator
])
->
List
[
Mutator
]:
# take all the value choices in the kwargs of evaluator into a list
# `existing_mutators` can be mutators generated from `model`
if
not
is_traceable
(
evaluator
):
if
not
_
is_traceable
_object
(
evaluator
):
return
[]
mutator_candidates
=
{}
for
param
in
_expand_nested_trace_kwargs
(
evaluator
):
...
...
@@ -464,9 +464,12 @@ def _expand_nested_trace_kwargs(obj: Any) -> Iterator[Any]:
# Get items from `trace_kwargs`.
# If some item is traceable itself, get items recursively.
if
not
is_traceable
(
obj
):
return
if
_is_traceable_object
(
obj
):
for
param
in
obj
.
trace_kwargs
.
values
():
yield
param
yield
from
_expand_nested_trace_kwargs
(
param
)
for
param
in
obj
.
trace_kwargs
.
values
():
yield
param
yield
from
_expand_nested_trace_kwargs
(
param
)
def _is_traceable_object(obj: Any) -> bool:
    """Tell whether ``obj`` is a traceable "object" rather than a class.

    The result is truthy only when ``is_traceable(obj)`` holds and
    ``is_wrapped_with_trace(obj)`` does not.
    """
    traceable = is_traceable(obj)
    if not traceable:
        # Hand back the falsy result of ``is_traceable`` unchanged, just as
        # the short-circuiting ``and`` form would.
        return traceable
    return not is_wrapped_with_trace(obj)
nni/retiarii/nn/pytorch/nasbench101.py
View file @
f5b89bb6
...
...
@@ -280,7 +280,7 @@ class NasBench101Cell(Mutable):
Warnings
--------
:class:`NasBench101Cell` is not supported in :ref:`graph-based execution engine <graph-based-exe
u
ction-engine>`.
:class:`NasBench101Cell` is not supported in :ref:`graph-based execution engine <graph-based-exec
u
tion-engine>`.
"""
@
staticmethod
...
...
nni/retiarii/utils.py
View file @
f5b89bb6
...
...
@@ -11,7 +11,7 @@ from pathlib import Path
from
nni.common.hpo_utils
import
ParameterSpec
__all__
=
[
'NoContextError'
,
'ContextStack'
,
'ModelNamespace'
]
__all__
=
[
'NoContextError'
,
'ContextStack'
,
'ModelNamespace'
,
'original_state_dict_hooks'
]
def
import_
(
target
:
str
,
allow_none
:
bool
=
False
)
->
Any
:
...
...
nni/tools/nnictl/config_schema.py
View file @
f5b89bb6
...
...
@@ -359,6 +359,7 @@ kubeflow_config_schema = {
'path'
:
setType
(
'path'
,
str
)
},
Optional
(
'reuse'
):
setType
(
'reuse'
,
bool
),
Optional
(
'namespace'
):
setType
(
'namespace'
,
str
),
},
{
'operator'
:
setChoice
(
'operator'
,
'tf-operator'
,
'pytorch-operator'
),
'apiVersion'
:
setType
(
'apiVersion'
,
str
),
...
...
@@ -377,6 +378,7 @@ kubeflow_config_schema = {
},
Optional
(
'uploadRetryCount'
):
setNumberRange
(
'uploadRetryCount'
,
int
,
1
,
99999
),
Optional
(
'reuse'
):
setType
(
'reuse'
,
bool
),
Optional
(
'namespace'
):
setType
(
'namespace'
,
str
),
})
}
...
...
pipelines/fast-test.yml
View file @
f5b89bb6
...
...
@@ -42,6 +42,11 @@ stages:
python tools/chineselink.py check
displayName
:
Translation up-to-date
-
script
:
|
cd docs
make -e SPHINXOPTS="-W -T -b linkcheck -q --keep-going" html
displayName
:
External links integrity check
-
job
:
python
pool
:
vmImage
:
ubuntu-latest
...
...
test/config/examples/cifar10_search_space.json
View file @
f5b89bb6
{
"lr"
:{
"_type"
:
"choice"
,
"_value"
:[
0.1
,
0.01
,
0.001
,
0.0001
]},
"optimizer"
:{
"_type"
:
"choice"
,
"_value"
:[
"SGD"
,
"Adadelta"
,
"Adagrad"
,
"Adam"
,
"Adamax"
]},
"model"
:{
"_type"
:
"choice"
,
"_value"
:[
"vgg"
,
"resnet18"
]}
"model"
:{
"_type"
:
"choice"
,
"_value"
:[
"vgg"
]}
}
test/config/training_service.yml
View file @
f5b89bb6
...
...
@@ -18,6 +18,7 @@ kubeflow:
azureStorage
:
accountName
:
azureShare
:
namespace
:
kubeflow
trial
:
worker
:
replicas
:
1
...
...
@@ -35,7 +36,7 @@ frameworkcontroller:
maxTrialNum
:
2
trialConcurrency
:
2
frameworkcontrollerConfig
:
serviceAccountName
:
framework
barri
er
serviceAccountName
:
framework
controll
er
storage
:
azureStorage
keyVault
:
vaultName
:
...
...
@@ -43,6 +44,7 @@ frameworkcontroller:
azureStorage
:
accountName
:
azureShare
:
namespace
:
kubeflow
trial
:
taskRoles
:
-
name
:
worker
...
...
test/config/training_service_v2.yml
View file @
f5b89bb6
...
...
@@ -20,6 +20,7 @@ kubeflow:
trainingService
:
reuseMode
:
true
platform
:
kubeflow
namespace
:
kubeflow
worker
:
command
:
code_directory
:
...
...
@@ -44,6 +45,7 @@ frameworkcontroller:
trainingService
:
reuseMode
:
true
platform
:
frameworkcontroller
namespace
:
kubeflow
serviceAccountName
:
frameworkcontroller
taskRoles
:
-
name
:
worker
...
...
test/nni_test/nnitest/utils.py
View file @
f5b89bb6
...
...
@@ -122,12 +122,18 @@ def print_file_content(filepath):
print
(
content
,
flush
=
True
)
def
print_trial_job_log
(
training_service
,
trial_jobs_url
):
trial_jobs
=
get_trial_jobs
(
trial_jobs_url
)
for
trial_job
in
trial_jobs
:
trial_log_dir
=
os
.
path
.
join
(
get_experiment_dir
(
EXPERIMENT_URL
),
'trials'
,
trial_job
[
'trialJobId'
])
trial_log_root
=
os
.
path
.
join
(
get_experiment_dir
(
EXPERIMENT_URL
),
'trials'
)
if
not
os
.
path
.
exists
(
trial_log_root
):
print
(
'trial log folder does not exist: {}'
.
format
(
trial_log_root
),
flush
=
True
)
return
folders
=
os
.
listdir
(
trial_log_root
)
for
name
in
folders
:
trial_log_dir
=
os
.
path
.
join
(
trial_log_root
,
name
)
log_files
=
[
'stderr'
,
'trial.log'
]
if
training_service
==
'local'
else
[
'stdout_log_collection.log'
]
for
log_file
in
log_files
:
print_file_content
(
os
.
path
.
join
(
trial_log_dir
,
log_file
))
log_file_path
=
os
.
path
.
join
(
trial_log_dir
,
log_file
)
if
os
.
path
.
exists
(
log_file_path
):
print_file_content
(
log_file_path
)
def
print_experiment_log
(
experiment_id
):
log_dir
=
get_nni_log_dir
(
experiment_id
=
experiment_id
)
...
...
test/ut/retiarii/test_highlevel_apis.py
View file @
f5b89bb6
...
...
@@ -1229,6 +1229,11 @@ class Shared(unittest.TestCase):
assert
len
(
set
(
values
))
==
3
@unittest.skipIf(pytorch_lightning.__version__ < '1.0', 'Legacy PyTorch-lightning not supported')
def test_valuechoice_classification(self):
    # Smoke test: builds a Classification evaluator whose ``criterion`` is the
    # ``nn.CrossEntropyLoss`` class itself (not an instance) and runs
    # ``process_evaluator_mutations`` on it with no existing mutators.
    # There is no assertion; the test passes if the call does not raise.
    # NOTE(review): the skip condition compares version strings
    # lexicographically -- confirm this is adequate for the supported versions.
    evaluator = pl.Classification(criterion=nn.CrossEntropyLoss)
    process_evaluator_mutations(evaluator, [])
def
test_retiarii_nn_import
(
self
):
dummy
=
torch
.
zeros
(
1
,
16
,
32
,
24
)
nn
.
init
.
uniform_
(
dummy
)
...
...
ts/nni_manager/common/experimentConfig.ts
View file @
f5b89bb6
...
...
@@ -132,6 +132,7 @@ export interface KubeflowConfig extends TrainingServiceConfig {
master
?:
KubeflowRoleConfig
;
reuseMode
:
boolean
;
maxTrialNumberPerGpu
?:
number
;
namespace
?:
string
;
}
export
interface
FrameworkControllerTaskRoleConfig
{
...
...
@@ -156,7 +157,7 @@ export interface FrameworkControllerConfig extends TrainingServiceConfig {
taskRoles
:
FrameworkControllerTaskRoleConfig
[];
reuseMode
:
boolean
;
maxTrialNumberPerGpu
?:
number
;
namespace
?:
'
default
'
;
namespace
?:
string
;
apiVersion
?:
string
;
}
...
...
ts/nni_manager/config/aml/amlUtil.py
View file @
f5b89bb6
...
...
@@ -52,7 +52,8 @@ if __name__ == "__main__":
print
(
'stop_result:failed'
)
exit
(
0
)
loop_count
+=
1
time
.
sleep
(
500
)
time
.
sleep
(
5
)
status
=
run
.
get_status
()
print
(
'stop_result:success'
)
exit
(
0
)
elif
line
==
'receive'
:
...
...
ts/nni_manager/training_service/kubernetes/adl/adlApiClient.ts
View file @
f5b89bb6
...
...
@@ -11,7 +11,7 @@ class AdlClientV1 extends KubernetesCRDClient {
/**
* constructor, to initialize adl CRD definition
*/
p
rotected
readonly
namespace
:
string
;
p
ublic
readonly
namespace
:
string
;
public
constructor
(
namespace
:
string
)
{
super
();
...
...
ts/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts
View file @
f5b89bb6
...
...
@@ -118,7 +118,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
}
else
{
configTaskRoles
=
this
.
parseCustomTaskRoles
(
this
.
fcTemplate
.
spec
.
taskRoles
)
}
const
namespace
=
this
.
fcClusterConfig
.
namespace
?
this
.
fcClusterConfig
.
namespace
:
"
default
"
;
const
namespace
=
this
.
fcClusterConfig
.
namespace
?
?
"
default
"
;
this
.
genericK8sClient
.
setNamespace
=
namespace
;
if
(
this
.
kubernetesRestServerPort
===
undefined
)
{
...
...
@@ -134,7 +134,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
const
trialJobId
:
string
=
uniqueString
(
5
);
// Set trial's NFS working folder
const
trialWorkingFolder
:
string
=
path
.
join
(
this
.
CONTAINER_MOUNT_PATH
,
'
nni
'
,
getExperimentId
(),
trialJobId
);
const
trialLocalTempFolder
:
string
=
path
.
join
(
getExperimentRootDir
(),
'
trials
-local
'
,
trialJobId
);
const
trialLocalTempFolder
:
string
=
path
.
join
(
getExperimentRootDir
(),
'
trials
'
,
trialJobId
);
let
frameworkcontrollerJobName
:
string
=
`nniexp
${
this
.
experimentId
}
trial
${
trialJobId
}
`
.
toLowerCase
();
let
frameworkcontrollerJobConfig
:
any
;
...
...
@@ -204,6 +204,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
let
namespace
:
string
|
undefined
;
this
.
fcClusterConfig
=
FrameworkControllerClusterConfigFactory
.
generateFrameworkControllerClusterConfig
(
frameworkcontrollerClusterJsonObject
);
this
.
genericK8sClient
.
setNamespace
=
this
.
fcClusterConfig
.
namespace
??
"
default
"
;
if
(
this
.
fcClusterConfig
.
storageType
===
'
azureStorage
'
)
{
const
azureFrameworkControllerClusterConfig
:
FrameworkControllerClusterConfigAzure
=
<
FrameworkControllerClusterConfigAzure
>
this
.
fcClusterConfig
;
...
...
@@ -346,8 +347,8 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
for
(
const
taskRole
of
configTaskRoles
)
{
const
runScriptContent
:
string
=
await
this
.
generateRunScript
(
'
frameworkcontroller
'
,
trialJobId
,
trialWorkingFolder
,
this
.
generateCommandScript
(
configTaskRoles
,
taskRole
.
command
),
form
.
sequenceId
.
toString
(),
taskRole
.
name
,
taskRole
.
gpuNum
?
taskRole
.
gpuNum
:
0
);
this
.
generateCommandScript
(
configTaskRoles
,
taskRole
.
command
),
form
.
sequenceId
.
toString
(),
taskRole
.
name
,
taskRole
.
gpuNum
?
taskRole
.
gpuNum
:
0
);
await
fs
.
promises
.
writeFile
(
path
.
join
(
trialLocalTempFolder
,
`run_
${
taskRole
.
name
}
.sh`
),
runScriptContent
,
{
encoding
:
'
utf8
'
});
}
...
...
@@ -439,7 +440,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
kind
:
'
Framework
'
,
metadata
:
{
name
:
frameworkcontrollerJobName
,
namespace
:
this
.
fcClusterConfig
.
namespace
?
this
.
fcClusterConfig
.
namespace
:
"
default
"
,
namespace
:
this
.
fcClusterConfig
.
namespace
?
?
"
default
"
,
labels
:
{
app
:
this
.
NNI_KUBERNETES_TRIAL_LABEL
,
expId
:
getExperimentId
(),
...
...
ts/nni_manager/training_service/kubernetes/kubeflow/kubeflowApiClient.ts
View file @
f5b89bb6
...
...
@@ -17,7 +17,7 @@ class TFOperatorClientV1Alpha2 extends KubernetesCRDClient {
}
protected
get
operator
():
any
{
return
this
.
client
.
apis
[
'
kubeflow.org
'
].
v1alpha2
.
namespaces
(
'
default
'
).
tfjobs
;
return
this
.
client
.
apis
[
'
kubeflow.org
'
].
v1alpha2
.
namespaces
(
this
.
namespace
).
tfjobs
;
}
public
get
containerName
():
string
{
...
...
@@ -36,7 +36,7 @@ class TFOperatorClientV1Beta1 extends KubernetesCRDClient {
}
protected
get
operator
():
any
{
return
this
.
client
.
apis
[
'
kubeflow.org
'
].
v1beta1
.
namespaces
(
'
default
'
).
tfjobs
;
return
this
.
client
.
apis
[
'
kubeflow.org
'
].
v1beta1
.
namespaces
(
this
.
namespace
).
tfjobs
;
}
public
get
containerName
():
string
{
...
...
@@ -55,7 +55,7 @@ class TFOperatorClientV1Beta2 extends KubernetesCRDClient {
}
protected
get
operator
():
any
{
return
this
.
client
.
apis
[
'
kubeflow.org
'
].
v1beta2
.
namespaces
(
'
default
'
).
tfjobs
;
return
this
.
client
.
apis
[
'
kubeflow.org
'
].
v1beta2
.
namespaces
(
this
.
namespace
).
tfjobs
;
}
public
get
containerName
():
string
{
...
...
@@ -74,7 +74,7 @@ class TFOperatorClientV1 extends KubernetesCRDClient {
}
protected
get
operator
():
any
{
return
this
.
client
.
apis
[
'
kubeflow.org
'
].
v1
.
namespaces
(
'
default
'
).
tfjobs
;
return
this
.
client
.
apis
[
'
kubeflow.org
'
].
v1
.
namespaces
(
this
.
namespace
).
tfjobs
;
}
public
get
containerName
():
string
{
...
...
@@ -92,7 +92,7 @@ class PyTorchOperatorClientV1 extends KubernetesCRDClient {
}
protected
get
operator
():
any
{
return
this
.
client
.
apis
[
'
kubeflow.org
'
].
v1
.
namespaces
(
'
default
'
).
pytorchjobs
;
return
this
.
client
.
apis
[
'
kubeflow.org
'
].
v1
.
namespaces
(
this
.
namespace
).
pytorchjobs
;
}
public
get
containerName
():
string
{
...
...
@@ -110,7 +110,7 @@ class PyTorchOperatorClientV1Alpha2 extends KubernetesCRDClient {
}
protected
get
operator
():
any
{
return
this
.
client
.
apis
[
'
kubeflow.org
'
].
v1alpha2
.
namespaces
(
'
default
'
).
pytorchjobs
;
return
this
.
client
.
apis
[
'
kubeflow.org
'
].
v1alpha2
.
namespaces
(
this
.
namespace
).
pytorchjobs
;
}
public
get
containerName
():
string
{
...
...
@@ -129,7 +129,7 @@ class PyTorchOperatorClientV1Beta1 extends KubernetesCRDClient {
}
protected
get
operator
():
any
{
return
this
.
client
.
apis
[
'
kubeflow.org
'
].
v1beta1
.
namespaces
(
'
default
'
).
pytorchjobs
;
return
this
.
client
.
apis
[
'
kubeflow.org
'
].
v1beta1
.
namespaces
(
this
.
namespace
).
pytorchjobs
;
}
public
get
containerName
():
string
{
...
...
@@ -148,7 +148,7 @@ class PyTorchOperatorClientV1Beta2 extends KubernetesCRDClient {
}
protected
get
operator
():
any
{
return
this
.
client
.
apis
[
'
kubeflow.org
'
].
v1beta2
.
namespaces
(
'
default
'
).
pytorchjobs
;
return
this
.
client
.
apis
[
'
kubeflow.org
'
].
v1beta2
.
namespaces
(
this
.
namespace
).
pytorchjobs
;
}
public
get
containerName
():
string
{
...
...
ts/nni_manager/training_service/kubernetes/kubeflow/kubeflowConfig.ts
View file @
f5b89bb6
...
...
@@ -18,8 +18,8 @@ export type OperatorApiVersion = 'v1alpha2' | 'v1beta1' | 'v1beta2' | 'v1';
*/
export
class
KubeflowClusterConfig
extends
KubernetesClusterConfig
{
public
readonly
operator
:
KubeflowOperator
;
constructor
(
apiVersion
:
string
,
operator
:
KubeflowOperator
)
{
super
(
apiVersion
);
constructor
(
apiVersion
:
string
,
operator
:
KubeflowOperator
,
namespace
?:
string
)
{
super
(
apiVersion
,
undefined
,
namespace
);
this
.
operator
=
operator
;
}
}
...
...
@@ -30,9 +30,10 @@ export class KubeflowClusterConfigNFS extends KubernetesClusterConfigNFS {
operator
:
KubeflowOperator
,
apiVersion
:
string
,
nfs
:
NFSConfig
,
storage
?:
KubernetesStorageKind
storage
?:
KubernetesStorageKind
,
namespace
?:
string
)
{
super
(
apiVersion
,
nfs
,
storage
);
super
(
apiVersion
,
nfs
,
storage
,
namespace
);
this
.
operator
=
operator
;
}
...
...
@@ -48,7 +49,8 @@ export class KubeflowClusterConfigNFS extends KubernetesClusterConfigNFS {
kubeflowClusterConfigObjectNFS
.
operator
,
kubeflowClusterConfigObjectNFS
.
apiVersion
,
kubeflowClusterConfigObjectNFS
.
nfs
,
kubeflowClusterConfigObjectNFS
.
storage
kubeflowClusterConfigObjectNFS
.
storage
,
kubeflowClusterConfigObjectNFS
.
namespace
);
}
}
...
...
@@ -61,9 +63,10 @@ export class KubeflowClusterConfigAzure extends KubernetesClusterConfigAzure {
apiVersion
:
string
,
keyVault
:
KeyVaultConfig
,
azureStorage
:
AzureStorage
,
storage
?:
KubernetesStorageKind
storage
?:
KubernetesStorageKind
,
namespace
?:
string
)
{
super
(
apiVersion
,
keyVault
,
azureStorage
,
storage
);
super
(
apiVersion
,
keyVault
,
azureStorage
,
storage
,
undefined
,
namespace
);
this
.
operator
=
operator
;
}
...
...
@@ -79,7 +82,8 @@ export class KubeflowClusterConfigAzure extends KubernetesClusterConfigAzure {
kubeflowClusterConfigObjectAzure
.
apiVersion
,
kubeflowClusterConfigObjectAzure
.
keyVault
,
kubeflowClusterConfigObjectAzure
.
azureStorage
,
kubeflowClusterConfigObjectAzure
.
storage
kubeflowClusterConfigObjectAzure
.
storage
,
kubeflowClusterConfigObjectAzure
.
namespace
);
}
}
...
...
ts/nni_manager/training_service/kubernetes/kubeflow/kubeflowJobRestServer.ts
View file @
f5b89bb6
...
...
@@ -14,7 +14,7 @@ export class KubeflowJobRestServer extends KubernetesJobRestServer {
/**
* constructor to provide NNIRestServer's own rest property, e.g. port
*/
constructor
()
{
super
(
component
.
get
(
K
ubeflowTrainingService
)
)
;
constructor
(
kubeflowTrainingService
:
KubeflowTrainingService
)
{
super
(
k
ubeflowTrainingService
);
}
}
ts/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts
View file @
f5b89bb6
...
...
@@ -69,7 +69,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
}
if
(
this
.
kubernetesRestServerPort
===
undefined
)
{
const
restServer
:
KubeflowJobRestServer
=
component
.
get
(
KubeflowJobRestServer
);
const
restServer
:
KubeflowJobRestServer
=
new
KubeflowJobRestServer
(
this
);
this
.
kubernetesRestServerPort
=
restServer
.
clusterRestServerPort
;
}
...
...
@@ -81,7 +81,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
const
trialJobId
:
string
=
uniqueString
(
5
);
const
trialWorkingFolder
:
string
=
path
.
join
(
this
.
CONTAINER_MOUNT_PATH
,
'
nni
'
,
getExperimentId
(),
trialJobId
);
const
kubeflowJobName
:
string
=
`nni-exp-
${
this
.
experimentId
}
-trial-
${
trialJobId
}
`
.
toLowerCase
();
const
trialLocalTempFolder
:
string
=
path
.
join
(
getExperimentRootDir
(),
'
trials
-local
'
,
trialJobId
);
const
trialLocalTempFolder
:
string
=
path
.
join
(
getExperimentRootDir
(),
'
trials
'
,
trialJobId
);
//prepare the runscript
await
this
.
prepareRunScript
(
trialLocalTempFolder
,
trialJobId
,
trialWorkingFolder
,
form
);
//upload script files to storage
...
...
@@ -120,6 +120,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
case
TrialConfigMetadataKey
.
KUBEFLOW_CLUSTER_CONFIG
:
{
const
kubeflowClusterJsonObject
:
object
=
JSON
.
parse
(
value
);
this
.
kubeflowClusterConfig
=
KubeflowClusterConfigFactory
.
generateKubeflowClusterConfig
(
kubeflowClusterJsonObject
);
this
.
genericK8sClient
.
setNamespace
=
this
.
kubeflowClusterConfig
.
namespace
??
"
default
"
;
if
(
this
.
kubeflowClusterConfig
.
storageType
===
'
azureStorage
'
)
{
const
azureKubeflowClusterConfig
:
KubeflowClusterConfigAzure
=
<
KubeflowClusterConfigAzure
>
this
.
kubeflowClusterConfig
;
this
.
azureStorageAccountName
=
azureKubeflowClusterConfig
.
azureStorage
.
accountName
;
...
...
@@ -137,6 +138,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
}
this
.
kubernetesCRDClient
=
KubeflowOperatorClientFactory
.
createClient
(
this
.
kubeflowClusterConfig
.
operator
,
this
.
kubeflowClusterConfig
.
apiVersion
);
this
.
kubernetesCRDClient
.
namespace
=
this
.
kubeflowClusterConfig
.
namespace
??
"
default
"
;
break
;
}
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
{
...
...
@@ -310,7 +312,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
// Generate kubeflow job resource config object
const
kubeflowJobConfig
:
any
=
await
this
.
generateKubeflowJobConfig
(
trialJobId
,
trialWorkingFolder
,
kubeflowJobName
,
workerPodResources
,
nonWorkerResources
);
this
.
log
.
info
(
'
kubeflowJobConfig:
'
,
kubeflowJobConfig
);
return
Promise
.
resolve
(
kubeflowJobConfig
);
}
...
...
@@ -368,7 +370,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
kind
:
this
.
kubernetesCRDClient
.
jobKind
,
metadata
:
{
name
:
kubeflowJobName
,
namespace
:
'
default
'
,
namespace
:
this
.
kubernetesCRDClient
.
namespace
,
labels
:
{
app
:
this
.
NNI_KUBERNETES_TRIAL_LABEL
,
expId
:
getExperimentId
(),
...
...
ts/nni_manager/training_service/kubernetes/kubernetesApiClient.ts
View file @
f5b89bb6
...
...
@@ -150,6 +150,7 @@ abstract class KubernetesCRDClient {
protected
readonly
client
:
any
;
protected
readonly
log
:
Logger
=
getLogger
(
'
KubernetesCRDClient
'
);
protected
crdSchema
:
any
;
public
namespace
:
string
=
'
default
'
;
constructor
()
{
this
.
client
=
new
Client1_10
({
config
:
getKubernetesConfig
()});
...
...
ts/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts
View file @
f5b89bb6
...
...
@@ -230,7 +230,7 @@ abstract class KubernetesTrainingService {
this
.
azureStorageSecretName
=
String
.
Format
(
'
nni-secret-{0}
'
,
uniqueString
(
8
)
.
toLowerCase
());
const
namespace
=
this
.
genericK8sClient
.
getNamespace
?
this
.
genericK8sClient
.
getNamespace
:
"
default
"
const
namespace
=
this
.
genericK8sClient
.
getNamespace
?
?
"
default
"
;
await
this
.
genericK8sClient
.
createSecret
(
{
apiVersion
:
'
v1
'
,
...
...
@@ -330,7 +330,7 @@ abstract class KubernetesTrainingService {
const
body
=
fs
.
readFileSync
(
filePath
).
toString
(
'
base64
'
);
const
registrySecretName
=
String
.
Format
(
'
nni-secret-{0}
'
,
uniqueString
(
8
)
.
toLowerCase
());
const
namespace
=
this
.
genericK8sClient
.
getNamespace
?
this
.
genericK8sClient
.
getNamespace
:
"
default
"
const
namespace
=
this
.
genericK8sClient
.
getNamespace
?
?
"
default
"
;
await
this
.
genericK8sClient
.
createSecret
(
{
apiVersion
:
'
v1
'
,
...
...
Prev
1
2
3
4
5
6
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment