Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
143c6615
Unverified
Commit
143c6615
authored
Jul 30, 2020
by
Chi Song
Committed by
GitHub
Jul 30, 2020
Browse files
Reusable environment support GPU scheduler, add test cases and refactoring. (#2627)
parent
8a20c348
Changes
30
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
301 additions
and
122 deletions
+301
-122
src/nni_manager/training_service/reusable/trial.ts
src/nni_manager/training_service/reusable/trial.ts
+1
-1
src/nni_manager/training_service/reusable/trialDispatcher.ts
src/nni_manager/training_service/reusable/trialDispatcher.ts
+214
-57
src/nni_manager/yarn.lock
src/nni_manager/yarn.lock
+4
-12
test/nni_test/nnitest/generate_ts_config.py
test/nni_test/nnitest/generate_ts_config.py
+3
-0
test/pipelines/pipelines-it-pai.yml
test/pipelines/pipelines-it-pai.yml
+1
-1
tools/nni_cmd/config_schema.py
tools/nni_cmd/config_schema.py
+58
-45
tools/nni_trial_tool/base_channel.py
tools/nni_trial_tool/base_channel.py
+5
-1
tools/nni_trial_tool/log_utils.py
tools/nni_trial_tool/log_utils.py
+5
-1
tools/nni_trial_tool/trial.py
tools/nni_trial_tool/trial.py
+8
-2
tools/nni_trial_tool/web_channel.py
tools/nni_trial_tool/web_channel.py
+2
-2
No files found.
src/nni_manager/training_service/reusable/trial.ts
View file @
143c6615
...
...
@@ -25,7 +25,7 @@ export class TrialDetail implements TrialJobDetail {
// it's used to aggregate node status for multiple node trial
public
nodes
:
Map
<
string
,
NodeInfomation
>
;
// assigned GPUs for multi-trial scheduled.
public
assignedGpus
:
GPUInfo
[]
=
[]
;
public
assignedGpus
:
GPUInfo
[]
|
undefined
;
public
readonly
TRIAL_METADATA_DIR
=
"
.nni
"
;
...
...
src/nni_manager/training_service/reusable/trialDispatcher.ts
View file @
143c6615
This diff is collapsed.
Click to expand it.
src/nni_manager/yarn.lock
View file @
143c6615
...
...
@@ -262,6 +262,10 @@
version "2.3.1"
resolved "https://registry.yarnpkg.com/@types/js-base64/-/js-base64-2.3.1.tgz#c39f14f129408a3d96a1105a650d8b2b6eeb4168"
"@types/js-yaml@^3.12.5":
version "3.12.5"
resolved "https://registry.yarnpkg.com/@types/js-yaml/-/js-yaml-3.12.5.tgz#136d5e6a57a931e1cce6f9d8126aa98a9c92a6bb"
"@types/json-schema@^7.0.3":
version "7.0.3"
resolved "https://registry.yarnpkg.com/@types/json-schema/-/json-schema-7.0.3.tgz#bdfd69d61e464dcc81b25159c270d75a73c1a636"
...
...
@@ -277,7 +281,6 @@
"@types/minipass@*":
version "2.2.0"
resolved "https://registry.yarnpkg.com/@types/minipass/-/minipass-2.2.0.tgz#51ad404e8eb1fa961f75ec61205796807b6f9651"
integrity sha512-wuzZksN4w4kyfoOv/dlpov4NOunwutLA/q7uc00xU02ZyUY+aoM5PWIXEKBMnm0NHd4a+N71BMjq+x7+2Af1fg==
dependencies:
"@types/node" "*"
...
...
@@ -430,7 +433,6 @@
"@types/tar@^4.0.3":
version "4.0.3"
resolved "https://registry.yarnpkg.com/@types/tar/-/tar-4.0.3.tgz#e2cce0b8ff4f285293243f5971bd7199176ac489"
integrity sha512-Z7AVMMlkI8NTWF0qGhC4QIX0zkV/+y0J8x7b/RsHrN0310+YNjoJd8UrApCiGBCWtKjxS9QhNqLi2UJNToh5hA==
dependencies:
"@types/minipass" "*"
"@types/node" "*"
...
...
@@ -1017,7 +1019,6 @@ chownr@^1.1.2, chownr@^1.1.3:
chownr@^2.0.0:
version "2.0.0"
resolved "https://registry.yarnpkg.com/chownr/-/chownr-2.0.0.tgz#15bfbe53d2eab4cf70f18a8cd68ebe5b3cb1dece"
integrity sha512-bIomtDF5KGpdogkLd9VspvFzk9KfpyyGlS8YFVZl7TGPBHL5snIOnxeshwVgPteQ9b4Eydl+pVbIyE1DcvCWgQ==
ci-info@^1.5.0:
version "1.6.0"
...
...
@@ -1912,7 +1913,6 @@ fs-minipass@^1.2.5:
fs-minipass@^2.0.0:
version "2.1.0"
resolved "https://registry.yarnpkg.com/fs-minipass/-/fs-minipass-2.1.0.tgz#7f5036fdbf12c63c169190cbe4199c852271f9fb"
integrity sha512-V/JgOLFCS+R6Vcq0slCuaeWEdNC3ouDlJMNIsacH2VtALiu9mV4LPrHc5cDl8k5aw6J8jwgWWpiTo5RYhmIzvg==
dependencies:
minipass "^3.0.0"
...
...
@@ -2331,7 +2331,6 @@ ignore@^4.0.6:
ignore@^5.1.4:
version "5.1.4"
resolved "https://registry.yarnpkg.com/ignore/-/ignore-5.1.4.tgz#84b7b3dbe64552b6ef0eca99f6743dbec6d97adf"
integrity sha512-MzbUSahkTW1u7JpKKjY7LCARd1fU5W2rLdxlM4kdkayuCwZImjkpluF9CM1aLewYJguPDqewLam18Y6AU69A8A==
import-fresh@^3.0.0:
version "3.2.1"
...
...
@@ -2650,7 +2649,6 @@ istanbul-lib-source-maps@^4.0.0:
istanbul-reports@^3.0.2:
version "3.0.2"
resolved "https://registry.yarnpkg.com/istanbul-reports/-/istanbul-reports-3.0.2.tgz#d593210e5000683750cb09fc0644e4b6e27fd53b"
integrity sha512-9tZvz7AiR3PEDNGiV9vIouQ/EAcqMXFmkcA1CDFTwOB98OZVDL0PH9glHotf5Ugp6GCOTypfzGWI/OqjWNCRUw==
dependencies:
html-escaper "^2.0.0"
istanbul-lib-report "^3.0.0"
...
...
@@ -3193,7 +3191,6 @@ minipass@^2.3.5, minipass@^2.8.6, minipass@^2.9.0:
minipass@^3.0.0:
version "3.1.3"
resolved "https://registry.yarnpkg.com/minipass/-/minipass-3.1.3.tgz#7d42ff1f39635482e15f9cdb53184deebd5815fd"
integrity sha512-Mgd2GdMVzY+x3IJ+oHnVM+KG3lA5c8tnabyJKmHSaG2kAGpudxuOf8ToDkhumF7UzME7DecbQE9uOZhNm7PuJg==
dependencies:
yallist "^4.0.0"
...
...
@@ -3212,7 +3209,6 @@ minizlib@^1.2.1:
minizlib@^2.1.0:
version "2.1.0"
resolved "https://registry.yarnpkg.com/minizlib/-/minizlib-2.1.0.tgz#fd52c645301ef09a63a2c209697c294c6ce02cf3"
integrity sha512-EzTZN/fjSvifSX0SlqUERCN39o6T40AMarPbv0MrarSFtIITCBh7bi+dU8nxGFHuqs9jdIAeoYoKuQAAASsPPA==
dependencies:
minipass "^3.0.0"
yallist "^4.0.0"
...
...
@@ -3249,7 +3245,6 @@ mkdirp@^0.5.1:
mkdirp@^1.0.3:
version "1.0.4"
resolved "https://registry.yarnpkg.com/mkdirp/-/mkdirp-1.0.4.tgz#3eb5ed62622756d79a5f0e2a221dfebad75c2f7e"
integrity sha512-vVqVZQyf3WLx2Shd0qJ9xuvqgAyKPLAiqITEtqW0oIUjzo3PePDd6fW9iFz30ef7Ysp/oiWqbhszeGWW2T6Gzw==
mocha@^7.1.1:
version "7.1.1"
...
...
@@ -3707,7 +3702,6 @@ number-is-nan@^1.0.0:
nyc@^15.0.0:
version "15.0.1"
resolved "https://registry.yarnpkg.com/nyc/-/nyc-15.0.1.tgz#bd4d5c2b17f2ec04370365a5ca1fc0ed26f9f93d"
integrity sha512-n0MBXYBYRqa67IVt62qW1r/d9UH/Qtr7SF1w/nQLJ9KxvWF6b2xCHImRAixHN9tnMMYHC2P14uo6KddNGwMgGg==
dependencies:
"@istanbuljs/load-nyc-config" "^1.0.0"
"@istanbuljs/schema" "^0.1.2"
...
...
@@ -5065,7 +5059,6 @@ tar@^4.4.10, tar@^4.4.12, tar@^4.4.13:
tar@^6.0.2:
version "6.0.2"
resolved "https://registry.yarnpkg.com/tar/-/tar-6.0.2.tgz#5df17813468a6264ff14f766886c622b84ae2f39"
integrity sha512-Glo3jkRtPcvpDlAs/0+hozav78yoXKFr+c4wgw62NNMO3oo4AaJdCo21Uu7lcwr55h39W2XD1LMERc64wtbItg==
dependencies:
chownr "^2.0.0"
fs-minipass "^2.0.0"
...
...
@@ -5541,7 +5534,6 @@ yallist@^3.0.2, yallist@^3.0.3:
yallist@^4.0.0:
version "4.0.0"
resolved "https://registry.yarnpkg.com/yallist/-/yallist-4.0.0.tgz#9bb92790d9c0effec63be73519e11a35019a3a72"
integrity sha512-3wdGidZyq5PB084XLES5TpOSRA3wjXAlIWMhum2kRcv/41Sn2emQ0dycQW4uZXLejwKvg6EsvbdlVL+FYEct7A==
yargs-parser@13.1.2, yargs-parser@^13.1.2:
version "13.1.2"
...
...
test/nni_test/nnitest/generate_ts_config.py
View file @
143c6615
...
...
@@ -35,6 +35,8 @@ def update_training_service_config(args):
config
[
args
.
ts
][
'paiConfig'
][
'host'
]
=
args
.
pai_host
if
args
.
pai_token
is
not
None
:
config
[
args
.
ts
][
'paiConfig'
][
'token'
]
=
args
.
pai_token
if
args
.
pai_reuse
is
not
None
:
config
[
args
.
ts
][
'paiConfig'
][
'reuse'
]
=
args
.
pai_reuse
.
lower
()
==
'true'
if
args
.
nni_docker_image
is
not
None
:
config
[
args
.
ts
][
'trial'
][
'image'
]
=
args
.
nni_docker_image
if
args
.
nni_manager_nfs_mount_path
is
not
None
:
...
...
@@ -101,6 +103,7 @@ if __name__ == '__main__':
parser
.
add_argument
(
"--output_dir"
,
type
=
str
)
parser
.
add_argument
(
"--vc"
,
type
=
str
)
parser
.
add_argument
(
"--pai_token"
,
type
=
str
)
parser
.
add_argument
(
"--pai_reuse"
,
type
=
str
)
parser
.
add_argument
(
"--pai_storage_config_name"
,
type
=
str
)
parser
.
add_argument
(
"--nni_manager_nfs_mount_path"
,
type
=
str
)
parser
.
add_argument
(
"--container_nfs_mount_path"
,
type
=
str
)
...
...
test/pipelines/pipelines-it-pai.yml
View file @
143c6615
...
...
@@ -57,7 +57,7 @@ jobs:
echo "TEST_IMG:$TEST_IMG"
cd test
python3 nni_test/nnitest/generate_ts_config.py --ts pai --pai_host $(pai_host) --pai_user $(pai_user) --nni_docker_image $TEST_IMG --pai_storage_config_name $(pai_storage_config_name)\
python3 nni_test/nnitest/generate_ts_config.py --ts pai
--pai_reuse $(pai_reuse)
--pai_host $(pai_host) --pai_user $(pai_user) --nni_docker_image $TEST_IMG --pai_storage_config_name $(pai_storage_config_name)\
--pai_token $(pai_token) --nni_manager_nfs_mount_path $(nni_manager_nfs_mount_path) --container_nfs_mount_path $(container_nfs_mount_path) --nni_manager_ip $(nni_manager_ip) --vc $(virtual_cluster)
PATH=$HOME/.local/bin:$PATH python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts pai
displayName
:
'
integration
test'
tools/nni_cmd/config_schema.py
View file @
143c6615
...
...
@@ -14,10 +14,12 @@ def setType(key, valueType):
'''check key type'''
return
And
(
valueType
,
error
=
SCHEMA_TYPE_ERROR
%
(
key
,
valueType
.
__name__
))
def setChoice(key, *args):
    '''Build a validator that accepts only one of the values given in ``args``.

    key: name of the config field; it is used only to format the error message.
    args: the permitted values for that field.
    '''
    # Format the message up front so the And(...) call stays short.
    range_error = SCHEMA_RANGE_ERROR % (key, str(args))
    return And(lambda n: n in args, error=range_error)
def
setNumberRange
(
key
,
keyType
,
start
,
end
):
'''check number range'''
return
And
(
...
...
@@ -25,16 +27,19 @@ def setNumberRange(key, keyType, start, end):
And
(
lambda
n
:
start
<=
n
<=
end
,
error
=
SCHEMA_RANGE_ERROR
%
(
key
,
'(%s,%s)'
%
(
start
,
end
))),
)
def setPathCheck(key):
    '''Build a validator that passes only when the value is an existing path.

    key: name of the config field; it is used only to format the error message.
    '''
    # os.path.exists is handed over as the predicate, not called here.
    path_error = SCHEMA_PATH_ERROR % key
    return And(os.path.exists, error=path_error)
class
AlgoSchema
:
"""
This class is the schema of 'tuner', 'assessor' and 'advisor' sections of experiment configuration file.
For example:
AlgoSchema('tuner') creates the schema of tuner section.
"""
def
__init__
(
self
,
algo_type
):
"""
Parameters:
...
...
@@ -108,6 +113,7 @@ class AlgoSchema:
Schema
(
self
.
algo_schema
).
validate
(
data
)
self
.
validate_extras
(
data
,
self
.
algo_type
)
common_schema
=
{
'authorName'
:
setType
(
'authorName'
,
str
),
'experimentName'
:
setType
(
'experimentName'
,
str
),
...
...
@@ -138,7 +144,7 @@ common_schema = {
}
common_trial_schema
=
{
'trial'
:{
'trial'
:
{
'command'
:
setType
(
'command'
,
str
),
'codeDir'
:
setPathCheck
(
'codeDir'
),
Optional
(
'gpuNum'
):
setNumberRange
(
'gpuNum'
,
int
,
0
,
99999
),
...
...
@@ -147,7 +153,7 @@ common_trial_schema = {
}
pai_yarn_trial_schema
=
{
'trial'
:{
'trial'
:
{
'command'
:
setType
(
'command'
,
str
),
'codeDir'
:
setPathCheck
(
'codeDir'
),
'gpuNum'
:
setNumberRange
(
'gpuNum'
,
int
,
0
,
99999
),
...
...
@@ -156,10 +162,10 @@ pai_yarn_trial_schema = {
'image'
:
setType
(
'image'
,
str
),
Optional
(
'authFile'
):
And
(
os
.
path
.
exists
,
error
=
SCHEMA_PATH_ERROR
%
'authFile'
),
Optional
(
'shmMB'
):
setType
(
'shmMB'
,
int
),
Optional
(
'dataDir'
):
And
(
Regex
(
r
'hdfs://(([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(/.*)?'
),
\
error
=
'ERROR: dataDir format error, dataDir format is hdfs://xxx.xxx.xxx.xxx:xxx'
),
Optional
(
'outputDir'
):
And
(
Regex
(
r
'hdfs://(([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(/.*)?'
),
\
error
=
'ERROR: outputDir format error, outputDir format is hdfs://xxx.xxx.xxx.xxx:xxx'
),
Optional
(
'dataDir'
):
And
(
Regex
(
r
'hdfs://(([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(/.*)?'
),
error
=
'ERROR: dataDir format error, dataDir format is hdfs://xxx.xxx.xxx.xxx:xxx'
),
Optional
(
'outputDir'
):
And
(
Regex
(
r
'hdfs://(([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(/.*)?'
),
error
=
'ERROR: outputDir format error, outputDir format is hdfs://xxx.xxx.xxx.xxx:xxx'
),
Optional
(
'virtualCluster'
):
setType
(
'virtualCluster'
,
str
),
Optional
(
'nasMode'
):
setChoice
(
'nasMode'
,
'classic_mode'
,
'enas_mode'
,
'oneshot_mode'
,
'darts_mode'
),
Optional
(
'portList'
):
[{
...
...
@@ -184,7 +190,7 @@ pai_yarn_config_schema = {
pai_trial_schema
=
{
'trial'
:{
'trial'
:
{
'codeDir'
:
setPathCheck
(
'codeDir'
),
'nniManagerNFSMountPath'
:
setPathCheck
(
'nniManagerNFSMountPath'
),
'containerNFSMountPath'
:
setType
(
'containerNFSMountPath'
,
str
),
...
...
@@ -200,21 +206,21 @@ pai_trial_schema = {
}
pai_config_schema
=
{
'paiConfig'
:
Or
({
'userName'
:
setType
(
'userName'
,
str
),
'passWord'
:
setType
(
'passWord'
,
str
),
'host'
:
setType
(
'host'
,
str
),
Optional
(
'reuse'
):
setType
(
'reuse'
,
bool
)
},
{
'paiConfig'
:
{
'userName'
:
setType
(
'userName'
,
str
),
'token'
:
setType
(
'token'
,
str
)
,
Or
(
'passWord'
,
'token'
,
only_one
=
True
):
str
,
'host'
:
setType
(
'host'
,
str
),
Optional
(
'reuse'
):
setType
(
'reuse'
,
bool
)
})
Optional
(
'reuse'
):
setType
(
'reuse'
,
bool
),
Optional
(
'gpuNum'
):
setNumberRange
(
'gpuNum'
,
int
,
0
,
99999
),
Optional
(
'cpuNum'
):
setNumberRange
(
'cpuNum'
,
int
,
0
,
99999
),
Optional
(
'memoryMB'
):
setType
(
'memoryMB'
,
int
),
Optional
(
'maxTrialNumPerGpu'
):
setType
(
'maxTrialNumPerGpu'
,
int
),
Optional
(
'useActiveGpu'
):
setType
(
'useActiveGpu'
,
bool
),
}
}
dlts_trial_schema
=
{
'trial'
:{
'trial'
:
{
'command'
:
setType
(
'command'
,
str
),
'codeDir'
:
setPathCheck
(
'codeDir'
),
'gpuNum'
:
setNumberRange
(
'gpuNum'
,
int
,
0
,
99999
),
...
...
@@ -235,7 +241,7 @@ dlts_config_schema = {
}
aml_trial_schema
=
{
'trial'
:{
'trial'
:
{
'codeDir'
:
setPathCheck
(
'codeDir'
),
'command'
:
setType
(
'command'
,
str
),
'image'
:
setType
(
'image'
,
str
),
...
...
@@ -252,7 +258,7 @@ aml_config_schema = {
}
kubeflow_trial_schema
=
{
'trial'
:{
'trial'
:
{
'codeDir'
:
setPathCheck
(
'codeDir'
),
Optional
(
'nasMode'
):
setChoice
(
'nasMode'
,
'classic_mode'
,
'enas_mode'
,
'oneshot_mode'
,
'darts_mode'
),
Optional
(
'ps'
):
{
...
...
@@ -273,7 +279,7 @@ kubeflow_trial_schema = {
'image'
:
setType
(
'image'
,
str
),
Optional
(
'privateRegistryAuthPath'
):
And
(
os
.
path
.
exists
,
error
=
SCHEMA_PATH_ERROR
%
'privateRegistryAuthPath'
)
},
Optional
(
'worker'
):{
Optional
(
'worker'
):
{
'replicas'
:
setType
(
'replicas'
,
int
),
'command'
:
setType
(
'command'
,
str
),
'gpuNum'
:
setNumberRange
(
'gpuNum'
,
int
,
0
,
99999
),
...
...
@@ -286,7 +292,7 @@ kubeflow_trial_schema = {
}
kubeflow_config_schema
=
{
'kubeflowConfig'
:
Or
({
'kubeflowConfig'
:
Or
({
'operator'
:
setChoice
(
'operator'
,
'tf-operator'
,
'pytorch-operator'
),
'apiVersion'
:
setType
(
'apiVersion'
,
str
),
Optional
(
'storage'
):
setChoice
(
'storage'
,
'nfs'
,
'azureStorage'
),
...
...
@@ -299,23 +305,23 @@ kubeflow_config_schema = {
'apiVersion'
:
setType
(
'apiVersion'
,
str
),
Optional
(
'storage'
):
setChoice
(
'storage'
,
'nfs'
,
'azureStorage'
),
'keyVault'
:
{
'vaultName'
:
And
(
Regex
(
'([0-9]|[a-z]|[A-Z]|-){1,127}'
),
\
error
=
'ERROR: vaultName format error, vaultName support using (0-9|a-z|A-Z|-)'
),
'name'
:
And
(
Regex
(
'([0-9]|[a-z]|[A-Z]|-){1,127}'
),
\
error
=
'ERROR: name format error, name support using (0-9|a-z|A-Z|-)'
)
'vaultName'
:
And
(
Regex
(
'([0-9]|[a-z]|[A-Z]|-){1,127}'
),
error
=
'ERROR: vaultName format error, vaultName support using (0-9|a-z|A-Z|-)'
),
'name'
:
And
(
Regex
(
'([0-9]|[a-z]|[A-Z]|-){1,127}'
),
error
=
'ERROR: name format error, name support using (0-9|a-z|A-Z|-)'
)
},
'azureStorage'
:
{
'accountName'
:
And
(
Regex
(
'([0-9]|[a-z]|[A-Z]|-){3,31}'
),
\
error
=
'ERROR: accountName format error, accountName support using (0-9|a-z|A-Z|-)'
),
'azureShare'
:
And
(
Regex
(
'([0-9]|[a-z]|[A-Z]|-){3,63}'
),
\
error
=
'ERROR: azureShare format error, azureShare support using (0-9|a-z|A-Z|-)'
)
'accountName'
:
And
(
Regex
(
'([0-9]|[a-z]|[A-Z]|-){3,31}'
),
error
=
'ERROR: accountName format error, accountName support using (0-9|a-z|A-Z|-)'
),
'azureShare'
:
And
(
Regex
(
'([0-9]|[a-z]|[A-Z]|-){3,63}'
),
error
=
'ERROR: azureShare format error, azureShare support using (0-9|a-z|A-Z|-)'
)
},
Optional
(
'uploadRetryCount'
):
setNumberRange
(
'uploadRetryCount'
,
int
,
1
,
99999
)
})
}
frameworkcontroller_trial_schema
=
{
'trial'
:{
'trial'
:
{
'codeDir'
:
setPathCheck
(
'codeDir'
),
'taskRoles'
:
[{
'name'
:
setType
(
'name'
,
str
),
...
...
@@ -335,7 +341,7 @@ frameworkcontroller_trial_schema = {
}
frameworkcontroller_config_schema
=
{
'frameworkcontrollerConfig'
:
Or
({
'frameworkcontrollerConfig'
:
Or
({
Optional
(
'storage'
):
setChoice
(
'storage'
,
'nfs'
,
'azureStorage'
),
Optional
(
'serviceAccountName'
):
setType
(
'serviceAccountName'
,
str
),
'nfs'
:
{
...
...
@@ -346,23 +352,23 @@ frameworkcontroller_config_schema = {
Optional
(
'storage'
):
setChoice
(
'storage'
,
'nfs'
,
'azureStorage'
),
Optional
(
'serviceAccountName'
):
setType
(
'serviceAccountName'
,
str
),
'keyVault'
:
{
'vaultName'
:
And
(
Regex
(
'([0-9]|[a-z]|[A-Z]|-){1,127}'
),
\
error
=
'ERROR: vaultName format error, vaultName support using (0-9|a-z|A-Z|-)'
),
'name'
:
And
(
Regex
(
'([0-9]|[a-z]|[A-Z]|-){1,127}'
),
\
error
=
'ERROR: name format error, name support using (0-9|a-z|A-Z|-)'
)
'vaultName'
:
And
(
Regex
(
'([0-9]|[a-z]|[A-Z]|-){1,127}'
),
error
=
'ERROR: vaultName format error, vaultName support using (0-9|a-z|A-Z|-)'
),
'name'
:
And
(
Regex
(
'([0-9]|[a-z]|[A-Z]|-){1,127}'
),
error
=
'ERROR: name format error, name support using (0-9|a-z|A-Z|-)'
)
},
'azureStorage'
:
{
'accountName'
:
And
(
Regex
(
'([0-9]|[a-z]|[A-Z]|-){3,31}'
),
\
error
=
'ERROR: accountName format error, accountName support using (0-9|a-z|A-Z|-)'
),
'azureShare'
:
And
(
Regex
(
'([0-9]|[a-z]|[A-Z]|-){3,63}'
),
\
error
=
'ERROR: azureShare format error, azureShare support using (0-9|a-z|A-Z|-)'
)
'accountName'
:
And
(
Regex
(
'([0-9]|[a-z]|[A-Z]|-){3,31}'
),
error
=
'ERROR: accountName format error, accountName support using (0-9|a-z|A-Z|-)'
),
'azureShare'
:
And
(
Regex
(
'([0-9]|[a-z]|[A-Z]|-){3,63}'
),
error
=
'ERROR: azureShare format error, azureShare support using (0-9|a-z|A-Z|-)'
)
},
Optional
(
'uploadRetryCount'
):
setNumberRange
(
'uploadRetryCount'
,
int
,
1
,
99999
)
})
}
machine_list_schema
=
{
'machineList'
:[
Or
(
'machineList'
:
[
Or
(
{
'ip'
:
setType
(
'ip'
,
str
),
Optional
(
'port'
):
setNumberRange
(
'port'
,
int
,
1
,
65535
),
...
...
@@ -395,6 +401,7 @@ training_service_schema_dict = {
'dlts'
:
Schema
({
**
common_schema
,
**
dlts_trial_schema
,
**
dlts_config_schema
}),
}
class
NNIConfigSchema
:
def
validate
(
self
,
data
):
train_service
=
data
[
'trainingServicePlatform'
]
...
...
@@ -483,19 +490,25 @@ class NNIConfigSchema:
if
not
taskRoles_dict
:
raise
SchemaError
(
'Please set taskRoles in paiConfigPath config file!'
)
else
:
pai_trial_fields_required_list
=
[
'image'
,
'gpuNum'
,
'cpuNum'
,
'memoryMB'
,
'paiStorageConfigName'
,
'command'
]
pai_trial_fields_required_list
=
[
'image'
,
'paiStorageConfigName'
,
'command'
]
for
trial_field
in
pai_trial_fields_required_list
:
if
experiment_config
[
'trial'
].
get
(
trial_field
)
is
None
:
raise
SchemaError
(
'Please set {0} in trial configuration,
\
or set additional pai configuration file path in paiConfigPath!'
.
format
(
trial_field
))
pai_resource_fields_required_list
=
[
'gpuNum'
,
'cpuNum'
,
'memoryMB'
]
for
required_field
in
pai_resource_fields_required_list
:
if
experiment_config
[
'trial'
].
get
(
required_field
)
is
None
and
\
experiment_config
[
'paiConfig'
].
get
(
required_field
)
is
None
:
raise
SchemaError
(
'Please set {0} in trial or paiConfig configuration,
\
or set additional pai configuration file path in paiConfigPath!'
.
format
(
required_field
))
def
validate_pai_trial_conifg
(
self
,
experiment_config
):
'''validate the trial config in pai platform'''
if
experiment_config
.
get
(
'trainingServicePlatform'
)
in
[
'pai'
,
'paiYarn'
]:
if
experiment_config
.
get
(
'trial'
).
get
(
'shmMB'
)
and
\
experiment_config
[
'trial'
][
'shmMB'
]
>
experiment_config
[
'trial'
][
'memoryMB'
]:
experiment_config
[
'trial'
][
'shmMB'
]
>
experiment_config
[
'trial'
][
'memoryMB'
]:
raise
SchemaError
(
'shmMB should be no more than memoryMB!'
)
#backward compatibility
#
backward compatibility
warning_information
=
'{0} is not supported in NNI anymore, please remove the field in config file!
\
please refer https://github.com/microsoft/nni/blob/master/docs/en_US/TrainingService/PaiMode.md#run-an-experiment
\
for the practices of how to get data and output model in trial code'
...
...
@@ -508,6 +521,6 @@ class NNIConfigSchema:
def
validate_eth0_device
(
self
,
experiment_config
):
'''validate whether the machine has eth0 device'''
if
experiment_config
.
get
(
'trainingServicePlatform'
)
not
in
[
'local'
]
\
and
not
experiment_config
.
get
(
'nniManagerIp'
)
\
and
'eth0'
not
in
netifaces
.
interfaces
():
and
not
experiment_config
.
get
(
'nniManagerIp'
)
\
and
'eth0'
not
in
netifaces
.
interfaces
():
raise
SchemaError
(
'This machine does not contain eth0 network device, please set nniManagerIp in config file!'
)
tools/nni_trial_tool/base_channel.py
View file @
143c6615
...
...
@@ -57,7 +57,11 @@ class BaseChannel(ABC):
def
close
(
self
):
self
.
is_running
=
False
self
.
_inner_close
()
try
:
self
.
_inner_close
()
except
Exception
as
err
:
# ignore any error on closing
print
(
"error on closing channel: %s"
%
err
)
def
send
(
self
,
command
,
data
):
"""Send command to Training Service.
...
...
tools/nni_trial_tool/log_utils.py
View file @
143c6615
...
...
@@ -82,7 +82,11 @@ class RemoteLogger(object):
'''
constructor
'''
self
.
logger
=
logging
.
getLogger
(
'nni_syslog_{}'
.
format
(
tag
))
logger_name
=
'nni_syslog_{}'
.
format
(
tag
)
# to prevent multiple trial logged in same logger
if
trial_id
is
not
None
:
logger_name
=
'{}_{}'
.
format
(
logger_name
,
trial_id
)
self
.
logger
=
logging
.
getLogger
(
logger_name
)
self
.
log_level
=
log_level
self
.
logger
.
setLevel
(
self
.
log_level
)
self
.
pipeReader
=
None
...
...
tools/nni_trial_tool/trial.py
View file @
143c6615
...
...
@@ -86,11 +86,17 @@ class Trial:
break
time
.
sleep
(
0.1
)
trial_command
=
self
.
args
.
trial_command
gpuIndices
=
self
.
data
.
get
(
'gpuIndices'
)
if
(
gpuIndices
is
not
None
):
trial_command
=
'CUDA_VISIBLE_DEVICES="%s " %s'
%
(
gpuIndices
,
trial_command
)
self
.
log_pipe_stdout
=
self
.
trial_syslogger_stdout
.
get_pipelog_reader
()
self
.
process
=
Popen
(
self
.
args
.
trial_command
,
shell
=
True
,
stdout
=
self
.
log_pipe_stdout
,
self
.
process
=
Popen
(
trial_command
,
shell
=
True
,
stdout
=
self
.
log_pipe_stdout
,
stderr
=
self
.
log_pipe_stdout
,
cwd
=
trial_code_dir
,
env
=
dict
(
environ
))
nni_log
(
LogType
.
Info
,
'{0}: spawns a subprocess (pid {1}) to run command: {2}'
.
format
(
self
.
name
,
self
.
process
.
pid
,
shlex
.
split
(
self
.
args
.
trial_command
)))
format
(
self
.
name
,
self
.
process
.
pid
,
shlex
.
split
(
trial_command
)))
def
save_parameter_file
(
self
,
command_data
):
parameters
=
command_data
[
"parameters"
]
...
...
tools/nni_trial_tool/web_channel.py
View file @
143c6615
...
...
@@ -37,9 +37,9 @@ class WebChannel(BaseChannel):
def
_inner_close
(
self
):
if
self
.
client
is
not
None
:
self
.
client
.
close
()
if
self
.
_event_loop
.
is_running
():
self
.
_event_loop
.
close
()
self
.
client
=
None
if
self
.
_event_loop
.
is_running
():
self
.
_event_loop
.
stop
()
self
.
_event_loop
=
None
def
_inner_send
(
self
,
message
):
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment