Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
a689e619
Unverified
Commit
a689e619
authored
Jul 27, 2022
by
yjjinjie
Committed by
GitHub
Jul 27, 2022
Browse files
[DLC] update the failed job and dlcclient request and user flow control (#5003)
parent
b1a532ae
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
81 additions
and
49 deletions
+81
-49
ts/nni_manager/config/dlc/dlcUtil.py
ts/nni_manager/config/dlc/dlcUtil.py
+31
-6
ts/nni_manager/training_service/reusable/dlc/dlcClient.ts
ts/nni_manager/training_service/reusable/dlc/dlcClient.ts
+40
-41
ts/nni_manager/training_service/reusable/environments/dlcEnvironmentService.ts
...ng_service/reusable/environments/dlcEnvironmentService.ts
+10
-2
No files found.
ts/nni_manager/config/dlc/dlcUtil.py
View file @
a689e619
...
@@ -7,6 +7,7 @@ import os
...
@@ -7,6 +7,7 @@ import os
import
pathlib
import
pathlib
import
sys
import
sys
import
traceback
import
traceback
import
time
from
argparse
import
ArgumentParser
from
argparse
import
ArgumentParser
# ref: https://help.aliyun.com/document_detail/203290.html?spm=a2c4g.11186623.6.727.6f9b5db6bzJh4x
# ref: https://help.aliyun.com/document_detail/203290.html?spm=a2c4g.11186623.6.727.6f9b5db6bzJh4x
from
alibabacloud_pai_dlc20201203.client
import
Client
from
alibabacloud_pai_dlc20201203.client
import
Client
...
@@ -83,18 +84,42 @@ if __name__ == "__main__":
...
@@ -83,18 +84,42 @@ if __name__ == "__main__":
response
=
client
.
create_job
(
req
)
response
=
client
.
create_job
(
req
)
job_id
=
response
.
body
.
job_id
job_id
=
response
.
body
.
job_id
print
(
'job
id:
'
+
job_id
)
print
(
'job
_
id:'
+
job_id
)
while
True
:
while
True
:
line
=
sys
.
stdin
.
readline
().
rstrip
()
line
=
sys
.
stdin
.
readline
().
rstrip
()
if
line
==
'update_status'
:
if
line
==
'update_status'
:
print
(
'status:'
+
client
.
get_job
(
job_id
).
body
.
status
)
# when the dlc sudden failure,such as 503,
# we will not get the status
# We'll keep getting the state until we get it
while
True
:
try
:
# to avoid user flow control
time
.
sleep
(
60
)
status
=
client
.
get_job
(
job_id
).
body
.
status
logging
.
info
(
'job_id %s, client.get_job(job_id).body.status %s'
,
job_id
,
status
)
print
(
'status:'
+
status
)
break
except
Exception
as
e
:
logging
.
exception
(
'dlc get status error:
\n
'
)
logging
.
info
(
"exit job_id %s update status"
,
job_id
)
elif
line
==
'tracking_url'
:
elif
line
==
'tracking_url'
:
#TODO: 1. get this url by api? 2. change this url in private dlc mode.
#TODO: 1. get this url by api? 2. change this url in private dlc mode.
print
(
'tracking_url:'
+
f
'https://pai-dlc.console.aliyun.com/#/jobs/detail?jobId=
{
job_id
}
®ionId=
{
args
.
region
}
'
)
print
(
'tracking_url:'
+
f
'https://pai-dlc.console.aliyun.com/#/jobs/detail?jobId=
{
job_id
}
®ionId=
{
args
.
region
}
'
)
elif
line
==
'stop'
:
elif
line
==
'stop'
:
client
.
stop_job
(
job_id
)
# when the dlc 503,we will not stop the job
exit
(
0
)
# We'll keep stopping the job until we stop it
while
True
:
try
:
# to avoid user flow control
time
.
sleep
(
60
)
client
.
stop_job
(
job_id
)
exit
(
0
)
except
Exception
as
e
:
logging
.
exception
(
'dlc stop error:
\n
'
)
except
Exception
as
e
:
except
Exception
as
e
:
logging
.
e
rror
(
'DLC submit Exception:
\n
'
)
logging
.
e
xception
(
'DLC submit Exception:
\n
'
)
logging
.
error
(
e
,
exc_info
=
1
)
ts/nni_manager/training_service/reusable/dlc/dlcClient.ts
View file @
a689e619
...
@@ -26,6 +26,7 @@ export class DlcClient {
...
@@ -26,6 +26,7 @@ export class DlcClient {
// dlcUtil exception log dir
// dlcUtil exception log dir
public
logDir
:
string
;
public
logDir
:
string
;
public
pythonShellClient
:
undefined
|
PythonShell
;
public
pythonShellClient
:
undefined
|
PythonShell
;
public
status
:
string
;
constructor
(
constructor
(
type
:
string
,
type
:
string
,
...
@@ -65,6 +66,7 @@ export class DlcClient {
...
@@ -65,6 +66,7 @@ export class DlcClient {
this
.
environmentId
=
environmentId
;
this
.
environmentId
=
environmentId
;
this
.
userCommand
=
userCommand
;
this
.
userCommand
=
userCommand
;
this
.
logDir
=
logDir
;
this
.
logDir
=
logDir
;
this
.
status
=
''
;
}
}
public
submit
():
Promise
<
string
>
{
public
submit
():
Promise
<
string
>
{
...
@@ -91,18 +93,40 @@ export class DlcClient {
...
@@ -91,18 +93,40 @@ export class DlcClient {
]
]
});
});
this
.
log
.
debug
(
this
.
pythonShellClient
.
command
);
this
.
log
.
debug
(
this
.
pythonShellClient
.
command
);
this
.
pythonShellClient
.
on
(
'
message
'
,
function
(
envId
:
any
)
{
this
.
onMessage
();
// received a message sent from the Python script (a simple "print" statement)
this
.
log
.
debug
(
`on message`
);
deferred
.
resolve
(
envId
);
});
this
.
monitorError
(
this
.
pythonShellClient
,
deferred
);
this
.
monitorError
(
this
.
pythonShellClient
,
deferred
);
this
.
log
.
debug
(
`monitor submit`
);
const
log
=
this
.
log
;
this
.
pythonShellClient
.
on
(
'
message
'
,
(
message
:
any
)
=>
{
const
jobid
=
this
.
parseContent
(
'
job_id
'
,
message
);
if
(
jobid
!==
''
)
{
log
.
debug
(
`reslove job_id
${
jobid
}
`
);
deferred
.
resolve
(
jobid
);
}
});
return
deferred
.
promise
;
return
deferred
.
promise
;
}
}
private
onMessage
():
void
{
if
(
this
.
pythonShellClient
===
undefined
)
{
throw
Error
(
'
python shell client not initialized!
'
);
}
const
log
=
this
.
log
;
this
.
pythonShellClient
.
on
(
'
message
'
,
(
message
:
any
)
=>
{
const
status
:
string
=
this
.
parseContent
(
'
status
'
,
message
);
if
(
status
.
length
>
0
)
{
log
.
debug
(
`on message status:
${
status
}
`
)
this
.
status
=
status
;
return
;
}
});
}
public
stop
():
void
{
public
stop
():
void
{
if
(
this
.
pythonShellClient
===
undefined
)
{
if
(
this
.
pythonShellClient
===
undefined
)
{
this
.
log
.
debug
(
`python shell client not initialized!`
);
throw
Error
(
'
python shell client not initialized!
'
);
throw
Error
(
'
python shell client not initialized!
'
);
}
}
this
.
log
.
debug
(
`send stop`
);
this
.
pythonShellClient
.
send
(
'
stop
'
);
this
.
pythonShellClient
.
send
(
'
stop
'
);
}
}
...
@@ -111,14 +135,17 @@ export class DlcClient {
...
@@ -111,14 +135,17 @@ export class DlcClient {
if
(
this
.
pythonShellClient
===
undefined
)
{
if
(
this
.
pythonShellClient
===
undefined
)
{
throw
Error
(
'
python shell client not initialized!
'
);
throw
Error
(
'
python shell client not initialized!
'
);
}
}
this
.
log
.
debug
(
`send tracking_url`
);
this
.
pythonShellClient
.
send
(
'
tracking_url
'
);
this
.
pythonShellClient
.
send
(
'
tracking_url
'
);
const
log
=
this
.
log
;
this
.
pythonShellClient
.
on
(
'
message
'
,
(
status
:
any
)
=>
{
this
.
pythonShellClient
.
on
(
'
message
'
,
(
status
:
any
)
=>
{
const
trackingUrl
=
this
.
parseContent
(
'
tracking_url
'
,
status
);
const
trackingUrl
=
this
.
parseContent
(
'
tracking_url
'
,
status
);
if
(
trackingUrl
!==
''
)
{
if
(
trackingUrl
!==
''
)
{
log
.
debug
(
`trackingUrl:
${
trackingUrl
}
`
);
deferred
.
resolve
(
trackingUrl
);
deferred
.
resolve
(
trackingUrl
);
}
}
});
});
this
.
monitorError
(
this
.
pythonShellClient
,
deferred
);
return
deferred
.
promise
;
return
deferred
.
promise
;
}
}
...
@@ -128,47 +155,19 @@ export class DlcClient {
...
@@ -128,47 +155,19 @@ export class DlcClient {
throw
Error
(
'
python shell client not initialized!
'
);
throw
Error
(
'
python shell client not initialized!
'
);
}
}
this
.
pythonShellClient
.
send
(
'
update_status
'
);
this
.
pythonShellClient
.
send
(
'
update_status
'
);
this
.
pythonShellClient
.
on
(
'
message
'
,
(
status
:
any
)
=>
{
if
(
this
.
status
===
''
)
{
let
newStatus
=
this
.
parseContent
(
'
status
'
,
status
);
this
.
status
=
oldStatus
;
if
(
newStatus
===
''
)
{
newStatus
=
oldStatus
;
}
deferred
.
resolve
(
newStatus
);
});
this
.
monitorError
(
this
.
pythonShellClient
,
deferred
);
return
deferred
.
promise
;
}
public
sendCommand
(
message
:
string
):
void
{
if
(
this
.
pythonShellClient
===
undefined
)
{
throw
Error
(
'
python shell client not initialized!
'
);
}
}
this
.
log
.
debug
(
`command:
${
message
}
`
);
this
.
log
.
debug
(
`update_status:
${
this
.
status
}
`
);
this
.
pythonShellClient
.
send
(
`command:
${
message
}
`
);
deferred
.
resolve
(
this
.
status
);
}
public
receiveCommand
():
Promise
<
string
>
{
const
deferred
:
Deferred
<
string
>
=
new
Deferred
<
string
>
();
if
(
this
.
pythonShellClient
===
undefined
)
{
throw
Error
(
'
python shell client not initialized!
'
);
}
this
.
pythonShellClient
.
send
(
'
receive
'
);
this
.
pythonShellClient
.
on
(
'
message
'
,
(
command
:
any
)
=>
{
const
message
=
this
.
parseContent
(
'
receive
'
,
command
);
if
(
message
!==
''
)
{
deferred
.
resolve
(
JSON
.
parse
(
message
))
}
});
this
.
monitorError
(
this
.
pythonShellClient
,
deferred
);
return
deferred
.
promise
;
return
deferred
.
promise
;
}
}
// Monitor error information in dlc python shell client
// Monitor error information in dlc python shell client
private
monitorError
(
pythonShellClient
:
PythonShell
,
deferred
:
Deferred
<
any
>
):
void
{
private
monitorError
(
pythonShellClient
:
PythonShell
,
deferred
:
Deferred
<
any
>
):
void
{
const
log
=
this
.
log
;
pythonShellClient
.
on
(
'
error
'
,
function
(
error
:
any
)
{
pythonShellClient
.
on
(
'
error
'
,
function
(
error
:
any
)
{
deferred
.
reject
(
error
);
log
.
info
(
`error:
${
error
}
`
);
});
pythonShellClient
.
on
(
'
close
'
,
function
(
error
:
any
)
{
deferred
.
reject
(
error
);
deferred
.
reject
(
error
);
});
});
}
}
...
...
ts/nni_manager/training_service/reusable/environments/dlcEnvironmentService.ts
View file @
a689e619
...
@@ -4,6 +4,7 @@
...
@@ -4,6 +4,7 @@
import
fs
from
'
fs
'
;
import
fs
from
'
fs
'
;
import
path
from
'
path
'
;
import
path
from
'
path
'
;
import
*
as
component
from
'
common/component
'
;
import
*
as
component
from
'
common/component
'
;
import
{
Deferred
}
from
'
ts-deferred
'
;
import
{
getLogger
,
Logger
}
from
'
common/log
'
;
import
{
getLogger
,
Logger
}
from
'
common/log
'
;
import
{
DlcConfig
}
from
'
common/experimentConfig
'
;
import
{
DlcConfig
}
from
'
common/experimentConfig
'
;
import
{
ExperimentStartupInfo
}
from
'
common/experimentStartupInfo
'
;
import
{
ExperimentStartupInfo
}
from
'
common/experimentStartupInfo
'
;
...
@@ -16,6 +17,7 @@ import { MountedStorageService } from '../storages/mountedStorageService';
...
@@ -16,6 +17,7 @@ import { MountedStorageService } from '../storages/mountedStorageService';
import
{
Scope
}
from
'
typescript-ioc
'
;
import
{
Scope
}
from
'
typescript-ioc
'
;
import
{
StorageService
}
from
'
../storageService
'
;
import
{
StorageService
}
from
'
../storageService
'
;
import
{
getLogDir
}
from
'
common/utils
'
;
import
{
getLogDir
}
from
'
common/utils
'
;
import
{
setTimeout
}
from
'
timers/promises
'
;
/**
/**
* Collector DLC jobs info from DLC cluster, and update dlc job status locally
* Collector DLC jobs info from DLC cluster, and update dlc job status locally
...
@@ -53,8 +55,9 @@ export class DlcEnvironmentService extends EnvironmentService {
...
@@ -53,8 +55,9 @@ export class DlcEnvironmentService extends EnvironmentService {
public
get
getName
():
string
{
public
get
getName
():
string
{
return
'
dlc
'
;
return
'
dlc
'
;
}
}
public
async
refreshEnvironmentsStatus
(
environments
:
EnvironmentInformation
[]):
Promise
<
void
>
{
public
async
refreshEnvironmentsStatus
(
environments
:
EnvironmentInformation
[]):
Promise
<
void
>
{
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
environments
.
forEach
(
async
(
environment
)
=>
{
environments
.
forEach
(
async
(
environment
)
=>
{
const
dlcClient
=
(
environment
as
DlcEnvironmentInformation
).
dlcClient
;
const
dlcClient
=
(
environment
as
DlcEnvironmentInformation
).
dlcClient
;
if
(
!
dlcClient
)
{
if
(
!
dlcClient
)
{
...
@@ -76,8 +79,11 @@ export class DlcEnvironmentService extends EnvironmentService {
...
@@ -76,8 +79,11 @@ export class DlcEnvironmentService extends EnvironmentService {
environment
.
setStatus
(
'
SUCCEEDED
'
);
environment
.
setStatus
(
'
SUCCEEDED
'
);
break
;
break
;
case
'
FAILED
'
:
case
'
FAILED
'
:
// the job create failed,we will sleep(60) to create new job
await
setTimeout
(
60000
);
this
.
log
.
debug
(
`await 60s to create new job,DLC: job
${
environment
.
id
}
is failed!`
);
environment
.
setStatus
(
'
FAILED
'
);
environment
.
setStatus
(
'
FAILED
'
);
return
Promise
.
reject
(
`DLC: job
${
environment
.
envId
}
is failed!`
)
;
break
;
case
'
STOPPED
'
:
case
'
STOPPED
'
:
case
'
STOPPING
'
:
case
'
STOPPING
'
:
environment
.
setStatus
(
'
USER_CANCELED
'
);
environment
.
setStatus
(
'
USER_CANCELED
'
);
...
@@ -86,6 +92,8 @@ export class DlcEnvironmentService extends EnvironmentService {
...
@@ -86,6 +92,8 @@ export class DlcEnvironmentService extends EnvironmentService {
environment
.
setStatus
(
'
UNKNOWN
'
);
environment
.
setStatus
(
'
UNKNOWN
'
);
}
}
});
});
deferred
.
resolve
();
return
deferred
.
promise
;
}
}
public
async
startEnvironment
(
environment
:
EnvironmentInformation
):
Promise
<
void
>
{
public
async
startEnvironment
(
environment
:
EnvironmentInformation
):
Promise
<
void
>
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment