Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
text-generation-inference
Commits
2b53d719
Unverified
Commit
2b53d719
authored
Jun 30, 2023
by
OlivierDehaene
Committed by
GitHub
Jun 30, 2023
Browse files
fix(launcher): fix issue where launcher does not properly report shard failures (#522)
parent
ecf6dc3a
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
37 additions
and
21 deletions
+37
-21
launcher/src/main.rs
launcher/src/main.rs
+37
-21
No files found.
launcher/src/main.rs
View file @
2b53d719
...
...
@@ -6,8 +6,7 @@ use std::io::{BufRead, BufReader, Read};
use
std
::
path
::
Path
;
use
std
::
sync
::
atomic
::{
AtomicBool
,
Ordering
};
use
std
::
sync
::
mpsc
::
TryRecvError
;
use
std
::
sync
::
Arc
;
use
std
::
sync
::{
mpsc
,
Mutex
};
use
std
::
sync
::{
mpsc
,
Arc
};
use
std
::
thread
;
use
std
::
thread
::
sleep
;
use
std
::
time
::{
Duration
,
Instant
};
...
...
@@ -274,7 +273,7 @@ struct Args {
#[derive(Debug)]
enum
ShardStatus
{
Ready
,
Failed
((
usize
,
String
)),
Failed
((
usize
,
Option
<
String
>
)),
}
#[allow(clippy::too_many_arguments)]
...
...
@@ -296,7 +295,7 @@ fn shard_manager(
watermark_delta
:
Option
<
f32
>
,
otlp_endpoint
:
Option
<
String
>
,
status_sender
:
mpsc
::
Sender
<
ShardStatus
>
,
shutdown
:
Arc
<
Mutex
<
b
ool
>
>
,
shutdown
:
Arc
<
AtomicB
ool
>
,
_
shutdown_sender
:
mpsc
::
Sender
<
()
>
,
)
{
// Get UDS path
...
...
@@ -433,20 +432,20 @@ fn shard_manager(
}
}
status_sender
.send
(
ShardStatus
::
Failed
((
rank
,
err
.to_string
())))
.send
(
ShardStatus
::
Failed
((
rank
,
Some
(
err
.to_string
())))
)
.unwrap
();
return
;
}
};
// Redirect STDOUT to the console
let
shard_stdout
=
p
.stdout
.take
()
.unwrap
();
let
shard_stdout_reader
=
BufReader
::
new
(
p
.stdout
.take
()
.unwrap
());
let
mut
shard_stderr_reader
=
BufReader
::
new
(
p
.stderr
.take
()
.unwrap
());
thread
::
spawn
(
move
||
{
// Enter shard-manager tracing span
let
stdout
=
BufReader
::
new
(
shard_stdout
);
let
_
span
=
tracing
::
span!
(
tracing
::
Level
::
INFO
,
"shard-manager"
,
rank
=
rank
)
.entered
();
for
line
in
s
tdout
.lines
()
{
for
line
in
s
hard_stdout_reader
.lines
()
{
// Parse loguru logs
if
let
Ok
(
log
)
=
serde_json
::
from_str
::
<
PythonLogMessage
>
(
&
line
.unwrap
())
{
log
.trace
();
...
...
@@ -460,8 +459,22 @@ fn shard_manager(
loop
{
// Process exited
if
let
Some
(
exit_status
)
=
p
.poll
()
{
// We read stderr in another thread as it seems that `read_to_string` can block
// indefinitely in some cases
let
(
err_sender
,
err_receiver
)
=
mpsc
::
channel
();
thread
::
spawn
(
move
||
{
let
mut
err
=
String
::
new
();
p
.stderr
.take
()
.unwrap
()
.read_to_string
(
&
mut
err
)
.unwrap
();
shard_stderr_reader
.read_to_string
(
&
mut
err
)
.unwrap
();
err_sender
.send
(
err
)
.unwrap_or
(());
});
let
err
=
err_receiver
.recv_timeout
(
Duration
::
from_millis
(
100
))
.map_err
(|
err
|
{
tracing
::
error!
(
"Unable to read shard {rank} error from stderr"
);
err
})
.ok
();
if
let
ExitStatus
::
Signaled
(
signal
)
=
exit_status
{
tracing
::
error!
(
"Shard process was signaled to shutdown with signal {signal}"
);
...
...
@@ -474,7 +487,7 @@ fn shard_manager(
}
// We received a shutdown signal
if
*
shutdown
.lo
ck
()
.unwrap
(
)
{
if
shutdown
.lo
ad
(
Ordering
::
SeqCst
)
{
p
.kill
()
.unwrap
();
let
_
=
p
.wait_timeout
(
Duration
::
from_secs
(
90
));
tracing
::
info!
(
"Shard {rank} terminated"
);
...
...
@@ -494,14 +507,11 @@ fn shard_manager(
}
}
fn
shutdown_shards
(
shutdown
:
Arc
<
Mutex
<
b
ool
>
>
,
shutdown_receiver
:
&
mpsc
::
Receiver
<
()
>
)
{
fn
shutdown_shards
(
shutdown
:
Arc
<
AtomicB
ool
>
,
shutdown_receiver
:
&
mpsc
::
Receiver
<
()
>
)
{
tracing
::
info!
(
"Shutting down shards"
);
// Update shutdown value to true
// This will be picked up by the shard manager
{
let
mut
shutdown
=
shutdown
.lock
()
.unwrap
();
*
shutdown
=
true
;
}
shutdown
.store
(
true
,
Ordering
::
SeqCst
);
// Wait for shards to shutdown
// This will block till all shutdown_sender are dropped
...
...
@@ -743,7 +753,7 @@ fn download_convert_model(args: &Args, running: Arc<AtomicBool>) -> Result<(), L
fn
spawn_shards
(
num_shard
:
usize
,
args
:
&
Args
,
shutdown
:
Arc
<
Mutex
<
b
ool
>
>
,
shutdown
:
Arc
<
AtomicB
ool
>
,
shutdown_receiver
:
&
mpsc
::
Receiver
<
()
>
,
shutdown_sender
:
mpsc
::
Sender
<
()
>
,
status_receiver
:
&
mpsc
::
Receiver
<
ShardStatus
>
,
...
...
@@ -819,7 +829,10 @@ fn spawn_shards(
sleep
(
Duration
::
from_millis
(
100
));
}
Ok
(
ShardStatus
::
Failed
((
rank
,
err
)))
=>
{
tracing
::
error!
(
"Shard {} failed to start:
\n
{}"
,
rank
,
err
);
tracing
::
error!
(
"Shard {rank} failed to start"
);
if
let
Some
(
err
)
=
err
{
tracing
::
error!
(
"{err}"
);
}
shutdown_shards
(
shutdown
,
shutdown_receiver
);
return
Err
(
LauncherError
::
ShardCannotStart
);
}
...
...
@@ -835,7 +848,7 @@ fn spawn_shards(
fn
spawn_webserver
(
args
:
Args
,
shutdown
:
Arc
<
Mutex
<
b
ool
>
>
,
shutdown
:
Arc
<
AtomicB
ool
>
,
shutdown_receiver
:
&
mpsc
::
Receiver
<
()
>
,
)
->
Result
<
Popen
,
LauncherError
>
{
// All shard started
...
...
@@ -1002,7 +1015,7 @@ fn main() -> Result<(), LauncherError> {
download_convert_model
(
&
args
,
running
.clone
())
?
;
// Shared shutdown bool
let
shutdown
=
Arc
::
new
(
Mutex
::
new
(
false
));
let
shutdown
=
Arc
::
new
(
AtomicBool
::
new
(
false
));
// Shared shutdown channel
// When shutting down, the main thread will wait for all senders to be dropped
let
(
shutdown_sender
,
shutdown_receiver
)
=
mpsc
::
channel
();
...
...
@@ -1034,7 +1047,10 @@ fn main() -> Result<(), LauncherError> {
while
running
.load
(
Ordering
::
SeqCst
)
{
if
let
Ok
(
ShardStatus
::
Failed
((
rank
,
err
)))
=
status_receiver
.try_recv
()
{
tracing
::
error!
(
"Shard {rank} failed:
\n
{err}"
);
tracing
::
error!
(
"Shard {rank} failed to start"
);
if
let
Some
(
err
)
=
err
{
tracing
::
error!
(
"{err}"
);
}
exit_code
=
Err
(
LauncherError
::
ShardFailed
);
break
;
};
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment