OpenDAS / text-generation-inference · Commits · c1fe28d6

Fixing more correctly the invalid drop of the batch. (#2498)

Unverified commit c1fe28d6, authored Sep 06, 2024 by Nicolas Patry, committed via GitHub on Sep 06, 2024. Parent: aaea212d

Showing 3 changed files with 58 additions and 50 deletions:

    backends/v3/src/backend.rs  +2  -3
    backends/v3/src/queue.rs    +54 -47
    backends/v3/src/radix.rs    +2  -0
backends/v3/src/backend.rs

@@ -122,7 +122,7 @@ impl Backend for BackendV3 {
 #[allow(clippy::too_many_arguments)]
 pub(crate) async fn batching_task(
     mut client: ShardedClient,
-    _waiting_served_ratio: f32,
+    waiting_served_ratio: f32,
     max_batch_prefill_tokens: u32,
     max_batch_total_tokens: u32,
     max_waiting_tokens: usize,
@@ -170,8 +170,7 @@ pub(crate) async fn batching_task(
             // Minimum batch size
             // TODO: temporarily disable to avoid incorrect deallocation +
             // reallocation when using prefix caching.
-            // Some((batch_size as f32 * waiting_served_ratio).floor() as usize)
-            None
+            Some((batch_size as f32 * waiting_served_ratio).floor() as usize)
         };

         let token_budget = max_batch_total_tokens.saturating_sub(batch_max_tokens);
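Note on the re-enabled heuristic: while the running batch is still fresh (waiting_tokens < max_waiting_tokens), the router only starts a new batch if it reaches a minimum size proportional to the current one. A minimal sketch of that rule as a standalone function; the helper name and the example ratio are illustrative, not code from this commit:

    // Minimal sketch, not the actual TGI code: restates the min_size rule
    // that the hunk above re-enables.
    fn min_batch_size(
        batch_size: usize,
        waiting_served_ratio: f32,
        waiting_tokens: usize,
        max_waiting_tokens: usize,
    ) -> Option<usize> {
        if waiting_tokens >= max_waiting_tokens {
            // Waited long enough: accept a new batch of any size.
            None
        } else {
            // Require the new batch to be a fraction of the running one,
            // e.g. batch_size = 10, ratio = 1.2 -> at least 12 entries.
            Some((batch_size as f32 * waiting_served_ratio).floor() as usize)
        }
    }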
backends/v3/src/queue.rs

@@ -252,17 +252,14 @@ impl State {
         let next_batch_span = info_span!(parent: None, "batch", batch_size = tracing::field::Empty);
         next_batch_span.follows_from(Span::current());

-        let mut batch_requests = Vec::with_capacity(self.entries.len());
-        let mut batch_entries =
-            IntMap::with_capacity_and_hasher(self.entries.len(), BuildNoHashHasher::default());
-
+        let mut batch = Vec::with_capacity(self.entries.len());
         let mut max_input_length = 0;
         let mut prefill_tokens: u32 = 0;
         let mut decode_tokens: u32 = 0;
         let mut max_blocks = 0;

         // Pop entries starting from the front of the queue
-        'entry_loop: while let Some((id, mut entry)) = self.entries.pop_front() {
+        'entry_loop: while let Some((id, entry)) = self.entries.pop_front() {
             // Filter entries where the response receiver was dropped (== entries where the request
             // was dropped by the client)
             if entry.response_tx.is_closed() {
@@ -276,7 +273,7 @@ impl State {
                     // We pad to max input length in the Python shards
                     // We need to take these padding tokens into the equation
                     max_input_length = max_input_length.max(entry.request.input_length);
-                    prefill_tokens = (batch_requests.len() + 1) as u32 * max_input_length;
+                    prefill_tokens = (batch.len() + 1) as u32 * max_input_length;

                     decode_tokens += entry.request.stopping_parameters.max_new_tokens;
                     let total_tokens = prefill_tokens + decode_tokens + self.speculate;
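The prefill_tokens line in this hunk accounts for padding: the Python shards pad every input to the longest one in the batch (see the comment above it), so the running prefill cost is entry count times max input length rather than the sum of individual lengths. A toy restatement of that accounting, with a hypothetical helper name:

    // Illustrative only: padded prefill cost under pad-to-max batching.
    fn padded_prefill_tokens(input_lengths: &[u32]) -> u32 {
        let max_len = input_lengths.iter().copied().max().unwrap_or(0);
        input_lengths.len() as u32 * max_len
    }

    fn main() {
        // Inputs of 10, 40, and 25 tokens pad out to 3 * 40 = 120 tokens,
        // not 75, which is what the budget check has to assume.
        assert_eq!(padded_prefill_tokens(&[10, 40, 25]), 120);
    }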
@@ -290,7 +287,7 @@ impl State {
                     }
                     None
                 }
-                Some(block_allocator) => {
+                Some(_block_allocator) => {
                     prefill_tokens += entry.request.input_length;
                     let max_new_tokens = match self.window_size {
                         None => entry.request.stopping_parameters.max_new_tokens,
@@ -324,23 +321,59 @@ impl State {
                         entry.request.input_ids.clone()
                     };

-                    match block_allocator.allocate(tokens, input_ids).await {
-                        None => {
-                            // Entry is over budget
-                            // Add it back to the front
-                            tracing::debug!("Over budget: not enough free blocks");
-                            self.entries.push_front((id, entry));
-                            break 'entry_loop;
-                        }
-                        Some(block_allocation) => {
-                            tracing::debug!("Allocation: {block_allocation:?}");
-                            max_blocks = max(max_blocks, block_allocation.blocks.len() as u32);
-                            Some(block_allocation)
-                        }
-                    }
-                }
-            };
+                    Some((tokens, input_ids))
+                }
+            };
+
+            batch.push((id, entry, block_allocation));
+            if Some(batch.len()) == max_size {
+                break;
+            }
+        }
+
+        // Empty batch
+        if batch.is_empty() {
+            tracing::debug!("Filterered out all entries");
+            return None;
+        }
+
+        // XXX We haven't allocated yet, so we're allowed to ditch the results.
+        // Check if our batch is big enough
+        if let Some(min_size) = min_size {
+            // Batch is too small
+            if batch.len() < min_size {
+                // Add back entries to the queue in the correct order
+                for (id, entry, _) in batch.into_iter().rev() {
+                    self.entries.push_front((id, entry));
+                }
+                return None;
+            }
+        }
+
+        let mut batch_requests = Vec::with_capacity(self.entries.len());
+        let mut batch_entries =
+            IntMap::with_capacity_and_hasher(self.entries.len(), BuildNoHashHasher::default());
+
+        for (id, mut entry, block_allocation) in batch {
+            let block_allocation = if let (Some((tokens, input_ids)), Some(block_allocator)) =
+                (block_allocation, &self.block_allocator)
+            {
+                match block_allocator.allocate(tokens, input_ids).await {
+                    None => {
+                        // Entry is over budget
+                        // Add it back to the front
+                        tracing::debug!("Over budget: not enough free blocks");
+                        self.entries.push_front((id, entry));
+                        break;
+                    }
+                    Some(block_allocation) => {
+                        tracing::debug!("Allocation: {block_allocation:?}");
+                        max_blocks = max(max_blocks, block_allocation.blocks.len() as u32);
+                        Some(block_allocation)
+                    }
+                }
+            } else {
+                None
+            };

             tracing::debug!("Accepting entry");
             // Create a new span to link the batch back to this entry
             let entry_batch_span = info_span!(parent: &entry.span, "infer");
@@ -400,32 +433,6 @@ impl State {
             entry.batch_time = Some(Instant::now());
             // Insert in batch_entries IntMap
             batch_entries.insert(id, entry);
-
-            // Check if max_size
-            if Some(batch_requests.len()) == max_size {
-                break;
-            }
-        }
-
-        // Empty batch
-        if batch_requests.is_empty() {
-            tracing::debug!("Filterered out all entries");
-            return None;
-        }
-
-        // Check if our batch is big enough
-        if let Some(min_size) = min_size {
-            // Batch is too small
-            if batch_requests.len() < min_size {
-                // Add back entries to the queue in the correct order
-                for r in batch_requests.into_iter().rev() {
-                    let id = r.id;
-                    let entry = batch_entries.remove(&id).unwrap();
-                    self.entries.push_front((id, entry));
-                }
-                return None;
-            }
-        }
+        }

         // Final batch size
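Taken together, the queue.rs hunks split next_batch into two phases: the pop loop now only records (tokens, input_ids) candidates, the empty and min_size checks run while nothing has been allocated (hence the "XXX We haven't allocated yet, so we're allowed to ditch the results." comment), and a second loop performs the actual block allocations for the survivors. This is what makes it safe for backend.rs to re-enable the waiting_served_ratio minimum: rejected batches are pushed back before any blocks exist, so there is no allocation to drop incorrectly. A condensed sketch of that control flow, with simplified stand-in types (the real code uses Entry, BlockAllocator, and IntMap); only the shape mirrors the diff:

    use std::collections::VecDeque;

    struct Candidate {
        tokens: u32,
    }

    fn next_batch(
        queue: &mut VecDeque<Candidate>,
        min_size: Option<usize>,
        token_budget: u32,
    ) -> Option<Vec<Candidate>> {
        // Phase 1: pop candidates without allocating anything, so pushing
        // them back is always safe.
        let mut batch = Vec::new();
        let mut used: u32 = 0;
        while let Some(candidate) = queue.pop_front() {
            if used + candidate.tokens > token_budget {
                // Over budget: return the candidate to the front and stop.
                queue.push_front(candidate);
                break;
            }
            used += candidate.tokens;
            batch.push(candidate);
        }

        if batch.is_empty() {
            return None;
        }

        // We haven't allocated yet, so we are allowed to ditch the results:
        // if the batch is too small, restore the queue in its original order.
        if let Some(min_size) = min_size {
            if batch.len() < min_size {
                for candidate in batch.into_iter().rev() {
                    queue.push_front(candidate);
                }
                return None;
            }
        }

        // Phase 2 (elided here): only now perform the real block
        // allocations for the surviving candidates.
        Some(batch)
    }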
backends/v3/src/radix.rs

@@ -89,6 +89,8 @@ impl Allocator for RadixAllocator {
         let suffix_blocks = (suffix_len + self.block_size - 1) / self.block_size;

+        tracing::info!("Prefix {prefix_len} - Suffix {suffix_len}");
+
         match self.alloc_or_reclaim(suffix_blocks as usize) {
             Some(suffix_blocks) => blocks.extend(suffix_blocks),
             None => {
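The added tracing::info! line logs the prefix/suffix split decided by the radix trie. As an aside, the suffix_blocks context line is the usual integer ceiling division, (n + d - 1) / d; a tiny self-contained check with a hypothetical helper name:

    // Illustrative only: blocks needed to hold suffix_len tokens.
    fn blocks_needed(suffix_len: u32, block_size: u32) -> u32 {
        (suffix_len + block_size - 1) / block_size
    }

    fn main() {
        assert_eq!(blocks_needed(1, 16), 1); // a single token still takes a block
        assert_eq!(blocks_needed(16, 16), 1); // exact fit
        assert_eq!(blocks_needed(17, 16), 2); // one extra token spills over
    }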