OpenDAS / text-generation-inference · Commits

Commit c4bb5264 (unverified)
Authored Jul 06, 2023 by OlivierDehaene; committed via GitHub on Jul 06, 2023

fix(server): decrease memory fragmentation (#557)

Parent: 6f429427
Showing 2 changed files with 22 additions and 10 deletions (+22 −10)

server/text_generation_server/cache.py (+4 −0)
server/text_generation_server/models/flash_causal_lm.py (+18 −10)
server/text_generation_server/cache.py
import torch

from typing import Dict, Optional, TypeVar

from text_generation_server.models.types import Batch

...

@@ -20,6 +22,8 @@ class Cache:
        batch = self.pop(batch_id)
        if batch is not None:
            del batch
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    def clear(self):
        keys = list(self.cache.keys())

...
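The change in Cache.delete pairs dropping the last reference to a finished batch with torch.cuda.empty_cache(), which asks PyTorch's caching allocator to hand its now-unused blocks back to CUDA instead of keeping them reserved, where later, differently-sized allocations would fragment them. A minimal standalone sketch of the same pattern (the release helper and the tensor shape are illustrative, not part of the commit):

import torch


def release(batch: dict) -> None:
    # Drop the Python references so the tensor storages become unreachable.
    batch.clear()
    # Ask the caching allocator to return its now-unused cached blocks to
    # CUDA, rather than keeping them reserved where differently-sized
    # later allocations would fragment them.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


if torch.cuda.is_available():
    batch = {"input_ids": torch.zeros(8, 2048, device="cuda")}
    print(torch.cuda.memory_reserved())  # non-zero: freed blocks stay cached
    release(batch)
    print(torch.cuda.memory_reserved())  # reserved memory is handed back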
server/text_generation_server/models/flash_causal_lm.py
...

@@ -638,6 +638,8 @@ class FlashCausalLMBatch(Batch):
        # Needed to avoid dropping blocks when the batches will go out of scope
        for b in batches:
            b.block_tables = None
            del b

        torch.cuda.empty_cache()

        return FlashCausalLMBatch(
            batch_id=batches[0].batch_id,

...
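In concatenate, each source batch is dereferenced once its contents have been copied into the merged tensors (block_tables is nulled first so the blocks themselves survive the batch objects, per the comment above), and the allocator cache is flushed before the merged FlashCausalLMBatch is constructed. A rough standalone illustration of that ordering (a hypothetical merge over plain tensors, not the library's API):

import torch


def merge(parts: list[torch.Tensor]) -> torch.Tensor:
    # Copy the sources into the merged tensor first...
    merged = torch.cat(parts)
    # ...then drop the source references and flush the allocator cache,
    # so the next large allocation draws from freshly returned memory
    # instead of being squeezed around stale cached blocks.
    parts.clear()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    return merged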
@@ -732,6 +734,7 @@ class FlashCausalLM(Model):
            )
            raise e

        del batch
        torch.cuda.empty_cache()

    def decode(self, generated_ids: Union[torch.Tensor, List[int]]) -> str:
        return self.tokenizer.decode(

...
@@ -775,6 +778,7 @@ class FlashCausalLM(Model):
        # Allocate blocks to this batch
        CACHE_MANAGER.allocate(batch)

        try:
            out = self.forward(
                batch.input_ids,
                batch.position_ids,

...
@@ -785,6 +789,10 @@ class FlashCausalLM(Model):
                batch.max_seqlen,
                batch.prefill_head_indices,
            )
        except Exception as e:
            del batch
            torch.cuda.empty_cache()
            raise e

        if prefill:
            next_token_logits = (

...
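generate_token's forward pass is now wrapped in try/except: if it raises (typically a CUDA out-of-memory error on an oversized batch), the batch is deleted and the allocator cache flushed before the exception propagates, so a failed step does not leave its memory reserved. A minimal sketch of that guard, with a hypothetical run_forward standing in for the model call:

import torch


def run_forward(inputs: torch.Tensor) -> torch.Tensor:
    # Hypothetical stand-in for the model's forward pass.
    return inputs * 2


def guarded_step(inputs: torch.Tensor) -> torch.Tensor:
    try:
        return run_forward(inputs)
    except Exception:
        # Free this step's tensors and return the cached blocks to CUDA
        # before re-raising, so a failure on one request does not leave
        # the allocator fragmented for the requests that follow.
        del inputs
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        raise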