Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
410225b7
Unverified
Commit
410225b7
authored
Oct 31, 2025
by
sjtu_shenhai
Committed by
GitHub
Oct 31, 2025
Browse files
[Bug fix] Fix severe memory waste issue with torch.empty pin_memory (#12266)
parent
2c9aebea
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
12 additions
and
5 deletions
+12
-5
python/sglang/srt/mem_cache/memory_pool_host.py
python/sglang/srt/mem_cache/memory_pool_host.py
+12
-5
No files found.
python/sglang/srt/mem_cache/memory_pool_host.py
View file @
410225b7
...
@@ -238,12 +238,16 @@ class MHATokenToKVPoolHost(HostKVCache):
...
@@ -238,12 +238,16 @@ class MHATokenToKVPoolHost(HostKVCache):
raise
ValueError
(
f
"Unsupported layout:
{
self
.
layout
}
"
)
raise
ValueError
(
f
"Unsupported layout:
{
self
.
layout
}
"
)
self
.
token_stride_size
=
self
.
head_num
*
self
.
head_dim
*
self
.
dtype
.
itemsize
self
.
token_stride_size
=
self
.
head_num
*
self
.
head_dim
*
self
.
dtype
.
itemsize
self
.
layout_dim
=
self
.
token_stride_size
*
self
.
layer_num
self
.
layout_dim
=
self
.
token_stride_size
*
self
.
layer_num
return
torch
.
empty
(
buffer
=
torch
.
empty
(
dims
,
dims
,
dtype
=
self
.
dtype
,
dtype
=
self
.
dtype
,
device
=
self
.
device
,
device
=
self
.
device
,
pin_memory
=
self
.
pin_memory
,
)
)
if
self
.
pin_memory
:
torch
.
cuda
.
cudart
().
cudaHostRegister
(
buffer
.
data_ptr
(),
buffer
.
numel
()
*
buffer
.
element_size
(),
0
)
return
buffer
@
property
@
property
def
k_buffer
(
self
):
def
k_buffer
(
self
):
...
@@ -551,13 +555,16 @@ class MLATokenToKVPoolHost(HostKVCache):
...
@@ -551,13 +555,16 @@ class MLATokenToKVPoolHost(HostKVCache):
self
.
kv_lora_rank
+
self
.
qk_rope_head_dim
self
.
kv_lora_rank
+
self
.
qk_rope_head_dim
)
*
self
.
dtype
.
itemsize
)
*
self
.
dtype
.
itemsize
self
.
layout_dim
=
self
.
token_stride_size
*
self
.
layer_num
self
.
layout_dim
=
self
.
token_stride_size
*
self
.
layer_num
buffer
=
torch
.
empty
(
return
torch
.
empty
(
dims
,
dims
,
dtype
=
self
.
dtype
,
dtype
=
self
.
dtype
,
device
=
self
.
device
,
device
=
self
.
device
,
pin_memory
=
self
.
pin_memory
,
)
)
if
self
.
pin_memory
:
torch
.
cuda
.
cudart
().
cudaHostRegister
(
buffer
.
data_ptr
(),
buffer
.
numel
()
*
buffer
.
element_size
(),
0
)
return
buffer
def
load_to_device_per_layer
(
def
load_to_device_per_layer
(
self
,
device_pool
,
host_indices
,
device_indices
,
layer_id
,
io_backend
self
,
device_pool
,
host_indices
,
device_indices
,
layer_id
,
io_backend
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment