Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
1d65ec7e
Unverified
Commit
1d65ec7e
authored
Nov 15, 2024
by
Jee Jee Li
Committed by
GitHub
Nov 15, 2024
Browse files
[Bugfix] Fix fully sharded LoRA bug (#10352)
Signed-off-by:
Jee Jee Li
<
pandaleefree@gmail.com
>
parent
26908554
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
21 additions
and
19 deletions
+21
-19
vllm/lora/fully_sharded_layers.py
vllm/lora/fully_sharded_layers.py
+12
-11
vllm/lora/layers.py
vllm/lora/layers.py
+8
-7
vllm/worker/worker.py
vllm/worker/worker.py
+1
-1
No files found.
vllm/lora/fully_sharded_layers.py
View file @
1d65ec7e
...
@@ -165,15 +165,14 @@ class MergedColumnParallelLinearWithShardedLoRA(
...
@@ -165,15 +165,14 @@ class MergedColumnParallelLinearWithShardedLoRA(
def
slice_lora_a
(
def
slice_lora_a
(
self
,
lora_a
:
List
[
Union
[
torch
.
Tensor
,
None
]]
self
,
lora_a
:
List
[
Union
[
torch
.
Tensor
,
None
]]
)
->
List
[
Union
[
torch
.
Tensor
,
None
]]:
)
->
List
[
Union
[
torch
.
Tensor
,
None
]]:
if
lora_a
[
0
]
is
None
or
lora_a
[
1
]
is
None
:
#NOTE: lora_a contains 2 subloras, and each sublora could be None.
return
lora_a
output_shard_size
=
self
.
lora_a_stacked
[
0
].
shape
[
2
]
output_shard_size
=
self
.
lora_a_stacked
[
0
].
shape
[
2
]
output_start_idx
=
self
.
tp_rank
*
output_shard_size
output_start_idx
=
self
.
tp_rank
*
output_shard_size
lora_a
=
[
lora_a
=
[
lora_a
[
0
][:,
lora_a
[
0
][:,
output_start_idx
:
output_start_idx
+
output_s
t
ar
t_idx
:
output_start_idx
+
output_shard_size
]
,
output_s
h
ar
d_size
]
if
lora_a
[
0
]
is
not
None
else
None
,
lora_a
[
1
][:,
lora_a
[
1
][:,
output_start_idx
:
output_start_idx
+
output_s
t
ar
t_idx
:
output_start_idx
+
output_shard_size
]
,
output_s
h
ar
d_size
]
if
lora_a
[
1
]
is
not
None
else
None
,
]
]
return
lora_a
return
lora_a
...
@@ -261,14 +260,16 @@ class MergedQKVParallelLinearWithShardedLora(MergedQKVParallelLinearWithLora):
...
@@ -261,14 +260,16 @@ class MergedQKVParallelLinearWithShardedLora(MergedQKVParallelLinearWithLora):
def
slice_lora_a
(
def
slice_lora_a
(
self
,
lora_a
:
List
[
Union
[
torch
.
Tensor
,
None
]]
self
,
lora_a
:
List
[
Union
[
torch
.
Tensor
,
None
]]
)
->
List
[
Union
[
torch
.
Tensor
,
None
]]:
)
->
List
[
Union
[
torch
.
Tensor
,
None
]]:
if
lora_a
[
0
]
is
None
or
lora_a
[
1
]
is
None
or
lora_a
[
2
]
is
None
:
# NOTE: lora_a contains 3 subloras, and each sublora could be None.
return
lora_a
shard_size
=
[
self
.
lora_a_stacked
[
i
].
shape
[
2
]
for
i
in
range
(
3
)]
shard_size
=
[
self
.
lora_a_stacked
[
i
].
shape
[
2
]
for
i
in
range
(
3
)]
start_idx
=
[
self
.
tp_rank
*
shard_size
[
i
]
for
i
in
range
(
3
)]
start_idx
=
[
self
.
tp_rank
*
shard_size
[
i
]
for
i
in
range
(
3
)]
lora_a
=
[
lora_a
=
[
lora_a
[
0
][:,
start_idx
[
0
]:
start_idx
[
0
]
+
shard_size
[
0
]],
lora_a
[
0
][:,
start_idx
[
0
]:
start_idx
[
0
]
+
lora_a
[
1
][:,
start_idx
[
1
]:
start_idx
[
1
]
+
shard_size
[
1
]],
shard_size
[
0
]]
if
lora_a
[
0
]
is
not
None
else
None
,
lora_a
[
2
][:,
start_idx
[
2
]:
start_idx
[
2
]
+
shard_size
[
2
]],
lora_a
[
1
][:,
start_idx
[
1
]:
start_idx
[
1
]
+
shard_size
[
1
]]
if
lora_a
[
1
]
is
not
None
else
None
,
lora_a
[
2
][:,
start_idx
[
2
]:
start_idx
[
2
]
+
shard_size
[
2
]]
if
lora_a
[
2
]
is
not
None
else
None
,
]
]
return
lora_a
return
lora_a
...
...
vllm/lora/layers.py
View file @
1d65ec7e
...
@@ -685,26 +685,27 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
...
@@ -685,26 +685,27 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
def
slice_lora_b
(
def
slice_lora_b
(
self
,
lora_b
:
List
[
Union
[
torch
.
Tensor
,
None
]]
self
,
lora_b
:
List
[
Union
[
torch
.
Tensor
,
None
]]
)
->
List
[
Union
[
torch
.
Tensor
,
None
]]:
)
->
List
[
Union
[
torch
.
Tensor
,
None
]]:
if
lora_b
[
0
]
is
None
or
lora_b
[
1
]
is
None
:
#NOTE: lora_b contains 2 subloras, and each sublora could be None.
return
lora_b
shard_size
=
self
.
output_dim
shard_size
=
self
.
output_dim
start_idx
=
self
.
tp_rank
*
shard_size
start_idx
=
self
.
tp_rank
*
shard_size
end_idx
=
(
self
.
tp_rank
+
1
)
*
shard_size
end_idx
=
(
self
.
tp_rank
+
1
)
*
shard_size
lora_b
=
[
lora_b
=
[
lora_b
[
0
][:,
start_idx
:
end_idx
],
lora_b
[
0
][:,
start_idx
:
end_idx
]
if
lora_b
[
0
]
is
not
None
else
None
,
lora_b
[
1
][:,
start_idx
:
end_idx
],
lora_b
[
1
][:,
start_idx
:
end_idx
]
if
lora_b
[
1
]
is
not
None
else
None
,
]
]
return
lora_b
return
lora_b
def
slice_bias
(
def
slice_bias
(
self
,
bias
:
List
[
Union
[
torch
.
Tensor
,
self
,
bias
:
List
[
Union
[
torch
.
Tensor
,
None
]])
->
List
[
Union
[
torch
.
Tensor
,
None
]]:
None
]])
->
List
[
Union
[
torch
.
Tensor
,
None
]]:
if
bias
[
0
]
is
None
or
bias
[
1
]
is
None
:
# NOTE : each bias could be None.
return
bias
shard_size
=
self
.
output_dim
shard_size
=
self
.
output_dim
start_idx
=
self
.
tp_rank
*
shard_size
start_idx
=
self
.
tp_rank
*
shard_size
end_idx
=
(
self
.
tp_rank
+
1
)
*
shard_size
end_idx
=
(
self
.
tp_rank
+
1
)
*
shard_size
bias
=
[
bias
[
0
][
start_idx
:
end_idx
],
bias
[
1
][
start_idx
:
end_idx
]]
bias
=
[
bias
[
0
][
start_idx
:
end_idx
]
if
bias
[
0
]
is
not
None
else
None
,
bias
[
1
][
start_idx
:
end_idx
]
if
bias
[
1
]
is
not
None
else
None
]
return
bias
return
bias
def
set_lora
(
def
set_lora
(
...
...
vllm/worker/worker.py
View file @
1d65ec7e
...
@@ -232,7 +232,7 @@ class Worker(LocalOrDistributedWorkerBase):
...
@@ -232,7 +232,7 @@ class Worker(LocalOrDistributedWorkerBase):
logger
.
info
(
logger
.
info
(
"Memory profiling results: total_gpu_memory=%.2fGiB"
"Memory profiling results: total_gpu_memory=%.2fGiB"
" initial_memory_usage=%.2fGiB peak_torch_memory=%.2fGiB"
" initial_memory_usage=%.2fGiB peak_torch_memory=%.2fGiB"
" memory_usage_post_profile=%.2fGi
b
"
" memory_usage_post_profile=%.2fGi
B
"
" non_torch_memory=%.2fGiB kv_cache_size=%.2fGiB"
" non_torch_memory=%.2fGiB kv_cache_size=%.2fGiB"
" gpu_memory_utilization=%.2f"
,
total_gpu_memory
/
(
1024
**
3
),
" gpu_memory_utilization=%.2f"
,
total_gpu_memory
/
(
1024
**
3
),
(
total_gpu_memory
-
free_memory_pre_profile
)
/
(
1024
**
3
),
(
total_gpu_memory
-
free_memory_pre_profile
)
/
(
1024
**
3
),
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment