sglang · Commit 0f4fb19b

[Fix, LoRA] fix LoRA with updates in main (#1545)

Unverified commit 0f4fb19b, authored Sep 30, 2024 by Ying Sheng; committed via GitHub on Sep 30, 2024.
Parent: 63ba2f8d

Changes: 5 changed files with 31 additions and 23 deletions (+31 −23)
examples/runtime/lora.py           +7 −7
python/sglang/srt/lora/lora.py     +6 −6
python/sglang/srt/server_args.py  +10 −0
test/srt/models/test_lora.py       +7 −9
test/srt/run_suite.py              +1 −1
examples/runtime/lora.py

```diff
 # launch server
-# python -m sglang.launch_server --model mistralai/Mistral-7B-Instruct-v0.3 --lora-paths /home/ying/test_lora /home/ying/test_lora_1 /home/ying/test_lora_2 lora3=/home/ying/test_lora_3 lora4=/home/ying/test_lora_4 --disable-radix --disable-cuda-graph --max-loras-per-batch 4
+# python -m sglang.launch_server --model mistralai/Mistral-7B-Instruct-v0.3 --lora-paths /home/ying/test_lora lora1=/home/ying/test_lora_1 lora2=/home/ying/test_lora_2 --disable-radix --disable-cuda-graph --max-loras-per-batch 4

 # send requests
 # lora_path[i] specifies the LoRA used for text[i], so make sure they have the same length
...
@@ -22,12 +22,12 @@ json_data = {
     "sampling_params": {"max_new_tokens": 32},
     "lora_path": [
-        "/home/ying/test_lora",
-        "/home/ying/test_lora_1",
-        "/home/ying/test_lora_2",
-        "lora3",
-        "lora4",
+        "/home/ying/test_lora",
+        "/home/ying/test_lora_1",
+        "lora1",
+        "lora2",
+        "lora1",
+        "lora2",
         None,
         None,
     ],
 }
 response = requests.post(
...
```
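For context, `lora_path[i]` selects the adapter applied to `text[i]`, with `None` meaning the plain base model. A minimal end-to-end client sketch (assuming a server launched with the updated command above; the prompts, port, and pretty-printing are illustrative, not part of the commit):

```python
import json

import requests

url = "http://127.0.0.1:30000"  # default sglang server address

json_data = {
    # text[i] and lora_path[i] must have the same length and line up one-to-one.
    "text": [
        "Give me a short introduction to large language models.",
        "Tell me a story about a robot.",
    ],
    "sampling_params": {"max_new_tokens": 32},
    # "lora1" was registered at launch via lora1=/home/ying/test_lora_1;
    # None falls back to the base model for that prompt.
    "lora_path": ["lora1", None],
}
response = requests.post(url + "/generate", json=json_data)
print(json.dumps(response.json(), indent=2))
```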
python/sglang/srt/lora/lora.py

```diff
...
@@ -28,18 +28,18 @@ from typing import Any, Dict, List, Optional, Tuple
 import safetensors.torch
 import torch
 from torch import nn
-from vllm.model_executor.layers.linear import (
-    ColumnParallelLinear,
-    MergedColumnParallelLinear,
-    QKVParallelLinear,
-    RowParallelLinear,
-)
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
 )
 from vllm.model_executor.model_loader.loader import DefaultModelLoader

+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
...
```
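The only change here is the import source: the parallel linear classes (ColumnParallelLinear, MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) now come from sglang's own `sglang.srt.layers.linear` module rather than `vllm.model_executor.layers.linear`, apparently tracking updates on main that brought these layers in-tree. The class names are unchanged, so no call sites in the LoRA code need to change.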
python/sglang/srt/server_args.py

```diff
...
@@ -594,6 +594,16 @@ class ServerArgs:
                 "Please use sglang<=0.3.2 or wait for later updates."
             )
+        if isinstance(self.lora_paths, list):
+            lora_paths = self.lora_paths
+            self.lora_paths = {}
+            for lora_path in lora_paths:
+                if "=" in lora_path:
+                    name, path = lora_path.split("=", 1)
+                    self.lora_paths[name] = path
+                else:
+                    self.lora_paths[lora_path] = lora_path
+

 def prepare_server_args(argv: List[str]) -> ServerArgs:
     """
...
```
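The added block normalizes `--lora-paths` from the CLI list form into a name → path dict: a `name=path` entry is split on the first `=`, while a bare path uses the path itself as the adapter name. A standalone sketch of the same logic (the function name `normalize_lora_paths` is ours, for illustration only):

```python
from typing import Dict, List


def normalize_lora_paths(lora_paths: List[str]) -> Dict[str, str]:
    """Mirror the ServerArgs post-processing: list of CLI entries -> {name: path}."""
    normalized: Dict[str, str] = {}
    for lora_path in lora_paths:
        if "=" in lora_path:
            # "lora1=/home/ying/test_lora_1" -> name "lora1", path "/home/ying/test_lora_1"
            name, path = lora_path.split("=", 1)
            normalized[name] = path
        else:
            # Bare path: the path doubles as the adapter name.
            normalized[lora_path] = lora_path
    return normalized


print(normalize_lora_paths(["/home/ying/test_lora", "lora1=/home/ying/test_lora_1"]))
# {'/home/ying/test_lora': '/home/ying/test_lora', 'lora1': '/home/ying/test_lora_1'}
```

Splitting on the first `=` only (`split("=", 1)`) keeps any later `=` characters inside the path intact.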
test/srt/models/test_lora.py

```diff
...
@@ -97,9 +97,7 @@ class TestLoRA(unittest.TestCase):
         )
         with HFRunner(
-            base_path,
-            torch_dtype=torch_dtype,
-            is_generation=True,
+            base_path, torch_dtype=torch_dtype, model_type="generation"
         ) as hf_runner:
             hf_outputs = hf_runner.forward(
                 prompts, max_new_tokens=max_new_tokens, lora_paths=batch_lora_paths
...
@@ -108,7 +106,7 @@ class TestLoRA(unittest.TestCase):
         with HFRunner(
             base_path,
             torch_dtype=torch_dtype,
-            is_generation=True,
+            model_type="generation",
         ) as hf_runner:
             hf_no_lora_outputs = hf_runner.forward(
                 prompts, max_new_tokens=max_new_tokens
...
@@ -118,7 +116,7 @@ class TestLoRA(unittest.TestCase):
             base_path,
             tp_size=tp_size,
             torch_dtype=torch_dtype,
-            is_generation=True,
+            model_type="generation",
         ) as srt_runner:
             srt_no_lora_outputs = srt_runner.forward(
                 prompts, max_new_tokens=max_new_tokens
...
@@ -198,7 +196,7 @@ class TestLoRA(unittest.TestCase):
             base_path,
             tp_size=tp_size,
             torch_dtype=torch_dtype,
-            is_generation=True,
+            model_type="generation",
             lora_paths=all_lora_paths,
             max_loras_per_batch=3,
             disable_cuda_graph=True,
...
@@ -211,7 +209,7 @@ class TestLoRA(unittest.TestCase):
         with HFRunner(
             base_path,
             torch_dtype=torch_dtype,
-            is_generation=True,
+            model_type="generation",
             output_str_only=True,
         ) as hf_runner:
             hf_outputs = hf_runner.forward(
...
@@ -237,7 +235,7 @@ class TestLoRA(unittest.TestCase):
             base_path,
             tp_size=tp_size,
             torch_dtype=torch_dtype,
-            is_generation=True,
+            model_type="generation",
         ) as srt_runner:
             srt_no_lora_outputs = srt_runner.forward(
                 prompts, max_new_tokens=max_new_tokens
...
@@ -247,7 +245,7 @@ class TestLoRA(unittest.TestCase):
             base_path,
             tp_size=tp_size,
             torch_dtype=torch_dtype,
-            is_generation=True,
+            model_type="generation",
             lora_paths=all_lora_paths,
         ) as srt_runner:
             srt_outputs = srt_runner.forward(
...
```
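All seven hunks are the same mechanical update: the test runners' boolean `is_generation=True` flag was replaced on main by a string `model_type="generation"` parameter. A minimal sketch of the new-style call (the `sglang.test.runners` import path is assumed, and the model name and dtype are illustrative):

```python
import torch

from sglang.test.runners import HFRunner  # assumed import path for the shared test runners

base_path = "mistralai/Mistral-7B-Instruct-v0.3"  # illustrative base model
prompts = ["Give me a short introduction to large language models."]

# Old style (pre-#1545 tests): HFRunner(base_path, torch_dtype=..., is_generation=True)
# New style used by the updated tests:
with HFRunner(base_path, torch_dtype=torch.float16, model_type="generation") as hf_runner:
    hf_outputs = hf_runner.forward(prompts, max_new_tokens=32)
```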
test/srt/run_suite.py

```diff
...
@@ -7,7 +7,7 @@ suites = {
     "minimal": [
         "models/test_embedding_models.py",
         "models/test_generation_models.py",
-        # "models/test_lora.py",
+        "models/test_lora.py",
         "models/test_reward_models.py",
         "sampling/penaltylib",
         "test_chunked_prefill.py",
...
```