Commit f2d68ded (unverified)
Authored Aug 03, 2025 by Baizhou Zhang; committed by GitHub, Aug 03, 2025

Rename lora_path to lora_id in batches (#8437)
Parent: 3b87a9e8

Showing 8 changed files with 27 additions and 29 deletions (+27 −29):
python/sglang/srt/lora/lora_manager.py                  +3 −7
python/sglang/srt/managers/io_struct.py                 +4 −2
python/sglang/srt/managers/schedule_batch.py            +4 −4
python/sglang/srt/managers/scheduler.py                 +4 −4
python/sglang/srt/managers/tokenizer_manager.py         +3 −3
python/sglang/srt/model_executor/cuda_graph_runner.py   +6 −6
python/sglang/srt/model_executor/forward_batch_info.py  +2 −2
python/sglang/srt/two_batch_overlap.py                  +1 −1
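The commit separates two notions that previously shared the `lora_path` name: the user-facing adapter name, kept in the public API for backward compatibility, and the unique internal ID that batches now carry as `lora_id`. As a rough illustration of the name-to-ID mapping the tokenizer manager relies on, here is a minimal toy registry; it is a sketch only, not sglang's actual LoRARegistry:

import uuid

class ToyLoRARegistry:
    # Maps user-facing adapter names to unique internal IDs.
    def __init__(self):
        self._name_to_id = {}

    def register(self, name: str) -> str:
        # Each loaded adapter gets a fresh unique ID, so reloading an adapter
        # under the same name cannot be confused with the old weights.
        uid = uuid.uuid4().hex
        self._name_to_id[name] = uid
        return uid

    def lookup(self, name: str) -> str:
        return self._name_to_id[name]

registry = ToyLoRARegistry()
registry.register("my-adapter")
lora_id = registry.lookup("my-adapter")  # what batches now carry as lora_id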
python/sglang/srt/lora/lora_manager.py

@@ -191,11 +191,7 @@ class LoRAManager:
     def prepare_lora_batch(self, forward_batch: ForwardBatch):
         # Load active loras into lora memory pool
-        # TODO (lifuhuang): The naming of `forward_batch.lora_paths` is confusing. It actually contains a set of unique
-        # LoRA IDs, not LoRA paths. While unfortunately we cannot change the name in API for backward compatibility, we
-        # should consider (1) renaming the incorrect usage within the system, and (2) deprecating the parameter name in
-        # the current API schema and introducing a better request schema in the future (e.g., use `model_name`).
-        cur_uids = set(forward_batch.lora_paths)
+        cur_uids = set(forward_batch.lora_ids)
         assert len(cur_uids) <= self.max_loras_per_batch
         self.memory_pool.prepare_lora_batch(cur_uids, self.loras, self.lora_modules)

@@ -211,10 +207,10 @@ class LoRAManager:
         Transfer adapter metadata (weight indices, LoRA rank, scalings) from host
         to device (CUDA) asynchronously.
         """
-        weight_indices = [0] * len(forward_batch.lora_paths)
+        weight_indices = [0] * len(forward_batch.lora_ids)
         lora_ranks = [0] * self.max_loras_per_batch
         scalings = [0] * self.max_loras_per_batch
-        for i, uid in enumerate(forward_batch.lora_paths):
+        for i, uid in enumerate(forward_batch.lora_ids):
             weight_indices[i] = self.memory_pool.get_buffer_id(uid)
             if uid is not None:
                 lora = self.loras[uid]
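To make the metadata-transfer loop above concrete, here is a self-contained sketch of the same pattern; buffer_id_of, rank_of, and scaling_of are hypothetical stand-ins for the memory pool and adapter lookups, not sglang's real structures:

from typing import List, Optional

def build_lora_metadata(
    lora_ids: List[Optional[str]],   # one entry per request; None = base model
    buffer_id_of: dict,              # adapter ID -> slot in the LoRA memory pool
    rank_of: dict,                   # adapter ID -> LoRA rank
    scaling_of: dict,                # adapter ID -> alpha/rank scaling
    max_loras_per_batch: int,
):
    weight_indices = [0] * len(lora_ids)
    lora_ranks = [0] * max_loras_per_batch
    scalings = [0.0] * max_loras_per_batch
    for i, uid in enumerate(lora_ids):
        slot = buffer_id_of[uid]
        weight_indices[i] = slot
        if uid is not None:
            lora_ranks[slot] = rank_of[uid]
            scalings[slot] = scaling_of[uid]
    return weight_indices, lora_ranks, scalings

# Two requests share one adapter; one request uses the base model (None).
print(build_lora_metadata(
    ["a1", None, "a1"],
    buffer_id_of={"a1": 1, None: 0},
    rank_of={"a1": 16},
    scaling_of={"a1": 2.0},
    max_loras_per_batch=4,
))  # ([1, 0, 1], [0, 16, 0, 0], [0.0, 2.0, 0.0, 0.0])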
python/sglang/srt/managers/io_struct.py

@@ -101,8 +101,10 @@ class GenerateReqInput:
     # The modalities of the image data [image, multi-images, video]
     modalities: Optional[List[str]] = None
-    # The path to the LoRA
+    # The path to the LoRA adaptors
     lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
+    # The uid of LoRA adaptors, should be initialized by tokenizer manager
+    lora_id: Optional[Union[List[Optional[str]], Optional[str]]] = None
     # Session info for continual prompting
     session_params: Optional[Union[List[Dict], Dict]] = None

@@ -500,7 +502,7 @@ class TokenizedGenerateReqInput:
     stream: bool
     # LoRA related
-    lora_path: Optional[str] = None  # None means just use the base model
+    lora_id: Optional[str] = None  # None means just use the base model
     # The input embeds
     input_embeds: Optional[Union[List[List[List[float]]], List[List[float]]]] = None
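A minimal sketch of the resulting request schema: lora_path stays user-facing, while lora_id starts empty and is filled in by the tokenizer manager before dispatch. Toy dataclass for illustration, not the real GenerateReqInput:

from dataclasses import dataclass
from typing import List, Optional, Union

@dataclass
class ToyGenerateReqInput:
    # The path to the LoRA adaptors (user-facing; kept for backward compatibility)
    lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
    # The uid of LoRA adaptors, filled in by the tokenizer manager
    lora_id: Optional[Union[List[Optional[str]], Optional[str]]] = None

req = ToyGenerateReqInput(lora_path="my-adapter")
assert req.lora_id is None  # unresolved until the tokenizer manager acquires an ID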
python/sglang/srt/managers/schedule_batch.py

@@ -423,7 +423,7 @@ class Req:
         token_ids_logprob: List[int] = None,
         stream: bool = False,
         origin_input_ids_unpadded: Optional[Tuple[int]] = None,
-        lora_path: Optional[str] = None,
+        lora_id: Optional[str] = None,
         input_embeds: Optional[List[List[float]]] = None,
         token_type_ids: List[int] = None,
         session_id: Optional[str] = None,

@@ -467,7 +467,7 @@ class Req:
         self.sampling_params = sampling_params
         self.custom_logit_processor = custom_logit_processor
         self.return_hidden_states = return_hidden_states
-        self.lora_path = lora_path
+        self.lora_id = lora_id

         # Memory pool info
         self.req_pool_idx: Optional[int] = None

@@ -1750,7 +1750,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
             encoder_lens=self.encoder_lens,
             encoder_lens_cpu=self.encoder_lens_cpu,
             encoder_out_cache_loc=self.encoder_out_cache_loc,
-            lora_paths=[req.lora_path for req in self.reqs],
+            lora_ids=[req.lora_id for req in self.reqs],
             sampling_info=self.sampling_info,
             input_embeds=self.input_embeds,
             token_type_ids=self.token_type_ids,

@@ -1891,7 +1891,7 @@ class ModelWorkerBatch:
     encoder_out_cache_loc: Optional[torch.Tensor]

     # For LoRA
-    lora_paths: Optional[List[str]]
+    lora_ids: Optional[List[str]]

     # Sampling info
     sampling_info: SamplingBatchInfo
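The per-request to per-batch hand-off above reduces to a list comprehension over the batch's requests; a toy sketch (the Req and ModelWorkerBatch stand-ins here are assumptions):

from dataclasses import dataclass
from typing import List, Optional

@dataclass
class ToyReq:
    rid: str
    lora_id: Optional[str] = None  # None means the request uses the base model

@dataclass
class ToyModelWorkerBatch:
    lora_ids: Optional[List[Optional[str]]]

reqs = [ToyReq("r0", "a1"), ToyReq("r1", None)]
batch = ToyModelWorkerBatch(lora_ids=[req.lora_id for req in reqs])
print(batch.lora_ids)  # ['a1', None]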
python/sglang/srt/managers/scheduler.py

@@ -1090,7 +1090,7 @@ class Scheduler(
             top_logprobs_num=recv_req.top_logprobs_num,
             token_ids_logprob=recv_req.token_ids_logprob,
             stream=recv_req.stream,
-            lora_path=recv_req.lora_path,
+            lora_id=recv_req.lora_id,
             input_embeds=recv_req.input_embeds,
             custom_logit_processor=recv_req.custom_logit_processor,
             return_hidden_states=recv_req.return_hidden_states,

@@ -1534,7 +1534,7 @@ class Scheduler(
             self.chunked_req = adder.add_chunked_req(self.chunked_req)

         if self.enable_lora:
-            lora_set = set([req.lora_path for req in self.running_batch.reqs])
+            lora_set = set([req.lora_id for req in self.running_batch.reqs])

         # Get requests from the waiting queue to a new prefill batch
         for req in self.waiting_queue:

@@ -1542,8 +1542,8 @@ class Scheduler(
                 self.enable_lora
                 and len(
                     lora_set
-                    | set([req.lora_path for req in adder.can_run_list])
-                    | set([req.lora_path])
+                    | set([req.lora_id for req in adder.can_run_list])
+                    | set([req.lora_id])
                 )
                 > self.max_loras_per_batch
             ):
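The admission check in the last hunk reads: defer a waiting request if the union of adapters already running, adapters already admitted this round, and the request's own adapter would exceed the per-batch budget. A standalone sketch of that predicate (the function name is assumed):

from typing import List, Optional

def would_exceed_lora_budget(
    running_ids: List[Optional[str]],
    can_run_ids: List[Optional[str]],
    new_id: Optional[str],
    max_loras_per_batch: int,
) -> bool:
    lora_set = set(running_ids)
    return len(lora_set | set(can_run_ids) | {new_id}) > max_loras_per_batch

print(would_exceed_lora_budget(["a1"], ["a2"], "a3", max_loras_per_batch=2))  # True
print(would_exceed_lora_budget(["a1"], ["a1"], None, max_loras_per_batch=2))  # False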
python/sglang/srt/managers/tokenizer_manager.py

@@ -556,7 +556,7 @@ class TokenizerManager:
         if self.server_args.enable_lora and obj.lora_path:
             # Start tracking ongoing requests for LoRA adapters and replace the user-friendly LoRA names in
             # `lora_path` with their corresponding unique LoRA IDs, as required for internal processing.
-            obj.lora_path = await self.lora_registry.acquire(obj.lora_path)
+            obj.lora_id = await self.lora_registry.acquire(obj.lora_path)

         self._validate_one_request(obj, input_ids)
         return self._create_tokenized_object(

@@ -665,7 +665,7 @@ class TokenizerManager:
             bootstrap_host=obj.bootstrap_host,
             bootstrap_port=obj.bootstrap_port,
             bootstrap_room=obj.bootstrap_room,
-            lora_path=obj.lora_path,
+            lora_id=obj.lora_id,
             input_embeds=input_embeds,
             session_params=session_params,
             custom_logit_processor=obj.custom_logit_processor,

@@ -773,7 +773,7 @@ class TokenizerManager:
                 # Mark ongoing LoRA request as finished.
                 if self.server_args.enable_lora and obj.lora_path:
-                    await self.lora_registry.release(obj.lora_path)
+                    await self.lora_registry.release(obj.lora_id)

                 # Check if this was an abort/error created by scheduler
                 if isinstance(out["meta_info"].get("finish_reason"), dict):
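The acquire/release pair above brackets a request's lifetime so an adapter is never unloaded while requests still reference it. A toy async sketch of that lifecycle, assuming a simplified reference-counting registry rather than sglang's real one:

import asyncio
from types import SimpleNamespace

class ToyAsyncRegistry:
    def __init__(self, name_to_id):
        self._name_to_id = name_to_id
        self._inflight = {uid: 0 for uid in name_to_id.values()}

    async def acquire(self, name):
        uid = self._name_to_id[name]
        self._inflight[uid] += 1   # adapter must stay resident while > 0
        return uid

    async def release(self, uid):
        self._inflight[uid] -= 1   # eviction becomes legal once this hits 0

async def handle_generate(registry, obj):
    # Mirrors the tokenizer-manager flow: resolve the friendly name to an ID
    # on entry, and mark the request finished when the response completes.
    obj.lora_id = await registry.acquire(obj.lora_path)
    try:
        pass  # tokenize, dispatch to scheduler, stream outputs ...
    finally:
        await registry.release(obj.lora_id)

obj = SimpleNamespace(lora_path="my-adapter", lora_id=None)
asyncio.run(handle_generate(ToyAsyncRegistry({"my-adapter": "uid-1"}), obj))
print(obj.lora_id)  # uid-1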
python/sglang/srt/model_executor/cuda_graph_runner.py

@@ -576,11 +576,11 @@ class CudaGraphRunner:
         )

         if self.model_runner.server_args.enable_lora:
-            # It is safe to capture CUDA graph using empty LoRA path, as the LoRA kernels will always be launched whenever
-            # `--enable-lora` is set to True (and return immediately if the LoRA path is empty for perf optimization).
-            lora_paths = [None] * bs
+            # It is safe to capture CUDA graph using empty LoRA id, as the LoRA kernels will always be launched whenever
+            # `--enable-lora` is set to True (and return immediately if the LoRA id is empty for perf optimization).
+            lora_ids = [None] * bs
         else:
-            lora_paths = None
+            lora_ids = None

         forward_batch = ForwardBatch(
             forward_mode=self.capture_forward_mode,

@@ -607,11 +607,11 @@ class CudaGraphRunner:
             capture_hidden_mode=self.capture_hidden_mode,
             num_token_non_padded=self.num_token_non_padded,
             global_forward_mode=self.capture_forward_mode,
-            lora_paths=lora_paths,
+            lora_ids=lora_ids,
         )
         self.tbo_plugin.capture_one_batch_size(forward_batch, num_tokens=num_tokens)

-        if lora_paths is not None:
+        if lora_ids is not None:
             self.model_runner.lora_manager.prepare_lora_batch(forward_batch)

         # Attention backend
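The capture-time branching reduces to a small helper; a sketch only (the helper name is assumed), showing why the placeholder list carries one None per sequence when LoRA is enabled:

from typing import List, Optional

def placeholder_lora_ids(enable_lora: bool, bs: int) -> Optional[List[Optional[str]]]:
    if enable_lora:
        # Safe for capture: the LoRA kernels are baked into the graph but
        # return immediately for empty (None) IDs.
        return [None] * bs
    return None

print(placeholder_lora_ids(True, 3))   # [None, None, None]
print(placeholder_lora_ids(False, 3))  # None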
python/sglang/srt/model_executor/forward_batch_info.py

@@ -248,7 +248,7 @@ class ForwardBatch:
     encoder_out_cache_loc: Optional[torch.Tensor] = None

     # For LoRA
-    lora_paths: Optional[List[str]] = None
+    lora_ids: Optional[List[str]] = None

     # For input embeddings
     input_embeds: Optional[torch.Tensor] = None

@@ -327,7 +327,7 @@ class ForwardBatch:
             is_extend_in_batch=batch.is_extend_in_batch,
             can_run_dp_cuda_graph=batch.can_run_dp_cuda_graph,
             global_forward_mode=batch.global_forward_mode,
-            lora_paths=batch.lora_paths,
+            lora_ids=batch.lora_ids,
             sampling_info=batch.sampling_info,
             req_to_token_pool=model_runner.req_to_token_pool,
             token_to_kv_pool=model_runner.token_to_kv_pool,
python/sglang/srt/two_batch_overlap.py

@@ -468,7 +468,7 @@ class TboForwardBatchPreparer:
             "extend_prefix_lens_cpu",
             "extend_seq_lens_cpu",
             "extend_logprob_start_lens_cpu",
-            "lora_paths",
+            "lora_ids",
         ]:
             old_value = getattr(batch, key)
             if old_value is None:
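For context, the keys listed above name list-valued batch attributes that must be handled when a batch is split into two overlapping micro-batches; adding "lora_ids" keeps per-request adapter IDs aligned after the split. A toy sketch of that per-key pattern (the slicing detail and types are assumptions, not sglang's implementation):

class ToyBatch:
    def __init__(self, lora_ids, extend_seq_lens_cpu):
        self.lora_ids = lora_ids
        self.extend_seq_lens_cpu = extend_seq_lens_cpu

def split_batch(batch: ToyBatch, mid: int):
    halves = (ToyBatch(None, None), ToyBatch(None, None))
    for key in ["extend_seq_lens_cpu", "lora_ids"]:
        old_value = getattr(batch, key)
        if old_value is None:
            continue  # field unused in this configuration; leave both halves None
        setattr(halves[0], key, old_value[:mid])
        setattr(halves[1], key, old_value[mid:])
    return halves

a, b = split_batch(ToyBatch(["a1", None, "a2"], [5, 3, 7]), mid=2)
print(a.lora_ids, b.lora_ids)  # ['a1', None] ['a2']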