sglang commit 641b7d0a (unverified)

[Minor] Improve code style (#2422)

Authored Dec 09, 2024 by Lianmin Zheng; committed via GitHub on Dec 09, 2024.
Parent: 0ce091a8

Showing 15 changed files with 33 additions and 21 deletions (+33, -21).
python/pyproject.toml                                    +2 -1
python/sglang/bench_offline_throughput.py                +1 -1
python/sglang/srt/constrained/xgrammar_backend.py        +4 -1
python/sglang/srt/layers/attention/triton_backend.py     +0 -1
python/sglang/srt/layers/radix_attention.py              +8 -1
python/sglang/srt/managers/schedule_batch.py             +1 -1
python/sglang/srt/managers/tokenizer_manager.py          +1 -1
python/sglang/srt/model_executor/cuda_graph_runner.py    +1 -1
python/sglang/srt/model_executor/model_runner.py         +8 -4
python/sglang/srt/models/gemma2_reward.py                +0 -1
python/sglang/srt/models/llama_classification.py         +0 -1
python/sglang/srt/models/llama_reward.py                 +0 -2
python/sglang/srt/server.py                              +1 -1
python/sglang/srt/utils.py                               +6 -3
test/srt/run_suite.py                                    +0 -1
python/pyproject.toml

@@ -50,6 +50,7 @@ all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 all_hip = ["sglang[srt_hip]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 all_xpu = ["sglang[srt_xpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 all_hpu = ["sglang[srt_hpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 dev = ["sglang[all]", "sglang[test]"]
 dev_hip = ["sglang[all_hip]", "sglang[test]"]
 dev_xpu = ["sglang[all_xpu]", "sglang[test]"]
...
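These extras compose: each hardware-specific group (all_hip, all_xpu, all_hpu) pulls in the matching srt backend plus the API-client extras, and the dev_* groups add the test dependencies on top. They are installed with pip's standard extras syntax, for example pip install "sglang[all_hip]" or pip install "sglang[dev_xpu]"; the command is ordinary pip usage, not part of this commit.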
python/sglang/bench_offline_throughput.py

@@ -285,7 +285,7 @@ def throughput_test(
     else:
         raise ValueError('Please set backend to either "engine" or "runtime"')

-    tokenizer_id = server_args.model_path
+    tokenizer_id = server_args.tokenizer_path or server_args.model_path
     tokenizer = get_tokenizer(tokenizer_id)

     # Set global environmnets
...
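The new fallback prefers an explicitly configured tokenizer path and only reuses the model path when none is set. A minimal illustration with made-up values (not sglang code):

    class Args:
        tokenizer_path = None          # hypothetical: no separate tokenizer configured
        model_path = "my-org/my-model"

    server_args = Args()
    tokenizer_id = server_args.tokenizer_path or server_args.model_path
    print(tokenizer_id)  # -> "my-org/my-model"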
python/sglang/srt/constrained/xgrammar_backend.py

@@ -117,6 +117,9 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
         key_type, key_string = key
         if key_type == "json":
             try:
-                ctx = self.grammar_compiler.compile_json_schema(schema=key_string)
+                if key_string == "$$ANY$$":
+                    ctx = self.grammar_compiler.compile_builtin_json_grammar()
+                else:
+                    ctx = self.grammar_compiler.compile_json_schema(schema=key_string)
             except RuntimeError as e:
                 logging.warning(
...
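The added branch treats "$$ANY$$" as a sentinel: instead of compiling that literal string as a JSON schema, it requests xgrammar's built-in generic JSON grammar. A hedged sketch of the same dispatch as a standalone helper (the compiler object and its two method names are taken from the diff; everything else is illustrative):

    def compile_json_key(grammar_compiler, key_string: str):
        # "$$ANY$$" means "any valid JSON", so use the built-in grammar
        if key_string == "$$ANY$$":
            return grammar_compiler.compile_builtin_json_grammar()
        # otherwise key_string is expected to be a JSON schema document
        return grammar_compiler.compile_json_schema(schema=key_string)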
python/sglang/srt/layers/attention/triton_backend.py

@@ -5,7 +5,6 @@ from typing import TYPE_CHECKING
 import torch

 from sglang.srt.layers.attention import AttentionBackend
-from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch

 if TYPE_CHECKING:
...
python/sglang/srt/layers/radix_attention.py

@@ -48,7 +48,14 @@ class RadixAttention(nn.Module):
         self.sliding_window_size = sliding_window_size or -1
         self.is_cross_attention = is_cross_attention

-    def forward(self, q, k, v, forward_batch: ForwardBatch, save_kv_cache=True):
+    def forward(
+        self,
+        q,
+        k,
+        v,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+    ):
         if k is not None:
             # For cross-layer sharing, kv can be None
             assert v is not None
...
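The reformatted signature makes save_kv_cache an explicitly typed keyword flag. A self-contained toy sketch of the pattern (not sglang code, only an illustration of optionally skipping the cache write):

    class ToyAttention:
        """Minimal stand-in showing the save_kv_cache flag pattern."""

        def __init__(self, layer_id: int):
            self.layer_id = layer_id
            self.kv_cache = {}

        def forward(self, q, k, v, save_kv_cache: bool = True):
            if k is not None:
                # For cross-layer sharing, kv can be None
                assert v is not None
                if save_kv_cache:
                    self.kv_cache[self.layer_id] = (k, v)
            return q  # stand-in for the real attention output

    layer = ToyAttention(layer_id=0)
    layer.forward([1.0], [2.0], [3.0])                       # writes to the toy cache
    layer.forward([1.0], [2.0], [3.0], save_kv_cache=False)  # skips the write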
python/sglang/srt/managers/schedule_batch.py

@@ -484,7 +484,7 @@ bid = 0
 @dataclasses.dataclass
 class ScheduleBatch:
-    """Store all inforamtion of a batch on the scheduler."""
+    """Store all information of a batch on the scheduler."""

     # Request, memory pool, and cache
     reqs: List[Req]
...
python/sglang/srt/managers/tokenizer_manager.py

@@ -22,7 +22,7 @@ import signal
 import sys
 import time
 import uuid
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Union

 import fastapi
 import uvloop
...
...
python/sglang/srt/model_executor/cuda_graph_runner.py
View file @
641b7d0a
...
@@ -127,7 +127,7 @@ class CudaGraphRunner:
...
@@ -127,7 +127,7 @@ class CudaGraphRunner:
# Batch sizes to capture
# Batch sizes to capture
if
model_runner
.
server_args
.
disable_cuda_graph_padding
:
if
model_runner
.
server_args
.
disable_cuda_graph_padding
:
self
.
capture_bs
=
list
(
range
(
1
,
3
2
))
+
[
64
,
128
]
self
.
capture_bs
=
list
(
range
(
1
,
3
3
))
+
[
64
,
128
]
else
:
else
:
self
.
capture_bs
=
[
1
,
2
,
4
]
+
[
i
*
8
for
i
in
range
(
1
,
21
)]
self
.
capture_bs
=
[
1
,
2
,
4
]
+
[
i
*
8
for
i
in
range
(
1
,
21
)]
...
...
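The only functional change here is the range bound: range(1, 32) stops at 31, so batch size 32 was never captured when CUDA-graph padding is disabled, while range(1, 33) includes it. A quick check in plain Python:

    old_bs = list(range(1, 32)) + [64, 128]
    new_bs = list(range(1, 33)) + [64, 128]
    print(32 in old_bs)  # False
    print(32 in new_bs)  # True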
python/sglang/srt/model_executor/model_runner.py

@@ -242,20 +242,22 @@ class ModelRunner:
         if torch.cuda.get_device_capability()[1] < 5:
             raise RuntimeError("SGLang only supports sm75 and above.")

-        # Prepare the vllm model config
+        # Prepare the model config
         self.load_config = LoadConfig(
             load_format=self.server_args.load_format,
             download_dir=self.server_args.download_dir,
         )
         if self.server_args.load_format == "gguf":
             monkey_patch_vllm_gguf_config()
+        # Load the model
         self.model = get_model(
             model_config=self.model_config,
             load_config=self.load_config,
             device_config=DeviceConfig(self.device),
         )
+        # Parse other args
         self.sliding_window_size = (
             self.model.get_attention_sliding_window_size()
             if hasattr(self.model, "get_attention_sliding_window_size")
...

@@ -270,8 +272,10 @@ class ModelRunner:
             f"avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
         )

-    def update_weights_from_disk(self, model_path: str, load_format: str):
-        """Update engine weights online from disk."""
+    def update_weights_from_disk(
+        self, model_path: str, load_format: str
+    ) -> tuple[bool, str]:
+        """Update engine weights in-place from the disk."""
         from sglang.srt.model_loader.loader import (
             DefaultModelLoader,
             device_loading_context,
...
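The new annotation documents that ModelRunner.update_weights_from_disk reports its outcome as a (success, message) pair. A toy stand-in mirroring just the annotated shape (hypothetical body; only the signature style comes from the diff):

    def update_weights_from_disk(model_path: str, load_format: str) -> tuple[bool, str]:
        # Hypothetical logic; the real method reloads weights via the model loader.
        if not model_path:
            return False, "model_path must not be empty"
        return True, f"updated weights from {model_path} (format={load_format})"

    success, message = update_weights_from_disk("path/to/checkpoint", "auto")
    assert success, message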
python/sglang/srt/models/gemma2_reward.py

@@ -32,7 +32,6 @@ class Gemma2ForSequenceClassification(nn.Module):
     ) -> None:
         super().__init__()
         self.config = config
-        self.torchao_config = None
         self.quant_config = quant_config
         self.num_labels = config.num_labels
         self.model = Gemma2Model(config, quant_config=quant_config)
...
python/sglang/srt/models/llama_classification.py

@@ -33,7 +33,6 @@ class LlamaForClassification(nn.Module):
     ) -> None:
         super().__init__()
         self.config = config
-        self.torchao_config = None
         self.quant_config = quant_config
         self.model = LlamaModel(config, quant_config=quant_config)
...
python/sglang/srt/models/llama_reward.py

@@ -21,7 +21,6 @@ from transformers import LlamaConfig
 from sglang.srt.layers.pooler import EmbeddingPoolerOutput, Pooler, PoolingType
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
-from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.models.llama import LlamaForCausalLM, LlamaModel
...

@@ -33,7 +32,6 @@ class LlamaForSequenceClassification(nn.Module):
     ) -> None:
         super().__init__()
         self.config = config
-        self.torchao_config = None
         self.quant_config = quant_config
         self.num_labels = config.num_labels
         self.model = LlamaModel(config, quant_config=quant_config)
...
python/sglang/srt/server.py

@@ -196,7 +196,7 @@ async def stop_profile_async():
 @app.post("/update_weights_from_disk")
 @time_func_latency
 async def update_weights_from_disk(obj: UpdateWeightFromDiskReqInput, request: Request):
-    """Update the weights from disk inplace without re-launching the server."""
+    """Update the weights from disk in-place without re-launching the server."""
     success, message = await tokenizer_manager.update_weights_from_disk(obj, request)
     content = {"success": success, "message": message}
     if success:
...
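Callers of the HTTP endpoint receive the same (success, message) pair as a JSON body. A hedged sketch of invoking it with the requests library; the host, port, and request fields are assumptions based on the surrounding signatures, and only the endpoint path and the response keys come from this diff:

    import requests

    resp = requests.post(
        "http://localhost:30000/update_weights_from_disk",  # assumed local server address
        json={"model_path": "path/to/checkpoint", "load_format": "auto"},  # assumed fields
    )
    payload = resp.json()
    print(payload["success"], payload["message"])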
python/sglang/srt/utils.py

@@ -169,7 +169,7 @@ def calculate_time(show=False, min_cost_ms=0.0):
     return wrapper

-def get_available_gpu_memory(device, gpu_id, distributed=False):
+def get_available_gpu_memory(device, gpu_id, distributed=False, empty_cache=True):
     """
     Get available memory for cuda:gpu_id device.
     When distributed is True, the available memory is the minimum available memory of all GPUs.
...

@@ -184,6 +184,7 @@ def get_available_gpu_memory(device, gpu_id, distributed=False):
                 "which may cause useless memory allocation for torch CUDA context.",
             )
-        torch.cuda.empty_cache()
+        if empty_cache:
+            torch.cuda.empty_cache()
         free_gpu_memory, _ = torch.cuda.mem_get_info(gpu_id)
...

@@ -196,6 +197,8 @@ def get_available_gpu_memory(device, gpu_id, distributed=False):
                 f"WARNING: current device is not {gpu_id}, but {torch.xpu.current_device()}, ",
                 "which may cause useless memory allocation for torch XPU context.",
             )
-        torch.xpu.empty_cache()
+        if empty_cache:
+            torch.xpu.empty_cache()
         used_memory = torch.xpu.memory_allocated()
         total_gpu_memory = torch.xpu.get_device_properties(gpu_id).total_memory
...
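The new empty_cache flag lets callers skip the torch.cuda.empty_cache() / torch.xpu.empty_cache() call when a cache flush is unnecessary. A hedged usage sketch based on the signature shown above (the argument values are illustrative):

    from sglang.srt.utils import get_available_gpu_memory

    # Query free memory on GPU 0 without flushing the allocator cache first.
    free_gb = get_available_gpu_memory("cuda", gpu_id=0, empty_cache=False)
    print(f"available memory: {free_gb:.2f} GB")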
test/srt/run_suite.py

@@ -15,7 +15,6 @@ suites = {
         "test_double_sparsity.py",
         "test_embedding_openai_server.py",
         "test_eval_accuracy_mini.py",
-        "test_fused_moe.py",
         "test_get_weights_by_name.py",
         "test_gguf.py",
         "test_input_embeddings.py",
...