sglang · Commit 641b7d0a (unverified)

[Minor] Improve code style (#2422)

Authored by Lianmin Zheng on Dec 09, 2024; committed via GitHub on Dec 09, 2024.
Parent: 0ce091a8
Showing 15 changed files with 33 additions and 21 deletions (+33, -21).
python/pyproject.toml: +2, -1
python/sglang/bench_offline_throughput.py: +1, -1
python/sglang/srt/constrained/xgrammar_backend.py: +4, -1
python/sglang/srt/layers/attention/triton_backend.py: +0, -1
python/sglang/srt/layers/radix_attention.py: +8, -1
python/sglang/srt/managers/schedule_batch.py: +1, -1
python/sglang/srt/managers/tokenizer_manager.py: +1, -1
python/sglang/srt/model_executor/cuda_graph_runner.py: +1, -1
python/sglang/srt/model_executor/model_runner.py: +8, -4
python/sglang/srt/models/gemma2_reward.py: +0, -1
python/sglang/srt/models/llama_classification.py: +0, -1
python/sglang/srt/models/llama_reward.py: +0, -2
python/sglang/srt/server.py: +1, -1
python/sglang/srt/utils.py: +6, -3
test/srt/run_suite.py: +0, -1
python/pyproject.toml

```diff
@@ -33,7 +33,7 @@ srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.3.dev13"]
 srt_xpu = ["sglang[runtime_common]"]
 #For Intel Gaudi(device : hpu) follow the installation guide
 #https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html
-srt_hpu = ["sglang[runtime_common]"]
+srt_hpu = ["sglang[runtime_common]"]
 openai = ["openai>=1.0", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
@@ -50,6 +50,7 @@ all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 all_hip = ["sglang[srt_hip]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 all_xpu = ["sglang[srt_xpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
+all_hpu = ["sglang[srt_hpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 dev = ["sglang[all]", "sglang[test]"]
 dev_hip = ["sglang[all_hip]", "sglang[test]"]
 dev_xpu = ["sglang[all_xpu]", "sglang[test]"]
```
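The second hunk adds an `all_hpu` extra mirroring the existing `all_hip` and `all_xpu` bundles, so a Gaudi install can presumably pull in the full stack with something like `pip install "sglang[all_hpu]"`.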
python/sglang/bench_offline_throughput.py

```diff
@@ -285,7 +285,7 @@ def throughput_test(
     else:
         raise ValueError('Please set backend to either "engine" or "runtime"')

-    tokenizer_id = server_args.model_path
+    tokenizer_id = server_args.tokenizer_path or server_args.model_path
     tokenizer = get_tokenizer(tokenizer_id)

     # Set global environmnets
```
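With this change a separately configured tokenizer path takes precedence and `model_path` becomes the fallback. A minimal sketch of the `or`-fallback, using a hypothetical stand-in for `ServerArgs`:

```python
from types import SimpleNamespace

# Hypothetical stand-in for ServerArgs; only the two fields used above.
server_args = SimpleNamespace(
    model_path="org/some-model",   # placeholder model id
    tokenizer_path=None,           # not set -> fall back to model_path
)

tokenizer_id = server_args.tokenizer_path or server_args.model_path
assert tokenizer_id == "org/some-model"
```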
python/sglang/srt/constrained/xgrammar_backend.py

```diff
@@ -117,7 +117,10 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
         key_type, key_string = key
         if key_type == "json":
             try:
-                ctx = self.grammar_compiler.compile_json_schema(schema=key_string)
+                if key_string == "$$ANY$$":
+                    ctx = self.grammar_compiler.compile_builtin_json_grammar()
+                else:
+                    ctx = self.grammar_compiler.compile_json_schema(schema=key_string)
             except RuntimeError as e:
                 logging.warning(
                     f"Skip invalid json_schema: json_schema={key_string}, {e=}"
```
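The new branch treats the literal key string `"$$ANY$$"` as "accept any JSON document" and routes it to the built-in JSON grammar instead of compiling a schema. A rough sketch of the dispatch under those assumptions (`grammar_compiler` stands in for the backend's compiler instance):

```python
def _compile_json_ctx(grammar_compiler, key_string):
    # "$$ANY$$" is a sentinel meaning "any valid JSON"; anything else is
    # interpreted as a JSON-schema string, as in the original code path.
    if key_string == "$$ANY$$":
        return grammar_compiler.compile_builtin_json_grammar()
    return grammar_compiler.compile_json_schema(schema=key_string)
```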
python/sglang/srt/layers/attention/triton_backend.py

```diff
@@ -5,7 +5,6 @@ from typing import TYPE_CHECKING
 import torch

 from sglang.srt.layers.attention import AttentionBackend
-from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch

 if TYPE_CHECKING:
```
python/sglang/srt/layers/radix_attention.py

```diff
@@ -48,7 +48,14 @@ class RadixAttention(nn.Module):
         self.sliding_window_size = sliding_window_size or -1
         self.is_cross_attention = is_cross_attention

-    def forward(self, q, k, v, forward_batch: ForwardBatch, save_kv_cache=True):
+    def forward(
+        self,
+        q,
+        k,
+        v,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+    ):
         if k is not None:
             # For cross-layer sharing, kv can be None
             assert v is not None
```
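Besides the reformatting, `save_kv_cache` now carries an explicit `bool` annotation. A hypothetical call site that runs attention without writing this step's keys/values back into the cache might look like:

```python
# attn is a RadixAttention layer; q, k, v and forward_batch are prepared by
# the surrounding model code (all placeholders here).
out = attn.forward(q, k, v, forward_batch, save_kv_cache=False)
```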
python/sglang/srt/managers/schedule_batch.py

```diff
@@ -484,7 +484,7 @@ bid = 0
 @dataclasses.dataclass
 class ScheduleBatch:
-    """Store all inforamtion of a batch on the scheduler."""
+    """Store all information of a batch on the scheduler."""

     # Request, memory pool, and cache
     reqs: List[Req]
```
python/sglang/srt/managers/tokenizer_manager.py

```diff
@@ -22,7 +22,7 @@ import signal
 import sys
 import time
 import uuid
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Union

 import fastapi
 import uvloop
```
python/sglang/srt/model_executor/cuda_graph_runner.py

```diff
@@ -127,7 +127,7 @@ class CudaGraphRunner:
         # Batch sizes to capture
         if model_runner.server_args.disable_cuda_graph_padding:
-            self.capture_bs = list(range(1, 32)) + [64, 128]
+            self.capture_bs = list(range(1, 33)) + [64, 128]
         else:
             self.capture_bs = [1, 2, 4] + [i * 8 for i in range(1, 21)]
```
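This is an off-by-one fix rather than a pure style change: `range(1, 32)` stops at 31, so batch size 32 was never captured when CUDA-graph padding is disabled. For illustration:

```python
old = list(range(1, 32)) + [64, 128]  # 1..31, 64, 128 -> 32 is missing
new = list(range(1, 33)) + [64, 128]  # 1..32, 64, 128
assert 32 not in old and 32 in new
```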
python/sglang/srt/model_executor/model_runner.py

```diff
@@ -242,20 +242,22 @@ class ModelRunner:
         if torch.cuda.get_device_capability()[1] < 5:
             raise RuntimeError("SGLang only supports sm75 and above.")

-        # Prepare the vllm model config
+        # Prepare the model config
         self.load_config = LoadConfig(
             load_format=self.server_args.load_format,
             download_dir=self.server_args.download_dir,
         )

         if self.server_args.load_format == "gguf":
             monkey_patch_vllm_gguf_config()
+
+        # Load the model
         self.model = get_model(
             model_config=self.model_config,
             load_config=self.load_config,
             device_config=DeviceConfig(self.device),
         )

+        # Parse other args
         self.sliding_window_size = (
             self.model.get_attention_sliding_window_size()
             if hasattr(self.model, "get_attention_sliding_window_size")
@@ -270,8 +272,10 @@ class ModelRunner:
             f"avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
         )

-    def update_weights_from_disk(self, model_path: str, load_format: str):
-        """Update engine weights online from disk."""
+    def update_weights_from_disk(
+        self, model_path: str, load_format: str
+    ) -> tuple[bool, str]:
+        """Update engine weights in-place from the disk."""
         from sglang.srt.model_loader.loader import (
             DefaultModelLoader,
             device_loading_context,
```
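The new `-> tuple[bool, str]` annotation documents that the method reports success together with a message. A hypothetical caller (the runner instance, checkpoint path, and load format value are placeholders):

```python
success, message = model_runner.update_weights_from_disk(
    model_path="/path/to/updated/checkpoint",
    load_format="auto",  # assumed to accept the same values as at startup
)
if not success:
    raise RuntimeError(f"Weight update failed: {message}")
```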
python/sglang/srt/models/gemma2_reward.py

```diff
@@ -32,7 +32,6 @@ class Gemma2ForSequenceClassification(nn.Module):
     ) -> None:
         super().__init__()
         self.config = config
-        self.torchao_config = None
         self.quant_config = quant_config
         self.num_labels = config.num_labels
         self.model = Gemma2Model(config, quant_config=quant_config)
```
python/sglang/srt/models/llama_classification.py

```diff
@@ -33,7 +33,6 @@ class LlamaForClassification(nn.Module):
     ) -> None:
         super().__init__()
         self.config = config
-        self.torchao_config = None
         self.quant_config = quant_config
         self.model = LlamaModel(config, quant_config=quant_config)
```
python/sglang/srt/models/llama_reward.py

```diff
@@ -21,7 +21,6 @@ from transformers import LlamaConfig
 from sglang.srt.layers.pooler import EmbeddingPoolerOutput, Pooler, PoolingType
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
-from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.models.llama import LlamaForCausalLM, LlamaModel
@@ -33,7 +32,6 @@ class LlamaForSequenceClassification(nn.Module):
     ) -> None:
         super().__init__()
         self.config = config
-        self.torchao_config = None
         self.quant_config = quant_config
         self.num_labels = config.num_labels
         self.model = LlamaModel(config, quant_config=quant_config)
```
python/sglang/srt/server.py

```diff
@@ -196,7 +196,7 @@ async def stop_profile_async():
 @app.post("/update_weights_from_disk")
 @time_func_latency
 async def update_weights_from_disk(obj: UpdateWeightFromDiskReqInput, request: Request):
-    """Update the weights from disk inplace without re-launching the server."""
+    """Update the weights from disk in-place without re-launching the server."""
     success, message = await tokenizer_manager.update_weights_from_disk(obj, request)
     content = {"success": success, "message": message}
     if success:
```
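For context, a client reaches this handler over HTTP. A hedged sketch of such a call (host, port, checkpoint path, and any request fields beyond `model_path` are assumptions here, not taken from the diff):

```python
import requests

resp = requests.post(
    "http://localhost:30000/update_weights_from_disk",
    json={"model_path": "/path/to/updated/checkpoint"},
)
print(resp.status_code, resp.json())  # expected shape: {"success": ..., "message": ...}
```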
python/sglang/srt/utils.py

```diff
@@ -169,7 +169,7 @@ def calculate_time(show=False, min_cost_ms=0.0):
     return wrapper


-def get_available_gpu_memory(device, gpu_id, distributed=False):
+def get_available_gpu_memory(device, gpu_id, distributed=False, empty_cache=True):
     """
     Get available memory for cuda:gpu_id device.
     When distributed is True, the available memory is the minimum available memory of all GPUs.
@@ -184,7 +184,8 @@ def get_available_gpu_memory(device, gpu_id, distributed=False):
                 "which may cause useless memory allocation for torch CUDA context.",
             )
-        torch.cuda.empty_cache()
+        if empty_cache:
+            torch.cuda.empty_cache()
         free_gpu_memory, _ = torch.cuda.mem_get_info(gpu_id)

     elif device == "xpu":
@@ -196,7 +197,9 @@ def get_available_gpu_memory(device, gpu_id, distributed=False):
                 f"WARNING: current device is not {gpu_id}, but {torch.xpu.current_device()}, ",
                 "which may cause useless memory allocation for torch XPU context.",
             )
-        torch.xpu.empty_cache()
+        if empty_cache:
+            torch.xpu.empty_cache()
         used_memory = torch.xpu.memory_allocated()
         total_gpu_memory = torch.xpu.get_device_properties(gpu_id).total_memory
         free_gpu_memory = total_gpu_memory - used_memory
```
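The new `empty_cache` flag keeps the old behaviour by default but lets callers skip the allocator flush before measuring, presumably to avoid its overhead on hot paths. A small usage sketch (the device and index are placeholders; per the call in model_runner.py, the return value is in GB):

```python
from sglang.srt.utils import get_available_gpu_memory

# Default: flush the CUDA caching allocator, then read free memory.
free_gb = get_available_gpu_memory("cuda", gpu_id=0)

# Skip the flush when a slightly stale reading is acceptable.
free_gb_fast = get_available_gpu_memory("cuda", gpu_id=0, empty_cache=False)
```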
test/srt/run_suite.py

```diff
@@ -15,7 +15,6 @@ suites = {
         "test_double_sparsity.py",
         "test_embedding_openai_server.py",
         "test_eval_accuracy_mini.py",
-        "test_fused_moe.py",
         "test_get_weights_by_name.py",
         "test_gguf.py",
         "test_input_embeddings.py",
```