Unverified commit 325a06c2, authored Jul 28, 2024 by Ying Sheng, committed by GitHub on Jul 28, 2024.

Fix logging (#796)

Parent: 79f81629
Changes: 8 changed files with 21 additions and 17 deletions (+21 / -17).
docs/en/hyperparameter_tuning.md (+1, -1)
python/sglang/lang/backend/openai.py (+1, -1)
python/sglang/srt/managers/controller/infer_batch.py (+6, -2)
python/sglang/srt/managers/controller/manager_multi.py (+1, -1)
python/sglang/srt/managers/controller/manager_single.py (+1, -1)
python/sglang/srt/managers/controller/model_runner.py (+6, -6)
python/sglang/srt/managers/controller/tp_worker.py (+4, -4)
python/sglang/srt/model_loader/utils.py (+1, -1)
docs/en/hyperparameter_tuning.md

@@ -6,7 +6,7 @@ Achieving a large batch size is the most important thing for attaining high throughput.
 When the server is running at full load, look for the following in the log:

-```[gpu_id=0] Decode batch. #running-req: 233, #token: 370959, token usage: 0.82, gen throughput (token/s): 4594.01, #queue-req: 417```
+```[gpu=0] Decode batch. #running-req: 233, #token: 370959, token usage: 0.82, gen throughput (token/s): 4594.01, #queue-req: 417```

 ### Tune Your Request Submission Speed
 `#queue-req` indicates the number of requests in the queue. If you frequently see `#queue-req == 0`, it suggests you are bottlenecked by the request submission speed.
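The "Decode batch" line quoted above is emitted at INFO level by the tensor-parallel worker. As a hedged sketch (not part of this commit), here is how a standard-library logging configuration could surface just that record when the runtime is embedded in another process; the module path follows directly from the switch to getLogger(__name__) below:

```python
import logging

# With loggers now created via logging.getLogger(__name__), every sglang
# module logs under the "sglang" namespace and can be configured as a tree.
logging.basicConfig(
    level=logging.WARNING,
    format="%(asctime)s %(name)s %(levelname)s: %(message)s",
)

# Keep most modules quiet, but show the worker that prints
# "[gpu=0] Decode batch. #running-req: ..., #queue-req: ..." at INFO level.
logging.getLogger("sglang").setLevel(logging.WARNING)
logging.getLogger("sglang.srt.managers.controller.tp_worker").setLevel(logging.INFO)
```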
python/sglang/lang/backend/openai.py

@@ -18,7 +18,7 @@ except ImportError as e:
     openai = tiktoken = e

-logger = logging.getLogger("openai")
+logger = logging.getLogger(__name__)


 def create_logit_bias_int(tokenizer):
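The likely motivation for this one-line change: a hard-coded logger name like "openai" is the same namespace the third-party openai client logs under, so tuning one necessarily tuned the other, whereas with __name__ the backend logs as sglang.lang.backend.openai. A small sketch of the difference using only the standard library (the collision with the SDK's logger is stated here as the presumed rationale, not quoted from the commit):

```python
import logging

logging.basicConfig(level=logging.INFO)

# Old behaviour: this logger name is shared with the openai SDK itself,
# so raising or lowering its level affected both libraries at once.
logging.getLogger("openai").setLevel(logging.WARNING)

# New behaviour: the backend's records live under the sglang package
# hierarchy and can be adjusted without touching the SDK's logger.
logging.getLogger("sglang.lang.backend.openai").setLevel(logging.DEBUG)
```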
python/sglang/srt/managers/controller/infer_batch.py

@@ -15,6 +15,7 @@ limitations under the License.

 """Meta data for requests and batches"""

+import logging
 import warnings
 from dataclasses import dataclass
 from enum import IntEnum, auto

@@ -40,6 +41,9 @@ global_server_args_dict = {
 }

+logger = logging.getLogger(__name__)
+
+
 class ForwardMode(IntEnum):
     # Prefill a new sequence. This is deprecated now. "EXTEND" covers this case.
     PREFILL = auto()

@@ -379,7 +383,7 @@ class Batch:
         out_cache_loc = self.token_to_kv_pool.alloc(extend_num_tokens)

         if out_cache_loc is None:
-            print("Prefill out of memory. This should never happen.")
+            logger.error("Prefill out of memory. This should never happen.")
             self.tree_cache.pretty_print()
             exit()

@@ -613,7 +617,7 @@ class Batch:
         self.out_cache_loc = self.token_to_kv_pool.alloc(bs)

         if self.out_cache_loc is None:
-            print("Decode out of memory. This should never happen.")
+            logger.error("Decode out of memory. This should never happen.")
             self.tree_cache.pretty_print()
             exit()
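Swapping print for logger.error routes the out-of-memory message through whatever handlers the server installs, with a severity level and (given a formatter) a timestamp, instead of writing bare text to stdout. A minimal toy sketch of the pattern; alloc_tokens and its always-failing behaviour are invented for illustration:

```python
import logging

logging.basicConfig(format="%(asctime)s [%(levelname)s] %(name)s: %(message)s")
logger = logging.getLogger(__name__)


def alloc_tokens(num_tokens):
    """Toy stand-in for token_to_kv_pool.alloc(); None means the pool is full."""
    return None


out_cache_loc = alloc_tokens(512)
if out_cache_loc is None:
    # A plain print() bypasses handlers and carries no level or timestamp;
    # logger.error() can be filtered, formatted, and shipped with other logs.
    logger.error("Prefill out of memory. This should never happen.")
```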
python/sglang/srt/managers/controller/manager_multi.py

@@ -39,7 +39,7 @@ from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import kill_parent_process
 from sglang.utils import get_exception_traceback

-logger = logging.getLogger("srt.controller")
+logger = logging.getLogger(__name__)


 class LoadBalanceMethod(Enum):
python/sglang/srt/managers/controller/manager_single.py

@@ -31,7 +31,7 @@ from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import kill_parent_process
 from sglang.utils import get_exception_traceback

-logger = logging.getLogger("srt.controller")
+logger = logging.getLogger(__name__)


 class ControllerSingle:
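One consequence worth spelling out: manager_multi.py and manager_single.py previously requested the same logger name, "srt.controller", so their records were indistinguishable and shared a single level; with __name__ each module gets its own node in the logger tree. A short sketch using only the standard library (the two module paths follow from the file locations in this diff):

```python
import logging

# Old behaviour: two different modules, one shared logger object.
multi_old = logging.getLogger("srt.controller")
single_old = logging.getLogger("srt.controller")
assert multi_old is single_old

# New behaviour: distinct, hierarchically named loggers per module.
multi = logging.getLogger("sglang.srt.managers.controller.manager_multi")
single = logging.getLogger("sglang.srt.managers.controller.manager_single")
assert multi is not single
```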
python/sglang/srt/managers/controller/model_runner.py

@@ -57,7 +57,7 @@ from sglang.srt.utils import (
     monkey_patch_vllm_qvk_linear_loader,
 )

-logger = logging.getLogger("srt.model_runner")
+logger = logging.getLogger(__name__)


 class ModelRunner:

@@ -90,7 +90,7 @@ class ModelRunner:
         # Init torch distributed
         torch.cuda.set_device(self.gpu_id)
-        logger.info(f"[gpu_id={self.gpu_id}] Init nccl begin.")
+        logger.info(f"[gpu={self.gpu_id}] Init nccl begin.")

         if not server_args.enable_p2p_check:
             monkey_patch_vllm_p2p_access_check(self.gpu_id)

@@ -130,7 +130,7 @@ class ModelRunner:
     def load_model(self):
         logger.info(
-            f"[gpu_id={self.gpu_id}] Load weight begin. "
+            f"[gpu={self.gpu_id}] Load weight begin. "
             f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB"
         )

@@ -178,7 +178,7 @@ class ModelRunner:
             cache_config=None,
         )
         logger.info(
-            f"[gpu_id={self.gpu_id}] Load weight end. "
+            f"[gpu={self.gpu_id}] Load weight end. "
             f"type={type(self.model).__name__}, "
             f"dtype={self.dtype}, "
             f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB"

@@ -229,7 +229,7 @@ class ModelRunner:
             layer_num=self.model_config.num_hidden_layers,
         )
         logger.info(
-            f"[gpu_id={self.gpu_id}] Memory pool end. "
+            f"[gpu={self.gpu_id}] Memory pool end. "
             f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB"
         )

@@ -280,7 +280,7 @@ class ModelRunner:
             return

         logger.info(
-            f"[gpu_id={self.gpu_id}] Capture cuda graph begin. This can take up to several minutes."
+            f"[gpu={self.gpu_id}] Capture cuda graph begin. This can take up to several minutes."
         )
         batch_size_list = [1, 2, 4] + [i * 8 for i in range(1, 17)]
         self.cuda_graph_runner = CudaGraphRunner(
python/sglang/srt/managers/controller/tp_worker.py

@@ -55,7 +55,7 @@ from sglang.srt.utils import (
 )
 from sglang.utils import get_exception_traceback

-logger = logging.getLogger("srt.tp_worker")
+logger = logging.getLogger(__name__)


 class ModelTpServer:

@@ -132,7 +132,7 @@ class ModelTpServer:
         # Print info
         logger.info(
-            f"[gpu_id={self.gpu_id}] "
+            f"[gpu={self.gpu_id}] "
             f"max_total_num_tokens={self.max_total_num_tokens}, "
             f"max_prefill_tokens={self.max_prefill_tokens}, "
             f"max_running_requests={self.max_running_requests}, "

@@ -256,7 +256,7 @@ class ModelTpServer:
                 self.num_generated_tokens = 0
                 self.last_stats_tic = time.time()
                 logger.info(
-                    f"[gpu_id={self.gpu_id}] Decode batch. "
+                    f"[gpu={self.gpu_id}] Decode batch. "
                     f"#running-req: {len(self.running_batch.reqs)}, "
                     f"#token: {num_used}, "
                     f"token usage: {num_used / self.max_total_num_tokens:.2f}, "

@@ -434,7 +434,7 @@ class ModelTpServer:
                 self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"]
             )
             logger.info(
-                f"[gpu_id={self.gpu_id}] Prefill batch. "
+                f"[gpu={self.gpu_id}] Prefill batch. "
                 f"#new-seq: {len(can_run_list)}, "
                 f"#new-token: {new_batch_input_tokens}, "
                 f"#cached-token: {hit_tokens}, "
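Because the per-GPU prefix changes from [gpu_id=N] to [gpu=N] in every record above, anything scraping these lines needs the new pattern. A hedged parsing sketch for the decode-batch record; the regex and field names are assumptions derived only from the log format visible in this commit:

```python
import re

DECODE_BATCH_RE = re.compile(
    r"\[gpu=(?P<gpu>\d+)\] Decode batch\. "
    r"#running-req: (?P<running_req>\d+), "
    r"#token: (?P<tokens>\d+), "
    r"token usage: (?P<usage>[\d.]+), "
    r"gen throughput \(token/s\): (?P<throughput>[\d.]+), "
    r"#queue-req: (?P<queue_req>\d+)"
)

line = (
    "[gpu=0] Decode batch. #running-req: 233, #token: 370959, "
    "token usage: 0.82, gen throughput (token/s): 4594.01, #queue-req: 417"
)

match = DECODE_BATCH_RE.search(line)
if match:
    # e.g. gpu=0, throughput=4594.01 token/s, 417 requests still queued
    print(match.group("gpu"), match.group("throughput"), match.group("queue_req"))
```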
python/sglang/srt/model_loader/utils.py

@@ -38,7 +38,7 @@ from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.quantization import get_quantization_config

-logger = logging.getLogger("srt.model_loader")
+logger = logging.getLogger(__name__)

 temp_dir = tempfile.gettempdir()