Unverified Commit 325a06c2 authored by Ying Sheng, committed by GitHub

Fix logging (#796)

parent 79f81629
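This commit replaces hand-picked logger names (e.g. `"srt.controller"`) and bare `print` calls with the standard module-level logger idiom. A minimal sketch of that idiom, outside the sglang codebase (the `demo` function is illustrative):

```python
import logging

# getLogger(__name__) names the logger after the module's dotted import
# path (e.g. "sglang.srt.utils" for sglang/srt/utils.py), so levels and
# handlers can be configured per package instead of per ad-hoc string.
logger = logging.getLogger(__name__)


def demo():
    logger.info("informational message")   # filtered by the configured level
    logger.error("something went wrong")   # unlike print(), carries a severity


if __name__ == "__main__":
    # Some handler must be configured; otherwise INFO records are dropped
    # and only WARNING+ reach the "last resort" stderr handler.
    logging.basicConfig(level=logging.INFO)
    demo()
```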
@@ -6,7 +6,7 @@ Achieving a large batch size is the most important thing for attaining high throughput.
 When the server is running at full load, look for the following in the log:
-```[gpu_id=0] Decode batch. #running-req: 233, #token: 370959, token usage: 0.82, gen throughput (token/s): 4594.01, #queue-req: 417```
+```[gpu=0] Decode batch. #running-req: 233, #token: 370959, token usage: 0.82, gen throughput (token/s): 4594.01, #queue-req: 417```
 ### Tune Your Request Submission Speed
 `#queue-req` indicates the number of requests in the queue. If you frequently see `#queue-req == 0`, it suggests you are bottlenecked by the request submission speed.
...
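Since the doc hunk above suggests watching `#queue-req`, here is a hypothetical helper for doing that programmatically (the regex and function name are assumptions, not part of sglang):

```python
import re

# Hypothetical: pull the queue depth out of the decode-batch log line.
QUEUE_RE = re.compile(r"#queue-req: (\d+)")

def queue_depth(log_line: str):
    """Return the #queue-req value, or None if the line doesn't match."""
    m = QUEUE_RE.search(log_line)
    return int(m.group(1)) if m else None

line = ("[gpu=0] Decode batch. #running-req: 233, #token: 370959, "
        "token usage: 0.82, gen throughput (token/s): 4594.01, #queue-req: 417")
assert queue_depth(line) == 417  # a steady backlog; frequent 0s mean requests arrive too slowly
```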
@@ -18,7 +18,7 @@ except ImportError as e:
     openai = tiktoken = e
-logger = logging.getLogger("openai")
+logger = logging.getLogger(__name__)
 def create_logit_bias_int(tokenizer):
...
@@ -15,6 +15,7 @@ limitations under the License.
 """Meta data for requests and batches"""
+import logging
 import warnings
 from dataclasses import dataclass
 from enum import IntEnum, auto
@@ -40,6 +41,9 @@ global_server_args_dict = {
 }
+logger = logging.getLogger(__name__)
 class ForwardMode(IntEnum):
     # Prefill a new sequence. This is deprecated now. "EXTEND" covers this case.
     PREFILL = auto()
@@ -379,7 +383,7 @@ class Batch:
         out_cache_loc = self.token_to_kv_pool.alloc(extend_num_tokens)
         if out_cache_loc is None:
-            print("Prefill out of memory. This should never happen.")
+            logger.error("Prefill out of memory. This should never happen.")
             self.tree_cache.pretty_print()
             exit()
@@ -613,7 +617,7 @@ class Batch:
         self.out_cache_loc = self.token_to_kv_pool.alloc(bs)
         if self.out_cache_loc is None:
-            print("Decode out of memory. This should never happen.")
+            logger.error("Decode out of memory. This should never happen.")
             self.tree_cache.pretty_print()
             exit()
...
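Replacing `print` with `logger.error` routes the out-of-memory message through the logging machinery instead of raw stdout. A sketch of what that buys (the logger name and format string below are illustrative assumptions; the real logger is named via `__name__`):

```python
import logging

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
)
logger = logging.getLogger("sglang.srt.infer_batch")  # illustrative name

# print() would emit a bare line on stdout. logger.error() attaches a
# timestamp, a severity, and the logger name, and obeys whatever handlers
# and filters the embedding application has configured.
logger.error("Prefill out of memory. This should never happen.")
# e.g. 2024-07-29 12:00:00,000 ERROR sglang.srt.infer_batch: Prefill out of memory. ...
```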
@@ -39,7 +39,7 @@ from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import kill_parent_process
 from sglang.utils import get_exception_traceback
-logger = logging.getLogger("srt.controller")
+logger = logging.getLogger(__name__)
 class LoadBalanceMethod(Enum):
...
@@ -31,7 +31,7 @@ from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import kill_parent_process
 from sglang.utils import get_exception_traceback
-logger = logging.getLogger("srt.controller")
+logger = logging.getLogger(__name__)
 class ControllerSingle:
...
@@ -57,7 +57,7 @@ from sglang.srt.utils import (
     monkey_patch_vllm_qvk_linear_loader,
 )
-logger = logging.getLogger("srt.model_runner")
+logger = logging.getLogger(__name__)
 class ModelRunner:
@@ -90,7 +90,7 @@ class ModelRunner:
         # Init torch distributed
         torch.cuda.set_device(self.gpu_id)
-        logger.info(f"[gpu_id={self.gpu_id}] Init nccl begin.")
+        logger.info(f"[gpu={self.gpu_id}] Init nccl begin.")
         if not server_args.enable_p2p_check:
             monkey_patch_vllm_p2p_access_check(self.gpu_id)
@@ -130,7 +130,7 @@ class ModelRunner:
     def load_model(self):
         logger.info(
-            f"[gpu_id={self.gpu_id}] Load weight begin. "
+            f"[gpu={self.gpu_id}] Load weight begin. "
             f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB"
         )
@@ -178,7 +178,7 @@ class ModelRunner:
             cache_config=None,
         )
         logger.info(
-            f"[gpu_id={self.gpu_id}] Load weight end. "
+            f"[gpu={self.gpu_id}] Load weight end. "
             f"type={type(self.model).__name__}, "
             f"dtype={self.dtype}, "
             f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB"
@@ -229,7 +229,7 @@ class ModelRunner:
             layer_num=self.model_config.num_hidden_layers,
         )
         logger.info(
-            f"[gpu_id={self.gpu_id}] Memory pool end. "
+            f"[gpu={self.gpu_id}] Memory pool end. "
             f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB"
         )
@@ -280,7 +280,7 @@ class ModelRunner:
             return
         logger.info(
-            f"[gpu_id={self.gpu_id}] Capture cuda graph begin. This can take up to several minutes."
+            f"[gpu={self.gpu_id}] Capture cuda graph begin. This can take up to several minutes."
         )
         batch_size_list = [1, 2, 4] + [i * 8 for i in range(1, 17)]
         self.cuda_graph_runner = CudaGraphRunner(
...
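The `[gpu={self.gpu_id}]` prefix above is written by hand into every f-string. An alternative sketch, not what this commit does, using `logging.LoggerAdapter` to attach the prefix once:

```python
import logging

logging.basicConfig(level=logging.INFO, format="%(message)s")
base_logger = logging.getLogger(__name__)

class GpuPrefixAdapter(logging.LoggerAdapter):
    """Prepend "[gpu=N]" to every message so call sites stay clean."""
    def process(self, msg, kwargs):
        return f"[gpu={self.extra['gpu_id']}] {msg}", kwargs

logger = GpuPrefixAdapter(base_logger, {"gpu_id": 0})
logger.info("Init nccl begin.")  # prints: [gpu=0] Init nccl begin.
```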
@@ -55,7 +55,7 @@ from sglang.srt.utils import (
 )
 from sglang.utils import get_exception_traceback
-logger = logging.getLogger("srt.tp_worker")
+logger = logging.getLogger(__name__)
 class ModelTpServer:
@@ -132,7 +132,7 @@ class ModelTpServer:
         # Print info
         logger.info(
-            f"[gpu_id={self.gpu_id}] "
+            f"[gpu={self.gpu_id}] "
             f"max_total_num_tokens={self.max_total_num_tokens}, "
             f"max_prefill_tokens={self.max_prefill_tokens}, "
             f"max_running_requests={self.max_running_requests}, "
@@ -256,7 +256,7 @@ class ModelTpServer:
         self.num_generated_tokens = 0
         self.last_stats_tic = time.time()
         logger.info(
-            f"[gpu_id={self.gpu_id}] Decode batch. "
+            f"[gpu={self.gpu_id}] Decode batch. "
             f"#running-req: {len(self.running_batch.reqs)}, "
             f"#token: {num_used}, "
             f"token usage: {num_used / self.max_total_num_tokens:.2f}, "
@@ -434,7 +434,7 @@ class ModelTpServer:
             self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"]
         )
         logger.info(
-            f"[gpu_id={self.gpu_id}] Prefill batch. "
+            f"[gpu={self.gpu_id}] Prefill batch. "
             f"#new-seq: {len(can_run_list)}, "
             f"#new-token: {new_batch_input_tokens}, "
             f"#cached-token: {hit_tokens}, "
...
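For reference, the `token usage` field emitted above is just `num_used / self.max_total_num_tokens`. Redoing the arithmetic with the numbers from the example log line (the pool capacity is back-solved from the log, hence approximate):

```python
num_used = 370_959               # "#token" from the example log line
max_total_num_tokens = 452_000   # approximate KV pool capacity implied by the log
print(f"token usage: {num_used / max_total_num_tokens:.2f}")  # token usage: 0.82
```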
@@ -38,7 +38,7 @@ from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.quantization import get_quantization_config
-logger = logging.getLogger("srt.model_loader")
+logger = logging.getLogger(__name__)
 temp_dir = tempfile.gettempdir()
...