"docs/vscode:/vscode.git/clone" did not exist on "8db3c9bc9fb3aed9fb8392945ec75e5237a351ba"
Unverified commit 325a06c2 authored by Ying Sheng, committed by GitHub

Fix logging (#796)

parent 79f81629
@@ -6,7 +6,7 @@ Achieving a large batch size is the most important thing for attaining high thro
When the server is running at full load, look for the following in the log:
-```[gpu_id=0] Decode batch. #running-req: 233, #token: 370959, token usage: 0.82, gen throughput (token/s): 4594.01, #queue-req: 417```
+```[gpu=0] Decode batch. #running-req: 233, #token: 370959, token usage: 0.82, gen throughput (token/s): 4594.01, #queue-req: 417```
### Tune Your Request Submission Speed
`#queue-req` indicates the number of requests in the queue. If you frequently see `#queue-req == 0`, it suggests you are bottlenecked by the request submission speed.
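One way to avoid a submission-speed bottleneck is to keep many requests in flight at once. A minimal client-side sketch, assuming a server reachable over HTTP; the URL and payload below are placeholders rather than part of this patch:

```python
# Keep enough requests in flight that the server's #queue-req stays above zero.
from concurrent.futures import ThreadPoolExecutor

import requests

URL = "http://localhost:30000/generate"  # placeholder endpoint
PAYLOAD = {"text": "Say hello.", "sampling_params": {"max_new_tokens": 64}}  # placeholder body


def send_one(_):
    return requests.post(URL, json=PAYLOAD, timeout=600).json()


# Submit far more requests than the running batch size so the queue never drains.
with ThreadPoolExecutor(max_workers=256) as pool:
    results = list(pool.map(send_one, range(2048)))
```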
......
@@ -18,7 +18,7 @@ except ImportError as e:
openai = tiktoken = e
-logger = logging.getLogger("openai")
+logger = logging.getLogger(__name__)
def create_logit_bias_int(tokenizer):
......
@@ -15,6 +15,7 @@ limitations under the License.
"""Meta data for requests and batches"""
+import logging
import warnings
from dataclasses import dataclass
from enum import IntEnum, auto
@@ -40,6 +41,9 @@ global_server_args_dict = {
}
+logger = logging.getLogger(__name__)
class ForwardMode(IntEnum):
# Prefill a new sequence. This is deprecated now. "EXTEND" covers this case.
PREFILL = auto()
@@ -379,7 +383,7 @@ class Batch:
out_cache_loc = self.token_to_kv_pool.alloc(extend_num_tokens)
if out_cache_loc is None:
-print("Prefill out of memory. This should never happen.")
+logger.error("Prefill out of memory. This should never happen.")
self.tree_cache.pretty_print()
exit()
@@ -613,7 +617,7 @@ class Batch:
self.out_cache_loc = self.token_to_kv_pool.alloc(bs)
if self.out_cache_loc is None:
-print("Decode out of memory. This should never happen.")
+logger.error("Decode out of memory. This should never happen.")
self.tree_cache.pretty_print()
exit()
......
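The two hunks above replace bare `print` calls with `logger.error`, so the out-of-memory messages pass through the process-wide logging configuration (format, level, destination) instead of always landing unformatted on stdout. A small illustration of the difference; the logger name is hypothetical:

```python
import logging
import sys

# Send log records to stderr with a timestamped format; print() bypasses all of this.
logging.basicConfig(
    stream=sys.stderr,
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
)
logger = logging.getLogger("sglang.srt.managers.infer_batch")  # hypothetical module name

print("Decode out of memory. This should never happen.")         # bare line on stdout
logger.error("Decode out of memory. This should never happen.")  # formatted, leveled, on stderr
```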
@@ -39,7 +39,7 @@ from sglang.srt.server_args import PortArgs, ServerArgs
from sglang.srt.utils import kill_parent_process
from sglang.utils import get_exception_traceback
-logger = logging.getLogger("srt.controller")
+logger = logging.getLogger(__name__)
class LoadBalanceMethod(Enum):
......
@@ -31,7 +31,7 @@ from sglang.srt.server_args import PortArgs, ServerArgs
from sglang.srt.utils import kill_parent_process
from sglang.utils import get_exception_traceback
-logger = logging.getLogger("srt.controller")
+logger = logging.getLogger(__name__)
class ControllerSingle:
......
@@ -57,7 +57,7 @@ from sglang.srt.utils import (
monkey_patch_vllm_qvk_linear_loader,
)
-logger = logging.getLogger("srt.model_runner")
+logger = logging.getLogger(__name__)
class ModelRunner:
@@ -90,7 +90,7 @@ class ModelRunner:
# Init torch distributed
torch.cuda.set_device(self.gpu_id)
-logger.info(f"[gpu_id={self.gpu_id}] Init nccl begin.")
+logger.info(f"[gpu={self.gpu_id}] Init nccl begin.")
if not server_args.enable_p2p_check:
monkey_patch_vllm_p2p_access_check(self.gpu_id)
@@ -130,7 +130,7 @@ class ModelRunner:
def load_model(self):
logger.info(
-f"[gpu_id={self.gpu_id}] Load weight begin. "
+f"[gpu={self.gpu_id}] Load weight begin. "
f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB"
)
@@ -178,7 +178,7 @@ class ModelRunner:
cache_config=None,
)
logger.info(
-f"[gpu_id={self.gpu_id}] Load weight end. "
+f"[gpu={self.gpu_id}] Load weight end. "
f"type={type(self.model).__name__}, "
f"dtype={self.dtype}, "
f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB"
@@ -229,7 +229,7 @@ class ModelRunner:
layer_num=self.model_config.num_hidden_layers,
)
logger.info(
-f"[gpu_id={self.gpu_id}] Memory pool end. "
+f"[gpu={self.gpu_id}] Memory pool end. "
f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB"
)
@@ -280,7 +280,7 @@ class ModelRunner:
return
logger.info(
-f"[gpu_id={self.gpu_id}] Capture cuda graph begin. This can take up to several minutes."
+f"[gpu={self.gpu_id}] Capture cuda graph begin. This can take up to several minutes."
)
batch_size_list = [1, 2, 4] + [i * 8 for i in range(1, 17)]
self.cuda_graph_runner = CudaGraphRunner(
......
@@ -55,7 +55,7 @@ from sglang.srt.utils import (
)
from sglang.utils import get_exception_traceback
-logger = logging.getLogger("srt.tp_worker")
+logger = logging.getLogger(__name__)
class ModelTpServer:
@@ -132,7 +132,7 @@ class ModelTpServer:
# Print info
logger.info(
-f"[gpu_id={self.gpu_id}] "
+f"[gpu={self.gpu_id}] "
f"max_total_num_tokens={self.max_total_num_tokens}, "
f"max_prefill_tokens={self.max_prefill_tokens}, "
f"max_running_requests={self.max_running_requests}, "
@@ -256,7 +256,7 @@ class ModelTpServer:
self.num_generated_tokens = 0
self.last_stats_tic = time.time()
logger.info(
-f"[gpu_id={self.gpu_id}] Decode batch. "
+f"[gpu={self.gpu_id}] Decode batch. "
f"#running-req: {len(self.running_batch.reqs)}, "
f"#token: {num_used}, "
f"token usage: {num_used / self.max_total_num_tokens:.2f}, "
@@ -434,7 +434,7 @@ class ModelTpServer:
self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"]
)
logger.info(
-f"[gpu_id={self.gpu_id}] Prefill batch. "
+f"[gpu={self.gpu_id}] Prefill batch. "
f"#new-seq: {len(can_run_list)}, "
f"#new-token: {new_batch_input_tokens}, "
f"#cached-token: {hit_tokens}, "
......
@@ -38,7 +38,7 @@ from vllm.model_executor.layers.quantization.base_config import QuantizationConf
from sglang.srt.layers.quantization import get_quantization_config
-logger = logging.getLogger("srt.model_loader")
+logger = logging.getLogger(__name__)
temp_dir = tempfile.gettempdir()
......
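Across all of these files, the commit converges on `logging.getLogger(__name__)`, so logger names mirror the actual module paths and inherit settings from their package. A minimal sketch of how that hierarchy behaves; the module name used here is only illustrative:

```python
import logging

# A single root configuration is enough; __name__-based loggers propagate to it.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
)

# Each module in the patch now does `logger = logging.getLogger(__name__)`;
# this stands in for one of them (illustrative name).
logger = logging.getLogger("sglang.srt.managers.tp_worker")

logger.info("emitted at INFO through the root handler")

# Because names follow package paths, a whole subtree can be tuned at once:
logging.getLogger("sglang.srt").setLevel(logging.WARNING)
logger.info("suppressed: effective level now comes from the 'sglang.srt' parent")
```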