Unverified commit 3ddb1c46, authored by Lianmin Zheng, committed by GitHub

[Minor] Fix logger and style (#2325)

parent 480e38a7
@@ -163,7 +163,6 @@ async def async_request_openai_completions(
             "max_tokens": request_func_input.output_len,
             "stream": not args.disable_stream,
             "ignore_eos": not args.disable_ignore_eos,
-            "lora_path": request_func_input.lora_name,
             **request_func_input.extra_request_body,
         }
         headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
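With the hardcoded `lora_path` field removed from this payload, a caller that still needs it can pass it through `extra_request_body`, which is spread into the request dict. A minimal sketch of that usage (the `build_payload` helper, model, and adapter name are illustrative, not sglang API):

```python
import os

# Hypothetical helper mirroring the payload construction above.
def build_payload(model, prompt, output_len, extra_request_body=None):
    payload = {
        "model": model,
        "prompt": prompt,
        "max_tokens": output_len,
        "stream": True,
        "ignore_eos": True,
        **(extra_request_body or {}),
    }
    headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
    return payload, headers

# Forward a LoRA adapter explicitly instead of relying on a hardcoded key.
payload, headers = build_payload(
    "my-model", "Hello", 32, extra_request_body={"lora_path": "my-adapter"}
)
```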
@@ -16,6 +16,7 @@
 import gc
 import json
 import logging
+import time
 from typing import Optional
 
 import torch
@@ -129,7 +130,7 @@ class ModelRunner:
         # Global vars
         if server_args.show_time_cost:
             enable_show_time_cost()
-        if server_args.disable_disk_cache:
+        if server_args.disable_outlines_disk_cache:
             from outlines.caching import disable_cache
 
             disable_cache()
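The renamed flag only gates the `outlines` disk cache. A self-contained sketch of the same gating, assuming the `outlines` package is installed; `disable_outlines_disk_cache` here is a plain variable standing in for the server flag:

```python
disable_outlines_disk_cache = True

if disable_outlines_disk_cache:
    # Stops outlines from persisting compiled FSM caches to disk, which can
    # crash on unusual file systems or under high concurrency.
    from outlines.caching import disable_cache

    disable_cache()
```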
@@ -623,8 +624,10 @@ class ModelRunner:
         if self.server_args.disable_cuda_graph:
             return
 
+        tic = time.time()
         logger.info("Capture cuda graph begin. This can take up to several minutes.")
         self.cuda_graph_runner = CudaGraphRunner(self)
+        logger.info(f"Capture cuda graph end. Time elapsed: {time.time() - tic:.2f}s")
 
     def apply_torch_tp(self):
         logger.info(f"Enabling torch tensor parallelism on {self.tp_size} devices.")
@@ -122,7 +122,7 @@ class ServerArgs:
     disable_jump_forward: bool = False
     disable_cuda_graph: bool = False
     disable_cuda_graph_padding: bool = False
-    disable_disk_cache: bool = False
+    disable_outlines_disk_cache: bool = False
     disable_custom_all_reduce: bool = False
     disable_mla: bool = False
     disable_overlap_schedule: bool = False
@@ -159,7 +159,7 @@ class ServerArgs:
             if self.tp_size >= 16:
                 self.mem_fraction_static = 0.79
             elif self.tp_size >= 8:
-                self.mem_fraction_static = 0.82
+                self.mem_fraction_static = 0.81
             elif self.tp_size >= 4:
                 self.mem_fraction_static = 0.85
             elif self.tp_size >= 2:
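Restated as a standalone helper for clarity; the tiers for `tp_size < 4` are elided in this hunk, so this sketch covers only the visible ones:

```python
def default_mem_fraction_static(tp_size: int) -> float:
    # Larger tensor-parallel groups reserve more headroom (smaller fraction).
    if tp_size >= 16:
        return 0.79
    elif tp_size >= 8:
        return 0.81  # lowered from 0.82 in this commit
    elif tp_size >= 4:
        return 0.85
    raise NotImplementedError("tiers below tp_size=4 are not shown in this hunk")
```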
@@ -192,7 +192,7 @@ class ServerArgs:
             )
 
         if self.attention_backend == "torch_native":
-            logger.info(
+            logger.warning(
                 "Cuda graph is disabled because of using torch native attention backend"
             )
             self.disable_cuda_graph = True
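The switch from `logger.info` to `logger.warning` matters because this branch silently changes behavior (CUDA graphs get disabled), and warnings are emitted even at Python's default root log level of WARNING, while info messages are not:

```python
import logging

logging.basicConfig()  # root logger defaults to the WARNING level
logger = logging.getLogger("server_args")

logger.info("Cuda graph is disabled ...")     # suppressed at the default level
logger.warning("Cuda graph is disabled ...")  # printed
```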
@@ -204,12 +204,12 @@ class ServerArgs:
             self.cuda_graph_max_bs = min(self.cuda_graph_max_bs, 96)
             self.schedule_conservativeness = self.schedule_conservativeness * 0.3
             self.disable_overlap_schedule = True
-            logger.info(
+            logger.warning(
                 f"DP attention is enabled. The chunked prefill size is adjusted to {self.chunked_prefill_size} to avoid MoE kernel issues. "
                 f"The CUDA graph max batch size is adjusted to {self.cuda_graph_max_bs}. "
                 f"The schedule conservativeness is adjusted to {self.schedule_conservativeness}. "
                 "Data parallel size is adjusted to be the same as tensor parallel size. "
-                "Overlap schedule is disabled."
+                "Overlap scheduler is disabled."
             )
 
         # GGUF
@@ -642,9 +642,9 @@ class ServerArgs:
             help="Disable cuda graph when padding is needed. Still uses cuda graph when padding is not needed.",
         )
         parser.add_argument(
-            "--disable-disk-cache",
+            "--disable-outlines-disk-cache",
             action="store_true",
-            help="Disable disk cache to avoid possible crashes related to file system or high concurrency.",
+            help="Disable disk cache of outlines to avoid possible crashes related to file system or high concurrency.",
         )
         parser.add_argument(
             "--disable-custom-all-reduce",
@@ -745,6 +745,11 @@ class ServerArgs:
             action=DeprecatedAction,
             help="'--disable-flashinfer-sampling' is deprecated. Please use '--sampling-backend pytorch' instead.",
         )
+        parser.add_argument(
+            "--disable-disk-cache",
+            action=DeprecatedAction,
+            help="'--disable-disk-cache' is deprecated. Please use '--disable-outlines-disk-cache' instead.",
+        )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
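`DeprecatedAction` is defined elsewhere in this file. A minimal sketch of how such an argparse action can work, failing loudly with the migration hint carried in `help` (the real implementation may differ):

```python
import argparse

class DeprecatedAction(argparse.Action):
    def __init__(self, option_strings, dest, nargs=0, **kwargs):
        super().__init__(option_strings, dest, nargs=nargs, **kwargs)

    def __call__(self, parser, namespace, values, option_string=None):
        # Reject the deprecated flag and point the user at its replacement.
        raise ValueError(self.help)
```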
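`from_cli_args` is truncated above. A common pattern for a dataclass-backed config, sketched under the assumption that every dataclass field has a matching argparse destination (not the verbatim sglang body):

```python
import argparse
import dataclasses

@dataclasses.dataclass
class ServerArgsSketch:
    disable_cuda_graph: bool = False
    disable_outlines_disk_cache: bool = False

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace):
        # Pull exactly the attributes that correspond to dataclass fields.
        fields = [f.name for f in dataclasses.fields(cls)]
        return cls(**{name: getattr(args, name) for name in fields})
```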