Unverified commit 3ddb1c46 authored by Lianmin Zheng, committed by GitHub

[Minor] Fix logger and style (#2325)

parent 480e38a7
@@ -163,7 +163,6 @@ async def async_request_openai_completions(
             "max_tokens": request_func_input.output_len,
             "stream": not args.disable_stream,
             "ignore_eos": not args.disable_ignore_eos,
-            "lora_path": request_func_input.lora_name,
             **request_func_input.extra_request_body,
         }
         headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
...
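For orientation, here is a minimal sketch of posting such a payload to an OpenAI-compatible /v1/completions endpoint. The URL, model name, prompt, and token budget below are illustrative assumptions, not values taken from this diff:

```python
import asyncio
import os

import aiohttp


async def send_completion_request() -> None:
    # Hypothetical endpoint and request values; the real benchmark builds
    # these from request_func_input and the parsed CLI args.
    api_url = "http://localhost:30000/v1/completions"
    payload = {
        "model": "default",
        "prompt": "The capital of France is",
        "max_tokens": 32,    # cf. request_func_input.output_len
        "stream": True,      # cf. not args.disable_stream
        "ignore_eos": True,  # cf. not args.disable_ignore_eos
    }
    headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}

    async with aiohttp.ClientSession() as session:
        async with session.post(api_url, json=payload, headers=headers) as resp:
            # With "stream": True the server answers with server-sent events;
            # print each chunk as it arrives.
            async for chunk in resp.content:
                print(chunk.decode(errors="ignore"), end="")


asyncio.run(send_completion_request())
```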
@@ -16,6 +16,7 @@
 import gc
 import json
 import logging
+import time
 from typing import Optional

 import torch
@@ -129,7 +130,7 @@ class ModelRunner:
         # Global vars
         if server_args.show_time_cost:
             enable_show_time_cost()
-        if server_args.disable_disk_cache:
+        if server_args.disable_outlines_disk_cache:
             from outlines.caching import disable_cache

             disable_cache()
@@ -623,8 +624,10 @@ class ModelRunner:
         if self.server_args.disable_cuda_graph:
             return

+        tic = time.time()
         logger.info("Capture cuda graph begin. This can take up to several minutes.")
         self.cuda_graph_runner = CudaGraphRunner(self)
+        logger.info(f"Capture cuda graph end. Time elapsed: {time.time() - tic:.2f}s")

     def apply_torch_tp(self):
         logger.info(f"Enabling torch tensor parallelism on {self.tp_size} devices.")
...
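The two added lines implement a simple tic/toc timing pattern around the capture step. A self-contained sketch of the same pattern (the capture function here is a stand-in, not the real CudaGraphRunner):

```python
import logging
import time

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def capture_cuda_graphs() -> None:
    # Stand-in for the expensive CudaGraphRunner(self) construction.
    time.sleep(0.5)


tic = time.time()
logger.info("Capture cuda graph begin. This can take up to several minutes.")
capture_cuda_graphs()
logger.info(f"Capture cuda graph end. Time elapsed: {time.time() - tic:.2f}s")
```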
@@ -122,7 +122,7 @@ class ServerArgs:
     disable_jump_forward: bool = False
     disable_cuda_graph: bool = False
     disable_cuda_graph_padding: bool = False
-    disable_disk_cache: bool = False
+    disable_outlines_disk_cache: bool = False
     disable_custom_all_reduce: bool = False
     disable_mla: bool = False
     disable_overlap_schedule: bool = False
@@ -159,7 +159,7 @@ class ServerArgs:
         if self.tp_size >= 16:
             self.mem_fraction_static = 0.79
         elif self.tp_size >= 8:
-            self.mem_fraction_static = 0.82
+            self.mem_fraction_static = 0.81
         elif self.tp_size >= 4:
             self.mem_fraction_static = 0.85
         elif self.tp_size >= 2:
@@ -192,7 +192,7 @@ class ServerArgs:
             )
         if self.attention_backend == "torch_native":
-            logger.info(
+            logger.warning(
                 "Cuda graph is disabled because of using torch native attention backend"
             )
             self.disable_cuda_graph = True
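The switch from logger.info to logger.warning matters when the configured log level filters out INFO records, as in many production setups; a small illustration:

```python
import logging

# With the level set to WARNING, info() calls are silently dropped.
logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

logger.info("Cuda graph is disabled ...")     # suppressed
logger.warning("Cuda graph is disabled ...")  # still emitted
```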
@@ -204,12 +204,12 @@ class ServerArgs:
             self.cuda_graph_max_bs = min(self.cuda_graph_max_bs, 96)
             self.schedule_conservativeness = self.schedule_conservativeness * 0.3
             self.disable_overlap_schedule = True
-            logger.info(
+            logger.warning(
                 f"DP attention is enabled. The chunked prefill size is adjusted to {self.chunked_prefill_size} to avoid MoE kernel issues. "
                 f"The CUDA graph max batch size is adjusted to {self.cuda_graph_max_bs}. "
                 f"The schedule conservativeness is adjusted to {self.schedule_conservativeness}. "
                 "Data parallel size is adjusted to be the same as tensor parallel size. "
-                "Overlap schedule is disabled."
+                "Overlap scheduler is disabled."
             )

         # GGUF
@@ -642,9 +642,9 @@ class ServerArgs:
             help="Disable cuda graph when padding is needed. Still uses cuda graph when padding is not needed.",
         )
         parser.add_argument(
-            "--disable-disk-cache",
+            "--disable-outlines-disk-cache",
             action="store_true",
-            help="Disable disk cache to avoid possible crashes related to file system or high concurrency.",
+            help="Disable disk cache of outlines to avoid possible crashes related to file system or high concurrency.",
         )
         parser.add_argument(
             "--disable-custom-all-reduce",
@@ -745,6 +745,11 @@ class ServerArgs:
             action=DeprecatedAction,
             help="'--disable-flashinfer-sampling' is deprecated. Please use '--sampling-backend pytorch' instead.",
         )
+        parser.add_argument(
+            "--disable-disk-cache",
+            action=DeprecatedAction,
+            help="'--disable-disk-cache' is deprecated. Please use '--disable-outlines-disk-cache' instead.",
+        )

     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
...
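The diff uses DeprecatedAction but does not show its definition. Below is a minimal sketch of how such an argparse action can be written; this is a hypothetical re-implementation for illustration, and the real class may warn rather than raise:

```python
import argparse


class DeprecatedAction(argparse.Action):
    """Reject a deprecated flag and point the user at its replacement."""

    def __init__(self, option_strings, dest, nargs=0, **kwargs):
        # nargs=0 makes this behave like a boolean flag (no value expected).
        super().__init__(option_strings, dest, nargs=nargs, **kwargs)

    def __call__(self, parser, namespace, values, option_string=None):
        # The flag's help text doubles as the error message.
        raise ValueError(self.help)


parser = argparse.ArgumentParser()
parser.add_argument(
    "--disable-disk-cache",
    action=DeprecatedAction,
    help="'--disable-disk-cache' is deprecated. Please use '--disable-outlines-disk-cache' instead.",
)
parser.parse_args([])  # fine: the deprecated flag was not passed
# parser.parse_args(["--disable-disk-cache"])  # would raise ValueError
```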