Unverified Commit f949ad57 authored by Lianmin Zheng's avatar Lianmin Zheng Committed by GitHub
Browse files

[Auto Sync] Update activation.py, chunk_cache.py, utils.py (20250917) (#10538)


Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
parent c49484a6
...@@ -224,12 +224,13 @@ class XIELU(CustomOp): ...@@ -224,12 +224,13 @@ class XIELU(CustomOp):
self._xielu_cuda_fn = self._xielu_cuda self._xielu_cuda_fn = self._xielu_cuda
logger.warning_once(msg) logger.warning_once(msg)
except Exception as err: except Exception as err:
logger.warning_once( pass
"CUDA-fused xIELU not available (%s) –" # logger.warning_once(
" falling back to a Python version.\n" # "CUDA-fused xIELU not available (%s) –"
"For CUDA xIELU (experimental), `pip install git+https://github.com/nickjbrowning/XIELU`", # " falling back to a Python version.\n"
str(err), # "For CUDA xIELU (experimental), `pip install git+https://github.com/nickjbrowning/XIELU`",
) # str(err),
# )
def _xielu_python(self, x: torch.Tensor) -> torch.Tensor: def _xielu_python(self, x: torch.Tensor) -> torch.Tensor:
alpha_p = nn.functional.softplus(self.alpha_p) alpha_p = nn.functional.softplus(self.alpha_p)
......
...@@ -28,6 +28,13 @@ class ChunkCache(BasePrefixCache): ...@@ -28,6 +28,13 @@ class ChunkCache(BasePrefixCache):
self.token_to_kv_pool_allocator = token_to_kv_pool_allocator self.token_to_kv_pool_allocator = token_to_kv_pool_allocator
self.page_size = page_size self.page_size = page_size
# NOTE (csy): this property signals whether this cache implementation supports
# prefix matching. ChunkCache always returns True, meaning prefix matching is
# disabled (no prefix reuse is performed by this cache).
# TODO (csy): replace this flag with a prefix-cache trait/interface.
@property
def disable(self):
    # Constant: chunk cache never performs prefix matching.
    return True
def reset(self): def reset(self):
pass pass
...@@ -38,7 +45,7 @@ class ChunkCache(BasePrefixCache): ...@@ -38,7 +45,7 @@ class ChunkCache(BasePrefixCache):
last_host_node=None, last_host_node=None,
) )
def cache_finished_req(self, req: Req): def cache_finished_req(self, req: Req, insert: bool = True):
kv_indices = self.req_to_token_pool.req_to_token[ kv_indices = self.req_to_token_pool.req_to_token[
req.req_pool_idx, req.req_pool_idx,
# For decode server: if req.output_ids is empty, we want to free all req.origin_input_ids # For decode server: if req.output_ids is empty, we want to free all req.origin_input_ids
......
...@@ -82,11 +82,9 @@ from packaging import version as pkg_version ...@@ -82,11 +82,9 @@ from packaging import version as pkg_version
from PIL import Image from PIL import Image
from starlette.routing import Mount from starlette.routing import Mount
from torch import nn from torch import nn
from torch.func import functional_call
from torch.library import Library from torch.library import Library
from torch.profiler import ProfilerActivity, profile, record_function from torch.profiler import ProfilerActivity, profile, record_function
from torch.utils._contextlib import _DecoratorContextManager from torch.utils._contextlib import _DecoratorContextManager
from triton.runtime.cache import FileCacheManager
from typing_extensions import Literal from typing_extensions import Literal
from sglang.srt.metrics.func_timer import enable_func_timer from sglang.srt.metrics.func_timer import enable_func_timer
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment