Unverified Commit 6a63a985 authored by Lianmin Zheng's avatar Lianmin Zheng Committed by GitHub
Browse files

minor code sync (#12403)

parent 4d2f17bd
......@@ -1473,6 +1473,16 @@ class WatchLoadUpdateReq(BaseReq):
loads: List[GetLoadReqOutput]
@dataclass
class SetInjectDumpMetadataReqInput(BaseReq):
    """Request payload for attaching metadata to subsequent tensor dumps."""

    # Arbitrary metadata mapping to inject into dumps; the exact schema is
    # caller-defined — presumably consumed by the dump handler, TODO confirm.
    dump_metadata: Dict[str, Any]
@dataclass
class SetInjectDumpMetadataReqOutput(BaseReq):
    """Response paired with SetInjectDumpMetadataReqInput (per the
    ReqInput/ReqOutput naming convention enforced in this module)."""

    # True when the metadata was accepted/applied.
    success: bool
@dataclass
class LazyDumpTensorsReqInput(BaseReq):
    """Request to trigger a lazy tensor dump; carries no payload fields."""

    pass
......@@ -1504,6 +1514,3 @@ def _check_all_req_types():
raise ValueError(
f"{name} is a subclass of BaseReq but not follow the naming convention."
)
_check_all_req_types()
......@@ -870,13 +870,13 @@ class TokenizerMetricsCollector:
def check_time_to_first_token_straggler(self, value: float) -> bool:
    """Return True if *value* (a time-to-first-token observation) falls in
    the slowest ~1% tail of the recorded TTFT histogram.

    The span as captured contained both the pre- and post-change lines of a
    diff (duplicate thresholds 1000/100 and p999/p99); this is the
    reconstructed post-change implementation.

    NOTE(review): this reads prometheus_client private attributes
    (``_buckets``, ``_value``, ``_upper_bounds``) — confirm they exist in
    the pinned client version.

    Args:
        value: The TTFT measurement (same units as the histogram buckets).

    Returns:
        True when ``value`` is at or above the upper bound of the bucket
        where the cumulative count first exceeds the p99 threshold;
        False when there are too few observations to judge.
    """
    his = self.histogram_time_to_first_token.labels(**self.labels)
    total_observations = sum(bucket._value for bucket in his._buckets)
    # Too few samples to estimate a tail percentile reliably.
    if total_observations < 100:
        return False
    p99_threshold = total_observations * 0.99
    cumulative_count = 0
    for i, bucket in enumerate(his._buckets):
        cumulative_count += bucket._value
        if cumulative_count > p99_threshold:
            # First bucket whose cumulative count crosses p99: a straggler
            # is any observation at or beyond this bucket's upper bound.
            return value >= his._upper_bounds[i]
    return False
......
......@@ -135,6 +135,8 @@ GRAMMAR_BACKEND_CHOICES = ["xgrammar", "outlines", "llguidance", "none"]
DETERMINISTIC_ATTENTION_BACKEND_CHOICES = ["flashinfer", "fa3", "triton"]
RADIX_SUPPORTED_DETERMINISTIC_ATTENTION_BACKEND = ["fa3", "triton"]
DEFAULT_LORA_EVICTION_POLICY = "lru"
NSA_CHOICES = [
......@@ -190,6 +192,10 @@ def add_deterministic_attention_backend_choices(choices):
DETERMINISTIC_ATTENTION_BACKEND_CHOICES.extend(choices)
def add_radix_supported_deterministic_attention_backend_choices(choices):
    """Append *choices* to the list of deterministic-attention backends
    that are compatible with the radix cache."""
    for backend in choices:
        RADIX_SUPPORTED_DETERMINISTIC_ATTENTION_BACKEND.append(backend)
def add_radix_eviction_policy_choices(choices):
    """Append *choices* to the module-level list of radix-cache eviction
    policy names."""
    for policy in choices:
        RADIX_EVICTION_POLICY_CHOICES.append(policy)
......@@ -1753,13 +1759,17 @@ class ServerArgs:
f"but you explicitly specified '{self.attention_backend}'."
)
if self.attention_backend not in ["fa3", "triton"]:
if is_deepseek_model:
if is_deepseek_model:
if self.attention_backend not in ["fa3", "triton"]:
raise ValueError(
f"Currently only fa3 and triton attention backends are supported for deterministic inference with DeepSeek models. But you're using {self.attention_backend}."
f"Currently only {RADIX_SUPPORTED_DETERMINISTIC_ATTENTION_BACKEND} attention backends are supported for deterministic inference with DeepSeek models. But you're using {self.attention_backend}."
)
# Currently, only FA3 and Triton supports radix cache. Support for other backends is in progress
if (
self.attention_backend
not in RADIX_SUPPORTED_DETERMINISTIC_ATTENTION_BACKEND
):
# Currently, only certain backends support radix cache. Support for other backends is in progress
self.disable_radix_cache = True
logger.warning(
f"Currently radix cache is not compatible with {self.attention_backend} attention backend for deterministic inference. It will be supported in the future."
......
......@@ -21,7 +21,7 @@ python3 -c 'import os, shutil, tempfile, getpass; cache_dir = os.environ.get("TO
rm -rf /root/.cache/flashinfer
# Install apt packages
apt install -y git libnuma-dev
apt install -y git libnuma-dev libssl-dev pkg-config
# Install protoc for router build (gRPC protobuf compilation)
if ! command -v protoc &> /dev/null; then
......@@ -54,10 +54,7 @@ if [ "$IS_BLACKWELL" = "1" ]; then
PIP_INSTALL_SUFFIX="--break-system-packages"
# Clean up existing installations
$PIP_CMD uninstall -y flashinfer_python sgl-kernel sglang vllm $PIP_INSTALL_SUFFIX || true
# Install the main package
$PIP_CMD install -e "python[dev]" --extra-index-url https://download.pytorch.org/whl/${CU_VERSION} $PIP_INSTALL_SUFFIX --force-reinstall
$PIP_CMD uninstall -y sgl-kernel sglang $PIP_INSTALL_SUFFIX || true
else
# In normal cases, we use uv, which is much faster than pip.
pip install --upgrade pip
......@@ -68,20 +65,11 @@ else
PIP_INSTALL_SUFFIX="--index-strategy unsafe-best-match"
# Clean up existing installations
$PIP_CMD uninstall flashinfer_python sgl-kernel sglang vllm || true
# Install the main package without deps
$PIP_CMD install -e "python[dev]" --no-deps $PIP_INSTALL_SUFFIX --force-reinstall
# Install flashinfer-python 0.4.1 dependency that requires prerelease (This should be removed when flashinfer fixes this issue)
$PIP_CMD install flashinfer-python==0.4.1 --prerelease=allow $PIP_INSTALL_SUFFIX
# Install the main package
$PIP_CMD install -e "python[dev]" --extra-index-url https://download.pytorch.org/whl/${CU_VERSION} $PIP_INSTALL_SUFFIX --upgrade
$PIP_CMD uninstall sgl-kernel sglang || true
fi
# Install OpenSSL development libraries for router build
apt install -y libssl-dev pkg-config
# Install the main package
$PIP_CMD install -e "python[dev]" --extra-index-url https://download.pytorch.org/whl/${CU_VERSION} $PIP_INSTALL_SUFFIX
# Install router for pd-disagg test
$PIP_CMD install -e "sgl-router" $PIP_INSTALL_SUFFIX
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment