sync v0.15.1 (ex tests&vllm)

86a65417 · zhuwenwen · 45a060d6 · 86a65417 · 86a65417 · 86a65417
Commit 86a65417 authored Feb 05, 2026 by zhuwenwen
Showing with 1 addition and 80 deletions

examples/others/logging_configuration.md examples/others/logging_configuration.md +1 -32

pyproject.toml pyproject.toml +0 -1

setup.py setup.py +0 -16

vllm/_custom_ops.py vllm/_custom_ops.py +0 -31

No files found.
--- a/examples/others/logging_configuration.md
+++ b/examples/others/logging_configuration.md
@@ -157,37 +157,6 @@ VLLM_CONFIGURE_LOGGING=0 \
    vllm serve mistralai/Mistral-7B-v0.1 --max-model-len 2048
 ```
-### Example 4: Disable access logs for health check endpoints
-In production environments, health check endpoints like `/health`, `/metrics`,
-and `/ping` are frequently called by load balancers and monitoring systems,
-generating a large volume of repetitive access logs. To reduce log noise while
-keeping logs for other endpoints, use the `--disable-access-log-for-endpoints`
-option.
-**Disable access logs for health and metrics endpoints:**
-```bash
-vllm serve mistralai/Mistral-7B-v0.1 --max-model-len 2048 \
-    --disable-access-log-for-endpoints /health,/metrics,/ping
-```
-**Common endpoints to consider filtering:**
-| Endpoint   | Description            | Typical Caller                                       |
-| ---------- | ---------------------- | ---------------------------------------------------- |
-| `/health`  | Health check           | Kubernetes liveness/readiness probes, load balancers |
-| `/metrics` | Prometheus metrics     | Prometheus scraper (every 15-60s)                    |
-| `/ping`    | SageMaker health check | SageMaker infrastructure                             |
-| `/load`    | Server load metrics    | Custom monitoring                                    |
-**Notes:**
- This option only affects uvicorn access logs, not vLLM application logs
- Specify multiple endpoints by separating them with commas (no spaces)
- The filter uses exact path matching, query parameters are ignored (e.g., `/health?verbose=true` matches `/health`)
- If you need to completely disable all access logs, use `--disable-uvicorn-access-log` instead
 ## Additional resources
 - [`logging.config` Dictionary Schema Details](https://docs.python.org/3/library/logging.config.html#dictionary-schema-details)
\ No newline at end of file
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -44,7 +44,6 @@ vllm = "vllm.entrypoints.cli.main:main"
 [project.entry-points."vllm.general_plugins"]
 lora_filesystem_resolver = "vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver"
-lora_hf_hub_resolver = "vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver"
 [tool.setuptools_scm]
 # no extra settings needed, presence enables setuptools-scm

--- a/setup.py
+++ b/setup.py
@@ -1004,22 +1004,6 @@ def get_version():
    return locals()['__hcu_version__']
-def get_gaudi_sw_version():
-    """
-    Returns the driver version.
-    """
-    # Enable console printing for `hl-smi` check
-    output = subprocess.run("hl-smi",
-                            shell=True,
-                            text=True,
-                            capture_output=True,
-                            env={"ENABLE_CONSOLE": "true"})
-    if output.returncode == 0 and output.stdout:
-        return output.stdout.split("\n")[2].replace(
-            " ", "").split(":")[1][:-1].split("-")[0]
-    return "0.0.0"  # when hl-smi is not available
 def get_vllm_version() -> str:
    # Allow overriding the version. This is useful to build platform-specific
    # wheels (e.g. CPU, TPU) without modifying the source.

--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -2802,13 +2802,6 @@ def indexer_k_quant_and_cache(
        k, kv_cache, slot_mapping, quant_block_size, kv_cache_dtype
    )
-def indexer_k_cache(k: torch.Tensor, kv_cache: torch.Tensor,
-                    slot_mapping: torch.Tensor,
-                    kv_cache_dtype: str) -> None:
-    torch.ops._C_cache_ops.indexer_k_cache(
-        k, kv_cache, slot_mapping, kv_cache_dtype
-    )
 def cp_gather_indexer_k_quant_cache(
    kv_cache: torch.Tensor,
@@ -2898,30 +2891,6 @@ def free_shared_buffer(ptr: int) -> None:
    torch.ops._C_custom_ar.free_shared_buffer(ptr)
-def read_cache(
-        keys: torch.Tensor,
-        values: torch.Tensor,
-        key_caches: list[torch.Tensor],
-        value_caches: list[torch.Tensor],
-        slot_mapping: torch.Tensor,
-        kv_cache_dtype: str
-) -> None:
-    torch.ops._C_cache_ops.read_cache(keys, values, key_caches,
-                                      value_caches, slot_mapping,
-                                      kv_cache_dtype)
-def write_cache_multi_layers(
-        keys: torch.Tensor,
-        values: torch.Tensor,
-        key_caches: list[torch.Tensor],
-        value_caches: list[torch.Tensor],
-        slot_mapping: torch.Tensor,
-        kv_cache_dtype: str
-) -> None:
-    torch.ops._C_cache_ops.write_cache_multi_layers(keys, values, key_caches,
-                                                    value_caches, slot_mapping,
-                                                    kv_cache_dtype)
 # quick all reduce
 def init_custom_qr(rank: int, world_size: int, qr_max_size: int | None = None) -> int:
    return torch.ops._C_custom_ar.init_custom_qr(rank, world_size, qr_max_size)