Commit 86a65417 authored by zhuwenwen's avatar zhuwenwen
Browse files

sync v0.15.1 (ex tests&vllm)

parent 45a060d6
...@@ -157,37 +157,6 @@ VLLM_CONFIGURE_LOGGING=0 \ ...@@ -157,37 +157,6 @@ VLLM_CONFIGURE_LOGGING=0 \
vllm serve mistralai/Mistral-7B-v0.1 --max-model-len 2048 vllm serve mistralai/Mistral-7B-v0.1 --max-model-len 2048
``` ```
### Example 4: Disable access logs for health check endpoints
In production environments, health check endpoints like `/health`, `/metrics`,
and `/ping` are frequently called by load balancers and monitoring systems,
generating a large volume of repetitive access logs. To reduce log noise while
keeping logs for other endpoints, use the `--disable-access-log-for-endpoints`
option.
**Disable access logs for health and metrics endpoints:**
```bash
vllm serve mistralai/Mistral-7B-v0.1 --max-model-len 2048 \
--disable-access-log-for-endpoints /health,/metrics,/ping
```
**Common endpoints to consider filtering:**
| Endpoint | Description | Typical Caller |
| ---------- | ---------------------- | ---------------------------------------------------- |
| `/health` | Health check | Kubernetes liveness/readiness probes, load balancers |
| `/metrics` | Prometheus metrics | Prometheus scraper (every 15-60s) |
| `/ping` | SageMaker health check | SageMaker infrastructure |
| `/load` | Server load metrics | Custom monitoring |
**Notes:**
- This option only affects uvicorn access logs, not vLLM application logs
- Specify multiple endpoints by separating them with commas (no spaces)
- The filter matches the request path exactly; query parameters are ignored (e.g., `/health?verbose=true` matches `/health`)
- If you need to completely disable all access logs, use `--disable-uvicorn-access-log` instead
## Additional resources ## Additional resources
- [`logging.config` Dictionary Schema Details](https://docs.python.org/3/library/logging.config.html#dictionary-schema-details) - [`logging.config` Dictionary Schema Details](https://docs.python.org/3/library/logging.config.html#dictionary-schema-details)
\ No newline at end of file
...@@ -44,7 +44,6 @@ vllm = "vllm.entrypoints.cli.main:main" ...@@ -44,7 +44,6 @@ vllm = "vllm.entrypoints.cli.main:main"
[project.entry-points."vllm.general_plugins"] [project.entry-points."vllm.general_plugins"]
lora_filesystem_resolver = "vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver" lora_filesystem_resolver = "vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver"
lora_hf_hub_resolver = "vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver"
[tool.setuptools_scm] [tool.setuptools_scm]
# no extra settings needed, presence enables setuptools-scm # no extra settings needed, presence enables setuptools-scm
......
...@@ -1004,22 +1004,6 @@ def get_version(): ...@@ -1004,22 +1004,6 @@ def get_version():
return locals()['__hcu_version__'] return locals()['__hcu_version__']
def get_gaudi_sw_version():
    """
    Return the driver version as reported by `hl-smi`.

    The string "0.0.0" is returned when the command exits with a non-zero
    status or produces no standard output.
    """
    # Enable console printing for the `hl-smi` check
    result = subprocess.run(
        "hl-smi",
        shell=True,
        text=True,
        capture_output=True,
        env={"ENABLE_CONSOLE": "true"},
    )

    if result.returncode != 0 or not result.stdout:
        return "0.0.0"  # when hl-smi is not available

    # Take the third output line, drop all spaces, and keep the field that
    # follows the first colon.
    third_line = result.stdout.split("\n")[2]
    field = third_line.replace(" ", "").split(":")[1]
    # Discard the field's last character, then anything after the first "-".
    return field[:-1].split("-")[0]
def get_vllm_version() -> str: def get_vllm_version() -> str:
# Allow overriding the version. This is useful to build platform-specific # Allow overriding the version. This is useful to build platform-specific
# wheels (e.g. CPU, TPU) without modifying the source. # wheels (e.g. CPU, TPU) without modifying the source.
......
...@@ -2802,13 +2802,6 @@ def indexer_k_quant_and_cache( ...@@ -2802,13 +2802,6 @@ def indexer_k_quant_and_cache(
k, kv_cache, slot_mapping, quant_block_size, kv_cache_dtype k, kv_cache, slot_mapping, quant_block_size, kv_cache_dtype
) )
def indexer_k_cache(
    k: torch.Tensor,
    kv_cache: torch.Tensor,
    slot_mapping: torch.Tensor,
    kv_cache_dtype: str,
) -> None:
    """Invoke the `_C_cache_ops.indexer_k_cache` custom op.

    All four arguments are forwarded positionally and unchanged; nothing is
    returned.
    """
    # NOTE(review): presumably stores `k` into `kv_cache` at the positions in
    # `slot_mapping` -- confirm against the kernel implementation.
    cache_op = torch.ops._C_cache_ops.indexer_k_cache
    cache_op(k, kv_cache, slot_mapping, kv_cache_dtype)
def cp_gather_indexer_k_quant_cache( def cp_gather_indexer_k_quant_cache(
kv_cache: torch.Tensor, kv_cache: torch.Tensor,
...@@ -2898,30 +2891,6 @@ def free_shared_buffer(ptr: int) -> None: ...@@ -2898,30 +2891,6 @@ def free_shared_buffer(ptr: int) -> None:
torch.ops._C_custom_ar.free_shared_buffer(ptr) torch.ops._C_custom_ar.free_shared_buffer(ptr)
def read_cache(
    keys: torch.Tensor,
    values: torch.Tensor,
    key_caches: list[torch.Tensor],
    value_caches: list[torch.Tensor],
    slot_mapping: torch.Tensor,
    kv_cache_dtype: str,
) -> None:
    """Invoke the `_C_cache_ops.read_cache` custom op.

    Every argument is forwarded positionally and unchanged; nothing is
    returned.
    """
    # NOTE(review): presumably fills `keys`/`values` from the per-layer caches
    # at `slot_mapping` -- confirm against the kernel implementation.
    cache_op = torch.ops._C_cache_ops.read_cache
    cache_op(
        keys,
        values,
        key_caches,
        value_caches,
        slot_mapping,
        kv_cache_dtype,
    )
def write_cache_multi_layers(
    keys: torch.Tensor,
    values: torch.Tensor,
    key_caches: list[torch.Tensor],
    value_caches: list[torch.Tensor],
    slot_mapping: torch.Tensor,
    kv_cache_dtype: str,
) -> None:
    """Invoke the `_C_cache_ops.write_cache_multi_layers` custom op.

    Every argument is forwarded positionally and unchanged; nothing is
    returned.
    """
    # NOTE(review): presumably stores `keys`/`values` into each layer's cache
    # at `slot_mapping` -- confirm against the kernel implementation.
    cache_op = torch.ops._C_cache_ops.write_cache_multi_layers
    cache_op(
        keys,
        values,
        key_caches,
        value_caches,
        slot_mapping,
        kv_cache_dtype,
    )
# quick all reduce # quick all reduce
def init_custom_qr(rank: int, world_size: int, qr_max_size: int | None = None) -> int: def init_custom_qr(rank: int, world_size: int, qr_max_size: int | None = None) -> int:
return torch.ops._C_custom_ar.init_custom_qr(rank, world_size, qr_max_size) return torch.ops._C_custom_ar.init_custom_qr(rank, world_size, qr_max_size)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment