Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
86a65417
Commit
86a65417
authored
Feb 05, 2026
by
zhuwenwen
Browse files
sync v0.15.1 (ex tests&vllm)
parent
45a060d6
Changes
24
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
1 addition
and
80 deletions
+1
-80
examples/others/logging_configuration.md
examples/others/logging_configuration.md
+1
-32
pyproject.toml
pyproject.toml
+0
-1
setup.py
setup.py
+0
-16
vllm/_custom_ops.py
vllm/_custom_ops.py
+0
-31
No files found.
examples/others/logging_configuration.md
View file @
86a65417
...
@@ -157,37 +157,6 @@ VLLM_CONFIGURE_LOGGING=0 \
...
@@ -157,37 +157,6 @@ VLLM_CONFIGURE_LOGGING=0 \
vllm serve mistralai/Mistral-7B-v0.1
--max-model-len
2048
vllm serve mistralai/Mistral-7B-v0.1
--max-model-len
2048
```
```
### Example 4: Disable access logs for health check endpoints
In production environments, health check endpoints like `/health`, `/metrics`, and `/ping` are frequently called by load balancers and monitoring systems, generating a large volume of repetitive access logs. To reduce log noise while keeping logs for other endpoints, use the `--disable-access-log-for-endpoints` option.
**Disable access logs for health and metrics endpoints:**
```bash
vllm serve mistralai/Mistral-7B-v0.1 --max-model-len 2048 \
    --disable-access-log-for-endpoints /health,/metrics,/ping
```
**Common endpoints to consider filtering:**
| Endpoint   | Description            | Typical Caller                                       |
| ---------- | ---------------------- | ---------------------------------------------------- |
| `/health`  | Health check           | Kubernetes liveness/readiness probes, load balancers |
| `/metrics` | Prometheus metrics     | Prometheus scraper (every 15-60s)                    |
| `/ping`    | SageMaker health check | SageMaker infrastructure                             |
| `/load`    | Server load metrics    | Custom monitoring                                    |
**Notes:**
- This option only affects uvicorn access logs, not vLLM application logs.
- Specify multiple endpoints by separating them with commas (no spaces).
- The filter uses exact path matching; query parameters are ignored (e.g., `/health?verbose=true` matches `/health`).
- If you need to completely disable all access logs, use `--disable-uvicorn-access-log` instead.
## Additional resources
## Additional resources
-
[
`logging.config` Dictionary Schema Details
](
https://docs.python.org/3/library/logging.config.html#dictionary-schema-details
)
-
[
`logging.config` Dictionary Schema Details
](
https://docs.python.org/3/library/logging.config.html#dictionary-schema-details
)
\ No newline at end of file
pyproject.toml
View file @
86a65417
...
@@ -44,7 +44,6 @@ vllm = "vllm.entrypoints.cli.main:main"
...
@@ -44,7 +44,6 @@ vllm = "vllm.entrypoints.cli.main:main"
[project.entry-points."vllm.general_plugins"]
[project.entry-points."vllm.general_plugins"]
lora_filesystem_resolver
=
"vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver"
lora_filesystem_resolver
=
"vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver"
lora_hf_hub_resolver
=
"vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver"
[tool.setuptools_scm]
[tool.setuptools_scm]
# no extra settings needed, presence enables setuptools-scm
# no extra settings needed, presence enables setuptools-scm
...
...
setup.py
View file @
86a65417
...
@@ -1004,22 +1004,6 @@ def get_version():
...
@@ -1004,22 +1004,6 @@ def get_version():
return
locals
()[
'__hcu_version__'
]
return
locals
()[
'__hcu_version__'
]
def get_gaudi_sw_version():
    """
    Return the driver version reported by `hl-smi`.

    The version is read from the third line of the `hl-smi` output: spaces
    are removed, the text after the first ":" is kept, its last character is
    dropped, and everything from the first "-" onward is discarded.

    Returns:
        The parsed version string, or "0.0.0" when `hl-smi` cannot be
        started, exits with a non-zero status, prints nothing, or prints
        output that does not have the expected layout.
    """
    # Local import so this block does not depend on a file-level `import os`.
    import os

    fallback = "0.0.0"  # when hl-smi is not available

    # Enable console printing for `hl-smi` check. The variable is added on
    # top of the inherited environment; passing a one-entry dict as `env`
    # would replace the whole environment and drop PATH and friends.
    child_env = {**os.environ, "ENABLE_CONSOLE": "true"}

    try:
        # A fixed executable needs no shell; an argument list avoids one.
        output = subprocess.run(
            ["hl-smi"],
            text=True,
            capture_output=True,
            env=child_env,
            check=False,
        )
    except OSError:
        # The executable is missing or could not be started.
        return fallback

    if output.returncode != 0 or not output.stdout:
        return fallback

    lines = output.stdout.split("\n")
    if len(lines) < 3:
        # Fewer lines than expected: no third line to parse.
        return fallback

    fields = lines[2].replace(" ", "").split(":")
    if len(fields) < 2:
        # No "key: value" pair on the third line.
        return fallback

    # Drop the last character of the value (presumably a trailing table
    # border -- TODO confirm against real `hl-smi` output), then cut off any
    # "-<suffix>" that follows the version number.
    version = fields[1][:-1].split("-")[0]
    return version or fallback
def
get_vllm_version
()
->
str
:
def
get_vllm_version
()
->
str
:
# Allow overriding the version. This is useful to build platform-specific
# Allow overriding the version. This is useful to build platform-specific
# wheels (e.g. CPU, TPU) without modifying the source.
# wheels (e.g. CPU, TPU) without modifying the source.
...
...
vllm/_custom_ops.py
View file @
86a65417
...
@@ -2801,13 +2801,6 @@ def indexer_k_quant_and_cache(
...
@@ -2801,13 +2801,6 @@ def indexer_k_quant_and_cache(
torch
.
ops
.
_C_cache_ops
.
indexer_k_quant_and_cache
(
torch
.
ops
.
_C_cache_ops
.
indexer_k_quant_and_cache
(
k
,
kv_cache
,
slot_mapping
,
quant_block_size
,
kv_cache_dtype
k
,
kv_cache
,
slot_mapping
,
quant_block_size
,
kv_cache_dtype
)
)
def indexer_k_cache(
    k: torch.Tensor,
    kv_cache: torch.Tensor,
    slot_mapping: torch.Tensor,
    kv_cache_dtype: str,
) -> None:
    """Invoke the `_C_cache_ops.indexer_k_cache` custom op.

    The four arguments are forwarded positionally and unchanged, and nothing
    is returned.

    NOTE(review): presumably the op stores `k` into `kv_cache` at the
    positions named by `slot_mapping` -- confirm against the kernel.
    """
    cache_op = torch.ops._C_cache_ops.indexer_k_cache
    cache_op(k, kv_cache, slot_mapping, kv_cache_dtype)
def
cp_gather_indexer_k_quant_cache
(
def
cp_gather_indexer_k_quant_cache
(
...
@@ -2898,30 +2891,6 @@ def free_shared_buffer(ptr: int) -> None:
...
@@ -2898,30 +2891,6 @@ def free_shared_buffer(ptr: int) -> None:
torch
.
ops
.
_C_custom_ar
.
free_shared_buffer
(
ptr
)
torch
.
ops
.
_C_custom_ar
.
free_shared_buffer
(
ptr
)
def read_cache(
    keys: torch.Tensor,
    values: torch.Tensor,
    key_caches: list[torch.Tensor],
    value_caches: list[torch.Tensor],
    slot_mapping: torch.Tensor,
    kv_cache_dtype: str,
) -> None:
    """Invoke the `_C_cache_ops.read_cache` custom op.

    The six arguments are forwarded positionally and unchanged, and nothing
    is returned.

    NOTE(review): presumably the op fills `keys` / `values` from the
    per-layer caches at `slot_mapping` -- confirm against the kernel.
    """
    cache_op = torch.ops._C_cache_ops.read_cache
    cache_op(
        keys,
        values,
        key_caches,
        value_caches,
        slot_mapping,
        kv_cache_dtype,
    )
def write_cache_multi_layers(
    keys: torch.Tensor,
    values: torch.Tensor,
    key_caches: list[torch.Tensor],
    value_caches: list[torch.Tensor],
    slot_mapping: torch.Tensor,
    kv_cache_dtype: str,
) -> None:
    """Invoke the `_C_cache_ops.write_cache_multi_layers` custom op.

    The six arguments are forwarded positionally and unchanged, and nothing
    is returned.

    NOTE(review): presumably the op writes `keys` / `values` into every
    cache in `key_caches` / `value_caches` at `slot_mapping` -- confirm
    against the kernel.
    """
    cache_op = torch.ops._C_cache_ops.write_cache_multi_layers
    cache_op(
        keys,
        values,
        key_caches,
        value_caches,
        slot_mapping,
        kv_cache_dtype,
    )
# quick all reduce
# quick all reduce
def
init_custom_qr
(
rank
:
int
,
world_size
:
int
,
qr_max_size
:
int
|
None
=
None
)
->
int
:
def
init_custom_qr
(
rank
:
int
,
world_size
:
int
,
qr_max_size
:
int
|
None
=
None
)
->
int
:
return
torch
.
ops
.
_C_custom_ar
.
init_custom_qr
(
rank
,
world_size
,
qr_max_size
)
return
torch
.
ops
.
_C_custom_ar
.
init_custom_qr
(
rank
,
world_size
,
qr_max_size
)
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment