Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
05972ea7
Unverified
Commit
05972ea7
authored
Feb 26, 2026
by
Wentao Ye
Committed by
GitHub
Feb 26, 2026
Browse files
[Refactor] Remove dead or duplicate func utils or variables (#35318)
Signed-off-by:
yewentao256
<
zhyanwentao@126.com
>
parent
111d8690
Changes
9
Show whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
0 additions
and
199 deletions
+0
-199
benchmarks/backend_request_func.py
benchmarks/backend_request_func.py
+0
-6
benchmarks/benchmark_utils.py
benchmarks/benchmark_utils.py
+0
-71
benchmarks/cutlass_benchmarks/utils.py
benchmarks/cutlass_benchmarks/utils.py
+0
-13
benchmarks/disagg_benchmarks/rate_limiter.py
benchmarks/disagg_benchmarks/rate_limiter.py
+0
-45
benchmarks/disagg_benchmarks/request_queue.py
benchmarks/disagg_benchmarks/request_queue.py
+0
-39
vllm/model_executor/layers/quantization/ptpc_fp8.py
vllm/model_executor/layers/quantization/ptpc_fp8.py
+0
-5
vllm/model_executor/layers/quantization/utils/marlin_utils.py
.../model_executor/layers/quantization/utils/marlin_utils.py
+0
-18
vllm/model_executor/models/hyperclovax_vision.py
vllm/model_executor/models/hyperclovax_vision.py
+0
-1
vllm/v1/engine/core.py
vllm/v1/engine/core.py
+0
-1
No files found.
benchmarks/backend_request_func.py
View file @
05972ea7
...
...
@@ -649,9 +649,3 @@ ASYNC_REQUEST_FUNCS = {
"sglang"
:
async_request_openai_completions
,
"llama.cpp"
:
async_request_openai_completions
,
}
OPENAI_COMPATIBLE_BACKENDS
=
[
k
for
k
,
v
in
ASYNC_REQUEST_FUNCS
.
items
()
if
v
in
(
async_request_openai_completions
,
async_request_openai_chat_completions
)
]
benchmarks/benchmark_utils.py
View file @
05972ea7
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
argparse
import
json
import
math
import
os
import
time
from
types
import
TracebackType
from
typing
import
Any
def
convert_to_pytorch_benchmark_format
(
args
:
argparse
.
Namespace
,
metrics
:
dict
[
str
,
list
],
extra_info
:
dict
[
str
,
Any
]
)
->
list
:
"""
Save the benchmark results in the format used by PyTorch OSS benchmark with
on metric per record
https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
"""
records
=
[]
if
not
os
.
environ
.
get
(
"SAVE_TO_PYTORCH_BENCHMARK_FORMAT"
,
False
):
return
records
for
name
,
benchmark_values
in
metrics
.
items
():
record
=
{
"benchmark"
:
{
"name"
:
"vLLM benchmark"
,
"extra_info"
:
{
"args"
:
vars
(
args
),
},
},
"model"
:
{
"name"
:
args
.
model
,
},
"metric"
:
{
"name"
:
name
,
"benchmark_values"
:
benchmark_values
,
"extra_info"
:
extra_info
,
},
}
tp
=
record
[
"benchmark"
][
"extra_info"
][
"args"
].
get
(
"tensor_parallel_size"
)
# Save tensor_parallel_size parameter if it's part of the metadata
if
not
tp
and
"tensor_parallel_size"
in
extra_info
:
record
[
"benchmark"
][
"extra_info"
][
"args"
][
"tensor_parallel_size"
]
=
(
extra_info
[
"tensor_parallel_size"
]
)
records
.
append
(
record
)
return
records
class
InfEncoder
(
json
.
JSONEncoder
):
def
clear_inf
(
self
,
o
:
Any
):
if
isinstance
(
o
,
dict
):
return
{
k
:
self
.
clear_inf
(
v
)
for
k
,
v
in
o
.
items
()}
elif
isinstance
(
o
,
list
):
return
[
self
.
clear_inf
(
v
)
for
v
in
o
]
elif
isinstance
(
o
,
float
)
and
math
.
isinf
(
o
):
return
"inf"
return
o
def
iterencode
(
self
,
o
:
Any
,
*
args
,
**
kwargs
)
->
Any
:
return
super
().
iterencode
(
self
.
clear_inf
(
o
),
*
args
,
**
kwargs
)
def
write_to_json
(
filename
:
str
,
records
:
list
)
->
None
:
with
open
(
filename
,
"w"
)
as
f
:
json
.
dump
(
records
,
f
,
cls
=
InfEncoder
,
default
=
lambda
o
:
f
"<
{
type
(
o
).
__name__
}
object is not JSON serializable>"
,
)
# Collect time and generate time metrics
...
...
benchmarks/cutlass_benchmarks/utils.py
View file @
05972ea7
...
...
@@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Cutlass bench utils
from
collections.abc
import
Iterable
import
torch
...
...
@@ -86,15 +85,3 @@ def make_rand_sparse_tensors(
# Compressed B, Metadata, Original A, B
return
b_compressed
,
e
,
a
,
b
def
make_n_rand_sparse_tensors
(
num_tensors
:
int
,
dtype
:
torch
.
dtype
,
m
:
int
,
n
:
int
,
k
:
int
)
->
tuple
[
Iterable
[
torch
.
Tensor
],
Iterable
[
torch
.
Tensor
]]:
ABs
=
[]
for
_
in
range
(
num_tensors
):
b_comp
,
e
,
a
,
b
=
make_rand_sparse_tensors
(
dtype
,
m
,
n
,
k
)
if
b_comp
is
not
None
:
ABs
.
append
(
make_rand_sparse_tensors
(
dtype
,
m
,
n
,
k
))
BComps
,
Es
,
As
,
Bs
=
zip
(
*
ABs
)
return
list
(
BComps
),
list
(
Es
),
list
(
As
),
list
(
Bs
)
benchmarks/disagg_benchmarks/rate_limiter.py
deleted
100644 → 0
View file @
111d8690
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
asyncio
import
time
class
RateLimiter
:
"""Token bucket rate limiter implementation"""
def
__init__
(
self
,
rate_limit
):
self
.
rate_limit
=
rate_limit
# Requests per second
self
.
num_available_tokens
=
rate_limit
# Available tokens
self
.
last_refill
=
time
.
monotonic
()
# Last token refill time
self
.
lock
=
asyncio
.
Lock
()
# Synchronization lock
async
def
acquire
(
self
):
"""Acquire a token from the rate limiter"""
while
True
:
async
with
self
.
lock
:
current_time
=
time
.
monotonic
()
elapsed
=
current_time
-
self
.
last_refill
# Refill num_available_tokens if more than 1 second has passed
if
elapsed
>
1.0
:
self
.
num_available_tokens
=
self
.
rate_limit
self
.
last_refill
=
current_time
# Check if num_available_tokens are available
if
self
.
num_available_tokens
>
0
:
self
.
num_available_tokens
-=
1
return
True
# Calculate wait time if no num_available_tokens available
wait_time
=
1.0
-
elapsed
await
asyncio
.
sleep
(
wait_time
)
async
def
__aenter__
(
self
):
"""Enter async context manager - acquire token"""
await
self
.
acquire
()
return
self
async
def
__aexit__
(
self
,
exc_type
,
exc_value
,
traceback
):
"""Exit async context manager - no cleanup needed"""
pass
benchmarks/disagg_benchmarks/request_queue.py
deleted
100644 → 0
View file @
111d8690
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
asyncio
from
collections
import
deque
class
RequestQueue
:
"""Request queue manager with concurrency control"""
def
__init__
(
self
,
max_concurrent
,
max_queue_size
):
# Maximum concurrent requests
self
.
max_concurrent
=
max_concurrent
self
.
max_queue_size
=
max_queue_size
# Maximum queue size
# Concurrency control
self
.
semaphore
=
asyncio
.
Semaphore
(
max_concurrent
)
self
.
queue
=
deque
()
# Request queue
self
.
queue_size
=
0
# Current queue size
self
.
lock
=
asyncio
.
Lock
()
# Sync queue Lock
async
def
enqueue
(
self
,
task
):
"""Add a request task to the queue"""
async
with
self
.
lock
:
if
self
.
queue_size
>=
self
.
max_queue_size
:
return
False
self
.
queue
.
append
(
task
)
self
.
queue_size
+=
1
return
True
async
def
process
(
self
):
"""Process queued requests using semaphore for concurrency control"""
while
True
:
if
self
.
queue
:
async
with
self
.
semaphore
,
self
.
lock
:
task
=
self
.
queue
.
popleft
()
self
.
queue_size
-=
1
await
task
await
asyncio
.
sleep
(
0.01
)
# Yield control to event loop
vllm/model_executor/layers/quantization/ptpc_fp8.py
View file @
05972ea7
...
...
@@ -7,7 +7,6 @@ import torch
from
torch.nn.parameter
import
Parameter
from
vllm
import
_custom_ops
as
ops
from
vllm.logger
import
init_logger
from
vllm.model_executor.kernels.linear
import
(
init_fp8_linear_kernel
,
)
...
...
@@ -26,10 +25,6 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
)
from
vllm.platforms
import
current_platform
ACTIVATION_SCHEMES
=
[
"static"
,
"dynamic"
]
logger
=
init_logger
(
__name__
)
class
PTPCFp8Config
(
Fp8Config
):
"""Config class for Per-Token-Per-Channel Dynamic Quantization Fp8."""
...
...
vllm/model_executor/layers/quantization/utils/marlin_utils.py
View file @
05972ea7
...
...
@@ -255,18 +255,6 @@ def marlin_moe_intermediate_size(w1_packed: torch.Tensor, w2_packed: torch.Tenso
return
w2_packed
.
size
(
1
)
*
marlin_tile_size
def
marlin_make_workspace
(
output_size_per_partition
:
int
,
device
:
torch
.
device
)
->
torch
.
Tensor
:
max_workspace_size
=
(
output_size_per_partition
//
GPTQ_MARLIN_MIN_THREAD_N
)
*
GPTQ_MARLIN_MAX_PARALLEL
return
torch
.
zeros
(
max_workspace_size
,
dtype
=
torch
.
int
,
device
=
device
,
requires_grad
=
False
)
def
marlin_make_workspace_new
(
device
:
torch
.
device
,
max_blocks_per_sm
:
int
=
1
)
->
torch
.
Tensor
:
...
...
@@ -297,12 +285,6 @@ def marlin_make_empty_g_idx(device: torch.device) -> torch.Tensor:
)
def
marlin_make_empty_zp
(
device
:
torch
.
device
)
->
torch
.
Tensor
:
return
torch
.
nn
.
Parameter
(
torch
.
empty
(
0
,
dtype
=
torch
.
int
,
device
=
device
),
requires_grad
=
False
)
def
marlin_sort_g_idx
(
g_idx
:
torch
.
Tensor
)
->
tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
g_idx_sort_indices
=
torch
.
argsort
(
g_idx
).
to
(
torch
.
int
)
return
g_idx
[
g_idx_sort_indices
],
g_idx_sort_indices
...
...
vllm/model_executor/models/hyperclovax_vision.py
View file @
05972ea7
...
...
@@ -49,7 +49,6 @@ from .utils import (
)
from
.vision
import
get_vision_encoder_info
EOT
=
"<|endofturn|>"
IMAGE_TOKEN
:
str
=
"<|dummy3|>"
VIDEO_TOKEN
:
str
=
"<|_unuse_missing_100270|>"
...
...
vllm/v1/engine/core.py
View file @
05972ea7
...
...
@@ -72,7 +72,6 @@ from vllm.version import __version__ as VLLM_VERSION
logger
=
init_logger
(
__name__
)
POLLING_TIMEOUT_S
=
2.5
HANDSHAKE_TIMEOUT_MINS
=
5
_R
=
TypeVar
(
"_R"
)
# Return type for collective_rpc
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment