Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
05972ea7
Unverified
Commit
05972ea7
authored
Feb 26, 2026
by
Wentao Ye
Committed by
GitHub
Feb 26, 2026
Browse files
[Refactor] Remove dead or duplicate func utils or variables (#35318)
Signed-off-by:
yewentao256
<
zhyanwentao@126.com
>
parent
111d8690
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
0 additions
and
199 deletions
+0
-199
benchmarks/backend_request_func.py
benchmarks/backend_request_func.py
+0
-6
benchmarks/benchmark_utils.py
benchmarks/benchmark_utils.py
+0
-71
benchmarks/cutlass_benchmarks/utils.py
benchmarks/cutlass_benchmarks/utils.py
+0
-13
benchmarks/disagg_benchmarks/rate_limiter.py
benchmarks/disagg_benchmarks/rate_limiter.py
+0
-45
benchmarks/disagg_benchmarks/request_queue.py
benchmarks/disagg_benchmarks/request_queue.py
+0
-39
vllm/model_executor/layers/quantization/ptpc_fp8.py
vllm/model_executor/layers/quantization/ptpc_fp8.py
+0
-5
vllm/model_executor/layers/quantization/utils/marlin_utils.py
.../model_executor/layers/quantization/utils/marlin_utils.py
+0
-18
vllm/model_executor/models/hyperclovax_vision.py
vllm/model_executor/models/hyperclovax_vision.py
+0
-1
vllm/v1/engine/core.py
vllm/v1/engine/core.py
+0
-1
No files found.
benchmarks/backend_request_func.py
View file @
05972ea7
...
@@ -649,9 +649,3 @@ ASYNC_REQUEST_FUNCS = {
...
@@ -649,9 +649,3 @@ ASYNC_REQUEST_FUNCS = {
"sglang"
:
async_request_openai_completions
,
"sglang"
:
async_request_openai_completions
,
"llama.cpp"
:
async_request_openai_completions
,
"llama.cpp"
:
async_request_openai_completions
,
}
}
# Names of the backends in ASYNC_REQUEST_FUNCS whose request function is one
# of the two OpenAI request helpers (completions or chat completions).
OPENAI_COMPATIBLE_BACKENDS = [
    backend_name
    for backend_name, request_func in ASYNC_REQUEST_FUNCS.items()
    if request_func
    in (
        async_request_openai_completions,
        async_request_openai_chat_completions,
    )
]
benchmarks/benchmark_utils.py
View file @
05972ea7
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
argparse
import
json
import
math
import
os
import
time
import
time
from
types
import
TracebackType
from
types
import
TracebackType
from
typing
import
Any
def convert_to_pytorch_benchmark_format(
    args: argparse.Namespace, metrics: dict[str, list], extra_info: dict[str, Any]
) -> list:
    """
    Save the benchmark results in the format used by PyTorch OSS benchmark with
    one metric per record
    https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database

    Args:
        args: Parsed command-line arguments. Must expose ``model``; all
            attributes are copied into each record's ``extra_info["args"]``.
        metrics: Mapping of metric name to its list of benchmark values.
        extra_info: Extra metadata attached to every metric. If it contains
            ``tensor_parallel_size`` and ``args`` has no truthy value for it,
            the value is copied into the record's args.

    Returns:
        One record per entry in ``metrics``, or an empty list when the
        ``SAVE_TO_PYTORCH_BENCHMARK_FORMAT`` environment variable is unset
        or empty.
    """
    records = []
    if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False):
        return records

    for name, benchmark_values in metrics.items():
        # vars(args) is the Namespace's own __dict__. Work on a copy so the
        # tensor_parallel_size back-fill below neither mutates the caller's
        # args nor shares a single dict between all records.
        args_info = dict(vars(args))

        # Save tensor_parallel_size parameter if it's part of the metadata
        if (
            not args_info.get("tensor_parallel_size")
            and "tensor_parallel_size" in extra_info
        ):
            args_info["tensor_parallel_size"] = extra_info["tensor_parallel_size"]

        record = {
            "benchmark": {
                "name": "vLLM benchmark",
                "extra_info": {
                    "args": args_info,
                },
            },
            "model": {
                "name": args.model,
            },
            "metric": {
                "name": name,
                "benchmark_values": benchmark_values,
                "extra_info": extra_info,
            },
        }
        records.append(record)

    return records
class InfEncoder(json.JSONEncoder):
    """JSON encoder that writes infinite floats as the string ``"inf"``."""

    def clear_inf(self, o: Any):
        """Return ``o`` with infinite floats replaced by ``"inf"``.

        Dicts and lists are rebuilt recursively (as plain ``dict`` / ``list``).
        Any other value, including tuples, is returned untouched. Both
        positive and negative infinity map to the same ``"inf"`` string.
        """
        if isinstance(o, dict):
            cleaned = {}
            for key, value in o.items():
                cleaned[key] = self.clear_inf(value)
            return cleaned

        if isinstance(o, list):
            return list(map(self.clear_inf, o))

        is_infinite = isinstance(o, float) and math.isinf(o)
        return "inf" if is_infinite else o

    def iterencode(self, o: Any, *args, **kwargs) -> Any:
        """Sanitize ``o`` with ``clear_inf`` before delegating to the base encoder."""
        sanitized = self.clear_inf(o)
        return super().iterencode(sanitized, *args, **kwargs)
def write_to_json(filename: str, records: list) -> None:
    """Write ``records`` to ``filename`` as JSON, replacing any existing file.

    Encoding goes through ``InfEncoder``. Objects the encoder cannot
    serialize are written as a placeholder string naming their type instead
    of raising.
    """

    def describe_unserializable(o):
        # Fallback used by json for values it has no encoding for.
        return f"<{type(o).__name__} object is not JSON serializable>"

    with open(filename, "w") as out_file:
        json.dump(
            records,
            out_file,
            cls=InfEncoder,
            default=describe_unserializable,
        )
# Collect time and generate time metrics
# Collect time and generate time metrics
...
...
benchmarks/cutlass_benchmarks/utils.py
View file @
05972ea7
...
@@ -2,7 +2,6 @@
...
@@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Cutlass bench utils
# Cutlass bench utils
from
collections.abc
import
Iterable
import
torch
import
torch
...
@@ -86,15 +85,3 @@ def make_rand_sparse_tensors(
...
@@ -86,15 +85,3 @@ def make_rand_sparse_tensors(
# Compressed B, Metadata, Original A, B
# Compressed B, Metadata, Original A, B
return
b_compressed
,
e
,
a
,
b
return
b_compressed
,
e
,
a
,
b
def make_n_rand_sparse_tensors(
    num_tensors: int, dtype: torch.dtype, m: int, n: int, k: int
) -> tuple[
    list[torch.Tensor], list[torch.Tensor], list[torch.Tensor], list[torch.Tensor]
]:
    """Generate up to ``num_tensors`` random sparse samples, grouped by field.

    Each sample comes from ``make_rand_sparse_tensors(dtype, m, n, k)``, which
    returns ``(b_compressed, e, a, b)``. Samples whose compressed B is
    ``None`` are dropped.

    Returns:
        Four parallel lists ``(b_compressed, e, a, b)``. All four are empty
        when no sample was kept (for example ``num_tensors == 0``).
    """
    samples = []
    for _ in range(num_tensors):
        sample = make_rand_sparse_tensors(dtype, m, n, k)
        # Keep the very sample that was checked; generating a second one
        # would double the work and store an unchecked result.
        if sample[0] is not None:
            samples.append(sample)

    # zip(*[]) yields nothing, so unpacking it into four names would raise.
    if not samples:
        return [], [], [], []

    b_comps, es, a_list, b_list = zip(*samples)
    return list(b_comps), list(es), list(a_list), list(b_list)
benchmarks/disagg_benchmarks/rate_limiter.py
deleted
100644 → 0
View file @
111d8690
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
asyncio
import
time
class RateLimiter:
    """Async rate limiter handing out ``rate_limit`` tokens per window.

    The token count is reset to ``rate_limit`` whenever more than one second
    has passed since the last refill. Usable directly via ``acquire()`` or as
    an ``async with`` context manager.
    """

    def __init__(self, rate_limit):
        # Guards the token counter and the refill timestamp.
        self.lock = asyncio.Lock()
        # Requests per second.
        self.rate_limit = rate_limit
        # Tokens left in the current window; starts full.
        self.num_available_tokens = rate_limit
        # Monotonic time of the last refill.
        self.last_refill = time.monotonic()

    async def acquire(self):
        """Wait until a token is available, consume it, and return ``True``."""
        while True:
            async with self.lock:
                now = time.monotonic()
                since_refill = now - self.last_refill

                # A full window has elapsed: top the tokens back up.
                if since_refill > 1.0:
                    self.last_refill = now
                    self.num_available_tokens = self.rate_limit

                has_token = self.num_available_tokens > 0
                if has_token:
                    self.num_available_tokens -= 1
                    return True

                # Nothing left: compute the wait from the elapsed time
                # measured above.
                remaining = 1.0 - since_refill

            await asyncio.sleep(remaining)

    async def __aenter__(self):
        """Acquire a token on entry and return this limiter."""
        await self.acquire()
        return self

    async def __aexit__(self, exc_type, exc_value, traceback):
        """Nothing to release on exit."""
        return None
benchmarks/disagg_benchmarks/request_queue.py
deleted
100644 → 0
View file @
111d8690
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
asyncio
from
collections
import
deque
class RequestQueue:
    """Bounded FIFO of awaitables, executed under a concurrency limit."""

    def __init__(self, max_concurrent, max_queue_size):
        # Maximum concurrent requests
        self.max_concurrent = max_concurrent
        # Maximum queue size
        self.max_queue_size = max_queue_size
        # Concurrency control
        self.semaphore = asyncio.Semaphore(max_concurrent)
        # Request queue
        self.queue = deque()
        # Current queue size
        self.queue_size = 0
        # Guards queue and queue_size
        self.lock = asyncio.Lock()

    async def enqueue(self, task):
        """Add a request task to the queue.

        Returns:
            ``True`` if the task was queued, ``False`` if the queue already
            holds ``max_queue_size`` tasks.
        """
        async with self.lock:
            if self.queue_size >= self.max_queue_size:
                return False

            self.queue.append(task)
            self.queue_size += 1
            return True

    async def process(self):
        """Run queued tasks forever, bounded by the semaphore.

        Polls the queue every 10 ms. Several ``process()`` coroutines may run
        at once: a task is only popped after a concurrency slot is held, and
        the queue is re-checked under the lock at that point.
        """
        while True:
            if self.queue:
                async with self.semaphore:
                    # Another worker may have drained the queue while this
                    # one waited for the semaphore or the lock, so check
                    # again before popping instead of raising IndexError.
                    async with self.lock:
                        task = None
                        if self.queue:
                            task = self.queue.popleft()
                            self.queue_size -= 1

                    # Run the task without the queue lock so enqueue() and
                    # other workers are not blocked while it executes.
                    if task is not None:
                        await task

            await asyncio.sleep(0.01)  # Yield control to event loop
vllm/model_executor/layers/quantization/ptpc_fp8.py
View file @
05972ea7
...
@@ -7,7 +7,6 @@ import torch
...
@@ -7,7 +7,6 @@ import torch
from
torch.nn.parameter
import
Parameter
from
torch.nn.parameter
import
Parameter
from
vllm
import
_custom_ops
as
ops
from
vllm
import
_custom_ops
as
ops
from
vllm.logger
import
init_logger
from
vllm.model_executor.kernels.linear
import
(
from
vllm.model_executor.kernels.linear
import
(
init_fp8_linear_kernel
,
init_fp8_linear_kernel
,
)
)
...
@@ -26,10 +25,6 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
...
@@ -26,10 +25,6 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
)
)
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
ACTIVATION_SCHEMES
=
[
"static"
,
"dynamic"
]
logger
=
init_logger
(
__name__
)
class
PTPCFp8Config
(
Fp8Config
):
class
PTPCFp8Config
(
Fp8Config
):
"""Config class for Per-Token-Per-Channel Dynamic Quantization Fp8."""
"""Config class for Per-Token-Per-Channel Dynamic Quantization Fp8."""
...
...
vllm/model_executor/layers/quantization/utils/marlin_utils.py
View file @
05972ea7
...
@@ -255,18 +255,6 @@ def marlin_moe_intermediate_size(w1_packed: torch.Tensor, w2_packed: torch.Tenso
...
@@ -255,18 +255,6 @@ def marlin_moe_intermediate_size(w1_packed: torch.Tensor, w2_packed: torch.Tenso
return
w2_packed
.
size
(
1
)
*
marlin_tile_size
return
w2_packed
.
size
(
1
)
*
marlin_tile_size
def marlin_make_workspace(
    output_size_per_partition: int, device: torch.device
) -> torch.Tensor:
    """Allocate a zero-filled int workspace tensor on ``device``.

    The length is ``output_size_per_partition // GPTQ_MARLIN_MIN_THREAD_N``
    multiplied by ``GPTQ_MARLIN_MAX_PARALLEL``.
    """
    thread_n_chunks = output_size_per_partition // GPTQ_MARLIN_MIN_THREAD_N
    workspace_len = thread_n_chunks * GPTQ_MARLIN_MAX_PARALLEL

    return torch.zeros(
        workspace_len,
        dtype=torch.int,
        device=device,
        requires_grad=False,
    )
def
marlin_make_workspace_new
(
def
marlin_make_workspace_new
(
device
:
torch
.
device
,
max_blocks_per_sm
:
int
=
1
device
:
torch
.
device
,
max_blocks_per_sm
:
int
=
1
)
->
torch
.
Tensor
:
)
->
torch
.
Tensor
:
...
@@ -297,12 +285,6 @@ def marlin_make_empty_g_idx(device: torch.device) -> torch.Tensor:
...
@@ -297,12 +285,6 @@ def marlin_make_empty_g_idx(device: torch.device) -> torch.Tensor:
)
)
def marlin_make_empty_zp(device: torch.device) -> torch.Tensor:
    """Return a zero-element int tensor on ``device`` as a non-trainable Parameter."""
    placeholder = torch.empty(0, dtype=torch.int, device=device)
    return torch.nn.Parameter(placeholder, requires_grad=False)
def
marlin_sort_g_idx
(
g_idx
:
torch
.
Tensor
)
->
tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
def
marlin_sort_g_idx
(
g_idx
:
torch
.
Tensor
)
->
tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
g_idx_sort_indices
=
torch
.
argsort
(
g_idx
).
to
(
torch
.
int
)
g_idx_sort_indices
=
torch
.
argsort
(
g_idx
).
to
(
torch
.
int
)
return
g_idx
[
g_idx_sort_indices
],
g_idx_sort_indices
return
g_idx
[
g_idx_sort_indices
],
g_idx_sort_indices
...
...
vllm/model_executor/models/hyperclovax_vision.py
View file @
05972ea7
...
@@ -49,7 +49,6 @@ from .utils import (
...
@@ -49,7 +49,6 @@ from .utils import (
)
)
from
.vision
import
get_vision_encoder_info
from
.vision
import
get_vision_encoder_info
EOT
=
"<|endofturn|>"
IMAGE_TOKEN
:
str
=
"<|dummy3|>"
IMAGE_TOKEN
:
str
=
"<|dummy3|>"
VIDEO_TOKEN
:
str
=
"<|_unuse_missing_100270|>"
VIDEO_TOKEN
:
str
=
"<|_unuse_missing_100270|>"
...
...
vllm/v1/engine/core.py
View file @
05972ea7
...
@@ -72,7 +72,6 @@ from vllm.version import __version__ as VLLM_VERSION
...
@@ -72,7 +72,6 @@ from vllm.version import __version__ as VLLM_VERSION
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
POLLING_TIMEOUT_S
=
2.5
HANDSHAKE_TIMEOUT_MINS
=
5
HANDSHAKE_TIMEOUT_MINS
=
5
_R
=
TypeVar
(
"_R"
)
# Return type for collective_rpc
_R
=
TypeVar
(
"_R"
)
# Return type for collective_rpc
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment