Unverified Commit 05972ea7 authored by Wentao Ye's avatar Wentao Ye Committed by GitHub
Browse files

[Refactor] Remove dead or duplicate func utils or variables (#35318)


Signed-off-by: default avataryewentao256 <zhyanwentao@126.com>
parent 111d8690
...@@ -649,9 +649,3 @@ ASYNC_REQUEST_FUNCS = { ...@@ -649,9 +649,3 @@ ASYNC_REQUEST_FUNCS = {
"sglang": async_request_openai_completions, "sglang": async_request_openai_completions,
"llama.cpp": async_request_openai_completions, "llama.cpp": async_request_openai_completions,
} }
OPENAI_COMPATIBLE_BACKENDS = [
k
for k, v in ASYNC_REQUEST_FUNCS.items()
if v in (async_request_openai_completions, async_request_openai_chat_completions)
]
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import json
import math
import os
import time import time
from types import TracebackType from types import TracebackType
from typing import Any
def convert_to_pytorch_benchmark_format(
args: argparse.Namespace, metrics: dict[str, list], extra_info: dict[str, Any]
) -> list:
"""
Save the benchmark results in the format used by PyTorch OSS benchmark with
on metric per record
https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
"""
records = []
if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False):
return records
for name, benchmark_values in metrics.items():
record = {
"benchmark": {
"name": "vLLM benchmark",
"extra_info": {
"args": vars(args),
},
},
"model": {
"name": args.model,
},
"metric": {
"name": name,
"benchmark_values": benchmark_values,
"extra_info": extra_info,
},
}
tp = record["benchmark"]["extra_info"]["args"].get("tensor_parallel_size")
# Save tensor_parallel_size parameter if it's part of the metadata
if not tp and "tensor_parallel_size" in extra_info:
record["benchmark"]["extra_info"]["args"]["tensor_parallel_size"] = (
extra_info["tensor_parallel_size"]
)
records.append(record)
return records
class InfEncoder(json.JSONEncoder):
def clear_inf(self, o: Any):
if isinstance(o, dict):
return {k: self.clear_inf(v) for k, v in o.items()}
elif isinstance(o, list):
return [self.clear_inf(v) for v in o]
elif isinstance(o, float) and math.isinf(o):
return "inf"
return o
def iterencode(self, o: Any, *args, **kwargs) -> Any:
return super().iterencode(self.clear_inf(o), *args, **kwargs)
def write_to_json(filename: str, records: list) -> None:
with open(filename, "w") as f:
json.dump(
records,
f,
cls=InfEncoder,
default=lambda o: f"<{type(o).__name__} object is not JSON serializable>",
)
# Collect time and generate time metrics # Collect time and generate time metrics
......
...@@ -2,7 +2,6 @@ ...@@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Cutlass bench utils # Cutlass bench utils
from collections.abc import Iterable
import torch import torch
...@@ -86,15 +85,3 @@ def make_rand_sparse_tensors( ...@@ -86,15 +85,3 @@ def make_rand_sparse_tensors(
# Compressed B, Metadata, Original A, B # Compressed B, Metadata, Original A, B
return b_compressed, e, a, b return b_compressed, e, a, b
def make_n_rand_sparse_tensors(
num_tensors: int, dtype: torch.dtype, m: int, n: int, k: int
) -> tuple[Iterable[torch.Tensor], Iterable[torch.Tensor]]:
ABs = []
for _ in range(num_tensors):
b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k)
if b_comp is not None:
ABs.append(make_rand_sparse_tensors(dtype, m, n, k))
BComps, Es, As, Bs = zip(*ABs)
return list(BComps), list(Es), list(As), list(Bs)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import time
class RateLimiter:
"""Token bucket rate limiter implementation"""
def __init__(self, rate_limit):
self.rate_limit = rate_limit # Requests per second
self.num_available_tokens = rate_limit # Available tokens
self.last_refill = time.monotonic() # Last token refill time
self.lock = asyncio.Lock() # Synchronization lock
async def acquire(self):
"""Acquire a token from the rate limiter"""
while True:
async with self.lock:
current_time = time.monotonic()
elapsed = current_time - self.last_refill
# Refill num_available_tokens if more than 1 second has passed
if elapsed > 1.0:
self.num_available_tokens = self.rate_limit
self.last_refill = current_time
# Check if num_available_tokens are available
if self.num_available_tokens > 0:
self.num_available_tokens -= 1
return True
# Calculate wait time if no num_available_tokens available
wait_time = 1.0 - elapsed
await asyncio.sleep(wait_time)
async def __aenter__(self):
"""Enter async context manager - acquire token"""
await self.acquire()
return self
async def __aexit__(self, exc_type, exc_value, traceback):
"""Exit async context manager - no cleanup needed"""
pass
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
from collections import deque
class RequestQueue:
"""Request queue manager with concurrency control"""
def __init__(self, max_concurrent, max_queue_size):
# Maximum concurrent requests
self.max_concurrent = max_concurrent
self.max_queue_size = max_queue_size # Maximum queue size
# Concurrency control
self.semaphore = asyncio.Semaphore(max_concurrent)
self.queue = deque() # Request queue
self.queue_size = 0 # Current queue size
self.lock = asyncio.Lock() # Sync queue Lock
async def enqueue(self, task):
"""Add a request task to the queue"""
async with self.lock:
if self.queue_size >= self.max_queue_size:
return False
self.queue.append(task)
self.queue_size += 1
return True
async def process(self):
"""Process queued requests using semaphore for concurrency control"""
while True:
if self.queue:
async with self.semaphore, self.lock:
task = self.queue.popleft()
self.queue_size -= 1
await task
await asyncio.sleep(0.01) # Yield control to event loop
...@@ -7,7 +7,6 @@ import torch ...@@ -7,7 +7,6 @@ import torch
from torch.nn.parameter import Parameter from torch.nn.parameter import Parameter
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.logger import init_logger
from vllm.model_executor.kernels.linear import ( from vllm.model_executor.kernels.linear import (
init_fp8_linear_kernel, init_fp8_linear_kernel,
) )
...@@ -26,10 +25,6 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( ...@@ -26,10 +25,6 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
) )
from vllm.platforms import current_platform from vllm.platforms import current_platform
ACTIVATION_SCHEMES = ["static", "dynamic"]
logger = init_logger(__name__)
class PTPCFp8Config(Fp8Config): class PTPCFp8Config(Fp8Config):
"""Config class for Per-Token-Per-Channel Dynamic Quantization Fp8.""" """Config class for Per-Token-Per-Channel Dynamic Quantization Fp8."""
......
...@@ -255,18 +255,6 @@ def marlin_moe_intermediate_size(w1_packed: torch.Tensor, w2_packed: torch.Tenso ...@@ -255,18 +255,6 @@ def marlin_moe_intermediate_size(w1_packed: torch.Tensor, w2_packed: torch.Tenso
return w2_packed.size(1) * marlin_tile_size return w2_packed.size(1) * marlin_tile_size
def marlin_make_workspace(
output_size_per_partition: int, device: torch.device
) -> torch.Tensor:
max_workspace_size = (
output_size_per_partition // GPTQ_MARLIN_MIN_THREAD_N
) * GPTQ_MARLIN_MAX_PARALLEL
return torch.zeros(
max_workspace_size, dtype=torch.int, device=device, requires_grad=False
)
def marlin_make_workspace_new( def marlin_make_workspace_new(
device: torch.device, max_blocks_per_sm: int = 1 device: torch.device, max_blocks_per_sm: int = 1
) -> torch.Tensor: ) -> torch.Tensor:
...@@ -297,12 +285,6 @@ def marlin_make_empty_g_idx(device: torch.device) -> torch.Tensor: ...@@ -297,12 +285,6 @@ def marlin_make_empty_g_idx(device: torch.device) -> torch.Tensor:
) )
def marlin_make_empty_zp(device: torch.device) -> torch.Tensor:
return torch.nn.Parameter(
torch.empty(0, dtype=torch.int, device=device), requires_grad=False
)
def marlin_sort_g_idx(g_idx: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: def marlin_sort_g_idx(g_idx: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
g_idx_sort_indices = torch.argsort(g_idx).to(torch.int) g_idx_sort_indices = torch.argsort(g_idx).to(torch.int)
return g_idx[g_idx_sort_indices], g_idx_sort_indices return g_idx[g_idx_sort_indices], g_idx_sort_indices
......
...@@ -49,7 +49,6 @@ from .utils import ( ...@@ -49,7 +49,6 @@ from .utils import (
) )
from .vision import get_vision_encoder_info from .vision import get_vision_encoder_info
EOT = "<|endofturn|>"
IMAGE_TOKEN: str = "<|dummy3|>" IMAGE_TOKEN: str = "<|dummy3|>"
VIDEO_TOKEN: str = "<|_unuse_missing_100270|>" VIDEO_TOKEN: str = "<|_unuse_missing_100270|>"
......
...@@ -72,7 +72,6 @@ from vllm.version import __version__ as VLLM_VERSION ...@@ -72,7 +72,6 @@ from vllm.version import __version__ as VLLM_VERSION
logger = init_logger(__name__) logger = init_logger(__name__)
POLLING_TIMEOUT_S = 2.5
HANDSHAKE_TIMEOUT_MINS = 5 HANDSHAKE_TIMEOUT_MINS = 5
_R = TypeVar("_R") # Return type for collective_rpc _R = TypeVar("_R") # Return type for collective_rpc
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment