Commit 7a985548 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.9.0' into v0.9.0-ori

parents 45d3785c dc1440cf
This diff is collapsed.
...@@ -7,9 +7,9 @@ import os ...@@ -7,9 +7,9 @@ import os
from typing import Any from typing import Any
def convert_to_pytorch_benchmark_format(args: argparse.Namespace, def convert_to_pytorch_benchmark_format(
metrics: dict[str, list], args: argparse.Namespace, metrics: dict[str, list], extra_info: dict[str, Any]
extra_info: dict[str, Any]) -> list: ) -> list:
""" """
Save the benchmark results in the format used by PyTorch OSS benchmark with Save the benchmark results in the format used by PyTorch OSS benchmark with
on metric per record on metric per record
...@@ -37,12 +37,12 @@ def convert_to_pytorch_benchmark_format(args: argparse.Namespace, ...@@ -37,12 +37,12 @@ def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
}, },
} }
tp = record["benchmark"]["extra_info"]["args"].get( tp = record["benchmark"]["extra_info"]["args"].get("tensor_parallel_size")
"tensor_parallel_size")
# Save tensor_parallel_size parameter if it's part of the metadata # Save tensor_parallel_size parameter if it's part of the metadata
if not tp and "tensor_parallel_size" in extra_info: if not tp and "tensor_parallel_size" in extra_info:
record["benchmark"]["extra_info"]["args"][ record["benchmark"]["extra_info"]["args"]["tensor_parallel_size"] = (
"tensor_parallel_size"] = extra_info["tensor_parallel_size"] extra_info["tensor_parallel_size"]
)
records.append(record) records.append(record)
...@@ -50,7 +50,6 @@ def convert_to_pytorch_benchmark_format(args: argparse.Namespace, ...@@ -50,7 +50,6 @@ def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
class InfEncoder(json.JSONEncoder): class InfEncoder(json.JSONEncoder):
def clear_inf(self, o: Any): def clear_inf(self, o: Any):
if isinstance(o, dict): if isinstance(o, dict):
return {k: self.clear_inf(v) for k, v in o.items()} return {k: self.clear_inf(v) for k, v in o.items()}
......
...@@ -10,8 +10,9 @@ import vllm._custom_ops as ops ...@@ -10,8 +10,9 @@ import vllm._custom_ops as ops
def to_fp8(tensor: torch.Tensor) -> torch.Tensor: def to_fp8(tensor: torch.Tensor) -> torch.Tensor:
finfo = torch.finfo(torch.float8_e4m3fn) finfo = torch.finfo(torch.float8_e4m3fn)
return torch.round(tensor.clamp( return torch.round(tensor.clamp(min=finfo.min, max=finfo.max)).to(
min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn) dtype=torch.float8_e4m3fn
)
def to_int8(tensor: torch.Tensor) -> torch.Tensor: def to_int8(tensor: torch.Tensor) -> torch.Tensor:
...@@ -26,10 +27,11 @@ def to_fp16(tensor: torch.Tensor) -> torch.Tensor: ...@@ -26,10 +27,11 @@ def to_fp16(tensor: torch.Tensor) -> torch.Tensor:
return tensor.to(dtype=torch.float16) return tensor.to(dtype=torch.float16)
def make_rand_tensors(dtype: torch.dtype, m: int, n: int, def make_rand_tensors(
k: int) -> tuple[torch.Tensor, torch.Tensor]: dtype: torch.dtype, m: int, n: int, k: int
a = torch.randn((m, k), device='cuda') * 5 ) -> tuple[torch.Tensor, torch.Tensor]:
b = torch.randn((n, k), device='cuda').t() * 5 a = torch.randn((m, k), device="cuda") * 5
b = torch.randn((n, k), device="cuda").t() * 5
if dtype == torch.int8: if dtype == torch.int8:
return to_int8(a), to_int8(b) return to_int8(a), to_int8(b)
...@@ -49,9 +51,7 @@ def prune_to_2_4(tensor): ...@@ -49,9 +51,7 @@ def prune_to_2_4(tensor):
# Create binary mask # Create binary mask
mask = torch.zeros_like(reshaped) mask = torch.zeros_like(reshaped)
mask.scatter_(dim=1, mask.scatter_(dim=1, index=indices, src=torch.ones_like(indices, dtype=mask.dtype))
index=indices,
src=torch.ones_like(indices, dtype=mask.dtype))
# Apply mask and reshape back # Apply mask and reshape back
pruned = reshaped * mask pruned = reshaped * mask
...@@ -62,10 +62,11 @@ def prune_to_2_4(tensor): ...@@ -62,10 +62,11 @@ def prune_to_2_4(tensor):
return pruned.reshape(original_shape) return pruned.reshape(original_shape)
def make_rand_sparse_tensors(dtype: torch.dtype, m: int, n: int, def make_rand_sparse_tensors(
k: int) -> tuple[torch.Tensor, torch.Tensor]: dtype: torch.dtype, m: int, n: int, k: int
a = torch.randn((m, k), device='cuda') * 5 ) -> tuple[torch.Tensor, torch.Tensor]:
b = torch.randn((n, k), device='cuda').t() * 5 a = torch.randn((m, k), device="cuda") * 5
b = torch.randn((n, k), device="cuda").t() * 5
b = prune_to_2_4(b.t()).t() b = prune_to_2_4(b.t()).t()
...@@ -86,9 +87,9 @@ def make_rand_sparse_tensors(dtype: torch.dtype, m: int, n: int, ...@@ -86,9 +87,9 @@ def make_rand_sparse_tensors(dtype: torch.dtype, m: int, n: int,
return b_compressed, e, a, b return b_compressed, e, a, b
def make_n_rand_sparse_tensors(num_tensors: int, dtype: torch.dtype, def make_n_rand_sparse_tensors(
m: int, n: int, k: int) -> \ num_tensors: int, dtype: torch.dtype, m: int, n: int, k: int
tuple[Iterable[torch.Tensor], Iterable[torch.Tensor]]: ) -> tuple[Iterable[torch.Tensor], Iterable[torch.Tensor]]:
ABs = [] ABs = []
for _ in range(num_tensors): for _ in range(num_tensors):
b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k) b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k)
......
...@@ -12,39 +12,37 @@ app = Quart(__name__) ...@@ -12,39 +12,37 @@ app = Quart(__name__)
async def forward_request(url, data): async def forward_request(url, data):
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
headers = { headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}" async with session.post(url=url, json=data, headers=headers) as response:
}
async with session.post(url=url, json=data,
headers=headers) as response:
if response.status == 200: if response.status == 200:
# if response.headers.get('Transfer-Encoding') == 'chunked': # if response.headers.get('Transfer-Encoding') == 'chunked':
if True: if True:
async for chunk_bytes in response.content.iter_chunked( async for chunk_bytes in response.content.iter_chunked(1024):
1024):
yield chunk_bytes yield chunk_bytes
else: else:
content = await response.read() content = await response.read()
yield content yield content
@app.route('/v1/completions', methods=['POST']) @app.route("/v1/completions", methods=["POST"])
async def handle_request(): async def handle_request():
try: try:
original_request_data = await request.get_json() original_request_data = await request.get_json()
prefill_request = original_request_data.copy() prefill_request = original_request_data.copy()
# change max_tokens = 1 to let it only do prefill # change max_tokens = 1 to let it only do prefill
prefill_request['max_tokens'] = 1 prefill_request["max_tokens"] = 1
# finish prefill # finish prefill
async for _ in forward_request('http://localhost:8100/v1/completions', async for _ in forward_request(
prefill_request): "http://localhost:8100/v1/completions", prefill_request
):
continue continue
# return decode # return decode
generator = forward_request('http://localhost:8200/v1/completions', generator = forward_request(
original_request_data) "http://localhost:8200/v1/completions", original_request_data
)
response = await make_response(generator) response = await make_response(generator)
response.timeout = None response.timeout = None
...@@ -53,11 +51,12 @@ async def handle_request(): ...@@ -53,11 +51,12 @@ async def handle_request():
except Exception as e: except Exception as e:
import sys import sys
import traceback import traceback
exc_info = sys.exc_info() exc_info = sys.exc_info()
print("Error occurred in disagg prefill proxy server") print("Error occurred in disagg prefill proxy server")
print(e) print(e)
print("".join(traceback.format_exception(*exc_info))) print("".join(traceback.format_exception(*exc_info)))
if __name__ == '__main__': if __name__ == "__main__":
app.run(port=8000) app.run(port=8000)
...@@ -8,7 +8,6 @@ from aiohttp import web ...@@ -8,7 +8,6 @@ from aiohttp import web
class RoundRobinProxy: class RoundRobinProxy:
def __init__(self, target_ports): def __init__(self, target_ports):
self.target_ports = target_ports self.target_ports = target_ports
self.port_cycle = itertools.cycle(self.target_ports) self.port_cycle = itertools.cycle(self.target_ports)
...@@ -27,8 +26,9 @@ class RoundRobinProxy: ...@@ -27,8 +26,9 @@ class RoundRobinProxy:
data=request.content, data=request.content,
) as response: ) as response:
# Start sending the response # Start sending the response
resp = web.StreamResponse(status=response.status, resp = web.StreamResponse(
headers=response.headers) status=response.status, headers=response.headers
)
await resp.prepare(request) await resp.prepare(request)
# Stream the response content # Stream the response content
...@@ -45,11 +45,11 @@ class RoundRobinProxy: ...@@ -45,11 +45,11 @@ class RoundRobinProxy:
async def main(): async def main():
proxy = RoundRobinProxy([8100, 8200]) proxy = RoundRobinProxy([8100, 8200])
app = web.Application() app = web.Application()
app.router.add_route('*', '/{path:.*}', proxy.handle_request) app.router.add_route("*", "/{path:.*}", proxy.handle_request)
runner = web.AppRunner(app) runner = web.AppRunner(app)
await runner.setup() await runner.setup()
site = web.TCPSite(runner, 'localhost', 8000) site = web.TCPSite(runner, "localhost", 8000)
await site.start() await site.start()
print("Proxy server started on http://localhost:8000") print("Proxy server started on http://localhost:8000")
...@@ -58,5 +58,5 @@ async def main(): ...@@ -58,5 +58,5 @@ async def main():
await asyncio.Event().wait() await asyncio.Event().wait()
if __name__ == '__main__': if __name__ == "__main__":
asyncio.run(main()) asyncio.run(main())
...@@ -6,43 +6,41 @@ import matplotlib.pyplot as plt ...@@ -6,43 +6,41 @@ import matplotlib.pyplot as plt
import pandas as pd import pandas as pd
if __name__ == "__main__": if __name__ == "__main__":
data = [] data = []
for name in ['disagg_prefill', 'chunked_prefill']: for name in ["disagg_prefill", "chunked_prefill"]:
for qps in [2, 4, 6, 8]: for qps in [2, 4, 6, 8]:
with open(f"results/{name}-qps-{qps}.json") as f: with open(f"results/{name}-qps-{qps}.json") as f:
x = json.load(f) x = json.load(f)
x['name'] = name x["name"] = name
x['qps'] = qps x["qps"] = qps
data.append(x) data.append(x)
df = pd.DataFrame.from_dict(data) df = pd.DataFrame.from_dict(data)
dis_df = df[df['name'] == 'disagg_prefill'] dis_df = df[df["name"] == "disagg_prefill"]
chu_df = df[df['name'] == 'chunked_prefill'] chu_df = df[df["name"] == "chunked_prefill"]
plt.style.use('bmh') plt.style.use("bmh")
plt.rcParams['font.size'] = 20 plt.rcParams["font.size"] = 20
for key in [ for key in [
'mean_ttft_ms', 'median_ttft_ms', 'p99_ttft_ms', 'mean_itl_ms', "mean_ttft_ms",
'median_itl_ms', 'p99_itl_ms' "median_ttft_ms",
"p99_ttft_ms",
"mean_itl_ms",
"median_itl_ms",
"p99_itl_ms",
]: ]:
fig, ax = plt.subplots(figsize=(11, 7)) fig, ax = plt.subplots(figsize=(11, 7))
plt.plot(dis_df['qps'], plt.plot(
dis_df[key], dis_df["qps"], dis_df[key], label="disagg_prefill", marker="o", linewidth=4
label='disagg_prefill', )
marker='o', plt.plot(
linewidth=4) chu_df["qps"], chu_df[key], label="chunked_prefill", marker="o", linewidth=4
plt.plot(chu_df['qps'], )
chu_df[key],
label='chunked_prefill',
marker='o',
linewidth=4)
ax.legend() ax.legend()
ax.set_xlabel('QPS') ax.set_xlabel("QPS")
ax.set_ylabel(key) ax.set_ylabel(key)
ax.set_ylim(bottom=0) ax.set_ylim(bottom=0)
fig.savefig(f'results/{key}.png') fig.savefig(f"results/{key}.png")
plt.close(fig) plt.close(fig)
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment