Commit 4c676e3d authored by zhuwenwen
Browse files

Merge tag 'v0.9.1' into v0.9.1-dev

parents b4c4464d b6553be1
This diff is collapsed.
This diff is collapsed.
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse import argparse
import json import json
...@@ -7,9 +8,9 @@ import os ...@@ -7,9 +8,9 @@ import os
from typing import Any from typing import Any
def convert_to_pytorch_benchmark_format(args: argparse.Namespace, def convert_to_pytorch_benchmark_format(
metrics: dict[str, list], args: argparse.Namespace, metrics: dict[str, list], extra_info: dict[str, Any]
extra_info: dict[str, Any]) -> list: ) -> list:
""" """
Save the benchmark results in the format used by PyTorch OSS benchmark with Save the benchmark results in the format used by PyTorch OSS benchmark with
on metric per record on metric per record
...@@ -37,12 +38,12 @@ def convert_to_pytorch_benchmark_format(args: argparse.Namespace, ...@@ -37,12 +38,12 @@ def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
}, },
} }
tp = record["benchmark"]["extra_info"]["args"].get( tp = record["benchmark"]["extra_info"]["args"].get("tensor_parallel_size")
"tensor_parallel_size")
# Save tensor_parallel_size parameter if it's part of the metadata # Save tensor_parallel_size parameter if it's part of the metadata
if not tp and "tensor_parallel_size" in extra_info: if not tp and "tensor_parallel_size" in extra_info:
record["benchmark"]["extra_info"]["args"][ record["benchmark"]["extra_info"]["args"]["tensor_parallel_size"] = (
"tensor_parallel_size"] = extra_info["tensor_parallel_size"] extra_info["tensor_parallel_size"]
)
records.append(record) records.append(record)
...@@ -50,7 +51,6 @@ def convert_to_pytorch_benchmark_format(args: argparse.Namespace, ...@@ -50,7 +51,6 @@ def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
class InfEncoder(json.JSONEncoder): class InfEncoder(json.JSONEncoder):
def clear_inf(self, o: Any): def clear_inf(self, o: Any):
if isinstance(o, dict): if isinstance(o, dict):
return {k: self.clear_inf(v) for k, v in o.items()} return {k: self.clear_inf(v) for k, v in o.items()}
...@@ -66,4 +66,9 @@ class InfEncoder(json.JSONEncoder): ...@@ -66,4 +66,9 @@ class InfEncoder(json.JSONEncoder):
def write_to_json(filename: str, records: list) -> None: def write_to_json(filename: str, records: list) -> None:
with open(filename, "w") as f: with open(filename, "w") as f:
json.dump(records, f, cls=InfEncoder) json.dump(
records,
f,
cls=InfEncoder,
default=lambda o: f"<{type(o).__name__} object is not JSON serializable>",
)
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Cutlass bench utils # Cutlass bench utils
from collections.abc import Iterable from collections.abc import Iterable
...@@ -10,8 +11,9 @@ import vllm._custom_ops as ops ...@@ -10,8 +11,9 @@ import vllm._custom_ops as ops
def to_fp8(tensor: torch.Tensor) -> torch.Tensor: def to_fp8(tensor: torch.Tensor) -> torch.Tensor:
finfo = torch.finfo(torch.float8_e4m3fn) finfo = torch.finfo(torch.float8_e4m3fn)
return torch.round(tensor.clamp( return torch.round(tensor.clamp(min=finfo.min, max=finfo.max)).to(
min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn) dtype=torch.float8_e4m3fn
)
def to_int8(tensor: torch.Tensor) -> torch.Tensor: def to_int8(tensor: torch.Tensor) -> torch.Tensor:
...@@ -26,10 +28,11 @@ def to_fp16(tensor: torch.Tensor) -> torch.Tensor: ...@@ -26,10 +28,11 @@ def to_fp16(tensor: torch.Tensor) -> torch.Tensor:
return tensor.to(dtype=torch.float16) return tensor.to(dtype=torch.float16)
def make_rand_tensors(dtype: torch.dtype, m: int, n: int, def make_rand_tensors(
k: int) -> tuple[torch.Tensor, torch.Tensor]: dtype: torch.dtype, m: int, n: int, k: int
a = torch.randn((m, k), device='cuda') * 5 ) -> tuple[torch.Tensor, torch.Tensor]:
b = torch.randn((n, k), device='cuda').t() * 5 a = torch.randn((m, k), device="cuda") * 5
b = torch.randn((n, k), device="cuda").t() * 5
if dtype == torch.int8: if dtype == torch.int8:
return to_int8(a), to_int8(b) return to_int8(a), to_int8(b)
...@@ -49,9 +52,7 @@ def prune_to_2_4(tensor): ...@@ -49,9 +52,7 @@ def prune_to_2_4(tensor):
# Create binary mask # Create binary mask
mask = torch.zeros_like(reshaped) mask = torch.zeros_like(reshaped)
mask.scatter_(dim=1, mask.scatter_(dim=1, index=indices, src=torch.ones_like(indices, dtype=mask.dtype))
index=indices,
src=torch.ones_like(indices, dtype=mask.dtype))
# Apply mask and reshape back # Apply mask and reshape back
pruned = reshaped * mask pruned = reshaped * mask
...@@ -62,10 +63,11 @@ def prune_to_2_4(tensor): ...@@ -62,10 +63,11 @@ def prune_to_2_4(tensor):
return pruned.reshape(original_shape) return pruned.reshape(original_shape)
def make_rand_sparse_tensors(dtype: torch.dtype, m: int, n: int, def make_rand_sparse_tensors(
k: int) -> tuple[torch.Tensor, torch.Tensor]: dtype: torch.dtype, m: int, n: int, k: int
a = torch.randn((m, k), device='cuda') * 5 ) -> tuple[torch.Tensor, torch.Tensor]:
b = torch.randn((n, k), device='cuda').t() * 5 a = torch.randn((m, k), device="cuda") * 5
b = torch.randn((n, k), device="cuda").t() * 5
b = prune_to_2_4(b.t()).t() b = prune_to_2_4(b.t()).t()
...@@ -86,9 +88,9 @@ def make_rand_sparse_tensors(dtype: torch.dtype, m: int, n: int, ...@@ -86,9 +88,9 @@ def make_rand_sparse_tensors(dtype: torch.dtype, m: int, n: int,
return b_compressed, e, a, b return b_compressed, e, a, b
def make_n_rand_sparse_tensors(num_tensors: int, dtype: torch.dtype, def make_n_rand_sparse_tensors(
m: int, n: int, k: int) -> \ num_tensors: int, dtype: torch.dtype, m: int, n: int, k: int
tuple[Iterable[torch.Tensor], Iterable[torch.Tensor]]: ) -> tuple[Iterable[torch.Tensor], Iterable[torch.Tensor]]:
ABs = [] ABs = []
for _ in range(num_tensors): for _ in range(num_tensors):
b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k) b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k)
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment