Unverified commit 823ab796, authored by Harry Mellor, committed by GitHub
Browse files

Update `pre-commit` hooks (#12475)


Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent 6116ca8c
...@@ -3,18 +3,18 @@ default_stages: ...@@ -3,18 +3,18 @@ default_stages:
- manual # Run in CI - manual # Run in CI
repos: repos:
- repo: https://github.com/google/yapf - repo: https://github.com/google/yapf
rev: v0.32.0 rev: v0.43.0
hooks: hooks:
- id: yapf - id: yapf
args: [--in-place, --verbose] args: [--in-place, --verbose]
additional_dependencies: [toml] # TODO: Remove when yapf is upgraded additional_dependencies: [toml] # TODO: Remove when yapf is upgraded
- repo: https://github.com/astral-sh/ruff-pre-commit - repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.6.5 rev: v0.9.3
hooks: hooks:
- id: ruff - id: ruff
args: [--output-format, github] args: [--output-format, github]
- repo: https://github.com/codespell-project/codespell - repo: https://github.com/codespell-project/codespell
rev: v2.3.0 rev: v2.4.0
hooks: hooks:
- id: codespell - id: codespell
exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*' exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*'
...@@ -23,7 +23,7 @@ repos: ...@@ -23,7 +23,7 @@ repos:
hooks: hooks:
- id: isort - id: isort
- repo: https://github.com/pre-commit/mirrors-clang-format - repo: https://github.com/pre-commit/mirrors-clang-format
rev: v18.1.5 rev: v19.1.7
hooks: hooks:
- id: clang-format - id: clang-format
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))' exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))'
...@@ -35,7 +35,7 @@ repos: ...@@ -35,7 +35,7 @@ repos:
- id: pymarkdown - id: pymarkdown
files: docs/.* files: docs/.*
- repo: https://github.com/rhysd/actionlint - repo: https://github.com/rhysd/actionlint
rev: v1.7.6 rev: v1.7.7
hooks: hooks:
- id: actionlint - id: actionlint
- repo: local - repo: local
......
...@@ -926,8 +926,8 @@ def main(args: argparse.Namespace): ...@@ -926,8 +926,8 @@ def main(args: argparse.Namespace):
) )
# Traffic # Traffic
result_json["request_rate"] = ( result_json["request_rate"] = (args.request_rate if args.request_rate
args.request_rate if args.request_rate < float("inf") else "inf") < float("inf") else "inf")
result_json["burstiness"] = args.burstiness result_json["burstiness"] = args.burstiness
result_json["max_concurrency"] = args.max_concurrency result_json["max_concurrency"] = args.max_concurrency
......
...@@ -38,9 +38,13 @@ struct Signal { ...@@ -38,9 +38,13 @@ struct Signal {
alignas(128) FlagType peer_counter[2][kMaxBlocks][8]; alignas(128) FlagType peer_counter[2][kMaxBlocks][8];
}; };
struct __align__(16) RankData { const void* __restrict__ ptrs[8]; }; struct __align__(16) RankData {
const void* __restrict__ ptrs[8];
};
struct __align__(16) RankSignals { Signal* signals[8]; }; struct __align__(16) RankSignals {
Signal* signals[8];
};
// like std::array, but aligned // like std::array, but aligned
template <typename T, int sz> template <typename T, int sz>
......
...@@ -138,8 +138,8 @@ __device__ inline FragB dequant<vllm::kU4B8.id()>(int q) { ...@@ -138,8 +138,8 @@ __device__ inline FragB dequant<vllm::kU4B8.id()>(int q) {
const int HI = 0x00f000f0; const int HI = 0x00f000f0;
const int EX = 0x64006400; const int EX = 0x64006400;
// Guarantee that the `(a & b) | c` operations are LOP3s. // Guarantee that the `(a & b) | c` operations are LOP3s.
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
// We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
// directly into `SUB` and `ADD`. // directly into `SUB` and `ADD`.
const int SUB = 0x64086408; const int SUB = 0x64086408;
...@@ -182,8 +182,8 @@ __device__ inline FragB dequant<vllm::kU4.id()>(int q) { ...@@ -182,8 +182,8 @@ __device__ inline FragB dequant<vllm::kU4.id()>(int q) {
const int HI = 0x00f000f0; const int HI = 0x00f000f0;
const int EX = 0x64006400; const int EX = 0x64006400;
// Guarantee that the `(a & b) | c` operations are LOP3s. // Guarantee that the `(a & b) | c` operations are LOP3s.
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
const int SUB = 0x64006400; const int SUB = 0x64006400;
const int MUL = 0x2c002c00; const int MUL = 0x2c002c00;
......
...@@ -173,8 +173,8 @@ dequant<half, vllm::kU4B8.id()>(int q) { ...@@ -173,8 +173,8 @@ dequant<half, vllm::kU4B8.id()>(int q) {
const int HI = 0x00f000f0; const int HI = 0x00f000f0;
const int EX = 0x64006400; const int EX = 0x64006400;
// Guarantee that the `(a & b) | c` operations are LOP3s. // Guarantee that the `(a & b) | c` operations are LOP3s.
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
// We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
// directly into `SUB` and `ADD`. // directly into `SUB` and `ADD`.
const int SUB = 0x64086408; const int SUB = 0x64086408;
...@@ -197,9 +197,9 @@ dequant<nv_bfloat16, vllm::kU4B8.id()>(int q) { ...@@ -197,9 +197,9 @@ dequant<nv_bfloat16, vllm::kU4B8.id()>(int q) {
// Guarantee that the `(a & b) | c` operations are LOP3s. // Guarantee that the `(a & b) | c` operations are LOP3s.
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
q >>= 4; q >>= 4;
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
typename ScalarType<nv_bfloat16>::FragB frag_b; typename ScalarType<nv_bfloat16>::FragB frag_b;
static constexpr uint32_t MUL = 0x3F803F80; static constexpr uint32_t MUL = 0x3F803F80;
...@@ -221,8 +221,8 @@ dequant<half, vllm::kU4.id()>(int q) { ...@@ -221,8 +221,8 @@ dequant<half, vllm::kU4.id()>(int q) {
const int HI = 0x00f000f0; const int HI = 0x00f000f0;
const int EX = 0x64006400; const int EX = 0x64006400;
// Guarantee that the `(a & b) | c` operations are LOP3s. // Guarantee that the `(a & b) | c` operations are LOP3s.
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
const int SUB = 0x64006400; const int SUB = 0x64006400;
const int MUL = 0x2c002c00; const int MUL = 0x2c002c00;
...@@ -244,9 +244,9 @@ dequant<nv_bfloat16, vllm::kU4.id()>(int q) { ...@@ -244,9 +244,9 @@ dequant<nv_bfloat16, vllm::kU4.id()>(int q) {
// Guarantee that the `(a & b) | c` operations are LOP3s. // Guarantee that the `(a & b) | c` operations are LOP3s.
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
q >>= 4; q >>= 4;
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
typename ScalarType<nv_bfloat16>::FragB frag_b; typename ScalarType<nv_bfloat16>::FragB frag_b;
static constexpr uint32_t MUL = 0x3F803F80; static constexpr uint32_t MUL = 0x3F803F80;
......
...@@ -96,8 +96,8 @@ __device__ inline FragB dequant(int q) { ...@@ -96,8 +96,8 @@ __device__ inline FragB dequant(int q) {
const int HI = 0x00f000f0; const int HI = 0x00f000f0;
const int EX = 0x64006400; const int EX = 0x64006400;
// Guarantee that the `(a & b) | c` operations are LOP3s. // Guarantee that the `(a & b) | c` operations are LOP3s.
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
// We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
// directly into `SUB` and `ADD`. // directly into `SUB` and `ADD`.
const int SUB = 0x64086408; const int SUB = 0x64086408;
......
...@@ -141,8 +141,8 @@ __device__ inline FragB dequant_per_group(int q, FragS_GROUP& frag_s, int i) { ...@@ -141,8 +141,8 @@ __device__ inline FragB dequant_per_group(int q, FragS_GROUP& frag_s, int i) {
static constexpr uint32_t HI = 0x00f000f0; static constexpr uint32_t HI = 0x00f000f0;
static constexpr uint32_t EX = 0x64006400; static constexpr uint32_t EX = 0x64006400;
// Guarantee that the `(a & b) | c` operations are LOP3s. // Guarantee that the `(a & b) | c` operations are LOP3s.
uint32_t t0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); uint32_t t0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
uint32_t t1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); uint32_t t1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
// We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
// directly into `SUB` and `ADD`. // directly into `SUB` and `ADD`.
static constexpr uint32_t SUB = 0x64086408; static constexpr uint32_t SUB = 0x64086408;
......
...@@ -127,8 +127,8 @@ __device__ inline FragB dequant_4bit(int q) { ...@@ -127,8 +127,8 @@ __device__ inline FragB dequant_4bit(int q) {
const int HI = 0x00f000f0; const int HI = 0x00f000f0;
const int EX = 0x64006400; const int EX = 0x64006400;
// Guarantee that the `(a & b) | c` operations are LOP3s. // Guarantee that the `(a & b) | c` operations are LOP3s.
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
// We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
// directly into `SUB` and `ADD`. // directly into `SUB` and `ADD`.
const int SUB = 0x64086408; const int SUB = 0x64086408;
......
...@@ -907,7 +907,9 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( ...@@ -907,7 +907,9 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads,
// max_num_partitions, head_size] // max_num_partitions, head_size]
const int* __restrict__ context_lens, // [num_seqs] const int* __restrict__ context_lens, // [num_seqs]
const int max_num_partitions){UNREACHABLE_CODE} const int max_num_partitions) {
UNREACHABLE_CODE
}
#endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support #endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support
......
...@@ -417,7 +417,7 @@ def get_rocm_version(): ...@@ -417,7 +417,7 @@ def get_rocm_version():
if (get_rocm_core_version(ctypes.byref(major), ctypes.byref(minor), if (get_rocm_core_version(ctypes.byref(major), ctypes.byref(minor),
ctypes.byref(patch)) == 0): ctypes.byref(patch)) == 0):
return "%d.%d.%d" % (major.value, minor.value, patch.value) return f"{major.value}.{minor.value}.{patch.value}"
return None return None
except Exception: except Exception:
return None return None
......
...@@ -92,8 +92,10 @@ def native_w8a8_block_fp8_matmul(A, ...@@ -92,8 +92,10 @@ def native_w8a8_block_fp8_matmul(A,
A[:, i * block_k:min((i + 1) * block_k, K)] for i in range(k_tiles) A[:, i * block_k:min((i + 1) * block_k, K)] for i in range(k_tiles)
] ]
B_tiles = [[ B_tiles = [[
B[j * block_n:min((j + 1) * block_n, N), B[
i * block_k:min((i + 1) * block_k, K), ] for i in range(k_tiles) j * block_n:min((j + 1) * block_n, N),
i * block_k:min((i + 1) * block_k, K),
] for i in range(k_tiles)
] for j in range(n_tiles)] ] for j in range(n_tiles)]
C_tiles = [ C_tiles = [
C[:, j * block_n:min((j + 1) * block_n, N)] for j in range(n_tiles) C[:, j * block_n:min((j + 1) * block_n, N)] for j in range(n_tiles)
...@@ -157,9 +159,9 @@ def setup_cuda(): ...@@ -157,9 +159,9 @@ def setup_cuda():
torch.set_default_device("cuda") torch.set_default_device("cuda")
@pytest.mark.parametrize("num_tokens,d,dtype,group_size,seed", @pytest.mark.parametrize(
itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE, "num_tokens,d,dtype,group_size,seed",
SEEDS)) itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE, SEEDS))
@torch.inference_mode() @torch.inference_mode()
def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed): def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed):
torch.manual_seed(seed) torch.manual_seed(seed)
...@@ -174,9 +176,9 @@ def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed): ...@@ -174,9 +176,9 @@ def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed):
assert torch.allclose(scale, ref_scale) assert torch.allclose(scale, ref_scale)
@pytest.mark.parametrize("M,N,K,block_size,out_dtype,seed", @pytest.mark.parametrize(
itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, "M,N,K,block_size,out_dtype,seed",
SEEDS)) itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS))
@torch.inference_mode() @torch.inference_mode()
def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed): def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
torch.manual_seed(seed) torch.manual_seed(seed)
...@@ -207,9 +209,10 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed): ...@@ -207,9 +209,10 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
assert rel_diff < 0.001 assert rel_diff < 0.001
@pytest.mark.parametrize("M,N,K,E,topk,block_size,dtype,seed", @pytest.mark.parametrize(
itertools.product(M_moe, N_moe, K_moe, E, TOP_KS, "M,N,K,E,topk,block_size,dtype,seed",
BLOCK_SIZE, DTYPES, SEEDS)) itertools.product(M_moe, N_moe, K_moe, E, TOP_KS, BLOCK_SIZE, DTYPES,
SEEDS))
@torch.inference_mode() @torch.inference_mode()
def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed): def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed):
torch.manual_seed(seed) torch.manual_seed(seed)
......
...@@ -20,7 +20,7 @@ def test_run(my_rank, buffer, device): ...@@ -20,7 +20,7 @@ def test_run(my_rank, buffer, device):
assert buffer.buffer_size == 0 assert buffer.buffer_size == 0
assert len(buffer.buffer) == 0 assert len(buffer.buffer) == 0
print("My rank: %d, device: %s" % (my_rank, device)) print(f"My rank: {my_rank}, device: {device}")
# insert # insert
tokens = torch.tensor([1, 2, 3]).to(device) tokens = torch.tensor([1, 2, 3]).to(device)
...@@ -48,7 +48,7 @@ def test_run(my_rank, buffer, device): ...@@ -48,7 +48,7 @@ def test_run(my_rank, buffer, device):
assert buffer.buffer_size == 0 assert buffer.buffer_size == 0
assert len(buffer.buffer) == 0 assert len(buffer.buffer) == 0
print("My rank: %d, Test run passed!" % (my_rank)) print(f"My rank: {my_rank}, Test run passed!")
def stress_test(my_rank, buf, device): def stress_test(my_rank, buf, device):
...@@ -94,7 +94,7 @@ def stress_test(my_rank, buf, device): ...@@ -94,7 +94,7 @@ def stress_test(my_rank, buf, device):
assert torch.allclose(k, k_) assert torch.allclose(k, k_)
assert torch.allclose(v, v_) assert torch.allclose(v, v_)
assert torch.allclose(h, h_) assert torch.allclose(h, h_)
print('Rank %d done' % my_rank) print(f"Rank {my_rank} done")
torch.distributed.barrier() torch.distributed.barrier()
if my_rank == 0: if my_rank == 0:
...@@ -108,7 +108,7 @@ def stress_test(my_rank, buf, device): ...@@ -108,7 +108,7 @@ def stress_test(my_rank, buf, device):
else: else:
torch.distributed.send(torch.tensor([n]), 0) torch.distributed.send(torch.tensor([n]), 0)
print("My rank: %d, Passed stress test!" % (my_rank)) print(f"My rank: {my_rank}, Passed stress test!")
if __name__ == "__main__": if __name__ == "__main__":
...@@ -122,7 +122,7 @@ if __name__ == "__main__": ...@@ -122,7 +122,7 @@ if __name__ == "__main__":
rank=my_rank, rank=my_rank,
) )
print("initialized! My rank is %d" % my_rank) print(f"initialized! My rank is {my_rank}")
config = KVTransferConfig( config = KVTransferConfig(
kv_connector='PyNcclConnector', kv_connector='PyNcclConnector',
......
...@@ -55,9 +55,9 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: ...@@ -55,9 +55,9 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
return generated_texts return generated_texts
@pytest.mark.xfail(current_platform.is_rocm(), @pytest.mark.xfail(
reason="Qwen2-VL dependency xformers incompatible with ROCm" current_platform.is_rocm(),
) reason="Qwen2-VL dependency xformers incompatible with ROCm")
def test_qwen2vl_lora(qwen2vl_lora_files): def test_qwen2vl_lora(qwen2vl_lora_files):
llm = vllm.LLM( llm = vllm.LLM(
MODEL_PATH, MODEL_PATH,
......
...@@ -521,7 +521,8 @@ VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2) ...@@ -521,7 +521,8 @@ VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2)
# - image embeddings # - image embeddings
# - video # - video
# - custom inputs # - custom inputs
@pytest.mark.parametrize("model_type,test_case", @pytest.mark.parametrize(
"model_type,test_case",
get_parametrized_options( get_parametrized_options(
VLM_TEST_SETTINGS, VLM_TEST_SETTINGS,
test_type=VLMTestType.IMAGE, test_type=VLMTestType.IMAGE,
...@@ -543,7 +544,8 @@ def test_single_image_models(tmp_path: PosixPath, model_type: str, ...@@ -543,7 +544,8 @@ def test_single_image_models(tmp_path: PosixPath, model_type: str,
) )
@pytest.mark.parametrize("model_type,test_case", @pytest.mark.parametrize(
"model_type,test_case",
get_parametrized_options( get_parametrized_options(
VLM_TEST_SETTINGS, VLM_TEST_SETTINGS,
test_type=VLMTestType.MULTI_IMAGE, test_type=VLMTestType.MULTI_IMAGE,
...@@ -565,7 +567,8 @@ def test_multi_image_models(tmp_path: PosixPath, model_type: str, ...@@ -565,7 +567,8 @@ def test_multi_image_models(tmp_path: PosixPath, model_type: str,
) )
@pytest.mark.parametrize("model_type,test_case", @pytest.mark.parametrize(
"model_type,test_case",
get_parametrized_options( get_parametrized_options(
VLM_TEST_SETTINGS, VLM_TEST_SETTINGS,
test_type=VLMTestType.EMBEDDING, test_type=VLMTestType.EMBEDDING,
...@@ -586,7 +589,8 @@ def test_image_embedding_models(model_type: str, ...@@ -586,7 +589,8 @@ def test_image_embedding_models(model_type: str,
) )
@pytest.mark.parametrize("model_type,test_case", @pytest.mark.parametrize(
"model_type,test_case",
get_parametrized_options( get_parametrized_options(
VLM_TEST_SETTINGS, VLM_TEST_SETTINGS,
test_type=VLMTestType.VIDEO, test_type=VLMTestType.VIDEO,
...@@ -605,7 +609,8 @@ def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs, ...@@ -605,7 +609,8 @@ def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
) )
@pytest.mark.parametrize("model_type,test_case", @pytest.mark.parametrize(
"model_type,test_case",
get_parametrized_options( get_parametrized_options(
VLM_TEST_SETTINGS, VLM_TEST_SETTINGS,
test_type=VLMTestType.CUSTOM_INPUTS, test_type=VLMTestType.CUSTOM_INPUTS,
...@@ -627,7 +632,8 @@ def test_custom_inputs_models( ...@@ -627,7 +632,8 @@ def test_custom_inputs_models(
#### Tests filtering for things running each test as a new process #### Tests filtering for things running each test as a new process
@pytest.mark.parametrize("model_type,test_case", @pytest.mark.parametrize(
"model_type,test_case",
get_parametrized_options( get_parametrized_options(
VLM_TEST_SETTINGS, VLM_TEST_SETTINGS,
test_type=VLMTestType.IMAGE, test_type=VLMTestType.IMAGE,
...@@ -650,7 +656,8 @@ def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str, ...@@ -650,7 +656,8 @@ def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
) )
@pytest.mark.parametrize("model_type,test_case", @pytest.mark.parametrize(
"model_type,test_case",
get_parametrized_options( get_parametrized_options(
VLM_TEST_SETTINGS, VLM_TEST_SETTINGS,
test_type=VLMTestType.MULTI_IMAGE, test_type=VLMTestType.MULTI_IMAGE,
...@@ -673,7 +680,8 @@ def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str, ...@@ -673,7 +680,8 @@ def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
) )
@pytest.mark.parametrize("model_type,test_case", @pytest.mark.parametrize(
"model_type,test_case",
get_parametrized_options( get_parametrized_options(
VLM_TEST_SETTINGS, VLM_TEST_SETTINGS,
test_type=VLMTestType.EMBEDDING, test_type=VLMTestType.EMBEDDING,
...@@ -695,7 +703,8 @@ def test_image_embedding_models_heavy(model_type: str, ...@@ -695,7 +703,8 @@ def test_image_embedding_models_heavy(model_type: str,
) )
@pytest.mark.parametrize("model_type,test_case", @pytest.mark.parametrize(
"model_type,test_case",
get_parametrized_options( get_parametrized_options(
VLM_TEST_SETTINGS, VLM_TEST_SETTINGS,
test_type=VLMTestType.VIDEO, test_type=VLMTestType.VIDEO,
...@@ -715,7 +724,8 @@ def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, ...@@ -715,7 +724,8 @@ def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
) )
@pytest.mark.parametrize("model_type,test_case", @pytest.mark.parametrize(
"model_type,test_case",
get_parametrized_options( get_parametrized_options(
VLM_TEST_SETTINGS, VLM_TEST_SETTINGS,
test_type=VLMTestType.CUSTOM_INPUTS, test_type=VLMTestType.CUSTOM_INPUTS,
......
...@@ -135,10 +135,10 @@ def _dump_outputs_w_logprobs( ...@@ -135,10 +135,10 @@ def _dump_outputs_w_logprobs(
outputs: OutputsLogprobs, outputs: OutputsLogprobs,
filename: "StrPath", filename: "StrPath",
) -> None: ) -> None:
json_data = [(tokens, text, json_data = [(tokens, text, [{
[{k: asdict(v) k: asdict(v)
for k, v in token_logprobs.items()} for k, v in token_logprobs.items()
for token_logprobs in (logprobs or [])]) } for token_logprobs in (logprobs or [])])
for tokens, text, logprobs in outputs] for tokens, text, logprobs in outputs]
with open(filename, "w") as f: with open(filename, "w") as f:
...@@ -149,11 +149,10 @@ def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs: ...@@ -149,11 +149,10 @@ def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
with open(filename, "rb") as f: with open(filename, "rb") as f:
json_data = json.load(f) json_data = json.load(f)
return [(tokens, text, return [(tokens, text, [{
[{int(k): Logprob(**v) int(k): Logprob(**v)
for k, v in token_logprobs.items()} for k, v in token_logprobs.items()
for token_logprobs in logprobs]) } for token_logprobs in logprobs]) for tokens, text, logprobs in json_data]
for tokens, text, logprobs in json_data]
@large_gpu_test(min_gb=80) @large_gpu_test(min_gb=80)
......
...@@ -314,9 +314,9 @@ def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4): ...@@ -314,9 +314,9 @@ def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4):
@pytest.mark.skip(reason="2of4 sparse w16a16 CUTLASS produces bad output.") @pytest.mark.skip(reason="2of4 sparse w16a16 CUTLASS produces bad output.")
@pytest.mark.skipif(not sparse_cutlass_supported(), @pytest.mark.skipif(
reason="2of4 Sparse is not yet supported on this GPU type." not sparse_cutlass_supported(),
) reason="2of4 Sparse is not yet supported on this GPU type.")
@pytest.mark.parametrize( @pytest.mark.parametrize(
"args_2of4", "args_2of4",
[("nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor")]) [("nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor")])
......
...@@ -23,16 +23,17 @@ def mock_causal_accepted_tensor( ...@@ -23,16 +23,17 @@ def mock_causal_accepted_tensor(
""" """
batch_size = last_accepted_indices.shape[0] batch_size = last_accepted_indices.shape[0]
accepted = (torch.arange(k).expand(batch_size, k) <= accepted = (torch.arange(k).expand(batch_size, k)
last_accepted_indices.unsqueeze(-1).broadcast_to( <= last_accepted_indices.unsqueeze(-1).broadcast_to(
batch_size, k)) batch_size, k))
# Sprinkle accepted values after the contiguous initial accepted values. # Sprinkle accepted values after the contiguous initial accepted values.
# This replicates the behavior of rejection sampling, which may "accept" # This replicates the behavior of rejection sampling, which may "accept"
# a token that cannot be accepted because of causality. # a token that cannot be accepted because of causality.
sprinkle_candidates = ( sprinkle_candidates = (torch.arange(k).expand(
torch.arange(k).expand(batch_size, k) > batch_size,
last_accepted_indices.unsqueeze(-1).broadcast_to(batch_size, k) + 1) k) > last_accepted_indices.unsqueeze(-1).broadcast_to(batch_size, k) +
1)
sprinkle = torch.rand(batch_size, k) > 0.5 sprinkle = torch.rand(batch_size, k) > 0.5
accepted[sprinkle_candidates] = sprinkle[sprinkle_candidates] accepted[sprinkle_candidates] = sprinkle[sprinkle_candidates]
return accepted return accepted
...@@ -445,8 +446,8 @@ def test_rejection_sampling_approximates_target_distribution( ...@@ -445,8 +446,8 @@ def test_rejection_sampling_approximates_target_distribution(
distance_wrt_reference) distance_wrt_reference)
expected_improvement_multiplier = 20 expected_improvement_multiplier = 20
assert (relative_change_in_distance_wrt_target > assert (relative_change_in_distance_wrt_target
relative_change_in_distance_wrt_reference * > relative_change_in_distance_wrt_reference *
expected_improvement_multiplier) expected_improvement_multiplier)
......
...@@ -274,8 +274,9 @@ def SummarizeEntries(entries, extra_step_types): ...@@ -274,8 +274,9 @@ def SummarizeEntries(entries, extra_step_types):
print(' {:.1f} s weighted time ({:.1f} s elapsed time sum, {:1.1f}x ' print(' {:.1f} s weighted time ({:.1f} s elapsed time sum, {:1.1f}x '
'parallelism)'.format(length, total_cpu_time, 'parallelism)'.format(length, total_cpu_time,
total_cpu_time * 1.0 / length)) total_cpu_time * 1.0 / length))
print(' %d build steps completed, average of %1.2f/s' % print(' {} build steps completed, average of {:1.2f}/s'.format(
(len(entries), len(entries) / (length))) len(entries),
len(entries) / (length)))
def main(): def main():
......
...@@ -820,8 +820,8 @@ def scaled_int8_quant( ...@@ -820,8 +820,8 @@ def scaled_int8_quant(
if scale is not None: if scale is not None:
# static-per-tensor quantization. # static-per-tensor quantization.
assert symmetric == ( assert symmetric == (
azp is azp
None), "azp must only be provided for asymmetric quantization." is None), "azp must only be provided for asymmetric quantization."
torch.ops._C.static_scaled_int8_quant(output, input, scale, azp) torch.ops._C.static_scaled_int8_quant(output, input, scale, azp)
return output, scale, azp return output, scale, azp
......
...@@ -219,8 +219,8 @@ if triton.__version__ >= "2.1.0": ...@@ -219,8 +219,8 @@ if triton.__version__ >= "2.1.0":
float("-inf")) float("-inf"))
if SLIDING_WINDOW > 0: if SLIDING_WINDOW > 0:
qk = tl.where( qk = tl.where(
offs_m[:, None] - offs_m[:, None] - (start_n + offs_n[None, :])
(start_n + offs_n[None, :]) < SLIDING_WINDOW, qk, -10000) < SLIDING_WINDOW, qk, -10000)
# -- compute m_ij, p, l_ij # -- compute m_ij, p, l_ij
m_ij = tl.max(qk, 1) m_ij = tl.max(qk, 1)
...@@ -324,9 +324,9 @@ if triton.__version__ >= "2.1.0": ...@@ -324,9 +324,9 @@ if triton.__version__ >= "2.1.0":
(cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs + (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs +
cur_head * stride_qh + offs_d[None, :] * stride_qd) cur_head * stride_qh + offs_d[None, :] * stride_qd)
q = tl.load( q = tl.load(Q + off_q,
Q + off_q, mask=offs_m[:, None]
mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len, < cur_batch_seq_len - cur_batch_ctx_len,
other=0.0) other=0.0)
# # initialize pointer to m and l # # initialize pointer to m and l
...@@ -402,8 +402,8 @@ if triton.__version__ >= "2.1.0": ...@@ -402,8 +402,8 @@ if triton.__version__ >= "2.1.0":
# -- compute qk ---- # -- compute qk ----
k = tl.load(k_ptrs + k = tl.load(k_ptrs +
(cur_batch_in_all_start_index + start_n) * stride_kbs, (cur_batch_in_all_start_index + start_n) * stride_kbs,
mask=(start_n + offs_n[None, :]) < mask=(start_n + offs_n[None, :])
cur_batch_seq_len - cur_batch_ctx_len, < cur_batch_seq_len - cur_batch_ctx_len,
other=0.0) other=0.0)
qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
...@@ -430,8 +430,8 @@ if triton.__version__ >= "2.1.0": ...@@ -430,8 +430,8 @@ if triton.__version__ >= "2.1.0":
# update acc # update acc
v = tl.load(v_ptrs + v = tl.load(v_ptrs +
(cur_batch_in_all_start_index + start_n) * stride_vbs, (cur_batch_in_all_start_index + start_n) * stride_vbs,
mask=(start_n + offs_n[:, None]) < mask=(start_n + offs_n[:, None])
cur_batch_seq_len - cur_batch_ctx_len, < cur_batch_seq_len - cur_batch_ctx_len,
other=0.0) other=0.0)
p = p.to(v.dtype) p = p.to(v.dtype)
...@@ -639,8 +639,8 @@ if triton.__version__ >= "2.1.0": ...@@ -639,8 +639,8 @@ if triton.__version__ >= "2.1.0":
k = tl.load(k_ptrs + k = tl.load(k_ptrs +
(cur_batch_in_all_start_index + start_n) * stride_kbs, (cur_batch_in_all_start_index + start_n) * stride_kbs,
mask=dim_mask[:, None] & mask=dim_mask[:, None] &
((start_n + offs_n[None, :]) < ((start_n + offs_n[None, :])
cur_batch_seq_len - cur_batch_ctx_len), < cur_batch_seq_len - cur_batch_ctx_len),
other=0.0) other=0.0)
qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
...@@ -677,8 +677,8 @@ if triton.__version__ >= "2.1.0": ...@@ -677,8 +677,8 @@ if triton.__version__ >= "2.1.0":
v = tl.load(v_ptrs + v = tl.load(v_ptrs +
(cur_batch_in_all_start_index + start_n) * stride_vbs, (cur_batch_in_all_start_index + start_n) * stride_vbs,
mask=dim_mask[None, :] & mask=dim_mask[None, :] &
((start_n + offs_n[:, None]) < ((start_n + offs_n[:, None])
cur_batch_seq_len - cur_batch_ctx_len), < cur_batch_seq_len - cur_batch_ctx_len),
other=0.0) other=0.0)
p = p.to(v.dtype) p = p.to(v.dtype)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment