Update `pre-commit` hooks (#12475)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>

Update `pre-commit` hooks (#12475)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
823ab796 · Harry Mellor · GitHub · 6116ca8c · 823ab796 · 823ab796
Commit 823ab796 (signature unverified), authored Jan 28, 2025 by Harry Mellor; committed by GitHub on Jan 27, 2025.
20 changed files
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -3,18 +3,18 @@ default_stages:
  - manual # Run in CI
 repos:
 - repo: https://github.com/google/yapf
-  rev: v0.32.0
+  rev: v0.43.0
  hooks:
  - id: yapf
    args: [--in-place, --verbose]
    additional_dependencies: [toml] # TODO: Remove when yapf is upgraded
 - repo: https://github.com/astral-sh/ruff-pre-commit
-  rev: v0.6.5
+  rev: v0.9.3
  hooks:
  - id: ruff
    args: [--output-format, github]
 - repo: https://github.com/codespell-project/codespell
-  rev: v2.3.0
+  rev: v2.4.0
  hooks:
  - id: codespell
    exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*'
@@ -23,7 +23,7 @@ repos:
  hooks:
  - id: isort
 - repo: https://github.com/pre-commit/mirrors-clang-format
-  rev: v18.1.5
+  rev: v19.1.7
  hooks:
  - id: clang-format
    exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))'
@@ -35,7 +35,7 @@ repos:
  - id: pymarkdown
    files: docs/.*
 - repo: https://github.com/rhysd/actionlint
-  rev: v1.7.6
+  rev: v1.7.7
  hooks:
  - id: actionlint
 - repo: local

--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -926,8 +926,8 @@ def main(args: argparse.Namespace):
                    )
        # Traffic
-        result_json["request_rate"] = (
+        result_json["request_rate"] = (args.request_rate if args.request_rate
-            args.request_rate if args.request_rate < float("inf") else "inf")
+                                       < float("inf") else "inf")
        result_json["burstiness"] = args.burstiness
        result_json["max_concurrency"] = args.max_concurrency

--- a/csrc/custom_all_reduce.cuh
+++ b/csrc/custom_all_reduce.cuh
@@ -38,9 +38,13 @@ struct Signal {
  alignas(128) FlagType peer_counter[2][kMaxBlocks][8];
 };
-struct __align__(16) RankData { const void* __restrict__ ptrs[8]; };
+struct __align__(16) RankData {
+  const void* __restrict__ ptrs[8];
+};
-struct __align__(16) RankSignals { Signal* signals[8]; };
+struct __align__(16) RankSignals {
+  Signal* signals[8];
+};
 // like std::array, but aligned
 template <typename T, int sz>

--- a/csrc/moe/marlin_kernels/marlin_moe_kernel.h
+++ b/csrc/moe/marlin_kernels/marlin_moe_kernel.h
@@ -138,8 +138,8 @@ __device__ inline FragB dequant<vllm::kU4B8.id()>(int q) {
  const int HI = 0x00f000f0;
  const int EX = 0x64006400;
  // Guarantee that the `(a & b) | c` operations are LOP3s.
-  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
+  int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
-  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+  int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
  // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
  // directly into `SUB` and `ADD`.
  const int SUB = 0x64086408;
@@ -182,8 +182,8 @@ __device__ inline FragB dequant<vllm::kU4.id()>(int q) {
  const int HI = 0x00f000f0;
  const int EX = 0x64006400;
  // Guarantee that the `(a & b) | c` operations are LOP3s.
-  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
+  int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
-  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+  int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
  const int SUB = 0x64006400;
  const int MUL = 0x2c002c00;

--- a/csrc/quantization/gptq_marlin/gptq_marlin.cu
+++ b/csrc/quantization/gptq_marlin/gptq_marlin.cu
@@ -173,8 +173,8 @@ dequant<half, vllm::kU4B8.id()>(int q) {
  const int HI = 0x00f000f0;
  const int EX = 0x64006400;
  // Guarantee that the `(a & b) | c` operations are LOP3s.
-  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
+  int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
-  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+  int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
  // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
  // directly into `SUB` and `ADD`.
  const int SUB = 0x64086408;
@@ -197,9 +197,9 @@ dequant<nv_bfloat16, vllm::kU4B8.id()>(int q) {
  // Guarantee that the `(a & b) | c` operations are LOP3s.
-  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
+  int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
  q >>= 4;
-  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
+  int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
  typename ScalarType<nv_bfloat16>::FragB frag_b;
  static constexpr uint32_t MUL = 0x3F803F80;
@@ -221,8 +221,8 @@ dequant<half, vllm::kU4.id()>(int q) {
  const int HI = 0x00f000f0;
  const int EX = 0x64006400;
  // Guarantee that the `(a & b) | c` operations are LOP3s.
-  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
+  int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
-  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+  int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
  const int SUB = 0x64006400;
  const int MUL = 0x2c002c00;
@@ -244,9 +244,9 @@ dequant<nv_bfloat16, vllm::kU4.id()>(int q) {
  // Guarantee that the `(a & b) | c` operations are LOP3s.
-  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
+  int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
  q >>= 4;
-  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
+  int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
  typename ScalarType<nv_bfloat16>::FragB frag_b;
  static constexpr uint32_t MUL = 0x3F803F80;

--- a/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu
+++ b/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu
@@ -96,8 +96,8 @@ __device__ inline FragB dequant(int q) {
  const int HI = 0x00f000f0;
  const int EX = 0x64006400;
  // Guarantee that the `(a & b) | c` operations are LOP3s.
-  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
+  int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
-  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+  int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
  // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
  // directly into `SUB` and `ADD`.
  const int SUB = 0x64086408;

--- a/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu
+++ b/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu
@@ -141,8 +141,8 @@ __device__ inline FragB dequant_per_group(int q, FragS_GROUP& frag_s, int i) {
  static constexpr uint32_t HI = 0x00f000f0;
  static constexpr uint32_t EX = 0x64006400;
  // Guarantee that the `(a & b) | c` operations are LOP3s.
-  uint32_t t0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
+  uint32_t t0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
-  uint32_t t1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+  uint32_t t1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
  // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
  // directly into `SUB` and `ADD`.
  static constexpr uint32_t SUB = 0x64086408;

--- a/csrc/quantization/marlin/sparse/common/mma.h
+++ b/csrc/quantization/marlin/sparse/common/mma.h
@@ -127,8 +127,8 @@ __device__ inline FragB dequant_4bit(int q) {
  const int HI = 0x00f000f0;
  const int EX = 0x64006400;
  // Guarantee that the `(a & b) | c` operations are LOP3s.
-  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
+  int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
-  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+  int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
  // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
  // directly into `SUB` and `ADD`.
  const int SUB = 0x64086408;

--- a/csrc/rocm/attention.cu
+++ b/csrc/rocm/attention.cu
@@ -907,7 +907,9 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
    const scalar_t* __restrict__ tmp_out,  // [num_seqs, num_heads,
                                           // max_num_partitions, head_size]
    const int* __restrict__ context_lens,  // [num_seqs]
-    const int max_num_partitions){UNREACHABLE_CODE}
+    const int max_num_partitions) {
+  UNREACHABLE_CODE
+}
 #endif  // defined(__HIP__MI300_MI250__) TODO: Add NAVI support

--- a/setup.py
+++ b/setup.py
@@ -417,7 +417,7 @@ def get_rocm_version():
        if (get_rocm_core_version(ctypes.byref(major), ctypes.byref(minor),
                                  ctypes.byref(patch)) == 0):
-            return "%d.%d.%d" % (major.value, minor.value, patch.value)
+            return f"{major.value}.{minor.value}.{patch.value}"
        return None
    except Exception:
        return None

--- a/tests/kernels/test_block_fp8.py
+++ b/tests/kernels/test_block_fp8.py
@@ -92,8 +92,10 @@ def native_w8a8_block_fp8_matmul(A,
        A[:, i * block_k:min((i + 1) * block_k, K)] for i in range(k_tiles)
    ]
    B_tiles = [[
-        B[j * block_n:min((j + 1) * block_n, N),
+        B[
-          i * block_k:min((i + 1) * block_k, K), ] for i in range(k_tiles)
+            j * block_n:min((j + 1) * block_n, N),
+            i * block_k:min((i + 1) * block_k, K),
+        ] for i in range(k_tiles)
    ] for j in range(n_tiles)]
    C_tiles = [
        C[:, j * block_n:min((j + 1) * block_n, N)] for j in range(n_tiles)
@@ -157,9 +159,9 @@ def setup_cuda():
    torch.set_default_device("cuda")
-@pytest.mark.parametrize("num_tokens,d,dtype,group_size,seed",
+@pytest.mark.parametrize(
-                         itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE,
+    "num_tokens,d,dtype,group_size,seed",
-                                           SEEDS))
+    itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE, SEEDS))
 @torch.inference_mode()
 def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed):
    torch.manual_seed(seed)
@@ -174,9 +176,9 @@ def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed):
    assert torch.allclose(scale, ref_scale)
-@pytest.mark.parametrize("M,N,K,block_size,out_dtype,seed",
+@pytest.mark.parametrize(
-                         itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES,
+    "M,N,K,block_size,out_dtype,seed",
-                                           SEEDS))
+    itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS))
 @torch.inference_mode()
 def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
    torch.manual_seed(seed)
@@ -207,9 +209,10 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
    assert rel_diff < 0.001
-@pytest.mark.parametrize("M,N,K,E,topk,block_size,dtype,seed",
+@pytest.mark.parametrize(
-                         itertools.product(M_moe, N_moe, K_moe, E, TOP_KS,
+    "M,N,K,E,topk,block_size,dtype,seed",
-                                           BLOCK_SIZE, DTYPES, SEEDS))
+    itertools.product(M_moe, N_moe, K_moe, E, TOP_KS, BLOCK_SIZE, DTYPES,
+                      SEEDS))
 @torch.inference_mode()
 def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed):
    torch.manual_seed(seed)

--- a/tests/kv_transfer/test_lookup_buffer.py
+++ b/tests/kv_transfer/test_lookup_buffer.py
@@ -20,7 +20,7 @@ def test_run(my_rank, buffer, device):
        assert buffer.buffer_size == 0
        assert len(buffer.buffer) == 0
-    print("My rank: %d, device: %s" % (my_rank, device))
+    print(f"My rank: {my_rank}, device: {device}")
    # insert
    tokens = torch.tensor([1, 2, 3]).to(device)
@@ -48,7 +48,7 @@ def test_run(my_rank, buffer, device):
        assert buffer.buffer_size == 0
        assert len(buffer.buffer) == 0
-    print("My rank: %d, Test run passed!" % (my_rank))
+    print(f"My rank: {my_rank}, Test run passed!")
 def stress_test(my_rank, buf, device):
@@ -94,7 +94,7 @@ def stress_test(my_rank, buf, device):
                assert torch.allclose(k, k_)
                assert torch.allclose(v, v_)
                assert torch.allclose(h, h_)
-    print('Rank %d done' % my_rank)
+    print(f"Rank {my_rank} done")
    torch.distributed.barrier()
    if my_rank == 0:
@@ -108,7 +108,7 @@ def stress_test(my_rank, buf, device):
    else:
        torch.distributed.send(torch.tensor([n]), 0)
-    print("My rank: %d, Passed stress test!" % (my_rank))
+    print(f"My rank: {my_rank}, Passed stress test!")
 if __name__ == "__main__":
@@ -122,7 +122,7 @@ if __name__ == "__main__":
        rank=my_rank,
    )
-    print("initialized! My rank is %d" % my_rank)
+    print(f"initialized! My rank is {my_rank}")
    config = KVTransferConfig(
        kv_connector='PyNcclConnector',

--- a/tests/lora/test_qwen2vl.py
+++ b/tests/lora/test_qwen2vl.py
@@ -55,9 +55,9 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
    return generated_texts
-@pytest.mark.xfail(current_platform.is_rocm(),
+@pytest.mark.xfail(
-                   reason="Qwen2-VL dependency xformers incompatible with ROCm"
+    current_platform.is_rocm(),
-                   )
+    reason="Qwen2-VL dependency xformers incompatible with ROCm")
 def test_qwen2vl_lora(qwen2vl_lora_files):
    llm = vllm.LLM(
        MODEL_PATH,

--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -521,7 +521,8 @@ VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2)
 # - image embeddings
 # - video
 # - custom inputs
-@pytest.mark.parametrize("model_type,test_case",
+@pytest.mark.parametrize(
+    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.IMAGE,
@@ -543,7 +544,8 @@ def test_single_image_models(tmp_path: PosixPath, model_type: str,
    )
-@pytest.mark.parametrize("model_type,test_case",
+@pytest.mark.parametrize(
+    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.MULTI_IMAGE,
@@ -565,7 +567,8 @@ def test_multi_image_models(tmp_path: PosixPath, model_type: str,
    )
-@pytest.mark.parametrize("model_type,test_case",
+@pytest.mark.parametrize(
+    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.EMBEDDING,
@@ -586,7 +589,8 @@ def test_image_embedding_models(model_type: str,
    )
-@pytest.mark.parametrize("model_type,test_case",
+@pytest.mark.parametrize(
+    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.VIDEO,
@@ -605,7 +609,8 @@ def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
    )
-@pytest.mark.parametrize("model_type,test_case",
+@pytest.mark.parametrize(
+    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.CUSTOM_INPUTS,
@@ -627,7 +632,8 @@ def test_custom_inputs_models(
 #### Tests filtering for things running each test as a new process
-@pytest.mark.parametrize("model_type,test_case",
+@pytest.mark.parametrize(
+    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.IMAGE,
@@ -650,7 +656,8 @@ def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
    )
-@pytest.mark.parametrize("model_type,test_case",
+@pytest.mark.parametrize(
+    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.MULTI_IMAGE,
@@ -673,7 +680,8 @@ def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
    )
-@pytest.mark.parametrize("model_type,test_case",
+@pytest.mark.parametrize(
+    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.EMBEDDING,
@@ -695,7 +703,8 @@ def test_image_embedding_models_heavy(model_type: str,
    )
-@pytest.mark.parametrize("model_type,test_case",
+@pytest.mark.parametrize(
+    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.VIDEO,
@@ -715,7 +724,8 @@ def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
    )
-@pytest.mark.parametrize("model_type,test_case",
+@pytest.mark.parametrize(
+    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.CUSTOM_INPUTS,

--- a/tests/models/decoder_only/vision_language/test_pixtral.py
+++ b/tests/models/decoder_only/vision_language/test_pixtral.py
@@ -135,10 +135,10 @@ def _dump_outputs_w_logprobs(
    outputs: OutputsLogprobs,
    filename: "StrPath",
 ) -> None:
-    json_data = [(tokens, text,
+    json_data = [(tokens, text, [{
-                  [{k: asdict(v)
+        k: asdict(v)
-                    for k, v in token_logprobs.items()}
+        for k, v in token_logprobs.items()
-                   for token_logprobs in (logprobs or [])])
+    } for token_logprobs in (logprobs or [])])
                 for tokens, text, logprobs in outputs]
    with open(filename, "w") as f:
@@ -149,11 +149,10 @@ def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
    with open(filename, "rb") as f:
        json_data = json.load(f)
-    return [(tokens, text,
+    return [(tokens, text, [{
-             [{int(k): Logprob(**v)
+        int(k): Logprob(**v)
-               for k, v in token_logprobs.items()}
+        for k, v in token_logprobs.items()
-              for token_logprobs in logprobs])
+    } for token_logprobs in logprobs]) for tokens, text, logprobs in json_data]
-            for tokens, text, logprobs in json_data]
 @large_gpu_test(min_gb=80)

--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -314,9 +314,9 @@ def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4):
 @pytest.mark.skip(reason="2of4 sparse w16a16 CUTLASS produces bad output.")
-@pytest.mark.skipif(not sparse_cutlass_supported(),
+@pytest.mark.skipif(
-                    reason="2of4 Sparse is not yet supported on this GPU type."
+    not sparse_cutlass_supported(),
-                    )
+    reason="2of4 Sparse is not yet supported on this GPU type.")
 @pytest.mark.parametrize(
    "args_2of4",
    [("nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor")])

--- a/tests/samplers/test_rejection_sampler.py
+++ b/tests/samplers/test_rejection_sampler.py
@@ -23,16 +23,17 @@ def mock_causal_accepted_tensor(
    """
    batch_size = last_accepted_indices.shape[0]
-    accepted = (torch.arange(k).expand(batch_size, k) <=
+    accepted = (torch.arange(k).expand(batch_size, k)
-                last_accepted_indices.unsqueeze(-1).broadcast_to(
+                <= last_accepted_indices.unsqueeze(-1).broadcast_to(
                    batch_size, k))
    # Sprinkle accepted values after the contiguous initial accepted values.
    # This replicates the behavior of rejection sampling, which may "accept"
    # a token that cannot be accepted because of causality.
-    sprinkle_candidates = (
+    sprinkle_candidates = (torch.arange(k).expand(
-        torch.arange(k).expand(batch_size, k) >
+        batch_size,
-        last_accepted_indices.unsqueeze(-1).broadcast_to(batch_size, k) + 1)
+        k) > last_accepted_indices.unsqueeze(-1).broadcast_to(batch_size, k) +
+                           1)
    sprinkle = torch.rand(batch_size, k) > 0.5
    accepted[sprinkle_candidates] = sprinkle[sprinkle_candidates]
    return accepted
@@ -445,8 +446,8 @@ def test_rejection_sampling_approximates_target_distribution(
        distance_wrt_reference)
    expected_improvement_multiplier = 20
-    assert (relative_change_in_distance_wrt_target >
+    assert (relative_change_in_distance_wrt_target
-            relative_change_in_distance_wrt_reference *
+            > relative_change_in_distance_wrt_reference *
            expected_improvement_multiplier)

--- a/tools/report_build_time_ninja.py
+++ b/tools/report_build_time_ninja.py
@@ -274,8 +274,9 @@ def SummarizeEntries(entries, extra_step_types):
    print('    {:.1f} s weighted time ({:.1f} s elapsed time sum, {:1.1f}x '
          'parallelism)'.format(length, total_cpu_time,
                                total_cpu_time * 1.0 / length))
-    print('    %d build steps completed, average of %1.2f/s' %
+    print('    {} build steps completed, average of {:1.2f}/s'.format(
-          (len(entries), len(entries) / (length)))
+        len(entries),
+        len(entries) / (length)))
 def main():

--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -820,8 +820,8 @@ def scaled_int8_quant(
    if scale is not None:
        # static-per-tensor quantization.
        assert symmetric == (
-            azp is
+            azp
-            None), "azp must only be provided for asymmetric quantization."
+            is None), "azp must only be provided for asymmetric quantization."
        torch.ops._C.static_scaled_int8_quant(output, input, scale, azp)
        return output, scale, azp

--- a/vllm/attention/ops/prefix_prefill.py
+++ b/vllm/attention/ops/prefix_prefill.py
@@ -219,8 +219,8 @@ if triton.__version__ >= "2.1.0":
                          float("-inf"))
            if SLIDING_WINDOW > 0:
                qk = tl.where(
-                    offs_m[:, None] -
+                    offs_m[:, None] - (start_n + offs_n[None, :])
-                    (start_n + offs_n[None, :]) < SLIDING_WINDOW, qk, -10000)
+                    < SLIDING_WINDOW, qk, -10000)
            # -- compute m_ij, p, l_ij
            m_ij = tl.max(qk, 1)
@@ -324,9 +324,9 @@ if triton.__version__ >= "2.1.0":
            (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs +
            cur_head * stride_qh + offs_d[None, :] * stride_qd)
-        q = tl.load(
+        q = tl.load(Q + off_q,
-            Q + off_q,
+                    mask=offs_m[:, None]
-            mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len,
+                    < cur_batch_seq_len - cur_batch_ctx_len,
                    other=0.0)
        # # initialize pointer to m and l
@@ -402,8 +402,8 @@ if triton.__version__ >= "2.1.0":
            # -- compute qk ----
            k = tl.load(k_ptrs +
                        (cur_batch_in_all_start_index + start_n) * stride_kbs,
-                        mask=(start_n + offs_n[None, :]) <
+                        mask=(start_n + offs_n[None, :])
-                        cur_batch_seq_len - cur_batch_ctx_len,
+                        < cur_batch_seq_len - cur_batch_ctx_len,
                        other=0.0)
            qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
@@ -430,8 +430,8 @@ if triton.__version__ >= "2.1.0":
            # update acc
            v = tl.load(v_ptrs +
                        (cur_batch_in_all_start_index + start_n) * stride_vbs,
-                        mask=(start_n + offs_n[:, None]) <
+                        mask=(start_n + offs_n[:, None])
-                        cur_batch_seq_len - cur_batch_ctx_len,
+                        < cur_batch_seq_len - cur_batch_ctx_len,
                        other=0.0)
            p = p.to(v.dtype)
@@ -639,8 +639,8 @@ if triton.__version__ >= "2.1.0":
            k = tl.load(k_ptrs +
                        (cur_batch_in_all_start_index + start_n) * stride_kbs,
                        mask=dim_mask[:, None] &
-                        ((start_n + offs_n[None, :]) <
+                        ((start_n + offs_n[None, :])
-                         cur_batch_seq_len - cur_batch_ctx_len),
+                         < cur_batch_seq_len - cur_batch_ctx_len),
                        other=0.0)
            qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
@@ -677,8 +677,8 @@ if triton.__version__ >= "2.1.0":
            v = tl.load(v_ptrs +
                        (cur_batch_in_all_start_index + start_n) * stride_vbs,
                        mask=dim_mask[None, :] &
-                        ((start_n + offs_n[:, None]) <
+                        ((start_n + offs_n[:, None])
-                         cur_batch_seq_len - cur_batch_ctx_len),
+                         < cur_batch_seq_len - cur_batch_ctx_len),
                        other=0.0)
            p = p.to(v.dtype)