Merge branch 'main' into dcu

667632cc · guchaoyang · GitHub · d6dd2ddf · a874e4e8 · 667632cc
Unverified commit 667632cc, authored Dec 22, 2025 by guchaoyang; committed by GitHub on Dec 22, 2025
20 changed files
--- a/examples/blocksparse_attention/example_triton_sparse_gqa_decode_varlen_indice.py
+++ b/examples/blocksparse_attention/example_triton_sparse_gqa_decode_varlen_indice.py
--- a/examples/blocksparse_attention/example_triton_sparse_gqa_decode_varlen_mask.py
+++ b/examples/blocksparse_attention/example_triton_sparse_gqa_decode_varlen_mask.py
--- a/examples/blocksparse_attention/heuristic.py
+++ b/examples/blocksparse_attention/heuristic.py
 import math


-def num_splits_heuristic(total_mblocks, num_SMs, num_n_blocks, num_m_blocks, size_one_kv_head,
-                         is_causal_or_local, max_splits):
+def num_splits_heuristic(total_mblocks, num_SMs, num_n_blocks, num_m_blocks, size_one_kv_head, is_causal_or_local, max_splits):
    """
    Determines the optimal number of splits for maximizing GPU occupancy while balancing memory efficiency.


--- a/examples/blocksparse_attention/test_example_blocksparse_attention.py
+++ b/examples/blocksparse_attention/test_example_blocksparse_attention.py
--- a/examples/blocksparse_gemm/example_blocksparse_gemm.py
+++ b/examples/blocksparse_gemm/example_blocksparse_gemm.py
--- a/examples/cast/example_group_per_split_token_cast_to_fp8.py
+++ b/examples/cast/example_group_per_split_token_cast_to_fp8.py
--- a/examples/cast/example_per_token_cast_to_fp8.py
+++ b/examples/cast/example_per_token_cast_to_fp8.py
--- a/examples/cast/example_triton_cast_to_fp8.py
+++ b/examples/cast/example_triton_cast_to_fp8.py
@@ -128,9 +128,7 @@ def per_token_group_quant_fp8(
        Tuple[torch.Tensor, torch.Tensor]: The quantized tensor and the
        scaling factor for quantization.
    """
-    assert (x.shape[-1] %
-            group_size == 0), (f"the last dimension of `x` {x.shape[-1]} must be divisible "
-                               f"by `group_size` {group_size}")
+    assert x.shape[-1] % group_size == 0, f"the last dimension of `x` {x.shape[-1]} must be divisible by `group_size` {group_size}"
    assert x.stride(-1) == 1, "`x` groups must be contiguous"

    finfo = torch.finfo(dtype)

--- a/examples/cast/test_example_cast.py
+++ b/examples/cast/test_example_cast.py
--- a/examples/conftest.py
+++ b/examples/conftest.py
--- a/examples/convolution/example_convolution.py
+++ b/examples/convolution/example_convolution.py
--- a/examples/convolution/example_convolution_autotune.py
+++ b/examples/convolution/example_convolution_autotune.py
--- a/examples/deepseek_deepgemm/example_deepgemm_fp8_2xAcc.py
+++ b/examples/deepseek_deepgemm/example_deepgemm_fp8_2xAcc.py
--- a/examples/deepseek_mla/amd/benchmark_mla_decode_amd_tilelang.py
+++ b/examples/deepseek_mla/amd/benchmark_mla_decode_amd_tilelang.py
--- a/examples/deepseek_mla/amd/benchmark_mla_decode_amd_torch.py
+++ b/examples/deepseek_mla/amd/benchmark_mla_decode_amd_torch.py
--- a/examples/deepseek_mla/amd/benchmark_mla_decode_amd_triton.py
+++ b/examples/deepseek_mla/amd/benchmark_mla_decode_amd_triton.py
--- a/examples/deepseek_mla/benchmark_mla.py
+++ b/examples/deepseek_mla/benchmark_mla.py
--- a/examples/deepseek_mla/example_mla_decode.py
+++ b/examples/deepseek_mla/example_mla_decode.py
--- a/examples/deepseek_mla/example_mla_decode_paged.py
+++ b/examples/deepseek_mla/example_mla_decode_paged.py
--- a/examples/deepseek_mla/example_mla_decode_persistent.py
+++ b/examples/deepseek_mla/example_mla_decode_persistent.py