# Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # See LICENSE for license information. """Efficient NVFP4 padding kernels written with OpenAI Triton . TODO(ksivamani): Documentation """ import triton import triton.language as tl @triton.autotune( configs=[ triton.Config({"BLOCK_M": 128, "BLOCK_N": 128}, num_warps=4, num_stages=2), triton.Config({"BLOCK_M": 128, "BLOCK_N": 256}, num_warps=4, num_stages=2), triton.Config({"BLOCK_M": 256, "BLOCK_N": 128}, num_warps=8, num_stages=2), triton.Config({"BLOCK_M": 128, "BLOCK_N": 256}, num_warps=8, num_stages=1), ], key=["out_dim0", "out_dim1"], ) @triton.jit def zero_pad_kernel( inp_ptr, out_ptr, in_dim0: tl.constexpr, in_dim1: tl.constexpr, out_dim0: tl.constexpr, out_dim1: tl.constexpr, in_s0, in_s1, out_s0, out_s1, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, ): """Pads a tensor assuming it's a columnwise scaling inverse.""" # tile over OUTPUT coordinates pid_m = tl.program_id(0) pid_n = tl.program_id(1) offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) # output rows offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) # output cols om = offs_m[:, None] on = offs_n[None, :] # edge masking for output out_mask = (om < out_dim0) & (on < out_dim1) # valid input region is simply top-left (no offsets) in_mask = (om < in_dim0) & (on < in_dim1) # load valid input, else zero (masked load touches memory only where True) x = tl.load(inp_ptr + om * in_s0 + on * in_s1, mask=in_mask, other=0) # store to output (only within bounds of the output tile) tl.store(out_ptr + om * out_s0 + on * out_s1, x, mask=out_mask)