builtin.h

// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

/*!
 * \file tl/op/builtin.h
 * \brief Builtin intrinsics.
 *
 */

#ifndef TVM_TL_OP_BUILTIN_H_
#define TVM_TL_OP_BUILTIN_H_

#include "op.h"

namespace tvm {
namespace tl {

/*!
 * \brief tvm intrinsics for TMADescriptor creation for tiled load
 *
 * CuTensorMap* CreateTMADescriptorOp(data_type, rank, global_addr, global_shape...,
 * global_stride..., smem_box..., smem_stride..., interleave, swizzle, l2_promotion, oob_fill)
 *
 */
const Op& CreateTMADescriptorOp();

/*!
 * \brief tvm intrinsics for TMADescriptor creation for image to column load
 *
 * CuTensorMap* CreateTMAIm2ColDescriptorOp(data_type, rank, global_addr, global_shape...,
 * global_stride..., elem_stride..., lower_corner..., upper_corner..., smme_box_pixel, smem_box_channel,
 * interleave, swizzle, l2_promotion, oob_fill)
 *
 */
const Op& CreateTMAIm2ColDescriptorOp();

/*!
 * \brief Create a list of mbarrier with num_threads
 *
 * GetMBarrier(num_threads0, num_threads1, ...)
 *
 */
const Op& CreateListofMBarrierOp();

/*!
 * \brief Get the mbarrier with barrier_id
 *
 * int64_t* GetMBarrier(barrier_id)
 *
 */
const Op& GetMBarrierOp();

/*!
 * \brief tvm intrinsics for loading data from global tensor descriptor to shared memory
 *
 * TMALoadOp(descriptor, mbarrier, smem_data, coord_0, coord_1, ...)
 *
 */
const Op& TMALoadOp();

/*!
 * \brief tvm intrinsics for loading image from global tensor to columns in shared memory
 *
 * TMALoadOp(descriptor, mbarrier, smem_data, coord_0, coord_1, ..., image_offset, ...)
 *
 */
const Op& TMALoadIm2ColOp();

/*!
 * \brief tvm intrinsics for storing data from shared memory to global tensor descriptor
 *
 * TMAStoreOp(descriptor, smem_data, coord_0, coord_1, ...)
 *
 */
const Op& TMAStoreOp();

/*!
 * \brief tvm intrinsics for mbarrier wait with parity bit
 *
 * MBarrierWaitParity(mbarrier, parity)
 *
 */
const Op& MBarrierWaitParity();

/*!
 * \brief tvm intrinsics for mbarrier expect tx
 *
 * MBarrierExpectTX(mbarrier, transaction_bytes)
 *
 */
const Op& MBarrierExpectTX();

/*!
 * \brief tvm intrinsics for ldmatrix
 *
 * LDMatrixOp(transposed, num, shared_addr, local_addr)
 *
 */
const Op& LDMatrixOp();

/*!
 * \brief tvm intrinsics for stmatrix
 *
 * LDMatrixOp(transposed, num, shared_addr, int32_values...)
 *
 */
const Op& STMatrixOp();

/*!
 * \brief Pack two b16 value into a b32 value
 *
 * int32 PackB16Op(b16_value, b16_value)
 *
 */
const Op& PackB16Op();

/*!
 * \brief Similar to __syncthreads(), but can be used to sync partial threads
 *
 * SyncThreadsPartialOp(num_partial_threads or mbarrier)
 *
 */
const Op& SyncThreadsPartialOp();

/*!
 * \brief Issue a shared memory fence for async operations
 *
 * FenceProxyAsync()
 *
 */
const Op& FenceProxyAsyncOp();

/*!
 * \brief Set reg hint for warp-specialized branched
 *
 * SetMaxNRegInc(num_reg, is_inc)
 *
 */
const Op& SetMaxNReg();

/*!
 * \brief Wait the previous wgmma to finish
 *
 * WaitWgmma(num_mma)
 *
 */
const Op& WaitWgmma();

/*!
 * \brief tvm intrinsic for amd matrix core mfma instructions.
 *
 *  void tvm_mfma(StringImm shape, StringImm A_layout, StringImm B_layout,
 *               StringImm A_dtype, StringImm B_dtype, StringImm C_dtype,
 *               Var multiplicand_a, Expr a_index,
 *               Var multiplicand_b, Expr b_index,
 *               Var accumulator, Expr c_index);
 */
TVM_DLL const Op &tvm_mfma();

/*!
 * \brief tvm intrinsic for storing the result of AMD MFMA into a destination
 * pointer.
 *
 *        There is no real instruction that does that, but we want to hide
 * details of complex index manipulation behind this intrinsic to simplify TIR
 * lowering passes (e.g. LowerWarpMemory) like cuda ptx backend does.
 *
 * void tvm_mfma_store(IntImm m, IntImm n, Var dst_ptr, Var src_ptr, Expr
 * src_offset, Var dst_stride);
 */
TVM_DLL const Op &tvm_mfma_store();

/*!
 * \brief tvm intrinsic for amd rdna matrix core instructions.
 *
 *  void tvm_rdna_wmma(StringImm shape, StringImm A_layout, StringImm B_layout,
 *               StringImm A_dtype, StringImm B_dtype, StringImm C_dtype,
 *               Var multiplicand_a, Expr a_index,
 *               Var multiplicand_b, Expr b_index,
 *               Var accumulator, Expr c_index);
 */
TVM_DLL const Op &tvm_rdna_wmma();

/*!
 * \brief tvm intrinsic for storing the result of AMD RDNA WMMA into a
 * destination pointer.
 *
 *        There is no real instruction that does that, but we want to hide
 * details of complex index manipulation behind this intrinsic to simplify TIR
 * lowering passes (e.g. LowerWarpMemory) like cuda ptx backend does.
 *
 * void tvm_rdna_wmma_store(IntImm m, IntImm n, Var dst_ptr, Var src_ptr, Expr
 * src_offset, Var dst_stride);
 */
TVM_DLL const Op &tvm_rdna_wmma_store();

}  // namespace tl
}  // namespace tvm

#endif  //  TVM_TL_OP_BUILTIN_H_