/*!
 * \file tl/op/gemm.cc
 * \brief Implementation of General Matrix Multiplication (GEMM) operators
 */

#include "gemm.h"

8
#include "builtin.h"
9
10
11
#include <tvm/tir/builtin.h>
#include <tvm/tir/op.h>
#include <tvm/tir/op_attr_types.h>
12
#include <tvm/tir/transform.h>
13
14

#include "../target/utils.h"
15
#include "tcgen5_meta.h"
16
17
18
19
20
21

namespace tvm {
namespace tl {

using namespace tir;

22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
/**
 * @brief Construct a Gemm operator from serialized TL arguments and a buffer
 * map.
 *
 * This constructor deserializes operator parameters from `args` and resolves
 * buffer references via `vmap`, populating an internal GemmNode with:
 * - device pointers for A, B, C and their corresponding Buffer objects,
 * - transpose flags for A and B,
 * - matrix dimensions M, N, K,
 * - warp allocation policy and clear_accum flag,
 * - strides and memory offsets for A and B,
 * - optional kPack (must be 1 or 2) and optional wg_wait.
 *
 * The populated GemmNode is stored into the wrapper's internal `data_`.
 *
 * @param args Positional serialized arguments produced by the TL frontend:
 *   expected layout is:
 *     [Aptr, Bptr, Cptr, trans_A (Bool), trans_B (Bool),
 *      M (Int), N (Int), K (Int), policy (Int), clear_accum (Bool),
 *      stride_A (Int), stride_B (Int), offset_A (Int), offset_B (Int),
 *      (optional) kPack (Int), (optional) wg_wait (Int)]
 * @param vmap Mapping from access pointer vars to Buffer objects used to
 *   resolve the Buffer corresponding to each pointer argument.
 *
46
 * @note If `kPack` is provided it must be 1; otherwise the constructor
47
48
49
 *       fails with an ICHECK (runtime assertion). No other validation is
 *       performed here.
 */
50
Gemm::Gemm(Array<PrimExpr> args, BufferMap vmap) {
51
  ObjectPtr<GemmNode> node = tvm::ffi::make_object<GemmNode>();
52
53
54
55
56
57
58
59
60
61
62
63

  node->Aptr = args[0];
  node->Bptr = args[1];
  node->Cptr = args[2];
  node->A = vmap[GetVarFromAccessPtr(node->Aptr)];
  node->B = vmap[GetVarFromAccessPtr(node->Bptr)];
  node->C = vmap[GetVarFromAccessPtr(node->Cptr)];
  node->trans_A = args[3].as<Bool>().value();
  node->trans_B = args[4].as<Bool>().value();
  node->M = args[5].as<IntImm>().value()->value;
  node->N = args[6].as<IntImm>().value()->value;
  node->K = args[7].as<IntImm>().value()->value;
64
  node->policy = GemmWarpPolicy(args[8].as<IntImm>().value()->value);
65
  node->clear_accum = args[9].as<PrimExpr>().value();
66
67
68
69
  node->stride_A = args[10].as<IntImm>().value()->value;
  node->stride_B = args[11].as<IntImm>().value()->value;
  node->offset_A = args[12].as<IntImm>().value()->value;
  node->offset_B = args[13].as<IntImm>().value()->value;
70
  if (args.size() > 14) {
71
72
    node->kPack = args[14].as<IntImm>().value()->value;
    if (node->kPack != 1 && node->kPack != 2) {
73
74
75
      ICHECK(false) << "kPack must be 1 or 2";
    }
  }
76
  if (args.size() > 15) {
77
    node->wg_wait = args[15].as<IntImm>().value()->value;
78
  }
79
80
81
82
83
84
85
86
  node->mbarptr = args[16];
  if (node->mbarptr.as<CallNode>()) {
    node->mbar = vmap[GetVarFromAccessPtr(node->mbarptr)];
  } else {
    node->mbar = std::nullopt;
  }
  node->C_coords = Array<PrimExpr>(
      {args[17].as<PrimExpr>().value(), args[18].as<PrimExpr>().value()});
87
  data_ = std::move(node);
88
89
}

90
91
92
93
94
95
96
97
/**
 * @brief Create a copy of this GemmNode as a TileOperator.
 *
 * Constructs a new GemmNode by copying the current node state and returns it
 * wrapped in a Gemm TileOperator.
 *
 * @return TileOperator A Gemm operator that owns a copy of this node.
 */
98
TileOperator GemmNode::Clone() const {
99
  auto op = tvm::ffi::make_object<GemmNode>(*this);
100
101
102
  return Gemm(op);
}

103
104
105
106
107
108
109
110
111
112
// Returns true when this GEMM can be lowered to SM100 TCGEN5MMA: the target
// must be SM100, A must live in shared memory (or shared.tmem), B in shared
// memory, C in shared.tmem, and a TCGEN5MMA atom must exist for (M, N, K,
// A dtype, C dtype).
bool GemmNode::AllowTCGEN5MMA(Target target) const {
  if (!TargetIsSm100(target))
    return false;
  const bool a_scope_ok = A.scope() == "shared.dyn" ||
                          A.scope() == "shared" || A.scope() == "shared.tmem";
  const bool b_scope_ok = B.scope() == "shared.dyn" || B.scope() == "shared";
  if (!a_scope_ok || !b_scope_ok || C.scope() != "shared.tmem")
    return false;
  return GetTCGEN5MMAMeta(M, N, K, A->dtype, C->dtype).first;
}

bool GemmNode::AllowWGMMA(int block_size, Target target) const {
113
114
  tvm::transform::PassContext ctxt = tvm::transform::PassContext::Current();

115
116
  int warp_size = TargetGetWarpSize(target);
  int num_warps = block_size / warp_size;
117
118
119
120
121
122
123
124
125
126
127
  return !ctxt->GetConfig(kDisableWGMMA, Optional<Bool>()).value_or(false) &&
         TargetIsHopper(target) && (this->M >= 64) && (num_warps % 4 == 0) &&
         CheckWGMMA();
}

/**
 * @brief Select the GEMM instruction family for the given block size/target.
 *
 * Preference order: TCGEN5MMA (SM100) > WGMMA (Hopper) > MFMA (CDNA) >
 * MMA (other CUDA targets). Aborts via ICHECK for unsupported targets.
 *
 * @param block_size Number of threads in the thread block.
 * @param target Compilation target.
 * @return GemmInst The selected instruction family.
 */
GemmInst GemmNode::GetGemmInst(int block_size, Target target) const {
  bool allow_tcgen5mma = AllowTCGEN5MMA(target);
  bool allow_wgmma = AllowWGMMA(block_size, target);
  if (allow_tcgen5mma) {
    return GemmInst::kTCGEN5MMA;
  } else if (allow_wgmma) {
    return GemmInst::kWGMMA;
  } else if (TargetIsCDNA(target)) {
    return GemmInst::kMFMA;
  } else if (TargetIsCuda(target)) {
    return GemmInst::kMMA;
  } else {
    ICHECK(0) << "Unsupported target for gemm: " << target;
    // ICHECK(0) aborts at runtime; this return is unreachable and exists only
    // so control cannot fall off the end of a non-void function (avoids
    // -Wreturn-type warnings / UB if assertions are ever compiled out).
    return GemmInst::kMMA;
  }
}

138
139
/**
 * @brief Partition the block's warps between the M and N dimensions.
 *
 * Splits `num_warps = block_size / warp_size` into an (m_warp, n_warp) grid
 * according to the policy (FullRow / FullCol / Square), the instruction family,
 * and divisibility constraints. The result is cached into the mutable members
 * `this->m_warp` / `this->n_warp` and also returned.
 *
 * @param M          GEMM M dimension (must be divisible by 16).
 * @param N          GEMM N dimension (must be divisible by 8; 16 on Volta).
 * @param block_size Threads per block.
 * @param target     Compilation target (determines warp size and Volta quirk).
 * @param gemm_inst  Selected instruction family; WGMMA and TCGEN5MMA have
 *                   special partitioning rules.
 * @return {m_warp, n_warp} with m_warp * n_warp == num_warps.
 */
std::pair<int, int> GemmWarpPolicyNode::ComputeWarpPartition(
    int M, int N, int block_size, Target target, GemmInst gemm_inst) const {
  int num_warps = block_size / TargetGetWarpSize(target);
  if (gemm_inst == GemmInst::kTCGEN5MMA) {
    return {1, num_warps}; // TCGEN5MMA doesn't care about warp partitioning
  }

  int m_warp = 1, n_warp = 1;
  constexpr int kMPerWarp = 16; // Rows processed by a single warp
  int kNPerWarp = 8;            // Columns processed by a single warp
  if (TargetIsVolta(target)) {
    kNPerWarp = 16;
  }
  ICHECK(M % kMPerWarp == 0)
      << "M must be divisible by " << kMPerWarp << ", but got " << M;
  ICHECK(N % kNPerWarp == 0)
      << "N must be divisible by " << kNPerWarp << ", but got " << N;

  if (gemm_inst == GemmInst::kWGMMA) {
    // WGMMA operates on warp-groups of 4 warps; m_warp must stay a multiple
    // of 4 throughout this branch.
    ICHECK(num_warps % 4 == 0) << "Warp-Group MMA requires 128×k threads.";

    constexpr int kGroup = 4; // Number of warps in a warp-group

    m_warp = kGroup; // Initially, only one warp-group on M dimension
    n_warp = num_warps / m_warp; // Rest all on N dimension

    if (this->isFullRow()) {
      // Try to put as many warp-groups as possible on M dimension
      // (decreasing multiples of 4, ensuring divisibility by M)
      for (int cand = num_warps; cand >= kGroup; cand -= kGroup) {
        if (M % (cand * kMPerWarp) == 0) {
          m_warp = cand;
          n_warp = num_warps / m_warp;
          break;
        }
      }
    } else if (this->isFullCol()) {
      // Try to use warps on N dimension; if N is not divisible, split excess
      // groups to M
      int cand_n = n_warp;                 // Initially assume all on N
      if (N % (cand_n * kNPerWarp) != 0) { // N direction division fails
        int max_n = N / kNPerWarp;
        // Find a feasible n_warp from max possible downwards, ensuring
        // num_warps/n_warp is multiple of 4
        for (int n = std::min(cand_n, max_n); n >= 1; --n) {
          if (num_warps % n == 0 && (num_warps / n) % kGroup == 0) {
            n_warp = n;
            m_warp = num_warps / n_warp;
            break;
          }
        }
      }
    } else if (this->isSquare()) {
      // Exhaustive search, but m must be multiple of 4
      int max_m = M / kMPerWarp;
      int max_n = N / kNPerWarp;

      // Score candidates by how close the per-warp work ratio is to M/N.
      float ideal = N > 0 ? static_cast<float>(M) / N : 1.f;

      float best_score = std::numeric_limits<float>::max();
      int best_m = kGroup, best_n = n_warp;

      for (int m = kGroup; m <= num_warps && m <= max_m; m += kGroup) {
        if (num_warps % m)
          continue;
        int n = num_warps / m;
        if (n > max_n)
          continue;

        float m_per_warp = static_cast<float>(M) / (m * kMPerWarp);
        float n_per_warp = static_cast<float>(N) / (n * kNPerWarp);
        float score = std::abs(m_per_warp / n_per_warp - ideal);

        if (score < best_score) {
          best_score = score;
          best_m = m;
          best_n = n;
        }
      }
      m_warp = best_m;
      n_warp = best_n;
    } else {
      ICHECK(0) << "Unknown GemmWarpPolicy";
    }

    ICHECK(m_warp * n_warp == num_warps)
        << "m_warp * n_warp must equal num_warps, m_warp: " << m_warp
        << ", n_warp: " << n_warp << ", num_warps: " << num_warps;

    // Store the computed values in the object's member variables
    this->m_warp = m_warp;
    this->n_warp = n_warp;

    return {m_warp, n_warp};
  }

  // Non-WGMMA (plain MMA/MFMA) partitioning below.
  if (this->isFullRow()) {
    // Try to partition M first
    m_warp = num_warps;
    n_warp = 1;

    // If M cannot be evenly divided by m_warp*16, try to split remaining warps
    // to N
    if (M % (m_warp * kMPerWarp) != 0) {
      // Calculate how many warps we can use for M
      int max_m_warps = M / kMPerWarp;
      m_warp = max_m_warps;
      // Use remaining warps for N
      n_warp = num_warps / m_warp;
      if (n_warp == 0)
        n_warp = 1;
    }
  } else if (this->isFullCol()) {
    // Try to partition N first
    m_warp = 1;
    n_warp = num_warps;

    // If N cannot be evenly divided by n_warp*8, try to split remaining warps
    // to M
    if (N % (n_warp * kNPerWarp) != 0) {
      // Calculate how many warps we can use for N
      int max_n_warps = N / kNPerWarp;
      n_warp = max_n_warps;
      // Use remaining warps for M
      m_warp = num_warps / n_warp;
      if (m_warp == 0)
        m_warp = 1;
    }
  } else if (this->isSquare()) {
    // First calculate the maximum possible warps for each dimension
    int max_m_warps =
        M / kMPerWarp; // Each warp needs at least 16 elements in M

    // Calculate the ideal ratio of M/N warps based on the matrix dimensions
    float ideal_ratio = 1.0f;
    if (N > 0) {
      ideal_ratio = static_cast<float>(M) / N;
    }

    // Try to find the best balanced partition
    int best_m = 1;
    int best_n = 1;
    float best_balance = std::numeric_limits<float>::max();
    // Try all possible combinations that satisfy the constraints
    for (int m = 1; m <= max_m_warps && m <= num_warps; m++) {
      int n = num_warps / m;

      // Calculate how balanced this partition is
      float m_per_warp = static_cast<float>(M) / (m * kMPerWarp);
      float n_per_warp = static_cast<float>(N) / (n * kNPerWarp);
      // m_per_warp and n_per_warp must be greater than 1
      if (m_per_warp < 1 || n_per_warp < 1)
        continue;
      // m * n must equal num_warps
      if (m * n != num_warps)
        continue;

      float balance = std::abs(m_per_warp / n_per_warp - ideal_ratio);

      if (balance < best_balance) {
        best_balance = balance;
        best_m = m;
        best_n = n;
      }
    }

    m_warp = best_m;
    n_warp = best_n;
  } else {
    ICHECK(0) << "Unknown GemmWarpPolicy";
  }
  ICHECK(m_warp * n_warp == num_warps)
      << "m_warp * n_warp must equal num_warps, m_warp: " << m_warp
      << ", n_warp: " << n_warp << ", num_warps: " << num_warps;

  // Store the computed values in the object's member variables
  this->m_warp = m_warp;
  this->n_warp = n_warp;

  return {m_warp, n_warp};
}

320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
/**
 * @brief Checks whether WGMMA (warp-group MMA) can be used for this GEMM.
 *
 * Evaluates device-memory placement, data-type combinations, transpose flags,
 * and K divisibility constraints required for the Hopper WGMMA code path.
 *
 * The check returns true only when:
 * - B resides in shared memory ("shared" or "shared.dyn"); and
 * - (C, A, B) dtypes match one of the supported combinations below and K
 *   satisfies the required alignment; and
 * - for combinations that require specific orientations, A is not transposed
 *   and B is transposed.
 *
 * Supported combinations and constraints:
 * - C=float16:
 *   - A=float16, B=float16: K % 16 == 0
336
337
 *   - Various float8 mixes (e4m3/e5m2): require (!trans_A && trans_B) and K %
 * 32 == 0
338
339
340
341
342
343
 * - C=float32:
 *   - A=float16, B=float16: K % 16 == 0
 *   - A=bfloat16, B=bfloat16: K % 16 == 0
 *   - A=float32, B=float32: require (!trans_A && trans_B) and K % 8 == 0
 *   - Various float8 mixes: require (!trans_A && trans_B) and K % 32 == 0
 * - C=int32:
344
345
 *   - 8-bit integer combinations (Int8/UInt8): require (!trans_A && trans_B)
 * and K % 32 == 0
346
347
348
349
 *
 * @return true if WGMMA is supported for the current buffers, dtypes, and
 *         transpose/shape constraints; false otherwise.
 */
350
bool GemmNode::CheckWGMMA() const {
351
352
353
354
  if (B.scope() != "shared.dyn" && B.scope() != "shared") {
    return false;
  }

355
356
357
  if (C->dtype == DataType::Float(16)) {
    if (A->dtype == DataType::Float(16) && B->dtype == DataType::Float(16))
      return K % 16 == 0;
358
    else if (A->dtype.is_float8_e4m3() && B->dtype.is_float8_e4m3())
359
      return (!trans_A) && trans_B && K % 32 == 0;
360
    else if (A->dtype.is_float8_e4m3() && B->dtype.is_float8_e5m2())
361
      return (!trans_A) && trans_B && K % 32 == 0;
362
    else if (A->dtype.is_float8_e5m2() && B->dtype.is_float8_e4m3())
363
      return (!trans_A) && trans_B && K % 32 == 0;
364
    else if (A->dtype.is_float8_e5m2() && B->dtype.is_float8_e5m2())
365
366
367
368
369
370
371
372
373
374
375
      return (!trans_A) && trans_B && K % 32 == 0;
    else
      return false;
  } else if (C->dtype == DataType::Float(32)) {
    if (A->dtype == DataType::Float(16) && B->dtype == DataType::Float(16))
      return K % 16 == 0;
    else if (A->dtype == DataType::BFloat(16) &&
             B->dtype == DataType::BFloat(16))
      return K % 16 == 0;
    else if (A->dtype == DataType::Float(32) && B->dtype == DataType::Float(32))
      return (!trans_A) && trans_B && K % 8 == 0;
376
    else if (A->dtype.is_float8_e4m3() && B->dtype.is_float8_e4m3())
377
      return (!trans_A) && trans_B && K % 32 == 0;
378
    else if (A->dtype.is_float8_e4m3() && B->dtype.is_float8_e5m2())
379
      return (!trans_A) && trans_B && K % 32 == 0;
380
    else if (A->dtype.is_float8_e5m2() && B->dtype.is_float8_e4m3())
381
      return (!trans_A) && trans_B && K % 32 == 0;
382
    else if (A->dtype.is_float8_e5m2() && B->dtype.is_float8_e5m2())
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
      return (!trans_A) && trans_B && K % 32 == 0;
    else
      return false;
  } else if (C->dtype == DataType::Int(32)) {
    if (A->dtype == DataType::Int(8) && B->dtype == DataType::Int(8))
      return (!trans_A) && trans_B && K % 32 == 0;
    else if (A->dtype == DataType::Int(8) && B->dtype == DataType::UInt(8))
      return (!trans_A) && trans_B && K % 32 == 0;
    else if (A->dtype == DataType::UInt(8) && B->dtype == DataType::Int(8))
      return (!trans_A) && trans_B && K % 32 == 0;
    else if (A->dtype == DataType::UInt(8) && B->dtype == DataType::UInt(8))
      return (!trans_A) && trans_B && K % 32 == 0;
    else
      return false;
  } else {
    return false;
  }
}

402
403
404
405
406
407
408
409
410
411
412
413
414
415
/**
 * @brief Parse and return the numeric GPU architecture from a Target's "arch"
 * attribute.
 *
 * Examines the target's "arch" string and, if it matches the pattern
 * "sm_<num>", returns <num> as an int. If the attribute is present but does not
 * match that pattern, returns 0.
 *
 * Preconditions: the target must have an "arch" attribute (this is checked via
 * ICHECK).
 *
 * @return int The parsed architecture number (e.g., 80 for "sm_80"), or 0 if
 * the arch string does not match "sm_<num>".
 */
416
417
static int GetArchInt(Target target) {
  int arch_int = 0;
418
419
  auto s = target->GetAttr<tvm::ffi::String>("arch");
  ICHECK(s.has_value());
420
421
422
  std::string arch = s.value();
  if (arch.rfind("sm_", 0) == 0) {
    arch_int = std::stoi(arch.substr(3));
423
424
425
426
427
428
  } else {
    arch_int = 0;
  }
  return arch_int;
}

429
430
431
432
433
434
435
436
437
438
439
440
441
/**
 * @brief Lower the GEMM operator to a TL TIR call expression.
 *
 * Constructs a tl::gemm call string parameterized by M, N, K, warp partition,
 * transpose flags, accumulation clearing, target-specific stride/offset/kPack
 * and optional workgroup wait value, then returns an Evaluate(call) node
 * invoking tl::tl_gemm with the composed string and the A/B/C buffer handles.
 * TCGEN5MMA takes a separate early-return path that emits a call_extern
 * guarded so only the leading warp issues the instruction.
 *
 * @param T Contains lowering context including thread bounds and target.
 * @param analyzer Optional arithmetic analyzer used by lowering (may be
 * nullptr for non-TCGEN5MMA paths; the TCGEN5MMA path dereferences it).
 * @return Stmt A TIR statement representing the evaluated TL GEMM call.
 */
Stmt GemmNode::Lower(const LowerArgs &T, arith::Analyzer *analyzer) const {
  auto block_size = *as_const_int(T.thread_bounds->extent);
  GemmInst gemm_inst = GetGemmInst(block_size, T.target);
  auto [warp_m, warp_n] =
      policy->ComputeWarpPartition(M, N, block_size, T.target, gemm_inst);

  std::stringstream ss;
  std::string op_name;

  if (gemm_inst == GemmInst::kTCGEN5MMA) {
    // TCGEN5MMA path: emit a call_extern to the tcgen5mma template and return
    // early; it does not go through tl_gemm.
    auto [can_use_tcgen5mma, meta] =
        GetTCGEN5MMAMeta(M, N, K, A->dtype, C->dtype);
    ICHECK(can_use_tcgen5mma);
    ICHECK(B.scope() == "shared.dyn" || B.scope() == "shared");
    ICHECK(C.scope() == "shared.tmem");
    ICHECK(mbar.has_value()) << "mbar must be provided for TCGEN5MMA";
    // Choose the variant by where A lives: tmem ("ts") vs shared ("ss").
    if (A.scope() == "shared.tmem") {
      op_name = "tl::tcgen5mma_gemm_ts";
    } else if (A.scope() == "shared.dyn" || A.scope() == "shared") {
      op_name = "tl::tcgen5mma_gemm_ss";
    } else {
      ICHECK(0)
          << "Unsupported A scope for TCGEN5MMA: "
          << A.scope(); // If this is triggered, it means Tilelang has bugs.
    }
    ICHECK(wg_wait == -1)
        << "Currently only wg_wait == -1 is supported for TCGEN5MMA. Please "
           "use "
           "wg_wait = -1 and manually synchronize with mbarrier.";

    // Only float32 accumulation is supported here.
    std::string accum_dtype = "";
    if (C->dtype.is_float()) {
      if (C->dtype.bits() == 32) {
        accum_dtype = "float";
      }
    }
    ICHECK(!accum_dtype.empty())
        << "Unsupported C dtype for TCGEN5MMA: " << C->dtype;
    ss << op_name << "<" << M << ", " << N << ", " << K << ", ";
    ss << meta.atom_m << ", " << meta.atom_n << ", " << meta.atom_k << ", ";
    ss << trans_A << ", " << trans_B << ", ";
    ss << accum_dtype;
    ss << ">";

    auto C_buffer = T.buffer_remap.count(C) ? T.buffer_remap[C] : C;
    Array<PrimExpr> new_args;
    new_args.push_back(StringImm(ss.str()));
    new_args.push_back(Aptr);
    new_args.push_back(Bptr);
    new_args.push_back(BufferLoad(C_buffer, C_coords));
    new_args.push_back(mbarptr);
    new_args.push_back(clear_accum);
    auto new_call = Call(DataType::Handle(), builtin::call_extern(), new_args);

    // Since TCGEN5MMA atoms provided by CUTLASS always have an internal
    // `elect_one_sync()`, we check if we are calling it using full warps
    constexpr int warp_size = 32;
    ICHECK(
        analyzer->CanProveEqual(FloorMod(T.thread_bounds->min, warp_size), 0) &&
        analyzer->CanProveEqual(FloorMod(T.thread_bounds->extent, warp_size),
                                0))
        << "TCGEN5MMA requires thread bounds to be multiples of warp size (32) "
           "and aligned to warps.";
    if (analyzer->CanProveEqual(T.thread_bounds->extent, warp_size)) {
      // If the thread bounds is exactly one warp, we can use the original call
      return Evaluate(new_call);
    } else {
      // Add an if-else clause so only the first warp in range issues the MMA.
      auto tcgen5mma_call =
          IfThenElse(EQ(FloorDiv(T.thread_var, warp_size),
                        FloorDiv(T.thread_bounds->min, warp_size)),
                     Evaluate(new_call));
      return tcgen5mma_call;
    }
  }

  // Select the tl::gemm template variant by operand residency:
  // rs = A in registers, sr = B in registers, ss = both in shared memory.
  if (A.scope() == "local.fragment") {
    ICHECK(B.scope() != "local.fragment");
    ICHECK(!trans_A)
        << "gemm_rs requires the A operand to be in non-transposed layout.";
    op_name = "tl::gemm_rs";
  } else if (B.scope() == "local.fragment") {
    op_name = "tl::gemm_sr";
  } else {
    op_name = "tl::gemm_ss";
  }
  ICHECK(C.scope() == "local.fragment");

  // Compose the template argument list. Order matters and must match the
  // tl::gemm C++ template signature.
  ss << op_name << "<" << M << ", " << N << ", " << K << ", ";
  ss << warp_m << ", " << warp_n << ", ";
  ss << trans_A << ", " << trans_B;
  auto clear_accum_bool = clear_accum.as<Bool>();
  ICHECK(clear_accum_bool.has_value())
      << "clear_accum must be a constant Bool type, got " << clear_accum;
  ss << ", " << bool(clear_accum_bool.value());
  // Strides/offsets are only part of the template on sm_75+ CUDA targets.
  if (TargetIsCuda(T.target) && (GetArchInt(T.target) >= 75)) {
    ss << ", " << stride_A << ", " << stride_B;
    ss << ", " << offset_A << ", " << offset_B;
  }
  if (TargetIsCDNA(T.target)) {
    // for cdna gemm, we need to specify kPack
    ss << ", " << kPack;
  } else if (TargetIsHopper(T.target)) {
    ss << ", " << (gemm_inst == GemmInst::kWGMMA ? "true" : "false");
  }

  // Emit wg_wait if necessary
  if (TargetIsHopper(T.target)) {
    if (wg_wait != 0) {
      ss << ", " << wg_wait;
    }
  } else if (TargetIsSm100(T.target)) {
    // NOTE On sm100, only the leading thread issues the TCGEN5MMA instruction
    // but all threads need to wait, so we emit another statement for cases
    // where wg_wait == 0.
    ICHECK(wg_wait == 0 || wg_wait == -1)
        << "wg_wait must be 0 or -1 for Sm100";
  } else {
    ICHECK(wg_wait == 0)
        << "wg_wait must be 0 for non-Hopper and non-Sm100 targets";
  }
  ss << ">";

  auto new_call = Call(DataType::Handle(), tl::tl_gemm(),
                       Array<PrimExpr>{StringImm(ss.str()), Aptr, Bptr, Cptr});
  return Evaluate(new_call);
}

570
/**
 * @brief Infer and bind target-specific memory/layout mappings for A, B, and C.
 *
 * Infers per-buffer layouts (fragment or shared-memory layouts) for this GEMM
 * operator according to the target architecture, thread bounds, warp
 * partitioning, data types, and transpose flags, then binds fragment layouts
 * to the thread range when required.
 *
 * Dispatch order: Volta; then Ampere/Turing/SM120 (and SM100 falling back to
 * plain MMA); then Hopper (WGMMA or MMA); then TCGEN5MMA; then CDNA.
 *
 * Side effects:
 * - Marks layout inference as completed (sets completed_ = true), making
 *   subsequent calls return an empty map.
 * - May abort via ICHECK on unsupported targets, invalid buffer scopes, or
 *   incompatible shape constraints.
 *
 * @param T Input layout-inference context (provides thread bounds and target).
 * @param level Inference level (unused here; part of the TileOperator API).
 * @return LayoutMap mapping A, B, and C to their inferred layouts.
 */
LayoutMap GemmNode::InferLayout(const LayoutInferArgs &T,
                                InferLevel level) const {
  if (completed_)
    return {};
  LayoutMap results;
  auto thread_range = T.thread_bounds;
  auto block_size = *as_const_int(thread_range->extent);
  GemmInst gemm_inst = GetGemmInst(block_size, T.target);
  auto [warp_m, warp_n] =
      policy->ComputeWarpPartition(M, N, block_size, T.target, gemm_inst);
  if (TargetIsVolta(T.target)) {
    ICHECK(C.scope() == "local.fragment")
        << "Volta gemm only supports C in local.fragment scope, got "
        << C.scope();
    auto fragment =
        makeGemmVoltaFragmentC(M, N, M / warp_m, N / warp_n, C->dtype.bits());
    results.Set(C, fragment->BindThreadRange(thread_range));
    if (A.scope() == "shared" || A.scope() == "shared.dyn") {
      // Layout is built from the last two buffer dims (stride, contiguous).
      int dim_A = A->shape.size();
      results.Set(A, makeGemmVoltaABLayout(*as_const_int(A->shape[dim_A - 2]),
                                           *as_const_int(A->shape[dim_A - 1]),
                                           true, !trans_A));
    } else if (A.scope() == "local.fragment") {
      ICHECK(trans_A == false);
      auto fragment = makeGemmVoltaFragmentA(M, N, K, M / warp_m, N / warp_n);
      results.Set(A, fragment->BindThreadRange(thread_range));
    } else {
      ICHECK(0);
    }

    ICHECK(B.scope() == "shared" || B.scope() == "shared.dyn");
    int dim_B = B->shape.size();
    results.Set(B, makeGemmVoltaABLayout(*as_const_int(B->shape[dim_B - 2]),
                                         *as_const_int(B->shape[dim_B - 1]),
                                         false, trans_B));
  } else if (TargetIsAmpere(T.target) || TargetIsTuring(T.target) ||
             TargetIsSM120(T.target) ||
             (TargetIsSm100(T.target) && gemm_inst == GemmInst::kMMA)) {
    ICHECK(C.scope() == "local.fragment")
        << "MMA only supports C in local.fragment scope, got " << C.scope();
    auto fragment =
        makeGemmFragmentC(M, N, M / warp_m, N / warp_n, C->dtype.bits());
    results.Set(C, fragment->BindThreadRange(thread_range));

    if (A.scope() == "shared" || A.scope() == "shared.dyn") {
      int dim_A = A->shape.size();
      const int64_t mat_stride = *as_const_int(A->shape[dim_A - 2]);
      const int64_t mat_continuous = *as_const_int(A->shape[dim_A - 1]);
      results.Set(A,
                  makeGemmABLayout(mat_stride, mat_continuous, mat_continuous,
                                   A->dtype.bits(), !trans_A));
    } else if (A.scope() == "local.fragment") {
      auto fragment = makeGemmFragmentA(M, N, K, M / warp_m, N / warp_n,
                                        A->dtype.bits(), trans_A);
      results.Set(A, fragment->BindThreadRange(thread_range));
    } else {
      ICHECK(0);
    }
    if (B.scope() == "shared" || B.scope() == "shared.dyn") {
      int dim_B = B->shape.size();
      const int64_t mat_stride = *as_const_int(B->shape[dim_B - 2]);
      const int64_t mat_continuous = *as_const_int(B->shape[dim_B - 1]);
      results.Set(B,
                  makeGemmABLayout(mat_stride, mat_continuous, mat_continuous,
                                   B->dtype.bits(), trans_B));
    } else if (B.scope() == "local.fragment") {
      auto fragment =
          makeGemmFragmentB(M, N, K, M / warp_m, N / warp_n, trans_B);
      results.Set(B, fragment->BindThreadRange(thread_range));
    } else {
      ICHECK(0);
    }
  } else if (TargetIsHopper(T.target)) {
    ICHECK(C.scope() == "local.fragment")
        << (gemm_inst == GemmInst::kWGMMA ? "WGMMA " : "MMA ")
        << "only supports C in local.fragment scope, got " << C.scope();
    auto fragment =
        gemm_inst == GemmInst::kWGMMA
            ? makeGemmFragmentCHopper(M, N, M / warp_m, N / warp_n,
                                      C->dtype.bits())
            : makeGemmFragmentC(M, N, M / warp_m, N / warp_n, C->dtype.bits());
    results.Set(C, fragment->BindThreadRange(thread_range));
    if (A.scope() == "shared" || A.scope() == "shared.dyn") {
      int dim_A = A->shape.size();
      const int64_t mat_stride = *as_const_int(A->shape[dim_A - 2]);
      const int64_t mat_continuous = *as_const_int(A->shape[dim_A - 1]);
      // WGMMA uses a reduced continuity for transposed A; presumably tied to
      // the 4-warp group split over M — confirm against layout.cc helpers.
      const int64_t continuity =
          trans_A ? 4 * mat_continuous / warp_m : mat_continuous;
      auto ABLayout =
          gemm_inst == GemmInst::kWGMMA
              ? makeGemmABLayoutHopper(mat_stride, mat_continuous, continuity,
                                       A->dtype.bits(), !trans_A)
              : makeGemmABLayout(mat_stride, mat_continuous, mat_continuous,
                                 A->dtype.bits(), !trans_A);
      results.Set(A, ABLayout);
    } else {
      auto fragment = makeGemmFragmentA(M, N, K, M / warp_m, N / warp_n,
                                        A->dtype.bits(), trans_A);
      results.Set(A, fragment->BindThreadRange(thread_range));
    }
    if (B.scope() == "shared" || B.scope() == "shared.dyn") {
      int dim_B = B->shape.size();
      const int64_t mat_stride = *as_const_int(B->shape[dim_B - 2]);
      const int64_t mat_continuous = *as_const_int(B->shape[dim_B - 1]);
      const int64_t continuity =
          trans_B ? mat_continuous : mat_continuous / warp_n;

      auto ABLayout =
          gemm_inst == GemmInst::kWGMMA
              ? makeGemmABLayoutHopper(mat_stride, mat_continuous, continuity,
                                       B->dtype.bits(), trans_B)
              : makeGemmABLayout(mat_stride, mat_continuous, mat_continuous,
                                 B->dtype.bits(), trans_B);
      results.Set(B, ABLayout);
    } else {
      auto fragment =
          makeGemmFragmentB(M, N, K, M / warp_m, N / warp_n, trans_B);
      results.Set(B, fragment->BindThreadRange(thread_range));
    }
  } else if (gemm_inst == GemmInst::kTCGEN5MMA) {
    ICHECK(C.scope() == "shared.tmem")
        << "TCGEN5MMA only supports C in shared.tmem scope, got " << C.scope();
    ICHECK(A.scope() == "shared.dyn" || A.scope() == "shared")
        << "Current TCGEN5MMA only supports A in shared.dyn scope";
    auto [can_use_tcgen5mma, meta] =
        GetTCGEN5MMAMeta(M, N, K, A->dtype, C->dtype);
    ICHECK(can_use_tcgen5mma);
    {
      int dim_A = A->shape.size();
      const int64_t mat_stride = *as_const_int(A->shape[dim_A - 2]);
      const int64_t mat_continuous = *as_const_int(A->shape[dim_A - 1]);
      results.Set(A, makeGemmABLayoutSm100(mat_stride, mat_continuous,
                                           mat_continuous, A->dtype.bits(),
                                           trans_A ? 1 : 2));
    }
    {
      int dim_B = B->shape.size();
      const int64_t mat_stride = *as_const_int(B->shape[dim_B - 2]);
      const int64_t mat_continuous = *as_const_int(B->shape[dim_B - 1]);
      const int64_t continuity = mat_continuous;
      results.Set(B,
                  makeGemmABLayoutSm100(mat_stride, mat_continuous, continuity,
                                        B->dtype.bits(), trans_B ? 2 : 1));
    }
    {
      // Build the tensor-memory layout for C from the TCGEN5MMA atom shape.
      Layout res;
      IterVar i = make_itervar("i", M);
      IterVar j = make_itervar("j", N);
      ICHECK(M % meta.atom_m == 0);
      // Atoms are laid out column-major over the (M/atom_m, N/atom_n) grid.
      PrimExpr atom_idx = FloorDiv(i, meta.atom_m) +
                          FloorDiv(j, meta.atom_n) * (M / meta.atom_m);
      PrimExpr ai = FloorMod(i, meta.atom_m); // "ai" means "atom_i"
      PrimExpr aj = FloorMod(j, meta.atom_n);
      if (meta.atom_m == 128) {
        // Layout D
        // (https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-data-path-layout-d)
        res = Layout(Array{i, j}, {ai, aj + atom_idx * meta.atom_n});
      } else if (meta.atom_m == 64) {
        // Layout E
        // (https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-data-path-layout-e)
        // since .ws variant is used About why we use .ws variant here, please
        // refer to gemm_sm100.h
        res = Layout(Array{i, j}, {FloorDiv(ai, 32) * 32 + FloorMod(ai, 32) +
                                       FloorDiv(aj, meta.atom_n / 2) * 64,
                                   FloorMod(aj, meta.atom_n / 2) +
                                       atom_idx * (meta.atom_n / 2)});
      } else if (meta.atom_m == 32) {
        // Layout G
        // (https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-data-path-layout-g)
        res = Layout(
            Array{i, j},
            {FloorMod(ai, 32) + FloorDiv(aj, meta.atom_n / 4) * 32,
             FloorMod(aj, meta.atom_n / 4) + atom_idx * (meta.atom_n / 4)});
      } else {
        ICHECK(0);
      }
      results.Set(C, res);
    }
  } else if (TargetIsCDNA(T.target)) {
    ICHECK(C.scope() == "local.fragment")
        << "CDNA gemm (FMMA) only supports C in local.fragment scope, got "
        << C.scope();
    auto fragment =
        makeGemmFragmentCCDNA(M, N, M / warp_m, N / warp_n, C->dtype.bits());
    results.Set(C, fragment->BindThreadRange(thread_range));

    if (A.scope() == "shared" || A.scope() == "shared.dyn") {
      int dim_A = A->shape.size();
      auto shared_layout = makeGemmABLayoutCDNA(
          *as_const_int(A->shape[dim_A - 2]),
          *as_const_int(A->shape[dim_A - 1]), A->dtype.bits(), kPack);
      results.Set(A, shared_layout);
    } else if (A.scope() == "local.fragment") {
      auto fragment = makeGemmFragmentACDNA(M, N, K, M / warp_m, N / warp_n,
                                            A->dtype.bits(), kPack, trans_A);
      results.Set(A, fragment->BindThreadRange(thread_range));
    } else {
      ICHECK(0);
    }
    if (B.scope() == "shared" || B.scope() == "shared.dyn") {
      int dim_B = B->shape.size();
      auto shared_layout = makeGemmABLayoutCDNA(
          *as_const_int(B->shape[dim_B - 2]),
          *as_const_int(B->shape[dim_B - 1]), B->dtype.bits(), kPack);

      results.Set(B, shared_layout);
    } else if (B.scope() == "local.fragment") {
      auto fragment =
          makeGemmFragmentB(M, N, K, M / warp_m, N / warp_n, trans_B);
      results.Set(B, fragment->BindThreadRange(thread_range));
    } else {
      ICHECK(0);
    }
  } else {
    ICHECK(0) << "Not supported " << T.target->str();
  }
  completed_ = true;
  return results;
}

// Register the `tl.gemm` TIR operator; it is opaque to TIR analyses since the
// lowered call mutates the accumulator buffer.
TIR_REGISTER_TL_OP(Gemm, gemm)
    .set_num_inputs(5)
    .set_attr<TCallEffectKind>("TCallEffectKind",
                               Integer(CallEffectKind::kOpaque));

// Printer name used when round-tripping GemmWarpPolicy through TVMScript.
TVM_REGISTER_OP("tl.GemmWarpPolicy")
    .set_attr<TScriptPrinterName>("TScriptPrinterName", "GemmWarpPolicy");

// Static-init hook: register node reflection and expose the warp-partition
// helper to the FFI (called for its side effect of caching m_warp/n_warp on
// the policy; it returns nothing).
TVM_FFI_STATIC_INIT_BLOCK() {
  GemmNode::RegisterReflection();
  GemmWarpPolicyNode::RegisterReflection();
  namespace refl = tvm::ffi::reflection;
  refl::GlobalDef().def("tl.GemmWarpPolicyComputeWarpPartition",
                        [](GemmWarpPolicy policy, int M, int N, int block_size,
                           Target target, GemmInst gemm_inst) {
                          policy->ComputeWarpPartition(M, N, block_size, target,
                                                       gemm_inst);
                        });
}
829

830
} // namespace tl
831
} // namespace tvm