/*!
 * \file tl/op/gemm.cc
 * \brief Implementation of General Matrix Multiplication (GEMM) operators
 */

#include "gemm.h"
7
#include "builtin.h"
8
#include <fstream>
9
10
11
#include <tvm/tir/builtin.h>
#include <tvm/tir/op.h>
#include <tvm/tir/op_attr_types.h>
12
#include <tvm/tir/transform.h>
13
14
15
16
17
18
19
20

#include "../target/utils.h"

namespace tvm {
namespace tl {

using namespace tir;

21
22
23
24
25
26
27
28
29
// Shape of a single TCGEN5MMA atom: the (M, N, K) tile computed by one
// instruction. A {0, 0, 0} meta is used as the "no valid atom" sentinel by
// GetTCGEN5MMAMeta below.
struct TCGEN5MMAMeta {
  int atom_m, atom_n, atom_k;
};

// Return {is_success, meta}
static inline std::pair<bool, TCGEN5MMAMeta>
GetTCGEN5MMAMeta(int M, int N, int K, DataType ab_dtype, DataType c_dtype) {
// TODO (lei) Currently not all shapes / dtypes are supported for TCGEN5MMA.
#define FAIL                                                                   \
30
  return { false, TCGEN5MMAMeta{0, 0, 0} }
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#define SUCCESS(atom_m, atom_n, atom_k)                                        \
  return {                                                                     \
    true, TCGEN5MMAMeta { atom_m, atom_n, atom_k }                             \
  }
  std::vector<int> ws_valid_atom_ns = {256, 128, 64};
  if ((ab_dtype.is_bfloat16() || ab_dtype.is_float16()) &&
      (c_dtype.is_float() && c_dtype.bits() == 32)) {
    if (K % 16 != 0)
      FAIL;
    if (M % 128 == 0) {
      for (int atom_n = 256; atom_n >= 16; atom_n -= 16)
        if (N % atom_n == 0)
          SUCCESS(128, atom_n, 16);
      FAIL;
    } else if (M % 64 == 0) {
      for (int atom_n : ws_valid_atom_ns)
        if (N % atom_n == 0)
          SUCCESS(64, atom_n, 16);
      FAIL;
    } else if (M % 32 == 0) {
      for (int atom_n : ws_valid_atom_ns)
        if (N % atom_n == 0)
          SUCCESS(32, atom_n, 16);
      FAIL;
    } else {
      FAIL;
    }
  } else if ((ab_dtype.is_float8_e4m3fn() || ab_dtype.is_float8_e5m2()) &&
             (c_dtype.is_float() && c_dtype.bits() == 32)) {
    if (K % 32 != 0)
      FAIL;
    if (M % 128 == 0) {
      for (int atom_n = 256; atom_n >= 16; atom_n -= 16)
        if (N % atom_n == 0)
          SUCCESS(128, atom_n, 32);
      FAIL;
    } else if (M % 64 == 0) {
      for (int atom_n : ws_valid_atom_ns)
        if (N % atom_n == 0)
          SUCCESS(64, atom_n, 32);
      FAIL;
    } else if (M % 32 == 0) {
      for (int atom_n : ws_valid_atom_ns)
        if (N % atom_n == 0)
          SUCCESS(32, atom_n, 32);
      FAIL;
    } else {
      FAIL;
    }
  }
  FAIL;
#undef FAIL
#undef SUCCESS
}

86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
/**
 * @brief Construct a Gemm operator from serialized TL arguments and a buffer
 * map.
 *
 * This constructor deserializes operator parameters from `args` and resolves
 * buffer references via `vmap`, populating an internal GemmNode with:
 * - device pointers for A, B, C and their corresponding Buffer objects,
 * - transpose flags for A and B,
 * - matrix dimensions M, N, K,
 * - warp allocation policy and clear_accum flag,
 * - strides and memory offsets for A and B,
 * - optional kPack (must be 1 or 2) and optional wg_wait,
 * - the mbarrier pointer and the C coordinates used by TCGEN5MMA lowering.
 *
 * The populated GemmNode is stored into the wrapper's internal `data_`.
 *
 * @param args Positional serialized arguments produced by the TL frontend:
 *   expected layout is:
 *     [Aptr, Bptr, Cptr, trans_A (Bool), trans_B (Bool),
 *      M (Int), N (Int), K (Int), policy (Int), clear_accum (Bool),
 *      stride_A (Int), stride_B (Int), offset_A (Int), offset_B (Int),
 *      kPack (Int), wg_wait (Int), mbarptr, C_coord_0, C_coord_1]
 * @param vmap Mapping from access pointer vars to Buffer objects used to
 *   resolve the Buffer corresponding to each pointer argument.
 *
 * @note If `kPack` is provided it must be 1 or 2; otherwise the constructor
 *       fails with an ICHECK (runtime assertion). No other validation is
 *       performed here.
 * @note NOTE(review): args[14] and args[15] are guarded by size checks, but
 *       args[16..18] are accessed unconditionally — the frontend presumably
 *       always passes at least 19 arguments; verify against the caller.
 */
Gemm::Gemm(Array<PrimExpr> args, BufferMap vmap) {
  ObjectPtr<GemmNode> node = make_object<GemmNode>();

  node->Aptr = args[0];
  node->Bptr = args[1];
  node->Cptr = args[2];
  // Resolve the Buffer object behind each access pointer.
  node->A = vmap[GetVarFromAccessPtr(node->Aptr)];
  node->B = vmap[GetVarFromAccessPtr(node->Bptr)];
  node->C = vmap[GetVarFromAccessPtr(node->Cptr)];
  node->trans_A = args[3].as<Bool>().value();
  node->trans_B = args[4].as<Bool>().value();
  node->M = args[5].as<IntImm>().value()->value;
  node->N = args[6].as<IntImm>().value()->value;
  node->K = args[7].as<IntImm>().value()->value;
  node->policy = GemmWarpPolicy(args[8].as<IntImm>().value()->value);
  // clear_accum stays a PrimExpr here; constancy is enforced at lowering time.
  node->clear_accum = args[9].as<PrimExpr>().value();
  node->stride_A = args[10].as<IntImm>().value()->value;
  node->stride_B = args[11].as<IntImm>().value()->value;
  node->offset_A = args[12].as<IntImm>().value()->value;
  node->offset_B = args[13].as<IntImm>().value()->value;
  if (args.size() > 14) {
    node->kPack = args[14].as<IntImm>().value()->value;
    if (node->kPack != 1 && node->kPack != 2) {
      ICHECK(false) << "kPack must be 1 or 2";
    }
  }
  if (args.size() > 15) {
    node->wg_wait = args[15].as<IntImm>().value()->value;
  }
  // mbar is only resolved when the frontend actually passed an access
  // pointer (a CallNode); otherwise it stays unset.
  node->mbarptr = args[16];
  if (node->mbarptr.as<CallNode>()) {
    node->mbar = vmap[GetVarFromAccessPtr(node->mbarptr)];
  } else {
    node->mbar = std::nullopt;
  }
  node->C_coords = Array<PrimExpr>(
      {args[17].as<PrimExpr>().value(), args[18].as<PrimExpr>().value()});
  data_ = std::move(node);
}

154
155
156
157
158
159
160
161
/**
 * @brief Create a copy of this GemmNode as a TileOperator.
 *
 * Constructs a new GemmNode by copying the current node state and returns it
 * wrapped in a Gemm TileOperator.
 *
 * @return TileOperator A Gemm operator that owns a copy of this node.
 */
162
163
164
165
166
TileOperator GemmNode::Clone() const {
  auto op = make_object<GemmNode>(*this);
  return Gemm(op);
}

167
168
169
170
171
172
173
174
175
176
/**
 * @brief Whether this GEMM may be lowered to a TCGEN5MMA instruction.
 *
 * Requires an SM100-class target, A in shared/shared.dyn/shared.tmem,
 * B in shared/shared.dyn, C in shared.tmem, and a supported
 * (M, N, K, dtype) combination per GetTCGEN5MMAMeta.
 */
bool GemmNode::AllowTCGEN5MMA(Target target) const {
  if (!TargetIsSm100(target))
    return false;
  const auto a_scope = A.scope();
  const auto b_scope = B.scope();
  if (a_scope != "shared.dyn" && a_scope != "shared" &&
      a_scope != "shared.tmem")
    return false;
  if (b_scope != "shared.dyn" && b_scope != "shared")
    return false;
  if (C.scope() != "shared.tmem")
    return false;
  return GetTCGEN5MMAMeta(M, N, K, A->dtype, C->dtype).first;
}

bool GemmNode::AllowWGMMA(int block_size, Target target) const {
177
178
  tvm::transform::PassContext ctxt = tvm::transform::PassContext::Current();

179
180
  int warp_size = TargetGetWarpSize(target);
  int num_warps = block_size / warp_size;
181
182
183
184
185
186
187
188
189
190
191
  return !ctxt->GetConfig(kDisableWGMMA, Optional<Bool>()).value_or(false) &&
         TargetIsHopper(target) && (this->M >= 64) && (num_warps % 4 == 0) &&
         CheckWGMMA();
}

/**
 * @brief Choose the MMA instruction family for this GEMM on `target`.
 *
 * Priority: TCGEN5MMA (SM100) > WGMMA (Hopper) > MFMA (CDNA) > MMA (CUDA).
 * Aborts via ICHECK on any other target.
 *
 * @param block_size Number of threads in the thread block.
 * @param target Compilation target.
 * @return GemmInst The selected instruction family.
 */
GemmInst GemmNode::GetGemmInst(int block_size, Target target) const {
  if (AllowTCGEN5MMA(target)) {
    return GemmInst::kTCGEN5MMA;
  }
  if (AllowWGMMA(block_size, target)) {
    return GemmInst::kWGMMA;
  }
  if (TargetIsCDNA(target)) {
    return GemmInst::kMFMA;
  }
  if (TargetIsCuda(target)) {
    return GemmInst::kMMA;
  }
  ICHECK(0) << "Unsupported target for gemm: " << target->str();
  // Unreachable: ICHECK above aborts. Present to silence -Wreturn-type and
  // avoid UB (the original fell off the end of a non-void function here).
  return GemmInst::kMMA;
}

202
203
/**
 * @brief Partition the thread block's warps across the M and N dimensions.
 *
 * Given the GEMM extents, the block size, and the selected instruction
 * family, computes (m_warp, n_warp) such that m_warp * n_warp == num_warps,
 * honoring the warp policy (FullRow / FullCol / Square). For WGMMA, warps
 * are allocated in whole warp-groups of 4 on the M dimension. For
 * TCGEN5MMA the partition is irrelevant and {1, num_warps} is returned.
 *
 * Side effect: caches the result in this->m_warp / this->n_warp.
 *
 * Aborts via ICHECK when M/N are not divisible by the per-warp tile
 * (16 rows / 8 cols), or when no partition with m_warp * n_warp ==
 * num_warps can be found.
 */
std::pair<int, int> GemmWarpPolicyNode::ComputeWarpPartition(
    int M, int N, int block_size, Target target, GemmInst gemm_inst) const {
  int num_warps = block_size / TargetGetWarpSize(target);
  if (gemm_inst == GemmInst::kTCGEN5MMA) {
    return {1, num_warps}; // TCGEN5MMA doesn't care about warp partitioning
  }

  int m_warp = 1, n_warp = 1;
  constexpr int kMPerWarp = 16; // Rows processed by a single warp
  constexpr int kNPerWarp = 8;  // Columns processed by a single warp
  ICHECK(M % kMPerWarp == 0)
      << "M must be divisible by " << kMPerWarp << ", but got " << M;
  ICHECK(N % kNPerWarp == 0)
      << "N must be divisible by " << kNPerWarp << ", but got " << N;

  if (gemm_inst == GemmInst::kWGMMA) {
    ICHECK(num_warps % 4 == 0) << "Warp-Group MMA requires 128×k threads.";

    constexpr int kGroup = 4; // Number of warps in a warp-group

    m_warp = kGroup; // Initially, only one warp-group on M dimension
    n_warp = num_warps / m_warp; // Rest all on N dimension

    if (this->isFullRow()) {
      // Try to put as many warp-groups as possible on M dimension
      // (decreasing multiples of 4, ensuring divisibility by M)
      for (int cand = num_warps; cand >= kGroup; cand -= kGroup) {
        if (M % (cand * kMPerWarp) == 0) {
          m_warp = cand;
          n_warp = num_warps / m_warp;
          break;
        }
      }
    } else if (this->isFullCol()) {
      // Try to use warps on N dimension; if N is not divisible, split excess
      // groups to M
      int cand_n = n_warp;                 // Initially assume all on N
      if (N % (cand_n * kNPerWarp) != 0) { // N direction division fails
        int max_n = N / kNPerWarp;
        // Find a feasible n_warp from max possible downwards, ensuring
        // num_warps/n_warp is multiple of 4
        for (int n = std::min(cand_n, max_n); n >= 1; --n) {
          if (num_warps % n == 0 && (num_warps / n) % kGroup == 0) {
            n_warp = n;
            m_warp = num_warps / n_warp;
            break;
          }
        }
      }
    } else if (this->isSquare()) {
      // Exhaustive search, but m must be multiple of 4
      int max_m = M / kMPerWarp;
      int max_n = N / kNPerWarp;

      // Ideal per-warp aspect ratio; the candidate whose work-per-warp
      // ratio is closest to M/N wins.
      float ideal = N > 0 ? static_cast<float>(M) / N : 1.f;

      float best_score = std::numeric_limits<float>::max();
      int best_m = kGroup, best_n = n_warp;

      for (int m = kGroup; m <= num_warps && m <= max_m; m += kGroup) {
        if (num_warps % m)
          continue;
        int n = num_warps / m;
        if (n > max_n)
          continue;

        float m_per_warp = static_cast<float>(M) / (m * kMPerWarp);
        float n_per_warp = static_cast<float>(N) / (n * kNPerWarp);
        float score = std::abs(m_per_warp / n_per_warp - ideal);

        if (score < best_score) {
          best_score = score;
          best_m = m;
          best_n = n;
        }
      }
      m_warp = best_m;
      n_warp = best_n;
    } else {
      ICHECK(0) << "Unknown GemmWarpPolicy";
    }

    ICHECK(m_warp * n_warp == num_warps)
        << "m_warp * n_warp must equal num_warps, m_warp: " << m_warp
        << ", n_warp: " << n_warp << ", num_warps: " << num_warps;

    // Store the computed values in the object's member variables
    this->m_warp = m_warp;
    this->n_warp = n_warp;

    return {m_warp, n_warp};
  }

  // Non-WGMMA (plain MMA / MFMA) partitioning below.
  if (this->isFullRow()) {
    // Try to partition M first
    m_warp = num_warps;
    n_warp = 1;

    // If M cannot be evenly divided by m_warp*16, try to split remaining warps
    // to N
    if (M % (m_warp * kMPerWarp) != 0) {
      // Calculate how many warps we can use for M
      int max_m_warps = M / kMPerWarp;
      m_warp = max_m_warps;
      // Use remaining warps for N
      // NOTE(review): if num_warps is not divisible by m_warp here, the
      // m_warp * n_warp == num_warps ICHECK below will fire — presumably
      // callers only use FullRow with compatible shapes; verify.
      n_warp = num_warps / m_warp;
      if (n_warp == 0)
        n_warp = 1;
    }
  } else if (this->isFullCol()) {
    // Try to partition N first
    m_warp = 1;
    n_warp = num_warps;

    // If N cannot be evenly divided by n_warp*8, try to split remaining warps
    // to M
    if (N % (n_warp * kNPerWarp) != 0) {
      // Calculate how many warps we can use for N
      int max_n_warps = N / kNPerWarp;
      n_warp = max_n_warps;
      // Use remaining warps for M
      m_warp = num_warps / n_warp;
      if (m_warp == 0)
        m_warp = 1;
    }
  } else if (this->isSquare()) {
    // First calculate the maximum possible warps for each dimension
    int max_m_warps =
        M / kMPerWarp; // Each warp needs at least 16 elements in M

    // Calculate the ideal ratio of M/N warps based on the matrix dimensions
    float ideal_ratio = 1.0f;
    if (N > 0) {
      ideal_ratio = static_cast<float>(M) / N;
    }

    // Try to find the best balanced partition
    int best_m = 1;
    int best_n = 1;
    float best_balance = std::numeric_limits<float>::max();
    // Try all possible combinations that satisfy the constraints
    for (int m = 1; m <= max_m_warps && m <= num_warps; m++) {
      int n = num_warps / m;

      // Calculate how balanced this partition is
      float m_per_warp = static_cast<float>(M) / (m * kMPerWarp);
      float n_per_warp = static_cast<float>(N) / (n * kNPerWarp);
      // m_per_warp and n_per_warp must be greater than 1
      if (m_per_warp < 1 || n_per_warp < 1)
        continue;
      // m * n must equal num_warps
      if (m * n != num_warps)
        continue;

      float balance = std::abs(m_per_warp / n_per_warp - ideal_ratio);

      if (balance < best_balance) {
        best_balance = balance;
        best_m = m;
        best_n = n;
      }
    }

    m_warp = best_m;
    n_warp = best_n;
  } else {
    ICHECK(0) << "Unknown GemmWarpPolicy";
  }
  ICHECK(m_warp * n_warp == num_warps)
      << "m_warp * n_warp must equal num_warps, m_warp: " << m_warp
      << ", n_warp: " << n_warp << ", num_warps: " << num_warps;

  // Store the computed values in the object's member variables
  this->m_warp = m_warp;
  this->n_warp = n_warp;

  return {m_warp, n_warp};
}

381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
/**
 * @brief Checks whether WGMMA (warp-group MMA) can be used for this GEMM.
 *
 * Evaluates device-memory placement, data-type combinations, transpose flags,
 * and K divisibility constraints required for the Hopper WGMMA code path.
 *
 * The check returns true only when:
 * - B resides in shared memory ("shared" or "shared.dyn"); and
 * - (C, A, B) dtypes match one of the supported combinations below and K
 *   satisfies the required alignment; and
 * - for combinations that require specific orientations, A is not transposed
 *   and B is transposed.
 *
 * Supported combinations and constraints:
 * - C=float16:
 *   - A=float16, B=float16: K % 16 == 0
397
398
 *   - Various float8 mixes (e4m3/e5m2): require (!trans_A && trans_B) and K %
 * 32 == 0
399
400
401
402
403
404
 * - C=float32:
 *   - A=float16, B=float16: K % 16 == 0
 *   - A=bfloat16, B=bfloat16: K % 16 == 0
 *   - A=float32, B=float32: require (!trans_A && trans_B) and K % 8 == 0
 *   - Various float8 mixes: require (!trans_A && trans_B) and K % 32 == 0
 * - C=int32:
405
406
 *   - 8-bit integer combinations (Int8/UInt8): require (!trans_A && trans_B)
 * and K % 32 == 0
407
408
409
410
 *
 * @return true if WGMMA is supported for the current buffers, dtypes, and
 *         transpose/shape constraints; false otherwise.
 */
411
bool GemmNode::CheckWGMMA() const {
412
413
414
415
  if (B.scope() != "shared.dyn" && B.scope() != "shared") {
    return false;
  }

416
417
418
  if (C->dtype == DataType::Float(16)) {
    if (A->dtype == DataType::Float(16) && B->dtype == DataType::Float(16))
      return K % 16 == 0;
419
    else if (A->dtype.is_float8_e4m3() && B->dtype.is_float8_e4m3())
420
      return (!trans_A) && trans_B && K % 32 == 0;
421
    else if (A->dtype.is_float8_e4m3() && B->dtype.is_float8_e5m2())
422
      return (!trans_A) && trans_B && K % 32 == 0;
423
    else if (A->dtype.is_float8_e5m2() && B->dtype.is_float8_e4m3())
424
      return (!trans_A) && trans_B && K % 32 == 0;
425
    else if (A->dtype.is_float8_e5m2() && B->dtype.is_float8_e5m2())
426
427
428
429
430
431
432
433
434
435
436
      return (!trans_A) && trans_B && K % 32 == 0;
    else
      return false;
  } else if (C->dtype == DataType::Float(32)) {
    if (A->dtype == DataType::Float(16) && B->dtype == DataType::Float(16))
      return K % 16 == 0;
    else if (A->dtype == DataType::BFloat(16) &&
             B->dtype == DataType::BFloat(16))
      return K % 16 == 0;
    else if (A->dtype == DataType::Float(32) && B->dtype == DataType::Float(32))
      return (!trans_A) && trans_B && K % 8 == 0;
437
    else if (A->dtype.is_float8_e4m3() && B->dtype.is_float8_e4m3())
438
      return (!trans_A) && trans_B && K % 32 == 0;
439
    else if (A->dtype.is_float8_e4m3() && B->dtype.is_float8_e5m2())
440
      return (!trans_A) && trans_B && K % 32 == 0;
441
    else if (A->dtype.is_float8_e5m2() && B->dtype.is_float8_e4m3())
442
      return (!trans_A) && trans_B && K % 32 == 0;
443
    else if (A->dtype.is_float8_e5m2() && B->dtype.is_float8_e5m2())
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
      return (!trans_A) && trans_B && K % 32 == 0;
    else
      return false;
  } else if (C->dtype == DataType::Int(32)) {
    if (A->dtype == DataType::Int(8) && B->dtype == DataType::Int(8))
      return (!trans_A) && trans_B && K % 32 == 0;
    else if (A->dtype == DataType::Int(8) && B->dtype == DataType::UInt(8))
      return (!trans_A) && trans_B && K % 32 == 0;
    else if (A->dtype == DataType::UInt(8) && B->dtype == DataType::Int(8))
      return (!trans_A) && trans_B && K % 32 == 0;
    else if (A->dtype == DataType::UInt(8) && B->dtype == DataType::UInt(8))
      return (!trans_A) && trans_B && K % 32 == 0;
    else
      return false;
  } else {
    return false;
  }
}

463
464
465
466
467
468
469
470
471
472
473
474
475
476
/**
 * @brief Parse and return the numeric GPU architecture from a Target's "arch"
 * attribute.
 *
 * Examines the target's "arch" string and, if it matches the pattern
 * "sm_<num>", returns <num> as an int. If the attribute is present but does not
 * match that pattern, returns 0.
 *
 * Preconditions: the target must have an "arch" attribute (this is checked via
 * ICHECK).
 *
 * @return int The parsed architecture number (e.g., 80 for "sm_80"), or 0 if
 * the arch string does not match "sm_<num>".
 */
477
478
479
480
static int GetArchInt(Target target) {
  int arch_int = 0;
  auto s = target->GetAttr<String>("arch");
  ICHECK(s.defined());
481
482
483
  std::string arch = s.value();
  if (arch.rfind("sm_", 0) == 0) {
    arch_int = std::stoi(arch.substr(3));
484
485
486
487
488
489
  } else {
    arch_int = 0;
  }
  return arch_int;
}

490
491
492
493
494
495
496
497
498
499
500
501
502
/**
 * @brief Lower the GEMM operator to a TL TIR call expression.
 *
 * Constructs a tl::gemm call string parameterized by M, N, K, warp partition,
 * transpose flags, accumulation clearing, target-specific stride/offset/kPack
 * and optional workgroup wait value, then returns an Evaluate(call) node
 * invoking tl::tl_gemm with the composed string and the A/B/C buffer handles.
 * TCGEN5MMA targets take a separate early-return path that emits a
 * call_extern guarded so only the leading warp issues the instruction.
 *
 * @param T Contains lowering context including thread bounds and target.
 * @param analyzer Optional arithmetic analyzer used by lowering (may be
 * nullptr).
 * @return Stmt A TIR statement representing the evaluated TL GEMM call.
 */
Stmt GemmNode::Lower(const LowerArgs &T, arith::Analyzer *analyzer) const {
  auto block_size = *as_const_int(T.thread_bounds->extent);
  GemmInst gemm_inst = GetGemmInst(block_size, T.target);
  auto [warp_m, warp_n] =
      policy->ComputeWarpPartition(M, N, block_size, T.target, gemm_inst);

  // The C++ template instantiation string for the generated call.
  std::stringstream ss;
  std::string op_name;

  if (gemm_inst == GemmInst::kTCGEN5MMA) {
    auto [can_use_tcgen5mma, meta] =
        GetTCGEN5MMAMeta(M, N, K, A->dtype, C->dtype);
    ICHECK(can_use_tcgen5mma);
    ICHECK(B.scope() == "shared.dyn" || B.scope() == "shared");
    ICHECK(C.scope() == "shared.tmem");
    ICHECK(mbar.has_value()) << "mbar must be provided for TCGEN5MMA";
    // _ts: A in tensor memory; _ss: A in shared memory.
    if (A.scope() == "shared.tmem") {
      op_name = "tl::tcgen5mma_gemm_ts";
    } else if (A.scope() == "shared.dyn" || A.scope() == "shared") {
      op_name = "tl::tcgen5mma_gemm_ss";
    } else {
      ICHECK(0)
          << "Unsupported A scope for TCGEN5MMA: "
          << A.scope(); // If this is triggered, it means Tilelang has bugs.
    }
    ICHECK(wg_wait == -1)
        << "Currently only wg_wait == -1 is supported for TCGEN5MMA. Please "
           "use "
           "wg_wait = -1 and manually synchronize with mbarrier.";

    // Only fp32 accumulation is currently mapped to a C++ type name.
    std::string accum_dtype = "";
    if (C->dtype.is_float()) {
      if (C->dtype.bits() == 32) {
        accum_dtype = "float";
      }
    }
    ICHECK(!accum_dtype.empty())
        << "Unsupported C dtype for TCGEN5MMA: " << C->dtype;
    ss << op_name << "<" << M << ", " << N << ", " << K << ", ";
    ss << meta.atom_m << ", " << meta.atom_n << ", " << meta.atom_k << ", ";
    ss << trans_A << ", " << trans_B << ", ";
    ss << accum_dtype;
    ss << ">";

    // C may have been remapped (e.g. by shared-memory planning); load
    // through the remapped buffer at the caller-provided coordinates.
    auto C_buffer = T.buffer_remap.count(C) ? T.buffer_remap[C] : C;
    Array<PrimExpr> new_args;
    new_args.push_back(StringImm(ss.str()));
    new_args.push_back(Aptr);
    new_args.push_back(Bptr);
    new_args.push_back(BufferLoad(C_buffer, C_coords));
    new_args.push_back(mbarptr);
    new_args.push_back(clear_accum);
    auto new_call = Call(DataType::Handle(), builtin::call_extern(), new_args);

    // Since TCGEN5MMA atoms provided by CUTLASS always have an internal
    // `elect_one_sync()`, we check if we are calling it using full warps
    constexpr int warp_size = 32;
    ICHECK(
        analyzer->CanProveEqual(FloorMod(T.thread_bounds->min, warp_size), 0) &&
        analyzer->CanProveEqual(FloorMod(T.thread_bounds->extent, warp_size),
                                0))
        << "TCGEN5MMA requires thread bounds to be multiples of warp size (32) "
           "and aligned to warps.";
    if (analyzer->CanProveEqual(T.thread_bounds->extent, warp_size)) {
      // If the thread bounds is exactly one warp, we can use the original call
      return Evaluate(new_call);
    } else {
      // Add an if-else clause so only the first warp in range issues the MMA.
      auto tcgen5mma_call =
          IfThenElse(EQ(FloorDiv(T.thread_var, warp_size),
                        FloorDiv(T.thread_bounds->min, warp_size)),
                     Evaluate(new_call));
      return tcgen5mma_call;
    }
  }

  // Non-TCGEN5MMA path: pick the variant by operand residency
  // (r = register fragment, s = shared memory).
  if (A.scope() == "local.fragment") {
    ICHECK(B.scope() != "local.fragment");
    op_name = "tl::gemm_rs";
  } else if (B.scope() == "local.fragment") {
    op_name = "tl::gemm_sr";
  } else {
    op_name = "tl::gemm_ss";
  }
  ICHECK(C.scope() == "local.fragment");

  ss << op_name << "<" << M << ", " << N << ", " << K << ", ";
  ss << warp_m << ", " << warp_n << ", ";
  ss << trans_A << ", " << trans_B;
  // clear_accum must fold to a compile-time constant for template emission.
  auto clear_accum_bool = clear_accum.as<Bool>();
  ICHECK(clear_accum_bool.has_value())
      << "clear_accum must be a constant Bool type, got " << clear_accum;
  ss << ", " << bool(clear_accum_bool.value());
  // Turing (sm_75) and newer CUDA targets take explicit strides/offsets.
  if (TargetIsCuda(T.target) && (GetArchInt(T.target) >= 75)) {
    ss << ", " << stride_A << ", " << stride_B;
    ss << ", " << offset_A << ", " << offset_B;
  }
  if (TargetIsCDNA(T.target)) {
    // for cdna gemm, we need to specify kPack
    ss << ", " << kPack;
  } else if (TargetIsHopper(T.target)) {
    ss << ", " << (gemm_inst == GemmInst::kWGMMA ? "true" : "false");
  }

  // Emit wg_wait if necessary
  if (TargetIsHopper(T.target)) {
    if (wg_wait != 0) {
      ss << ", " << wg_wait;
    }
  } else if (TargetIsSm100(T.target)) {
    // NOTE On sm100, only the leading thread issues the TCGEN5MMA instruction
    // but all threads need to wait, so we emit another statement for cases
    // where wg_wait == 0.
    ICHECK(wg_wait == 0 || wg_wait == -1)
        << "wg_wait must be 0 or -1 for Sm100";
  } else {
    ICHECK(wg_wait == 0)
        << "wg_wait must be 0 for non-Hopper and non-Sm100 targets";
  }
  ss << ">";

  auto new_call = Call(DataType::Handle(), tl::tl_gemm(),
                       Array<PrimExpr>{StringImm(ss.str()), Aptr, Bptr, Cptr});
  return Evaluate(new_call);
}

629
/**
630
 * @brief Infer and bind target-specific memory/layout mappings for A, B, and C.
631
 *
632
633
634
635
 * Infers per-buffer layouts (fragment or shared-memory layouts) for this GEMM
 * operator according to the target architecture, thread bounds, warp
 * partitioning, data types, and transpose flags, then binds fragment layouts
 * to the thread range when required.
636
637
 *
 * Preconditions:
638
 * - C.scope() == "local.fragment"
639
 *
640
641
 * Side effects:
 * - Marks layout inference as completed (sets completed_ = true).
642
643
644
 * - May abort via ICHECK on unsupported targets, invalid buffer scopes, or
 *   incompatible shape constraints.
 *
645
646
 * @param T Input layout-inference context (provides thread bounds and target).
 * @return LayoutMap mapping A, B, and C to their inferred layouts.
647
 */
648
649
LayoutMap GemmNode::InferLayout(const LayoutInferArgs &T,
                                InferLevel level) const {
650
651
  if (completed_)
    return {};
652
  LayoutMap results;
653
654
  auto thread_range = T.thread_bounds;
  auto block_size = *as_const_int(thread_range->extent);
655
  GemmInst gemm_inst = GetGemmInst(block_size, T.target);
656
657
  auto [warp_m, warp_n] =
      policy->ComputeWarpPartition(M, N, block_size, T.target, gemm_inst);
658
  if (TargetIsVolta(T.target)) {
659
660
661
    ICHECK(C.scope() == "local.fragment")
        << "Volta gemm only supports C in local.fragment scope, got "
        << C.scope();
662
663
    auto fragment =
        makeGemmVoltaFragmentC(M, N, M / warp_m, N / warp_n, C->dtype.bits());
664
    results.Set(C, fragment->BindThreadRange(thread_range));
665
    if (A.scope() == "shared" || A.scope() == "shared.dyn") {
666
667
668
      int dim_A = A->shape.size();
      results.Set(A, makeGemmVoltaABLayout(*as_const_int(A->shape[dim_A - 2]),
                                           *as_const_int(A->shape[dim_A - 1]),
669
                                           true, !trans_A));
670
671
    } else if (A.scope() == "local.fragment") {
      ICHECK(trans_A == false);
672
      auto fragment = makeGemmVoltaFragmentA(M, N, K, M / warp_m, N / warp_n);
673
      results.Set(A, fragment->BindThreadRange(thread_range));
674
675
676
677
678
    } else {
      ICHECK(0);
    }

    ICHECK(B.scope() == "shared" || B.scope() == "shared.dyn");
679
680
681
    int dim_B = B->shape.size();
    results.Set(B, makeGemmVoltaABLayout(*as_const_int(B->shape[dim_B - 2]),
                                         *as_const_int(B->shape[dim_B - 1]),
682
                                         false, trans_B));
683
  } else if (TargetIsAmpere(T.target) || TargetIsTuring(T.target) ||
684
685
686
687
688
             TargetIsSM120(T.target) ||
             (TargetIsSm100(T.target) && gemm_inst == GemmInst::kMMA)) {
    ICHECK(C.scope() == "local.fragment")
        << "MMA only supports C in local.fragment scope, got " << C.scope();

689
690
    auto fragment =
        makeGemmFragmentC(M, N, M / warp_m, N / warp_n, C->dtype.bits());
691
    results.Set(C, fragment->BindThreadRange(thread_range));
692
693

    if (A.scope() == "shared" || A.scope() == "shared.dyn") {
694
695
696
      int dim_A = A->shape.size();
      const int64_t mat_stride = *as_const_int(A->shape[dim_A - 2]);
      const int64_t mat_continuous = *as_const_int(A->shape[dim_A - 1]);
697
698
      results.Set(A,
                  makeGemmABLayout(mat_stride, mat_continuous, mat_continuous,
699
                                   A->dtype.bits(), !trans_A));
700
    } else if (A.scope() == "local.fragment") {
701
702
      auto fragment = makeGemmFragmentA(M, N, K, M / warp_m, N / warp_n,
                                        A->dtype.bits(), trans_A);
703
      results.Set(A, fragment->BindThreadRange(thread_range));
704
705
706
707
    } else {
      ICHECK(0);
    }
    if (B.scope() == "shared" || B.scope() == "shared.dyn") {
708
709
710
      int dim_B = B->shape.size();
      const int64_t mat_stride = *as_const_int(B->shape[dim_B - 2]);
      const int64_t mat_continuous = *as_const_int(B->shape[dim_B - 1]);
711
712
      results.Set(B,
                  makeGemmABLayout(mat_stride, mat_continuous, mat_continuous,
713
                                   B->dtype.bits(), trans_B));
714
    } else if (B.scope() == "local.fragment") {
715
716
      auto fragment =
          makeGemmFragmentB(M, N, K, M / warp_m, N / warp_n, trans_B);
717
      results.Set(B, fragment->BindThreadRange(thread_range));
718
719
720
721
    } else {
      ICHECK(0);
    }
  } else if (TargetIsHopper(T.target)) {
722
723
724
    ICHECK(C.scope() == "local.fragment")
        << (gemm_inst == GemmInst::kWGMMA ? "WGMMA " : "MMA ")
        << "only supports C in local.fragment scope, got " << C.scope();
725
    auto fragment =
726
        gemm_inst == GemmInst::kWGMMA
727
728
729
            ? makeGemmFragmentCHopper(M, N, M / warp_m, N / warp_n,
                                      C->dtype.bits())
            : makeGemmFragmentC(M, N, M / warp_m, N / warp_n, C->dtype.bits());
730
    results.Set(C, fragment->BindThreadRange(thread_range));
731
    if (A.scope() == "shared" || A.scope() == "shared.dyn") {
732
733
734
      int dim_A = A->shape.size();
      const int64_t mat_stride = *as_const_int(A->shape[dim_A - 2]);
      const int64_t mat_continuous = *as_const_int(A->shape[dim_A - 1]);
735
      const int64_t continuity =
736
          trans_A ? 4 * mat_continuous / warp_m : mat_continuous;
737
      auto ABLayout =
738
          gemm_inst == GemmInst::kWGMMA
739
              ? makeGemmABLayoutHopper(mat_stride, mat_continuous, continuity,
740
                                       A->dtype.bits(), !trans_A)
741
              : makeGemmABLayout(mat_stride, mat_continuous, mat_continuous,
742
                                 A->dtype.bits(), !trans_A);
743
      results.Set(A, ABLayout);
744
    } else {
745
746
      auto fragment = makeGemmFragmentA(M, N, K, M / warp_m, N / warp_n,
                                        A->dtype.bits(), trans_A);
747
      results.Set(A, fragment->BindThreadRange(thread_range));
748
749
    }
    if (B.scope() == "shared" || B.scope() == "shared.dyn") {
750
751
752
      int dim_B = B->shape.size();
      const int64_t mat_stride = *as_const_int(B->shape[dim_B - 2]);
      const int64_t mat_continuous = *as_const_int(B->shape[dim_B - 1]);
753
754
      const int64_t continuity =
          trans_B ? mat_continuous : mat_continuous / warp_n;
755

756
      auto ABLayout =
757
          gemm_inst == GemmInst::kWGMMA
758
              ? makeGemmABLayoutHopper(mat_stride, mat_continuous, continuity,
759
                                       B->dtype.bits(), trans_B)
760
              : makeGemmABLayout(mat_stride, mat_continuous, mat_continuous,
761
                                 B->dtype.bits(), trans_B);
762
      results.Set(B, ABLayout);
763
    } else {
764
765
766
      auto fragment =
          makeGemmFragmentB(M, N, K, M / warp_m, N / warp_n, trans_B);
      results.Set(B, fragment->BindThreadRange(thread_range));
    }
  } else if (gemm_inst == GemmInst::kTCGEN5MMA) {
    ICHECK(C.scope() == "shared.tmem")
        << "TCGEN5MMA only supports C in shared.tmem scope, got " << C.scope();
    ICHECK(A.scope() == "shared.dyn" || A.scope() == "shared")
        << "Current TCGEN5MMA only supports A in shared.dyn scope";
    auto [can_use_tcgen5mma, meta] =
        GetTCGEN5MMAMeta(M, N, K, A->dtype, C->dtype);
    ICHECK(can_use_tcgen5mma);
    {
      int dim_A = A->shape.size();
      const int64_t mat_stride = *as_const_int(A->shape[dim_A - 2]);
      const int64_t mat_continuous = *as_const_int(A->shape[dim_A - 1]);
      results.Set(A, makeGemmABLayoutSm100(mat_stride, mat_continuous,
                                           mat_continuous, A->dtype.bits(),
                                           trans_A ? 1 : 2));
    }
    {
      int dim_B = B->shape.size();
      const int64_t mat_stride = *as_const_int(B->shape[dim_B - 2]);
      const int64_t mat_continuous = *as_const_int(B->shape[dim_B - 1]);
      const int64_t continuity = mat_continuous;
      results.Set(B,
                  makeGemmABLayoutSm100(mat_stride, mat_continuous, continuity,
                                        B->dtype.bits(), trans_B ? 2 : 1));
    }
    {
      Layout res;
      IterVar i = make_itervar("i", M);
      IterVar j = make_itervar("j", N);
      ICHECK(M % meta.atom_m == 0);
      PrimExpr atom_idx = FloorDiv(i, meta.atom_m) +
                          FloorDiv(j, meta.atom_n) * (M / meta.atom_m);
      PrimExpr ai = FloorMod(i, meta.atom_m); // "ai" means "atom_i"
      PrimExpr aj = FloorMod(j, meta.atom_n);
      if (meta.atom_m == 128) {
        // Layout D
        // (https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-data-path-layout-d)
        res = Layout(Array{i, j}, {ai, aj + atom_idx * meta.atom_n});
      } else if (meta.atom_m == 64) {
        // Layout E
        // (https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-data-path-layout-e)
        // since .ws variant is used About why we use .ws variant here, please
        // refer to gemm_sm100.h
        res = Layout(Array{i, j}, {FloorDiv(ai, 32) * 32 + FloorMod(ai, 32) +
                                       FloorDiv(aj, meta.atom_n / 2) * 64,
                                   FloorMod(aj, meta.atom_n / 2) +
                                       atom_idx * (meta.atom_n / 2)});
      } else if (meta.atom_m == 32) {
        // Layout G
        // (https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-data-path-layout-g)
        res = Layout(
            Array{i, j},
            {FloorMod(ai, 32) + FloorDiv(aj, meta.atom_n / 4) * 32,
             FloorMod(aj, meta.atom_n / 4) + atom_idx * (meta.atom_n / 4)});
      } else {
        ICHECK(0);
      }
      results.Set(C, res);
    }
  } else if (TargetIsCDNA(T.target)) {
    ICHECK(C.scope() == "local.fragment")
        << "CDNA gemm (FMMA) only supports C in local.fragment scope, got "
        << C.scope();
    if (TargetIsDCU(T.target)) {
      auto fragment =
          makeGemmFragmentCDCU(M, N, M / warp_m, N / warp_n, C->dtype.bits());
      results.Set(C, fragment->BindThreadRange(thread_range));
    } else {
      auto fragment =
          makeGemmFragmentCCDNA(M, N, M / warp_m, N / warp_n, C->dtype.bits());
      results.Set(C, fragment->BindThreadRange(thread_range));
    }

    if (A.scope() == "shared" || A.scope() == "shared.dyn") {
      int dim_A = A->shape.size();
      auto shared_layout = makeGemmABLayoutCDNA(
          *as_const_int(A->shape[dim_A - 2]),
          *as_const_int(A->shape[dim_A - 1]), A->dtype.bits(), kPack);
      results.Set(A, shared_layout);
    } else if (A.scope() == "local.fragment") {
      auto fragment = makeGemmFragmentACDNA(M, N, K, M / warp_m, N / warp_n,
                                            A->dtype.bits(), kPack, trans_A);
      results.Set(A, fragment->BindThreadRange(thread_range));
    } else {
      ICHECK(0);
    }
    if (B.scope() == "shared" || B.scope() == "shared.dyn") {
      int dim_B = B->shape.size();
      auto shared_layout = makeGemmABLayoutCDNA(
          *as_const_int(B->shape[dim_B - 2]),
          *as_const_int(B->shape[dim_B - 1]), B->dtype.bits(), kPack);

      results.Set(B, shared_layout);
    } else if (B.scope() == "local.fragment") {
      auto fragment =
          makeGemmFragmentB(M, N, K, M / warp_m, N / warp_n, trans_B);
      results.Set(B, fragment->BindThreadRange(thread_range));
    } else {
      ICHECK(0);
    }
  } else {
    ICHECK(0) << "Not supported " << T.target->str();
  }
  completed_ = true;
  return results;
}

// Register the TileLang `tl.gemm` operator.
// Five inputs (A, B, C plus configuration operands are packed into the call);
// marked kOpaque because the op mutates its accumulator buffer and must not
// be reordered or eliminated by TIR analyses.
TIR_REGISTER_TL_OP(Gemm, gemm)
    .set_num_inputs(5)
    .set_attr<TCallEffectKind>("TCallEffectKind",
                               Integer(CallEffectKind::kOpaque));

// Make "tl.GemmWarpPolicy" print as `GemmWarpPolicy` in TVMScript output.
TVM_REGISTER_OP("tl.GemmWarpPolicy")
    .set_attr<TScriptPrinterName>("TScriptPrinterName", "GemmWarpPolicy");

// Static initialization: register reflection metadata for GemmNode and
// GemmWarpPolicyNode, and expose GemmWarpPolicy::ComputeWarpPartition through
// the FFI so the Python frontend can compute warp partitions.
TVM_FFI_STATIC_INIT_BLOCK({
  GemmNode::RegisterReflection();
  GemmWarpPolicyNode::RegisterReflection();
  namespace refl = tvm::ffi::reflection;
  refl::GlobalDef().def("tl.GemmWarpPolicyComputeWarpPartition",
                        [](GemmWarpPolicy policy, int M, int N, int block_size,
                           Target target, GemmInst gemm_inst) {
                          // Mutates `policy` in place; nothing is returned to
                          // the FFI caller.
                          policy->ComputeWarpPartition(M, N, block_size, target,
                                                       gemm_inst);
                        });
});

} // namespace tl
} // namespace tvm