/*!
 * \file tl/op/gemm.cc
 * \brief Implementation of General Matrix Multiplication (GEMM) operators
 */

#include "gemm.h"

8
#include "builtin.h"
9
10
11
#include <tvm/tir/builtin.h>
#include <tvm/tir/op.h>
#include <tvm/tir/op_attr_types.h>
12
#include <tvm/tir/transform.h>
13
14

#include "../target/utils.h"
15
#include "tcgen5_meta.h"
16
17
18
19
20
21

namespace tvm {
namespace tl {

using namespace tir;

22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
/**
 * @brief Construct a Gemm operator from serialized TL arguments and a buffer
 * map.
 *
 * This constructor deserializes operator parameters from `args` and resolves
 * buffer references via `vmap`, populating an internal GemmNode with:
 * - device pointers for A, B, C and their corresponding Buffer objects,
 * - transpose flags for A and B,
 * - matrix dimensions M, N, K,
 * - warp allocation policy and clear_accum flag,
 * - strides and memory offsets for A and B,
 * - optional kPack (must be 1 or 2) and optional wg_wait.
 *
 * The populated GemmNode is stored into the wrapper's internal `data_`.
 *
 * @param args Positional serialized arguments produced by the TL frontend:
 *   expected layout is:
 *     [Aptr, Bptr, Cptr, trans_A (Bool), trans_B (Bool),
 *      M (Int), N (Int), K (Int), policy (Int), clear_accum (Bool),
 *      stride_A (Int), stride_B (Int), offset_A (Int), offset_B (Int),
 *      (optional) kPack (Int), (optional) wg_wait (Int)]
 * @param vmap Mapping from access pointer vars to Buffer objects used to
 *   resolve the Buffer corresponding to each pointer argument.
 *
46
 * @note If `kPack` is provided it must be 1; otherwise the constructor
47
48
49
 *       fails with an ICHECK (runtime assertion). No other validation is
 *       performed here.
 */
50
Gemm::Gemm(Array<PrimExpr> args, BufferMap vmap) {
51
  ObjectPtr<GemmNode> node = tvm::ffi::make_object<GemmNode>();
52
53
54
55
56
57
58
59
60
61
62
63

  node->Aptr = args[0];
  node->Bptr = args[1];
  node->Cptr = args[2];
  node->A = vmap[GetVarFromAccessPtr(node->Aptr)];
  node->B = vmap[GetVarFromAccessPtr(node->Bptr)];
  node->C = vmap[GetVarFromAccessPtr(node->Cptr)];
  node->trans_A = args[3].as<Bool>().value();
  node->trans_B = args[4].as<Bool>().value();
  node->M = args[5].as<IntImm>().value()->value;
  node->N = args[6].as<IntImm>().value()->value;
  node->K = args[7].as<IntImm>().value()->value;
64
  node->policy = GemmWarpPolicy(args[8].as<IntImm>().value()->value);
65
  node->clear_accum = args[9].as<PrimExpr>().value();
66
67
68
69
  node->stride_A = args[10].as<IntImm>().value()->value;
  node->stride_B = args[11].as<IntImm>().value()->value;
  node->offset_A = args[12].as<IntImm>().value()->value;
  node->offset_B = args[13].as<IntImm>().value()->value;
70
  if (args.size() > 14) {
71
72
    node->kPack = args[14].as<IntImm>().value()->value;
    if (node->kPack != 1 && node->kPack != 2) {
73
74
75
      ICHECK(false) << "kPack must be 1 or 2";
    }
  }
76
  if (args.size() > 15) {
77
    node->wg_wait = args[15].as<IntImm>().value()->value;
78
  }
79
80
81
82
83
84
85
86
  node->mbarptr = args[16];
  if (node->mbarptr.as<CallNode>()) {
    node->mbar = vmap[GetVarFromAccessPtr(node->mbarptr)];
  } else {
    node->mbar = std::nullopt;
  }
  node->C_coords = Array<PrimExpr>(
      {args[17].as<PrimExpr>().value(), args[18].as<PrimExpr>().value()});
87
  data_ = std::move(node);
88
89
}

90
91
92
93
94
95
96
97
/**
 * @brief Create a copy of this GemmNode as a TileOperator.
 *
 * Constructs a new GemmNode by copying the current node state and returns it
 * wrapped in a Gemm TileOperator.
 *
 * @return TileOperator A Gemm operator that owns a copy of this node.
 */
98
TileOperator GemmNode::Clone() const {
99
  auto op = tvm::ffi::make_object<GemmNode>(*this);
100
101
102
  return Gemm(op);
}

103
104
105
106
107
108
109
110
111
112
bool GemmNode::AllowTCGEN5MMA(Target target) const {
  return TargetIsSm100(target) &&
         ((A.scope() == "shared.dyn" || A.scope() == "shared" ||
           A.scope() == "shared.tmem") &&
          (B.scope() == "shared.dyn" || B.scope() == "shared") &&
          C.scope() == "shared.tmem") &&
         GetTCGEN5MMAMeta(M, N, K, A->dtype, C->dtype).first;
}

bool GemmNode::AllowWGMMA(int block_size, Target target) const {
113
114
  tvm::transform::PassContext ctxt = tvm::transform::PassContext::Current();

115
116
  int warp_size = TargetGetWarpSize(target);
  int num_warps = block_size / warp_size;
117
118
119
120
121
122
123
124
  return !ctxt->GetConfig(kDisableWGMMA, Optional<Bool>()).value_or(false) &&
         TargetIsHopper(target) && (this->M >= 64) && (num_warps % 4 == 0) &&
         CheckWGMMA();
}

GemmInst GemmNode::GetGemmInst(int block_size, Target target) const {
  bool allow_tcgen5mma = AllowTCGEN5MMA(target);
  bool allow_wgmma = AllowWGMMA(block_size, target);
125
126
  LOG(INFO) << "allow_tcgen5mma: " << allow_tcgen5mma
            << ", allow_wgmma: " << allow_wgmma;
127
128
129
  if (allow_tcgen5mma) {
    return GemmInst::kTCGEN5MMA;
  } else if (allow_wgmma) {
130
131
132
    return GemmInst::kWGMMA;
  } else if (TargetIsCDNA(target)) {
    return GemmInst::kMFMA;
133
  } else if (TargetIsCuda(target)) {
134
135
    return GemmInst::kMMA;
  } else {
136
    ICHECK(0) << "Unsupported target for gemm: " << target;
137
138
139
  }
}

140
141
std::pair<int, int> GemmWarpPolicyNode::ComputeWarpPartition(
    int M, int N, int block_size, Target target, GemmInst gemm_inst) const {
142
  int num_warps = block_size / TargetGetWarpSize(target);
143
144
145
146
  if (gemm_inst == GemmInst::kTCGEN5MMA) {
    return {1, num_warps}; // TCGEN5MMA doesn't care about warp partitioning
  }

147
  int m_warp = 1, n_warp = 1;
148
149
  constexpr int kMPerWarp = 16; // Rows processed by a single warp
  constexpr int kNPerWarp = 8;  // Columns processed by a single warp
150
151
152
153
154
  ICHECK(M % kMPerWarp == 0)
      << "M must be divisible by " << kMPerWarp << ", but got " << M;
  ICHECK(N % kNPerWarp == 0)
      << "N must be divisible by " << kNPerWarp << ", but got " << N;

155
  if (gemm_inst == GemmInst::kWGMMA) {
156
157
158
159
160
161
162
    ICHECK(num_warps % 4 == 0) << "Warp-Group MMA requires 128×k threads.";

    constexpr int kGroup = 4; // Number of warps in a warp-group

    m_warp = kGroup; // Initially, only one warp-group on M dimension
    n_warp = num_warps / m_warp; // Rest all on N dimension

163
    if (this->isFullRow()) {
164
165
166
      // Try to put as many warp-groups as possible on M dimension
      // (decreasing multiples of 4, ensuring divisibility by M)
      for (int cand = num_warps; cand >= kGroup; cand -= kGroup) {
167
        if (M % (cand * kMPerWarp) == 0) {
168
169
170
171
172
          m_warp = cand;
          n_warp = num_warps / m_warp;
          break;
        }
      }
173
    } else if (this->isFullCol()) {
174
175
      // Try to use warps on N dimension; if N is not divisible, split excess
      // groups to M
176
177
178
      int cand_n = n_warp;                 // Initially assume all on N
      if (N % (cand_n * kNPerWarp) != 0) { // N direction division fails
        int max_n = N / kNPerWarp;
179
180
181
182
183
184
185
186
187
188
        // Find a feasible n_warp from max possible downwards, ensuring
        // num_warps/n_warp is multiple of 4
        for (int n = std::min(cand_n, max_n); n >= 1; --n) {
          if (num_warps % n == 0 && (num_warps / n) % kGroup == 0) {
            n_warp = n;
            m_warp = num_warps / n_warp;
            break;
          }
        }
      }
189
    } else if (this->isSquare()) {
190
      // Exhaustive search, but m must be multiple of 4
191
192
      int max_m = M / kMPerWarp;
      int max_n = N / kNPerWarp;
193

194
      float ideal = N > 0 ? static_cast<float>(M) / N : 1.f;
195
196
197
198
199
200
201
202
203
204
205

      float best_score = std::numeric_limits<float>::max();
      int best_m = kGroup, best_n = n_warp;

      for (int m = kGroup; m <= num_warps && m <= max_m; m += kGroup) {
        if (num_warps % m)
          continue;
        int n = num_warps / m;
        if (n > max_n)
          continue;

206
207
        float m_per_warp = static_cast<float>(M) / (m * kMPerWarp);
        float n_per_warp = static_cast<float>(N) / (n * kNPerWarp);
208
209
210
211
212
213
214
215
216
217
        float score = std::abs(m_per_warp / n_per_warp - ideal);

        if (score < best_score) {
          best_score = score;
          best_m = m;
          best_n = n;
        }
      }
      m_warp = best_m;
      n_warp = best_n;
218
219
220
    } else {
      ICHECK(0) << "Unknown GemmWarpPolicy";
    }
221
222

    ICHECK(m_warp * n_warp == num_warps)
223
224
        << "m_warp * n_warp must equal num_warps, m_warp: " << m_warp
        << ", n_warp: " << n_warp << ", num_warps: " << num_warps;
225
226
227
228
229

    // Store the computed values in the object's member variables
    this->m_warp = m_warp;
    this->n_warp = n_warp;

230
231
    return {m_warp, n_warp};
  }
232

233
  if (this->isFullRow()) {
234
    // Try to partition M first
235
    m_warp = num_warps;
236
237
238
239
    n_warp = 1;

    // If M cannot be evenly divided by m_warp*16, try to split remaining warps
    // to N
240
    if (M % (m_warp * kMPerWarp) != 0) {
241
      // Calculate how many warps we can use for M
242
      int max_m_warps = M / kMPerWarp;
243
244
245
246
247
248
      m_warp = max_m_warps;
      // Use remaining warps for N
      n_warp = num_warps / m_warp;
      if (n_warp == 0)
        n_warp = 1;
    }
249
  } else if (this->isFullCol()) {
250
251
    // Try to partition N first
    m_warp = 1;
252
    n_warp = num_warps;
253
254
255

    // If N cannot be evenly divided by n_warp*8, try to split remaining warps
    // to M
256
    if (N % (n_warp * kNPerWarp) != 0) {
257
      // Calculate how many warps we can use for N
258
      int max_n_warps = N / kNPerWarp;
259
260
261
262
263
264
      n_warp = max_n_warps;
      // Use remaining warps for M
      m_warp = num_warps / n_warp;
      if (m_warp == 0)
        m_warp = 1;
    }
265
  } else if (this->isSquare()) {
266
    // First calculate the maximum possible warps for each dimension
267
    int max_m_warps =
268
        M / kMPerWarp; // Each warp needs at least 16 elements in M
269
270
271

    // Calculate the ideal ratio of M/N warps based on the matrix dimensions
    float ideal_ratio = 1.0f;
272
273
    if (N > 0) {
      ideal_ratio = static_cast<float>(M) / N;
274
275
276
277
278
279
280
281
282
283
284
    }

    // Try to find the best balanced partition
    int best_m = 1;
    int best_n = 1;
    float best_balance = std::numeric_limits<float>::max();
    // Try all possible combinations that satisfy the constraints
    for (int m = 1; m <= max_m_warps && m <= num_warps; m++) {
      int n = num_warps / m;

      // Calculate how balanced this partition is
285
286
      float m_per_warp = static_cast<float>(M) / (m * kMPerWarp);
      float n_per_warp = static_cast<float>(N) / (n * kNPerWarp);
287
288
289
290
291
292
293
      // m_per_warp and n_per_warp must be greater than 1
      if (m_per_warp < 1 || n_per_warp < 1)
        continue;
      // m * n must equal num_warps
      if (m * n != num_warps)
        continue;

294
295
296
297
298
299
      float balance = std::abs(m_per_warp / n_per_warp - ideal_ratio);

      if (balance < best_balance) {
        best_balance = balance;
        best_m = m;
        best_n = n;
300
301
      }
    }
302
303
304

    m_warp = best_m;
    n_warp = best_n;
305
306
307
  } else {
    ICHECK(0) << "Unknown GemmWarpPolicy";
  }
308
309
310
311
  ICHECK(m_warp * n_warp == num_warps)
      << "m_warp * n_warp must equal num_warps, m_warp: " << m_warp
      << ", n_warp: " << n_warp << ", num_warps: " << num_warps;

312
313
314
315
  // Store the computed values in the object's member variables
  this->m_warp = m_warp;
  this->n_warp = n_warp;

316
317
318
  return {m_warp, n_warp};
}

319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
/**
 * @brief Checks whether WGMMA (warp-group MMA) can be used for this GEMM.
 *
 * Evaluates device-memory placement, data-type combinations, transpose flags,
 * and K divisibility constraints required for the Hopper WGMMA code path.
 *
 * The check returns true only when:
 * - B resides in shared memory ("shared" or "shared.dyn"); and
 * - (C, A, B) dtypes match one of the supported combinations below and K
 *   satisfies the required alignment; and
 * - for combinations that require specific orientations, A is not transposed
 *   and B is transposed.
 *
 * Supported combinations and constraints:
 * - C=float16:
 *   - A=float16, B=float16: K % 16 == 0
335
336
 *   - Various float8 mixes (e4m3/e5m2): require (!trans_A && trans_B) and K %
 * 32 == 0
337
338
339
340
341
342
 * - C=float32:
 *   - A=float16, B=float16: K % 16 == 0
 *   - A=bfloat16, B=bfloat16: K % 16 == 0
 *   - A=float32, B=float32: require (!trans_A && trans_B) and K % 8 == 0
 *   - Various float8 mixes: require (!trans_A && trans_B) and K % 32 == 0
 * - C=int32:
343
344
 *   - 8-bit integer combinations (Int8/UInt8): require (!trans_A && trans_B)
 * and K % 32 == 0
345
346
347
348
 *
 * @return true if WGMMA is supported for the current buffers, dtypes, and
 *         transpose/shape constraints; false otherwise.
 */
349
bool GemmNode::CheckWGMMA() const {
350
351
352
353
  if (B.scope() != "shared.dyn" && B.scope() != "shared") {
    return false;
  }

354
355
356
  if (C->dtype == DataType::Float(16)) {
    if (A->dtype == DataType::Float(16) && B->dtype == DataType::Float(16))
      return K % 16 == 0;
357
    else if (A->dtype.is_float8_e4m3() && B->dtype.is_float8_e4m3())
358
      return (!trans_A) && trans_B && K % 32 == 0;
359
    else if (A->dtype.is_float8_e4m3() && B->dtype.is_float8_e5m2())
360
      return (!trans_A) && trans_B && K % 32 == 0;
361
    else if (A->dtype.is_float8_e5m2() && B->dtype.is_float8_e4m3())
362
      return (!trans_A) && trans_B && K % 32 == 0;
363
    else if (A->dtype.is_float8_e5m2() && B->dtype.is_float8_e5m2())
364
365
366
367
368
369
370
371
372
373
374
      return (!trans_A) && trans_B && K % 32 == 0;
    else
      return false;
  } else if (C->dtype == DataType::Float(32)) {
    if (A->dtype == DataType::Float(16) && B->dtype == DataType::Float(16))
      return K % 16 == 0;
    else if (A->dtype == DataType::BFloat(16) &&
             B->dtype == DataType::BFloat(16))
      return K % 16 == 0;
    else if (A->dtype == DataType::Float(32) && B->dtype == DataType::Float(32))
      return (!trans_A) && trans_B && K % 8 == 0;
375
    else if (A->dtype.is_float8_e4m3() && B->dtype.is_float8_e4m3())
376
      return (!trans_A) && trans_B && K % 32 == 0;
377
    else if (A->dtype.is_float8_e4m3() && B->dtype.is_float8_e5m2())
378
      return (!trans_A) && trans_B && K % 32 == 0;
379
    else if (A->dtype.is_float8_e5m2() && B->dtype.is_float8_e4m3())
380
      return (!trans_A) && trans_B && K % 32 == 0;
381
    else if (A->dtype.is_float8_e5m2() && B->dtype.is_float8_e5m2())
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
      return (!trans_A) && trans_B && K % 32 == 0;
    else
      return false;
  } else if (C->dtype == DataType::Int(32)) {
    if (A->dtype == DataType::Int(8) && B->dtype == DataType::Int(8))
      return (!trans_A) && trans_B && K % 32 == 0;
    else if (A->dtype == DataType::Int(8) && B->dtype == DataType::UInt(8))
      return (!trans_A) && trans_B && K % 32 == 0;
    else if (A->dtype == DataType::UInt(8) && B->dtype == DataType::Int(8))
      return (!trans_A) && trans_B && K % 32 == 0;
    else if (A->dtype == DataType::UInt(8) && B->dtype == DataType::UInt(8))
      return (!trans_A) && trans_B && K % 32 == 0;
    else
      return false;
  } else {
    return false;
  }
}

401
402
403
404
405
406
407
408
409
410
411
412
413
414
/**
 * @brief Parse and return the numeric GPU architecture from a Target's "arch"
 * attribute.
 *
 * Examines the target's "arch" string and, if it matches the pattern
 * "sm_<num>", returns <num> as an int. If the attribute is present but does not
 * match that pattern, returns 0.
 *
 * Preconditions: the target must have an "arch" attribute (this is checked via
 * ICHECK).
 *
 * @return int The parsed architecture number (e.g., 80 for "sm_80"), or 0 if
 * the arch string does not match "sm_<num>".
 */
415
416
static int GetArchInt(Target target) {
  int arch_int = 0;
417
418
  auto s = target->GetAttr<tvm::ffi::String>("arch");
  ICHECK(s.has_value());
419
420
421
  std::string arch = s.value();
  if (arch.rfind("sm_", 0) == 0) {
    arch_int = std::stoi(arch.substr(3));
422
423
424
425
426
427
  } else {
    arch_int = 0;
  }
  return arch_int;
}

428
429
430
431
432
433
434
435
436
437
438
439
440
/**
 * @brief Lower the GEMM operator to a TL TIR call expression.
 *
 * Constructs a tl::gemm call string parameterized by M, N, K, warp partition,
 * transpose flags, accumulation clearing, target-specific stride/offset/kPack
 * and optional workgroup wait value, then returns an Evaluate(call) node
 * invoking tl::tl_gemm with the composed string and the A/B/C buffer handles.
 *
 * @param T Contains lowering context including thread bounds and target.
 * @param analyzer Optional arithmetic analyzer used by lowering (may be
 * nullptr).
 * @return Stmt A TIR statement representing the evaluated TL GEMM call.
 */
441
Stmt GemmNode::Lower(const LowerArgs &T, arith::Analyzer *analyzer) const {
442
  auto block_size = *as_const_int(T.thread_bounds->extent);
443
  GemmInst gemm_inst = GetGemmInst(block_size, T.target);
444
445
  auto [warp_m, warp_n] =
      policy->ComputeWarpPartition(M, N, block_size, T.target, gemm_inst);
446

447
  std::stringstream ss;
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
  std::string op_name;

  if (gemm_inst == GemmInst::kTCGEN5MMA) {
    auto [can_use_tcgen5mma, meta] =
        GetTCGEN5MMAMeta(M, N, K, A->dtype, C->dtype);
    ICHECK(can_use_tcgen5mma);
    ICHECK(B.scope() == "shared.dyn" || B.scope() == "shared");
    ICHECK(C.scope() == "shared.tmem");
    ICHECK(mbar.has_value()) << "mbar must be provided for TCGEN5MMA";
    if (A.scope() == "shared.tmem") {
      op_name = "tl::tcgen5mma_gemm_ts";
    } else if (A.scope() == "shared.dyn" || A.scope() == "shared") {
      op_name = "tl::tcgen5mma_gemm_ss";
    } else {
      ICHECK(0)
          << "Unsupported A scope for TCGEN5MMA: "
          << A.scope(); // If this is triggered, it means Tilelang has bugs.
    }
    ICHECK(wg_wait == -1)
        << "Currently only wg_wait == -1 is supported for TCGEN5MMA. Please "
           "use "
           "wg_wait = -1 and manually synchronize with mbarrier.";

    std::string accum_dtype = "";
    if (C->dtype.is_float()) {
      if (C->dtype.bits() == 32) {
        accum_dtype = "float";
      }
    }
    ICHECK(!accum_dtype.empty())
        << "Unsupported C dtype for TCGEN5MMA: " << C->dtype;
    ss << op_name << "<" << M << ", " << N << ", " << K << ", ";
    ss << meta.atom_m << ", " << meta.atom_n << ", " << meta.atom_k << ", ";
    ss << trans_A << ", " << trans_B << ", ";
    ss << accum_dtype;
    ss << ">";

    auto C_buffer = T.buffer_remap.count(C) ? T.buffer_remap[C] : C;
    Array<PrimExpr> new_args;
    new_args.push_back(StringImm(ss.str()));
    new_args.push_back(Aptr);
    new_args.push_back(Bptr);
    new_args.push_back(BufferLoad(C_buffer, C_coords));
    new_args.push_back(mbarptr);
    new_args.push_back(clear_accum);
    auto new_call = Call(DataType::Handle(), builtin::call_extern(), new_args);

    // Since TCGEN5MMA atoms provided by CUTLASS always have an internal
    // `elect_one_sync()`, we check if we are calling it using full warps
    constexpr int warp_size = 32;
    ICHECK(
        analyzer->CanProveEqual(FloorMod(T.thread_bounds->min, warp_size), 0) &&
        analyzer->CanProveEqual(FloorMod(T.thread_bounds->extent, warp_size),
                                0))
        << "TCGEN5MMA requires thread bounds to be multiples of warp size (32) "
           "and aligned to warps.";
    if (analyzer->CanProveEqual(T.thread_bounds->extent, warp_size)) {
      // If the thread bounds is exactly one warp, we can use the original call
      return Evaluate(new_call);
    } else {
      // Add an if-else clause
      auto tcgen5mma_call =
          IfThenElse(EQ(FloorDiv(T.thread_var, warp_size),
                        FloorDiv(T.thread_bounds->min, warp_size)),
                     Evaluate(new_call));
      return tcgen5mma_call;
    }
  }

517
518
  if (A.scope() == "local.fragment") {
    ICHECK(B.scope() != "local.fragment");
519
520
    ICHECK(!trans_A)
        << "gemm_rs requires the A operand to be in non-transposed layout.";
521
522
523
    op_name = "tl::gemm_rs";
  } else if (B.scope() == "local.fragment") {
    op_name = "tl::gemm_sr";
524
525
  } else {
    op_name = "tl::gemm_ss";
526
  }
527
528
  ICHECK(C.scope() == "local.fragment");

529
530
531
  ss << op_name << "<" << M << ", " << N << ", " << K << ", ";
  ss << warp_m << ", " << warp_n << ", ";
  ss << trans_A << ", " << trans_B;
532
533
534
535
  auto clear_accum_bool = clear_accum.as<Bool>();
  ICHECK(clear_accum_bool.has_value())
      << "clear_accum must be a constant Bool type, got " << clear_accum;
  ss << ", " << bool(clear_accum_bool.value());
536
537
538
539
  if (TargetIsCuda(T.target) && (GetArchInt(T.target) >= 75)) {
    ss << ", " << stride_A << ", " << stride_B;
    ss << ", " << offset_A << ", " << offset_B;
  }
540
541
542
  if (TargetIsCDNA(T.target)) {
    // for cdna gemm, we need to specify kPack
    ss << ", " << kPack;
543
  } else if (TargetIsHopper(T.target)) {
544
    ss << ", " << (gemm_inst == GemmInst::kWGMMA ? "true" : "false");
545
  }
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560

  // Emit wg_wait if necessary
  if (TargetIsHopper(T.target)) {
    if (wg_wait != 0) {
      ss << ", " << wg_wait;
    }
  } else if (TargetIsSm100(T.target)) {
    // NOTE On sm100, only the leading thread issues the TCGEN5MMA instruction
    // but all threads need to wait, so we emit another statement for cases
    // where wg_wait == 0.
    ICHECK(wg_wait == 0 || wg_wait == -1)
        << "wg_wait must be 0 or -1 for Sm100";
  } else {
    ICHECK(wg_wait == 0)
        << "wg_wait must be 0 for non-Hopper and non-Sm100 targets";
561
  }
562
  ss << ">";
563
564
565

  auto new_call = Call(DataType::Handle(), tl::tl_gemm(),
                       Array<PrimExpr>{StringImm(ss.str()), Aptr, Bptr, Cptr});
566
567
568
  return Evaluate(new_call);
}

569
/**
570
 * @brief Infer and bind target-specific memory/layout mappings for A, B, and C.
571
 *
572
573
574
575
 * Infers per-buffer layouts (fragment or shared-memory layouts) for this GEMM
 * operator according to the target architecture, thread bounds, warp
 * partitioning, data types, and transpose flags, then binds fragment layouts
 * to the thread range when required.
576
577
 *
 * Preconditions:
578
 * - C.scope() == "local.fragment"
579
 *
580
581
 * Side effects:
 * - Marks layout inference as completed (sets completed_ = true).
582
583
584
 * - May abort via ICHECK on unsupported targets, invalid buffer scopes, or
 *   incompatible shape constraints.
 *
585
586
 * @param T Input layout-inference context (provides thread bounds and target).
 * @return LayoutMap mapping A, B, and C to their inferred layouts.
587
 */
588
589
LayoutMap GemmNode::InferLayout(const LayoutInferArgs &T,
                                InferLevel level) const {
590
591
  if (completed_)
    return {};
592
  LayoutMap results;
593
594
  auto thread_range = T.thread_bounds;
  auto block_size = *as_const_int(thread_range->extent);
595
  GemmInst gemm_inst = GetGemmInst(block_size, T.target);
596
597
  auto [warp_m, warp_n] =
      policy->ComputeWarpPartition(M, N, block_size, T.target, gemm_inst);
598
  if (TargetIsVolta(T.target)) {
599
600
601
    ICHECK(C.scope() == "local.fragment")
        << "Volta gemm only supports C in local.fragment scope, got "
        << C.scope();
602
603
    auto fragment =
        makeGemmVoltaFragmentC(M, N, M / warp_m, N / warp_n, C->dtype.bits());
604
    results.Set(C, fragment->BindThreadRange(thread_range));
605
    if (A.scope() == "shared" || A.scope() == "shared.dyn") {
606
607
608
      int dim_A = A->shape.size();
      results.Set(A, makeGemmVoltaABLayout(*as_const_int(A->shape[dim_A - 2]),
                                           *as_const_int(A->shape[dim_A - 1]),
609
                                           true, !trans_A));
610
611
    } else if (A.scope() == "local.fragment") {
      ICHECK(trans_A == false);
612
      auto fragment = makeGemmVoltaFragmentA(M, N, K, M / warp_m, N / warp_n);
613
      results.Set(A, fragment->BindThreadRange(thread_range));
614
615
616
617
618
    } else {
      ICHECK(0);
    }

    ICHECK(B.scope() == "shared" || B.scope() == "shared.dyn");
619
620
621
    int dim_B = B->shape.size();
    results.Set(B, makeGemmVoltaABLayout(*as_const_int(B->shape[dim_B - 2]),
                                         *as_const_int(B->shape[dim_B - 1]),
622
                                         false, trans_B));
623
  } else if (TargetIsAmpere(T.target) || TargetIsTuring(T.target) ||
624
625
626
627
628
             TargetIsSM120(T.target) ||
             (TargetIsSm100(T.target) && gemm_inst == GemmInst::kMMA)) {
    ICHECK(C.scope() == "local.fragment")
        << "MMA only supports C in local.fragment scope, got " << C.scope();

629
630
    auto fragment =
        makeGemmFragmentC(M, N, M / warp_m, N / warp_n, C->dtype.bits());
631
    results.Set(C, fragment->BindThreadRange(thread_range));
632
633

    if (A.scope() == "shared" || A.scope() == "shared.dyn") {
634
635
636
      int dim_A = A->shape.size();
      const int64_t mat_stride = *as_const_int(A->shape[dim_A - 2]);
      const int64_t mat_continuous = *as_const_int(A->shape[dim_A - 1]);
637
638
      results.Set(A,
                  makeGemmABLayout(mat_stride, mat_continuous, mat_continuous,
639
                                   A->dtype.bits(), !trans_A));
640
    } else if (A.scope() == "local.fragment") {
641
642
      auto fragment = makeGemmFragmentA(M, N, K, M / warp_m, N / warp_n,
                                        A->dtype.bits(), trans_A);
643
      results.Set(A, fragment->BindThreadRange(thread_range));
644
645
646
647
    } else {
      ICHECK(0);
    }
    if (B.scope() == "shared" || B.scope() == "shared.dyn") {
648
649
650
      int dim_B = B->shape.size();
      const int64_t mat_stride = *as_const_int(B->shape[dim_B - 2]);
      const int64_t mat_continuous = *as_const_int(B->shape[dim_B - 1]);
651
652
      results.Set(B,
                  makeGemmABLayout(mat_stride, mat_continuous, mat_continuous,
653
                                   B->dtype.bits(), trans_B));
654
    } else if (B.scope() == "local.fragment") {
655
656
      auto fragment =
          makeGemmFragmentB(M, N, K, M / warp_m, N / warp_n, trans_B);
657
      results.Set(B, fragment->BindThreadRange(thread_range));
658
659
660
661
    } else {
      ICHECK(0);
    }
  } else if (TargetIsHopper(T.target)) {
662
663
664
    ICHECK(C.scope() == "local.fragment")
        << (gemm_inst == GemmInst::kWGMMA ? "WGMMA " : "MMA ")
        << "only supports C in local.fragment scope, got " << C.scope();
665
    auto fragment =
666
        gemm_inst == GemmInst::kWGMMA
667
668
669
            ? makeGemmFragmentCHopper(M, N, M / warp_m, N / warp_n,
                                      C->dtype.bits())
            : makeGemmFragmentC(M, N, M / warp_m, N / warp_n, C->dtype.bits());
670
    results.Set(C, fragment->BindThreadRange(thread_range));
671
    if (A.scope() == "shared" || A.scope() == "shared.dyn") {
672
673
674
      int dim_A = A->shape.size();
      const int64_t mat_stride = *as_const_int(A->shape[dim_A - 2]);
      const int64_t mat_continuous = *as_const_int(A->shape[dim_A - 1]);
675
      const int64_t continuity =
676
          trans_A ? 4 * mat_continuous / warp_m : mat_continuous;
677
      auto ABLayout =
678
          gemm_inst == GemmInst::kWGMMA
679
              ? makeGemmABLayoutHopper(mat_stride, mat_continuous, continuity,
680
                                       A->dtype.bits(), !trans_A)
681
              : makeGemmABLayout(mat_stride, mat_continuous, mat_continuous,
682
                                 A->dtype.bits(), !trans_A);
683
      results.Set(A, ABLayout);
684
    } else {
685
686
      auto fragment = makeGemmFragmentA(M, N, K, M / warp_m, N / warp_n,
                                        A->dtype.bits(), trans_A);
687
      results.Set(A, fragment->BindThreadRange(thread_range));
688
689
    }
    if (B.scope() == "shared" || B.scope() == "shared.dyn") {
690
691
692
      int dim_B = B->shape.size();
      const int64_t mat_stride = *as_const_int(B->shape[dim_B - 2]);
      const int64_t mat_continuous = *as_const_int(B->shape[dim_B - 1]);
693
694
      const int64_t continuity =
          trans_B ? mat_continuous : mat_continuous / warp_n;
695

696
      auto ABLayout =
697
          gemm_inst == GemmInst::kWGMMA
698
              ? makeGemmABLayoutHopper(mat_stride, mat_continuous, continuity,
699
                                       B->dtype.bits(), trans_B)
700
              : makeGemmABLayout(mat_stride, mat_continuous, mat_continuous,
701
                                 B->dtype.bits(), trans_B);
702
      results.Set(B, ABLayout);
703
    } else {
704
705
706
      auto fragment =
          makeGemmFragmentB(M, N, K, M / warp_m, N / warp_n, trans_B);
      results.Set(B, fragment->BindThreadRange(thread_range));
707
    }
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
  } else if (gemm_inst == GemmInst::kTCGEN5MMA) {
    ICHECK(C.scope() == "shared.tmem")
        << "TCGEN5MMA only supports C in shared.tmem scope, got " << C.scope();
    ICHECK(A.scope() == "shared.dyn" || A.scope() == "shared")
        << "Current TCGEN5MMA only supports A in shared.dyn scope";
    auto [can_use_tcgen5mma, meta] =
        GetTCGEN5MMAMeta(M, N, K, A->dtype, C->dtype);
    ICHECK(can_use_tcgen5mma);
    {
      int dim_A = A->shape.size();
      const int64_t mat_stride = *as_const_int(A->shape[dim_A - 2]);
      const int64_t mat_continuous = *as_const_int(A->shape[dim_A - 1]);
      results.Set(A, makeGemmABLayoutSm100(mat_stride, mat_continuous,
                                           mat_continuous, A->dtype.bits(),
                                           trans_A ? 1 : 2));
    }
    {
      int dim_B = B->shape.size();
      const int64_t mat_stride = *as_const_int(B->shape[dim_B - 2]);
      const int64_t mat_continuous = *as_const_int(B->shape[dim_B - 1]);
      const int64_t continuity = mat_continuous;
      results.Set(B,
                  makeGemmABLayoutSm100(mat_stride, mat_continuous, continuity,
                                        B->dtype.bits(), trans_B ? 2 : 1));
    }
    {
      Layout res;
      IterVar i = make_itervar("i", M);
      IterVar j = make_itervar("j", N);
      ICHECK(M % meta.atom_m == 0);
      PrimExpr atom_idx = FloorDiv(i, meta.atom_m) +
                          FloorDiv(j, meta.atom_n) * (M / meta.atom_m);
      PrimExpr ai = FloorMod(i, meta.atom_m); // "ai" means "atom_i"
      PrimExpr aj = FloorMod(j, meta.atom_n);
      if (meta.atom_m == 128) {
        // Layout D
        // (https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-data-path-layout-d)
        res = Layout(Array{i, j}, {ai, aj + atom_idx * meta.atom_n});
      } else if (meta.atom_m == 64) {
        // Layout E
        // (https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-data-path-layout-e)
        // since .ws variant is used About why we use .ws variant here, please
        // refer to gemm_sm100.h
        res = Layout(Array{i, j}, {FloorDiv(ai, 32) * 32 + FloorMod(ai, 32) +
                                       FloorDiv(aj, meta.atom_n / 2) * 64,
                                   FloorMod(aj, meta.atom_n / 2) +
                                       atom_idx * (meta.atom_n / 2)});
      } else if (meta.atom_m == 32) {
        // Layout G
        // (https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-data-path-layout-g)
        res = Layout(
            Array{i, j},
            {FloorMod(ai, 32) + FloorDiv(aj, meta.atom_n / 4) * 32,
             FloorMod(aj, meta.atom_n / 4) + atom_idx * (meta.atom_n / 4)});
      } else {
        ICHECK(0);
      }
      results.Set(C, res);
    }
767
  } else if (TargetIsCDNA(T.target)) {
768
769
770
    ICHECK(C.scope() == "local.fragment")
        << "CDNA gemm (FMMA) only supports C in local.fragment scope, got "
        << C.scope();
771
772
    auto fragment =
        makeGemmFragmentCCDNA(M, N, M / warp_m, N / warp_n, C->dtype.bits());
773
    results.Set(C, fragment->BindThreadRange(thread_range));
774
775

    if (A.scope() == "shared" || A.scope() == "shared.dyn") {
776
777
778
779
      int dim_A = A->shape.size();
      auto shared_layout = makeGemmABLayoutCDNA(
          *as_const_int(A->shape[dim_A - 2]),
          *as_const_int(A->shape[dim_A - 1]), A->dtype.bits(), kPack);
780
781
      results.Set(A, shared_layout);
    } else if (A.scope() == "local.fragment") {
782
      auto fragment = makeGemmFragmentACDNA(M, N, K, M / warp_m, N / warp_n,
783
                                            A->dtype.bits(), kPack, trans_A);
784
      results.Set(A, fragment->BindThreadRange(thread_range));
785
786
787
788
    } else {
      ICHECK(0);
    }
    if (B.scope() == "shared" || B.scope() == "shared.dyn") {
789
790
791
792
      int dim_B = B->shape.size();
      auto shared_layout = makeGemmABLayoutCDNA(
          *as_const_int(B->shape[dim_B - 2]),
          *as_const_int(B->shape[dim_B - 1]), B->dtype.bits(), kPack);
793
794

      results.Set(B, shared_layout);
795
796
797
798
    } else if (B.scope() == "local.fragment") {
      auto fragment =
          makeGemmFragmentB(M, N, K, M / warp_m, N / warp_n, trans_B);
      results.Set(B, fragment->BindThreadRange(thread_range));
799
800
801
802
803
804
805
806
807
808
809
810
    } else {
      ICHECK(0);
    }
  } else {
    ICHECK(0) << "Not supported " << T.target->str();
  }
  completed_ = true;
  return results;
}

TIR_REGISTER_TL_OP(Gemm, gemm)
    .set_num_inputs(5)
811
812
    .set_attr<TCallEffectKind>("TCallEffectKind",
                               Integer(CallEffectKind::kOpaque));
813

814
815
816
TVM_REGISTER_OP("tl.GemmWarpPolicy")
    .set_attr<TScriptPrinterName>("TScriptPrinterName", "GemmWarpPolicy");

817
TVM_FFI_STATIC_INIT_BLOCK() {
818
819
820
821
822
  GemmNode::RegisterReflection();
  GemmWarpPolicyNode::RegisterReflection();
  namespace refl = tvm::ffi::reflection;
  refl::GlobalDef().def("tl.GemmWarpPolicyComputeWarpPartition",
                        [](GemmWarpPolicy policy, int M, int N, int block_size,
823
                           Target target, GemmInst gemm_inst) {
824
                          policy->ComputeWarpPartition(M, N, block_size, target,
825
                                                       gemm_inst);
826
                        });
827
}
828

829
} // namespace tl
830
} // namespace tvm