/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*!
 * \file layout_inference.cc
 * \brief infer the fragment/shared memory layout
 */

#include <tvm/tir/builtin.h>
#include <tvm/tir/op.h>
#include <tvm/tir/stmt_functor.h>
#include <tvm/tir/transform.h>
#include <tvm/tir/utils.h>

#include <memory>
#include <queue>
#include <unordered_map>
#include <utility>
#include <vector>

#include "../op/parallel.h"
#include "arith/ir_mutator_with_analyzer.h"
#include "common/loop_fusion_utils.h"
#include "loop_partition.h"
#include "loop_vectorize.h"
#include "runtime/thread_storage_scope.h"
#include "tir/transforms/ir_utils.h"

namespace tvm {
namespace tl {

using namespace tir;

using runtime::StorageRank;
using runtime::StorageScope;

/*!
 * \brief Tell whether a buffer var is declared in dynamic shared memory.
 * \param buffer_var The data var of the buffer to classify.
 * \return true iff the var's pointer storage scope has rank kShared and the
 *         tag ".dyn".
 */
static bool IsDynamicSharedMemory(Var buffer_var) {
  const StorageScope scope =
      runtime::StorageScope::Create(GetPtrStorageScope(buffer_var));
  if (scope.rank != runtime::StorageRank::kShared) {
    return false;
  }
  return scope.tag == ".dyn";
}

/*!
 * \brief Tell whether a buffer var is declared in static shared memory.
 * \param buffer_var The data var of the buffer to classify.
 * \return true iff the var's pointer storage scope has rank kShared and an
 *         empty tag.
 */
static bool IsStaticSharedMemory(Var buffer_var) {
  const StorageScope scope =
      runtime::StorageScope::Create(GetPtrStorageScope(buffer_var));
  if (scope.rank != runtime::StorageRank::kShared) {
    return false;
  }
  return scope.tag == "";
}

/*!
 * \brief Tell whether a buffer var is declared as a local fragment.
 * \param buffer_var The data var of the buffer to classify.
 * \return true iff the var's pointer storage scope has rank kLocal and the
 *         tag ".fragment".
 */
static bool isLocalFragment(Var buffer_var) {
  const StorageScope scope =
      runtime::StorageScope::Create(GetPtrStorageScope(buffer_var));
  if (scope.rank != runtime::StorageRank::kLocal) {
    return false;
  }
  return scope.tag == ".fragment";
}

/*!
 * \brief collect the mapping from the buffer var to its allocate
 */
class AllocateCollector : public StmtExprVisitor {
public:
  void VisitStmt_(const AllocateNode *op) final {
    if (IsDynamicSharedMemory(op->buffer_var)) {
      dyn_shmem_allocs_[op->buffer_var.get()] = op;
    } else if (IsStaticSharedMemory(op->buffer_var)) {
      static_shmem_allocs_[op->buffer_var.get()] = op;
    } else if (isLocalFragment(op->buffer_var)) {
      local_fragment_allocs_[op->buffer_var.get()] = op;
    }
    StmtExprVisitor::VisitStmt_(op);
  }
  void VisitStmt_(const BlockNode *op) final {
    for (auto buffer : op->alloc_buffers) {
      if (IsDynamicSharedMemory(buffer->data)) {
        dyn_shmem_allocs_[buffer->data.get()] = op;
      } else if (IsStaticSharedMemory(buffer->data)) {
        static_shmem_allocs_[buffer->data.get()] = op;
      } else if (isLocalFragment(buffer->data)) {
        local_fragment_allocs_[buffer->data.get()] = op;
      }
    }
    StmtExprVisitor::VisitStmt_(op);
  }

  void VisitStmt_(const AllocateConstNode *op) final {
    StmtExprVisitor::VisitStmt_(op);
  }

  void VisitStmt_(const SeqStmtNode *op) final {
    StmtExprVisitor::VisitStmt_(op);
  }

  void VisitStmt_(const AttrStmtNode *op) final {
    StmtExprVisitor::VisitStmt_(op);
  }

  // The dynamic mapping from the original buffer var to its allocate
  std::unordered_map<const VarNode *, const Object *> dyn_shmem_allocs_;
  // The static mapping from the original buffer var to its allocate
  std::unordered_map<const VarNode *, const Object *> static_shmem_allocs_;
  // The local fragment mapping from the original buffer var to its allocate
  std::unordered_map<const VarNode *, const Object *> local_fragment_allocs_;
};

using namespace tir;
using arith::IRMutatorWithAnalyzer;

/*!
 * \brief Output of BufferUseDefCollector::Run(), consumed by
 *        LayoutInferencer.
 */
struct LayoutInferenceResult {
  // Layout chosen for each buffer (annotated layouts plus inferred ones).
  Map<Buffer, Layout> layout_map;
  // Loop layout of each parallel For that was handled by a ParallelOp.
  Map<For, Fragment> for_map;
  // Optional guard predicate per parallel For; LayoutInferencer wraps the
  // rewritten loop in an IfThenElse on this condition.
  Map<For, PrimExpr> predicate_map;
};

class BufferUseDefCollector : public StmtExprVisitor {
128
public:
129
130
  BufferUseDefCollector(bool skip_thread_partition)
      : skip_thread_partition_(skip_thread_partition) {}
131
132

  LayoutInferenceResult Run() {
133
134
135
136
137
138
139
140
141
142
    // Basic consistency check: infer_list_ and thread_var_vec_ should have the
    // same size
    ICHECK_EQ(infer_list_.size(), thread_var_vec_.size())
        << "Size mismatch: infer_list_ and thread_var_vec_ must match in "
           "length.";

    // If needed, you can also check that annotated_layout_map_ is not empty, or
    // anything else relevant to your setup.

    // Copy the annotated layout map to local variable
143
144
145
    Map<Buffer, Layout> layout_map = annotated_layout_map_;
    int num_infer = infer_list_.size();

146
    // Prepare BFS queue for iterative inference
147
148
    std::queue<int> q;
    std::vector<bool> in_queue(num_infer, true);
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
    for (int i = 0; i < num_infer; i++) {
      // Check that each infer_list_ entry is valid
      ICHECK(infer_list_[i] != nullptr)
          << "infer_list_[" << i
          << "] is null. The inference object is not allocated properly.";

      // Check that each thread_var_vec_ entry is defined
      if (!thread_var_vec_[i].defined() && skip_thread_partition_) {
        // TODO(lei): This is a hack for cpu backend
        if (!thread_var_.defined()) {
          // Fake thread var to inference predicate for the buffer
          thread_var_ = IterVar(Range::FromMinExtent(PrimExpr(0), PrimExpr(1)),
                                Var(""), IterVarType::kDataPar);
        }
        thread_var_vec_[i] = thread_var_;
      }
165
      q.push(i);
166
    }
167
168
    auto run_infer_step = [&](int cur_infer_id, InferLevel level,
                              bool update_queue) {
169
170
171
172
173
174
175
176
177
      // Range check for cur_infer_id
      ICHECK_GE(cur_infer_id, 0)
          << "cur_infer_id is negative, which is invalid.";
      ICHECK_LT(cur_infer_id, num_infer)
          << "cur_infer_id " << cur_infer_id << " is out of range, must be < "
          << num_infer << ".";

      // Make sure we can safely access infer_list_[cur_infer_id] and
      // thread_var_vec_[cur_infer_id]
178
      auto &next = infer_list_[cur_infer_id];
179
      auto iter_var = thread_var_vec_[cur_infer_id];
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200

      // Double-check that 'next' is valid
      ICHECK(next != nullptr) << "infer_list_[" << cur_infer_id
                              << "] is null inside run_infer_step.";

      // Check iter_var->dom and dom->extent
      ICHECK(iter_var.defined())
          << "thread_var_vec_[" << cur_infer_id << "] is not defined.";
      ICHECK(iter_var->dom.defined())
          << "iter_var->dom is not defined for infer_list_[" << cur_infer_id
          << "].";
      ICHECK(iter_var->dom->extent.defined())
          << "iter_var->dom->extent is not defined for infer_list_["
          << cur_infer_id << "].";

      const int64_t *extent_ptr = as_const_int(iter_var->dom->extent);
      ICHECK(extent_ptr != nullptr)
          << "iter_var->dom->extent is not a constant integer, which is "
             "required for layout inference.";

      // Run InferLayout
201
      auto updates = next->InferLayout(
202
203
          LayoutInferArgs{target_, static_cast<size_t>(*extent_ptr),
                          layout_map},
204
          level);
205
206

      // Process the returned updates
207
      for (const auto &[buffer, layout] : updates) {
208
209
210
211
        // Basic validity checks
        ICHECK(buffer.defined()) << "InferLayout returned an undefined buffer.";
        ICHECK(layout.defined()) << "InferLayout returned an undefined layout.";

212
        if (layout_map.count(buffer)) {
213
          // If already in map, ensure they are structurally equal
214
          ICHECK(StructuralEqual()(layout, layout_map[buffer]))
215
216
              << "Get different layout for " << buffer
              << " in cur_infer_id = " << cur_infer_id;
217
        } else {
218
          // Otherwise, update map
219
          layout_map.Set(buffer, layout);
220
221
          if (!update_queue)
            continue;
222
223
224
225
226
227
228

          // Check if buffer exists in use_list_
          ICHECK(use_list_.count(buffer))
              << "Buffer " << buffer << " not found in use_list_. "
              << "Potential mismatch between inference updates and use_list_.";

          // Push back into BFS queue
229
          for (int idx : use_list_[buffer]) {
230
231
232
233
234
235
            ICHECK_GE(idx, 0) << "Index in use_list_ for buffer " << buffer
                              << " is negative.";
            ICHECK_LT(idx, num_infer)
                << "Index in use_list_ for buffer " << buffer
                << " out of range: " << idx << " >= " << num_infer << ".";

236
237
238
239
240
241
242
243
            if (!in_queue[idx] && idx != cur_infer_id) {
              in_queue[idx] = true;
              q.push(idx);
            }
          }
        }
      }
    };
244

245
246
247
248
    auto finish_infer_queue = [&]() {
      while (!q.empty()) {
        int cur_infer_id = q.front();
        q.pop();
249
250
251
252
        // Range check again, just to be safe
        ICHECK_GE(cur_infer_id, 0);
        ICHECK_LT(cur_infer_id, num_infer);

253
254
255
256
257
        in_queue[cur_infer_id] = false;
        run_infer_step(cur_infer_id, InferLevel::kCommon, true);
      }
    };

258
    // step 1: infer strict layout
259
260
261
    for (int i = 0; i < num_infer; i++) {
      run_infer_step(i, InferLevel::kStrict, false);
    }
262
    // step 2: infer common layout with BFS
263
264

    finish_infer_queue();
265
    // step 3: relax constraints to free and re-run
266
267
268
269
270
    for (int i = 0; i < num_infer; i++) {
      run_infer_step(i, InferLevel::kFree, true);
      finish_infer_queue();
    }

271
    // Check that all local.fragment buffers have inferred layouts
272
    for (const auto &[buffer, _] : use_list_) {
273
274
275
276
277
      if (buffer.scope() == "local.fragment") {
        ICHECK_NE(layout_map.count(buffer), 0)
            << "The layout for fragment " << buffer
            << " can not be inferred correctly.";
      }
278
279
    }

280
    // Collect layout info for For nodes
281
282
    Map<For, Fragment> for_map;
    Map<For, PrimExpr> predicate_map;
283
    for (auto &base_infer : infer_list_) {
284
285
286
287
      // Check if base_infer is valid
      ICHECK(base_infer != nullptr) << "Null pointer encountered in "
                                       "infer_list_ while collecting for_map.";

288
      if (auto for_infer = dynamic_cast<ParallelOp *>(base_infer.get())) {
289
        // Check that the loop layout is defined
290
        ICHECK(for_infer->GetLoopLayout().defined())
291
            << "The Layout for Parallel for cannot be inferred correctly:\n"
292
293
            << for_infer->GetRoot();
        for_map.Set(for_infer->GetRoot(), for_infer->GetLoopLayout());
294
295
296
297
298
299

        // thread_var_ should be defined if we rely on it
        ICHECK(thread_var_.defined())
            << "thread_var_ is not defined. Cannot retrieve predicate.";

        if (auto predicate = for_infer->GetPredicate(thread_var_->var)) {
300
          predicate_map.Set(for_infer->GetRoot(), predicate.value());
301
        }
302
303
304
305
306
307
      }
    }

    return {layout_map, for_map, predicate_map};
  }

308
309
  void Collect(const PrimFunc &f) {
    for (const auto &[_, buffer] : f->buffer_map) {
310
311
312
      buffer_data_to_buffer_.Set(buffer->data, buffer);
    }
    auto target = f->GetAttr<Target>(tvm::attr::kTarget);
313
314
    ICHECK(target.defined())
        << "Layout_Inference: Require the target attribute";
315
316
317
318
    target_ = target.value();
    this->operator()(f->body);
  }

319
320
private:
  void VisitExpr_(const CallNode *op) final {
321
322
    StmtExprVisitor::VisitExpr_(op);
    // Do not analysis the call node to the global function.
323
324
    if (op->op.as<GlobalVarNode>())
      return;
325
326
327

    auto p = ParseOperator(GetRef<Call>(op), buffer_data_to_buffer_);
    if (p != nullptr) {
328
      for (const auto &arg : op->args) {
329
330
331
332
333
334
335
336
337
        if (auto buffer = getBufferFromAccessPtr(arg)) {
          addToUseList(buffer.value());
        }
      }
      infer_list_.push_back(std::move(p));
      thread_var_vec_.push_back(thread_var_);
    }
  }

338
  Optional<Buffer> getBufferFromAccessPtr(const PrimExpr &expr) {
339
340
341
342
343
344
345
346
    auto call = expr.as<CallNode>();
    if (call && call->op.same_as(builtin::tvm_access_ptr())) {
      auto var = call->args[1].as<Var>().value();
      return buffer_data_to_buffer_[var];
    }
    return NullOpt;
  }

347
  void addToUseList(const Buffer &buffer) {
348
349
350
351
352
353
354
    int infer_idx = infer_list_.size();
    if (use_list_.find(buffer) == use_list_.end()) {
      use_list_[buffer] = {};
    }
    use_list_[buffer].push_back(infer_idx);
  }

355
  void VisitStmt_(const ForNode *op) final {
356
357
    if (op->kind == ForKind::kParallel) {
      auto infer = std::make_unique<ParallelOp>(GetRef<For>(op));
358
      for (const auto &[buffer, _] : infer->GetIndiceMap()) {
359
360
361
362
363
364
365
366
367
        addToUseList(buffer);
      }
      infer_list_.push_back(std::move(infer));
      thread_var_vec_.push_back(thread_var_);
    } else {
      StmtExprVisitor::VisitStmt(op->body);
    }
  }

368
  void VisitStmt_(const BlockNode *op) final {
369
370
371
372
    for (auto buffer : op->alloc_buffers) {
      buffer_data_to_buffer_.Set(buffer->data, buffer);
    }
    if (op->annotations.count(attr::kLayoutMap)) {
373
374
375
      auto map =
          op->annotations.Get(attr::kLayoutMap).as<Map<Var, Layout>>().value();
      for (const auto &[var, layout] : map) {
376
377
378
379
380
381
382
383
        auto buffer = buffer_data_to_buffer_[var];
        ICHECK(StructuralEqual()(layout->InputShape(), buffer->shape));
        annotated_layout_map_.Set(buffer, layout);
      }
    }
    StmtExprVisitor::VisitStmt_(op);
  }

384
  void VisitStmt_(const AttrStmtNode *op) final {
385
386
387
388
389
390
391
392
393
394
395
396
    if (op->attr_key == tir::attr::thread_extent) {
      IterVar iv = Downcast<IterVar>(op->node);
      if (iv->thread_tag == "threadIdx.x") {
        ICHECK(iv->dom->extent.as<IntImmNode>());
        thread_var_ = iv;
      }
    }
    StmtExprVisitor::VisitStmt_(op);
  }

  Map<Var, Buffer> buffer_data_to_buffer_;
  std::vector<std::unique_ptr<Operator>> infer_list_;
397
398
  std::unordered_map<Buffer, std::vector<int>, ObjectPtrHash, ObjectPtrEqual>
      use_list_;
399
400
401
402
  IterVar thread_var_;
  std::vector<IterVar> thread_var_vec_;
  Target target_;
  LayoutMap annotated_layout_map_;
403
  bool skip_thread_partition_{false};
404
405
406
};

class LayoutInferencer : public IRMutatorWithAnalyzer {
407
public:
408
  static PrimFunc Substitute(PrimFunc f, bool skip_thread_partition = false) {
409
    arith::Analyzer analyzer;
410
    PrimFuncNode *fptr = f.CopyOnWrite();
411
    fptr->body = ParallelLoopFuser::Fuse(f->body);
412
    BufferUseDefCollector collector(skip_thread_partition);
413
414
    collector.Collect(f);
    auto result = collector.Run();
415
    LayoutInferencer substituter(result, skip_thread_partition, &analyzer);
416
417
418
419
    fptr->body = substituter.VisitStmt(f->body);
    return f;
  }

420
421
private:
  LayoutInferencer(const LayoutInferenceResult result,
422
423
424
                   bool skip_thread_partition, arith::Analyzer *analyzer)
      : arith::IRMutatorWithAnalyzer(analyzer), result_(result),
        skip_thread_partition_(skip_thread_partition){};
425

426
  Stmt VisitStmt_(const BlockNode *op) final {
427
428
429
430
431
432
433
434
435
436
437
438
439
    Block block = Downcast<Block>(IRMutatorWithAnalyzer::VisitStmt_(op));

    for (auto buffer : block->alloc_buffers) {
      if (buffer.scope() == "local.framgent") {
        ICHECK(result_.layout_map.count(buffer))
            << "Cannot inference fragment layout for " << buffer;
      }
    }
    auto block_ptr = block.CopyOnWrite();
    block_ptr->annotations.Set(attr::kLayoutMap, result_.layout_map);
    return block;
  }

440
  Stmt VisitStmt_(const ForNode *op) final {
441
442
443
    For for_node = Downcast<For>(IRMutatorWithAnalyzer::VisitStmt_(op));
    if (result_.for_map.count(GetRef<For>(op))) {
      auto loop_layout = result_.for_map[GetRef<For>(op)];
444
445
446
447
448
449

      if (!skip_thread_partition_) {
        // If none thread bindings are provided, partition the loop
        for_node =
            PartitionLoop(for_node, thread_var_->var, analyzer_, loop_layout);
      }
450
451
452
453
454
455
456
457
458
459
      for_node = VectorizeLoop(for_node);
      if (result_.predicate_map.count(GetRef<For>(op))) {
        return IfThenElse(result_.predicate_map[GetRef<For>(op)], for_node);
      } else {
        return for_node;
      }
    }
    return for_node;
  }

460
  Stmt VisitStmt_(const AttrStmtNode *op) final {
461
462
463
464
465
466
467
468
469
470
    if (op->attr_key == tir::attr::thread_extent) {
      IterVar iv = Downcast<IterVar>(op->node);
      ICHECK_NE(iv->thread_tag.length(), 0U);
      if (iv->thread_tag == "threadIdx.x") {
        thread_var_ = iv;
      }
    }
    return IRMutatorWithAnalyzer::VisitStmt_(op);
  }

471
private:
472
473
  const LayoutInferenceResult result_;
  IterVar thread_var_;
474
  bool skip_thread_partition_{false};
475
476
477
478
479
};

/*!
 * \brief Create the PrimFunc pass that infers and applies buffer and loop
 *        layouts.
 * \return A pass named "tl.LayoutInference" with opt level 0 and no
 *         required passes.
 */
tvm::transform::Pass LayoutInference() {
  using namespace tir::transform;
  auto pass_func = [=](PrimFunc f, IRModule m, PassContext ctx) {
    AllocateCollector collector;
    collector(f->body);
    // TODO(Lei): This is a hack to avoid the issue of thread partition
    // for cpu backend. We should remove this after we have a better
    // solution for thread partition detect.
    // NOTE(review): the thresholds are "> 1", so a function with exactly one
    // allocation in a category does not count on its own -- confirm this is
    // intended rather than "> 0".
    bool need_thread_partition = (collector.dyn_shmem_allocs_.size() > 1 ||
                                  collector.static_shmem_allocs_.size() > 1 ||
                                  collector.local_fragment_allocs_.size() > 1);
    bool skip_thread_partition = !need_thread_partition;
    return LayoutInferencer::Substitute(std::move(f), skip_thread_partition);
  };
  return CreatePrimFuncPass(pass_func, 0, "tl.LayoutInference", {});
}

// Register LayoutInference as the global function
// "tl.transform.LayoutInference".
TVM_REGISTER_GLOBAL("tl.transform.LayoutInference")
    .set_body_typed(LayoutInference);

} // namespace tl
} // namespace tvm