"...git@developer.sourcefind.cn:yangql/composable_kernel.git" did not exist on "a6e310af7e82e484150b9980c17c2a100e601a53"
Unverified commit d0742860, authored by Gabriel Wu, committed by GitHub

[Chore] fix typos (#719)

* chore: fix typos

* chore: fix ruff

* chore: fix clang-format
parent 6545b084
@@ -53,10 +53,7 @@ def get_configs(args, kwargs):
     from tilelang.carver.roller.rasterization import NoRasterization
     import torch
-    if torch.version.hip is not None:
-        arch=CDNA("hip")
-    else:
-        arch = CUDA("cuda")
+    arch = CUDA("cuda") if torch.version.hip is None else CDNA("hip")
     topk = 10
     carve_template = MatmulTemplate(
......
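The same four-line `if/else` collapses into a single conditional expression at several call sites in this commit. As a standalone sketch of the pattern (assuming, per the imports shown elsewhere in this diff, that `CUDA` and `CDNA` are constructed from a target string):

```python
import torch
from tilelang.carver.arch import CUDA, CDNA

# torch.version.hip is None on CUDA builds of PyTorch and holds a version
# string on ROCm builds, so it serves as a cheap backend probe.
arch = CUDA("cuda") if torch.version.hip is None else CDNA("hip")
```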
@@ -187,10 +187,7 @@ def get_configs(args, kwargs):
     from tilelang.carver.roller.rasterization import NoRasterization
     import torch
-    if torch.version.hip is not None:
-        arch=CDNA("hip")
-    else:
-        arch = CUDA("cuda")
+    arch = CUDA("cuda") if torch.version.hip is None else CDNA("hip")
     topk = 10
     carve_template = MatmulTemplate(
......
@@ -252,7 +252,7 @@ def splitk_gemv_vectorized(
     return main
 ```
-With vectorized read, now the kernel finishs in **~0.0084 ms**, which is getting close to cuBLAS performance.
+With vectorized read, now the kernel finishes in **~0.0084 ms**, which is getting close to cuBLAS performance.
 ## `tvm_thread_allreduce` Instead of `atomicAdd`
......
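A quick way to reproduce a figure like ~0.0084 ms and compare against cuBLAS (reached through `torch.matmul`) is CUDA event timing. A minimal sketch, where `gemv_kernel`, `A`, and `x` are illustrative names rather than objects from this diff:

```python
import torch

def time_ms(fn, iters=100):
    # CUDA events measure device-side time, excluding Python launch overhead.
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    for _ in range(10):  # warm-up
        fn()
    start.record()
    for _ in range(iters):
        fn()
    end.record()
    torch.cuda.synchronize()
    return start.elapsed_time(end) / iters  # elapsed_time returns milliseconds

# e.g. compare: time_ms(lambda: gemv_kernel(A, x)) vs. time_ms(lambda: A @ x)
```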
@@ -4,6 +4,7 @@ from tilelang.carver.arch import CUDA
 from tilelang.carver.arch import CDNA
 from tilelang.layout import make_swizzled_layout
 import torch
+
 N = 64
 C = 256
 H = 512
@@ -95,10 +96,7 @@ def kernel(N,
 def main():
     my_func = kernel(N, C, H, W, F, K, S, D, P, 64, 128, 32, 3, 256)
-    if torch.version.hip is not None:
-        cuda_device=CDNA("hip")
-    else:
-        cuda_device = CUDA("cuda")
+    cuda_device = CUDA("cuda") if torch.version.hip is None else CDNA("hip")
     result = Analyzer.analysis(my_func, cuda_device)
     print(result)
     print(f"Analyzed FLOPs: {result.total_flops}")
......
@@ -49,10 +49,7 @@ def kernel(
 def main():
     my_func = kernel(128, 128, 32, 3, 128, True)
-    if torch.version.hip is not None:
-        cuda_device=CDNA("hip")
-    else:
-        cuda_device = CUDA("cuda")
+    cuda_device = CUDA("cuda") if torch.version.hip is None else CDNA("hip")
     result = Analyzer.analysis(my_func, cuda_device)
     print(f"Analyzed FLOPs: {result.total_flops}")
......
@@ -1373,7 +1373,7 @@ class BitnetForCausalLM(BitnetPreTrainedModel):
                 cache_length + input_ids.shape[1] > max_cache_length):
             attention_mask = attention_mask[:, -max_cache_length:]
-        position_ids = kwargs.get("position_ids", None)
+        position_ids = kwargs.get("position_ids")
         if attention_mask is not None and position_ids is None:
             # create position_ids on the fly for batch generation
             position_ids = attention_mask.long().cumsum(-1) - 1
......
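`dict.get(key)` already defaults to `None`, so the explicit default was redundant. The cumsum trick in the context lines derives token positions from the attention mask, so left padding does not shift positions; a standalone illustration (the tensor values are made up):

```python
import torch

# one left-padded row and one full row
attention_mask = torch.tensor([[0, 0, 1, 1, 1],
                               [1, 1, 1, 1, 1]])
position_ids = attention_mask.long().cumsum(-1) - 1
# padded slots get a dummy position (they are masked out anyway)
position_ids.masked_fill_(attention_mask == 0, 1)
print(position_ids)
# tensor([[1, 1, 0, 1, 2],
#         [0, 1, 2, 3, 4]])
```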
@@ -16,10 +16,7 @@ def ref_program(A, B):
 def get_configs(M, N, K, with_roller=False, topk=20):
     if with_roller:
-        if torch.version.hip is not None:
-            arch=CDNA("hip")
-        else:
-            arch = CUDA("cuda")
+        arch = CUDA("cuda") if torch.version.hip is None else CDNA("hip")
         carve_template = MatmulTemplate(
             M=M,
             N=N,
......
@@ -230,7 +230,7 @@ Stmt GemmSP::Lower(const LowerArgs &T, arith::Analyzer *analyzer) const {
         << " and " << B.scope();
   ICHECK((E.scope() == "shared" || E.scope() == "shared.dyn"))
       << "Only support shared.dyn scope for E as copy from smem to rmem are "
-         "delegated to cute implemntation, found "
+         "delegated to cute implementation, found "
       << E.scope();
   ss << op_name << "<" << M << ", " << N << ", " << K << ", ";
   ss << warp_m << ", " << warp_n << ", ";
......
@@ -95,7 +95,7 @@ private:
   Array<String> function_names_;
   /*! \brief whether to emit asserts in the resulting C code */
   bool emit_asserts_;
-  /*! \brief whether to emit forwared function declarations in the resulting C
+  /*! \brief whether to emit forward function declarations in the resulting C
   * code */
   bool emit_fwd_func_decl_;
......
@@ -252,9 +252,9 @@ CodeGenTileLangWebGPU::AddFunction(const PrimFunc &f, bool skip_readonly_decl) {
   os_param_access << "]";
   func_info.launch_param_tags.push_back(os_param_access.str());
-  ICHECK(!info.has_block_index_z)
-      << "blockIdx.z is not supported in WebGPU to accomodate large blockIdx.x";
-  // anotate workgroup
+  ICHECK(!info.has_block_index_z) << "blockIdx.z is not supported in WebGPU to "
+                                     "accommodate large blockIdx.x";
+  // annotate workgroup
   this->stream << "@compute @workgroup_size(" << info.workgroup_size[0] << ", "
                << info.workgroup_size[1] << ", " << info.workgroup_size[2]
                << ")\n";
......
@@ -284,7 +284,7 @@
 #endif
 #ifndef HALF_ENABLE_F16C_INTRINSICS
-/// Enable F16C intruction set intrinsics.
+/// Enable F16C instruction set intrinsics.
 /// Defining this to 1 enables the use of [F16C compiler
 /// intrinsics](https://en.wikipedia.org/wiki/F16C) for converting between
 /// half-precision and single-precision values which may result in improved
@@ -1674,7 +1674,7 @@ template <typename T> T half2float(unsigned int value) {
 /// \tparam R rounding mode to use
 /// \tparam E `true` for round to even, `false` for round away from zero
 /// \tparam I `true` to raise INEXACT exception (if inexact), `false` to never
-/// raise it \tparam T type to convert to (buitlin integer type with at least 16
+/// raise it \tparam T type to convert to (builtin integer type with at least 16
 /// bits precision, excluding any implicit sign bits) \param value
 /// half-precision value to convert \return rounded integer value \exception
 /// FE_INVALID if value is not representable in type \a T \exception FE_INEXACT
@@ -1778,7 +1778,7 @@ inline uint32 divide64(uint32 x, uint32 y, int &s) {
 /// \tparam R `true` to compute signed remainder, `false` for positive remainder
 /// \param x first operand as positive finite half-precision value
 /// \param y second operand as positive finite half-precision value
-/// \param quo adress to store quotient at, `nullptr` if \a Q `false`
+/// \param quo address to store quotient at, `nullptr` if \a Q `false`
 /// \return modulus of \a x / \a y
 template <bool Q, bool R>
 unsigned int mod(unsigned int x, unsigned int y, int *quo = NULL) {
@@ -2435,7 +2435,7 @@ template <typename, typename, std::float_round_style> struct half_caster;
 /// Half-precision floating-point type.
 /// This class implements an IEEE-conformant half-precision floating-point type
 /// with the usual arithmetic operators and conversions. It is implicitly
-/// convertible to single-precision floating-point, which makes artihmetic
+/// convertible to single-precision floating-point, which makes arithmetic
 /// expressions and functions with mixed-type operands to be of the most precise
 /// operand type.
 ///
@@ -2445,9 +2445,9 @@ template <typename, typename, std::float_round_style> struct half_caster;
 /// which means it can be standard-conformantly copied using raw binary copies.
 /// But in this context some more words about the actual size of the type.
 /// Although the half is representing an IEEE 16-bit type, it does not
-/// neccessarily have to be of exactly 16-bits size. But on any reasonable
+/// necessarily have to be of exactly 16-bits size. But on any reasonable
 /// implementation the actual binary representation of this type will most
-/// probably not ivolve any additional "magic" or padding beyond the simple
+/// probably not involve any additional "magic" or padding beyond the simple
 /// binary representation of the underlying 16-bit IEEE number, even if not
 /// strictly guaranteed by the standard. But even then it only has an actual
 /// size of 16 bits if your C++ implementation supports an unsigned integer type
@@ -2801,7 +2801,7 @@ public:
   static HALF_CONSTEXPR_CONST bool traps = true;
 #else
   /// Traps only if [HALF_ERRHANDLING_THROW_...](\ref
-  /// HALF_ERRHANDLING_THROW_INVALID) is acitvated.
+  /// HALF_ERRHANDLING_THROW_INVALID) is activated.
   static HALF_CONSTEXPR_CONST bool traps = false;
 #endif
@@ -5067,7 +5067,7 @@ inline half frexp(half arg, int *exp) {
 /// [std::scalbln](https://en.cppreference.com/w/cpp/numeric/math/scalbn).
 /// \param arg number to modify
 /// \param exp power of two to multiply with
-/// \return \a arg multplied by 2 raised to \a exp
+/// \return \a arg multiplied by 2 raised to \a exp
 /// \exception FE_INVALID for signaling NaN
 /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
 inline half scalbln(half arg, long exp) {
@@ -5096,7 +5096,7 @@ inline half scalbln(half arg, long exp) {
 /// **See also:** Documentation for
 /// [std::scalbn](https://en.cppreference.com/w/cpp/numeric/math/scalbn). \param
 /// arg number to modify \param exp power of two to multiply with \return \a arg
-/// multplied by 2 raised to \a exp \exception FE_INVALID for signaling NaN
+/// multiplied by 2 raised to \a exp \exception FE_INVALID for signaling NaN
 /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
 inline half scalbn(half arg, int exp) { return scalbln(arg, exp); }
@@ -5106,7 +5106,7 @@ inline half scalbn(half arg, int exp) { return scalbln(arg, exp); }
 /// **See also:** Documentation for
 /// [std::ldexp](https://en.cppreference.com/w/cpp/numeric/math/ldexp). \param
 /// arg number to modify \param exp power of two to multiply with \return \a arg
-/// multplied by 2 raised to \a exp \exception FE_INVALID for signaling NaN
+/// multiplied by 2 raised to \a exp \exception FE_INVALID for signaling NaN
 /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
 inline half ldexp(half arg, int exp) { return scalbln(arg, exp); }
@@ -5379,7 +5379,7 @@ inline HALF_CONSTEXPR bool islessequal(half x, half y) {
          !isnan(x) && !isnan(y);
 }
-/// Quiet comarison for less or greater.
+/// Quiet comparison for less or greater.
 /// **See also:** Documentation for
 /// [std::islessgreater](https://en.cppreference.com/w/cpp/numeric/math/islessgreater).
 /// \param x first operand
@@ -5503,7 +5503,7 @@ inline int feraiseexcept(int excepts) {
 ///
 /// **See also:** Documentation for
 /// [std::fegetexceptflag](https://en.cppreference.com/w/cpp/numeric/fenv/feexceptflag).
-/// \param flagp adress to store flag state at
+/// \param flagp address to store flag state at
 /// \param excepts OR of flags to save
 /// \retval 0 for success
 inline int fegetexceptflag(int *flagp, int excepts) {
@@ -5520,7 +5520,7 @@ inline int fegetexceptflag(int *flagp, int excepts) {
 ///
 /// **See also:** Documentation for
 /// [std::fesetexceptflag](https://en.cppreference.com/w/cpp/numeric/fenv/feexceptflag).
-/// \param flagp adress to take flag state from
+/// \param flagp address to take flag state from
 /// \param excepts OR of flags to restore
 /// \retval 0 for success
 inline int fesetexceptflag(const int *flagp, int excepts) {
......
@@ -48,7 +48,7 @@ using int4_t = int4;
   } \
   } while (0)
-// abs function for bfloat_t and half_t since there is no implicit convertion
+// abs function for bfloat_t and half_t since there is no implicit conversion
 // method
 TL_PATCH TL_DEVICE half_t __habs(const half_t x) {
   return half_t(__habs(x.to_half()));
......
@@ -118,7 +118,7 @@ debug_print_buffer_value<signed char>(const char *msg, const char *buf_name,
                threadIdx.z, buf_name, index, var);
 }
-// Specialization for unsiged char type
+// Specialization for unsigned char type
 template <>
 __device__ void
 debug_print_buffer_value<unsigned char>(const char *msg, const char *buf_name,
......
/*!
 * \file atomicadd_vectorize.cc
- * \brief A tool to atomatically vectorize atomic add
+ * \brief A tool to automatically vectorize atomic add
 */
#include "../layout/layout.h"
......
@@ -303,7 +303,7 @@ private:
   bool IsAppropriateSharedMemory(const Var &var) {
     return is_dynamic_ ? IsDynamicSharedMemory(var) : IsStaticSharedMemory(var);
   }
-  // Whether do dyanmic analysis.
+  // Whether do dynamic analysis.
   bool is_dynamic_{true};
   // Whether do aggressive merge.
   bool enable_aggressive_merge_{false};
@@ -435,7 +435,7 @@ private:
       const AllocateNode *alloc = shmem_allocs_[buffer];
       auto alignment = align[i];
       // Modern nvidia architecture performs hardware swizzling (hopper
-      // wgmma/tma for exmaple) requires dynamic shared memory address to
+      // wgmma/tma for example) requires dynamic shared memory address to
       // be aligned to 1024 bytes For other devices, we align to 16 bytes
       if (shmem_alignment_map_.find(buffer) !=
           shmem_alignment_map_.end()) {
@@ -943,7 +943,7 @@ private:
   */
  StorageEntry *NewAlloc(const AllocateNode *op, size_t const_nbits) {
    ICHECK(op != nullptr);
-    // Re-use not successful, allocate a new buffer.
+    // Reuse not successful, allocate a new buffer.
    StorageEntry *entry = arena_.make<StorageEntry>();
    entry->allocs.push_back({op->buffer_var.get()});
    entry->const_nbits = const_nbits;
@@ -1046,7 +1046,7 @@ private:
      sym_free_list_.push_back(e);
    }
  }
-  // Wheather enable dyanmic analysis.
+  // Whether enable dynamic analysis.
  bool is_dynamic_{true};
  // Whether enable verbose logging.
......
@@ -140,9 +140,9 @@ public:
 //
 class LinearAccessPatternFinder final : public StmtExprVisitor {
 public:
-  /*! \brief record the touch hist of statment. */
+  /*! \brief record the touch hist of statement. */
   struct StmtEntry {
-    // The statment
+    // The statement
     const Object *stmt;
     // The index in the linear_seq_ to point to end of the nested scope.
     // This is only set to non-zero if stmt is a nested scope.
@@ -150,7 +150,7 @@ public:
     // offset if offset < 0, means this is the end, the begin entry is
     // current_index + offset
     int64_t scope_pair_offset{0};
-    // The buffer variables this statment touched.
+    // The buffer variables this statement touched.
     std::vector<const VarNode *> touched;
   };
   // The scope of each allocation
@@ -675,7 +675,7 @@ private:
           scope.tag != ".workspace" && scope.tag != ".vtcm";
  }
-  // Alllocate entry of node.
+  // Allocate entry of node.
  // Event entry in liveness analysis
  struct EventEntry {
    // variables we generate
@@ -785,10 +785,10 @@ private:
     for (const AllocateNode *op : e->allocs) {
       ICHECK_EQ(op->extents.size(), 1)
           << "Buffer var " << op->buffer_var->name_hint
-          << " was identified as a re-usable allocation, but has "
+          << " was identified as a reusable allocation, but has "
           << op->extents.size() << " physical dimensions. "
           << "Currently, only flat 1-d memory spaces should be "
-             "identified as re-usable "
+             "identified as reusable "
             "allocations.";
       PrimExpr sz = op->extents[0];
       auto nbits = op->dtype.bits() * op->dtype.lanes();
@@ -905,7 +905,7 @@ private:
   void PlanNewScope(const Object *op) {
     if (thread_scope_ != nullptr) {
       ICHECK(thread_scope_ == op);
-      // erase all memory atatched to this scope.
+      // erase all memory attached to this scope.
       for (auto it = const_free_map_.begin(); it != const_free_map_.end();) {
         if (it->second->attach_scope_ == op) {
           it = const_free_map_.erase(it);
@@ -1023,7 +1023,7 @@ private:
   StorageEntry *NewAlloc(const AllocateNode *op, const Object *attach_scope,
                          const StorageScope &scope, size_t const_nbits) {
     ICHECK(op != nullptr);
-    // Re-use not successful, allocate a new buffer.
+    // Reuse not successful, allocate a new buffer.
     auto entry = std::make_unique<StorageEntry>();
     entry->attach_scope_ = attach_scope;
     entry->scope = scope;
@@ -1050,7 +1050,7 @@ private:
     // have its own allocation with size determined at runtime.
     bool is_known_size = (const_nbits != 0);
-    // Currently, only flat memory spaces can be re-used. Packing
+    // Currently, only flat memory spaces can be reused. Packing
     // into N-d space (e.g. 2-d texture memory on GPUs) will require
     // more in-depth algorithms.
     bool is_flat_memory_space = (num_physical_dimensions == 1);
......
@@ -189,7 +189,7 @@ protected:
       }
     }
   }
-  // return the exposed entries, remove unecessary ones.
+  // return the exposed entries, remove unnecessary ones.
   int sync_count = 0;
   // head are before first sync, tail are after last sync
   std::vector<AccessEntry> head, tail;
......
@@ -527,7 +527,7 @@ public:
     // A single var can be binded in multiple lets
     // but they have to bind to the same value.
     // This is used to allow cases when we reuse a single let
-    // expression to cosntruct a nested expr.
+    // expression to construct a nested expr.
     // (let x = 1 in x + 1) * (let x = 1 in x + 1)
     auto it = let_binding_.find(op->var);
     if (it != let_binding_.end()) {
@@ -683,7 +683,7 @@ public:
     return StmtMutator::VisitStmt_(op);
   }
-  // scalarize the statment
+  // scalarize the statement
   Stmt Scalarize(Stmt stmt) {
     Var idx(var_->name_hint + ".s", var_->dtype);
     stmt = Substitute(stmt, {{var_, idx}});
@@ -701,7 +701,7 @@ private:
   PrimExpr var_lanes_;
   // ramp representing the var.
   PrimExpr ramp_;
-  // flag to mark requirment of scalarization.
+  // flag to mark requirement of scalarization.
   bool need_scalarize_{false};
   // Let binding
   std::unordered_map<Var, PrimExpr, ObjectPtrHash, ObjectPtrEqual> let_binding_;
......
@@ -88,6 +88,7 @@ def reshape_test_smem_2d_2_1d(N, M, dtype):
     return main

+
 def run_reshape_smem_2d_2_1d(N, M, dtype):
     program = reshape_test_smem_2d_2_1d(N, M, dtype)
     jit_kernel = tl.compile(program, out_idx=-1)
@@ -98,11 +99,11 @@ def run_reshape_smem_2d_2_1d(N, M, dtype):
     profiler.assert_allclose(ref_program, atol=1e-2, rtol=1e-2)

 def test_reshape_smem_2d_2_1d():
     run_reshape_smem_2d_2_1d(1024, 32, "float32")
     run_reshape_smem_2d_2_1d(2048, 64, "float16")

 if __name__ == "__main__":
     tilelang.testing.main()
@@ -203,7 +203,7 @@ class AutoTuner:
             logger.warning(
                 "`supply_prog` will be ignored as this program is under `with set_autotune_inputs` context."
             )
-            supply_prog = lambda _: get_autotune_inputs()  # noqa: E731·
+            supply_prog = lambda _: get_autotune_inputs()  # noqa: E731
         self.profile_args = ProfileArgs(
             supply_type=supply_type,
......
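The `# noqa: E731` suppresses ruff's "do not assign a lambda expression, use a def" rule; the change itself only strips trailing whitespace (rendered as `·` above). For reference, the `def` form that ruff would otherwise suggest is simply:

```python
def supply_prog(_):
    return get_autotune_inputs()
```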