[CI] change spell checker from codespell to typos (#18711)

Signed-off-by: Andy Xie <andy.xning@gmail.com>

[CI] change spell checker from codespell to typos (#18711)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2f1c19b2 · Ning Xie · GitHub · 42f52cc9 · 2f1c19b2 · 2f1c19b2
Unverified commit 2f1c19b2 · authored Jun 12, 2025 by Ning Xie · committed by GitHub Jun 11, 2025
20 changed files
--- a/.gitignore
+++ b/.gitignore
@@ -200,5 +200,5 @@ benchmarks/**/*.json
 actionlint
 shellcheck*/
-# Ingore moe/marlin_moe gen code
+# Ignore moe/marlin_moe gen code
 csrc/moe/marlin_moe_wna16/kernel_*
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -20,12 +20,10 @@ repos:
    args: [--output-format, github, --fix]
  - id: ruff-format
    files: ^(.buildkite|benchmarks|examples)/.*
- repo: https://github.com/codespell-project/codespell
+- repo: https://github.com/crate-ci/typos
-  rev: v2.4.1
+  rev: v1.32.0
  hooks:
-  - id: codespell
+  - id: typos
-    additional_dependencies: ['tomli']
-    args: ['--toml', 'pyproject.toml']
 - repo: https://github.com/PyCQA/isort
  rev: 6.0.1
  hooks:

--- a/csrc/cpu/attention.cpp
+++ b/csrc/cpu/attention.cpp
@@ -137,8 +137,8 @@ FORCE_INLINE std::pair<T, T> reduceSoftmaxAlibi(T* data, const int size,
 }
 template <typename T>
-FORCE_INLINE void reducePartitonSoftmax(const T* max_data, T* sum_data,
+FORCE_INLINE void reducePartitionSoftmax(const T* max_data, T* sum_data,
-                                        const int size) {
+                                         const int size) {
  T max = max_data[0];
  for (int i = 1; i < size; ++i) {
    max = max >= max_data[i] ? max : max_data[i];
@@ -634,7 +634,7 @@ struct paged_attention_v2_impl {
        if (partition_num == 1) continue;
-        reducePartitonSoftmax(
+        reducePartitionSoftmax(
            max_logits + seq_idx * num_heads * max_num_partitions +
                head_idx * max_num_partitions,
            exp_sums + seq_idx * num_heads * max_num_partitions +

--- a/csrc/cpu/cpu_types_x86.hpp
+++ b/csrc/cpu/cpu_types_x86.hpp
@@ -83,7 +83,7 @@ struct FP16Vec16 : public Vec<FP16Vec16> {
  explicit FP16Vec16(const void* ptr)
      : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {}
-  // non-temproal load
+  // non-temporal load
  explicit FP16Vec16(bool, void* ptr)
      : reg(_mm256_stream_load_si256((__m256i*)ptr)) {}
@@ -120,7 +120,7 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
  explicit BF16Vec16(const void* ptr)
      : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {}
-  // non-temproal load
+  // non-temporal load
  explicit BF16Vec16(bool, void* ptr)
      : reg(_mm256_stream_load_si256((__m256i*)ptr)) {}
@@ -327,7 +327,7 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
  // normal load
  explicit FP32Vec16(const float* ptr) : reg(_mm512_loadu_ps(ptr)) {}
-  // non-temproal load
+  // non-temporal load
  explicit FP32Vec16(bool, void* ptr)
      : reg((__m512)_mm512_stream_load_si512(ptr)) {}
@@ -576,7 +576,7 @@ struct INT8Vec64 : public Vec<INT8Vec64> {
  // normal load
  explicit INT8Vec64(void* ptr) : reg(_mm512_loadu_epi8(ptr)) {}
-  // non-temproal load
+  // non-temporal load
  explicit INT8Vec64(bool, void* ptr) : reg(_mm512_stream_load_si512(ptr)) {}
  void save(void* ptr) const { _mm512_storeu_epi8(ptr, reg); }
@@ -587,7 +587,7 @@ struct INT8Vec64 : public Vec<INT8Vec64> {
    _mm512_mask_storeu_epi8(ptr, mask, reg);
  }
-  // non-temproal save
+  // non-temporal save
  void nt_save(int8_t* ptr) { _mm512_stream_si512((__m512i*)ptr, reg); }
 };
 #endif

--- a/csrc/moe/moe_permute_unpermute_op.cu
+++ b/csrc/moe/moe_permute_unpermute_op.cu
@@ -12,7 +12,7 @@ void moe_permute(
    const torch::Tensor& input,                      // [n_token, hidden]
    const torch::Tensor& topk_weights,               //[n_token, topk]
    torch::Tensor& topk_ids,                         // [n_token, topk]
-    const torch::Tensor& token_expert_indicies,      // [n_token, topk]
+    const torch::Tensor& token_expert_indices,       // [n_token, topk]
    const std::optional<torch::Tensor>& expert_map,  // [n_expert]
    int64_t n_expert, int64_t n_local_expert, int64_t topk,
    const std::optional<int64_t>& align_block_size,
@@ -27,15 +27,15 @@ void moe_permute(
              "expert_first_token_offset must be int64");
  TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int,
              "topk_ids must be int32");
-  TORCH_CHECK(token_expert_indicies.scalar_type() == at::ScalarType::Int,
+  TORCH_CHECK(token_expert_indices.scalar_type() == at::ScalarType::Int,
-              "token_expert_indicies must be int32");
+              "token_expert_indices must be int32");
  TORCH_CHECK(src_row_id2dst_row_id_map.scalar_type() == at::ScalarType::Int,
              "src_row_id2dst_row_id_map must be int32");
  TORCH_CHECK(expert_first_token_offset.size(0) == n_local_expert + 1,
              "expert_first_token_offset shape != n_local_expert+1")
  TORCH_CHECK(
-      src_row_id2dst_row_id_map.sizes() == token_expert_indicies.sizes(),
+      src_row_id2dst_row_id_map.sizes() == token_expert_indices.sizes(),
-      "token_expert_indicies shape must be same as src_row_id2dst_row_id_map");
+      "token_expert_indices shape must be same as src_row_id2dst_row_id_map");
  auto n_token = input.sizes()[0];
  auto n_hidden = input.sizes()[1];
  auto align_block_size_value =
@@ -71,7 +71,7 @@ void moe_permute(
                             expert_map_ptr, n_expert, stream);
  }
  // expert sort topk expert id and scan expert id get expert_first_token_offset
-  sortAndScanExpert(get_ptr<int>(topk_ids), get_ptr<int>(token_expert_indicies),
+  sortAndScanExpert(get_ptr<int>(topk_ids), get_ptr<int>(token_expert_indices),
                    get_ptr<int>(permuted_experts_id),
                    get_ptr<int>(dst_row_id2src_row_id_map),
                    get_ptr<int64_t>(expert_first_token_offset), n_token,
@@ -190,7 +190,7 @@ void shuffle_rows(const torch::Tensor& input_tensor,
 void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights,
                 torch::Tensor& topk_ids,
-                 const torch::Tensor& token_expert_indicies,
+                 const torch::Tensor& token_expert_indices,
                 const std::optional<torch::Tensor>& expert_map,
                 int64_t n_expert, int64_t n_local_expert, int64_t topk,
                 const std::optional<int64_t>& align_block_size,
@@ -203,7 +203,7 @@ void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights,
 void moe_unpermute(const torch::Tensor& input,
                   const torch::Tensor& topk_weights, torch::Tensor& topk_ids,
-                   const torch::Tensor& token_expert_indicies,
+                   const torch::Tensor& token_expert_indices,
                   const std::optional<torch::Tensor>& expert_map,
                   int64_t n_expert, int64_t n_local_expert, int64_t topk,
                   const std::optional<int64_t>& align_block_size,

--- a/csrc/moe/topk_softmax_kernels.cu
+++ b/csrc/moe/topk_softmax_kernels.cu
@@ -425,7 +425,7 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
 #define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB)                       \
    topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB>(         \
-        gating_output, nullptr, topk_weights, topk_indicies,            \
+        gating_output, nullptr, topk_weights, topk_indices,            \
        token_expert_indices, num_tokens, topk, 0, num_experts,         \
        stream);
@@ -433,7 +433,7 @@ template <typename IndType>
 void topkGatingSoftmaxKernelLauncher(
    const float* gating_output,
    float* topk_weights,
-    IndType* topk_indicies,
+    IndType* topk_indices,
    int* token_expert_indices,
    float* softmax_workspace,
    const int num_tokens,
@@ -476,7 +476,7 @@ void topkGatingSoftmaxKernelLauncher(
            moeSoftmax<TPB><<<num_tokens, TPB, 0, stream>>>(
                gating_output, nullptr, softmax_workspace, num_experts);
            moeTopK<TPB><<<num_tokens, TPB, 0, stream>>>(
-                softmax_workspace, nullptr, topk_weights, topk_indicies, token_expert_indices,
+                softmax_workspace, nullptr, topk_weights, topk_indices, token_expert_indices,
                num_experts, topk, 0, num_experts);
        }
    }

--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@@ -66,7 +66,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
  m.def(
      "moe_permute(Tensor input, Tensor topk_weight, Tensor! topk_ids,"
-      "Tensor token_expert_indicies, Tensor? expert_map, int n_expert,"
+      "Tensor token_expert_indices, Tensor? expert_map, int n_expert,"
      "int n_local_expert,"
      "int topk, int? align_block_size,Tensor! permuted_input, Tensor! "
      "expert_first_token_offset, Tensor! src_row_id2dst_row_id_map, Tensor! "

--- a/csrc/quantization/machete/machete_mainloop.cuh
+++ b/csrc/quantization/machete/machete_mainloop.cuh
@@ -1003,7 +1003,7 @@ struct MacheteCollectiveMma {
    static constexpr int A_CPY_VEC =
        decltype(max_common_vector(tCsA, tCrA_load)){};
-    static constexpr int COVERSION_WIDTH =
+    static constexpr int CONVERSION_WIDTH =
        std::min(A_CPY_VEC, int(size<0>(tCrA_mma)));
    auto load_A_to_registers = [&](int read_stage) {
@@ -1026,8 +1026,8 @@ struct MacheteCollectiveMma {
    // PIPELINED MAIN LOOP
    //
-    auto convert_A = [&, a_vec = Int<COVERSION_WIDTH>{}](int k_block,
+    auto convert_A = [&, a_vec = Int<CONVERSION_WIDTH>{}](int k_block,
-                                                         int read_stage) {
+                                                          int read_stage) {
      load_extra_info_to_registers(partitioned_extra_info,
                                   copy_partitions_extra_info, k_block,
                                   read_stage);

--- a/csrc/rocm/skinny_gemms.cu
+++ b/csrc/rocm/skinny_gemms.cu
@@ -320,7 +320,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
  // Goal is to bring the activation matrix A to the LDS
  // and use it across the lifetime of the work group
  // TODO: When activation matrix is larger than 64 KB
-  //	     then this is not goint to work!
+  //	     then this is not going to work!
  //----------------------------------------------------
  __shared__ scalar_t s[max_lds_len];
@@ -581,7 +581,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
  // Goal is to bring the activation matrix A to the LDS
  // and use it across the lifetime of the work group
  // TODO: When activation matrix is larger than 64 KB
-  //	     then this is not goint to work!
+  //	     then this is not going to work!
  //----------------------------------------------------
  __shared__ scalar_t s[max_lds_len];
@@ -601,7 +601,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
  // int _WvPrGrp = mindiv(N, CuCount * YTILE, WvPrGrp);
  uint32_t m = (blockIdx.x * _WvPrGrp + threadIdx.y) * YTILE;
-  // Check whether there will be fragmenation!
+  // Check whether there will be fragmentation!
  // This will happen only for the last wave!
  if (m < M && (m + YTILE) >= M) {
    uint32_t startColumn = M - YTILE;
@@ -827,7 +827,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
    m += CuCount * _WvPrGrp * YTILE;
-    // Check whether there will be fragmenation!
+    // Check whether there will be fragmentation!
    // This will happen only for the last wave!
    if (m < M && (m + YTILE) >= M) {
      uint32_t startColumn = M - YTILE;
@@ -882,7 +882,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
  // Goal is to bring the activation matrix A to the LDS
  // and use it across the lifetime of the work group
  // TODO: When activation matrix is larger than 64 KB
-  //	     then this is not goint to work!
+  //	     then this is not going to work!
  //----------------------------------------------------
  __shared__ scalar_t s[max_lds_len];
@@ -904,7 +904,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
  //----------------------------------------------------
  uint32_t m = (blockIdx.x * _WvPrGrp + threadIdx.y) * YTILE;
-  // Check whether there will be fragmenation!
+  // Check whether there will be fragmentation!
  // This will happen only for the last wave!
  if (m < M && (m + YTILE) >= M) {
    uint32_t startColumn = M - YTILE;
@@ -1176,7 +1176,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
    m += CuCount * _WvPrGrp * YTILE;
    kBase = 0;
-    // Check whether there will be fragmenation!
+    // Check whether there will be fragmentation!
    // This will happen only for the last wave!
    if (m < M && (m + YTILE) >= M) {
      uint32_t startColumn = M - YTILE;

--- a/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu
+++ b/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu
@@ -277,7 +277,7 @@ CompressorResult cutlass_sparse_compress_sm90(torch::Tensor const& a) {
  uint32_t const m = 1;  // Set M to 1 for compression
  uint32_t const n = a.size(1);
-  // Note: For correctess, the compressed format must be invariant in:
+  // Note: For correctness, the compressed format must be invariant in:
  //  - M, the flattened number of tokens
  //  - Whether output dtype is fp16 or bf16
  //  - CUTLASS epilogues

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -137,10 +137,6 @@ exclude = [
    'vllm/attention/ops/.*\.py$'
 ]
-[tool.codespell]
-ignore-words-list = "dout, te, indicies, subtile, ElementE"
-skip = "tests/models/fixtures/*,tests/prompts/*,benchmarks/sonnet.txt,tests/lora/data/*,build/*,vllm/third_party/*"
 [tool.isort]
 skip_glob = [
    ".buildkite/*",

--- a/tests/compile/test_async_tp.py
+++ b/tests/compile/test_async_tp.py
@@ -223,7 +223,7 @@ def test_async_tp_pass_correctness(
        "VLLM_USE_V1": "1",
    }
-    aysnc_tp_args = [
+    async_tp_args = [
        *common_args,
        "--tensor-parallel-size",
        str(tp_size),
@@ -242,7 +242,7 @@ def test_async_tp_pass_correctness(
    ]
    compare_two_settings(model_id,
-                         aysnc_tp_args,
+                         async_tp_args,
                         tp_args,
                         async_tp_env,
                         tp_env,

--- a/tests/core/block/e2e/test_correctness.py
+++ b/tests/core/block/e2e/test_correctness.py
@@ -437,8 +437,8 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
    "enable_prefix_caching": True,
 }])
 @pytest.mark.parametrize("seed", [1])
-def test_auto_prefix_caching_after_evition_start(baseline_llm_generator,
+def test_auto_prefix_caching_after_eviction_start(baseline_llm_generator,
-                                                 test_llm_generator):
+                                                  test_llm_generator):
    """Verify block manager v2 with auto prefix caching could works normal
    even when eviction started.
    With APC enabled, all blocks are held by native block at the beginning.

--- a/tests/core/block/e2e/test_correctness_sliding_window.py
+++ b/tests/core/block/e2e/test_correctness_sliding_window.py
@@ -33,8 +33,8 @@ BLOCK_SIZE = 16
 @pytest.mark.parametrize("batch_size", [5])
 @pytest.mark.parametrize("seed", [1])
 @pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"])
-def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator,
+def test_sliding_window_retrieval(baseline_llm_generator, test_llm_generator,
-                                 batch_size, seed, backend, monkeypatch):
+                                  batch_size, seed, backend, monkeypatch):
    """
    The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then
    asks for value of one of them (which is outside the sliding window).
@@ -100,7 +100,7 @@ def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator,
 def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed,
                                        backend, monkeypatch):
    """
-    This is similar to test_sliding_window_retrival, however, it doesn't
+    This is similar to test_sliding_window_retrieval, however, it doesn't
    compare against the v1 block manager since v1 doesn't support
    chunked prefill with sliding window.

--- a/tests/core/test_scheduler.py
+++ b/tests/core/test_scheduler.py
@@ -594,8 +594,8 @@ def test_decode_schedule_preempted():
    # should be preempted. 1 will also be preempted.
    budget = create_token_budget()
    output = scheduler._schedule_running(budget, curr_loras)
-    remainig_running = scheduler.running
+    remaining_running = scheduler.running
-    assert len(remainig_running) == 0
+    assert len(remaining_running) == 0
    assert len(output.decode_seq_groups) == 1
    assert len(output.prefill_seq_groups) == 0
    assert output.decode_seq_groups[0].seq_group.request_id == "0"

--- a/tests/entrypoints/openai/test_chat_template.py
+++ b/tests/entrypoints/openai/test_chat_template.py
@@ -16,7 +16,7 @@ chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
 assert chatml_jinja_path.exists()
 # Define models, templates, and their corresponding expected outputs
-MODEL_TEMPLATE_GENERATON_OUTPUT = [
+MODEL_TEMPLATE_GENERATION_OUTPUT = [
    ("facebook/opt-125m", chatml_jinja_path, True, False, """<|im_start|>user
 Hello<|im_end|>
 <|im_start|>assistant
@@ -91,7 +91,7 @@ def test_no_load_chat_template_literallike():
 @pytest.mark.parametrize(
    "model,template,add_generation_prompt,continue_final_message,expected_output",
-    MODEL_TEMPLATE_GENERATON_OUTPUT)
+    MODEL_TEMPLATE_GENERATION_OUTPUT)
 def test_get_gen_prompt(model, template, add_generation_prompt,
                        continue_final_message, expected_output):
    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)

--- a/tests/kernels/attention/test_cache.py
+++ b/tests/kernels/attention/test_cache.py
@@ -72,8 +72,8 @@ def test_copy_blocks(
    # destination blocks.
    assert 2 * num_mappings <= num_blocks
    src_blocks = random.sample(range(num_blocks), num_mappings)
-    remainig_blocks = list(set(range(num_blocks)) - set(src_blocks))
+    remaining_blocks = list(set(range(num_blocks)) - set(src_blocks))
-    dst_blocks = random.sample(remainig_blocks, 2 * num_mappings)
+    dst_blocks = random.sample(remaining_blocks, 2 * num_mappings)
    block_mapping: list[tuple[int, int]] = []
    for i in range(num_mappings):
        src = src_blocks[i]
@@ -189,12 +189,12 @@ def test_reshape_and_cache(
    # Run the reference implementation.
    reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape)
-    block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor")
+    block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor")
-    block_indicies_lst = block_indicies.cpu().tolist()
+    block_indices_lst = block_indices.cpu().tolist()
    block_offsets = slot_mapping % block_size
    block_offsets_lst = block_offsets.cpu().tolist()
    for i in range(num_tokens):
-        block_idx = block_indicies_lst[i]
+        block_idx = block_indices_lst[i]
        block_offset = block_offsets_lst[i]
        cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i]
        cloned_value_cache[block_idx, :, :, block_offset] = value[i]
@@ -322,12 +322,12 @@ def test_reshape_and_cache_flash(
                        kv_dtype=kv_cache_dtype)
    # Run the reference implementation.
-    block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor")
+    block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor")
-    block_indicies_lst = block_indicies.cpu().tolist()
+    block_indices_lst = block_indices.cpu().tolist()
    block_offsets = slot_mapping % block_size
    block_offsets_lst = block_offsets.cpu().tolist()
    for i in range(num_tokens):
-        block_idx = block_indicies_lst[i]
+        block_idx = block_indices_lst[i]
        block_offset = block_offsets_lst[i]
        if kv_cache_layout == "NHD":
            cloned_key_cache[block_idx, block_offset, :, :] = key[i]

--- a/tests/kernels/attention/test_encoder_decoder_attn.py
+++ b/tests/kernels/attention/test_encoder_decoder_attn.py
@@ -46,7 +46,7 @@ CUDA_DEVICE = "cuda:0"
 MAX_DEC_SEQ_LENS = [128]
 MAX_ENC_SEQ_LENS = [128]
-# Narrow teest-cases for unsupported-scenario
+# Narrow test-cases for unsupported-scenario
 # tests
 HEAD_SIZES_FOR_UNSUPP = [HEAD_SIZES[0]]

--- a/tests/kernels/core/test_rotary_embedding.py
+++ b/tests/kernels/core/test_rotary_embedding.py
@@ -39,10 +39,10 @@ def rotary_embedding_opcheck(rot,
 @pytest.mark.parametrize("head_size", [32, 108])
 @pytest.mark.parametrize("seq_len", [11, 1024])
 @pytest.mark.parametrize("use_key", [True, False])
-@pytest.mark.parametrize("head_stride_is_contingous", [True, False])
+@pytest.mark.parametrize("head_stride_is_contiguous", [True, False])
 def test_rotary_embedding_opcheck(dist_init, device, max_position,
                                  is_neox_style, rotary_dim, head_size,
-                                  seq_len, use_key, head_stride_is_contingous):
+                                  seq_len, use_key, head_stride_is_contiguous):
    batch_size = 1
    base = 10000
    num_heads = 7
@@ -52,7 +52,7 @@ def test_rotary_embedding_opcheck(dist_init, device, max_position,
    positions = torch.randint(0,
                              max_position, (batch_size, seq_len),
                              device=device)
-    head_stride = head_size + (64 if head_stride_is_contingous else 0)
+    head_stride = head_size + (64 if head_stride_is_contiguous else 0)
    query = torch.randn(batch_size,
                        seq_len,
@@ -72,7 +72,7 @@ def test_rotary_embedding_opcheck(dist_init, device, max_position,
    # if we have a contiguous head stride, test the alternate
    # [..., num_heads * head_dim] shape/layout
-    if head_stride_is_contingous:
+    if head_stride_is_contiguous:
        rotary_embedding_opcheck(
            rot, positions, query.flatten(start_dim=-2),
            key.flatten(start_dim=-2) if use_key else None)
--- a/tests/kernels/mamba/test_mamba_ssm_ssd.py
+++ b/tests/kernels/mamba/test_mamba_ssm_ssd.py
@@ -107,15 +107,15 @@ def generate_random_inputs(batch_size,
    return A, dt, X, B, C
-def generate_continous_batched_examples(example_lens_by_batch,
+def generate_continuous_batched_examples(example_lens_by_batch,
-                                        num_examples,
+                                         num_examples,
-                                        full_length,
+                                         full_length,
-                                        last_taken,
+                                         last_taken,
-                                        exhausted,
+                                         exhausted,
-                                        n_heads,
+                                         n_heads,
-                                        d_head,
+                                         d_head,
-                                        itype,
+                                         itype,
-                                        device='cuda'):
+                                         device='cuda'):
    # this function generates a random examples of certain length
    # and then cut according to "example_lens_by_batch" and feed
@@ -269,11 +269,10 @@ def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases,
    exhausted: dict = {}  # map: eg -> boolean indicating example is exhausted
    states = None
-    for Y_min, cu_seqlens, seq_idx, (A, dt, X, B,
+    for Y_min, cu_seqlens, seq_idx, (
-                                     C) in generate_continous_batched_examples(
+            A, dt, X, B, C) in generate_continuous_batched_examples(
-                                         cases, num_examples, seqlen,
+                cases, num_examples, seqlen, last_taken, exhausted, n_heads,
-                                         last_taken, exhausted, n_heads,
+                d_head, itype):
-                                         d_head, itype):
        chunk_indices, chunk_offsets = \
            _query_start_loc_to_chunk_indices_offsets(