generate.py 22.5 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
5
6
7
8
import itertools
import math
import os
import shutil
from collections.abc import Iterable
9
10
11
from copy import deepcopy
from dataclasses import dataclass, fields
from functools import reduce
12
from typing import Optional, Union
13
14

import jinja2
15

16
17
# yapf conflicts with isort for this block
# yapf: disable
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
from vllm_cutlass_library_extension import (
    DataType,
    EpilogueScheduleTag,
    EpilogueScheduleType,
    MixedInputKernelScheduleType,
    TileSchedulerTag,
    TileSchedulerType,
    VLLMDataType,
    VLLMDataTypeNames,
    VLLMDataTypeSize,
    VLLMDataTypeTag,
    VLLMDataTypeTorchDataTypeTag,
    VLLMDataTypeVLLMScalarTypeTag,
    VLLMKernelScheduleTag,
)
33
34
35
36
37
38
39
40
41
42
43

# yapf: enable

#
#   Generator templating
#

DISPATCH_TEMPLATE = """
#include "../machete_mm_launcher.cuh"

namespace machete {
44
45
46
47
48
49
50
51

{% for impl_config in impl_configs %}
{% set type_sig = gen_type_sig(impl_config.types) -%}
{% for s in impl_config.schedules %}
extern torch::Tensor impl_{{type_sig}}_sch_{{gen_sch_sig(s)}}(MMArgs);
{%- endfor %}

torch::Tensor mm_dispatch_{{type_sig}}(MMArgs args) {
52
53
54
55
  [[maybe_unused]] auto M = args.A.size(0);
  [[maybe_unused]] auto N = args.B.size(1);
  [[maybe_unused]] auto K = args.A.size(1);
    
56
57
  if (!args.maybe_schedule) {
    {%- for cond, s in impl_config.heuristic %}
58
59
60
    {%if cond is not none%}if ({{cond}})
    {%- else %}else
    {%- endif %}
61
        return impl_{{type_sig}}_sch_{{ gen_sch_sig(s) }}(args);{% endfor %}
62
63
  }

64
65
66
67
  {%- for s in impl_config.schedules %}
  if (*args.maybe_schedule == "{{ gen_sch_sig(s) }}")
    return impl_{{type_sig}}_sch_{{ gen_sch_sig(s) }}(args);
  {%- endfor %}
68
  TORCH_CHECK_NOT_IMPLEMENTED(false, "machete_gemm(..) is not implemented for "
69
                                     "schedule = ", *args.maybe_schedule);
70
}
71
72
{%- endfor %}

73

74
static inline std::optional<at::ScalarType> maybe_scalartype(
75
    std::optional<at::Tensor> const& t) {
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
    if (!t) {
      return std::nullopt;
    } else {
      return t->scalar_type();
    };
}

torch::Tensor mm_dispatch(MMArgs args) {
  auto out_type = args.maybe_out_type.value_or(args.A.scalar_type());
  auto a_type = args.A.scalar_type();
  auto maybe_g_scales_type = maybe_scalartype(args.maybe_group_scales);
  auto maybe_g_zeros_type = maybe_scalartype(args.maybe_group_zeros);
  auto maybe_ch_scales_type = maybe_scalartype(args.maybe_channel_scales);
  auto maybe_tok_scales_type = maybe_scalartype(args.maybe_token_scales);

  {% for impl_config in impl_configs %}
  {% set t = impl_config.types -%}
  {% set type_sig = gen_type_sig(t) -%}
  if (args.b_type == {{VLLMScalarTypeTag[t.b]}}
      && a_type == {{TorchTypeTag[t.a]}}
      && out_type == {{TorchTypeTag[t.out]}}
      && {%if t.b_group_scale != void -%}
      maybe_g_scales_type == {{TorchTypeTag[t.b_group_scale]}}
      {%- else %}!maybe_g_scales_type{%endif%}
      && {%if t.b_group_zeropoint != void -%}
      maybe_g_zeros_type == {{TorchTypeTag[t.b_group_zeropoint]}}
      {%- else %}!maybe_g_zeros_type{%endif%}
      && {%if t.b_channel_scale != void -%}
      maybe_ch_scales_type == {{TorchTypeTag[t.b_channel_scale]}}
      {%- else %}!maybe_ch_scales_type{%endif%}
      && {%if t.a_token_scale != void -%}
      maybe_tok_scales_type == {{TorchTypeTag[t.a_token_scale]}}
      {%- else %}!maybe_tok_scales_type{%endif%}
  ) {
      return mm_dispatch_{{type_sig}}(args);
  }
  {%- endfor %}
  
  TORCH_CHECK_NOT_IMPLEMENTED(
    false, "machete_mm(..) is not implemented for "
    "a_type=", args.A.scalar_type(),
    ", b_type=", args.b_type.str(),
    ", out_type=", out_type,
    ", with_group_scale_type=", maybe_g_scales_type
        ? toString(*maybe_g_scales_type) : "None",
    ", with_group_zeropoint_type=", maybe_g_zeros_type
        ? toString(*maybe_g_zeros_type) : "None",
    ", with_channel_scale_type=", maybe_ch_scales_type
        ? toString(*maybe_ch_scales_type) : "None",
    ", with_token_scale_type=", maybe_tok_scales_type
        ? toString(*maybe_tok_scales_type) : "None",
    "; implemented types are: \\n",
    {%- for impl_config in impl_configs %}
    {% set t = impl_config.types -%}
    "\\t{{gen_type_option_name(t)}}\\n",
    {%- endfor %}
    "");
133
134
}

135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
std::vector<std::string> supported_schedules_dispatch(
    SupportedSchedulesArgs args) {
    auto out_type = args.maybe_out_type.value_or(args.a_type);
    
    {% for impl_config in impl_configs %}
    {% set t = impl_config.types -%}
    {% set schs = impl_config.schedules -%}
    if (args.b_type == {{VLLMScalarTypeTag[t.b]}}
        && args.a_type == {{TorchTypeTag[t.a]}}
        && out_type == {{TorchTypeTag[t.out]}}
        && {%if t.b_group_scale != void -%}
        args.maybe_group_scales_type == {{TorchTypeTag[t.b_group_scale]}}
        {%- else %}!args.maybe_group_scales_type{%endif%}
        && {%if t.b_group_zeropoint != void-%}
        args.maybe_group_zeros_type == {{TorchTypeTag[t.b_group_zeropoint]}}
        {%- else %}!args.maybe_group_zeros_type{%endif%}
    ) {
        return {
            {%- for s in impl_config.schedules %}
            "{{gen_sch_sig(s)}}"{% if not loop.last %},{% endif %}
            {%- endfor %}
        };
    }
    {%- endfor %}
    
    return {};
};

163
164
165
166
167
168
169
}; // namespace machete
"""

IMPL_TEMPLATE = """
#include "../machete_mm_launcher.cuh"

namespace machete {
170
171
172
173
    
{% for sch in unique_schedules(impl_configs) %}
{% set sch_sig = gen_sch_sig(sch) -%}
struct sch_{{sch_sig}} {
174
175
176
177
178
179
180
181
182
183
  using TileShapeNM = Shape<{{
      to_cute_constant(sch.tile_shape_mn)|join(', ')}}>;
  using ClusterShape = Shape<{{
      to_cute_constant(sch.cluster_shape_mnk)|join(', ')}}>;
  // TODO: Reimplement
  // using KernelSchedule   = {{KernelScheduleTag[sch.kernel_schedule]}};
  using EpilogueSchedule = {{EpilogueScheduleTag[sch.epilogue_schedule]}};
  using TileScheduler    = {{TileSchedulerTag[sch.tile_scheduler]}};
  using EpilogueTileType = cutlass::epilogue::collective::EpilogueTileAuto;
};
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
{% endfor %}
    
{% for impl_config in impl_configs %}
{% set t = impl_config.types -%}
{% set schs = impl_config.schedules -%}
{% set type_sig = gen_type_sig(t) -%}

template<typename Sch>
using Kernel_{{type_sig}} = MacheteKernelTemplate<
  {{DataTypeTag[t.a]}},  // ElementA
  {{DataTypeTag[t.b]}},  // ElementB
  {{DataTypeTag[t.out]}},  // ElementD
  {{DataTypeTag[t.accumulator]}}, // Accumulator
  {{DataTypeTag[t.b_group_scale]}}, // GroupScaleT
  {{DataTypeTag[t.b_group_zeropoint]}}, // GroupZeroT
  {{DataTypeTag[t.b_channel_scale]}}, // ChannelScaleT
  {{DataTypeTag[t.a_token_scale]}}, // TokenScaleT
201
  cutlass::gemm::KernelTmaWarpSpecializedCooperative,
202
203
204
205
  Sch>;

{% for sch in schs %}
{% set sch_sig = gen_sch_sig(sch) -%}
206
torch::Tensor 
207
208
impl_{{type_sig}}_sch_{{sch_sig}}(MMArgs args) {
  return run_impl<Kernel_{{type_sig}}<sch_{{sch_sig}}>>(args);
209
}
210
211
{%- endfor %}
{%- endfor %}
212
213
214
215
216
217
218
219

}; // namespace machete
"""

PREPACK_TEMPLATE = """
#include "../machete_prepack_launcher.cuh"

namespace machete {
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234

torch::Tensor prepack_B_dispatch(PrepackBArgs args) {
  auto convert_type = args.maybe_group_scales_type.value_or(args.a_type);
  {%- for t in types %}
  {% set b_type = unsigned_type_with_bitwidth(t.b_num_bits) %}
  if (args.a_type == {{TorchTypeTag[t.a]}}
      && args.b_type.size_bits() == {{t.b_num_bits}} 
      && convert_type == {{TorchTypeTag[t.convert]}}) {
    return prepack_impl<
      PrepackedLayoutBTemplate<
        {{DataTypeTag[t.a]}}, // ElementA
        {{DataTypeTag[b_type]}}, // ElementB
        {{DataTypeTag[t.convert]}}, // ElementConvert
        {{DataTypeTag[t.accumulator]}}, // Accumulator
        cutlass::layout::ColumnMajor,
235
        cutlass::gemm::KernelTmaWarpSpecializedCooperative>
236
237
238
239
240
241
242
243
244
245
    >(args.B); 
  }
  {%- endfor %}
  
  TORCH_CHECK_NOT_IMPLEMENTED(false, 
    "prepack_B_dispatch(..) is not implemented for "
    "atype = ", args.a_type,
    ", b_type = ", args.b_type.str(),
    ", with_group_scales_type= ", args.maybe_group_scales_type ? 
        toString(*args.maybe_group_scales_type) : "None");
246
}
247

248
249
250
}; // namespace machete
"""

251
TmaMI = MixedInputKernelScheduleType.TmaWarpSpecializedCooperative
252
253
254
TmaCoop = EpilogueScheduleType.TmaWarpSpecializedCooperative


255
@dataclass(frozen=True)
256
class ScheduleConfig:
257
258
    tile_shape_mn: tuple[int, int]
    cluster_shape_mnk: tuple[int, int, int]
259
260
261
262
263
    kernel_schedule: MixedInputKernelScheduleType
    epilogue_schedule: EpilogueScheduleType
    tile_scheduler: TileSchedulerType


264
@dataclass(frozen=True)
265
class TypeConfig:
266
267
268
269
270
271
272
    a: DataType
    b: Union[DataType, VLLMDataType]
    b_group_scale: DataType
    b_group_zeropoint: DataType
    b_channel_scale: DataType
    a_token_scale: DataType
    out: DataType
273
274
275
    accumulator: DataType


276
277
278
279
280
281
@dataclass(frozen=True)
class PrepackTypeConfig:
    a: DataType
    b_num_bits: int
    convert: DataType
    accumulator: DataType
282
283
284
285


@dataclass
class ImplConfig:
286
    types: TypeConfig
287
288
    schedules: list[ScheduleConfig]
    heuristic: list[tuple[Optional[str], ScheduleConfig]]
289
290


291
def generate_sch_sig(schedule_config: ScheduleConfig) -> str:
292
293
294
    tile_shape = (
        f"{schedule_config.tile_shape_mn[0]}x{schedule_config.tile_shape_mn[1]}"
    )
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
    cluster_shape = (
        f"{schedule_config.cluster_shape_mnk[0]}"
        + f"x{schedule_config.cluster_shape_mnk[1]}"
        + f"x{schedule_config.cluster_shape_mnk[2]}"
    )
    kernel_schedule = VLLMKernelScheduleTag[schedule_config.kernel_schedule].split(
        "::"
    )[-1]
    epilogue_schedule = EpilogueScheduleTag[schedule_config.epilogue_schedule].split(
        "::"
    )[-1]
    tile_scheduler = TileSchedulerTag[schedule_config.tile_scheduler].split("::")[-1]

    return (
        f"{tile_shape}_{cluster_shape}_{kernel_schedule}"
        + f"_{epilogue_schedule}_{tile_scheduler}"
    )
312
313


314
315
# mostly unique shorter sch_sig
def generate_terse_sch_sig(schedule_config: ScheduleConfig) -> str:
316
    kernel_terse_names_replace = {
317
        "KernelTmaWarpSpecializedCooperative": "TmaMI_",
318
319
320
321
        "TmaWarpSpecializedCooperative_": "TmaCoop_",
        "StreamKScheduler": "streamK",
    }

322
    sch_sig = generate_sch_sig(schedule_config)
323
    for orig, terse in kernel_terse_names_replace.items():
324
325
        sch_sig = sch_sig.replace(orig, terse)
    return sch_sig
326
327
328


# unique type_name
329
def generate_type_signature(kernel_types: TypeConfig):
330
331
332
333
334
335
336
337
    return str(
        "".join(
            [
                VLLMDataTypeNames[getattr(kernel_types, field.name)]
                for field in fields(TypeConfig)
            ]
        )
    )
338
339


340
def generate_type_option_name(kernel_types: TypeConfig):
341
342
343
344
345
346
347
    return ", ".join(
        [
            f"{field.name.replace('b_', 'with_') + '_type'}="
            + VLLMDataTypeNames[getattr(kernel_types, field.name)]
            for field in fields(TypeConfig)
        ]
    )
348
349
350
351
352
353


def is_power_of_two(n):
    return (n != 0) and (n & (n - 1) == 0)


354
def to_cute_constant(value: list[int]):
355
356
357
358
359
360
361
362
363
364
365
366
    def _to_cute_constant(value: int):
        if is_power_of_two(value):
            return f"_{value}"
        else:
            return f"Int<{value}>"

    if isinstance(value, Iterable):
        return [_to_cute_constant(value) for value in value]
    else:
        return _to_cute_constant(value)


367
def unique_schedules(impl_configs: list[ImplConfig]):
368
    # Use dict over set for deterministic ordering
369
370
371
372
373
    return list(
        {
            sch: None for impl_config in impl_configs for sch in impl_config.schedules
        }.keys()
    )
374
375
376
377
378
379
380
381
382
383
384
385


def unsigned_type_with_bitwidth(num_bits):
    return {
        4: DataType.u4,
        8: DataType.u8,
        16: DataType.u16,
        32: DataType.u32,
        64: DataType.u64,
    }[num_bits]


386
template_globals = {
387
    "void": DataType.void,
388
    "DataTypeTag": VLLMDataTypeTag,
389
390
    "VLLMScalarTypeTag": VLLMDataTypeVLLMScalarTypeTag,
    "TorchTypeTag": VLLMDataTypeTorchDataTypeTag,
391
392
393
394
    "KernelScheduleTag": VLLMKernelScheduleTag,
    "EpilogueScheduleTag": EpilogueScheduleTag,
    "TileSchedulerTag": TileSchedulerTag,
    "to_cute_constant": to_cute_constant,
395
396
397
398
    "gen_sch_sig": generate_terse_sch_sig,
    "gen_type_sig": generate_type_signature,
    "unique_schedules": unique_schedules,
    "unsigned_type_with_bitwidth": unsigned_type_with_bitwidth,
399
    "gen_type_option_name": generate_type_option_name,
400
401
402
403
404
405
406
407
408
409
410
411
412
413
}


def create_template(template_str):
    template = jinja2.Template(template_str)
    template.globals.update(template_globals)
    return template


mm_dispatch_template = create_template(DISPATCH_TEMPLATE)
mm_impl_template = create_template(IMPL_TEMPLATE)
prepack_dispatch_template = create_template(PREPACK_TEMPLATE)


414
def create_sources(impl_configs: list[ImplConfig], num_impl_files=8):
415
416
    sources = []

417
418
419
420
421
422
    sources.append(
        (
            "machete_mm_dispatch",
            mm_dispatch_template.render(impl_configs=impl_configs),
        )
    )
423

424
425
    prepack_types = []
    for impl_config in impl_configs:
426
427
428
429
430
        convert_type = (
            impl_config.types.a
            if impl_config.types.b_group_scale == DataType.void
            else impl_config.types.b_group_scale
        )
431
432
433
434
435
436
        prepack_types.append(
            PrepackTypeConfig(
                a=impl_config.types.a,
                b_num_bits=VLLMDataTypeSize[impl_config.types.b],
                convert=convert_type,
                accumulator=impl_config.types.accumulator,
437
438
            )
        )
439
440

    def prepacked_type_key(prepack_type: PrepackTypeConfig):
441
        # For now, we can just use the first accumulator type seen since
442
443
444
445
446
447
448
449
450
451
452
453
        # the tensor core shapes/layouts don't vary based on accumulator
        # type so we can generate less code this way
        return (prepack_type.a, prepack_type.b_num_bits, prepack_type.convert)

    unique_prepack_types = []
    prepack_types_seen = set()
    for prepack_type in prepack_types:
        key = prepacked_type_key(prepack_type)
        if key not in prepack_types_seen:
            unique_prepack_types.append(prepack_type)
            prepack_types_seen.add(key)

454
455
456
457
458
459
460
461
    sources.append(
        (
            "machete_prepack",
            prepack_dispatch_template.render(
                types=unique_prepack_types,
            ),
        )
    )
462

463
464
465
466
    # Split up impls across files
    num_impls = reduce(lambda x, y: x + len(y.schedules), impl_configs, 0)
    num_impls_per_file = math.ceil(num_impls / num_impl_files)

467
    files_impls: list[list[ImplConfig]] = [[]]
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491

    curr_num_impls_assigned = 0
    curr_impl_in_file = 0
    curr_impl_configs = deepcopy(list(reversed(impl_configs)))

    while curr_num_impls_assigned < num_impls:
        room_left_in_file = num_impls_per_file - curr_impl_in_file
        if room_left_in_file == 0:
            files_impls.append([])
            room_left_in_file = num_impls_per_file
            curr_impl_in_file = 0

        curr_ic = curr_impl_configs[-1]
        if len(curr_ic.schedules) >= room_left_in_file:
            # Break apart the current impl config
            tmp_ic = deepcopy(curr_ic)
            tmp_ic.schedules = curr_ic.schedules[:room_left_in_file]
            curr_ic.schedules = curr_ic.schedules[room_left_in_file:]
            files_impls[-1].append(tmp_ic)
        else:
            files_impls[-1].append(curr_ic)
            curr_impl_configs.pop()
        curr_num_impls_assigned += len(files_impls[-1][-1].schedules)
        curr_impl_in_file += len(files_impls[-1][-1].schedules)
492

493
    for part, file_impls in enumerate(files_impls):
494
495
496
497
498
499
        sources.append(
            (
                f"machete_mm_impl_part{part + 1}",
                mm_impl_template.render(impl_configs=file_impls),
            )
        )
500

501
502
503
504
505
506
507
508
    return sources


def generate():
    # See csrc/quantization/machete/Readme.md, the Codegeneration for more info
    # about how this works
    SCRIPT_DIR = os.path.dirname(__file__)

509
    sch_common_params = dict(
510
511
512
513
        kernel_schedule=TmaMI,
        epilogue_schedule=TmaCoop,
        tile_scheduler=TileSchedulerType.StreamK,
    )
514

515
516
    # Stored as "condition": ((tile_shape_mn), (cluster_shape_mnk))
    default_tile_heuristic_config = {
517
        #### M = 257+
518
519
        "M > 256 && K <= 16384 && N <= 4096": ((128, 128), (2, 1, 1)),
        "M > 256": ((128, 256), (2, 1, 1)),
520
        #### M = 129-256
521
522
523
        "M > 128 && K <= 4096 && N <= 4096": ((128, 64), (2, 1, 1)),
        "M > 128 && K <= 8192 && N <= 8192": ((128, 128), (2, 1, 1)),
        "M > 128": ((128, 256), (2, 1, 1)),
524
        #### M = 65-128
525
526
527
528
        "M > 64 && K <= 4069 && N <= 4069": ((128, 32), (2, 1, 1)),
        "M > 64 && K <= 4069 && N <= 8192": ((128, 64), (2, 1, 1)),
        "M > 64 && K >= 8192 && N >= 12288": ((256, 128), (2, 1, 1)),
        "M > 64": ((128, 128), (2, 1, 1)),
529
        #### M = 33-64
530
531
532
        "M > 32 && K <= 6144 && N <= 6144": ((128, 16), (1, 1, 1)),
        "M > 32 && K >= 16384 && N >= 12288": ((256, 64), (2, 1, 1)),
        "M > 32": ((128, 64), (2, 1, 1)),
533
        #### M = 17-32
534
535
        "M > 16 && K <= 12288 && N <= 8192": ((128, 32), (2, 1, 1)),
        "M > 16": ((256, 32), (2, 1, 1)),
536
        #### M = 1-16
537
538
539
540
541
542
543
        "N >= 26624": ((256, 16), (1, 1, 1)),
        None: ((128, 16), (1, 1, 1)),
    }

    # For now we use the same heuristic for all types
    # Heuristic is currently tuned for H100s
    default_heuristic = [
544
        (cond, ScheduleConfig(*tile_config, **sch_common_params))  # type: ignore
545
        for cond, tile_config in default_tile_heuristic_config.items()
546
547
    ]

548
    def get_unique_schedules(heuristic: dict[str, ScheduleConfig]):
549
550
551
552
553
554
555
556
        # Do not use schedules = list(set(...)) because we need to make sure
        # the output list is deterministic; otherwise the generated kernel file
        # will be non-deterministic and causes ccache miss.
        schedules = []
        for _, schedule_config in heuristic:
            if schedule_config not in schedules:
                schedules.append(schedule_config)
        return schedules
557

558
559
560
    impl_configs = []

    GPTQ_kernel_type_configs = list(
561
        TypeConfig(
562
563
564
565
566
567
568
            a=a,
            b=b,
            b_group_scale=a,
            b_group_zeropoint=DataType.void,
            b_channel_scale=DataType.void,
            a_token_scale=DataType.void,
            out=a,
569
            accumulator=DataType.f32,
570
571
572
573
        )
        for b in (VLLMDataType.u4b8, VLLMDataType.u8b128)
        for a in (DataType.f16, DataType.bf16)
    )
574
575

    impl_configs += [
576
        ImplConfig(x[0], x[1], x[2])
577
578
579
580
581
        for x in zip(
            GPTQ_kernel_type_configs,
            itertools.repeat(get_unique_schedules(default_heuristic)),
            itertools.repeat(default_heuristic),
        )
582
583
584
    ]

    AWQ_kernel_type_configs = list(
585
        TypeConfig(
586
587
588
589
590
591
592
            a=a,
            b=b,
            b_group_scale=a,
            b_group_zeropoint=a,
            b_channel_scale=DataType.void,
            a_token_scale=DataType.void,
            out=a,
593
            accumulator=DataType.f32,
594
595
596
597
        )
        for b in (DataType.u4, DataType.u8)
        for a in (DataType.f16, DataType.bf16)
    )
598
599
600

    impl_configs += [
        ImplConfig(x[0], x[1], x[2])
601
602
603
604
605
        for x in zip(
            AWQ_kernel_type_configs,
            itertools.repeat(get_unique_schedules(default_heuristic)),
            itertools.repeat(default_heuristic),
        )
606
    ]
607

608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
    # TODO: Support W4A8 when ready
    # # Stored as "condition": ((tile_shape_mn), (cluster_shape_mnk))
    # # TODO (LucasWilkinson): Further tuning required
    # qqq_tile_heuristic_config = {
    #     #### M = 257+
    #     # ((128, 256), (2, 1, 1)) Broken for QQQ types
    #     # TODO (LucasWilkinson): Investigate further
    #     # "M > 256 && K <= 16384 && N <= 4096": ((128, 128), (2, 1, 1)),
    #     # "M > 256": ((128, 256), (2, 1, 1)),
    #     "M > 256": ((128, 128), (2, 1, 1)),
    #     #### M = 129-256
    #     "M > 128 && K <= 4096 && N <= 4096": ((128, 64), (2, 1, 1)),
    #     "M > 128 && K <= 8192 && N <= 8192": ((128, 128), (2, 1, 1)),
    #     # ((128, 256), (2, 1, 1)) Broken for QQQ types
    #     # TODO (LucasWilkinson): Investigate further
    #     # "M > 128": ((128, 256), (2, 1, 1)),
    #     "M > 128": ((128, 128), (2, 1, 1)),
    #     #### M = 65-128
    #     "M > 64 && K <= 4069 && N <= 4069": ((128, 32), (2, 1, 1)),
    #     "M > 64 && K <= 4069 && N <= 8192": ((128, 64), (2, 1, 1)),
    #     "M > 64 && K >= 8192 && N >= 12288": ((256, 128), (2, 1, 1)),
    #     "M > 64": ((128, 128), (2, 1, 1)),
    #     #### M = 33-64
    #     "M > 32 && K <= 6144 && N <= 6144": ((128, 16), (1, 1, 1)),
    #     # Broken for QQQ types
    #     # TODO (LucasWilkinson): Investigate further
    #     #"M > 32 && K >= 16384 && N >= 12288": ((256, 64), (2, 1, 1)),
    #     "M > 32": ((128, 64), (2, 1, 1)),
    #     #### M = 17-32
    #     "M > 16 && K <= 12288 && N <= 8192": ((128, 32), (2, 1, 1)),
    #     "M > 16": ((256, 32), (2, 1, 1)),
    #     #### M = 1-16
    #     "N >= 26624": ((256, 16), (1, 1, 1)),
    #     None: ((128, 16), (1, 1, 1)),
    # }

    # # For now we use the same heuristic for all types
    # # Heuristic is currently tuned for H100s
    # qqq_heuristic = [
    #     (cond, ScheduleConfig(*tile_config,
    #                           **sch_common_params))  # type: ignore
    #     for cond, tile_config in qqq_tile_heuristic_config.items()
    # ]

    # QQQ_kernel_types = [
    #     *(TypeConfig(
    #         a=DataType.s8,
    #         b=VLLMDataType.u4b8,
    #         b_group_scale=b_group_scale,
    #         b_group_zeropoint=DataType.void,
    #         b_channel_scale=DataType.f32,
    #         a_token_scale=DataType.f32,
    #         out=DataType.f16,
    #         accumulator=DataType.s32,
    #     ) for b_group_scale in (DataType.f16, DataType.void)),
    #     *(TypeConfig(
    #         a=DataType.e4m3,
    #         b=VLLMDataType.u4b8,
    #         b_group_scale=b_group_scale,
    #         b_group_zeropoint=DataType.void,
    #         b_channel_scale=DataType.f32,
    #         a_token_scale=DataType.f32,
    #         out=DataType.f16,
    #         accumulator=DataType.f32,
    #     ) for b_group_scale in (DataType.f16, DataType.void)),
    # ]

    # impl_configs += [
    #     ImplConfig(x[0], x[1], x[2])
    #     for x in zip(QQQ_kernel_types,
    #                  itertools.repeat(get_unique_schedules(qqq_heuristic)),
    #                  itertools.repeat(qqq_heuristic))
    # ]
681
682
683
684
685
686
687
688
689
690
691

    output_dir = os.path.join(SCRIPT_DIR, "generated")

    # Delete the "generated" directory if it exists
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)

    # Create the "generated" directory
    os.makedirs(output_dir)

    # Render each group of configurations into separate files
692
693
694
695
696
    for filename, code in create_sources(impl_configs):
        filepath = os.path.join(output_dir, f"{filename}.cu")
        with open(filepath, "w") as output_file:
            output_file.write(code)
        print(f"Rendered template to {filepath}")
697
698
699
700


if __name__ == "__main__":
    generate()