scaled_mm_entry.cu 2.94 KB
Newer Older
1
2
#include <cudaTypedefs.h>

3
#include <c10/cuda/CUDAGuard.h>
4
#include <torch/all.h>
5

6
7
8
9
// Forward declaration; the definition is not part of this file.
// cutlass_scaled_mm() dispatches here when the device's compute capability
// (major * 10 + minor) is below 80, after checking that it is at least 75.
// Arguments are forwarded unchanged from cutlass_scaled_mm().
void cutlass_scaled_mm_sm75(torch::Tensor& c, torch::Tensor const& a,
                            torch::Tensor const& b,
                            torch::Tensor const& a_scales,
                            torch::Tensor const& b_scales);
10

11
12
13
14
// Forward declaration; the definition is not part of this file.
// cutlass_scaled_mm() dispatches here when the compute capability is at least
// 80 and no more specific branch applies (i.e. not exactly 89 and below 90).
// It is also the fallback for capability >= 90 when this file is compiled
// without CUDA_VERSION >= 12000.
// Arguments are forwarded unchanged from cutlass_scaled_mm().
void cutlass_scaled_mm_sm80(torch::Tensor& c, torch::Tensor const& a,
                            torch::Tensor const& b,
                            torch::Tensor const& a_scales,
                            torch::Tensor const& b_scales);
15

16
17
18
19
// Forward declaration; the definition is not part of this file.
// cutlass_scaled_mm() dispatches here only when the compute capability
// (major * 10 + minor) is exactly 89.
// Arguments are forwarded unchanged from cutlass_scaled_mm().
void cutlass_scaled_mm_sm89(torch::Tensor& c, torch::Tensor const& a,
                            torch::Tensor const& b,
                            torch::Tensor const& a_scales,
                            torch::Tensor const& b_scales);
20

21
#if defined CUDA_VERSION && CUDA_VERSION >= 12000
22
23
24
25
// Forward declaration; the definition is not part of this file.
// Only declared when CUDA_VERSION >= 12000 (see the enclosing #if).
// cutlass_scaled_mm() dispatches here when the compute capability
// (major * 10 + minor) is at least 90 and the same CUDA_VERSION guard holds.
// Arguments are forwarded unchanged from cutlass_scaled_mm().
void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
                            torch::Tensor const& b,
                            torch::Tensor const& a_scales,
                            torch::Tensor const& b_scales);
26
#endif
27

28
29
30
// Validates the operands of a scaled matrix multiply and forwards them to the
// architecture-specific wrapper (sm75 / sm80 / sm89 / sm90) that matches the
// compute capability of the GPU holding `a`.
//
// Parameters:
//   c         Output matrix (non-const; passed through to the wrapper).
//             Must be 2-D with shape [a.size(0), b.size(1)] and unit stride
//             along dim 1.
//   a         Left operand, 2-D, unit stride along dim 1 (row-major).
//   b         Right operand, 2-D, unit stride along dim 0 (column-major);
//             b.size(0) must equal a.size(1).
//   a_scales  Contiguous; either 1 element or a.size(0) elements.
//   b_scales  Contiguous; either 1 element or b.size(1) elements.
//
// Throws (via TORCH_CHECK) if any precondition above is violated, if `a` is
// not a CUDA tensor, if the device attributes cannot be queried, or if the
// device's compute capability is below 7.5.
void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
                       torch::Tensor const& b, torch::Tensor const& a_scales,
                       torch::Tensor const& b_scales) {
  // Checks for conformality
  TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2,
              "cutlass_scaled_mm: a, b and c must all be 2-D");
  TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) &&
                  b.size(1) == c.size(1),
              "cutlass_scaled_mm: shapes are not conformable for c = a @ b");
  TORCH_CHECK(a_scales.numel() == 1 || a_scales.numel() == a.size(0),
              "cutlass_scaled_mm: a_scales must have 1 or a.size(0) elements");
  TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == b.size(1),
              "cutlass_scaled_mm: b_scales must have 1 or b.size(1) elements");

  // Check for strides and alignment
  TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1,
              "cutlass_scaled_mm: a and c must be row-major");
  TORCH_CHECK(b.stride(0) == 1, "cutlass_scaled_mm: b must be column-major");
  // NOTE(review): tensor strides are counted in elements, so "% 16" is a
  // 16-byte guarantee only for 1-byte element types and is stricter for wider
  // ones -- confirm this matches what the kernels actually require.
  TORCH_CHECK(c.stride(0) % 16 == 0 && b.stride(1) % 16 == 0,
              "cutlass_scaled_mm: c.stride(0) and b.stride(1) must be "
              "multiples of 16");
  TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous(),
              "cutlass_scaled_mm: a_scales and b_scales must be contiguous");

  // The kernel family is chosen from the device that actually holds `a`, so
  // `a` has to live on a CUDA device.
  TORCH_CHECK(a.is_cuda(), "cutlass_scaled_mm: a must be a CUDA tensor");

  at::cuda::OptionalCUDAGuard const device_guard(device_of(a));

  // Query the compute capability of a's device (not a hard-coded ordinal 0,
  // which is wrong on hosts that mix GPU architectures) and surface any
  // runtime error instead of dispatching on uninitialized values.
  int const device_index = static_cast<int>(a.get_device());
  int32_t major_capability = 0;
  int32_t minor_capability = 0;
  cudaError_t attr_status = cudaDeviceGetAttribute(
      &major_capability, cudaDevAttrComputeCapabilityMajor, device_index);
  TORCH_CHECK(attr_status == cudaSuccess,
              "cutlass_scaled_mm: failed to query compute capability major: ",
              cudaGetErrorString(attr_status));
  attr_status = cudaDeviceGetAttribute(
      &minor_capability, cudaDevAttrComputeCapabilityMinor, device_index);
  TORCH_CHECK(attr_status == cudaSuccess,
              "cutlass_scaled_mm: failed to query compute capability minor: ",
              cudaGetErrorString(attr_status));
  int32_t const version_num = major_capability * 10 + minor_capability;

  if (version_num >= 90) {
    // Hopper

    // Guard against compilation issues for sm90 kernels
#if defined CUDA_VERSION && CUDA_VERSION >= 12000
    cutlass_scaled_mm_sm90(c, a, b, a_scales, b_scales);
#else
    cutlass_scaled_mm_sm80(c, a, b, a_scales, b_scales);
#endif
  } else if (version_num == 89) {
    // Ada Lovelace
    cutlass_scaled_mm_sm89(c, a, b, a_scales, b_scales);
  } else if (version_num >= 80) {
    // Ampere
    cutlass_scaled_mm_sm80(c, a, b, a_scales, b_scales);
  } else {
    // Turing
    TORCH_CHECK(version_num >= 75,
                "cutlass_scaled_mm: requires compute capability >= 7.5, got ",
                major_capability, ".", minor_capability);
    cutlass_scaled_mm_sm75(c, a, b, a_scales, b_scales);
  }
}