common.hpp 3.96 KB
Newer Older
1
2
3
4
5
#pragma once

#include "cutlass/cutlass.h"
#include <climits>
#include "cuda_runtime.h"
6
7
#include <cstdio>
#include <cstdlib>
8
9
10
11
12
13
14
15
16
17
18
19
20
21

/**
 * Helper function for checking CUTLASS errors
 */
#define CUTLASS_CHECK(status)                       \
  {                                                 \
    cutlass::Status error = status;                 \
    TORCH_CHECK(error == cutlass::Status::kSuccess, \
                cutlassGetStatusString(error));     \
  }

inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) {
  int max_shared_mem_per_block_opt_in = 0;
  cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in,
22
                         cudaDevAttrMaxSharedMemoryPerBlockOptin, device);
23
24
25
26
  return max_shared_mem_per_block_opt_in;
}

int32_t get_sm_version_num();
27
28
29
30
31
32
33
34

/**
 * A wrapper for a kernel that is used to guard against compilation on
 * architectures that will never use the kernel. The purpose of this is to
 * reduce the size of the compiled binary.
 * __CUDA_ARCH__ is not defined in host code, so this lets us smuggle the ifdef
 * into code that will be executed on the device where it is defined.
 */
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80

template <typename Kernel>
struct enable_sm75_to_sm80 : Kernel {
  template <typename... Args>
  CUTLASS_DEVICE static void invoke(Args&&... args) {
#if defined __CUDA_ARCH__
  #if __CUDA_ARCH__ >= 750 && __CUDA_ARCH__ < 800
    Kernel::invoke(std::forward<Args>(args)...);
  #else
    printf("This kernel only supports sm[75, 80).\n");
    asm("trap;");
  #endif
#endif
  }
};

template <typename Kernel>
struct enable_sm80_to_sm89 : Kernel {
  template <typename... Args>
  CUTLASS_DEVICE static void invoke(Args&&... args) {
#if defined __CUDA_ARCH__
  #if __CUDA_ARCH__ >= 800 && __CUDA_ARCH__ < 890
    Kernel::invoke(std::forward<Args>(args)...);
  #else
    printf("This kernel only supports sm[80, 89).\n");
    asm("trap;");
  #endif
#endif
  }
};

template <typename Kernel>
struct enable_sm89_to_sm90 : Kernel {
  template <typename... Args>
  CUTLASS_DEVICE static void invoke(Args&&... args) {
#if defined __CUDA_ARCH__
  #if __CUDA_ARCH__ >= 890 && __CUDA_ARCH__ < 900
    Kernel::invoke(std::forward<Args>(args)...);
  #else
    printf("This kernel only supports sm[89, 90).\n");
    asm("trap;");
  #endif
#endif
  }
};

81
82
83
84
template <typename Kernel>
struct enable_sm90_or_later : Kernel {
  template <typename... Args>
  CUTLASS_DEVICE void operator()(Args&&... args) {
85
86
#if defined __CUDA_ARCH__
  #if __CUDA_ARCH__ >= 900
87
    Kernel::operator()(std::forward<Args>(args)...);
88
89
90
91
  #else
    printf("This kernel only supports sm >= 90.\n");
    asm("trap;");
  #endif
92
93
#endif
  }
94
95
96
97
98
99
};

template <typename Kernel>
struct enable_sm90_only : Kernel {
  template <typename... Args>
  CUTLASS_DEVICE void operator()(Args&&... args) {
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#if defined __CUDA_ARCH__
  #if __CUDA_ARCH__ == 900
    Kernel::operator()(std::forward<Args>(args)...);
  #else
    printf("This kernel only supports sm90.\n");
    asm("trap;");
  #endif
#endif
  }
};

template <typename Kernel>
struct enable_sm100f_only : Kernel {
  template <typename... Args>
  CUTLASS_DEVICE void operator()(Args&&... args) {
#if defined __CUDA_ARCH__
  #if __CUDA_ARCH__ == 1000 || __CUDA_ARCH__ == 1030
117
    Kernel::operator()(std::forward<Args>(args)...);
118
119
120
121
  #else
    printf("This kernel only supports sm100f.\n");
    asm("trap;");
  #endif
122
123
124
#endif
  }
};
125
126

template <typename Kernel>
127
struct enable_sm100a_only : Kernel {
128
129
  template <typename... Args>
  CUTLASS_DEVICE void operator()(Args&&... args) {
130
131
#if defined __CUDA_ARCH__
  #if __CUDA_ARCH__ == 1000
132
    Kernel::operator()(std::forward<Args>(args)...);
133
134
135
136
  #else
    printf("This kernel only supports sm100a.\n");
    asm("trap;");
  #endif
137
138
139
#endif
  }
};
140
141
142
143
144

template <typename Kernel>
struct enable_sm120_only : Kernel {
  template <typename... Args>
  CUTLASS_DEVICE void operator()(Args&&... args) {
145
146
#if defined __CUDA_ARCH__
  #if __CUDA_ARCH__ == 1200
147
    Kernel::operator()(std::forward<Args>(args)...);
148
149
150
151
  #else
    printf("This kernel only supports sm120.\n");
    asm("trap;");
  #endif
152
153
154
#endif
  }
};