/*************************************************************************
 * Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 *
 * See LICENSE for license information.
 ************************************************************************/

#ifndef TRANSFORMER_ENGINE_PYTORCH_CSRC_COMMON_H_
#define TRANSFORMER_ENGINE_PYTORCH_CSRC_COMMON_H_

#include <cstring>
#include <iomanip>
#include <iostream>
#include <memory>
#include <random>
#include <stdexcept>
#include <vector>

#include <ATen/ATen.h>
#include <ATen/Dispatch.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAGeneratorImpl.h>
#include <ATen/cuda/CUDAGraphsUtils.cuh>
#include <ATen/cudnn/Handle.h>
#include <ATen/native/DispatchStub.h>
#include <c10/macros/Macros.h>
#include <cublasLt.h>
#include <cuda.h>
#include <cuda_bf16.h>
#include <cuda_runtime.h>
#include <cudnn.h>
#include <torch/extension.h>
#include <torch/torch.h>

#include "common/util/logging.h"
#include <transformer_engine/activation.h>
#include <transformer_engine/cast.h>
#include <transformer_engine/fused_attn.h>
#include <transformer_engine/fused_rope.h>
#include <transformer_engine/gemm.h>
#include <transformer_engine/layer_norm.h>
#include <transformer_engine/recipe.h>
#include <transformer_engine/rmsnorm.h>
#include <transformer_engine/softmax.h>
#include <transformer_engine/transformer_engine.h>
#include <transformer_engine/transpose.h>
#include <transformer_engine/cast_transpose_noop.h>

namespace transformer_engine {

// Each tensor here is shape (N, ) holding all scaling
// data for a single FP8 block, e.g. LayerNormLinear.
// Entries are addressed with the FP8FwdTensors / FP8BwdTensors
// enums declared below.
class FP8TensorMeta {
 public:
    at::Tensor scale;         // per-tensor FP8 scale factors
    at::Tensor scale_inv;     // presumably 1/scale for dequantization — confirm with callers
    at::Tensor amax_history;  // recorded amax values, indexed like `scale`
};

// Used as named indices on the `scale`, `scale_inv`,
// and `amax` tensors in the `FP8TensorMeta` class.
// Forward-pass slots: one (input, weight, output) triple per GEMM,
// laid out contiguously so GEMMk_INPUT == 3*(k-1).
enum FP8FwdTensors {
    GEMM1_INPUT  = 0,
    GEMM1_WEIGHT = 1,
    GEMM1_OUTPUT = 2,
    GEMM2_INPUT  = 3,
    GEMM2_WEIGHT = 4,
    GEMM2_OUTPUT = 5,
    GEMM3_INPUT  = 6,
    GEMM3_WEIGHT = 7,
    GEMM3_OUTPUT = 8
};

// Used as named indices on the `scale`, `scale_inv`,
// and `amax` tensors in the `FP8TensorMeta` class.
// Backward-pass slots: one (grad_output, grad_input) pair per GEMM.
enum FP8BwdTensors {
    GRAD_OUTPUT1 = 0,
    GRAD_INPUT1 = 1,
    GRAD_OUTPUT2 = 2,
    GRAD_INPUT2 = 3,
    GRAD_OUTPUT3 = 4,
    GRAD_INPUT3 = 5
};


}  // namespace transformer_engine


transformer_engine::DType getTransformerEngineFP8Type(bool e4m3_if_hybrid,
                                                      const std::string &fp8_recipe);


// Map a Transformer Engine dtype to the corresponding ATen scalar type.
// kByte and both FP8 formats (E4M3, E5M2) all map to at::kByte: the
// tensors are held as raw bytes on the PyTorch side, so the mapping is
// not invertible for those cases.
// Calls NVTE_ERROR for any dtype without an ATen equivalent.
inline at::ScalarType GetATenDType(transformer_engine::DType t) {
    switch (t) {
        case transformer_engine::DType::kInt32:
            return torch::kInt32;
        case transformer_engine::DType::kInt64:
            return torch::kInt64;
        case transformer_engine::DType::kFloat32:
            return at::kFloat;
        case transformer_engine::DType::kFloat16:
            return at::kHalf;
        case transformer_engine::DType::kBFloat16:
            return at::kBFloat16;
        case transformer_engine::DType::kByte:
        case transformer_engine::DType::kFloat8E4M3:
        case transformer_engine::DType::kFloat8E5M2:
            return at::kByte;
        default:
            NVTE_ERROR("Invalid type");
    }
}


// Map an ATen scalar type to the corresponding Transformer Engine dtype.
// Note both at::kBool and torch::kByte map to kByte (1-byte raw storage),
// so this is not the exact inverse of GetATenDType.
// Calls NVTE_ERROR for any scalar type without a TE equivalent.
inline transformer_engine::DType GetTransformerEngineDType(at::ScalarType t) {
    switch (t) {
        case at::kHalf:
            return transformer_engine::DType::kFloat16;
        case at::kFloat:
            return transformer_engine::DType::kFloat32;
        case at::kBFloat16:
            return transformer_engine::DType::kBFloat16;
        case at::kBool:
            return transformer_engine::DType::kByte;
        case torch::kByte:
            return transformer_engine::DType::kByte;
        case torch::kInt32:
            return transformer_engine::DType::kInt32;
        case torch::kInt64:
            return transformer_engine::DType::kInt64;
        default:
            NVTE_ERROR("Invalid type");
    }
}


// Reinterpret a raw integer as a Transformer Engine dtype (presumably the
// enum value forwarded from the Python side — confirm with callers).
// No range validation is performed; the caller must supply a value that
// corresponds to a valid DType enumerator.
inline transformer_engine::DType GetTransformerEngineDType(int DType_value) {
    return static_cast<transformer_engine::DType>(DType_value);
}

transformer_engine::TensorWrapper makeTransformerEngineTensor(void* data_ptr,
                                                              const std::vector<size_t>& shape,
                                                              const transformer_engine::DType type
);

transformer_engine::TensorWrapper makeTransformerEngineTensor(void* data_ptr,
                                                              const std::vector<size_t>& shape,
                                                              const transformer_engine::DType type,
                                                              void* amax_ptr,
                                                              void* scale_ptr,
                                                              void* scale_inv_ptr
);


transformer_engine::TensorWrapper makeTransformerEngineTensor(void* data_ptr,
                                                              const NVTEShape& shape,
                                                              const transformer_engine::DType type
);


transformer_engine::TensorWrapper makeTransformerEngineTensor(at::Tensor tensor);

transformer_engine::TensorWrapper makeTransformerEngineTensor(at::Tensor tensor,
                                                              at::Tensor amax,
                                                              const at::Tensor scale,
                                                              at::Tensor scale_inv);


size_t product(const std::vector<size_t> &shape);

at::Tensor allocateSpace(const std::vector<size_t>& shape,
                         const transformer_engine::DType type,
                         bool init_to_zeros);

at::Tensor allocateSpace(const NVTEShape &shape,
                         const transformer_engine::DType type,
                         bool init_to_zeros = false);


at::Tensor allocateTorchTensor(int M,
                               int N,
                               transformer_engine::DType dtype
);


at::Tensor allocateTorchTensor(int M,
                               transformer_engine::DType dtype
);

void* getDataPtr(at::Tensor tensor, int offset = 0);

#endif  // TRANSFORMER_ENGINE_PYTORCH_CSRC_COMMON_H_