"...resnet50_tensorflow.git" did not exist on "1408d2c6197f968c95bafbaa14b721061bade1b7"
runtime.cc 7.4 KB
Newer Older
1
2
3
4
5
6
7
8
9
/*!
 * \file tl/runtime/runtime.cc
 * \brief Runtime functions.
 *
 */

#include "runtime.h"

#include "../target/cuda.h"
10
11
#include <tvm/ffi/function.h>
#include <tvm/node/node.h>
12
13
14
15

namespace tvm {
namespace tl {

16
#if (CUDA_MAJOR_VERSION >= 12)
17
/*!
 * \brief Format the first \p n elements of an array as "[a, b, c]".
 *
 * Each element is written with operator<< and consecutive elements are
 * separated by ", ". When \p n is 0 the result is "[]" and \p ptr is not
 * dereferenced.
 *
 * \param ptr Pointer to the first element to print.
 * \param n Number of elements to print.
 * \return The bracketed, comma-separated representation.
 */
template <typename T> static std::string ArrayToStr(const T *ptr, size_t n) {
  std::stringstream ss;
  ss << "[";
  for (size_t i = 0; i < n; i++) {
    // Separator goes before every element except the first.
    if (i > 0)
      ss << ", ";
    ss << ptr[i];
  }
  ss << "]";
  return ss.str();
}

struct TensorMapArgs {
30
  CUtensorMap *map;
31
32
  CUtensorMapDataType type;
  cuuint32_t tensorRank;
33
  void *globalAddress;
34
35
36
37
38
39
40
  cuuint64_t globalDim[5], globalStride[5];
  cuuint32_t boxDim[5], elementStrides[5];
  CUtensorMapInterleave interleave;
  CUtensorMapSwizzle swizzle;
  CUtensorMapL2promotion l2Promotion;
  CUtensorMapFloatOOBfill oobFill;

41
  static TensorMapArgs Extract(PackedArgs args) {
42
43
    TensorMapArgs T;
    int idx = 0;
44
45
46
47
48
    ICHECK(args.size() >= 8);
    T.map = reinterpret_cast<CUtensorMap *>(args[idx++].cast<void *>());
    T.type = static_cast<CUtensorMapDataType>(args[idx++].cast<int64_t>());
    T.tensorRank = static_cast<cuuint32_t>(args[idx++].cast<int64_t>());
    T.globalAddress = args[idx++].cast<void *>();
49
    ICHECK(T.tensorRank >= 1 && T.tensorRank <= 5);
50
    ICHECK(args.size() == static_cast<int>(8 + T.tensorRank * 4));
51
    for (size_t i = 0; i < T.tensorRank; i++) {
52
      T.globalDim[i] = args[idx++].cast<cuuint64_t>();
53
54
    }
    for (size_t i = 0; i < T.tensorRank; i++) {
55
      T.globalStride[i] = args[idx++].cast<cuuint64_t>();
56
57
    }
    for (size_t i = 0; i < T.tensorRank; i++) {
58
      T.boxDim[i] = args[idx++].cast<cuuint64_t>();
59
60
    }
    for (size_t i = 0; i < T.tensorRank; i++) {
61
      T.elementStrides[i] = args[idx++].cast<cuuint64_t>();
62
    }
63
    T.interleave =
64
65
        static_cast<CUtensorMapInterleave>(args[idx++].cast<int64_t>());
    T.swizzle = static_cast<CUtensorMapSwizzle>(args[idx++].cast<int64_t>());
66
    T.l2Promotion =
67
        static_cast<CUtensorMapL2promotion>(args[idx++].cast<int64_t>());
68
    T.oobFill =
69
        static_cast<CUtensorMapFloatOOBfill>(args[idx++].cast<int64_t>());
70
71
72
73
74
    return T;
  }

  std::string ToDebugString() {
    std::stringstream ss;
75
76
77
78
79
80
81
82
83
84
85
86
    ss << "TMA Desc Addr:   " << map << '\n'
       << "format         " << type << '\n'
       << "dim            " << tensorRank << '\n'
       << "gmem_address   " << globalAddress << '\n'
       << "globalDim      " << ArrayToStr(globalDim, tensorRank) << '\n'
       << "globalStrides  " << ArrayToStr(globalStride, tensorRank) << '\n'
       << "boxDim         " << ArrayToStr(boxDim, tensorRank) << '\n'
       << "elementStrides " << ArrayToStr(elementStrides, tensorRank) << '\n'
       << "interleave     " << interleave << '\n'
       << "swizzle        " << swizzle << '\n'
       << "l2Promotion    " << l2Promotion << '\n'
       << "oobFill        " << oobFill << '\n';
87
88
89
90
91
    return ss.str();
  }
};

// set device api
92
93
TVM_FFI_STATIC_INIT_BLOCK({
  namespace refl = tvm::ffi::reflection;
94
95
96
97
98
99
100
101
102
103
104
105
106
  refl::GlobalDef().def_packed("tvm_tensormap_create_tiled", [](PackedArgs args,
                                                                Any *ret) {
    TensorMapArgs T = TensorMapArgs::Extract(args);
    CUresult result = cuTensorMapEncodeTiled(
        T.map, T.type, T.tensorRank, T.globalAddress, T.globalDim,
        T.globalStride + 1, T.boxDim, T.elementStrides, T.interleave, T.swizzle,
        T.l2Promotion, T.oobFill);
    if (result != CUDA_SUCCESS) {
      LOG_FATAL << "Failed to initialize the TMA descriptor " << result << '\n'
                << T.ToDebugString();
    }
    *ret = static_cast<int>(result);
  });
107
});
108
109

struct TensorMapIm2ColArgs {
110
  CUtensorMap *map;
111
112
  CUtensorMapDataType type;
  cuuint32_t tensorRank;
113
  void *globalAddress;
114
115
116
117
118
119
120
121
122
  cuuint64_t globalDim[5], globalStride[5];
  cuuint32_t elementStrides[5];
  int pixelBoxLowerCorner[3], pixelBoxUpperCorner[3];
  cuuint32_t smem_box_channel, smem_box_pixel;
  CUtensorMapInterleave interleave;
  CUtensorMapSwizzle swizzle;
  CUtensorMapL2promotion l2Promotion;
  CUtensorMapFloatOOBfill oobFill;

123
  static TensorMapIm2ColArgs Extract(PackedArgs args) {
124
125
    TensorMapIm2ColArgs T;
    int idx = 0;
126
127
128
129
130
    ICHECK(args.size() >= 8);
    T.map = reinterpret_cast<CUtensorMap *>(args[idx++].cast<void *>());
    T.type = static_cast<CUtensorMapDataType>(args[idx++].cast<int64_t>());
    T.tensorRank = static_cast<cuuint32_t>(args[idx++].cast<int64_t>());
    T.globalAddress = args[idx++].cast<void *>();
131
    ICHECK(T.tensorRank >= 3 && T.tensorRank <= 5);
132
    ICHECK(args.size() == static_cast<int>(6 + T.tensorRank * 5));
133
    for (size_t i = 0; i < T.tensorRank; i++) {
134
      T.globalDim[i] = args[idx++].cast<cuuint64_t>();
135
136
    }
    for (size_t i = 0; i < T.tensorRank; i++) {
137
      T.globalStride[i] = args[idx++].cast<cuuint64_t>();
138
139
    }
    for (size_t i = 0; i < T.tensorRank; i++) {
140
      T.elementStrides[i] = args[idx++].cast<cuuint64_t>();
141
142
    }
    for (size_t i = 0; i < T.tensorRank - 2; i++) {
143
      T.pixelBoxLowerCorner[i] = args[idx++].cast<int>();
144
145
    }
    for (size_t i = 0; i < T.tensorRank - 2; i++) {
146
      T.pixelBoxUpperCorner[i] = args[idx++].cast<int>();
147
    }
148
149
    T.smem_box_pixel = args[idx++].cast<cuuint64_t>();
    T.smem_box_channel = args[idx++].cast<cuuint64_t>();
150
    T.interleave =
151
152
        static_cast<CUtensorMapInterleave>(args[idx++].cast<int64_t>());
    T.swizzle = static_cast<CUtensorMapSwizzle>(args[idx++].cast<int64_t>());
153
    T.l2Promotion =
154
        static_cast<CUtensorMapL2promotion>(args[idx++].cast<int64_t>());
155
    T.oobFill =
156
        static_cast<CUtensorMapFloatOOBfill>(args[idx++].cast<int64_t>());
157
158
159
160
161
    return T;
  }

  std::string ToDebugString() {
    std::stringstream ss;
162
163
164
165
166
167
168
169
    ss << "TMA Desc Addr:   " << map << '\n'
       << "format         " << type << '\n'
       << "dim            " << tensorRank << '\n'
       << "gmem_address   " << globalAddress << '\n'
       << "globalDim      " << ArrayToStr(globalDim, tensorRank) << '\n'
       << "globalStrides  " << ArrayToStr(globalStride, tensorRank) << '\n'
       << "smem_box_pixel " << smem_box_pixel << '\n'
       << "smem_box_channel " << smem_box_channel << '\n'
170
       << "pixelBoxLowerCorner  "
171
       << ArrayToStr(pixelBoxLowerCorner, tensorRank - 2) << '\n'
172
       << "pixelBoxUpperCorner  "
173
174
175
176
177
178
       << ArrayToStr(pixelBoxUpperCorner, tensorRank - 2) << '\n'
       << "elementStrides " << ArrayToStr(elementStrides, tensorRank) << '\n'
       << "interleave     " << interleave << '\n'
       << "swizzle        " << swizzle << '\n'
       << "l2Promotion    " << l2Promotion << '\n'
       << "oobFill        " << oobFill << '\n';
179
180
181
182
    return ss.str();
  }
};

183
184
185
186
187
188
189
190
191
192
193
194
// Register the global packed function "tvm_tensormap_create_im2col".
// It decodes its arguments with TensorMapIm2ColArgs::Extract, encodes an
// im2col TMA descriptor into T.map via cuTensorMapEncodeIm2col, and returns
// the CUresult as an int. Any result other than CUDA_SUCCESS is reported
// through LOG_FATAL together with a dump of the decoded arguments.
TVM_FFI_STATIC_INIT_BLOCK({
  namespace refl = tvm::ffi::reflection;
  refl::GlobalDef().def_packed(
      "tvm_tensormap_create_im2col", [](PackedArgs args, Any *ret) {
        TensorMapIm2ColArgs T = TensorMapIm2ColArgs::Extract(args);
        // globalStride + 1 skips the first stride entry: per the CUDA driver
        // API documentation the globalStrides parameter covers only
        // tensorRank - 1 dimensions. The channel count is passed before the
        // pixel count, the reverse of the order in the packed argument list.
        CUresult result = cuTensorMapEncodeIm2col(
            T.map, T.type, T.tensorRank, T.globalAddress, T.globalDim,
            T.globalStride + 1, T.pixelBoxLowerCorner, T.pixelBoxUpperCorner,
            T.smem_box_channel, T.smem_box_pixel, T.elementStrides,
            T.interleave, T.swizzle, T.l2Promotion, T.oobFill);
        if (result != CUDA_SUCCESS) {
          LOG_FATAL << "Failed to initialize the TMA descriptor " << result
                    << '\n'
                    << T.ToDebugString();
        }
        *ret = static_cast<int>(result);
      });
});

202
#endif // (CUDA_MAJOR_VERSION >= 12)
203

204
205
} // namespace tl
} // namespace tvm