amd_buffer_addressing.hpp 40.4 KB
Newer Older
Chao Liu's avatar
Chao Liu committed
1
// SPDX-License-Identifier: MIT
arai713's avatar
arai713 committed
2
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
Chao Liu's avatar
Chao Liu committed
3

4
#pragma once
5
#include "data_type.hpp"
Chao Liu's avatar
Chao Liu committed
6
7
8
9

namespace ck {

template <typename T>
Chao Liu's avatar
tidy  
Chao Liu committed
10
union BufferResource
Chao Liu's avatar
Chao Liu committed
11
{
12
13
    __device__ constexpr BufferResource() : content{} {}

Chao Liu's avatar
Chao Liu committed
14
15
    // 128 bit SGPRs to supply buffer resource in buffer instructions
    // https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions
16
    int32x4_t content;
17
18
19
    StaticallyIndexedArray<T*, 2> address;
    StaticallyIndexedArray<int32_t, 4> range;
    StaticallyIndexedArray<int32_t, 4> config;
Chao Liu's avatar
Chao Liu committed
20
21
22
};

template <typename T>
23
__device__ int32x4_t make_wave_buffer_resource(T* p_wave, index_t element_space_size)
Chao Liu's avatar
Chao Liu committed
24
{
Chao Liu's avatar
tidy  
Chao Liu committed
25
    BufferResource<T> wave_buffer_resource;
Chao Liu's avatar
Chao Liu committed
26
27

    // wavewise base address (64 bit)
28
    wave_buffer_resource.address(Number<0>{}) = const_cast<remove_cv_t<T>*>(p_wave);
Chao Liu's avatar
Chao Liu committed
29
    // wavewise range (32 bit)
30
    wave_buffer_resource.range(Number<2>{}) = element_space_size * sizeof(T);
Chao Liu's avatar
Chao Liu committed
31
    // wavewise setting (32 bit)
32
    wave_buffer_resource.config(Number<3>{}) = CK_BUFFER_RESOURCE_3RD_DWORD;
Chao Liu's avatar
Chao Liu committed
33

34
    return wave_buffer_resource.content;
Chao Liu's avatar
Chao Liu committed
35
36
}

carlushuang's avatar
carlushuang committed
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
template <typename T>
__device__ int32x4_t make_wave_buffer_resource_with_default_range(T* p_wave)
{
    BufferResource<T> wave_buffer_resource;

    // wavewise base address (64 bit)
    wave_buffer_resource.address(Number<0>{}) = const_cast<remove_cv_t<T>*>(p_wave);
    // wavewise range (32 bit)
    wave_buffer_resource.range(Number<2>{}) = 0xffffffff; // max possible range
    // wavewise setting (32 bit)
    wave_buffer_resource.config(Number<3>{}) = CK_BUFFER_RESOURCE_3RD_DWORD;

    return wave_buffer_resource.content;
}

52
template <typename T>
illsilin's avatar
illsilin committed
53
54
__device__ __amdgpu_buffer_rsrc_t make_wave_buffer_resource_new(T* p_wave,
                                                                index_t element_space_size)
55
56
{
    // wavewise base address (64 bit)
illsilin's avatar
illsilin committed
57
    auto p         = const_cast<remove_cv_t<T>*>(p_wave);
58
    int32_t stride = 0;
illsilin's avatar
illsilin committed
59
60
    int32_t num    = element_space_size * sizeof(T);
    auto flags     = CK_BUFFER_RESOURCE_3RD_DWORD;
Chao Liu's avatar
Chao Liu committed
61

62
63
    return __builtin_amdgcn_make_buffer_rsrc(p, stride, num, flags);
}
64

Chao Liu's avatar
Chao Liu committed
65
66
67
68
69
70
71
72
73
// buffer atomic-add fp16
__device__ half2_t llvm_amdgcn_raw_buffer_atomic_add_fp16x2(
    half2_t vdata,
    int32x4_t rsrc,
    index_t voffset,
    index_t soffset,
    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2f16");

// buffer atomic-add i32
zjing14's avatar
zjing14 committed
74
75
76
77
78
79
80
__device__ int32_t llvm_amdgcn_raw_buffer_atomic_add_i32(
    int32_t vdata,
    int32x4_t rsrc,
    index_t voffset,
    index_t soffset,
    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i32");

Chao Liu's avatar
Chao Liu committed
81
// buffer atomic-add fp32
zjing14's avatar
zjing14 committed
82
83
84
85
86
87
__device__ float llvm_amdgcn_raw_buffer_atomic_add_fp32(
    float vdata,
    int32x4_t rsrc,
    index_t voffset,
    index_t soffset,
    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32");
Chao Liu's avatar
Chao Liu committed
88

rocking5566's avatar
rocking5566 committed
89
90
91
92
93
94
95
96
// buffer atomic-add fp32
__device__ double
llvm_amdgcn_raw_buffer_atomic_max_fp64(double vdata,
                                       int32x4_t rsrc, // dst_wave_buffer_resource
                                       int voffset,    // dst_thread_addr_offset
                                       int soffset,    // dst_wave_addr_offset
                                       int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64");

97
98
99
100
101
102
103
104
105
106
107
// memory coherency bit for buffer store/load instruction
// check ISA manual for each GFX target
// e.g. for
// https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf,
// page 67~68
enum struct AmdBufferCoherenceEnum
{
    DefaultCoherence = 0, // default value
    GLC              = 1,
    SLC              = 2,
    GLC_SLC          = 3,
108
109
110
111
112
113
114
115
116
117
118
    // gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
    // SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system
    // NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse
    WAVE_NT0   = 0,
    WAVE_NT1   = 2,
    GROUP_NT0  = 1,
    GROUP_NT1  = 3,
    DEVICE_NT0 = 8,
    DEVICE_NT1 = 10,
    SYSTEM_NT0 = 9,
    SYSTEM_NT1 = 11,
119
120
};

zjing14's avatar
zjing14 committed
121
122
template <index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ typename vector_type<int8_t, N>::type
123
amd_buffer_load_impl_raw(__amdgpu_buffer_rsrc_t src_wave_buffer_resource,
zjing14's avatar
zjing14 committed
124
125
                         index_t src_thread_addr_offset,
                         index_t src_wave_addr_offset)
Chao Liu's avatar
Chao Liu committed
126
{
zjing14's avatar
zjing14 committed
127
128
    static_assert(N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32 || N == 64,
                  "wrong! not implemented");
Chao Liu's avatar
Chao Liu committed
129

zjing14's avatar
zjing14 committed
130
    if constexpr(N == 1)
Chao Liu's avatar
Chao Liu committed
131
    {
132
        return __builtin_amdgcn_raw_buffer_load_b8(src_wave_buffer_resource,
illsilin's avatar
illsilin committed
133
134
135
                                                   src_thread_addr_offset,
                                                   src_wave_addr_offset,
                                                   static_cast<index_t>(coherence));
Chao Liu's avatar
Chao Liu committed
136
    }
zjing14's avatar
zjing14 committed
137
    else if constexpr(N == 2)
Chao Liu's avatar
Chao Liu committed
138
    {
zjing14's avatar
zjing14 committed
139

140
        int16_t tmp = __builtin_amdgcn_raw_buffer_load_b16(src_wave_buffer_resource,
illsilin's avatar
illsilin committed
141
142
143
                                                           src_thread_addr_offset,
                                                           src_wave_addr_offset,
                                                           static_cast<index_t>(coherence));
Chao Liu's avatar
Chao Liu committed
144

zjing14's avatar
zjing14 committed
145
        return bit_cast<int8x2_t>(tmp);
146
    }
zjing14's avatar
zjing14 committed
147
    else if constexpr(N == 4)
148
    {
149
        int32_t tmp = __builtin_amdgcn_raw_buffer_load_b32(src_wave_buffer_resource,
illsilin's avatar
illsilin committed
150
151
152
                                                           src_thread_addr_offset,
                                                           src_wave_addr_offset,
                                                           static_cast<index_t>(coherence));
zjing14's avatar
zjing14 committed
153

zjing14's avatar
zjing14 committed
154
        return bit_cast<int8x4_t>(tmp);
Chao Liu's avatar
Chao Liu committed
155
    }
zjing14's avatar
zjing14 committed
156
    else if constexpr(N == 8)
157
    {
158
        int32x2_t tmp = __builtin_amdgcn_raw_buffer_load_b64(src_wave_buffer_resource,
illsilin's avatar
illsilin committed
159
160
161
                                                             src_thread_addr_offset,
                                                             src_wave_addr_offset,
                                                             static_cast<index_t>(coherence));
162

zjing14's avatar
zjing14 committed
163
164
165
166
        return bit_cast<int8x8_t>(tmp);
    }
    else if constexpr(N == 16)
    {
167
        int32x4_t tmp = __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
illsilin's avatar
illsilin committed
168
169
170
                                                              src_thread_addr_offset,
                                                              src_wave_addr_offset,
                                                              static_cast<index_t>(coherence));
zjing14's avatar
zjing14 committed
171
172
173
174
        return bit_cast<int8x16_t>(tmp);
    }
    else if constexpr(N == 32)
    {
175
        int32x4_t tmp0 = __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
illsilin's avatar
illsilin committed
176
177
178
                                                               src_thread_addr_offset,
                                                               src_wave_addr_offset,
                                                               static_cast<index_t>(coherence));
zjing14's avatar
zjing14 committed
179
        int32x4_t tmp1 =
180
            __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
illsilin's avatar
illsilin committed
181
182
183
                                                  src_thread_addr_offset,
                                                  src_wave_addr_offset + 4 * sizeof(int32_t),
                                                  static_cast<index_t>(coherence));
zjing14's avatar
zjing14 committed
184
        vector_type<int32_t, 8> tmp;
185

zjing14's avatar
zjing14 committed
186
187
        tmp.AsType<int32x4_t>()(Number<0>{}) = tmp0;
        tmp.AsType<int32x4_t>()(Number<1>{}) = tmp1;
188

zjing14's avatar
zjing14 committed
189
190
191
192
        return bit_cast<int8x32_t>(tmp);
    }
    else if constexpr(N == 64)
    {
193
        int32x4_t tmp0 = __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
illsilin's avatar
illsilin committed
194
195
196
                                                               src_thread_addr_offset,
                                                               src_wave_addr_offset,
                                                               static_cast<index_t>(coherence));
zjing14's avatar
zjing14 committed
197
        int32x4_t tmp1 =
198
            __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
illsilin's avatar
illsilin committed
199
200
201
                                                  src_thread_addr_offset,
                                                  src_wave_addr_offset + 4 * sizeof(int32_t),
                                                  static_cast<index_t>(coherence));
zjing14's avatar
zjing14 committed
202
        int32x4_t tmp2 =
203
            __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
illsilin's avatar
illsilin committed
204
205
206
                                                  src_thread_addr_offset,
                                                  src_wave_addr_offset + 8 * sizeof(int32_t),
                                                  static_cast<index_t>(coherence));
zjing14's avatar
zjing14 committed
207
        int32x4_t tmp3 =
208
            __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
illsilin's avatar
illsilin committed
209
210
211
                                                  src_thread_addr_offset,
                                                  src_wave_addr_offset + 12 * sizeof(int32_t),
                                                  static_cast<index_t>(coherence));
212

zjing14's avatar
zjing14 committed
213
214
215
216
217
218
219
220
        vector_type<int32_t, 16> tmp;

        tmp.AsType<int32x4_t>()(Number<0>{}) = tmp0;
        tmp.AsType<int32x4_t>()(Number<1>{}) = tmp1;
        tmp.AsType<int32x4_t>()(Number<2>{}) = tmp2;
        tmp.AsType<int32x4_t>()(Number<3>{}) = tmp3;

        return bit_cast<int8x64_t>(tmp);
221
    }
Chao Liu's avatar
Chao Liu committed
222
223
}

224
225
226
template <typename T,
          index_t N,
          AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
illsilin's avatar
illsilin committed
227
228
229
230
__device__ typename vector_type<T, N>::type
amd_buffer_load_impl(__amdgpu_buffer_rsrc_t src_wave_buffer_resource,
                     index_t src_thread_addr_offset,
                     index_t src_wave_addr_offset)
Chao Liu's avatar
Chao Liu committed
231
{
232
    static_assert(
zjing14's avatar
zjing14 committed
233
234
235
236
237
238
239
        (is_same<T, double>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
            (is_same<T, float>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
            (is_same<T, half_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
            (is_same<T, bhalf_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
            (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
            (is_same<T, f8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
            (is_same<T, bf8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
zjing14's avatar
zjing14 committed
240
            (is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
241
242
            (is_same<T, uint8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
            (is_same<T, pk_i4_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)),
243
        "wrong! not implemented");
Chao Liu's avatar
Chao Liu committed
244

zjing14's avatar
zjing14 committed
245
246
247
248
249
250
251
252
253
    using r_t     = typename vector_type<T, N>::type;
    auto raw_data = amd_buffer_load_impl_raw<sizeof(T) * N, coherence>(
        src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset);
    return bit_cast<r_t>(raw_data);
}

template <index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ void
amd_buffer_store_impl_raw(const typename vector_type<int8_t, N>::type src_thread_data,
254
                          __amdgpu_buffer_rsrc_t dst_wave_buffer_resource,
zjing14's avatar
zjing14 committed
255
256
257
258
259
260
261
                          index_t dst_thread_addr_offset,
                          index_t dst_wave_addr_offset)
{
    static_assert(N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32 || N == 64,
                  "wrong! not implemented");

    if constexpr(N == 1)
Chao Liu's avatar
Chao Liu committed
262
    {
263
        __builtin_amdgcn_raw_buffer_store_b8(src_thread_data,
illsilin's avatar
illsilin committed
264
265
266
267
                                             dst_wave_buffer_resource,
                                             dst_thread_addr_offset,
                                             dst_wave_addr_offset,
                                             static_cast<index_t>(coherence));
Chao Liu's avatar
Chao Liu committed
268
    }
zjing14's avatar
zjing14 committed
269
    else if constexpr(N == 2)
Chao Liu's avatar
Chao Liu committed
270
    {
zjing14's avatar
zjing14 committed
271

272
        __builtin_amdgcn_raw_buffer_store_b16(bit_cast<int16_t>(src_thread_data),
illsilin's avatar
illsilin committed
273
274
275
276
                                              dst_wave_buffer_resource,
                                              dst_thread_addr_offset,
                                              dst_wave_addr_offset,
                                              static_cast<index_t>(coherence));
Chao Liu's avatar
Chao Liu committed
277
    }
zjing14's avatar
zjing14 committed
278
    else if constexpr(N == 4)
Chao Liu's avatar
Chao Liu committed
279
    {
280
        __builtin_amdgcn_raw_buffer_store_b32(bit_cast<int32_t>(src_thread_data),
illsilin's avatar
illsilin committed
281
282
283
284
                                              dst_wave_buffer_resource,
                                              dst_thread_addr_offset,
                                              dst_wave_addr_offset,
                                              static_cast<index_t>(coherence));
285
    }
zjing14's avatar
zjing14 committed
286
    else if constexpr(N == 8)
287
    {
288
        __builtin_amdgcn_raw_buffer_store_b64(bit_cast<int32x2_t>(src_thread_data),
illsilin's avatar
illsilin committed
289
290
291
292
                                              dst_wave_buffer_resource,
                                              dst_thread_addr_offset,
                                              dst_wave_addr_offset,
                                              static_cast<index_t>(coherence));
Chao Liu's avatar
Chao Liu committed
293
    }
zjing14's avatar
zjing14 committed
294
    else if constexpr(N == 16)
Chao Liu's avatar
Chao Liu committed
295
    {
296
        __builtin_amdgcn_raw_buffer_store_b128(bit_cast<int32x4_t>(src_thread_data),
illsilin's avatar
illsilin committed
297
298
299
300
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset,
                                               static_cast<index_t>(coherence));
Chao Liu's avatar
Chao Liu committed
301
    }
zjing14's avatar
zjing14 committed
302
    else if constexpr(N == 32)
Chao Liu's avatar
Chao Liu committed
303
    {
zjing14's avatar
zjing14 committed
304
305
        vector_type<int32_t, 8> tmp{bit_cast<int32x8_t>(src_thread_data)};

306
        __builtin_amdgcn_raw_buffer_store_b128(tmp.template AsType<int32x4_t>()[Number<0>{}],
illsilin's avatar
illsilin committed
307
308
309
310
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset,
                                               static_cast<index_t>(coherence));
zjing14's avatar
zjing14 committed
311

312
        __builtin_amdgcn_raw_buffer_store_b128(tmp.template AsType<int32x4_t>()[Number<1>{}],
illsilin's avatar
illsilin committed
313
314
315
316
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset + sizeof(int32_t) * 4,
                                               static_cast<index_t>(coherence));
zjing14's avatar
zjing14 committed
317
318
319
320
321
    }
    else if constexpr(N == 64)
    {
        vector_type<int32_t, 16> tmp{bit_cast<int32x16_t>(src_thread_data)};

322
        __builtin_amdgcn_raw_buffer_store_b128(tmp.template AsType<int32x4_t>()[Number<0>{}],
illsilin's avatar
illsilin committed
323
324
325
326
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset,
                                               static_cast<index_t>(coherence));
zjing14's avatar
zjing14 committed
327

328
        __builtin_amdgcn_raw_buffer_store_b128(tmp.template AsType<int32x4_t>()[Number<1>{}],
illsilin's avatar
illsilin committed
329
330
331
332
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset + sizeof(int32_t) * 4,
                                               static_cast<index_t>(coherence));
zjing14's avatar
zjing14 committed
333

334
        __builtin_amdgcn_raw_buffer_store_b128(tmp.template AsType<int32x4_t>()[Number<2>{}],
illsilin's avatar
illsilin committed
335
336
337
338
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset + sizeof(int32_t) * 8,
                                               static_cast<index_t>(coherence));
zjing14's avatar
zjing14 committed
339

340
        __builtin_amdgcn_raw_buffer_store_b128(tmp.template AsType<int32x4_t>()[Number<3>{}],
illsilin's avatar
illsilin committed
341
342
343
344
                                               dst_wave_buffer_resource,
                                               dst_thread_addr_offset,
                                               dst_wave_addr_offset + sizeof(int32_t) * 12,
                                               static_cast<index_t>(coherence));
345
    }
Chao Liu's avatar
Chao Liu committed
346
347
}

zjing14's avatar
zjing14 committed
348
349
350
351
template <typename T,
          index_t N,
          AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src_thread_data,
352
                                      __amdgpu_buffer_rsrc_t dst_wave_buffer_resource,
zjing14's avatar
zjing14 committed
353
354
355
356
357
358
359
360
361
                                      index_t dst_thread_addr_offset,
                                      index_t dst_wave_addr_offset)
{
    static_assert(
        (is_same<T, double>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
            (is_same<T, float>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
            (is_same<T, half_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
            (is_same<T, bhalf_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
            (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
362
363
364
365
            (is_same<T, f8_fnuz_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
            (is_same<T, bf8_fnuz_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
            (is_same<T, fp8_storage_t>::value &&
             (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
zjing14's avatar
zjing14 committed
366
367
368
369
370
371
372
373
374
375
376
            (is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)),
        "wrong! not implemented");

    using r_t = typename vector_type<int8_t, sizeof(T) * N>::type;

    amd_buffer_store_impl_raw<sizeof(T) * N, coherence>(bit_cast<r_t>(src_thread_data),
                                                        dst_wave_buffer_resource,
                                                        dst_thread_addr_offset,
                                                        dst_wave_addr_offset);
}

377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
template <typename T, index_t N>
__device__ void amd_global_atomic_add_impl(const typename vector_type<T, N>::type src_thread_data,
                                           T* addr)
{
    static_assert((is_same<T, bhalf_t>::value && (N == 2 || N == 4 || N == 8)) ||
                      (is_same<T, half_t>::value && (N == 2 || N == 4 || N == 8)),
                  "wrong! not implemented");

    if constexpr(is_same<T, half_t>::value)
    {
        vector_type<half_t, N> tmp{src_thread_data};
        static_for<0, N / 2, 1>{}([&](auto i) {
            __builtin_amdgcn_global_atomic_fadd_v2f16(bit_cast<half2_t*>(addr) + i,
                                                      tmp.template AsType<half2_t>()[i]);
        });
    }
Illia Silin's avatar
Illia Silin committed
393
#if defined(__gfx942__) || defined(__gfx950__)
394
395
396
397
398
399
400
401
402
403
404
    else if constexpr(is_same<T, bhalf_t>::value)
    {
        vector_type<bhalf_t, N> tmp{src_thread_data};
        static_for<0, N / 2, 1>{}([&](auto i) {
            __builtin_amdgcn_global_atomic_fadd_v2bf16(bit_cast<bhalf2_t*>(addr) + i,
                                                       tmp.template AsType<bhalf2_t>()[i]);
        });
    }
#endif
}

zjing14's avatar
zjing14 committed
405
406
407
408
409
410
411
template <typename T, index_t N>
__device__ void amd_buffer_atomic_add_impl(const typename vector_type<T, N>::type src_thread_data,
                                           int32x4_t dst_wave_buffer_resource,
                                           index_t dst_thread_addr_offset,
                                           index_t dst_wave_addr_offset)
{
    static_assert((is_same<T, float>::value && (N == 1 || N == 2 || N == 4)) ||
Chao Liu's avatar
Chao Liu committed
412
                      (is_same<T, half_t>::value && (N == 2 || N == 4 || N == 8)) ||
zjing14's avatar
zjing14 committed
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
                      (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4)),
                  "wrong! not implemented");

    if constexpr(is_same<T, float>::value)
    {
        if constexpr(N == 1)
        {
            llvm_amdgcn_raw_buffer_atomic_add_fp32(src_thread_data,
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset,
                                                   0);
        }
        else if constexpr(N == 2)
        {
            vector_type<float, 2> tmp{src_thread_data};

            llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[Number<0>{}],
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset,
                                                   0);

            llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[Number<1>{}],
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset + sizeof(float),
                                                   0);
        }
        else if constexpr(N == 4)
        {
            vector_type<float, 4> tmp{src_thread_data};

            llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[Number<0>{}],
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset,
                                                   0);

            llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[Number<1>{}],
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset + sizeof(float),
                                                   0);

            llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[Number<2>{}],
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset + 2 * sizeof(float),
                                                   0);

            llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType<float>()[Number<3>{}],
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset + 3 * sizeof(float),
                                                   0);
        }
    }
Chao Liu's avatar
Chao Liu committed
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
    else if constexpr(is_same<T, half_t>::value)
    {
        if constexpr(N == 2)
        {
            llvm_amdgcn_raw_buffer_atomic_add_fp16x2(src_thread_data,
                                                     dst_wave_buffer_resource,
                                                     dst_thread_addr_offset,
                                                     dst_wave_addr_offset,
                                                     0);
        }
        else if constexpr(N == 4)
        {
            vector_type<half_t, 4> tmp{src_thread_data};

            static_for<0, 2, 1>{}([&](auto i) {
                llvm_amdgcn_raw_buffer_atomic_add_fp16x2(tmp.AsType<half2_t>()[i],
                                                         dst_wave_buffer_resource,
                                                         dst_thread_addr_offset,
                                                         dst_wave_addr_offset + i * sizeof(half2_t),
                                                         0);
            });
        }
        else if constexpr(N == 8)
        {
            vector_type<half_t, 8> tmp{src_thread_data};

            static_for<0, 4, 1>{}([&](auto i) {
                llvm_amdgcn_raw_buffer_atomic_add_fp16x2(tmp.AsType<half2_t>()[i],
                                                         dst_wave_buffer_resource,
                                                         dst_thread_addr_offset,
                                                         dst_wave_addr_offset + i * sizeof(half2_t),
                                                         0);
            });
        }
    }
zjing14's avatar
zjing14 committed
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
    else if constexpr(is_same<T, int32_t>::value)
    {
        if constexpr(N == 1)
        {
            llvm_amdgcn_raw_buffer_atomic_add_i32(src_thread_data,
                                                  dst_wave_buffer_resource,
                                                  dst_thread_addr_offset,
                                                  dst_wave_addr_offset,
                                                  0);
        }
        else if constexpr(N == 2)
        {
            vector_type<int32_t, 2> tmp{src_thread_data};

            llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType<int32_t>()[Number<0>{}],
                                                  dst_wave_buffer_resource,
                                                  dst_thread_addr_offset,
                                                  dst_wave_addr_offset,
                                                  0);

            llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType<int32_t>()[Number<1>{}],
                                                  dst_wave_buffer_resource,
                                                  dst_thread_addr_offset,
                                                  dst_wave_addr_offset + sizeof(int32_t),
                                                  0);
        }
        else if constexpr(N == 4)
        {
            vector_type<int32_t, 4> tmp{src_thread_data};

            llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType<int32_t>()[Number<0>{}],
                                                  dst_wave_buffer_resource,
                                                  dst_thread_addr_offset,
                                                  dst_wave_addr_offset,
                                                  0);

            llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType<int32_t>()[Number<1>{}],
                                                  dst_wave_buffer_resource,
                                                  dst_thread_addr_offset,
                                                  dst_wave_addr_offset + sizeof(int32_t),
                                                  0);

            llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType<int32_t>()[Number<2>{}],
                                                  dst_wave_buffer_resource,
                                                  dst_thread_addr_offset,
                                                  dst_wave_addr_offset + 2 * sizeof(int32_t),
                                                  0);

            llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType<int32_t>()[Number<3>{}],
                                                  dst_wave_buffer_resource,
                                                  dst_thread_addr_offset,
                                                  dst_wave_addr_offset + 3 * sizeof(int32_t),
                                                  0);
        }
    }
}

rocking5566's avatar
rocking5566 committed
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
template <typename T, index_t N>
__device__ void amd_buffer_atomic_max_impl(const typename vector_type<T, N>::type src_thread_data,
                                           int32x4_t dst_wave_buffer_resource,
                                           index_t dst_thread_addr_offset,
                                           index_t dst_wave_addr_offset)
{
    static_assert((is_same<T, double>::value && (N == 1 || N == 2 || N == 4)),
                  "wrong! not implemented");
    if constexpr(is_same<T, double>::value)
    {
        if constexpr(N == 1)
        {
            llvm_amdgcn_raw_buffer_atomic_max_fp64(src_thread_data,
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset,
                                                   0);
        }
        else if constexpr(N == 2)
        {
            vector_type<double, 2> tmp{src_thread_data};

            llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType<double>()[Number<0>{}],
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset,
                                                   0);

            llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType<double>()[Number<1>{}],
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset + sizeof(double),
                                                   0);
        }
        else if constexpr(N == 4)
        {
            vector_type<double, 4> tmp{src_thread_data};

            llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType<double>()[Number<0>{}],
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset,
                                                   0);

            llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType<double>()[Number<1>{}],
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset + sizeof(double),
                                                   0);

            llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType<double>()[Number<2>{}],
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset + 2 * sizeof(double),
                                                   0);

            llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType<double>()[Number<3>{}],
                                                   dst_wave_buffer_resource,
                                                   dst_thread_addr_offset,
                                                   dst_wave_addr_offset + 3 * sizeof(double),
                                                   0);
        }
    }
}

Chao Liu's avatar
Chao Liu committed
628
// buffer_load requires:
Chao Liu's avatar
Chao Liu committed
629
//   1) p_src_wave must point to global memory space
630
//   2) p_src_wave must be a wavewise pointer.
Chao Liu's avatar
Chao Liu committed
631
// It is user's responsibility to make sure that is true.
632
633
634
template <typename T,
          index_t N,
          AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
635
__device__ typename vector_type_maker<T, N>::type::type
Jianfeng Yan's avatar
Jianfeng Yan committed
636
637
638
639
amd_buffer_load_invalid_element_return_zero(const T* p_src_wave,
                                            index_t src_thread_element_offset,
                                            bool src_thread_element_valid,
                                            index_t src_element_space_size)
Chao Liu's avatar
Chao Liu committed
640
{
641
642
    const __amdgpu_buffer_rsrc_t src_wave_buffer_resource =
        make_wave_buffer_resource_new(p_src_wave, src_element_space_size);
Chao Liu's avatar
Chao Liu committed
643

644
645
646
647
    index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);

    using vector_t = typename vector_type_maker<T, N>::type::type;
    using scalar_t = typename scalar_type<vector_t>::type;
Chao Liu's avatar
Chao Liu committed
648

649
650
    constexpr index_t vector_size = scalar_type<vector_t>::vector_size;

Chao Liu's avatar
Chao Liu committed
651
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
652
    uint32_t src_addr_shift = src_thread_element_valid ? 0 : 0x80000000;
zjing14's avatar
zjing14 committed
653
654
655
    return amd_buffer_load_impl<scalar_t, vector_size, coherence>(
        src_wave_buffer_resource, src_addr_shift + src_thread_addr_offset, 0);

Chao Liu's avatar
Chao Liu committed
656
#else
zjing14's avatar
zjing14 committed
657

658
659
    vector_t tmp{amd_buffer_load_impl<scalar_t, vector_size, coherence>(
        src_wave_buffer_resource, src_thread_addr_offset, 0)};
zjing14's avatar
zjing14 committed
660
    return src_thread_element_valid ? tmp : vector_t(0);
661
#endif
Chao Liu's avatar
Chao Liu committed
662
663
}

664
// buffer_load requires:
Chao Liu's avatar
Chao Liu committed
665
//   1) p_src_wave must point to global memory space
666
667
//   2) p_src_wave must be a wavewise pointer.
// It is user's responsibility to make sure that is true.
668
669
670
template <typename T,
          index_t N,
          AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
671
672
673
674
675
676
677
__device__ typename vector_type_maker<T, N>::type::type
amd_buffer_load_invalid_element_return_customized_value(const T* p_src_wave,
                                                        index_t src_thread_element_offset,
                                                        bool src_thread_element_valid,
                                                        index_t src_element_space_size,
                                                        T customized_value)
{
678
679
    const __amdgpu_buffer_rsrc_t src_wave_buffer_resource =
        make_wave_buffer_resource_new(p_src_wave, src_element_space_size);
680
681
682
683
684
685
686
687

    index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);

    using vector_t = typename vector_type_maker<T, N>::type::type;
    using scalar_t = typename scalar_type<vector_t>::type;

    constexpr index_t vector_size = scalar_type<vector_t>::vector_size;

688
689
    vector_t tmp{amd_buffer_load_impl<scalar_t, vector_size, coherence>(
        src_wave_buffer_resource, src_thread_addr_offset, 0)};
690
691
692
693

    return src_thread_element_valid ? tmp : vector_t(customized_value);
}

Chao Liu's avatar
Chao Liu committed
694
// buffer_store requires:
Chao Liu's avatar
Chao Liu committed
695
//   1) p_dst_wave must point to global memory
zjing14's avatar
zjing14 committed
696
//   2) p_dst_wave must be a wavewise pointer.
Chao Liu's avatar
Chao Liu committed
697
// It is user's responsibility to make sure that is true.
698
699
700
template <typename T,
          index_t N,
          AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
701
702
703
704
705
__device__ void amd_buffer_store(const typename vector_type_maker<T, N>::type::type src_thread_data,
                                 T* p_dst_wave,
                                 const index_t dst_thread_element_offset,
                                 const bool dst_thread_element_valid,
                                 const index_t dst_element_space_size)
Chao Liu's avatar
Chao Liu committed
706
{
707
708
    const __amdgpu_buffer_rsrc_t dst_wave_buffer_resource =
        make_wave_buffer_resource_new(p_dst_wave, dst_element_space_size);
Chao Liu's avatar
Chao Liu committed
709

710
    index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T);
Chao Liu's avatar
Chao Liu committed
711

712
713
714
715
    using vector_t                = typename vector_type_maker<T, N>::type::type;
    using scalar_t                = typename scalar_type<vector_t>::type;
    constexpr index_t vector_size = scalar_type<vector_t>::vector_size;

Chao Liu's avatar
Chao Liu committed
716
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
717
    uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
zjing14's avatar
zjing14 committed
718
719
    amd_buffer_store_impl<scalar_t, vector_size, coherence>(
        src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
Chao Liu's avatar
Chao Liu committed
720
#else
721
    if(dst_thread_element_valid)
Chao Liu's avatar
Chao Liu committed
722
    {
zjing14's avatar
zjing14 committed
723
724
        amd_buffer_store_impl<scalar_t, vector_size, coherence>(
            src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
Chao Liu's avatar
Chao Liu committed
725
726
727
728
    }
#endif
}

zjing14's avatar
zjing14 committed
729
// buffer_atomic_add requires:
Chao Liu's avatar
Chao Liu committed
730
//   1) p_dst_wave must point to global memory
zjing14's avatar
zjing14 committed
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
//   2) p_dst_wave must be a wavewise pointer.
// It is user's responsibility to make sure that is true.
template <typename T, index_t N>
__device__ void
amd_buffer_atomic_add(const typename vector_type_maker<T, N>::type::type src_thread_data,
                      T* p_dst_wave,
                      const index_t dst_thread_element_offset,
                      const bool dst_thread_element_valid,
                      const index_t dst_element_space_size)
{
    const int32x4_t dst_wave_buffer_resource =
        make_wave_buffer_resource(p_dst_wave, dst_element_space_size);

    index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T);

    using vector_t                = typename vector_type_maker<T, N>::type::type;
    using scalar_t                = typename scalar_type<vector_t>::type;
    constexpr index_t vector_size = scalar_type<vector_t>::vector_size;

750
751
752
753
754
755
756
757
758
759
    if constexpr(is_same<T, bhalf_t>::value)
    {
        if(dst_thread_element_valid)
        {
            amd_global_atomic_add_impl<scalar_t, vector_size>(
                src_thread_data, p_dst_wave + dst_thread_element_offset);
        }
    }
    else
    {
zjing14's avatar
zjing14 committed
760
#if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK
761
        uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
zjing14's avatar
zjing14 committed
762
763

        amd_buffer_atomic_add_impl<scalar_t, vector_size>(
764
765
766
767
768
769
770
            src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
#else
        if(dst_thread_element_valid)
        {
            amd_buffer_atomic_add_impl<scalar_t, vector_size>(
                src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
        }
zjing14's avatar
zjing14 committed
771
#endif
772
    }
zjing14's avatar
zjing14 committed
773
774
}

rocking5566's avatar
rocking5566 committed
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
// buffer_atomic_max requires:
//   1) p_dst_wave must point to global memory
//   2) p_dst_wave must be a wavewise pointer.
// It is user's responsibility to make sure that is true.
template <typename T, index_t N>
__device__ void
amd_buffer_atomic_max(const typename vector_type_maker<T, N>::type::type src_thread_data,
                      T* p_dst_wave,
                      const index_t dst_thread_element_offset,
                      const bool dst_thread_element_valid,
                      const index_t dst_element_space_size)
{
    const int32x4_t dst_wave_buffer_resource =
        make_wave_buffer_resource(p_dst_wave, dst_element_space_size);

    index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T);

    using vector_t                = typename vector_type_maker<T, N>::type::type;
    using scalar_t                = typename scalar_type<vector_t>::type;
    constexpr index_t vector_size = scalar_type<vector_t>::vector_size;

#if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_MAX_OOB_CHECK_OFFSET_TRICK
797
    uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
rocking5566's avatar
rocking5566 committed
798
799
800
801
802
803
804
805
806
807
808
809

    amd_buffer_atomic_max_impl<scalar_t, vector_size>(
        src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
#else
    if(dst_thread_element_valid)
    {
        amd_buffer_atomic_max_impl<scalar_t, vector_size>(
            src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
    }
#endif
}

810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
// Direct loads from global to LDS.
__device__ void
llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc,
                                __attribute__((address_space(3))) uint32_t* lds_ptr,
                                index_t size,
                                index_t voffset,
                                index_t soffset,
                                index_t offset,
                                index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds");

template <typename T, index_t NumElemsPerThread>
__device__ void amd_direct_load_global_to_lds(const T* global_base_ptr,
                                              const index_t global_offset,
                                              T* lds_base_ptr,
                                              const index_t lds_offset,
                                              const bool is_valid,
                                              const index_t src_element_space_size)
{
    // Direct loads require that each thread reads and writes exactly a single DWORD.
    constexpr auto dword_bytes      = 4;
    constexpr auto bytes_per_thread = sizeof(T) * NumElemsPerThread;
    static_assert(bytes_per_thread == dword_bytes);

arai713's avatar
arai713 committed
833
#ifndef CK_CODE_GEN_RTC
834
835
    const uint32_t* global_ptr =
        reinterpret_cast<uint32_t*>(reinterpret_cast<uintptr_t>(global_base_ptr));
arai713's avatar
arai713 committed
836
837
838
839
#else
    const uint32_t* global_ptr =
        reinterpret_cast<uint32_t*>(reinterpret_cast<size_t>(global_base_ptr));
#endif
840
841
842
    const int32x4_t src_resource = make_wave_buffer_resource(global_ptr, src_element_space_size);
    const index_t global_offset_bytes = is_valid ? global_offset * sizeof(T) : 0x80000000;

843
844
#if CK_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM
    T* lds_ptr = lds_base_ptr + lds_offset;
arai713's avatar
arai713 committed
845
#ifndef CK_CODE_GEN_RTC
846
847
    auto const lds_ptr_sgpr =
        __builtin_amdgcn_readfirstlane((reinterpret_cast<uintptr_t>(lds_ptr)));
arai713's avatar
arai713 committed
848
849
850
#else
    auto const lds_ptr_sgpr = __builtin_amdgcn_readfirstlane((reinterpret_cast<size_t>(lds_ptr)));
#endif
851
852
853
    asm volatile("s_mov_b32 m0, %0; \n\t"
                 "buffer_load_dword %1, %2, 0 offen lds;\n\t" ::"s"(lds_ptr_sgpr),
                 "v"(global_offset_bytes),
854
855
                 "s"(src_resource)
                 : "memory");
856
#else
857
858
    // LDS pointer must be attributed with the LDS address space.
    __attribute__((address_space(3))) uint32_t* lds_ptr =
arai713's avatar
arai713 committed
859
#ifndef CK_CODE_GEN_RTC
860
861
        reinterpret_cast<__attribute__((address_space(3))) uint32_t*>(
            reinterpret_cast<uintptr_t>(lds_base_ptr + lds_offset));
arai713's avatar
arai713 committed
862
863
864
865
#else
        reinterpret_cast<__attribute__((address_space(3))) uint32_t*>(
            reinterpret_cast<size_t>(lds_base_ptr + lds_offset));
#endif
866
867
868

    llvm_amdgcn_raw_buffer_load_lds(
        src_resource, lds_ptr, sizeof(uint32_t), global_offset_bytes, 0, 0, 0);
869
#endif
870
871
}

Chao Liu's avatar
Chao Liu committed
872
} // namespace ck