# dev_subm.py
import sys
from pathlib import Path
from typing import Dict, List, Tuple
import pickle
import sys
import time
from pathlib import Path
from cumm.gemm.algospec.core import GemmAlgo

import numpy as np
import pccm
import torch
import torch.nn.functional as F

from cumm import dtypes
from cumm import tensorview as tv
from cumm.constants import PACKAGE_ROOT
from cumm.conv.bases import NCHW, NHWC, ConvIterAlgo, ConvOpType
from cumm.conv.main import ConvMainUnitTest, gen_gemm_kernels
from cumm.conv.params import ConvProblem
from cumm.gemm import kernel
import os
from spconv.core_cc.csrc.sparse.all import SpconvOps
from cumm.gemm.codeops import div_up
from spconv.constants import PACKAGE_ROOT
from spconv.core import ConvAlgo

from spconv.pytorch import ops
from spconv.algo import CONV, BestConvAlgoByProfile
from spconv.pytorch.cppcore import torch_tensor_to_tv

def reduce_mask_count(mask: np.ndarray, width: int):
    """Count bits of ``mask`` after OR-merging it in groups of ``width``.

    ``mask`` is zero-padded at the end up to the next multiple of ``width``,
    split into rows of ``width`` entries, and every row is collapsed with a
    bitwise OR. ``SpconvOps.count_bits`` is applied to the collapsed rows and
    the summed result is multiplied by ``width``.

    Args:
        mask: integer array; indexed and padded along its first axis.
        width: number of consecutive entries merged into one group.

    Returns:
        ``width`` times the sum of ``SpconvOps.count_bits`` over the merged
        rows (presumably a population count per row -- confirm against the
        SpconvOps binding).
    """
    num_entries = mask.shape[0]
    padded_len = div_up(num_entries, width) * width
    if num_entries < padded_len:
        # Zero padding cannot add set bits to the trailing partial group.
        padded = np.zeros((padded_len, ), dtype=mask.dtype)
        padded[:num_entries] = mask
        mask = padded
    merged_rows = np.bitwise_or.reduce(mask.reshape(-1, width), axis=1)
    per_row_bits = SpconvOps.count_bits(tv.from_numpy(merged_rows)).numpy()
    return per_row_bits.sum() * width

def reduce_mask_count_x(mask: np.ndarray, width: int):
    """OR-merge ``mask`` in groups of ``width`` and return the merged rows.

    ``mask`` is zero-padded at the end up to the next multiple of ``width``
    and reshaped to ``(-1, width)``; each row is then collapsed with a
    bitwise OR.

    Args:
        mask: integer array; indexed and padded along its first axis.
        width: number of consecutive entries merged into one group.

    Returns:
        Array holding one OR-reduced value per group of ``width`` entries.
    """
    num_entries = mask.shape[0]
    padded_len = div_up(num_entries, width) * width
    if num_entries < padded_len:
        # Zero padding leaves the OR of the trailing partial group unchanged.
        padded = np.zeros((padded_len, ), dtype=mask.dtype)
        padded[:num_entries] = mask
        mask = padded
    grouped = mask.reshape(-1, width)
    return np.bitwise_or.reduce(grouped, axis=1)

def dev_subm_inds_v2(subm: bool = True,
                     run_conv: bool = True,
                     *,
                     bench_mask_only: bool = True):
    """Development driver for implicit-gemm indice pairs on the test fixture.

    Loads ``test/data/test_spconv.pkl``, builds reference indice pairs with
    ``ConvAlgo.Native`` and mask-based pairs with
    ``ops.get_indice_pairs_implicit_gemm``, then either benchmarks the mask
    reduction or compares the tuned SIMT kernels against a NumPy reference.

    Args:
        subm: use the 3x3x3 / padding 1 configuration and pass ``subm=True``
            to the pair generators; otherwise use 2x2x2 / padding 0.
        run_conv: when the function continues past the mask benchmark, run
            every SIMT kernel description in ``CONV.desps`` and print the L2
            error against the NumPy reference.
        bench_mask_only: when True (default, the historical behaviour) call
            ``bench_reduce_mask`` on the first forward mask split and return.
            Pass False to skip the benchmark and run the mask statistics and
            the optional convolution check instead.

    Returns:
        None. All results are printed.
    """
    # ``None`` keeps every voxel; set an int (e.g. 16384) to truncate the input.
    limit_input_n = None
    np.random.seed(484)

    # NOTE(review): pickle.load executes arbitrary code from the file; this is
    # only acceptable because the path is expected to be a trusted fixture.
    with (PACKAGE_ROOT.parent / "test/data/test_spconv.pkl").open("rb") as f:
        voxels_np, indices_np, spatial_shape = pickle.load(f)
    voxels_np = voxels_np[:limit_input_n]
    indices_np = indices_np[:limit_input_n]

    voxels = tv.from_numpy(voxels_np).cuda()
    indices = tv.from_numpy(indices_np).cuda()
    indices_th = torch.from_numpy(indices_np).cuda()
    print(spatial_shape, indices_np.shape)

    ndim = 3
    if subm:
        ksize = [3, 3, 3]
        padding = [1] * ndim
    else:
        ksize = [2, 2, 2]
        padding = [0] * ndim
    kv = np.prod(ksize)
    stride = [1] * ndim
    dilation = [1] * ndim
    out_padding = [0] * ndim

    # Reference pairs from the native algorithm, used by the NumPy check.
    out_inds, pair_ref, indice_num_per_loc = ops.get_indice_pairs(
        indices_th, 1, spatial_shape, ConvAlgo.Native, ksize, stride, padding,
        dilation, out_padding, subm)
    indice_num_per_loc_np = indice_num_per_loc.cpu().numpy()
    indice_pairs_np = pair_ref.cpu().numpy()

    algo = ConvAlgo.MaskImplicitGemm
    num_split = 1 if algo == ConvAlgo.MaskImplicitGemm else 2
    # The call is repeated and only the last result is kept (presumably a
    # warm-up for timing -- confirm before changing the count).
    for _ in range(5):
        res = ops.get_indice_pairs_implicit_gemm(indices_th, 1, spatial_shape,
                                                 algo, ksize, stride, padding,
                                                 dilation, out_padding, subm)
    out_inds = res[0]
    pair_fwd = res[2]
    pair_fwd_x = pair_fwd.cpu().numpy().reshape(-1)
    pair_fwd_x[pair_fwd_x == -1] = 0
    loc_num_np = (pair_fwd_x > 0).reshape(kv, -1).sum(1)
    print(loc_num_np)
    print(indice_num_per_loc_np)

    pair_bwd = res[3]
    pair_mask_fwd_splits = res[4]
    pair_mask_bwd_splits = res[5]

    if bench_mask_only:
        mask_np = torch_tensor_to_tv(pair_mask_fwd_splits[0],
                                     dtype=tv.uint32).cpu().numpy()
        bench_reduce_mask(mask_np)
        return

    mask_argsort_fwd_splits = res[6]
    mask_argsort_bwd_splits = res[7]
    masks = res[8]
    pair_mask_fwd_splits_tv = [
        ops.torch_tensor_to_tv(t, dtype=tv.uint32)
        for t in pair_mask_fwd_splits
    ]
    valid_location_bitcount = [
        SpconvOps.count_bits(t) for t in pair_mask_fwd_splits_tv
    ]
    valid_location_count = sum(
        [t.cpu().numpy().sum() for t in valid_location_bitcount])
    reduce_length = 32
    split_mask_valid_count = sum([
        reduce_mask_count(t.cpu().numpy(), reduce_length)
        for t in pair_mask_fwd_splits_tv
    ])
    print("SUBM" if subm else "REGULAR", valid_location_count,
          split_mask_valid_count, pair_fwd.numel())

    if not run_conv:
        return

    def _num_active_pairs(filter_offset):
        # In subm mode the native pair counts are read mirrored for offsets
        # past the kernel centre, and the centre itself uses every voxel.
        if subm and filter_offset > kv // 2:
            return indice_num_per_loc_np[kv - 1 - filter_offset]
        if subm and filter_offset == kv // 2:
            return voxels.shape[0]
        return indice_num_per_loc_np[filter_offset]

    C = 64
    K = 64
    mask_output_fwd = torch.zeros([2, div_up(out_inds.shape[0], 32)],
                                  dtype=torch.int32,
                                  device=indices_th.device)
    mask_output_bwd = torch.zeros([2, div_up(indices.dim(0), 32)],
                                  dtype=torch.int32,
                                  device=indices_th.device)

    for desp in CONV.desps:
        if desp.algo != GemmAlgo.Simt.value:
            continue
        is_forward = desp.op_type == ConvOpType.kForward.value
        is_bwd_input = desp.op_type == ConvOpType.kBackwardInput.value
        is_bwd_weight = desp.op_type == ConvOpType.kBackwardWeight.value

        # Draw order (inp, weight, output) is kept so seeded runs reproduce.
        if desp.dtype_a == dtypes.int8.tv_dtype:
            inp = np.random.randint(-1, 1, size=[voxels_np.shape[0],
                                                 C]).astype(np.int8)
            weight = np.random.randint(-1, 1, size=[K, *ksize,
                                                    C]).astype(np.int8)
            output = np.random.randint(-1, 1, size=[
                out_inds.shape[0], K
            ]).astype(dtypes.get_npdtype_from_tvdtype(desp.dtype_output))
        else:
            inp = np.random.uniform(-1, 1, size=[
                voxels_np.shape[0], C
            ]).astype(dtypes.get_npdtype_from_tvdtype(desp.dtype_input))
            weight = np.random.uniform(-1, 1, size=[K, *ksize, C]).astype(
                dtypes.get_npdtype_from_tvdtype(desp.dtype_weight))
            output = np.random.uniform(-1, 1, size=[
                out_inds.shape[0], K
            ]).astype(dtypes.get_npdtype_from_tvdtype(desp.dtype_output))

        # [K, *ksize, C] -> [KV, K, C] so the reference can index by offset.
        weight_ref = weight.transpose(1, 2, 3, 0, 4)
        weight_ref = np.ascontiguousarray(weight_ref).reshape(-1, K, C)

        # The tensor the kernel writes starts zeroed; the others get the data.
        if is_bwd_input:
            inp_tv = tv.zeros(inp.shape, desp.dtype_input, 0)
        else:
            inp_tv = tv.from_numpy(inp).cuda()
        if is_bwd_weight:
            weight_tv = tv.zeros(weight.shape, desp.dtype_weight, 0)
        else:
            weight_tv = tv.from_numpy(weight).cuda()
        if is_forward:
            output_tv = tv.zeros(output.shape, desp.dtype_output, 0)
        else:
            output_tv = tv.from_numpy(output).cuda()
        torch.cuda.synchronize()

        # TODO support splitk parallel
        spk = 32 if is_bwd_weight else 1

        # Forward-direction tensors are the default; only backward-input
        # deviates, and differently for subm and regular convolution.
        indice_pairs = pair_bwd if is_bwd_input else pair_fwd
        mask_ops = pair_mask_fwd_splits
        mask_argsorts = mask_argsort_fwd_splits
        mask_output = mask_output_fwd
        reverse_mask = False
        if is_bwd_input:
            if subm:
                reverse_mask = True
            else:
                mask_ops = pair_mask_bwd_splits
                mask_argsorts = mask_argsort_bwd_splits
                mask_output = mask_output_bwd
                print([bin(x.item()) for x in masks])

        # Both conv kinds iterate over ``num_split``: the regular path used a
        # hard-coded 2, which does not match num_split == 1 for
        # MaskImplicitGemm (assumes one mask/argsort entry per split).
        for j in range(num_split):
            beta = 1 if j == 1 else 0
            mask_filter = masks[j].item()
            mask_op = mask_output[j] if is_bwd_weight else mask_ops[j]
            CONV.run_with_tuned_result(
                BestConvAlgoByProfile(desp, spk),
                desp.op_type,
                inp_tv,
                weight_tv,
                output_tv,
                torch_tensor_to_tv(mask_op, dtype=tv.uint32),
                torch_tensor_to_tv(mask_argsorts[j]),
                torch_tensor_to_tv(mask_output[j], dtype=tv.uint32),
                torch_tensor_to_tv(indice_pairs),
                reverse_mask,
                mask_filter=mask_filter,
                mask_width=32,
                beta=beta,
                verbose=True,
            )
        torch.cuda.synchronize()

        if is_forward:
            output_ref = np.zeros_like(output, dtype=np.float32)
            for filter_offset in range(kv):
                nhot = _num_active_pairs(filter_offset)
                a_inds = indice_pairs_np[0][filter_offset][:nhot]
                c_inds = indice_pairs_np[1][filter_offset][:nhot]
                # NC @ CK
                cc = inp[a_inds].astype(
                    np.float32) @ weight_ref[filter_offset].T.astype(
                        np.float32)
                output_ref[c_inds] += cc
            output_cpu = output_tv.cpu().numpy().astype(np.float32)
            print(
                "ERROR",
                np.linalg.norm(
                    output_ref.reshape(-1) - output_cpu.reshape(-1)))
        elif is_bwd_input:
            dinput_ref = np.zeros_like(inp, dtype=np.float32)
            for filter_offset in range(kv):
                nhot = _num_active_pairs(filter_offset)
                a_inds = indice_pairs_np[1][filter_offset][:nhot]
                c_inds = indice_pairs_np[0][filter_offset][:nhot]
                # NK @ KC
                cc = output[a_inds].astype(
                    np.float32) @ weight_ref[filter_offset].astype(
                        np.float32)
                dinput_ref[c_inds] += cc
            din_cpu = inp_tv.cpu().numpy()
            print(
                "ERROR",
                np.linalg.norm(
                    din_cpu.reshape(-1) - dinput_ref.reshape(-1)))
        else:
            dw_ref = np.zeros_like(weight_ref,
                                   dtype=np.float32)  # KV, K, C
            for filter_offset in range(kv):
                nhot = _num_active_pairs(filter_offset)
                o_inds = indice_pairs_np[1][filter_offset][:nhot]
                i_inds = indice_pairs_np[0][filter_offset][:nhot]
                out_gather = output[o_inds]  # [N, K]
                inp_gather = inp[i_inds]  # [N, C]
                # KN @ NC
                dw_ref[filter_offset] = out_gather.astype(
                    np.float32).T @ inp_gather.astype(np.float32)
            # [KV, K, C] -> [K, KV, C] to match the kernel's weight layout.
            dw_ref_kcrs = dw_ref.transpose(1, 0, 2)
            dw_cpu = weight_tv.cpu().numpy().reshape(K, np.prod(ksize), C)
            print(
                "ERROR",
                np.linalg.norm(
                    dw_cpu.reshape(-1) - dw_ref_kcrs.reshape(-1)))
def reverse_bits(a: np.ndarray):
    """Reverse the bit order inside every byte of ``a``.

    Each byte is expanded to bits least-significant-first and re-packed
    most-significant-first, so bit 0 becomes bit 7 and so on.

    Args:
        a: ``uint8`` array (``np.unpackbits`` accepts no other dtype).

    Returns:
        Flat ``uint8`` array with one bit-reversed byte per input element.
    """
    return np.packbits(np.unpackbits(a, bitorder="little"), bitorder="big")

def _count_mask_reduce(masks: np.ndarray):
    """Print the raw bit count of ``masks`` next to its 64-wide reduced count.

    Prints three values: the sum of ``SpconvOps.count_bits`` over ``masks``,
    the result of ``reduce_mask_count(masks, 64)``, and the ratio of the
    second to the first.
    """
    # The count tensor stays bound to a local while its numpy view is read.
    bit_counts_tv = SpconvOps.count_bits(tv.from_numpy(masks))
    total_bits = bit_counts_tv.numpy_view().sum()

    grouped_count = reduce_mask_count(masks, 64)
    print(total_bits, grouped_count, grouped_count / total_bits)


def bench_reduce_mask(masks: np.ndarray, width: int = 27):
    """Print mask-reduction statistics for several orderings of ``masks``.

    Four variants are passed to ``_count_mask_reduce``, each sorted first:
    the masks as given, the masks restricted to their low ``width // 2 + 1``
    bits, the masks concatenated with their bit-reversed copies, and that
    concatenation restricted to the same low bits.

    Args:
        masks: ``uint32`` mask array; it is not modified.
        width: number of meaningful bits per mask (default 27, presumably
            the 3x3x3 kernel volume -- confirm with the caller).
    """
    # Keeps the low ``width // 2 + 1`` bits of every mask.
    width_half_mask = np.array(0xffffffff, dtype=np.uint32) >> (32 - width // 2 - 1)
    print(bin(width_half_mask))

    masks_sort = masks.copy()
    masks_sort.sort()
    _count_mask_reduce(masks_sort)

    # ``&`` already allocates a new array, so no explicit copy is needed.
    masks_sort = masks & width_half_mask
    masks_sort.sort()
    _count_mask_reduce(masks_sort)

    reversed_masks = SpconvOps.reverse_bits(tv.from_numpy(masks)).numpy()
    new_masks = np.concatenate([masks, reversed_masks])

    # The shuffle is kept although the sort follows immediately: it advances
    # the global NumPy RNG state that later code may rely on.
    np.random.shuffle(new_masks)
    new_masks.sort()
    _count_mask_reduce(new_masks)
    new_masks &= width_half_mask
    new_masks.sort()
    _count_mask_reduce(new_masks)




# Script entry point: run the dev routine with its default arguments.
if __name__ == "__main__":
    dev_subm_inds_v2()