#include "parallel_linear.cuh"

#include <cstdio>
#include <torch/extension.h>

#include "utils/fmoe_utils.h"
/*
 * Forward pass of the per-expert (MoE) linear layer.
 *
 * input_buf:    (batch_size, in_feat) activations, with rows already grouped
 *               so that each expert's rows are contiguous.
 * expert_count: number of rows assigned to each expert (dtype long,
 *               length num_expert); read on-device by the kernel.
 * weight:       (num_expert, out_feat, in_feat) expert weight matrices.
 * bias:         optional per-expert bias; repeat_interleave over dim 0 with
 *               expert_count implies shape (num_expert, out_feat).
 *
 * Returns {output} where output has shape (batch_size, out_feat).
 */
std::vector<torch::Tensor> _linear_forward(
        torch::Tensor input_buf,
        torch::Tensor expert_count,
        torch::Tensor weight,
        at::optional<torch::Tensor> bias
        ) {
    auto smgr = getCudaStreamManager(input_buf.device().index());
    const auto batch_size = input_buf.size(0);
    const auto num_expert = weight.size(0);
    const auto out_feat = weight.size(1);
    const auto in_feat = weight.size(2);

#ifdef MOE_DEBUG
    // Tensor::size() returns int64_t; "%ld" is UB for int64_t on LLP64
    // platforms (64-bit Windows), so print via %lld with explicit casts.
    printf("[forward] expert=%lld, in_feat (d_model)=%lld, out_feat (d_ffn)=%lld\n",
            (long long)num_expert, (long long)in_feat, (long long)out_feat);
#endif

    torch::Tensor output;

    if (bias.has_value()) {
        // Pre-fill the output with each expert's bias row replicated across
        // the rows routed to that expert; the kernel is passed has_bias=true,
        // presumably so it accumulates on top of this buffer — TODO confirm
        // against fmoe_cuda_linear_forward_impl.
        output = bias.value().repeat_interleave(expert_count.to(bias.value().device()), 0);
    } else {
        // No bias: allocate uninitialized; the kernel is passed has_bias=false
        // and is expected to fully overwrite the buffer.
        auto out_options = torch::TensorOptions()
            .device(input_buf.device())
            .dtype(input_buf.dtype());
        output = torch::empty({batch_size, out_feat}, out_options);
    }

    AT_DISPATCH_FLOATING_TYPES_AND_HALF(input_buf.scalar_type(), "moe_forward_cuda",
            ([&] {
        fmoe_cuda_linear_forward_impl<scalar_t>(
            input_buf.data_ptr<scalar_t>(),
            weight.data_ptr<scalar_t>(),
            expert_count.data_ptr<long>(),
            output.data_ptr<scalar_t>(),
            bias.has_value(),
            in_feat,
            out_feat,
            num_expert,
            smgr
        );
    }));

    return {output, };
}

/*
 * Backward pass of the per-expert (MoE) linear layer.
 *
 * grad_output_buf: (batch_size, out_feat) gradient w.r.t. the forward output.
 * input_buf:       (batch_size, in_feat) forward-pass input, grouped by expert.
 * expert_count:    per-expert row counts (dtype long), read on-device.
 * weight:          (num_expert, out_feat, in_feat) expert weight matrices.
 * bias:            optional bias; only bias.has_value() is forwarded to the
 *                  kernel to select the bias-gradient path.
 *
 * Returns {grad_input (batch_size, in_feat),
 *          grad_weight (num_expert, out_feat, in_feat),
 *          grad_bias (num_expert, out_feat)}.
 *
 * NOTE(review): grad_bias is allocated (uninitialized) and returned even when
 * bias is absent; presumably the caller discards it in that case — verify.
 */
std::vector<torch::Tensor> _linear_backward(
    torch::Tensor grad_output_buf,
    torch::Tensor input_buf,
    torch::Tensor expert_count,
    torch::Tensor weight,
    at::optional<torch::Tensor> bias
) {
    auto smgr = getCudaStreamManager(input_buf.device().index());
    const auto batch_size = input_buf.size(0);
    const auto num_expert = weight.size(0);
    const auto out_feat = weight.size(1);
    const auto in_feat = weight.size(2);

#ifdef MOE_DEBUG
    // Tensor::size() returns int64_t; "%ld" is UB for int64_t on LLP64
    // platforms (64-bit Windows), so print via %lld with explicit casts.
    printf("[backward] b=%lld, expert=%lld, in_feat (d_model)=%lld, "
            "out_feat (d_ffn)=%lld\n",
            (long long)batch_size, (long long)num_expert, (long long)in_feat,
            (long long)out_feat);
#endif

    // new_empty inherits device and dtype from grad_output_buf; contents are
    // uninitialized and must be fully written by the kernel.
    auto grad_input_buf = grad_output_buf.new_empty({batch_size, in_feat});
    auto grad_weight = grad_output_buf.new_empty({num_expert, out_feat, in_feat});
    auto grad_bias = grad_output_buf.new_empty({num_expert, out_feat});

    AT_DISPATCH_FLOATING_TYPES_AND_HALF(input_buf.scalar_type(), "moe_cuda_backward", ([&] {
        fmoe_cuda_linear_backward_impl<scalar_t>(
            grad_output_buf.data_ptr<scalar_t>(),
            input_buf.data_ptr<scalar_t>(),
            weight.data_ptr<scalar_t>(),
            expert_count.data_ptr<long>(),
            grad_input_buf.data_ptr<scalar_t>(),
            grad_weight.data_ptr<scalar_t>(),
            grad_bias.data_ptr<scalar_t>(),
            bias.has_value(),
            batch_size,
            in_feat,
            out_feat,
            num_expert,
            smgr
        );
    }));

    return {grad_input_buf, grad_weight, grad_bias};
}