Commit 5f5ccd47 authored by Jiezhong Qiu

init FMoELinear param using mp_rank and numpy rng

parent da11cb76
@@ -17,11 +17,12 @@ class FMoELinear(nn.Module):
     performed in parallel to increase the performance.
     The FMoELinear module provides such function.
     '''
-    def __init__(self, num_expert=32, in_feat=1024, out_feat=1024):
+    def __init__(self, num_expert=32, in_feat=1024, out_feat=1024, rank=0):
         super().__init__()
         self.num_expert = num_expert
         self.in_feat = in_feat
         self.out_feat = out_feat
+        self.rank = rank
         self.weight = nn.Parameter(torch.Tensor(num_expert, out_feat, in_feat))
         self.reset_parameters()
@@ -29,10 +30,14 @@ class FMoELinear(nn.Module):
         r'''
         Initialize the weight as linear layers
         '''
+        rng = np.random.default_rng(np.random.randint(2048) + self.rank)
+        fan = nn.init._calculate_correct_fan(self.weight[0], 'fan_in')
+        gain = nn.init.calculate_gain('leaky_relu', math.sqrt(5))
+        std = gain / math.sqrt(fan)
+        bound = math.sqrt(3.0) * std  # calculate uniform bounds from standard deviation
         for i in range(self.num_expert):
-            linear = nn.Linear(in_features=self.in_feat,
-                               out_features=self.out_feat)
-            self.weight.data[i] = linear.weight.data
+            weight = rng.uniform(-bound, bound, size=tuple(self.weight[i].size()))
+            self.weight.data[i] = torch.from_numpy(weight)

     def forward(self, inp, fwd_expert_count):
         r'''
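The new `reset_parameters` reproduces the bound that `nn.Linear`'s default initialization uses (`kaiming_uniform_` with `a=math.sqrt(5)`, which works out to `1/sqrt(fan_in)`), but draws the samples from a NumPy generator whose seed is offset by the rank, so each model-parallel rank gets distinct expert weights. Note the seed combines a draw from NumPy's global RNG with the rank offset. A minimal standalone sketch of the same scheme (the sizes and base seed here are illustrative, not from the commit):

```python
import math

import numpy as np
import torch
from torch import nn

# Illustrative sketch: compute the bound used by nn.Linear's default
# init, kaiming_uniform_(w, a=math.sqrt(5)), then sample from a NumPy
# RNG seeded per rank.  Shapes and the base seed (1234) are made up.
num_expert, out_feat, in_feat, rank = 4, 64, 64, 0

weight = torch.empty(num_expert, out_feat, in_feat)
fan = nn.init._calculate_correct_fan(weight[0], 'fan_in')   # fan_in == in_feat
gain = nn.init.calculate_gain('leaky_relu', math.sqrt(5))   # gain for a=sqrt(5)
bound = math.sqrt(3.0) * gain / math.sqrt(fan)              # == 1/sqrt(fan_in)

rng = np.random.default_rng(1234 + rank)                    # rank-distinct stream
for i in range(num_expert):
    # from_numpy yields float64; the indexed assignment copies and
    # converts into the float32 parameter storage.
    weight[i] = torch.from_numpy(
        rng.uniform(-bound, bound, size=tuple(weight[i].shape)))
```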
@@ -12,10 +12,10 @@ class _Expert(nn.Module):
     An expert using 2 FMoELinear modules to speed up the computation of experts
     within one worker.
     '''
-    def __init__(self, num_expert, d_model, d_hidden, activation):
+    def __init__(self, num_expert, d_model, d_hidden, activation, rank=0):
         super().__init__()
-        self.htoh4 = FMoELinear(num_expert, d_model, d_hidden)
-        self.h4toh = FMoELinear(num_expert, d_hidden, d_model)
+        self.htoh4 = FMoELinear(num_expert, d_model, d_hidden, rank)
+        self.h4toh = FMoELinear(num_expert, d_hidden, d_model, rank)
         self.activation = activation

     def forward(self, inp, fwd_expert_count):
@@ -52,7 +52,7 @@ class FMoETransformerMLP(FMoE):
         super().__init__(num_expert=num_expert, d_model=d_model, gate=gate,
                          top_k=top_k, world_size=world_size, mp_group=mp_group,
                          expert_fn=expert_fn)
-        self.experts = _Expert(num_expert, d_model, d_hidden, activation)
+        self.experts = _Expert(num_expert, d_model, d_hidden, activation, self.mp_rank)
         self.pre_lnorm = pre_lnorm
         self.layer_norm = nn.LayerNorm(d_model)
         self.mark_parallel_comm()
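The last hunk threads `self.mp_rank`, presumably set on the `FMoE` base class from the model-parallel group, down into the experts. A hedged sketch of how such a rank could be derived from a `torch.distributed` process group; `get_mp_rank` is a hypothetical helper, not part of this commit:

```python
import torch.distributed as dist

# Hypothetical helper: derive a model-parallel rank from a process
# group (None meaning the default group), falling back to 0 when
# torch.distributed is not initialized, e.g. in single-GPU runs.
def get_mp_rank(mp_group=None):
    if not dist.is_available() or not dist.is_initialized():
        return 0
    return dist.get_rank(group=mp_group)
```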