Commit 510ac924 authored by Jiezhong Qiu
Browse files

update

parent d9ca437a
......@@ -100,7 +100,7 @@ def my_topk(x, k, inplace=True):
return top_val, top_idx
class HierarchicalMoEPositionwiseFF(nn.Module):
def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, n_block=32, top_block=1):
def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, n_block=16, top_block=2):
super(HierarchicalMoEPositionwiseFF, self).__init__()
print("HierarchicalMoEPositionwiseFF")
......@@ -115,7 +115,7 @@ class HierarchicalMoEPositionwiseFF(nn.Module):
self.d_inner = d_inner
self.dropout = dropout
self.block_net = nn.Linear(d_model, n_block, bias=False)
self.block_net = nn.Linear(d_model, n_block, bias=True)
self.W1 = nn.Parameter(torch.Tensor(n_block, d_block, d_model))
self.b1 = nn.Parameter(torch.Tensor(n_block, d_block))
......@@ -131,7 +131,7 @@ class HierarchicalMoEPositionwiseFF(nn.Module):
self.dropout_middle = nn.Dropout(dropout * ratio)
self.dropout_final = nn.Dropout(dropout)
self.scale = 1 / (d_model ** 0.5)
# self.scale = 1 / (d_model ** 0.5)
self.reset_parameter()
def reset_parameter(self):
......@@ -149,8 +149,8 @@ class HierarchicalMoEPositionwiseFF(nn.Module):
block = self.block_net(inp)
block_val, block_idx = my_topk(block, k=self.top_block)
# block_val, block_idx = torch.topk(block, k=self.top_block, dim=-1, largest=True, sorted=False) # [.. x top_k]
# block_val, block_idx = my_topk(block, k=self.top_block)
block_val, block_idx = torch.topk(block, k=self.top_block, dim=-1, largest=True, sorted=False) # [.. x top_k]
gate = F.softmax(block_val, dim=-1)
......@@ -158,7 +158,7 @@ class HierarchicalMoEPositionwiseFF(nn.Module):
b1_block = self.b1[block_idx] # [.. x top_k x d_block]
x = torch.einsum('ibd,ibnhd->ibnh', (inp, W1_block)) + b1_block # [.. x top_k x d_block]
x = x + block_val.unsqueeze(-1) # somehow like residual
# x = x + block_val.unsqueeze(-1) # somehow like residual
x = x * gate.unsqueeze(-1)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment