forked from PaddlePaddle/PaddleRec
-
Notifications
You must be signed in to change notification settings - Fork 0
/
net.py
109 lines (98 loc) · 4.49 KB
/
net.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
class MMoELayer(nn.Layer):
    """MMoE-style multi-task network: shared experts, per-task gates and towers.

    The layer owns ``expert_num`` expert projections that all read the same
    input. For each of the ``gate_num`` tasks it owns a gate (producing one
    softmax weight per expert), a tower hidden layer and a 2-unit output layer.

    All weights use constant initialisation: the i-th expert (and the i-th
    task's gate, tower and output layer) start at ``10 ** -(i + 1)``; every
    bias starts at ``0.1``.

    Args:
        feature_size: Width of the input features fed to experts and gates.
        expert_num: Number of expert layers.
        expert_size: Output width of each expert.
        tower_size: Hidden width of each task tower.
        gate_num: Number of gates, i.e. number of task outputs.
    """

    def __init__(self, feature_size, expert_num, expert_size, tower_size,
                 gate_num):
        super(MMoELayer, self).__init__()

        self.expert_num = expert_num
        self.expert_size = expert_size
        self.tower_size = tower_size
        self.gate_num = gate_num

        # Experts are created first so that sublayer registration order is
        # expert_0..expert_{n-1}, followed by (gate_i, tower_i, tower_out_i)
        # for each task i.
        self._param_expert = [
            self._add_constant_linear('expert_' + str(idx), feature_size,
                                      expert_size, 10 ** -(idx + 1))
            for idx in range(self.expert_num)
        ]

        self._param_gate = []
        self._param_tower = []
        self._param_tower_out = []
        for idx in range(self.gate_num):
            # Gate, tower and output layer of one task share the same
            # constant initial weight value.
            task_init = 10 ** -(idx + 1)
            self._param_gate.append(
                self._add_constant_linear('gate_' + str(idx), feature_size,
                                          expert_num, task_init))
            self._param_tower.append(
                self._add_constant_linear('tower_' + str(idx), expert_size,
                                          tower_size, task_init))
            self._param_tower_out.append(
                self._add_constant_linear('tower_out_' + str(idx), tower_size,
                                          2, task_init))

    def _add_constant_linear(self, layer_name, in_features, out_features,
                             weight_value):
        """Build a constant-initialised ``nn.Linear`` and register it.

        Args:
            layer_name: Name used both for the sublayer registration and as
                the ``name`` argument of the ``nn.Linear``.
            in_features: Input width of the linear layer.
            out_features: Output width of the linear layer.
            weight_value: Constant every weight entry is initialised to.

        Returns:
            The value returned by ``add_sublayer`` for the new layer.
        """
        layer = nn.Linear(
            in_features,
            out_features,
            weight_attr=nn.initializer.Constant(value=weight_value),
            bias_attr=nn.initializer.Constant(value=0.1),
            name=layer_name)
        return self.add_sublayer(name=layer_name, sublayer=layer)

    def forward(self, input_data):
        """Run every expert, mix them per gate and apply each task tower.

        Args:
            input_data: Input tensor passed to every expert and gate.
                Presumably shaped ``(batch, feature_size)`` -- TODO confirm
                against the caller.

        Returns:
            A list with one tensor per gate. Each tensor is the softmax of the
            2-unit output layer, clipped to ``[1e-15, 1 - 1e-15]``.
        """
        expert_acts = [
            F.relu(expert(input_data)) for expert in self._param_expert
        ]
        # Concatenate along axis 1, then regroup so that axis 1 indexes the
        # expert and axis 2 the expert's output units.
        experts = paddle.reshape(
            paddle.concat(x=expert_acts, axis=1),
            [-1, self.expert_num, self.expert_size])

        task_outputs = []
        for gate, tower, head in zip(self._param_gate, self._param_tower,
                                     self._param_tower_out):
            # One softmax weight per expert, with a trailing unit axis so it
            # broadcasts over the expert output units.
            weights = paddle.reshape(
                F.softmax(gate(input_data)), [-1, self.expert_num, 1])
            # Weighted sum over the expert axis.
            mixed = paddle.sum(
                x=paddle.multiply(x=experts, y=weights), axis=1)

            hidden = F.relu(tower(mixed))
            probs = F.softmax(head(hidden))
            # Clip the probabilities to the range [1e-15, 1 - 1e-15].
            task_outputs.append(
                paddle.clip(probs, min=1e-15, max=1.0 - 1e-15))
        return task_outputs