Editor's note: this article uses visual diagrams together with source-code analysis to help readers quickly build a deep understanding of how the Transformer works.
It originally appeared on the WeChat public account 猫蛋儿的小窝 (author: 吸金小凡) and is reposted here as an editor's recommendation by 火龙果软件.
|
1. Introduction to the Transformer
1.1 Seq2seq Model
Simply put, the Transformer can be viewed as a seq2seq model with self-attention. Think of the whole Transformer as a translator: feed in a French sentence, and the model outputs its English translation.

1.2 Overall Transformer Architecture
As the figure above shows, the Transformer consists of two main parts: an encoder and a decoder. In the Transformer paper, the encoder and the decoder are each a stack of 6 layers, and each of these layers is usually referred to as an encoder block (or decoder block).

1.3 Transformer Framework Source Code
We already know that the Transformer is made up of an encoder and a decoder, so building it in code is very straightforward. The key code is as follows:
import numpy as np
import torch
import torch.nn as nn

class Transformer(nn.Module):
    def __init__(self):
        super(Transformer, self).__init__()
        # The two building blocks of the Transformer
        self.encoder = Encoder()
        self.decoder = Decoder()
        # Projects the decoder output onto the target vocabulary
        self.projection = nn.Linear(d_model, tgt_vocab_size, bias=False)
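The article lists only the constructor. Purely for orientation, here is a minimal sketch of what the missing forward pass could look like, assuming an Encoder that returns its hidden states and a Decoder that consumes the encoder output together with both input sequences (an illustrative outline, not the original author's code):

    # (inside class Transformer) -- illustrative sketch only
    def forward(self, enc_inputs, dec_inputs):
        # enc_inputs: [batch_size, src_len], dec_inputs: [batch_size, tgt_len]
        enc_outputs = self.encoder(enc_inputs)                           # encode the source sentence
        dec_outputs = self.decoder(dec_inputs, enc_inputs, enc_outputs)  # decode while attending to the encoder
        dec_logits = self.projection(dec_outputs)                        # [batch_size, tgt_len, tgt_vocab_size]
        return dec_logits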
2. The Encoder

The overall construction code for the Encoder is as follows:
class Encoder(nn.Module):
    def __init__(self):
        super(Encoder, self).__init__()
        # Token embedding
        self.src_emb = nn.Embedding(src_vocab_size, d_model)
        # Positional embedding: a fixed sinusoid table, not updated during training
        self.pos_emb = nn.Embedding.from_pretrained(
            get_sinusoid_encoding_table(src_len + 1, d_model), freeze=True)
        # Core of the encoder: a stack of n_layers encoder blocks
        self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])
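Again only the constructor is shown. A minimal sketch of a matching forward pass, assuming positions are numbered from 1 and index 0 of the sinusoid table is left for padding (illustrative, not the article's code):

    # (inside class Encoder) -- illustrative sketch only
    def forward(self, enc_inputs):                                        # enc_inputs: [batch_size, src_len]
        positions = torch.arange(1, enc_inputs.size(1) + 1,
                                 device=enc_inputs.device).unsqueeze(0)   # positions 1..src_len
        enc_outputs = self.src_emb(enc_inputs) + self.pos_emb(positions)  # token + positional embeddings
        enc_self_attn_mask = get_attn_pad_mask(enc_inputs, enc_inputs)    # hide padding tokens (Section 3.2)
        for layer in self.layers:
            enc_outputs, _ = layer(enc_outputs, enc_self_attn_mask)       # run the stacked encoder blocks
        return enc_outputs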
EncoderLayer() builds each individual Encoder block:
class EncoderLayer(nn.Module):
    def __init__(self):
        super(EncoderLayer, self).__init__()
        self.enc_self_attn = MultiHeadAttention()  # multi-head self-attention sub-layer
        self.pos_ffn = PoswiseFeedForwardNet()     # position-wise feed-forward sub-layer
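For completeness, here is a sketch of how these two sub-layers would be chained in the block's forward pass (illustrative only; the residual connection and layer normalisation live inside the sub-layers themselves, as shown later):

    # (inside class EncoderLayer) -- illustrative sketch only
    def forward(self, enc_inputs, enc_self_attn_mask):
        # Self-attention: Q, K and V all come from the same encoder input
        enc_outputs, attn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask)
        enc_outputs = self.pos_ffn(enc_outputs)    # position-wise feed-forward (Section 2.2)
        return enc_outputs, attn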
Let us now look at how Multi-head Attention and Feed Forward are implemented under the hood.
2.1 Multi-head Attention
Multi-head attention is essentially self-attention computed n times; in other words, it boils down to repeated matrix multiplications. So the first thing to understand is how self-attention itself is computed.
2.1.1 Self-Attention
Self-Attention is, at its core, a series of matrix operations. What does the computation look like? We take the input x1 and its corresponding output b1 as an example, as shown in the figure below.

The figure below matches each step of the computation in the animation above, one by one:
The overall Self-Attention computation can be summarised by the following figure:

We will not go into the details of the self-attention mechanism here; it was covered thoroughly in the previous post of this series. If you need a refresher, spend a couple of minutes on it and then come back: 图解Bert系列之Self-Attention.
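As a concrete anchor for the figures above (and because the MultiHeadAttention code in the next section calls it), here is a sketch of the ScaledDotProductAttention module that the article itself does not list. It computes softmax(Q·K^T / sqrt(d_k))·V, setting masked positions to a large negative score before the softmax; d_k is assumed to be the per-head dimension used elsewhere in this article:

class ScaledDotProductAttention(nn.Module):
    def forward(self, Q, K, V, attn_mask):
        # Q, K: [batch_size x n_heads x seq_len x d_k], V: [batch_size x n_heads x seq_len x d_v]
        scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k)  # [batch x heads x len_q x len_k]
        scores.masked_fill_(attn_mask.bool(), -1e9)  # masked positions get ~0 weight after softmax
        attn = nn.Softmax(dim=-1)(scores)            # attention weights
        context = torch.matmul(attn, V)              # weighted sum of the value vectors
        return context, attn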
2.1.2 Multi-head Attention
Let's look at the code first:
class MultiHeadAttention(nn.Module):
    def __init__(self):
        super(MultiHeadAttention, self).__init__()
        self.W_Q = nn.Linear(d_model, d_k * n_heads)
        self.W_K = nn.Linear(d_model, d_k * n_heads)
        self.W_V = nn.Linear(d_model, d_v * n_heads)
        self.fc = nn.Linear(n_heads * d_v, d_model)
        self.layer_norm = nn.LayerNorm(d_model)

    def forward(self, Q, K, V, attn_mask):
        # Q: [batch_size x len_q x d_model], K: [batch_size x len_k x d_model], V: [batch_size x len_k x d_model]
        residual, batch_size = Q, Q.size(0)
        # (B, S, D) -proj-> (B, S, D) -split-> (B, S, H, W) -trans-> (B, H, S, W)
        q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1, 2)  # [batch_size x n_heads x len_q x d_k]
        k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1, 2)  # [batch_size x n_heads x len_k x d_k]
        v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1, 2)  # [batch_size x n_heads x len_k x d_v]
        # Repeat the mask for every head: [batch_size x n_heads x len_q x len_k]
        attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1)
        # context: [batch_size x n_heads x len_q x d_v], attn: [batch_size x n_heads x len_q x len_k]
        context, attn = ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask)
        # Concatenate the heads: [batch_size x len_q x n_heads * d_v]
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v)
        output = self.fc(context)  # project back to d_model
        # Residual connection + layer normalisation; output: [batch_size x len_q x d_model]
        return self.layer_norm(output + residual), attn
In the illustration, the Transformer simply performs the self-attention matrix multiplications twice, once per head. Once you have understood self-attention, multi-head attention is a piece of cake.

ͨ¹ý¶ÔÒÔÉϼÆËã²½ÖèµÄ·´¸´µü´ú£¬µÃµ½Q¡¢K¡¢VÈý¸ö¾ØÕó¡£

2.1.3 Position Vectors
A model needs two pieces of information to understand a sentence: what each word means, and where each word sits in the sentence.
In self-attention, the output for every word vector is influenced by the whole sentence, and the Q, K, V matrix operations are all computed in parallel, so the order information between words is lost. The Google team therefore added position vectors to the Transformer to solve this problem.
Each word's embedding vector learns the meaning of the word, so we need to feed in some extra information that tells the network where the word sits in the sentence. The familiar sin and cos functions are used to create position-specific constants for exactly this purpose:

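For reference, the position-specific constants are the sinusoids from the original paper, which the PyTorch implementation below reproduces:

PE(pos, 2i)   = sin( pos / 10000^(2i / d_model) )
PE(pos, 2i+1) = cos( pos / 10000^(2i / d_model) )

where pos is the word's position in the sentence and i indexes pairs of embedding dimensions.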
ÆäÖУ¬ÔÚ¸ø´ÊÏòÁ¿Ìí¼ÓλÖñàÂë֮ǰ£¬ÎÒÃÇÒªÀ©´ó´ÊÏòÁ¿µÄÊýÖµ£¬Ä¿µÄÊÇÈÃλÖñàÂëÏà¶Ô½ÏС¡£ÕâÒâζ×ÅÏò´ÊÏòÁ¿Ìí¼ÓλÖñàÂëʱ£¬´ÊÏòÁ¿µÄÔʼº¬Òå²»»á¶ªÊ§¡£
×¢ÊÍ£ºÎ»ÖñàÂë¾ØÕóÊÇÒ»¸ö³£Á¿£¬ËüµÄÖµ¿ÉÒÔÓÃÉÏÃæµÄËãʽ¼ÆËã³öÀ´¡£°Ñ³£Á¿Ç¶Èë¾ØÕó£¬È»ºóÿ¸öǶÈëµÄµ¥´Ê»á¸ù¾ÝËüËù´¦µÄλÖ÷¢ÉúÌØ¶¨×ª±ä¡£

The PyTorch implementation:
def get_sinusoid_encoding_table(n_position, d_model):
    def cal_angle(position, hid_idx):
        return position / np.power(10000, 2 * (hid_idx // 2) / d_model)

    def get_posi_angle_vec(position):
        return [cal_angle(position, hid_j) for hid_j in range(d_model)]

    sinusoid_table = np.array([get_posi_angle_vec(pos_i) for pos_i in range(n_position)])
    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1
    return torch.FloatTensor(sinusoid_table)
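A quick illustrative check of what the function produces and how the Encoder above wraps it (the sizes here are made up; the extra +1 row mirrors the src_len + 1 used in the Encoder, leaving index 0 available for padding positions):

# Hypothetical usage: a table for sentences of up to 10 tokens with d_model = 512
table = get_sinusoid_encoding_table(10 + 1, 512)
print(table.shape)                                          # torch.Size([11, 512])

pos_emb = nn.Embedding.from_pretrained(table, freeze=True)  # constant lookup table, never trained
positions = torch.tensor([[1, 2, 3, 4, 5]])                 # positions of a 5-token sentence
print(pos_emb(positions).shape)                             # torch.Size([1, 5, 512])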
2.2 Feed-Forward Network
We have finally reached the last small piece of the encoder. With the previous parts understood, the network here is very simple: a non-linear transformation through the ReLU activation, followed by normalisation (the LayerNorm in the code), which produces the encoder's final output.
class PoswiseFeedForwardNet(nn.Module):
    def __init__(self):
        super(PoswiseFeedForwardNet, self).__init__()
        # 1x1 convolutions act as position-wise linear layers
        self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)
        self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)
        self.layer_norm = nn.LayerNorm(d_model)

    def forward(self, inputs):
        residual = inputs  # inputs: [batch_size, len_q, d_model]
        # Conv1d expects [batch_size, d_model, len_q], hence the transposes
        output = nn.ReLU()(self.conv1(inputs.transpose(1, 2)))
        output = self.conv2(output).transpose(1, 2)
        # Residual connection + layer normalisation
        return self.layer_norm(output + residual)
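A note on the design choice: a Conv1d with kernel_size=1 slides over the sequence one token at a time, so it is mathematically the same as applying one Linear layer independently at every position; the two convolutions therefore implement the paper's FFN(x) = W2·ReLU(W1·x + b1) + b2. An equivalent Linear-based version, shown only for comparison (not the article's code):

# Equivalent position-wise FFN written with Linear layers instead of 1x1 convolutions
class PoswiseFeedForwardNetLinear(nn.Module):
    def __init__(self):
        super(PoswiseFeedForwardNetLinear, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.layer_norm = nn.LayerNorm(d_model)

    def forward(self, inputs):                    # inputs: [batch_size, len_q, d_model]
        output = self.fc2(torch.relu(self.fc1(inputs)))
        return self.layer_norm(output + inputs)   # residual + layer norm, as above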
3. The Decoder
Let's take another look at the overall structure of the Transformer:

The Decoder differs from the Encoder in that it adds an extra Encoder-Decoder Attention layer, described below.
3.1 Encoder-Decoder Attention
The Decoder contains 6 decoder blocks, and each decoder block has three parts: on top of the two sub-layers that an encoder block contains, there is an extra encoder-decoder attention layer. In other words, besides the decoder's own input, the encoder's output also feeds into the decoder's computation. You can see this in the self.dec_enc_attn line of the DecoderLayer() code below.
Recall from Section 1.2 that the Transformer's Encoder contains 6 encoder blocks, each made up of two parts: a multi-head attention layer and a feed-forward layer. The encoder's input first passes through the multi-head attention layer, which lets the encoder take the whole sentence's context into account for each input word; the output is then fed into the feed-forward network, and the blocks are applied one after another.

The overall construction code for the Decoder is as follows:
class Decoder(nn.Module):
    def __init__(self):
        super(Decoder, self).__init__()
        # Target-side token embedding
        self.tgt_emb = nn.Embedding(tgt_vocab_size, d_model)
        # Fixed sinusoidal positional embedding
        self.pos_emb = nn.Embedding.from_pretrained(
            get_sinusoid_encoding_table(tgt_len + 1, d_model), freeze=True)
        # Stack of n_layers decoder blocks
        self.layers = nn.ModuleList([DecoderLayer() for _ in range(n_layers)])
DecoderLayer() builds each individual Decoder block:
class DecoderLayer(nn.Module):
    def __init__(self):
        super(DecoderLayer, self).__init__()
        self.dec_self_attn = MultiHeadAttention()  # masked self-attention over the target sequence
        self.dec_enc_attn = MultiHeadAttention()   # encoder-decoder attention
        self.pos_ffn = PoswiseFeedForwardNet()

    def forward(self, dec_inputs, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask):
        # Self-attention: Q, K and V all come from the decoder input
        dec_outputs, dec_self_attn = self.dec_self_attn(dec_inputs, dec_inputs, dec_inputs, dec_self_attn_mask)
        # Encoder-decoder attention: Q from the decoder, K and V from the encoder output
        dec_outputs, dec_enc_attn = self.dec_enc_attn(dec_outputs, enc_outputs, enc_outputs, dec_enc_attn_mask)
        dec_outputs = self.pos_ffn(dec_outputs)
        return dec_outputs, dec_self_attn, dec_enc_attn
Tip:
The attention in the Decoder differs from the attention in the Encoder. In the Decoder's self-attention, the current word is only influenced by the content that comes before it, whereas in the Encoder every word is influenced by the content both before and after it. How is this achieved? See the explanation of the mask mechanism in Section 3.2.
3.2 The Mask Mechanism
Masks serve two main purposes in the Transformer:
1. In both the encoder and the decoder: wherever the input is a padding token, the attention weight is forced to 0.
2. In the decoder: when predicting the next word, the mask prevents the decoder from sneaking a look at the part of the translation that comes later.
Generating the mask on the Encoder input side is simple:
def get_attn_pad_mask(seq_q, seq_k):
    batch_size, len_q = seq_q.size()
    batch_size, len_k = seq_k.size()
    # eq(0) marks PAD tokens; True (1) means the position is masked out
    pad_attn_mask = seq_k.data.eq(0).unsqueeze(1)           # [batch_size x 1 x len_k]
    return pad_attn_mask.expand(batch_size, len_q, len_k)   # [batch_size x len_q x len_k]
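A small illustrative example, assuming token id 0 is the PAD id (as eq(0) above implies):

# Two source sentences of length 4; the second one ends with two padding tokens
seq = torch.tensor([[5, 7, 2, 9],
                    [4, 6, 0, 0]])
mask = get_attn_pad_mask(seq, seq)   # [2 x 4 x 4]; True where the key position is padding
print(mask[1].int())
# tensor([[0, 0, 1, 1],
#         [0, 0, 1, 1],
#         [0, 0, 1, 1],
#         [0, 0, 1, 1]], dtype=torch.int32)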
ͬÑùµÄ£¬Êä³ö¶ËÒ²¿ÉÒÔÉú³ÉÒ»¸ömask£¬µ«ÊÇ»á¶îÍâÔö¼ÓÒ»¸ö²½Ö裺
Ä¿±êÓï¾ä£¨ÊäÈë·¨Ó¡ª>Êä³öÓ¢Ó×÷Ϊ³õʼֵÊä½ø½âÂëÆ÷ÖС£decoderͨ¹ýencoderµÄÈ«²¿Êä³ö£¬ÒÔ¼°Ä¿Ç°ÒÑ·ÒëµÄµ¥´ÊÀ´Ô¤²âÏÂÒ»¸öµ¥´Ê¡£Òò´Ë£¬ÎÒÃÇÐèÒª·ÀÖ¹½âÂëÆ÷͵¿´µ½»¹Ã»Ô¤²âµÄµ¥´Ê¡£ÎªÁË´ï³ÉÕâ¸öÄ¿µÄ£¬ÎÒÃÇÓõ½ÁËsubsequent_maskº¯Êý£º
def get_attn_subsequent_mask(seq):
    attn_shape = [seq.size(0), seq.size(1), seq.size(1)]
    # Upper-triangular matrix (k=1): the 1s above the diagonal mark "future" positions
    subsequent_mask = np.triu(np.ones(attn_shape), k=1)
    subsequent_mask = torch.from_numpy(subsequent_mask).byte()
    return subsequent_mask

With this mask applied in the Decoder's attention, each prediction only uses the part of the sentence before the current word and is unaffected by what comes after it, as the sketch below illustrates.
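Putting the two masks together: in the decoder's self-attention a position is masked if it is either padding or lies in the future, which is typically done by adding the two masks and thresholding. A sketch of how the Decoder's forward pass might combine them (illustrative only; names follow the conventions used in this article):

    # (inside class Decoder) -- illustrative sketch only
    def forward(self, dec_inputs, enc_inputs, enc_outputs):
        positions = torch.arange(1, dec_inputs.size(1) + 1,
                                 device=dec_inputs.device).unsqueeze(0)
        dec_outputs = self.tgt_emb(dec_inputs) + self.pos_emb(positions)

        # A position is masked if the key is padding OR lies in the future
        dec_self_attn_pad_mask = get_attn_pad_mask(dec_inputs, dec_inputs)
        dec_self_attn_subsequent_mask = get_attn_subsequent_mask(dec_inputs)
        dec_self_attn_mask = torch.gt(dec_self_attn_pad_mask + dec_self_attn_subsequent_mask, 0)

        # Encoder-decoder attention only needs to hide the encoder-side padding
        dec_enc_attn_mask = get_attn_pad_mask(dec_inputs, enc_inputs)

        for layer in self.layers:
            dec_outputs, _, _ = layer(dec_outputs, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask)
        return dec_outputs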
4. Summary
With this article we have pretty much covered the ins and outs of the Transformer, so it is time to move on to Bert. Bert is essentially the encoder of a bidirectional Transformer, so understanding how the Transformer works is crucial for understanding the Bert model. From Self-Attention to the Transformer and on to Bert, each piece builds on the previous one, and none of them can be skipped.