Cross-attention plays a crucial role in transformer-based encoder-decoder models by enabling the decoder to focus on relevant parts of the input sequence while generating the output sequence. It acts as a bridge between the encoder and decoder, allowing the decoder to selectively attend to different parts of the encoded input based on the current decoding step. This selective attention mechanism helps the decoder gather the necessary information from the input to generate coherent and contextually appropriate output tokens. Key functions include aligning input and output sequences, providing context for output generation, and enabling selective information retrieval from the encoded input. This is crucial for tasks like machine translation where different parts of the input might be relevant at different stages of output generation.
In machine translation, when translating the English sentence "The cat sat on the mat" to French, the decoder might focus on the word "cat" when generating the French word "chat," and on the word "mat" when generating "tapis." Cross-attention allows the decoder to dynamically select and focus on the relevant parts of the input sentence at each decoding step.
import torch
import torch.nn as nn
# Simplified implementation of cross-attention
class CrossAttention(nn.Module):
    """Multi-head cross-attention: queries come from the decoder, while
    keys and values come from the encoder output, letting each decoding
    step attend to the relevant encoder positions.

    Args:
        d_model: model (embedding) dimension; must be divisible by
            ``num_heads``.
        num_heads: number of parallel attention heads.

    Raises:
        ValueError: if ``d_model`` is not a multiple of ``num_heads``.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        # Without this check, integer division would silently truncate d_k
        # and the .view() in forward() would fail with a confusing shape
        # error for any d_model that is not a multiple of num_heads.
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.num_heads = num_heads
        self.d_k = d_model // num_heads  # per-head dimension
        self.W_q = nn.Linear(d_model, d_model)  # Query projection
        self.W_k = nn.Linear(d_model, d_model)  # Key projection
        self.W_v = nn.Linear(d_model, d_model)  # Value projection
        self.W_o = nn.Linear(d_model, d_model)  # Output projection

    def forward(self, query, key, value, mask=None):
        """Apply multi-head scaled dot-product cross-attention.

        Args:
            query: ``(batch, seq_len_q, d_model)`` decoder-side states.
            key:   ``(batch, seq_len_k, d_model)`` encoder output.
            value: ``(batch, seq_len_k, d_model)`` encoder output.
            mask:  optional tensor broadcastable to
                ``(batch, num_heads, seq_len_q, seq_len_k)``; positions
                where ``mask == 0`` receive (effectively) zero attention.

        Returns:
            A tuple ``(output, attention_weights)`` with shapes
            ``(batch, seq_len_q, d_model)`` and
            ``(batch, num_heads, seq_len_q, seq_len_k)``.
        """
        batch_size = query.size(0)

        # 1. Project, then split the model dimension into heads:
        #    (batch, num_heads, seq_len, d_k)
        q = self.W_q(query).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        k = self.W_k(key).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        v = self.W_v(value).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)

        # 2. Scaled dot-product attention.
        attention_scores = torch.matmul(q, k.transpose(-1, -2)) / (self.d_k ** 0.5)
        if mask is not None:
            # A large negative score becomes ~0 weight after softmax.
            attention_scores = attention_scores.masked_fill(mask == 0, -1e9)
        attention_weights = torch.softmax(attention_scores, dim=-1)
        context_vector = torch.matmul(attention_weights, v)  # (batch, num_heads, seq_len_q, d_k)

        # 3. Merge the heads back into d_model, then apply the output projection.
        context_vector = context_vector.transpose(1, 2).contiguous().view(
            batch_size, -1, self.num_heads * self.d_k
        )
        output = self.W_o(context_vector)
        return output, attention_weights
# Example usage:
# The decoder supplies a 5-token query; the encoder supplies a 7-token
# sequence of keys and values.
batch_size = 2
seq_len_q = 5    # decoder input (query) length
seq_len_k = 7    # encoder output (key, value) length
d_model = 512
num_heads = 8

query = torch.randn(batch_size, seq_len_q, d_model)
key = torch.randn(batch_size, seq_len_k, d_model)
value = torch.randn(batch_size, seq_len_k, d_model)

cross_attention = CrossAttention(d_model, num_heads)
output, attention_weights = cross_attention(query, key, value)

print("Output Shape:", output.shape)                        # (batch_size, seq_len_q, d_model)
print("Attention Weights Shape:", attention_weights.shape)  # (batch_size, num_heads, seq_len_q, seq_len_k)