先来构建 Transformer 层,其中最重要的是自注意力机制。它让序列中的每个位置都能与同一序列中的所有其他位置交互,并计算彼此之间的相关性。
以第二个单词(也就是 journey)为例,先计算它与序列中每个单词的相关性分数(即注意力分数)。

import torch

# Toy embeddings: one 3-dimensional vector per token of the sentence
# "Your journey starts with one step".
inputs = torch.tensor(
    [[0.43, 0.15, 0.89],  # Your     (x^1)
     [0.55, 0.87, 0.66],  # journey  (x^2)
     [0.57, 0.85, 0.64],  # starts   (x^3)
     [0.22, 0.58, 0.33],  # with     (x^4)
     [0.77, 0.25, 0.10],  # one      (x^5)
     [0.05, 0.80, 0.55]]  # step     (x^6)
)

query = inputs[1]  # 2nd input token is the query

# Unnormalized attention scores: the query dotted with every input vector,
# so the result holds one score per token.
attn_scores_2 = torch.empty(inputs.shape[0])
for i, x_i in enumerate(inputs):
    # dot product (transpose not necessary here since they are 1-dim vectors)
    attn_scores_2[i] = torch.dot(x_i, query)

print(attn_scores_2)
注意力分数最后需要归一化。按照惯例,未归一化的结果被称为“注意力分数”,归一化之后(总和为 1)的结果被称为“注意力权重”。
# Naive normalization: divide every score by the total so the weights sum to 1.
attn_weights_2_tmp = attn_scores_2 / attn_scores_2.sum()

print("Attention weights:", attn_weights_2_tmp)
print("Sum:", attn_weights_2_tmp.sum())
实际中会使用 PyTorch 的 softmax 进行归一化:它对极端值的处理更稳健(输出恒为正且总和为 1),并且在训练期间具有更理想的梯度性质。
# Normalize the scores with softmax along dim 0 (the scores form a 1-D tensor).
attn_weights_2 = torch.softmax(attn_scores_2, dim=0)

print("Attention weights:", attn_weights_2)
print("Sum:", attn_weights_2.sum())

query = inputs[1]  # 2nd input token is the query

# Context vector: every input vector scaled by its attention weight, summed up.
context_vec_2 = torch.zeros(query.shape)
for i, x_i in enumerate(inputs):
    context_vec_2 += attn_weights_2[i] * x_i

print(context_vec_2)
上面计算了输入 2 的注意力权重和上下文向量。接下来推广这一计算:引入权重矩阵 W_query、W_key、W_value,先把输入投影成 query、key、value,再据此计算注意力权重和上下文向量。
x_2 = inputs[1]         # second input token
d_in = inputs.shape[1]  # input embedding size (3 for the toy inputs)
d_out = 2               # projection size, matching the (3, 2) weight shape below

torch.manual_seed(123)
# Query/key/value projection matrices; shape is 3 * 2 (d_in x d_out).
W_query = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
W_key = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
W_value = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)

# Project the second token into query space and all tokens into key/value space.
query_2 = x_2 @ W_query
keys = inputs @ W_key
values = inputs @ W_value
print("keys.shape:", keys.shape)
print("values.shape:", values.shape)
# keys.shape: torch.Size([6, 2])
# values.shape: torch.Size([6, 2])

attn_scores_2 = query_2 @ keys.T  # All attention scores for given query
print(attn_scores_2)

# Scale by sqrt(d_k), the key dimension, before the softmax.
d_k = keys.shape[1]
attn_weights_2 = torch.softmax(attn_scores_2 / d_k**0.5, dim=-1)
print(attn_weights_2)
最后得出输入 2 的上下文向量(首次引入 value)
# Context vector for input 2: the value vectors combined by the attention weights.
context_vec_2 = attn_weights_2 @ values
print(context_vec_2)
那怎么避免当前位置“看到”它后面的 token?做法是把每个位置之后(对角线上方)的注意力权重乘以 0,也就是乘上下面这个下三角掩码。
tensor([[1., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1.]])
而在实际实现中,不必先把对角线上方的注意力权重归零再重新归一化;更简单的做法是在未归一化的注意力分数进入 softmax 之前,把对角线上方的分数替换为负无穷大(-inf)。
# Why use -inf: softmax maps a score of -inf to a weight of exactly 0, so each
# row of weights already sums to 1 without a separate renormalization step.

# Attention scores for every query/key pair (one row per query token).
queries = inputs @ W_query
attn_scores = queries @ keys.T
context_length = attn_scores.shape[0]

# Ones strictly above the diagonal mark the "future" positions to hide.
mask = torch.triu(torch.ones(context_length, context_length), diagonal=1)
masked = attn_scores.masked_fill(mask.bool(), -torch.inf)
attn_weights = torch.softmax(masked / keys.shape[-1]**0.5, dim=-1)
使用 dropout
增加 dropout 层的目的是防止模型过拟合,打个比方,就是防止模型“鹦鹉学舌”式地死记训练数据。做法是在训练时随机把一部分注意力权重置零。
from torch import nn
import torch


class MultiHeadAttentionWrapper(nn.Module):
    """Run several independent attention heads and concatenate their outputs.

    Each head is a separate ``CausalAttention`` module built with the same
    arguments. NOTE(review): ``CausalAttention`` is not defined in this
    snippet; it is assumed to be defined elsewhere with the signature
    ``(d_in, d_out, context_length, dropout, qkv_bias)``.

    Args:
        d_in: Passed through to every head.
        d_out: Passed through to every head.
        context_length: Passed through to every head.
        dropout: Passed through to every head.
        num_heads: Number of heads to create.
        qkv_bias: Passed through to every head.
    """

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        # nn.ModuleList registers the heads as submodules of this wrapper.
        self.heads = nn.ModuleList(
            [CausalAttention(d_in, d_out, context_length, dropout, qkv_bias)
             for _ in range(num_heads)]
        )

    def forward(self, x):
        """Apply every head to ``x`` and concatenate the results on the last dim."""
        return torch.cat([head(x) for head in self.heads], dim=-1)
import torch
from torch import nn


class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention with all heads computed in one pass.

    A single query/key/value projection of width ``d_out`` is split into
    ``num_heads`` heads of size ``d_out // num_heads``. A causal mask hides
    later positions, and the heads are merged back and passed through an
    output projection.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Total output size across all heads; must be divisible by
            ``num_heads``.
        context_length: Side length of the square causal mask, i.e. the
            longest sequence the mask covers.
        dropout: Dropout probability applied to the attention weights.
        num_heads: Number of attention heads.
        qkv_bias: Whether the query/key/value projections use a bias term.

    Raises:
        AssertionError: If ``d_out`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert (d_out % num_heads == 0), \
            "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads  # Reduce the projection dim to match desired output dim

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
        self.dropout = nn.Dropout(dropout)
        # A buffer is module state that is not a trainable parameter: it
        # follows the module across devices and is saved in its state_dict.
        # The ones strictly above the diagonal mark the positions to hide.
        self.register_buffer(
            "mask",
            torch.triu(torch.ones(context_length, context_length), diagonal=1)
        )

    def forward(self, x):
        """Return the context vectors for ``x``.

        Args:
            x: Tensor of shape ``(b, num_tokens, d_in)``.

        Returns:
            Tensor of shape ``(b, num_tokens, d_out)``.
        """
        b, num_tokens, _ = x.shape

        keys = self.W_key(x)  # Shape: (b, num_tokens, d_out)
        queries = self.W_query(x)
        values = self.W_value(x)

        # We implicitly split the matrix by adding a `num_heads` dimension.
        # `view` only reinterprets the shape:
        # (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)

        # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        # Compute scaled dot-product attention (aka self-attention) with a causal mask
        attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head

        # Original mask truncated to the number of tokens and converted to boolean
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]

        # Use the mask to fill attention scores; -inf becomes weight 0 after softmax
        attn_scores.masked_fill_(mask_bool, -torch.inf)

        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # Shape after the transpose: (b, num_tokens, num_heads, head_dim)
        context_vec = (attn_weights @ values).transpose(1, 2)

        # Combine heads, where self.d_out = self.num_heads * self.head_dim.
        # `contiguous()` is needed because `view` cannot reshape the
        # transposed (non-contiguous) tensor directly.
        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec)  # optional projection

        return context_vec
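register_buffer 的用处:把 mask 注册为模块的 buffer。它不是可训练参数,但会随模块一起移动到对应设备(如 .to(device)),并保存在 state_dict 中。
view 的作用:在不复制数据的前提下,把最后一维 d_out 拆成 (num_heads, head_dim),相当于把投影结果按头切分。
运行代码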
# Enter the example directory, then run the script.
cd 03/
python .\main.py