计算分类准确率

1
2
3
_, preds = torch.max(logits, dim=1) # predicted class = argmax over the class dimension (dim=1)

correct += preds.eq(targets.expand_as(preds)).cpu().sum() # accumulate the number of correctly classified samples; NOTE(review): expand_as assumes targets broadcasts to preds' shape -- confirm against the caller

CosineLinear

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
class CosineLinear(nn.Module):
    """Cosine-similarity classification head.

    Logits are the cosine similarity between the L2-normalized input and each
    L2-normalized weight row, optionally scaled by a learnable scalar ``sigma``
    and optionally reduced over ``nb_proxy`` proxy rows per class.
    """

    def __init__(self, in_features, out_features, nb_proxy=1, to_reduce=False, sigma=True):
        super(CosineLinear, self).__init__()
        self.in_features = in_features
        self.nb_proxy = nb_proxy
        # One weight row per proxy, nb_proxy rows per logical class.
        self.out_features = out_features * nb_proxy
        self.to_reduce = to_reduce
        self.weight = nn.Parameter(torch.Tensor(self.out_features, in_features))
        if sigma:
            # Learnable temperature-like scale applied to the cosine scores.
            self.sigma = nn.Parameter(torch.Tensor(1))
        else:
            self.register_parameter('sigma', None)
        self.reset_parameters()

    def reset_parameters(self):
        """Uniform init in [-1/sqrt(fan_in), 1/sqrt(fan_in)]; sigma starts at 1."""
        bound = 1. / math.sqrt(self.weight.size(1))
        self.weight.data.uniform_(-bound, bound)
        if self.sigma is not None:
            self.sigma.data.fill_(1)

    def forward(self, input):
        """Return ``{'logits': scores}`` of shape (batch, out_features), reduced if requested."""
        normalized_input = F.normalize(input, p=2, dim=1)
        normalized_weight = F.normalize(self.weight, p=2, dim=1)
        out = F.linear(normalized_input, normalized_weight)

        if self.to_reduce:
            # Collapse the nb_proxy scores of each class into one logit.
            out = reduce_proxies(out, self.nb_proxy)

        if self.sigma is not None:
            out = self.sigma * out

        return {'logits': out}

余弦相似度

1
teacher_embedding是60*768的tensor,student_embedding是5*768的tensor,如何用nn.functional.cosine_similarity计算余弦相似度,返回一个5*60的矩阵
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
import torch
import torch.nn.functional as F

# Example input tensors (random placeholders for the embeddings)
teacher_embedding = torch.randn(60, 768) # [60, 768]
student_embedding = torch.randn(5, 768) # [5, 768]

# Pairwise cosine-similarity matrix (student rows x teacher columns, e.g. 5x60).
def compute_cosine_similarity_matrix(student, teacher):
    """Return an (S, T) matrix where entry [i, j] = cos(student[i], teacher[j]).

    student: (S, D) tensor; teacher: (T, D) tensor.
    """
    # Broadcast (S, 1, D) against (1, T, D) and reduce over the feature dim.
    return F.cosine_similarity(student[:, None, :], teacher[None, :, :], dim=2)

cosine_sim_matrix = compute_cosine_similarity_matrix(student_embedding, teacher_embedding)
print(cosine_sim_matrix.shape) # prints: torch.Size([5, 60])

关键点说明:

  1. 维度扩展

    • student_embedding 从 [5,768] → [5,1,768]
    • teacher_embedding 从 [60,768] → [1,60,768]
    • 这样广播机制会自动将其扩展为 [5,60,768] 的形状
  2. 余弦相似度计算

    • F.cosine_similarity(..., dim=2) 表示沿着最后一个维度(768维)计算相似度
    • 计算结果会自动压缩为 [5,60] 的形状
  3. 数学等价性
    这等价于为每个student向量(共5个)分别计算它与所有60个teacher向量的余弦相似度

可视化理解:

1
2
3
4
5
6
7
8
9
学生向量: [s1, s2, s3, s4, s5] 每个s_i是768维
教师向量: [t1, t2, ..., t60] 每个t_j是768维

结果矩阵:
t1 t2 ... t60
s1 [ cos(s1,t1), cos(s1,t2), ..., cos(s1,t60) ]
s2 [ cos(s2,t1), cos(s2,t2), ..., cos(s2,t60) ]
...
s5 [ cos(s5,t1), cos(s5,t2), ..., cos(s5,t60) ]

其他实现方式

如果更喜欢使用矩阵乘法实现(实际结果相同):

1
2
3
4
5
6
# L2-normalize each row so plain dot products become cosine similarities.
student_norm = F.normalize(student_embedding, p=2, dim=1)  # [5, 768]
teacher_norm = F.normalize(teacher_embedding, p=2, dim=1)  # [60, 768]

# Matrix product of unit-norm rows == full cosine-similarity matrix.
cosine_sim_matrix = student_norm @ teacher_norm.T  # [5, 60]

两种方法在数值上是等价的,第一种方法更简洁,第二种方法更直观。