Theory post: https://blog.csdn.net/weixin_41089007/article/details/106604465
A while ago I wrote a post about several improved embedding algorithms, and over the past few days I tried to implement them. For some reason they all performed worse than gensim's built-in word2vec. The cause may be a problem with my own CBOW model, or I may have reproduced those methods incorrectly, but either way, here is a record of the attempt!
Code repository
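For reference, the gensim baseline mentioned above is just the stock trainer; a minimal sketch of that comparison point (gensim 4.x API; the toy corpus and parameters below are placeholders, not the actual experiment settings):

from gensim.models import Word2Vec

# sentences: tokenized reviews, e.g. produced by review.strip().split()
sentences = [["this", "movie", "rocks"], ["boring", "plot"]]

# Plain CBOW (sg=0); vector_size/window/epochs here are illustrative only
w2v = Word2Vec(sentences, vector_size=200, window=5, min_count=1, sg=0, epochs=5)
vector = w2v.wv["movie"]  # 200-dimensional vector for "movie"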
https://github.com/ZJUhjx/NewEmbeddings
Key code
TF-CR Word Embedding
First, generate the word-to-weight lookup table. As the code below computes it, the TF-CR weight of a word for a class is TF x CR = (wc / nc) x (wc / w), where wc is the word's count in that class, nc is the number of distinct words in the class, and w is the word's count over both classes.
import json
from collections import Counter

import pandas as pd

def genWordWeight(dataSource, stopWordDict):
    data = pd.read_csv(dataSource)
    positive = data[data['sentiment'] == 'positive']
    negative = data[data['sentiment'] == 'negative']

    # Tokenize positive reviews, drop stop words, and count word frequencies
    review_pos = positive['review'].tolist()
    review_pos = [review.strip().split() for review in review_pos]
    review_pos = [word for review in review_pos for word in review]
    review_pos = [word for word in review_pos if word not in stopWordDict]
    count_pos = Counter(review_pos)

    # Same for negative reviews
    review_neg = negative['review'].tolist()
    review_neg = [review.strip().split() for review in review_neg]
    review_neg = [word for review in review_neg for word in review]
    review_neg = [word for word in review_neg if word not in stopWordDict]
    count_neg = Counter(review_neg)

    # TF-CR weights for the positive class: wc^2 / (nc * w)
    word_weight = {}
    for word in count_pos:
        wc = count_pos[word]                   # count of the word in the positive class
        nc = len(count_pos)                    # number of distinct words in the positive class
        w = count_pos[word] + count_neg[word]  # count of the word over both classes
        weight = wc * wc / (nc * w)
        word_weight[word] = weight
    with open("data/pos_weight.json", "w", encoding="utf-8") as f:
        json.dump(word_weight, f)

    # TF-CR weights for the negative class
    word_weight = {}
    for word in count_neg:
        wc = count_neg[word]
        nc = len(count_neg)
        w = count_pos[word] + count_neg[word]
        weight = wc * wc / (nc * w)
        word_weight[word] = weight
    with open("data/neg_weight.json", "w", encoding="utf-8") as f:
        json.dump(word_weight, f)
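A hypothetical call, just to show the expected inputs (the CSV needs 'review' and 'sentiment' columns; the file names below are placeholders, not paths from the repo):

stopWordDict = set(open("data/english_stopwords.txt", encoding="utf-8").read().split())
genWordWeight("data/labeledTrainData.csv", stopWordDict)
# writes data/pos_weight.json and data/neg_weight.json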
Then generate the final embedding matrix.
def getTfcrEmbedding(vocab, embedding_matrix):
    with open("data/pos_weight.json", "r", encoding="utf-8") as f:
        pos_weight = json.load(f)
    with open("data/neg_weight.json", "r", encoding="utf-8") as f:
        neg_weight = json.load(f)

    # Scale each word vector by its positive-class TF-CR weight
    pos_embedding = np.zeros((embedding_matrix.shape[0], embedding_matrix.shape[1]))
    for i, word in enumerate(vocab):
        if word in ['PAD', 'UNK']:
            pos_embedding[i] = embedding_matrix[i]
        else:
            pos_embedding[i] = embedding_matrix[i] * pos_weight.get(word, 0)  # missing words get zero weight

    # Scale each word vector by its negative-class TF-CR weight
    neg_embedding = np.zeros((embedding_matrix.shape[0], embedding_matrix.shape[1]))
    for i, word in enumerate(vocab):
        if word in ['PAD', 'UNK']:
            neg_embedding[i] = embedding_matrix[i]
        else:
            neg_embedding[i] = embedding_matrix[i] * neg_weight.get(word, 0)

    # Concatenate the two class-weighted embeddings along the feature dimension
    tfcr_embedding = np.hstack([pos_embedding, neg_embedding])
    return tfcr_embedding
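One way the inputs could be assembled from a trained word2vec model (using the gensim model w2v from the sketch above; vocab must be index-aligned with the rows of embedding_matrix, and the variable names here are assumptions rather than code from the repo):

import numpy as np

vocab = ['PAD', 'UNK'] + list(w2v.wv.index_to_key)

embedding_matrix = np.zeros((len(vocab), w2v.vector_size))
for i, word in enumerate(vocab):
    if word not in ('PAD', 'UNK'):
        embedding_matrix[i] = w2v.wv[word]

# Concatenation doubles the embedding dimension: [pos-weighted | neg-weighted]
tfcr_embedding = getTfcrEmbedding(vocab, embedding_matrix)
print(tfcr_embedding.shape)  # (len(vocab), 2 * w2v.vector_size)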
Attention Word Embedding
import torch
import torch.nn as nn
import torch.nn.functional as F

class AttentionEmbeddingModel(nn.Module):
    def __init__(self, vocab_size, embed_size, attn_size):
        '''Initialize the input embedding and the attention (key/query) embeddings.'''
        super().__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.attn_size = attn_size

        # Shared parameters: the input and output embeddings share one table
        self.in_embed = nn.Embedding(self.vocab_size, self.embed_size)
        self.k = nn.Embedding(self.vocab_size, self.attn_size)
        self.q = nn.Embedding(self.vocab_size, self.attn_size)

        initrange = 0.5 / self.embed_size
        self.in_embed.weight.data.uniform_(-initrange, initrange)
        initrange2 = 0.5 / self.attn_size
        self.k.weight.data.uniform_(-initrange2, initrange2)
        self.q.weight.data.uniform_(-initrange2, initrange2)

    def forward(self, input_labels, pos_labels, neg_labels):
        '''
        input_labels: center words, [batch_size]
        pos_labels: words appearing in the context window around the center word, [batch_size, window_size * 2]
        neg_labels: words not appearing around the center word, drawn by negative sampling, [batch_size, window_size * 2 * K]
        shape flow: pos_labels [B, 2C] -> pos_embedding [B, 2C, embed]; attention scores [B, 2C]
        return: loss
        '''
        input_embedding = self.in_embed(input_labels)  # B * embed_size
        pos_embedding = self.in_embed(pos_labels)      # B * (2*C) * embed_size
        neg_embedding = self.in_embed(neg_labels)      # B * (2*C*K) * embed_size

        input_k = self.k(input_labels)  # B * attn_size
        pos_k = self.q(pos_labels)      # B * (2*C) * attn_size

        log_pos = torch.bmm(pos_embedding, input_embedding.unsqueeze(2)).squeeze(2)  # B * (2*C)
        attn_pos = torch.bmm(pos_k, input_k.unsqueeze(2)).squeeze(2)  # B * (2*C), one weight per context word
        log_pos = torch.mul(log_pos, attn_pos)  # element-wise product: attention-weighted scores

        log_neg = torch.bmm(neg_embedding, -input_embedding.unsqueeze(2)).squeeze(2)  # B * (2*C*K)

        log_pos = F.logsigmoid(log_pos).sum(1)  # .sum() would give a scalar; .sum(1) keeps a 1-D tensor of size batch_size
        log_neg = F.logsigmoid(log_neg).sum(1)  # batch_size

        loss = log_pos + log_neg
        return -loss

    def input_embeddings(self):
        return self.in_embed.weight.data.cpu().numpy()
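A minimal training-loop sketch for this model (the dataloader yielding batches of center, context, and negative-sample word indices is assumed here; it is not part of the excerpt above):

model = AttentionEmbeddingModel(vocab_size=len(vocab), embed_size=200, attn_size=64)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(5):
    for input_labels, pos_labels, neg_labels in dataloader:  # LongTensors of word indices
        optimizer.zero_grad()
        loss = model(input_labels, pos_labels, neg_labels).mean()  # forward returns per-sample loss
        loss.backward()
        optimizer.step()

embedding_matrix = model.input_embeddings()  # numpy array, vocab_size x embed_size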
I did not implement the weighted-average word vector method because I could never get glove installed... anyone interested can try reproducing it themselves...
Experimental results
epoch: 5
embedding requires grad: True
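For context, "embedding requires grad: True" corresponds to loading the pre-built matrix into a trainable embedding layer of the downstream classifier; a sketch of that step only, assuming the tfcr_embedding matrix built earlier (the classifier itself is omitted):

import torch
import torch.nn as nn

weights = torch.tensor(tfcr_embedding, dtype=torch.float32)
embed_layer = nn.Embedding.from_pretrained(weights, freeze=False)  # freeze=False -> weight.requires_grad is True
print(embed_layer.weight.requires_grad)  # True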