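Greedy Matching(貪婪比對)是一種基於詞級別的矩陣比對方法。給定兩個句子 $r$ 和 $\hat{r}$,每一個詞 $w \in r$ 經過詞向量轉換後得到詞向量 $e_w$,再與 $\hat{r}$ 中每一個詞 $\hat{w} \in \hat{r}$ 的詞向量 $e_{\hat{w}}$ 計算餘弦相似度並取其中的最大值,最後對 $r$ 中所有詞的最大相似度取均值:
$$G(r, \hat{r}) = \frac{\sum_{w \in r} \max_{\hat{w} \in \hat{r}} \cos(e_w, e_{\hat{w}})}{|r|}$$
由於上式對 $r$ 和 $\hat{r}$ 並不對稱,最終得分取兩個方向的平均:
$$GM(r, \hat{r}) = \frac{G(r, \hat{r}) + G(\hat{r}, r)}{2}$$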
首先要去 GloVe 官網 https://nlp.stanford.edu/projects/glove/ 下載訓練好的英文詞向量壓縮包,我選擇的是 glove.840B.300d.zip,解壓後得到檔案 glove.840B.300d.txt(你也可以下載比較小的包),解壓並儲存到你建立的項目下;中文的話要自己去騰訊 AI Lab 下載中文詞向量。以下是計算兩個句子基於詞級別的貪婪比對的實作代碼:
import numpy as np
import re
def cosine_similarity(x, y, norm=False):
    """Compute the cosine similarity of two equal-length vectors.

    Args:
        x: Sequence of numbers (list, tuple or NumPy array).
        y: Sequence of numbers with the same length as ``x``.
        norm: If true, rescale the cosine from [-1, 1] to [0, 1].

    Returns:
        The cosine similarity as a float. The cosine is undefined when either
        vector is all zeros; in that case 1.0 is returned if both vectors are
        all zeros and 0.0 otherwise (this shortcut is not rescaled by
        ``norm``).

    Raises:
        AssertionError: If the two vectors differ in length.
    """
    assert len(x) == len(y), "len(x) != len(y)"
    # Work on float arrays so lists, tuples and ndarrays are all accepted;
    # comparing an ndarray against a list with ``==`` is element-wise and
    # cannot be used directly as a condition.
    x_vec = np.asarray(x, dtype=float)
    y_vec = np.asarray(y, dtype=float)
    x_is_zero = not x_vec.any()
    y_is_zero = not y_vec.any()
    if x_is_zero or y_is_zero:
        return 1.0 if (x_is_zero and y_is_zero) else 0.0
    denominator = np.sqrt(np.dot(x_vec, x_vec)) * np.sqrt(np.dot(y_vec, y_vec))
    cos = float(np.dot(x_vec, y_vec) / denominator)
    # Optional affine map onto the [0, 1] interval.
    return 0.5 * cos + 0.5 if norm else cos
def conver_float(x):
    """Convert an iterable of numeric strings into a list of floats.

    Args:
        x: Iterable whose items are accepted by ``float()``, e.g. the value
            fields of one word-embedding line.

    Returns:
        A new list holding ``float(item)`` for every item of ``x``, in order.
    """
    return list(map(float, x))
def process_wordembe(path):
    """Read every line of a word-embedding text file into a list.

    Args:
        path: Path of a UTF-8 text file such as ``glove.840B.300d.txt`` in
            which each line holds a word followed by its vector components,
            separated by whitespace.

    Returns:
        A list of the raw lines (line endings included), for example
        ``['- 0.12332 ... -0.34542\\n', ..., 'you 0.34521 ... -0.23123\\n']``.
    """
    # The context manager closes the handle even if reading fails.
    with open(path, 'r', encoding='utf-8') as f:
        return f.readlines()
def word2vec(x, lines):
    """Look up the embedding of every word of a sentence.

    Args:
        x: A whitespace-tokenised sentence, e.g. ``"hello , how are you ?"``.
        lines: Raw embedding lines of the form ``"word v1 v2 ... vn"``.

    Returns:
        A list ``[[word_vector1], ..., [word_vectorn]]`` with one list of
        floats per sentence word that has an entry in ``lines``, in sentence
        order. Words without an entry are skipped. When a word occurs on
        several lines the first one is used.
    """
    # NOTE(review): the last whitespace-separated token is dropped, so a
    # sentence is expected to end with a token that should not be embedded
    # (the sample inputs end in punctuation). greedy() relies on the same
    # slice - confirm this is intended before changing it.
    words = x.split()[:-1]
    wanted = set(words)
    found = {}
    # One pass over the table instead of one pass per word.
    for line in lines:
        if len(found) == len(wanted):
            break
        # split() already discards the trailing newline, so a final line
        # without one keeps its last digit.
        fields = line.split()
        if not fields:
            continue  # blank line: nothing to match
        token = fields[0]
        if token in wanted and token not in found:
            found[token] = conver_float(fields[1:])
    # Copy so repeated words do not share one list object.
    return [list(found[w]) for w in words if w in found]
def greedy(x, x_words, y_words):
    """Compute the one-directional greedy matching score of ``x`` against ``y``.

    Each vector in ``x_words`` is paired with the vector in ``y_words`` that
    gives the highest cosine similarity; those maxima are summed and divided
    by the number of tokens of ``x`` (not counting its last token).

    Args:
        x: The sentence that ``x_words`` was built from, as a string.
        x_words: List of word vectors (lists of floats) for sentence ``x``.
        y_words: List of word vectors (lists of floats) for the other sentence.

    Returns:
        The averaged score as a float. Words of ``x`` that have no vector in
        ``x_words`` still count in the denominator, so they lower the score.
        The cosine is used without normalisation, so individual terms may be
        negative. 0.0 is returned when ``x`` has no countable tokens.
    """
    # Same tokenisation as word2vec(): the final token is not counted.
    token_count = len(x.split()[:-1])
    if token_count == 0:
        # Nothing to average over; avoid dividing by zero.
        return 0.0
    total = 0.0
    if y_words:  # max() of an empty sequence would raise
        for x_v in x_words:
            total += max(cosine_similarity(x_v, y_v) for y_v in y_words)
    return total / token_count
def greedy_match(path, x, y):
    """Compute the symmetric greedy matching score of two sentences.

    Args:
        path: Path of the word-embedding text file to load.
        x: A sentence, here the candidate answer.
        y: A sentence, here the reference answer.

    Returns:
        The mean of the two directional scores ``greedy(x -> y)`` and
        ``greedy(y -> x)``.
    """
    embedding_lines = process_wordembe(path)
    candidate_vecs, reference_vecs = (
        word2vec(sentence, embedding_lines) for sentence in (x, y)
    )
    # Score both directions, then average them.
    forward = greedy(x, candidate_vecs, reference_vecs)
    backward = greedy(y, reference_vecs, candidate_vecs)
    return (forward + backward) / 2
if __name__ == '__main__':
    # Demo: score one candidate/reference sentence pair.
    # Change this to the location of the embedding file in your own project.
    path = 'G:\\PycharmProjects\\test\\glove.840B.300d.txt'
    # Sentences are pre-tokenised: tokens are separated by single spaces.
    x = "what 's wrong ? \n"
    y = "I 'm fine . \n"
    score = greedy_match(path, x, y)
    print(score)
如有問題,歡迎評論指正。