天天看点

python scipy 使用余弦相似度计算句子相似度

import jieba
import gensim
import numpy as np
from gensim import corpora
from scipy.spatial.distance import pdist

# Demo: compute sentence similarity with the cosine measure over
# bag-of-words count vectors built via jieba segmentation + gensim.
text1 = "我去玉龙雪山并且喜欢玉龙雪山玉龙雪山"
text2 = "我去玉龙雪山并且玉龙雪山玉龙雪山"

# Tokenize both sentences and build a shared id->token dictionary.
# (Original had `prune_at=` with no value — a syntax error; the default is fine.)
texts = [list(jieba.cut(text1)), list(jieba.cut(text2))]
dictionary = corpora.Dictionary(texts)
num_terms = len(dictionary)
print(num_terms)
# 5

# Bag-of-words sparse vectors, densified to shape (1, num_terms) rows.
# `np.int` is removed in modern NumPy; the builtin `int` is the replacement.
bow1 = dictionary.doc2bow(list(jieba.cut(text1)))
numpy_X = gensim.matutils.corpus2dense([bow1], num_terms=num_terms, dtype=int).T

bow2 = dictionary.doc2bow(list(jieba.cut(text2)))
numpy_Y = gensim.matutils.corpus2dense([bow2], num_terms=num_terms, dtype=int).T

# Optional standardization before comparing.
# NOTE(review): the original fit the scaler on a single sample (`numpy_X`),
# which makes every per-feature std 0 and the scaled vectors degenerate —
# fit on both samples stacked so the transform is meaningful. Verify this
# matches the author's intent; the blog's claimed output 0.9 is not
# reproducible either way.
from sklearn import preprocessing
stacked = np.vstack([numpy_X, numpy_Y])
scaler = preprocessing.StandardScaler().fit(stacked)
pred_X_train = scaler.transform(numpy_X)
pred_X_test = scaler.transform(numpy_Y)

# Cosine SIMILARITY = 1 - cosine distance. The original used `-pdist(...)`,
# which merely negates the distance (giving ~-0.039 here) instead of the
# similarity 0.96076892 shown in the expected output below.
sims = 1 - pdist(np.vstack([numpy_X, numpy_Y]), 'cosine')
sims1 = 1 - pdist(np.vstack([pred_X_train, pred_X_test]), 'cosine')
print(sims)
# [ 0.96076892]
print(sims1)