# 其实我在TF-IDF和gensim实现主题提取写过LDA
# 关于LDA的理论相关知识以后有机会阐释
import numpy as np
from gensim import corpora,models,similarities
from pprint import pprint  # pretty-printer for nicer console output
# 1. 构建停用词列表
def load_stopword():
    """Load the stop-word list from ``stopword.txt`` (one word per line).

    Returns:
        list[str]: the stripped stop words, in file order.
    """
    # 'with' guarantees the handle is closed (the original leaked it), and an
    # explicit UTF-8 encoding matches how the rest of the pipeline reads text.
    with open('stopword.txt', encoding='utf-8') as f_stop:
        return [line.strip() for line in f_stop]
# Load the stop-word list once and echo it so the run can be sanity-checked.
stop_words = load_stopword()
print(stop_words)
# 2. 读取数据文件并去除停用词
# Read the corpus: one document per line; lower-case, split on whitespace,
# and drop stop words.  The original opened the file without ever closing
# it; 'with' releases the handle deterministically.
with open('news.dat', encoding='UTF-8') as f:
    texts = [[word for word in line.strip().lower().split()
              if word not in stop_words]
             for line in f]
M = len(texts)  # number of documents
pprint(texts)
# 3. 建立词典 + 文本向量 + TF-IDF转换
# Map each token to an integer id, turn every document into a bag-of-words
# vector, then re-weight the whole corpus with TF-IDF.
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(doc) for doc in texts]
tfidf_model = models.TfidfModel(corpus)
corpus_tfidf = tfidf_model[corpus]
# 4. LDA模型拟合推断
# Fit a 30-topic LDA model on the TF-IDF-weighted corpus.
lda = models.LdaModel(
    corpus_tfidf,
    num_topics=30,
    id2word=dictionary,
    alpha=0.01,                  # document-topic prior
    eta=0.01,                    # topic-word prior
    minimum_probability=0.001,   # drop topics below this probability
    update_every=1,
    chunksize=100,
    passes=1,
)
# 4.1 所有主题
# Materialize the per-document topic distributions for the whole corpus.
doc_topic = list(lda[corpus_tfidf])
pprint(doc_topic)
# 4.2 随机打印某10个文档的主题
# Print the top topics for 10 randomly chosen documents.
# NOTE(review): the original also computed
#   doc_topics = lda.get_document_topics(corpus_tfidf)
# but never used it (the loop reads doc_topic instead); dropped here.
num_show_topic = 10  # how many top topics to display per document
idx = np.arange(M)
np.random.shuffle(idx)
idx = idx[:10]  # sample 10 document indices without replacement
for i in idx:
    topic = np.array(doc_topic[i])            # rows of (topic_id, probability)
    topic_distribute = np.array(topic[:, 1])  # probability column
    # Indices of the num_show_topic largest probabilities, descending.
    topic_idx = topic_distribute.argsort()[:-num_show_topic - 1:-1]
    # Fix a Python-2 leftover: "print(...), topic_idx" built a throwaway
    # tuple and never actually printed the topic ids.
    print('第%d个文档的前%d个主题:' % (i, num_show_topic), topic_idx)
    print(topic_distribute[topic_idx])
# 4.3 随机打印某10个文档的主题的前7个词
# Print the top words (with probabilities) for each displayed topic.
num_show_term = 7  # how many terms to display per topic
for topic_id in range(num_show_topic):
    print('主题#%d:\t' % topic_id)
    term_distribute_all = lda.get_topic_terms(topicid=topic_id)
    term_distribute = np.array(term_distribute_all[:num_show_term])
    # np.int was deprecated in NumPy 1.20 and removed in 1.24; the plain
    # builtin int is the documented replacement.
    term_id = term_distribute[:, 0].astype(int)
    # end=' ' keeps all words on one line; the Python-2 trailing comma in
    # the original no longer suppresses the newline under Python 3.
    print('词:\t', end=' ')
    for t in term_id:
        print(dictionary.id2token[t], end=' ')
    print()
    print('--------------')
    print('\n概率:\t', term_distribute[:, 1])