
Advanced LDA Topic Modeling

I have actually written about LDA before, in the post on topic extraction with TF-IDF and gensim.

The theory behind LDA will be covered in a later post when I get the chance.

import numpy as np
from gensim import corpora, models, similarities
from pprint import pprint  # nicer formatting for printed output

1. Build the stopword list

def load_stopword():
    # One stopword per line; the context manager makes sure the file is closed.
    # Encoding is assumed to be UTF-8, matching news.dat below.
    with open('stopword.txt', encoding='utf-8') as f_stop:
        sw = [line.strip() for line in f_stop]
    return sw

stop_words = load_stopword()
print(stop_words)
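Since step 2 checks every token against this list, one optional tweak (not in the original post) is to store the stopwords in a set, which gives constant-time membership tests:

# Optional variant: a set makes `word not in stop_words` an O(1) check,
# which matters when news.dat is large.
stop_words = set(load_stopword())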

2. Read the data file and remove stopwords

with open('news.dat', encoding='UTF-8') as f:
    # Lowercase each line, split on whitespace, and drop stopwords.
    texts = [[word for word in line.strip().lower().split() if word not in stop_words]
             for line in f]
M = len(texts)  # number of documents
pprint(texts)

3. Build the dictionary, vectorize the texts, and apply the TF-IDF transform

dictionary = corpora.Dictionary(texts)                  # token -> id mapping
corpus = [dictionary.doc2bow(text) for text in texts]   # bag-of-words vectors
corpus_tfidf = models.TfidfModel(corpus)[corpus]        # TF-IDF-weighted corpus
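To sanity-check the transform, it helps to peek at the highest-weighted terms of one document. This is just an inspection sketch, not part of the original pipeline:

# Print the 5 highest-weighted terms of the first document.
# dictionary[token_id] maps an id back to its token string.
first_doc = corpus_tfidf[0]
for token_id, weight in sorted(first_doc, key=lambda x: -x[1])[:5]:
    print(dictionary[token_id], round(weight, 4))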

4. Fit the LDA model

lda = models.LdaModel(corpus_tfidf, num_topics=30, id2word=dictionary,
                      alpha=0.01, eta=0.01, minimum_probability=0.001,
                      update_every=1, chunksize=100, passes=1)
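Before digging into per-document results, gensim's show_topics gives a quick formatted view of what the model learned; a small inspection sketch:

# Each entry is (topic_id, 'weight*"word" + weight*"word" + ...').
for topic_id, topic_str in lda.show_topics(num_topics=5, num_words=7):
    print(topic_id, topic_str)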

4.1 Topic distributions of all documents

doc_topic = [a for a in lda[corpus_tfidf]]  # per-document (topic_id, probability) lists
pprint(doc_topic)

4.2 Print the topics of 10 randomly chosen documents

num_show_topic = 10  # number of top topics to show per document
doc_topics = lda.get_document_topics(corpus_tfidf)  # alternative way to get all document-topic distributions (unused below)
idx = np.arange(M)
np.random.shuffle(idx)
idx = idx[:10]  # pick 10 documents at random
for i in idx:
    topic = np.array(doc_topic[i])               # rows of (topic_id, probability)
    topic_distribute = np.array(topic[:, 1])
    # positions of the num_show_topic largest probabilities, in descending order
    topic_idx = topic_distribute.argsort()[:-num_show_topic - 1:-1]
    topic_ids = topic[topic_idx, 0].astype(int)  # map positions back to actual topic ids
    print('Top %d topics of document %d:' % (num_show_topic, i), topic_ids)
    print(topic_distribute[topic_idx])
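The similarities module imported at the top is never used above; one natural follow-up is to index the documents in topic space and ask which ones resemble a given document. A sketch under that assumption:

# Build a dense similarity index over the per-document topic vectors.
index = similarities.MatrixSimilarity(lda[corpus_tfidf], num_features=lda.num_topics)
sims = index[lda[corpus_tfidf[0]]]   # cosine similarity of document 0 to every document
print(sorted(enumerate(sims), key=lambda x: -x[1])[:5])   # the 5 most similar documents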

4.3 Print the top 7 words of each of the first 10 topics

num_show_term = 7  # number of words to show per topic
for topic_id in range(num_show_topic):
    print('Topic #%d:' % topic_id)
    term_distribute_all = lda.get_topic_terms(topicid=topic_id)        # (term_id, probability) pairs
    term_distribute = np.array(term_distribute_all[:num_show_term])
    term_id = term_distribute[:, 0].astype(int)                        # np.int is deprecated; use int
    print('Words:\t', end='')
    for t in term_id:
        print(dictionary[t], end=' ')    # dictionary[t] maps an id back to its token
    print()                              # end the line of words
    print('--------------')
    print('Probabilities:\t', term_distribute[:, 1])
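A common way to judge whether num_topics=30 is reasonable is topic coherence; the sketch below assumes the tokenized texts list from step 2 is still in memory:

# 'c_v' coherence needs the original tokenized documents, not the bow corpus.
from gensim.models import CoherenceModel

cm = CoherenceModel(model=lda, texts=texts, dictionary=dictionary, coherence='c_v')
print('coherence (c_v):', cm.get_coherence())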