說明
- 最終的模型檔案:
  連結:https://pan.baidu.com/s/1acGhejPCw98Mx4iKozVZdw 提取碼:vsm1
- 源碼github位址: https://github.com/datadevsh/wiki-gensim-word2vector
- 如果遇到編碼問題,參考《維基百科檔案解析成中文遇到的變量類型、編碼問題》 https://my.oschina.net/datadev/blog/1836529
- 如果使用pycharm,可能會發生記憶體不足。把兩個pycharm64.exe.vmoptions檔案的-Xmx參數調大。
![](https://img.laitimes.com/img/9ZDMuAjOiMmIsIjOiQnIsIyZuBnL5YWZkJDNmZzN1gDOjFjNwQjN4gTZwUDN1EzN0cDMyMjMmFDN0YmNi9CXt92Yu4GZjlGbh5SZslmZxl3Lc9CX6MHc0RHaiojIsJye.png)
執行時間
1 解析xml 13分鐘
2 繁體2簡體 1分鐘
3 jieba分詞 27分鐘
4 模型訓練 22分鐘
總計63分鐘。
1. 下載檔案
下載pages-articles.xml檔案。打開下面的連結,選最近的日期,進入頁面後,搜尋“pages-articles.xml”。
下載位址:
https://dumps.wikimedia.org/zhwiki/
2. 解析xml
# -*- coding: utf-8 -*-
# 解析xml
import logging
import os.path
import sys
from gensim.corpora import WikiCorpus
import time
# Wall-clock start time, printed next to the end time when the script finishes.
begin = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())


def write_articles(texts, output, logger, log_every=10000):
    """Write each tokenised article to *output* as one space-separated line.

    Args:
        texts: iterable of token sequences, one sequence per article
            (here: ``WikiCorpus.get_texts()``).
        output: writable text file object.
        logger: logger used for the periodic progress message.
        log_every: emit a progress message every this many articles.

    Returns:
        The number of articles written (0 for an empty iterable).
    """
    count = 0
    for count, tokens in enumerate(texts, start=1):
        # Tokens may arrive as UTF-8 bytes (older gensim releases are said to
        # do this under Python 3 -- TODO confirm for the installed version);
        # decode them so that str.join does not raise TypeError.
        words = [t.decode('utf-8') if isinstance(t, bytes) else t
                 for t in tokens]
        output.write(' '.join(words) + "\n")
        if count % log_every == 0:
            logger.info("Saved " + str(count) + " articles")
    return count


if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    # Both the dump path and the output path are required; fail with a usage
    # line instead of an opaque tuple-unpacking ValueError.
    if len(sys.argv) < 3:
        print("usage: python %s <pages-articles.xml.bz2> <output.txt>" % program)
        sys.exit(1)
    inp, outp = sys.argv[1:3]

    wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
    # The context manager closes the output file even if parsing fails midway.
    with open(outp, 'w', encoding='utf-8') as output:
        i = write_articles(wiki.get_texts(), output, logger)
    logger.info("Finished Saved " + str(i) + " articles")

    end = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print("begin", begin)
    print("end ", end)
# python 1process-xml.py zhwiki-20180620-pages-articles.xml.1.49G.bz2 wiki.zh.1.49G.text
3. 繁體轉簡體
使用opencc。下載位址如下,下載opencc-1.0.1-win64.7z。
https://bintray.com/package/files/byvoid/opencc/OpenCC
.\opencc -i wiki_text.txt -o test.txt -c t2s.json
-i 輸入
-o 輸出
執行1分鐘左右。
4. jieba分詞
#-*- coding: utf-8 -*-
import jieba
import jieba.analyse
import codecs,sys
import time
# Wall-clock start time, printed next to the end time when the script finishes.
begin = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())


def cut_words(sentence):
    """Segment *sentence* with jieba and return the space-joined tokens.

    The result is encoded, so this returns UTF-8 ``bytes``, not ``str``.
    The script's own loop does not call this helper.
    """
    return " ".join(jieba.cut(sentence)).encode('utf-8')


def segment_lines(lines, target, cut=None, report_every=10000):
    """Segment every line of *lines* and write the result to *target*.

    Args:
        lines: iterable of text lines (an open text file works).
        target: writable text file object.
        cut: tokenizer taking a string and returning an iterable of tokens;
            defaults to ``jieba.cut``.
        report_every: print a progress line every this many lines.

    Returns:
        The number of lines processed.
    """
    if cut is None:
        cut = jieba.cut
    line_num = 1
    for line in lines:
        if line_num % report_every == 0:
            print('---------------processing', line_num, 'articles------------')
        # No newline is appended here: the line read from the input still
        # carries its own terminator when it is handed to the tokenizer.
        target.write(" ".join(cut(line)))
        line_num += 1
    return line_num - 1


if __name__ == '__main__':
    # Context managers close both files even if segmentation fails midway.
    with codecs.open('D:/soft/opencc-1.0.1-win64/wiki-ts.txt', 'r', encoding='utf8') as f, \
            codecs.open("D:/soft/opencc-1.0.1-win64/wiki.jieba.txt", 'w', encoding='utf8') as target:
        print(" open file")
        segment_lines(f, target)

    end = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print("begin", begin)
    print("end ", end)
    # sys.exit() instead of the interactive-only exit() helper from `site`.
    sys.exit()
5. 模型訓練
#-*- coding: utf-8 -*-
# @Describe:
# @File : word2vec-model.py
import logging
import os.path
import sys
import multiprocessing
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import time
# Wall-clock start time, printed next to the end time when the script finishes.
begin = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

if __name__ == '__main__':
    # Train a Word2Vec model on the segmented corpus and save it twice:
    # once in gensim's own format and once as a plain-text vector file.
    script_name = os.path.basename(sys.argv[0])
    logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s')
    logging.getLogger().setLevel(logging.INFO)
    logger = logging.getLogger(script_name)
    logger.info("running %s", ' '.join(sys.argv))

    # Input and output locations are hard-coded; command-line arguments are
    # only logged, not used. The original also kept a commented-out
    # alternative input, "D:/soft/opencc-1.0.1-win64/wiki-jieba-test.txt".
    corpus_path = "D:/soft/opencc-1.0.1-win64/wiki.jieba.txt"
    model_path = 'D:/soft/opencc-1.0.1-win64/wiki.model'
    vector_path = 'D:/soft/opencc-1.0.1-win64/wiki.vector'

    sentences = LineSentence(corpus_path)
    worker_count = multiprocessing.cpu_count()
    # NOTE(review): `size` is the gensim 3.x keyword; gensim 4 is documented
    # to have renamed it to `vector_size` -- confirm the installed version
    # before upgrading.
    model = Word2Vec(
        sentences,
        size=400,
        window=5,
        min_count=5,
        workers=worker_count,
    )
    model.save(model_path)
    model.wv.save_word2vec_format(vector_path, binary=False)

    end = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print("begin", begin)
    print("end ", end)
#python word2vec-model.py txt model wiki.zh.text.vector
#opencc -i wiki_text.txt -o test.txt -c t2s.json
6. 測試
#-*- coding: utf-8 -*-
# @Describe:
# @File : test-model.py
from gensim.models import Word2Vec
import time
# Wall-clock start time, printed next to the end time when the script finishes.
begin = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
model = Word2Vec.load('D:/soft/opencc-1.0.1-win64/wiki.model')

# Second-level category names compared against `word`. The original left this
# list commented out and then iterated over an undefined name (NameError).
category_words = ['日用百貨', '收納整理', '家紡', '家庭清潔', '綠植園藝', '廚房用品']
word = '被子'
for category in category_words:
    # n_similarity compares two collections of vocabulary entries. The
    # original passed bare strings, which are iterated character by
    # character; list() keeps that behaviour but makes it explicit.
    # NOTE(review): if whole-word similarity was intended, use
    # model.wv.similarity(word, category) instead -- that raises KeyError
    # for words that are not in the vocabulary.
    sim = model.wv.n_similarity(list(word), list(category))
    print(category, sim)

# Nearest neighbours for a handful of everyday words. The original queried an
# undefined `en_wiki_word2vec_model`; the loaded `model` is the one intended.
testwords = ['蘋果', '數學', '學術', '白癡', '籃球']
for testword in testwords:
    res = model.wv.most_similar(testword)
    print(testword)
    print(res)

print(model.wv.most_similar(word))
end = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
print("begin", begin)
print("end ", end)
# 收納整理 0.16833255
# 家紡 0.14426242
# 家庭清潔 0.066685855
# 綠植園藝 0.028275765
# 廚房用品 0.2936325
# 蘋果
# [('apple', 0.5410169363021851), ('蘋果公司', 0.4918888807296753), ('咬一口', 0.4741284251213074), ('洋蔥', 0.4696866571903229), ('冰淇淋', 0.4614587426185608), ('蘋果電腦', 0.45998817682266235), ('黑莓', 0.4557930827140808), ('水果', 0.4546721577644348), ('iphone', 0.44593721628189087), ('草莓', 0.4437388479709625)]
# 數學
# [('微積分', 0.7083343267440796), ('算術', 0.6934097409248352), ('數學分析', 0.663016140460968), ('機率論', 0.6389687061309814), ('數論', 0.6296793222427368), ('邏輯學', 0.6191371083259583), ('幾何學', 0.60764479637146), ('數理邏輯', 0.5989662408828735), ('實體', 0.5965093970298767), ('高等數學', 0.5895018577575684)]
# 學術
# [('學術研究', 0.7319201231002808), ('漢學', 0.5988526344299316), ('學術活動', 0.5887891054153442), ('科學研究', 0.5864561796188354), ('學術界', 0.5863242149353027), ('教學研究', 0.5767545700073242), ('教研', 0.5732147097587585), ('學術交流', 0.561274528503418), ('科研', 0.5595779418945312), ('醫學教育', 0.5571168661117554)]
# 白癡
# [('瘋子', 0.5986206531524658), ('書呆子', 0.5612877607345581), ('騙子', 0.538498044013977), ('怪胎', 0.5305827856063843), ('愛哭鬼', 0.5293511152267456), ('傻子', 0.5216787457466125), ('自戀', 0.5185167789459229), ('變态', 0.5165976285934448), ('自以為是', 0.516464114189148), ('蠢', 0.5106762051582336)]
# 籃球
# [('美式足球', 0.633753776550293), ('橄榄球', 0.6222437620162964), ('排球', 0.5964736938476562), ('棒球', 0.5949814319610596), ('男子籃球', 0.5927262306213379), ('冰球', 0.591292142868042), ('籃球員', 0.5610231161117554), ('籃球運動', 0.5576823353767395), ('足球', 0.5409365892410278), ('橄榄球隊', 0.5348620414733887)]