
Using the Naive Bayes Algorithm in Python

Tools: PyCharm, Windows 10, Python 3.6.4

1. Problem Requirements

Use the Naive Bayes algorithm to predict the categories of the unlabeled documents in the following data.

Document    Content                          Category
d1          ball goal cart goal              Sports
d2          theater cart drama               Culture
d3          drama strategy decision drama    Politics
d4          theater ball                     Culture
d5          ball goal player strategy        Sports
d6          theater cart opera               Culture
d7          ball player strategy             ?
d8          theater cart decision            ?
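For reference, the classifier built below follows the standard multinomial Naive Bayes decision rule in log form; the symbols here are mine: n_i is how many times word w_i occurs in the test document, P(c) is the class prior, and P(w_i | c) is the per-class word probability.

\hat{c} = \arg\max_{c \in \{\text{Sports},\, \text{Culture},\, \text{Politics}\}} \Big[ \log P(c) + \sum_i n_i \log P(w_i \mid c) \Big]

The priors come from the class frequencies among d1 to d6, and the word probabilities from the smoothed counts described before the final listing.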

2. Python Code

There are three categories: Culture, Politics, and Sports. We create one folder per category and store the corresponding Content in it, so that each document can be labeled conveniently while the files are traversed (a hypothetical setup sketch follows the listing below). First, read in the documents and their class labels; the vocabulary list is built in the next step. The code is as follows:

import re
import numpy as np
import os


def textParse(String):
    # split on runs of non-word characters and drop the empty tokens at the ends
    list_String = [tok for tok in re.split(r'\W+', String) if tok]
    return list_String


def readfiles():
    doc_list = []
    class_list = []
    file_lists = ['culture', 'politics', 'sports']
    for i in range(3):
        for txtfile in os.listdir(file_lists[i] + '/'):
            with open(file_lists[i] + '/' + txtfile, 'r') as f:
                word_list = textParse(f.read())
                doc_list.append(list(word_list))
                class_list.append(i + 1)  # labels: 1=culture, 2=politics, 3=sports
    # vocab_list = createVocabList(doc_list)
    return doc_list, class_list
if __name__ == '__main__':
    doc_list, class_list = readfiles()
    print(doc_list)
    print(class_list)
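The post does not show the directory layout itself, so the sketch below is a hypothetical setup script that writes the six training documents from the table into the culture/, politics/ and sports/ folders. The file names (d1.txt and so on) are assumptions; readfiles only cares about which folder a file sits in.

import os

# Hypothetical setup: the folder names must match file_lists in readfiles();
# the individual file names are arbitrary.
training_docs = {
    'culture': {'d2.txt': 'theater cart drama',
                'd4.txt': 'theater ball',
                'd6.txt': 'theater cart opera'},
    'politics': {'d3.txt': 'drama strategy decision drama'},
    'sports': {'d1.txt': 'ball goal cart goal',
               'd5.txt': 'ball goal player strategy'},
}
for folder, files in training_docs.items():
    os.makedirs(folder, exist_ok=True)
    for name, content in files.items():
        with open(os.path.join(folder, name), 'w') as f:
            f.write(content)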
           

Next, build the vocabulary list and use it to convert the tokenized documents into word-count vectors; the code is as follows (a small worked illustration comes after the listing):

import re
import numpy as np
import os


def textParse(String):
    # split on runs of non-word characters and drop the empty tokens at the ends
    list_String = [tok for tok in re.split(r'\W+', String) if tok]
    return list_String


def readfiles():
    doc_list = []
    class_list = []
    file_lists = ['culture', 'politics', 'sports']
    for i in range(3):
        for txtfile in os.listdir(file_lists[i] + '/'):
            with open(file_lists[i] + '/' + txtfile, 'r') as f:
                word_list = textParse(f.read())
                doc_list.append(list(word_list))
                class_list.append(i + 1)  # labels: 1=culture, 2=politics, 3=sports
    # vocab_list = createVocabList(doc_list)
    return doc_list, class_list
def createVocabList(dataSet):
    # the vocabulary is the set of all distinct words across the documents
    vocabSet = set()
    for document in dataSet:
        vocabSet = vocabSet | set(document)
    return list(vocabSet)


def setWords2Vec(vocablist, inputSet):
    # build a bag-of-words count vector over the vocabulary
    returnVec = [0] * len(vocablist)
    for word in inputSet:
        if word in vocablist:
            returnVec[vocablist.index(word)] += 1
    return returnVec
if __name__ == '__main__':
    doc_list, class_list = readfiles()
    vocab_list = createVocabList(doc_list)
    trainingSet = list(range(6))  # indices of the six labeled documents d1-d6
    trainMat = []
    trainLabel = []
    # print(doc_list[1])
    for docIndex in trainingSet:
        trainMat.append(setWords2Vec(vocab_list, doc_list[docIndex]))
        trainLabel.append(class_list[docIndex])
    print(trainMat)
    print(trainLabel)
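Note that setWords2Vec increments a counter for every occurrence, so it actually builds a bag-of-words count vector rather than a set-of-words indicator vector. A minimal illustration, using a hypothetical fixed vocabulary order (the real vocab_list order depends on Python's set iteration and will differ between runs):

# Hypothetical vocabulary order, for illustration only.
vocab = ['ball', 'goal', 'cart', 'theater', 'drama', 'strategy', 'decision', 'player', 'opera']
doc = ['ball', 'goal', 'cart', 'goal']      # d1
vec = [0] * len(vocab)
for word in doc:
    if word in vocab:
        vec[vocab.index(word)] += 1
print(vec)  # [1, 2, 1, 0, 0, 0, 0, 0, 0] -- 'goal' is counted twice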
           

Now we can classify with Bayes' theorem, but we have to guard against the zero-probability problem: if a word never appears in some class, its estimated probability would be 0 and wipe out the whole product. We therefore initialize every word count to 1 and every per-class denominator to 2, a form of Laplace smoothing.
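In symbols, the smoothed estimate that train below computes for each word w and class c is the following (this mirrors the code's initialization of counts to 1 and denominators to 2; textbook Laplace smoothing would add the vocabulary size |V| to the denominator instead of 2):

P(w \mid c) = \frac{\mathrm{count}(w, c) + 1}{\sum_{w'} \mathrm{count}(w', c) + 2}

The full training and classification code is as follows: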

import re
import numpy as np
import os


def textParse(String):
    # split on runs of non-word characters and drop the empty tokens at the ends
    list_String = [tok for tok in re.split(r'\W+', String) if tok]
    return list_String


def readfiles():
    doc_list = []
    class_list = []
    file_lists = ['culture', 'politics', 'sports']
    for i in range(3):
        for txtfile in os.listdir(file_lists[i] + '/'):
            with open(file_lists[i] + '/' + txtfile, 'r') as f:
                word_list = textParse(f.read())
                doc_list.append(list(word_list))
                class_list.append(i + 1)  # labels: 1=culture, 2=politics, 3=sports
    # vocab_list = createVocabList(doc_list)
    return doc_list, class_list
def createVocabList(dataSet):
    # the vocabulary is the set of all distinct words across the documents
    vocabSet = set()
    for document in dataSet:
        vocabSet = vocabSet | set(document)
    return list(vocabSet)


def setWords2Vec(vocablist, inputSet):
    # build a bag-of-words count vector over the vocabulary
    returnVec = [0] * len(vocablist)
    for word in inputSet:
        if word in vocablist:
            returnVec[vocablist.index(word)] += 1
    return returnVec

def train(trainMatrix, trainCategory):
    num_train = len(trainMatrix)
    num_words = len(trainMatrix[0])
    # class priors P(c), estimated from the label frequencies
    p_culture = list(trainCategory).count(1) / float(num_train)
    p_politics = list(trainCategory).count(2) / float(num_train)
    p_sports = list(trainCategory).count(3) / float(num_train)
    # Laplace smoothing: word counts start at 1 and the denominators at 2
    p_culture_Num = np.ones(num_words)
    p_politics_Num = np.ones(num_words)
    p_sports_Num = np.ones(num_words)
    p_culture_la = 2.0
    p_politics_la = 2.0
    p_sports_la = 2.0
    for i in range(num_train):
        # accumulate per-class word counts and per-class total word counts
        if trainCategory[i] == 1:
            p_culture_Num += trainMatrix[i]
            p_culture_la += sum(trainMatrix[i])
        if trainCategory[i] == 2:
            p_politics_Num += trainMatrix[i]
            p_politics_la += sum(trainMatrix[i])
        if trainCategory[i] == 3:
            p_sports_Num += trainMatrix[i]
            p_sports_la += sum(trainMatrix[i])
    # take logs so that the product of many small probabilities does not underflow
    p_culture_vect = np.log(p_culture_Num / p_culture_la)
    p_politics_vect = np.log(p_politics_Num / p_politics_la)
    p_sports_vect = np.log(p_sports_Num / p_sports_la)
    return p_culture_vect, p_politics_vect, p_sports_vect, p_culture, p_politics, p_sports

def classify(vec, p_culture_vect, p_politics_vect, p_sports_vect, p_culture, p_politics, p_sports):
    # log-posterior (up to a shared constant): sum of count * log P(w|c) plus the log prior
    p1 = sum(vec * p_culture_vect) + np.log(p_culture)
    p2 = sum(vec * p_politics_vect) + np.log(p_politics)
    p3 = sum(vec * p_sports_vect) + np.log(p_sports)
    if p1 > p2 and p1 > p3:
        return 'culture'
    if p2 > p1 and p2 > p3:
        return 'politics'
    if p3 > p2 and p3 > p1:
        return 'sports'

if __name__ == '__main__':
    doc_list, class_list = readfiles()
    vocab_list = createVocabList(doc_list)
    trainingSet = list(range(6))  # indices of the six labeled documents d1-d6
    trainMat = []
    trainLabel = []
    # print(doc_list[1])
    for docIndex in trainingSet:
        trainMat.append(setWords2Vec(vocab_list, doc_list[docIndex]))
        trainLabel.append(class_list[docIndex])
    p_culture_vect, p_politics_vect, p_sports_vect, p_culture, p_politics, p_sports = train(np.array(trainMat),
                                                                                            np.array(trainLabel))
    testSet = [['ball', 'player', 'strategy'], ['theater', 'cart', 'decision']]  # d7 and d8
    for i in range(2):
        wordVector = setWords2Vec(vocab_list, testSet[i])
        print(classify(np.array(wordVector), p_culture_vect, p_politics_vect, p_sports_vect, p_culture, p_politics,
                       p_sports))
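As a sanity check, the posteriors for d7 (ball player strategy) can be worked out by hand with the same +1/+2 smoothing; the Sports, Culture and Politics classes contribute 8, 8 and 4 training words respectively, so:

P(\text{Sports} \mid d_7) \propto \tfrac{2}{6} \cdot \tfrac{3}{10} \cdot \tfrac{2}{10} \cdot \tfrac{2}{10} = 0.004
P(\text{Culture} \mid d_7) \propto \tfrac{3}{6} \cdot \tfrac{2}{10} \cdot \tfrac{1}{10} \cdot \tfrac{1}{10} = 0.001
P(\text{Politics} \mid d_7) \propto \tfrac{1}{6} \cdot \tfrac{1}{6} \cdot \tfrac{1}{6} \cdot \tfrac{2}{6} \approx 0.0015

so d7 should be labeled Sports, and the analogous calculation assigns d8 (theater cart decision) to Culture, which is what classify should print.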

           