基于機率論的分類方法:樸素貝葉斯
優點:在資料較少的情況下仍然有效,可以處理多類别問題。
缺點:對于輸入資料的準備方式較為敏感。
适用資料類型:标稱型資料。
使用條件機率進行分類。
# -*- coding: utf-8 -*-
from numpy import *
# --Step1--
# 1> loadDataSet(): load the toy training sample
def loadDataSet():
    """Return the hard-coded example corpus together with its class labels.

    Returns:
        A ``(documents, labels)`` tuple. ``documents`` is a list of six
        tokenised posts (each a list of words) and ``labels`` is the
        parallel list of class labels, where 1 marks an abusive post and
        0 marks normal speech.
    """
    raw_posts = (
        'my dog has flea problems help please',      # 7 words
        'maybe not take him to dog park stupid',     # 8 words
        'my dalmation is so cute I love him',        # 8 words
        'stop posting stupid worthless garbage',     # 5 words
        'mr licks ate my steak how to stop him',     # 9 words
        'quit buying worthless dog food stupid',     # 6 words
    )
    # Splitting on whitespace yields exactly one token list per post.
    documents = [post.split() for post in raw_posts]
    labels = [0, 1, 0, 1, 0, 1]  # 1 = abusive text, 0 = normal speech
    return documents, labels
# 2> createVocabList(dataSet): build the vocabulary, i.e. the list of
#    distinct words that occur anywhere in the documents
def createVocabList(dataSet):
    """Return a list holding every distinct word found in ``dataSet``.

    Args:
        dataSet: iterable of documents, each an iterable of hashable words.

    Returns:
        A list with one entry per unique word. The order comes from set
        iteration and is therefore arbitrary.
    """
    # One union over all documents collapses duplicates across the corpus.
    return list(set().union(*dataSet))
# 3> setOfWords2Vec(vocabList, inputSet): build a document vector whose
#    elements are 1 (word present) or 0 (word absent).
#    Set-of-words model: every word counts at most once.
def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a 0/1 presence vector over ``vocabList``.

    Args:
        vocabList: list of vocabulary words; position ``i`` of the result
            corresponds to ``vocabList[i]``. Words must be hashable.
        inputSet: iterable of words making up the document.

    Returns:
        A list of ``len(vocabList)`` ints, 1 where the vocabulary word
        occurs in ``inputSet`` and 0 elsewhere.

    Words missing from the vocabulary are reported on stdout and skipped.
    """
    # Build the word -> position map once so each lookup is O(1) instead of
    # scanning the vocabulary twice per word. setdefault keeps the FIRST
    # position of a repeated word, matching list.index().
    position_of = {}
    for position, vocab_word in enumerate(vocabList):
        position_of.setdefault(vocab_word, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = position_of.get(word)
        if position is None:
            # Single-argument parenthesised print runs on Python 2 and 3.
            print("the word: %s is not in my Vocabulary!" % word)
        else:
            returnVec[position] = 1
    return returnVec
# 4> bagOfWords2Vec(vocabList, inputSet): build a document vector of counts.
#    Bag-of-words model: a word may be counted more than once.
def bagOfWords2Vec(vocabList, inputSet):
    """Encode a document as a vector of word counts over ``vocabList``.

    Args:
        vocabList: list of vocabulary words; position ``i`` of the result
            corresponds to ``vocabList[i]``. Words must be hashable.
        inputSet: iterable of words making up the document.

    Returns:
        A list of ``len(vocabList)`` ints giving how many times each
        vocabulary word occurs in ``inputSet``.

    Words missing from the vocabulary are reported on stdout and skipped.
    """
    # Build the word -> position map once so each lookup is O(1) instead of
    # scanning the vocabulary twice per word. setdefault keeps the FIRST
    # position of a repeated word, matching list.index().
    position_of = {}
    for position, vocab_word in enumerate(vocabList):
        position_of.setdefault(vocab_word, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = position_of.get(word)
        if position is None:
            # Single-argument parenthesised print runs on Python 2 and 3.
            print("the word: %s is not in my Vocabulary!" % word)
        else:
            # Incrementing rather than setting to 1 is the only difference
            # from the set-of-words encoding.
            returnVec[position] += 1
    return returnVec
# --Step2--
# trainMatrix   : document matrix, one document vector per row
# trainCategory : vector of per-document class labels
#                 (0: normal speech, 1: abusive speech)
def trainNB0(trainMatrix, trainCategory):
    """Estimate the naive Bayes parameters from labelled document vectors.

    Args:
        trainMatrix: sequence of equal-length document vectors (rows).
        trainCategory: sequence of labels parallel to ``trainMatrix``; a
            label equal to 1 goes to class 1, anything else to class 0.

    Returns:
        ``(p0Vect, p1Vect, pAbusive)``: the per-word log conditional
        probabilities for class 0 and class 1, and
        ``sum(trainCategory) / len(trainMatrix)`` (the class-1 prior when
        labels are 0/1).
    """
    doc_count = len(trainMatrix)        # number of rows
    vocab_size = len(trainMatrix[0])    # number of columns
    pAbusive = sum(trainCategory) / float(doc_count)

    # Numerators start at 1 and denominators at 2.0 so that a word never
    # seen in a class does not get a conditional probability of zero.
    word_totals = {0: ones(vocab_size), 1: ones(vocab_size)}
    class_totals = {0: 2.0, 1: 2.0}

    for row_idx in range(doc_count):
        label = 1 if trainCategory[row_idx] == 1 else 0
        row = trainMatrix[row_idx]
        word_totals[label] += row           # element-wise vector addition
        class_totals[label] += sum(row)     # running total for the class

    # Logs are taken so that later products of many small probabilities
    # become sums and do not underflow.
    p1Vect = log(word_totals[1] / class_totals[1])
    p0Vect = log(word_totals[0] / class_totals[0])
    return p0Vect, p1Vect, pAbusive
# --Step3--
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Pick the more likely class for a document vector.

    Args:
        vec2Classify: document vector to classify.
        p0Vec: per-word log probabilities for class 0.
        p1Vec: per-word log probabilities for class 1.
        pClass1: prior probability of class 1.

    Returns:
        1 when the class-1 log score is strictly greater than the class-0
        log score, otherwise 0 (ties go to class 0).
    """
    # Element-wise product then sum, plus the log prior of each class.
    score_class1 = sum(vec2Classify * p1Vec) + log(pClass1)
    score_class0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    return 1 if score_class1 > score_class0 else 0
def testingNB():
    """Train on the sample corpus and print the class of two test posts.

    Builds the vocabulary and set-of-words training matrix from
    ``loadDataSet()``, trains with ``trainNB0`` and then classifies
    ``['love', 'my', 'dalmation']`` and ``['stupid', 'garbage']``, printing
    one line per test post.
    """
    posts, labels = loadDataSet()
    vocabulary = createVocabList(posts)
    training_rows = [setOfWords2Vec(vocabulary, post) for post in posts]
    p0V, p1V, pAb = trainNB0(array(training_rows), array(labels))

    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = array(setOfWords2Vec(vocabulary, testEntry))
        verdict = classifyNB(thisDoc, p0V, p1V, pAb)
        # Joining the three pieces with single spaces gives the same text
        # as a comma-separated print of them.
        print(' '.join([str(testEntry), 'classified as: ', str(verdict)]))