Andrew Ng Machine Learning ex6: Python Implementation

Support Vector Machines

In this exercise, we use support vector machines (SVMs), including SVMs with Gaussian kernels, to build a spam classifier.

Dataset 1

We start by experimenting on a 2D dataset.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from scipy.io import loadmat
           
raw_data = loadmat('ex6data1.mat')
data=pd.DataFrame(raw_data.get('X'),columns=['X1','X2'])
data['y'] = raw_data.get('y')
data.head()
           
       X1      X2  y
0  1.9643  4.5957  1
1  2.2753  3.8589  1
2  2.9781  4.5651  1
3  2.9320  3.5519  1
4  3.5772  2.8560  1
# Visualize the raw data
def plot_init_data(data,fig,ax):
    positive = data[data['y']==1]
    negative = data[data['y']==0]
    ax.scatter(positive['X1'],positive['X2'],s=50,marker='o',c='r',label='positive')
    ax.scatter(negative['X1'],negative['X2'],s=50,marker='x',c='b',label='negative')
           
fig,ax = plt.subplots(figsize=(12,8))
plot_init_data(data,fig,ax)
ax.legend()
plt.show()
           
[Figure: scatter plot of the dataset 1 training data]

We can see an outlier in the upper left, but the data as a whole still falls into two linearly separable groups, so we can use a linear SVM to learn the class boundary.

Set C = 1. Here C controls the penalty for misclassified training examples; a large C tells the SVM to try to classify every example correctly, playing a role similar to 1/λ in regularized regression.

from sklearn import svm
svc = svm.LinearSVC(C=1,loss='hinge',max_iter=1000)
svc
           
LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='hinge', max_iter=1000, multi_class='ovr',
          penalty='l2', random_state=None, tol=0.0001, verbose=0)
           
svc.fit(data[['X1','X2']],data['y'])
svc.score(data[['X1','X2']],data['y'])
           
0.9803921568627451
           
# Visualize the decision boundary by brute force: evaluate the decision
# function on a dense grid and keep the points where it is close to zero
def find_decision_boundary(svc,x1min,x2min,x1max,x2max,diff):
    x1 = np.linspace(x1min,x1max,1000)
    x2 = np.linspace(x2min,x2max,1000)

    coordinates = [(x,y) for x in x1 for y in x2]
    x_cord, y_cord = zip(*coordinates)  # *coordinates unpacks the list of pairs into two tuples
    c_val = pd.DataFrame({'x1':x_cord,'x2':y_cord})
    c_val['cval'] = svc.decision_function(c_val[['x1','x2']])

    decision = c_val[np.abs(c_val['cval'])<diff]

    return decision.x1,decision.x2
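As an aside (not part of the original exercise), a cheaper way to draw the boundary is to evaluate the decision function on a meshgrid and plot its zero level set with matplotlib's contour. A minimal sketch, reusing the numpy/matplotlib imports above; plot_decision_boundary is a hypothetical helper name:

def plot_decision_boundary(svc, ax, x1min, x1max, x2min, x2max):
    # evaluate the decision function on a dense grid
    xx1, xx2 = np.meshgrid(np.linspace(x1min, x1max, 500),
                           np.linspace(x2min, x2max, 500))
    zz = svc.decision_function(np.c_[xx1.ravel(), xx2.ravel()]).reshape(xx1.shape)
    # the decision boundary is the zero level set of the decision function
    ax.contour(xx1, xx2, zz, levels=[0], colors='r')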
           
x1,x2 = find_decision_boundary(svc,0,1.5,4,5,2*10**-3)
fig,ax = plt.subplots(figsize=(12,8))
ax.scatter(x1,x2,s=10,c='r',label='bound')
plot_init_data(data,fig,ax)
ax.set_title('SVM(C=1) decision boundary')
ax.legend()
plt.show()
           
[Figure: SVM (C=1) decision boundary]
# Increase C (i.e. weaken regularization) and observe the change: the SVM now
# tries to fit the outlier as well (note LinearSVC may warn about convergence
# at max_iter=1000, which is why the training accuracy can actually drop)
svc2 = svm.LinearSVC(C=100,loss='hinge',max_iter=1000)
svc2.fit(data[['X1','X2']],data['y'])
svc2.score(data[['X1','X2']],data['y'])
           
0.9411764705882353
           
x1,x2 = find_decision_boundary(svc2,0,1.5,4,5,2*10**-3)
fig,ax = plt.subplots(figsize=(12,8))
ax.scatter(x1,x2,s=10,c='r',label='bound')
plot_init_data(data,fig,ax)
ax.set_title('SVM(C=100) decision boundary')
ax.legend()
plt.show()
           
[Figure: SVM (C=100) decision boundary]

SVM with a Gaussian Kernel

To better understand nonlinear SVM classification, we first implement the Gaussian kernel from scratch rather than using the version already built into sklearn.

The Gaussian kernel

The Gaussian kernel measures the "similarity" between a pair of examples as a function of their distance. It has one parameter, $\sigma$, which determines how quickly the similarity falls off to 0 as the examples move apart (the larger $\sigma$, the slower the fall-off).
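Concretely, the kernel the code below implements is

$$K(x^{(i)},x^{(j)}) = \exp\left(-\frac{\lVert x^{(i)}-x^{(j)}\rVert^2}{2\sigma^2}\right)$$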

def gaussian_kernel(x1,x2,sigma):
    # exp of the negative squared Euclidean distance, scaled by 2*sigma^2
    return np.exp(-(np.sum((x1-x2)**2)/(2*(sigma**2))))
           
x1 = np.array([1,2,1])
x2 = np.array([0,4,-1])
sigma = 2

gaussian_kernel(x1,x2,sigma)
           
0.32465246735834974
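As a quick sanity check (not part of the original exercise), this agrees with sklearn's built-in RBF kernel, which is parameterized as exp(-gamma*||x1-x2||^2), i.e. gamma = 1/(2*sigma^2):

from sklearn.metrics.pairwise import rbf_kernel
rbf_kernel(x1.reshape(1,-1), x2.reshape(1,-1), gamma=1/(2*sigma**2))  # array([[0.32465247]])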
           

Dataset 2

On this dataset we build a nonlinear classifier with a Gaussian kernel, this time calling the svm package from sklearn directly.

raw_data = loadmat('ex6data2.mat')
data = pd.DataFrame(raw_data['X'],columns=['X1','X2'])
data['y'] = raw_data['y']

fig,ax = plt.subplots(figsize=(12,8))
plot_init_data(data,fig,ax)
ax.legend()
plt.show()
           
[Figure: scatter plot of the dataset 2 training data]
svc = svm.SVC(C=100,gamma=10,probability=True)
svc
           
SVC(C=100, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=10, kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)
           
svc.fit(data[['X1','X2']],data['y'])
svc.score(data[['X1','X2']],data['y'])
           
0.9698725376593279
           
x1,x2 = find_decision_boundary(svc,0,0.4,1,1,0.01)
fig,ax = plt.subplots(figsize=(12,8))
plot_init_data(data,fig,ax)
ax.scatter(x1,x2,s=10,c='g')
plt.show()
           
[Figure: Gaussian-kernel SVM decision boundary on dataset 2]

Dataset 3

In this dataset, X and y are already split into a training set and a validation set, so we only need to find the optimal hyperparameters: the best $C$ and $\sigma$. (sklearn parameterizes the RBF kernel by gamma = 1/(2$\sigma^2$), so we search over gamma instead of $\sigma$.) Both are drawn from the candidate values [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100].

raw_data = loadmat('ex6data3.mat')

X = raw_data['X']
Xval = raw_data['Xval']
y = raw_data['y'].ravel()        # flatten (m,1) column vectors to 1d, as sklearn expects
yval = raw_data['yval'].ravel()

fig,ax = plt.subplots(figsize=(12,8))
data = pd.DataFrame(raw_data.get('X'),columns=['X1','X2'])
data['y'] = raw_data.get('y')

plot_init_data(data,fig,ax)
ax.legend()
plt.show()


           
[Figure: scatter plot of the dataset 3 training data]
C_candidates = [0.01,0.03,0.1,0.3,1,3,10,30,100]
gamma_candidates = [0.01,0.03,0.1,0.3,1,3,10,30,100]

best_score = 0
finalC = 0
finalgamma = 0

for C in C_candidates:
    for gamma in gamma_candidates:
        svc = svm.SVC(C=C,gamma=gamma)
        svc.fit(X,y)
        score = svc.score(Xval,yval)
        
        if score>best_score:
            best_score = score
            finalC = C
            finalgamma = gamma

best_score,finalC,finalgamma
            
           
(0.965, 0.3, 100)
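For reference (not part of the original exercise), sklearn can run this search for us. Because we want to score on the given validation set rather than on cross-validation folds, a PredefinedSplit can pin the split; a minimal sketch, assuming X, y, Xval, yval as loaded above:

from sklearn.model_selection import GridSearchCV, PredefinedSplit

X_all = np.concatenate([X, Xval])
y_all = np.concatenate([y, yval])
fold = np.concatenate([np.full(len(X), -1), np.zeros(len(Xval), dtype=int)])  # -1 = always train, 0 = validation fold
search = GridSearchCV(svm.SVC(), {'C': C_candidates, 'gamma': gamma_candidates},
                      cv=PredefinedSplit(fold))
search.fit(X_all, y_all)
search.best_params_, search.best_score_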
           
svc = svm.SVC(C=finalC,gamma=finalgamma)
svc.fit(X,y)

x1,x2 = find_decision_boundary(svc,-0.6,-0.7,0.3,0.6,0.005)
fig,ax = plt.subplots(figsize=(12,8))
plot_init_data(data,fig,ax)
ax.scatter(x1,x2,s=10,c='g')
ax.legend()
plt.show()
           
[Figure: decision boundary of the best SVM on dataset 3]

Spam Classification

We now build a spam classifier by training an SVM.

Training the SVM

spam_train = loadmat('spamTrain.mat')
spam_test = loadmat('spamTest.mat')

spam_train
           
{'X': array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 1, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
 '__globals__': [],
 '__header__': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Sun Nov 13 14:27:25 2011',
 '__version__': '1.0',
 'y': array([[1],
        [1],
        [0],
        ...,
        [1],
        [0],
        [0]], dtype=uint8)}
           
X = spam_train['X']
Xtest = spam_test['Xtest']
y = spam_train['y'].ravel()
ytest = spam_test['ytest'].ravel()

X.shape,y.shape,Xtest.shape,ytest.shape
           
((4000, 1899), (4000,), (1000, 1899), (1000,))
           

We can think of each document as a vector now: the 1899 dimensions correspond to the 1899 words in the vocabulary, and each entry is a 0/1 flag indicating whether the document contains that word.

svc = svm.SVC()
svc.fit(X,y)
           
SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
           
svc.score(X,y), svc.score(Xtest,ytest)

(0.99325, 0.987)
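Accuracy alone can be misleading for spam filtering, so it is worth also looking at precision and recall on the test set (not part of the original exercise):

from sklearn.metrics import classification_report
print(classification_report(ytest, svc.predict(Xtest), target_names=['ham','spam']))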
           

Visualizing the results

To see which words the classifier treats as spam indicators, we can feed each row of a 1899×1899 identity matrix through the decision function: row i is a "document" that contains only word i.

rang = np.eye(1899)
spam_val = pd.DataFrame({'idx': range(1899)})  # spam_val was undefined; create it here
spam_val['isspam'] = svc.decision_function(rang)
spam_val['isspam'].describe()
           
count    1899.000000
mean       -0.110039
std         0.049094
min        -0.428396
25%        -0.131213
50%        -0.111985
75%        -0.091973
max         0.396286
Name: isspam, dtype: float64
           
decision = spam_val[spam_val['isspam']>0]
decision
           
idx isspam
155 155 0.095529
173 173 0.066666
297 297 0.396286
351 351 0.023785
382 382 0.030317
476 476 0.042474
478 478 0.057344
529 529 0.060692
537 537 0.008558
680 680 0.109643
697 697 0.003269
738 738 0.092561
774 774 0.181496
791 791 0.040396
1008 1008 0.012187
1088 1088 0.132633
1101 1101 0.002832
1120 1120 0.003076
1163 1163 0.072045
1178 1178 0.012122
1182 1182 0.015656
1190 1190 0.232788
1263 1263 0.160806
1298 1298 0.044018
1372 1372 0.019640
1397 1397 0.218337
1399 1399 0.018762
1460 1460 0.001859
1467 1467 0.002822
1519 1519 0.001654
1661 1661 0.003775
1721 1721 0.057241
1740 1740 0.034107
1795 1795 0.125143
1823 1823 0.002071
1829 1829 0.002630
1851 1851 0.030662
1892 1892 0.052786
1894 1894 0.101613
path = 'vocab.txt'
vocab = pd.read_csv(path,header=None,names=['idx','vocabulary'],sep='\t')
vocab.head()
           
   idx vocabulary
0    1         aa
1    2         ab
2    3       abil
3    4        abl
4    5      about
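This vocabulary is also what we would use to encode a new email into the 1899-dimensional feature space. A hypothetical sketch, assuming the email has already been tokenized and stemmed as in the exercise's preprocessing step (word2idx and email_to_vector are illustrative names, not part of the exercise):

word2idx = {w: i for i, w in enumerate(vocab['vocabulary'])}  # 0-based feature index per word

def email_to_vector(words):
    # build the same 0/1 bag-of-words representation the classifier was trained on
    vec = np.zeros(1899)
    for w in words:
        if w in word2idx:
            vec[word2idx[w]] = 1
    return vec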
# Map the spam-indicative feature indices back to vocabulary words
# (vocab's row index is 0-based, so .loc with the 0-based feature indices lines up)
spamvocabulary = vocab.loc[list(decision['idx'])]
spamvocabulary
           
idx vocabulary
155 156 basenumb
173 174 below
297 298 click
351 352 contact
382 383 credit
476 477 dollar
478 479 dollarnumb
529 530 email
537 538 encod
680 681 free
697 698 futur
738 739 guarante
774 775 here
791 792 hour
1008 1009 market
1088 1089 nbsp
1101 1102 nextpart
1120 1121 numbera
1163 1164 offer
1178 1179 opt
1182 1183 order
1190 1191 our
1263 1264 pleas
1298 1299 price
1372 1373 receiv
1397 1398 remov
1399 1400 repli
1460 1461 se
1467 1468 see
1519 1520 sincer
1661 1662 text
1721 1722 transfer
1740 1741 type
1795 1796 visit
1823 1824 websit
1829 1830 welcom
1851 1852 will
1892 1893 you
1894 1895 your
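A closing note on the one-hot trick above: with an RBF kernel there is no explicit weight vector, which is why we scored one-word "documents" instead. If we trained with a linear kernel, the per-word weights could be read straight from the model; a sketch under that assumption (not part of the exercise):

linear_svc = svm.LinearSVC()
linear_svc.fit(X, y)
top15 = np.argsort(linear_svc.coef_.ravel())[-15:]  # indices of the most spam-indicative words
vocab.loc[top15, 'vocabulary']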
