
Machine Learning Project in Practice: Titanic Survival Prediction

import pandas
titanic = pandas.read_csv("D:\\test\\titanic_train.csv")
# Basic descriptive statistics
print titanic.describe()  # std is the standard deviation; the Age column contains missing values
PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000         NaN    0.000000   
50%     446.000000    0.000000    3.000000         NaN    0.000000   
75%     668.500000    1.000000    3.000000         NaN    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

            Parch        Fare  
count  891.000000  891.000000  
mean     0.381594   32.204208  
std      0.806057   49.693429  
min      0.000000    0.000000  
25%      0.000000    7.910400  
50%      0.000000   14.454200  
75%      0.000000   31.000000  
max      6.000000  512.329200  


C:\Users\qiujiahao\Anaconda2\lib\site-packages\numpy\lib\function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile
  RuntimeWarning)
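# (Not in the original post) A quick check of which columns actually contain missing values;
# in this training set Age, Cabin and Embarked have NaNs, which is why the Age quartiles
# above come out as NaN.
print titanic.isnull().sum()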
      
# Preprocess the data
# Most algorithms are matrix computations and cannot handle missing values,
# so fill the missing ages with the column median
titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())
print titanic.describe()  # Age no longer has missing values, so its statistics now cover all 891 rows
PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  891.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.361582    0.523008   
std     257.353842    0.486592    0.836071   13.019697    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000   22.000000    0.000000   
50%     446.000000    0.000000    3.000000   28.000000    0.000000   
75%     668.500000    1.000000    3.000000   35.000000    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

            Parch        Fare  
count  891.000000  891.000000  
mean     0.381594   32.204208  
std      0.806057   49.693429  
min      0.000000    0.000000  
25%      0.000000    7.910400  
50%      0.000000   14.454200  
75%      0.000000   31.000000  
max      6.000000  512.329200  
      
# Sex is a string and cannot be used in computations directly; convert it to numbers, 0 for male and 1 for female
print titanic["Sex"].unique()

titanic.loc[titanic["Sex"]=="male","Sex"] = 0
titanic.loc[titanic["Sex"]=="female","Sex"] = 1      
['male' 'female']
      
# Embarked (port of embarkation) is also a string, so convert it to numbers and fill its missing values
print titanic["Embarked"].unique()
titanic["Embarked"] = titanic["Embarked"].fillna('S')
# .loc selects rows by label / boolean condition
titanic.loc[titanic["Embarked"]=="S","Embarked"] = 0
titanic.loc[titanic["Embarked"]=="C","Embarked"] = 1
titanic.loc[titanic["Embarked"]=="Q","Embarked"] = 2      
['S' 'C' 'Q' nan]
      
# Use a regression algorithm for this binary classification problem
# Linear regression
from sklearn.linear_model import LinearRegression
# Cross-validation: split the training data into 3 folds,
# e.g. train on folds 1 and 2 and validate on fold 3, then rotate the folds
# and average the resulting scores
from sklearn.cross_validation import KFold

# Pick some features
predictors = ["Pclass","Sex","Age","SibSp","Parch","Fare","Embarked"]
alg = LinearRegression()
# n_folds=3 gives 3-fold cross-validation; titanic.shape[0] is the number of samples
kf = KFold(titanic.shape[0],n_folds=3,random_state=1)

predictions = []
for train,test in kf:
    # .iloc selects rows by position
    train_predictors = titanic[predictors].iloc[train,:]
    # Get the corresponding labels
    train_target = titanic["Survived"].iloc[train]
    # Train the model on the two training folds
    alg.fit(train_predictors,train_target)
    # Predict on the held-out fold
    test_predictions = alg.predict(titanic[predictors].iloc[test,:])
    # Collect the fold predictions
    predictions.append(test_predictions)
import numpy as np

predictions = np.concatenate(predictions,axis=0)
# Turn the continuous regression outputs into concrete survived / not-survived labels (1 = survived)
predictions[predictions>.5] = 1
predictions[predictions<=.5] = 0
accuracy = sum(predictions == titanic["Survived"])/float(len(predictions))

print accuracy
0.783389450056


      
# Logistic regression: despite its name it is usually used for classification, not regression
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression

alg = LogisticRegression(random_state=1)
scores = cross_validation.cross_val_score(alg,titanic[predictors],titanic["Survived"],cv=3)
# Note the difference: logistic regression produces class probabilities (and 0/1 labels via predict),
# whereas the linear regression above produced unbounded values that we had to threshold at 0.5 ourselves
print (scores.mean())
0.787878787879
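# (Not in the original post) A small illustration of the difference noted above:
# LinearRegression.predict returns unbounded values that we had to threshold at 0.5 ourselves,
# while LogisticRegression exposes class probabilities via predict_proba and 0/1 labels via predict.
lin = LinearRegression().fit(titanic[predictors],titanic["Survived"])
log = LogisticRegression(random_state=1).fit(titanic[predictors],titanic["Survived"])
print lin.predict(titanic[predictors])[:5]          # raw regression values
print log.predict_proba(titanic[predictors])[:5,1]  # probability of class 1 (survived)
print log.predict(titanic[predictors])[:5]          # hard 0/1 labels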
      
# Neither linear nor logistic regression reaches a very high accuracy, so try a random forest next
# Random forest:
# 1. samples are drawn randomly with replacement (bootstrap)
# 2. the features considered at each split are also chosen randomly, which helps prevent overfitting
# 3. many decision trees are built and their predictions are combined (majority vote / average)
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier

# Pick some features
predictors = ["Pclass","Sex","Age","SibSp","Parch","Fare","Embarked"]
# random_state=1 makes the random choices reproducible across runs; without it each run would differ
# n_estimators is the number of trees; min_samples_split is the minimum number of samples a node must
# contain before it may be split further, and min_samples_leaf is the minimum number of samples per leaf
alg = RandomForestClassifier(random_state=1,n_estimators=10,min_samples_split=2,min_samples_leaf=1)
# Cross-validation
kf = cross_validation.KFold(titanic.shape[0],n_folds=3,random_state=1)
scores = cross_validation.cross_val_score(alg,titanic[predictors],titanic["Survived"],cv=kf)
print (scores.mean())
0.785634118967
      
# With only 10 trees the score is no better, so increase to 50 trees and relax the stopping
# conditions so that each tree stays a little shallower
alg = RandomForestClassifier(random_state=1,n_estimators=50,min_samples_split=4,min_samples_leaf=2)
# Cross-validation
kf = cross_validation.KFold(titanic.shape[0],n_folds=3,random_state=1)
scores = cross_validation.cross_val_score(alg,titanic[predictors],titanic["Survived"],cv=kf)
# The accuracy improves noticeably
print (scores.mean())
0.81593714927
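# (Not in the original post) Instead of tuning n_estimators / min_samples_split / min_samples_leaf
# by hand as above, a grid search can try the combinations systematically. This sketch uses the
# GridSearchCV of the older scikit-learn used in this post (sklearn.grid_search); in versions 0.18+
# it lives in sklearn.model_selection instead.
from sklearn.grid_search import GridSearchCV
param_grid = {"n_estimators":[10,50,100],
              "min_samples_split":[2,4,8],
              "min_samples_leaf":[1,2,4]}
grid = GridSearchCV(RandomForestClassifier(random_state=1),param_grid,scoring="accuracy",cv=3)
grid.fit(titanic[predictors],titanic["Survived"])
print grid.best_params_
print grid.best_score_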
      
# Feature engineering is a very important part of data mining
# So far we only used features that were already in the data; in real projects the right features
# are often missing and we have to construct them ourselves

# Create a new feature, the family size: siblings/spouses + parents/children
titanic["FamilySize"] = titanic["SibSp"] + titanic["Parch"]
# Length of the passenger's name (supposedly wealthy families liked very long names)
titanic["NameLength"] = titanic["Name"].apply(lambda x:len(x))
import re
def get_title(name):
    # Regular expression: [A-Za-z]+ matches one or more letters and \. escapes the literal period,
    # so the pattern captures the title that ends with a dot (e.g. "Mr.")
    title_search = re.search('([A-Za-z]+)\.',name)
    if title_search:
        # group(1) returns the text captured by the first pair of parentheses
        return title_search.group(1)
    return ""

titles = titanic["Name"].apply(get_title)
print (pandas.value_counts(titles))
print "......................."
# Different social ranks used different titles
title_mapping = {"Mr":1,"Miss":2,"Mrs":3,"Master":4,"Dr":5,"Rev":6,"Major":7,"Col":7,"Mlle":8,"Mme":8,"Don":9,
                 "Lady":10,"Countess":10,"Jonkheer":10,"Sir":9,"Capt":7,"Ms":2}

for k,v in title_mapping.items():
    # Replace each title string with its numeric code
    titles[titles==k] = v

print (pandas.value_counts(titles))
print "......................."

titanic["Title"] = titles
print titanic["Title"]
Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Col           2
Major         2
Mlle          2
Countess      1
Ms            1
Lady          1
Jonkheer      1
Don           1
Mme           1
Capt          1
Sir           1
Name: Name, dtype: int64
.......................
1     517
2     183
3     125
4      40
5       7
6       6
7       5
10      3
8       3
9       2
Name: Name, dtype: int64
.......................
0      1
1      3
2      2
3      3
4      1
5      1
6      1
7      4
8      3
9      3
10     2
11     2
12     1
13     1
14     2
15     3
16     4
17     1
18     3
19     3
20     1
21     1
22     2
23     1
24     2
25     3
26     1
27     1
28     2
29     1
      ..
861    1
862    3
863    2
864    1
865    3
866    2
867    1
868    1
869    4
870    1
871    3
872    1
873    1
874    3
875    2
876    1
877    1
878    1
879    3
880    3
881    1
882    2
883    1
884    1
885    3
886    6
887    2
888    2
889    1
890    1
Name: Title, dtype: object
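# (Not in the original post) The replacement loop above can also be written with Series.map,
# which looks up every title in the dictionary in one step and yields the same numeric codes
# (this works here because every title that occurs in the data appears in title_mapping):
titles_mapped = titanic["Name"].apply(get_title).map(title_mapping)
print (pandas.value_counts(titles_mapped))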
      
# Feature importance analysis
# Measure how much each feature contributes to the final result
# For example, to measure the importance of the Age column: first compute an error rate error1 on the
# unchanged data, then replace the Age column with noise (leaving every other column untouched) and
# compute a second error rate error2
# The gap between the two error rates reflects how important that feature is
import numpy as np
from sklearn.feature_selection import SelectKBest,f_classif
import matplotlib.pyplot as plt
# Candidate features
predictors = ["Pclass","Sex","Age","SibSp","Parch","Fare","Embarked","FamilySize","Title","NameLength"]
# Select the k best features according to the ANOVA F-test
selector = SelectKBest(f_classif,k=5)
selector.fit(titanic[predictors],titanic["Survived"])

scores = -np.log10(selector.pvalues_)
# Plot the importance of each feature (larger bar = more important)
plt.bar(range(len(predictors)),scores)
plt.xticks(range(len(predictors)),predictors,rotation="vertical")
plt.show()
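# (Not in the original post) The noise-based idea described in the comments above corresponds to
# permutation importance rather than the F-test that SelectKBest uses. A rough sketch: shuffle one
# column at a time and see how much the cross-validated accuracy drops.
rf = RandomForestClassifier(random_state=1,n_estimators=50,min_samples_split=4,min_samples_leaf=2)
X = titanic[predictors].astype(float)
y = titanic["Survived"]
baseline = cross_validation.cross_val_score(rf,X,y,cv=3).mean()
rng = np.random.RandomState(1)
for col in predictors:
    X_shuffled = X.copy()
    # replace just this column with a shuffled copy; every other column stays intact
    X_shuffled[col] = rng.permutation(X_shuffled[col].values)
    drop = baseline - cross_validation.cross_val_score(rf,X_shuffled,y,cv=3).mean()
    print col, drop  # the bigger the drop, the more important the feature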
# Based on the importance plot, keep only the 4 most important features and rerun the random forest
predictors = ["Pclass","Sex","Fare","Title"]
alg = RandomForestClassifier(random_state=1,n_estimators=50,min_samples_split=4,min_samples_leaf=2)
# Cross-validation
kf = cross_validation.KFold(titanic.shape[0],n_folds=3,random_state=1)
scores = cross_validation.cross_val_score(alg,titanic[predictors],titanic["Survived"],cv=kf)
# The score does not improve here; the point of this step is to practise feature selection with a
# random forest, which matters a great deal in real data mining
print (scores.mean())
0.814814814815
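# (Not in the original post) Rather than reading the strongest features off the bar chart by hand,
# the fitted SelectKBest can report which k columns it kept (k=5 above) via get_support():
all_predictors = ["Pclass","Sex","Age","SibSp","Parch","Fare","Embarked","FamilySize","Title","NameLength"]
print [p for p,keep in zip(all_predictors,selector.get_support()) if keep]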
      
# A trick often used in competitions: ensemble several algorithms and average their outputs to reduce overfitting
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np

# GradientBoostingClassifier is a boosting ensemble: it combines many weak classifiers (shallow trees)
# into a strong classifier
algorithms = [
    [GradientBoostingClassifier(random_state=1,n_estimators=25,max_depth=3),["Pclass","Sex","Age","Fare","Embarked","FamilySize","Title"]],
    [LogisticRegression(random_state=1),["Pclass","Sex","Fare","FamilySize","Title","Age","Embarked"]]
]

kf = KFold(titanic.shape[0],n_folds=3,random_state=1)
predictions = []
for train,test in kf:
    train_target = titanic["Survived"].iloc[train]
    full_test_predictions = []
    for alg,predictors in algorithms:
        alg.fit(titanic[predictors].iloc[train,:],train_target)
        test_predictions = alg.predict_proba(titanic[predictors].iloc[test,:].astype(float))[:,1]
        full_test_predictions.append(test_predictions)
    # Average the two models' probabilities, then threshold at 0.5
    test_predictions = (full_test_predictions[0] + full_test_predictions[1])/2
    test_predictions[test_predictions<=.5] = 0
    test_predictions[test_predictions>.5] = 1
    predictions.append(test_predictions)

predictions = np.concatenate(predictions,axis=0)

# The accuracy improves by about one percentage point
accuracy = sum(predictions == titanic["Survived"])/float(len(predictions))
print accuracy
      
0.821548821549

