import pandas
titanic = pandas.read_csv("D:\\test\\titanic_train.csv")
# Basic descriptive statistics
print(titanic.describe())  # std is the standard deviation (not the variance); note the missing values in Age
PassengerId Survived Pclass Age SibSp \
count 891.000000 891.000000 891.000000 714.000000 891.000000
mean 446.000000 0.383838 2.308642 29.699118 0.523008
std 257.353842 0.486592 0.836071 14.526497 1.102743
min 1.000000 0.000000 1.000000 0.420000 0.000000
25% 223.500000 0.000000 2.000000 NaN 0.000000
50% 446.000000 0.000000 3.000000 NaN 0.000000
75% 668.500000 1.000000 3.000000 NaN 1.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000
Parch Fare
count 891.000000 891.000000
mean 0.381594 32.204208
std 0.806057 49.693429
min 0.000000 0.000000
25% 0.000000 7.910400
50% 0.000000 14.454200
75% 0.000000 31.000000
max 6.000000 512.329200
C:\Users\qiujiahao\Anaconda2\lib\site-packages\numpy\lib\function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile
RuntimeWarning)
# Preprocessing
# Most algorithms boil down to matrix operations and cannot handle missing values,
# so fill the missing Age values with the median
titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())
print(titanic.describe())  # the count for Age is now 891: no more missing values
PassengerId Survived Pclass Age SibSp \
count 891.000000 891.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.361582 0.523008
std 257.353842 0.486592 0.836071 13.019697 1.102743
min 1.000000 0.000000 1.000000 0.420000 0.000000
25% 223.500000 0.000000 2.000000 22.000000 0.000000
50% 446.000000 0.000000 3.000000 28.000000 0.000000
75% 668.500000 1.000000 3.000000 35.000000 1.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000
Parch Fare
count 891.000000 891.000000
mean 0.381594 32.204208
std 0.806057 49.693429
min 0.000000 0.000000
25% 0.000000 7.910400
50% 0.000000 14.454200
75% 0.000000 31.000000
max 6.000000 512.329200
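# As a quick extra check (not part of the original run): describe() only covers
# numeric columns, so missing values in string columns such as Cabin or Embarked
# do not show up above; isnull().sum() counts them for every column
print(titanic.isnull().sum())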
# Sex is a string, which cannot be used in numeric computation.
# Convert it to numbers: 0 for male, 1 for female
print(titanic["Sex"].unique())
titanic.loc[titanic["Sex"]=="male","Sex"] = 0
titanic.loc[titanic["Sex"]=="female","Sex"] = 1
['male' 'female']
# The port of embarkation is also a string; convert it to numbers and fill its missing values
print(titanic["Embarked"].unique())
titanic["Embarked"] = titanic["Embarked"].fillna('S')
# .loc selects rows by label (here, a boolean mask)
titanic.loc[titanic["Embarked"]=="S","Embarked"] = 0
titanic.loc[titanic["Embarked"]=="C","Embarked"] = 1
titanic.loc[titanic["Embarked"]=="Q","Embarked"] = 2
['S' 'C' 'Q' nan]
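# An equivalent, more concise alternative to the .loc assignments above
# (do one or the other, not both) is pandas' Series.map:
# titanic["Sex"] = titanic["Sex"].map({"male": 0, "female": 1})
# titanic["Embarked"] = titanic["Embarked"].fillna('S').map({"S": 0, "C": 1, "Q": 2})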
# Use a regression algorithm for this binary classification task
# Linear regression
from sklearn.linear_model import LinearRegression
# Cross-validation: split the training set into 3 folds, e.g. train on folds 1 and 2
# and validate on fold 3, then average the results over the folds
from sklearn.cross_validation import KFold
# Pick some features
predictors = ["Pclass","Sex","Age","SibSp","Parch","Fare","Embarked"]
alg = LinearRegression()
# n_folds=3 splits the data into 3 folds for 3-fold cross-validation;
# titanic.shape[0] is the number of samples
kf = KFold(titanic.shape[0],n_folds=3,random_state=1)
predictions = []
for train, test in kf:
    # .iloc selects rows by integer position
    train_predictors = titanic[predictors].iloc[train,:]
    # Get the corresponding labels
    train_target = titanic["Survived"].iloc[train]
    # Train
    alg.fit(train_predictors, train_target)
    # Predict on the held-out fold
    test_predictions = alg.predict(titanic[predictors].iloc[test,:])
    # Append this fold's predictions to the list
    predictions.append(test_predictions)
import numpy as np
predictions = np.concatenate(predictions,axis=0)
# Turn the continuous predictions into a concrete 0/1 label; 1 means survived
predictions[predictions > .5] = 1
predictions[predictions <= .5] = 0
accuracy = sum(predictions == titanic["Survived"]) / float(len(predictions))
print(accuracy)
0.783389450056
# Logistic regression: despite the name, it is generally used for classification
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression
alg = LogisticRegression(random_state=1)
scores = cross_validation.cross_val_score(alg,titanic[predictors],titanic["Survived"],cv=3)
# Note the difference in output types: logistic regression produces class probabilities,
# while linear regression produces raw real values (not guaranteed to lie in [0,1])
# that had to be thresholded by hand above
print (scores.mean())
0.787878787879
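# Note: sklearn.cross_validation was deprecated and later removed; with
# scikit-learn 0.20+ the same experiment is written with model_selection
# (a sketch, assuming a current scikit-learn install):
# from sklearn.model_selection import cross_val_score
# alg = LogisticRegression(random_state=1)
# scores = cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
# print(scores.mean())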
# Neither linear nor logistic regression reaches a very high accuracy;
# next, try a random forest
# Random forest:
# 1. Samples are drawn at random, with replacement (bootstrap)
# 2. The features considered at each split are also chosen at random, which helps prevent overfitting
# 3. Many decision trees are built and their predictions are combined
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
# Pick some features
predictors = ["Pclass","Sex","Age","SibSp","Parch","Fare","Embarked"]
# random_state=1 makes repeated runs produce the same random values;
# without it, two runs would give different results
# n_estimators is the number of trees; min_samples_split is the minimum number of
# samples a node must hold to be split further (a node with fewer samples stops splitting);
# min_samples_leaf is the minimum number of samples allowed in a leaf node
alg = RandomForestClassifier(random_state=1,n_estimators=10,min_samples_split=2,min_samples_leaf=1)
# Cross-validate
kf = cross_validation.KFold(titanic.shape[0],n_folds=3,random_state=1)
scores = cross_validation.cross_val_score(alg,titanic[predictors],titanic["Survived"],cv=kf)
print (scores.mean())
0.785634118967
# With 10 trees the result is still not great; raise the number of trees to 50
# and loosen the conditions below so that each tree stays a bit shallower
alg = RandomForestClassifier(random_state=1,n_estimators=50,min_samples_split=4,min_samples_leaf=2)
# Cross-validate
kf = cross_validation.KFold(titanic.shape[0],n_folds=3,random_state=1)
scores = cross_validation.cross_val_score(alg,titanic[predictors],titanic["Survived"],cv=kf)
# The accuracy improves further
print (scores.mean())
0.81593714927
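# Rather than hand-tuning n_estimators, min_samples_split and min_samples_leaf,
# a grid search can try the combinations systematically (a sketch using the
# modern sklearn.model_selection API; the grid below is illustrative, not tuned):
from sklearn.model_selection import GridSearchCV
param_grid = {
    "n_estimators": [10, 50, 100],
    "min_samples_split": [2, 4, 8],
    "min_samples_leaf": [1, 2, 4],
}
grid = GridSearchCV(RandomForestClassifier(random_state=1), param_grid, cv=3)
grid.fit(titanic[predictors], titanic["Survived"])
# Best cross-validated accuracy and the parameters that achieved it
print(grid.best_score_, grid.best_params_)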
# Feature engineering is a very important part of data mining
# All the features used so far came with the data; in real data mining there are
# often no suitable ready-made features, and we have to construct our own
# Create a new feature, family size: SibSp (siblings/spouses) + Parch (parents/children)
titanic["FamilySize"] = titanic["SibSp"] + titanic["Parch"]
# Name length (reportedly, wealthy families abroad liked very long names)
titanic["NameLength"] = titanic["Name"].apply(lambda x:len(x))
import re
def get_title(name):
    # Regex: [A-Za-z]+ matches one or more letters, and \. matches a literal period,
    # so this captures a title such as "Mr" followed by its period
    title_search = re.search(r'([A-Za-z]+)\.', name)
    if title_search:
        # group(1) returns the contents of the first capture group (the title itself)
        return title_search.group(1)
    return ""
titles = titanic["Name"].apply(get_title)
print (pandas.value_counts(titles))
print "......................."
# Different social classes carry different titles
title_mapping = {"Mr":1,"Miss":2,"Mrs":3,"Master":4,"Dr":5,"Rev":6,"Major":7,"Col":7,"Mlle":8,"Mme":8,"Don":9,
"Lady":10,"Countess":10,"Jonkheer":10,"Sir":9,"Capt":7,"Ms":2}
for k, v in title_mapping.items():
    # Replace each title string with a number the model can work with
    titles[titles == k] = v
print(pandas.value_counts(titles))
print(".......................")
titanic["Title"] = titles
print(titanic["Title"])
Mr 517
Miss 182
Mrs 125
Master 40
Dr 7
Rev 6
Col 2
Major 2
Mlle 2
Countess 1
Ms 1
Lady 1
Jonkheer 1
Don 1
Mme 1
Capt 1
Sir 1
Name: Name, dtype: int64
.......................
1 517
2 183
3 125
4 40
5 7
6 6
7 5
10 3
8 3
9 2
Name: Name, dtype: int64
.......................
0 1
1 3
2 2
3 3
4 1
5 1
6 1
7 4
8 3
9 3
10 2
11 2
12 1
13 1
14 2
15 3
16 4
17 1
18 3
19 3
20 1
21 1
22 2
23 1
24 2
25 3
26 1
27 1
28 2
29 1
..
861 1
862 3
863 2
864 1
865 3
866 2
867 1
868 1
869 4
870 1
871 3
872 1
873 1
874 3
875 2
876 1
877 1
878 1
879 3
880 3
881 1
882 2
883 1
884 1
885 3
886 6
887 2
888 2
889 1
890 1
Name: Title, dtype: object
# Feature importance analysis
# Measure how much each feature affects the final result.
# For example, to gauge the importance of the Age column: first compute an error
# rate error1 with the data unchanged; then replace the Age values with noise
# (leaving every other column untouched) and compute a second error rate error2.
# The difference between the two error rates reflects the feature's importance.
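# The procedure just described is known as permutation importance. A minimal
# sketch of it (this helper is illustrative, not part of the original code; it
# assumes an already-fitted classifier `model`, a feature DataFrame X and labels y).
# Note that the code below ranks features differently, with a univariate ANOVA
# F-test via SelectKBest.
def permutation_feature_importance(model, X, y):
    # error1: error rate with the data unchanged
    baseline_error = 1 - model.score(X, y)
    importances = {}
    for col in X.columns:
        X_noisy = X.copy()
        # Replace one column with "noise" (a shuffle of its own values),
        # leaving every other column untouched
        X_noisy[col] = np.random.permutation(X_noisy[col].values)
        # error2 - error1: how much the error grows without this feature's signal
        importances[col] = (1 - model.score(X_noisy, y)) - baseline_error
    return importances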
import numpy as np
from sklearn.feature_selection import SelectKBest,f_classif
import matplotlib.pyplot as plt
# Pick some features
predictors = ["Pclass","Sex","Age","SibSp","Parch","Fare","Embarked","FamilySize","Title","NameLength"]
# Univariate feature selection: score each feature against the label with the ANOVA F-test
selector = SelectKBest(f_classif, k=5)
selector.fit(titanic[predictors], titanic["Survived"])
scores = -np.log10(selector.pvalues_)
# Plot how important each feature is
plt.bar(range(len(predictors)),scores)
plt.xticks(range(len(predictors)),predictors,rotation="vertical")
plt.show()
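# Besides the bar chart, the same scores can be printed from most to least
# significant (a small addition using the values computed above):
for name, score in sorted(zip(predictors, scores), key=lambda t: -t[1]):
    print(name, score)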
# Based on the feature importance analysis, keep the 4 most important features
# and run the random forest again
predictors = ["Pclass","Sex","Fare","Title"]
alg = RandomForestClassifier(random_state=1,n_estimators=50,min_samples_split=4,min_samples_leaf=2)
# Cross-validate
kf = cross_validation.KFold(titanic.shape[0],n_folds=3,random_state=1)
scores = cross_validation.cross_val_score(alg,titanic[predictors],titanic["Survived"],cv=kf)
# The score does not improve here; the point of this exercise is to practice
# feature selection with a random forest, which matters a great deal in real data mining
print (scores.mean())
0.814814814815
# A trick often used in competitions: ensemble several algorithms and average
# their predictions to reduce overfitting
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np
# GradientBoostingClassifier is another tree ensemble; unlike a random forest,
# it builds weak learners (shallow trees) sequentially via boosting and combines
# them into a strong classifier
algorithms = [
    [GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3),
     ["Pclass","Sex","Age","Fare","Embarked","FamilySize","Title"]],
    [LogisticRegression(random_state=1),
     ["Pclass","Sex","Fare","FamilySize","Title","Age","Embarked"]]
]
kf = KFold(titanic.shape[0],n_folds=3,random_state=1)
predictions = []
for train, test in kf:
    train_target = titanic["Survived"].iloc[train]
    full_test_predictions = []
    for alg, predictors in algorithms:
        alg.fit(titanic[predictors].iloc[train,:], train_target)
        # predict_proba returns [P(class 0), P(class 1)]; take the survival probability
        test_predictions = alg.predict_proba(titanic[predictors].iloc[test,:].astype(float))[:,1]
        full_test_predictions.append(test_predictions)
    # Average the two models' probabilities, then threshold at 0.5
    test_predictions = (full_test_predictions[0] + full_test_predictions[1]) / 2
    test_predictions[test_predictions <= .5] = 0
    test_predictions[test_predictions > .5] = 1
    predictions.append(test_predictions)
predictions = np.concatenate(predictions,axis=0)
# The accuracy improves by roughly one percentage point
accuracy = sum(predictions == titanic["Survived"]) / float(len(predictions))
print(accuracy)
0.821548821549
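# A common refinement (not applied in the run above) is a weighted average that
# trusts one model more than the other; e.g., inside the fold loop, weighting the
# gradient boosting probabilities 3:1 (an illustrative, untuned choice):
# test_predictions = (3 * full_test_predictions[0] + full_test_predictions[1]) / 4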