
Machine Learning Project in Practice: Predicting Titanic Survival

import pandas
titanic = pandas.read_csv("D:\\test\\titanic_train.csv")
# Basic descriptive statistics
print(titanic.describe())  # std is the standard deviation; note that Age has missing values (count 714 of 891)
PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000         NaN    0.000000   
50%     446.000000    0.000000    3.000000         NaN    0.000000   
75%     668.500000    1.000000    3.000000         NaN    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

            Parch        Fare  
count  891.000000  891.000000  
mean     0.381594   32.204208  
std      0.806057   49.693429  
min      0.000000    0.000000  
25%      0.000000    7.910400  
50%      0.000000   14.454200  
75%      0.000000   31.000000  
max      6.000000  512.329200  


C:\Users\qiujiahao\Anaconda2\lib\site-packages\numpy\lib\function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile
  RuntimeWarning)
      
# Data preprocessing
# Most algorithms are matrix computations and cannot handle missing values, so fill the missing Age values with the median age
titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())
print(titanic.describe())  # Age now has a full count of 891 and its quartiles are defined
PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  891.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.361582    0.523008   
std     257.353842    0.486592    0.836071   13.019697    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000   22.000000    0.000000   
50%     446.000000    0.000000    3.000000   28.000000    0.000000   
75%     668.500000    1.000000    3.000000   35.000000    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

            Parch        Fare  
count  891.000000  891.000000  
mean     0.381594   32.204208  
std      0.806057   49.693429  
min      0.000000    0.000000  
25%      0.000000    7.910400  
50%      0.000000   14.454200  
75%      0.000000   31.000000  
max      6.000000  512.329200  
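# A finer-grained alternative (not used in this post) is to fill each passenger's missing Age with
# the median Age of their own Pclass instead of the overall median; pandas can do this in one line
# with groupby/transform (it would take the place of the fillna call above):
titanic["Age"] = titanic["Age"].fillna(titanic.groupby("Pclass")["Age"].transform("median"))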
      
# Sex is a string and cannot be used in numeric computation; convert it to numbers, 0 for male and 1 for female
print(titanic["Sex"].unique())

titanic.loc[titanic["Sex"]=="male","Sex"] = 0
titanic.loc[titanic["Sex"]=="female","Sex"] = 1      
['male' 'female']
      
# The port of embarkation is also a string; convert it to numbers and fill its missing values with 'S', the most common port
print(titanic["Embarked"].unique())
titanic["Embarked"] = titanic["Embarked"].fillna('S')
# .loc selects rows by label / boolean mask
titanic.loc[titanic["Embarked"]=="S","Embarked"] = 0
titanic.loc[titanic["Embarked"]=="C","Embarked"] = 1
titanic.loc[titanic["Embarked"]=="Q","Embarked"] = 2      
['S' 'C' 'Q' nan]
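# Equivalent alternative: both mappings above can also be written with Series.map and a dict.
# This would replace the .loc assignments, not run in addition to them, so it is shown here only
# as a reference snippet:
#   titanic["Sex"] = titanic["Sex"].map({"male": 0, "female": 1})
#   titanic["Embarked"] = titanic["Embarked"].fillna("S").map({"S": 0, "C": 1, "Q": 2})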
      
# First model: use a regression algorithm for the binary classification task
# Linear regression
from sklearn.linear_model import LinearRegression
# Cross-validation: split the training data into 3 folds; for example, train on folds 1 and 2 and
# validate on fold 3, rotate through all three combinations, then average the resulting scores
from sklearn.cross_validation import KFold

# Select some features
predictors = ["Pclass","Sex","Age","SibSp","Parch","Fare","Embarked"]
alg = LinearRegression()
# n_folds=3 gives 3-fold cross-validation; titanic.shape[0] is the number of samples
kf = KFold(titanic.shape[0],n_folds=3,random_state=1)

predictions = []
for train, test in kf:
    # .iloc selects rows by integer position
    train_predictors = titanic[predictors].iloc[train,:]
    # the corresponding labels
    train_target = titanic["Survived"].iloc[train]
    # train the model on the two training folds
    alg.fit(train_predictors, train_target)
    # predict on the held-out fold
    test_predictions = alg.predict(titanic[predictors].iloc[test,:])
    # collect the predictions for each fold
    predictions.append(test_predictions)
import numpy as np

predictions = np.concatenate(predictions, axis=0)
# Linear regression outputs continuous scores; threshold at 0.5 to turn them into class labels (1 = survived)
predictions[predictions > .5] = 1
predictions[predictions <= .5] = 0
# fraction of passengers whose predicted label matches the true label
accuracy = (predictions == titanic["Survived"]).mean()

print(accuracy)
0.783389450056


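# Note: the sklearn.cross_validation module used above was removed in scikit-learn 0.20;
# from 0.18 onward the same splitter lives in sklearn.model_selection. A minimal equivalent of the
# KFold loop above (n_splits instead of n_folds, and split() is called on the data; shuffle defaults
# to False, giving the same contiguous folds):
from sklearn import model_selection

kf3 = model_selection.KFold(n_splits=3)
predictions_new = []
for train, test in kf3.split(titanic):
    alg.fit(titanic[predictors].iloc[train, :], titanic["Survived"].iloc[train])
    predictions_new.append(alg.predict(titanic[predictors].iloc[test, :]))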
      
# Logistic regression: despite the name it is generally used for classification
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression

alg = LogisticRegression(random_state=1)
scores = cross_validation.cross_val_score(alg,titanic[predictors],titanic["Survived"],cv=3)
# Note: logistic regression outputs class probabilities, whereas the linear regression above produced unbounded scores that had to be thresholded at 0.5 by hand
print (scores.mean())      
0.787878787879
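# To see the probabilities mentioned above, fit the model on the full training set and call
# predict_proba; this is only an illustration and is not part of the original pipeline:
alg.fit(titanic[predictors].astype(float), titanic["Survived"])
probs = alg.predict_proba(titanic[predictors].astype(float))[:, 1]  # P(Survived = 1) per passenger
print(probs[:5])                       # probabilities in [0, 1]
print((probs > 0.5).astype(int)[:5])   # thresholding at 0.5 reproduces alg.predict(...)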
      
# Neither linear nor logistic regression reached a very high accuracy, so next try a random forest
# Random forest:
# 1. each tree is trained on a bootstrap sample (random sampling with replacement)
# 2. the features considered at each split are also chosen at random, which helps prevent overfitting
# 3. many decision trees are built and their predictions are averaged
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier

# Select some features
predictors = ["Pclass","Sex","Age","SibSp","Parch","Fare","Embarked"]
# random_state=1 makes repeated runs produce the same random splits; without it, two runs would differ
# n_estimators is the number of trees; min_samples_split is the minimum number of samples a node
# must contain before it may be split further; min_samples_leaf is the minimum number of samples
# allowed in a leaf node
alg = RandomForestClassifier(random_state=1,n_estimators=10,min_samples_split=2,min_samples_leaf=1)
# Cross-validation
kf = cross_validation.KFold(titanic.shape[0],n_folds=3,random_state=1)
scores = cross_validation.cross_val_score(alg,titanic[predictors],titanic["Survived"],cv=kf)
print (scores.mean())      
0.785634118967
      
# With 10 trees the result is still not great; increase to 50 trees and raise the split/leaf
# thresholds so that each tree stays a bit shallower
alg = RandomForestClassifier(random_state=1,n_estimators=50,min_samples_split=4,min_samples_leaf=2)
# Cross-validation
kf = cross_validation.KFold(titanic.shape[0],n_folds=3,random_state=1)
scores = cross_validation.cross_val_score(alg,titanic[predictors],titanic["Survived"],cv=kf)
# Accuracy improves further
print (scores.mean())      
0.81593714927
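# Instead of adjusting n_estimators / min_samples_split / min_samples_leaf by hand as above, a grid
# search can try the combinations automatically. The grid below is an arbitrary illustrative example,
# not tuned values from this post. (On scikit-learn 0.18+ import GridSearchCV from
# sklearn.model_selection instead of sklearn.grid_search.)
from sklearn.grid_search import GridSearchCV

param_grid = {
    "n_estimators": [10, 50, 100],
    "min_samples_split": [2, 4, 8],
    "min_samples_leaf": [1, 2, 4],
}
grid = GridSearchCV(RandomForestClassifier(random_state=1), param_grid, cv=3)
grid.fit(titanic[predictors].astype(float), titanic["Survived"])
print(grid.best_params_)
print(grid.best_score_)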
      
# Feature engineering is a very important part of data mining
# The features used so far were already present in the data; in real projects suitable features often
# do not exist and we have to construct them ourselves

# New feature: family size = SibSp (siblings/spouse aboard) + Parch (parents/children aboard)
titanic["FamilySize"] = titanic["SibSp"] + titanic["Parch"]
# Length of the name (supposedly wealthier families tended to use longer names)
titanic["NameLength"] = titanic["Name"].apply(lambda x:len(x))      
import re

def get_title(name):
    # In the pattern ([A-Za-z]+)\. the + means "one or more letters" and \. is an escaped literal
    # period, so this captures the title that precedes a period in the name, e.g. "Mr." or "Mrs."
    title_search = re.search('([A-Za-z]+)\.', name)
    if title_search:
        # group(1) returns the text captured by the first pair of parentheses
        return title_search.group(1)
    return ""

titles = titanic["Name"].apply(get_title)
print(pandas.value_counts(titles))
print(".......................")
# Different social classes used different titles; map each title to a number the model can use
title_mapping = {"Mr":1,"Miss":2,"Mrs":3,"Master":4,"Dr":5,"Rev":6,"Major":7,"Col":7,"Mlle":8,"Mme":8,"Don":9,
                 "Lady":10,"Countess":10,"Jonkheer":10,"Sir":9,"Capt":7,"Ms":2}

for k, v in title_mapping.items():
    # replace each title string with its numeric code
    titles[titles == k] = v

print(pandas.value_counts(titles))
print(".......................")

titanic["Title"] = titles
print(titanic["Title"])
Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Col           2
Major         2
Mlle          2
Countess      1
Ms            1
Lady          1
Jonkheer      1
Don           1
Mme           1
Capt          1
Sir           1
Name: Name, dtype: int64
.......................
1     517
2     183
3     125
4      40
5       7
6       6
7       5
10      3
8       3
9       2
Name: Name, dtype: int64
.......................
0      1
1      3
2      2
3      3
4      1
5      1
6      1
7      4
8      3
9      3
10     2
11     2
12     1
13     1
14     2
15     3
16     4
17     1
18     3
19     3
20     1
21     1
22     2
23     1
24     2
25     3
26     1
27     1
28     2
29     1
      ..
861    1
862    3
863    2
864    1
865    3
866    2
867    1
868    1
869    4
870    1
871    3
872    1
873    1
874    3
875    2
876    1
877    1
878    1
879    3
880    3
881    1
882    2
883    1
884    1
885    3
886    6
887    2
888    2
889    1
890    1
Name: Title, dtype: object
      
# Feature importance analysis
# Measure how much each feature contributes to the final result.
# One classic idea: compute an error rate with the original Age column (error1), then replace that
# column with noise while leaving every other column unchanged and compute the error again (error2);
# the gap between the two error rates reflects how important that feature is.
# The code below takes a simpler route: SelectKBest with f_classif scores each feature against
# Survived using a univariate ANOVA F-test.
import numpy as np
from sklearn.feature_selection import SelectKBest,f_classif
import matplotlib.pyplot as plt
# Select some features, including the engineered ones
predictors = ["Pclass","Sex","Age","SibSp","Parch","Fare","Embarked","FamilySize","Title","NameLength"]
# Score the features
selector = SelectKBest(f_classif, k=5)
selector.fit(titanic[predictors], titanic["Survived"])

scores = -np.log10(selector.pvalues_)
# Plot how predictive each feature is (larger bar = smaller p-value)
plt.bar(range(len(predictors)),scores)
plt.xticks(range(len(predictors)),predictors,rotation="vertical")
plt.show()      
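# The "replace a column with noise and see how much the error grows" idea described above is
# permutation importance, which is not what SelectKBest does. A minimal sketch of the permutation
# idea itself, reusing the predictors list from above; illustrative only, not part of the original post:
rf = RandomForestClassifier(random_state=1, n_estimators=50, min_samples_split=4, min_samples_leaf=2)
X = titanic[predictors].astype(float)
y = titanic["Survived"]
baseline = cross_validation.cross_val_score(rf, X, y, cv=3).mean()

rng = np.random.RandomState(1)
for col in predictors:
    X_shuffled = X.copy()
    # shuffling one column destroys its information while keeping its distribution
    X_shuffled[col] = rng.permutation(X_shuffled[col].values)
    drop = baseline - cross_validation.cross_val_score(rf, X_shuffled, y, cv=3).mean()
    print(col, drop)  # the bigger the drop in accuracy, the more important the feature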
# Based on the importance plot above, keep the 4 most useful features and rerun the random forest
predictors = ["Pclass","Sex","Fare","Title"]
alg = RandomForestClassifier(random_state=1,n_estimators=50,min_samples_split=4,min_samples_leaf=2)
# Cross-validation
kf = cross_validation.KFold(titanic.shape[0],n_folds=3,random_state=1)
scores = cross_validation.cross_val_score(alg,titanic[predictors],titanic["Survived"],cv=kf)
# The score does not improve here; the point of this step is to practice feature selection with a
# random forest, which matters a great deal in real data-mining work
print (scores.mean())      
0.814814814815
      
# A trick commonly used in competitions: ensemble several algorithms and average their predictions to reduce overfitting
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np

# GradientBoostingClassifier is a boosting ensemble: it combines many weak learners (shallow decision trees) into a strong classifier
algorithms = [
    [GradientBoostingClassifier(random_state=1,n_estimators=25,max_depth=3),["Pclass","Sex","Age","Fare","Embarked","FamilySize","Title"]],
    [LogisticRegression(random_state=1),["Pclass","Sex","Fare","FamilySize","Title","Age","Embarked"]]
]

kf = KFold(titanic.shape[0], n_folds=3, random_state=1)
predictions = []
for train, test in kf:
    train_target = titanic["Survived"].iloc[train]
    full_test_predictions = []
    for alg, predictors in algorithms:
        alg.fit(titanic[predictors].iloc[train,:], train_target)
        test_predictions = alg.predict_proba(titanic[predictors].iloc[test,:].astype(float))[:,1]
        full_test_predictions.append(test_predictions)
    # average the two models' predicted probabilities, then threshold at 0.5
    test_predictions = (full_test_predictions[0] + full_test_predictions[1]) / 2
    test_predictions[test_predictions <= .5] = 0
    test_predictions[test_predictions > .5] = 1
    predictions.append(test_predictions)

predictions = np.concatenate(predictions, axis=0)

# Accuracy improves by about one percentage point over the single models
accuracy = (predictions == titanic["Survived"]).mean()
print(accuracy)
      
0.821548821549


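# A common variant of the simple average above is to weight the stronger model more heavily, for
# example giving gradient boosting three times the weight of logistic regression. The weights are an
# illustrative choice, not values from this post; inside the loop above, the averaging line would become:
#     test_predictions = (full_test_predictions[0] * 3 + full_test_predictions[1]) / 4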