The main goal of this case study: given a set of user attributes, make a reasonable prediction of whether the user will churn.
1. Reading the data
from __future__ import division
import pandas as pd
import numpy as np
# Read the data
churn_df = pd.read_csv('churn.csv')
col_names = churn_df.columns.tolist()
# Print the column names
print("Column names:")
print(col_names)
# Select the first six and last six columns
to_show = col_names[:6] + col_names[-6:]
# Display the first six rows of the selected columns
print("\nSample data:")
churn_df[to_show].head(6)
- Output:
2. Data preprocessing
# Convert the label field in the last column from string to numeric
churn_result = churn_df['Churn?']
y = np.where(churn_result == 'True.',1,0)
# Drop fields that are not useful as features
to_drop = ['State','Area Code','Phone','Churn?']
churn_feat_space = churn_df.drop(to_drop,axis=1)
# Convert the "Int'l Plan" and "VMail Plan" columns to boolean values
yes_no_cols = ["Int'l Plan","VMail Plan"]
churn_feat_space[yes_no_cols] = churn_feat_space[yes_no_cols] == 'yes'
# Names of all feature fields
features = churn_feat_space.columns
X = churn_feat_space.values.astype(float)  # np.float was removed in recent NumPy
# Standardize the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)
# Show the number of observations and features
print('Feature space holds {} observations and {} features'.format(X.shape[0], X.shape[1]))
print("Unique target labels:", np.unique(y))
print(X[0])
print(len(y[y == 0]))  # number of samples that did not churn
- Output:
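As a quick sanity check (an illustrative addition, not part of the original write-up), each standardized feature should now have mean roughly 0 and standard deviation roughly 1:
# Verify the effect of StandardScaler: per-column mean ~0 and std ~1
print(np.allclose(X.mean(axis=0), 0))  # expect True
print(np.allclose(X.std(axis=0), 1))   # expect True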
3. Cross-validation on the training set
from sklearn.model_selection import KFold
def run_cv(X, y, clf_class, **kwargs):
    # Construct a KFold object
    kf = KFold(n_splits=5, shuffle=True)
    y_pred = y.copy()
    # Cross-validate over the training set
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        # Initialize the classifier with the given parameters
        clf = clf_class(**kwargs)
        clf.fit(X_train, y_train)
        y_pred[test_index] = clf.predict(X_test)
    return y_pred
4. Building the classification models: comparing k-nearest neighbors, random forest, and support vector machine
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.neighbors import KNeighborsClassifier as KNN
def accuracy(y_true, y_pred):
    # NumPy converts booleans (True/False) to 1. and 0., so the mean
    # of the element-wise comparison is the accuracy score
    return np.mean(y_true == y_pred)
print('Support vector machines:')
print("%.3f" % accuracy(y, run_cv(X, y, SVC)))
print("Random forest:")
print("%.3f" % accuracy(y, run_cv(X, y, RF)))
print("K-nearest-neighbors:")
print("%.3f" % accuracy(y, run_cv(X, y, KNN)))
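Because churners are a minority in this dataset (step 2 printed the count of non-churn samples), raw accuracy should be read against the majority-class baseline. A minimal sketch, added here for context:
# Baseline: always predict the majority class; a model is only
# informative if it beats this accuracy
baseline = max(np.mean(y == 0), np.mean(y == 1))
print("Majority-class baseline: %.3f" % baseline)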
5. Model evaluation
We use predict_proba to obtain the classifier's predicted class probabilities instead of hard labels. By converting these probabilities into labels at different decision thresholds, we can check whether some threshold yields a higher accuracy score than the default of 0.5 (see the sketch after the function below).
def run_prob_cv(X, y, clf_class, **kwargs):
    kf = KFold(n_splits=5, shuffle=True)
    y_prob = np.zeros((len(y), 2))
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        clf = clf_class(**kwargs)
        clf.fit(X_train, y_train)
        # Predict class probabilities.
        # predict_proba returns an n-row, k-column array: the value in
        # row i, column j is the predicted probability that sample i
        # belongs to class j, and each row sums to 1.
        y_prob[test_index] = clf.predict_proba(X_test)
    return y_prob
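To make the threshold analysis mentioned above concrete, here is a rough sketch (an illustrative addition, not from the original post) that converts the cross-validated churn probabilities into hard labels at several thresholds and reports the accuracy of each:
# Sweep decision thresholds over the cross-validated churn
# probabilities; the default threshold of 0.5 is not always the
# most accurate choice
probs = run_prob_cv(X, y, RF, n_estimators=10)[:, 1]
for t in np.arange(0.1, 1.0, 0.1):
    acc = np.mean((probs >= t).astype(int) == y)
    print("threshold %.1f -> accuracy %.3f" % (t, acc))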
import warnings
warnings.filterwarnings('ignore')
# Use 10 estimators so predictions are all multiples of 0.1
pred_prob = run_prob_cv(X, y, RF, n_estimators=10)
# print(pred_prob[0])
pred_churn = pred_prob[:, 1]
is_churn = y == 1
# Number of times each predicted probability is assigned to an observation
counts = pd.Series(pred_churn).value_counts()  # pd.value_counts is deprecated
# print(counts)
# For each predicted probability, compute the observed churn rate
true_prob = {}
for prob in counts.index:
    true_prob[prob] = np.mean(is_churn[pred_churn == prob])
true_prob = pd.Series(true_prob)
# Combine the counts and observed rates into one table
counts = pd.concat([counts, true_prob], axis=1).reset_index()
counts.columns = ['pred_prob', 'count', 'true_prob']
counts
- Output:
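scikit-learn ships a helper, sklearn.calibration.calibration_curve, that performs essentially the same predicted-versus-observed comparison with binning; a minimal sketch of that alternative:
# Bin the predicted probabilities and compare the mean prediction in
# each bin against the observed churn fraction
import matplotlib.pyplot as plt
from sklearn.calibration import calibration_curve
frac_churn, mean_pred = calibration_curve(y, pred_churn, n_bins=10)
plt.plot(mean_pred, frac_churn, marker='o', label='Random forest')
plt.plot([0, 1], [0, 1], linestyle='--', label='Perfectly calibrated')
plt.xlabel('Mean predicted churn probability')
plt.ylabel('Observed churn fraction')
plt.legend()
plt.show()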
6. Computing and visualizing the confusion matrix
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    # Compute the confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    # Keep only the class names for labels that appear in the data
    classes = classes[unique_labels(y_true, y_pred)]
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)
    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # Label both axes with the class names
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')
    # Rotate the x-axis tick labels
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")
    # Annotate each cell with its count (or rate when normalized)
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax
# classes must follow the sorted label order (0, 1) so the tick labels
# line up with the rows and columns of the confusion matrix
plot_confusion_matrix(y, run_cv(X, y, RF), classes=np.array([0, 1]),
                      title='Confusion matrix, without normalization')
plt.show()
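On scikit-learn 1.0 and later, the hand-rolled function above can be replaced by the built-in ConfusionMatrixDisplay; a minimal sketch:
# Built-in equivalent of the plot above (scikit-learn >= 1.0)
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_predictions(y, run_cv(X, y, RF),
                                        display_labels=[0, 1],
                                        cmap=plt.cm.Blues)
plt.show()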
- Output:
Reference: https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html#sphx-glr-auto-examples-model-selection-plot-confusion-matrix-py
Dataset download:
Link: https://pan.baidu.com/s/1sHsIet6e7KCU6o7cme8JjQ
Extraction code: 4ya3