天天看點

Python機器學習10——梯度提升

本系列所有的代碼和資料都可以從陳強老師的個人首頁上下載下傳:Python資料程式

參考書目:陳強.機器學習及Python應用. 北京:高等教育出版社, 2021.

本系列基本不講數學原理,隻從代碼角度去讓讀者們利用最簡潔的Python代碼實作機器學習方法。

梯度提升也是屬于內建方法,最早的提升法是Adaboost,後來拓寬到梯度提升法GBM,然後又有了一些改進的版本,例如XGboost,LightGBM,都是目前競賽,項目會采用的主流算法。是真正的具有使用做項目的價值。

首先采用GBM去拟合正弦函數,觀察估計器個數對于拟合曲線的影響。

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import GradientBoostingRegressor

np.random.seed(1)
x = np.random.uniform(0, 1, 500)
y = np.sin(2 * np.pi * x) + np.random.normal(0, 0.1, 500)
data = pd.DataFrame(x, columns=['x'])
data['y'] = y
w = np.linspace(0, 1, 100)

# Plot the DGP
sns.scatterplot(x='x', y='y', s=20, data=data, alpha=0.3)
plt.plot(w, np.sin(2 * np.pi * w))
plt.title('Data Generating Process')
           
Python機器學習10——梯度提升

 拟合

for i, m in zip([1, 2, 3, 4], [1, 10, 100, 1000]):
    model = GradientBoostingRegressor(n_estimators=m, max_depth=1, learning_rate=1, random_state=123)
    model.fit(x.reshape(-1, 1), y)
    pred = model.predict(w.reshape(-1, 1))
    plt.subplot(2, 2, i)
    plt.plot(w, np.sin(2 * np.pi * w), 'k', linewidth=1)
    plt.plot(w, pred, 'b')
    plt.text(0.65, 0.8, f'M = {m}')
plt.subplots_adjust(wspace=0.4, hspace=0.4)

## GBM for Regression on Boston Housing Data
           
Python機器學習10——梯度提升

 可以看出,估計器個數越多,回歸結果越來越接近真實值。

回歸提升法的Python案例

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.datasets import load_boston
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import plot_roc_curve
from sklearn.inspection import plot_partial_dependence
           

 依舊采用波士頓房價資料集。

Boston = load_boston()
X = pd.DataFrame(Boston.data, columns=Boston.feature_names)
y = Boston.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

model = GradientBoostingRegressor(random_state=123)
model.fit(X_train, y_train)
model.score(X_test, y_test)
           

gbm的參數太多了,網格化搜不完,使用随機搜尋超參數

# Choose best hyperparamters by RandomizedSearchCV

param_distributions = {'n_estimators': range(1, 300), 'max_depth': range(1, 10),
                       'subsample': np.linspace(0.1,1,10), 'learning_rate': np.linspace(0.1, 1, 10)}
kfold = KFold(n_splits=10, shuffle=True, random_state=1)
model = RandomizedSearchCV(estimator=GradientBoostingRegressor(random_state=123),
                           param_distributions=param_distributions, cv=kfold, n_iter=300, random_state=0)
model.fit(X_train, y_train)

model.best_params_

model = model.best_estimator_

model.score(X_test, y_test)
           

和随機森林一樣,可以畫變量重要性圖

sorted_index = model.feature_importances_.argsort()
plt.barh(range(X.shape[1]), model.feature_importances_[sorted_index])
plt.yticks(np.arange(X.shape[1]), X.columns[sorted_index])
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.title('Gradient Boosting')
plt.tight_layout()
           
Python機器學習10——梯度提升

 偏依賴圖

plot_partial_dependence(model, X_train, ['LSTAT', 'RM'])

           
Python機器學習10——梯度提升

 手工循環找最優超參數n(估計器的個數)

# Number of trees and prediction performance
scores = []
for n_estimators in range(1, 301):
    model = GradientBoostingRegressor(n_estimators=n_estimators, subsample=0.5, max_depth=5, learning_rate=0.1, random_state=123)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    mse = mean_squared_error(y_test, pred)
    scores.append(mse)

index = np.argmin(scores)
range(1, 301)[index]


plt.plot(range(1, 301), scores)
plt.axvline(range(1, 301)[index], linestyle='--', color='k', linewidth=1)
plt.xlabel('Number of Trees')
plt.ylabel('MSE')
plt.title('MSE on Test Set')
           
Python機器學習10——梯度提升

137時mse最小

分類提升法的Python案例

采用一個玻璃種類的多分類資料集,讀取資料,劃分訓練測試集

#多分類問題
### Boosting for Multiple Classification 
Glass = pd.read_csv('Glass.csv')
Glass.shape

X = Glass.iloc[:, :-1]
y = Glass.iloc[:, -1]
X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size=0.3, stratify=y, random_state=0)

model = GradientBoostingClassifier(random_state=123)
model.fit(X_train, y_train)
model.score(X_test, y_test)
           

随機搜參

param_distributions = {'n_estimators': range(1, 300),'max_depth': range(1, 10),
                       'subsample': np.linspace(0.1, 1, 10),'learning_rate': np.linspace(0.1, 1, 10)}
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
model = RandomizedSearchCV(estimator=GradientBoostingClassifier(random_state=123),
              param_distributions=param_distributions, n_iter=50, cv=kfold, random_state=66)
model.fit(X_train, y_train)

model.best_params_

model = model.best_estimator_
model.score(X_test, y_test)

           

變量重要性排序圖

sorted_index = model.feature_importances_.argsort()
plt.barh(range(X.shape[1]), model.feature_importances_[sorted_index])
plt.yticks(np.arange(X.shape[1]), X.columns[sorted_index])
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.title('Gradient Boosting')
           
Python機器學習10——梯度提升

偏依賴圖

# Partial Dependence Plot
plot_partial_dependence(model, X_train, ['Mg'], target=1)
plt.title('Reponse: Glass.Type = 1')
           
Python機器學習10——梯度提升

 計算混淆矩陣

# Prediction Performance
prob = model.predict_proba(X_test)
prob[:3, :]

pred = model.predict(X_test)
pred[:5]

pd.crosstab(y_test, pred, rownames=['Actual'], colnames=['Predicted'])
           
Python機器學習10——梯度提升

 XGboos的Pythont案例

### XGBoost for Boston Housing Data

# conda install py-xgboost
import xgboost as xgb

X, y= load_boston(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
           

 objective='reg:squarederror'參數表示回歸問題

model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=300, max_depth=6, 
         subsample=0.6, colsample_bytree=0.8, learning_rate=0.1, random_state=0)
model.fit(X_train, y_train)
model.score(X_test, y_test)
           

計算誤差

pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, pred))
rmse
           

交叉驗證找超參數

params = {'objective': 'reg:squarederror', 'max_depth': 6, 'subsample': 0.6,
          'colsample_bytree': 0.8, 'learning_rate': 0.1}

dtrain = xgb.DMatrix(data=X_train, label=y_train)
type(dtrain)


results = xgb.cv(dtrain=dtrain, params=params, nfold=10, metrics="rmse",
                    num_boost_round=300, as_pandas=True, seed=123)
results.shape

results.tail()

# Plot CV Errors
plt.plot(range(1, 301), results['train-rmse-mean'], 'k', label='Training Error')
plt.plot(range(1, 301), results['test-rmse-mean'], 'b', label='Test Error')
plt.xlabel('Number of Trees')
plt.ylabel('RMSE')
plt.axhline(0, linestyle='--', color='k', linewidth=1)
plt.legend()
plt.title('CV Errors for XGBoost')
           
Python機器學習10——梯度提升

xgboost的分類問題

### XGBoost for spam Data

spam = pd.read_csv('spam.csv')
X = spam.iloc[:, :-1]
y = spam.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=0)

model = xgb.XGBClassifier(objective='binary:logistic', n_estimators=300, max_depth=6, 
         subsample=0.6, colsample_bytree=0.8, learning_rate=0.1, random_state=0)
model.fit(X_train, y_train)
           

binary:logistic表是使用邏輯損失函數去分類

計算混淆矩陣

model.score(X_test, y_test)

prob = model.predict_proba(X_test)
prob[:5, :]

pred = model.predict(X_test)
pred[:5]

pd.crosstab(y_test, pred, rownames=['Actual'], colnames=['Predicted'])
           
Python機器學習10——梯度提升

 xgboost和lightgbm用法還有很多,後面應該還會單獨出一章