本系列所有的代碼和資料都可以從陳強老師的個人首頁上下載下傳:Python資料程式
參考書目:陳強.機器學習及Python應用. 北京:高等教育出版社, 2021.
本系列基本不講數學原理,隻從代碼角度去讓讀者們利用最簡潔的Python代碼實作機器學習方法。
梯度提升也是屬于內建方法,最早的提升法是Adaboost,後來拓寬到梯度提升法GBM,然後又有了一些改進的版本,例如XGboost,LightGBM,都是目前競賽,項目會采用的主流算法。是真正的具有使用做項目的價值。
首先采用GBM去拟合正弦函數,觀察估計器個數對于拟合曲線的影響。
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import GradientBoostingRegressor
np.random.seed(1)
x = np.random.uniform(0, 1, 500)
y = np.sin(2 * np.pi * x) + np.random.normal(0, 0.1, 500)
data = pd.DataFrame(x, columns=['x'])
data['y'] = y
w = np.linspace(0, 1, 100)
# Plot the DGP
sns.scatterplot(x='x', y='y', s=20, data=data, alpha=0.3)
plt.plot(w, np.sin(2 * np.pi * w))
plt.title('Data Generating Process')
拟合
for i, m in zip([1, 2, 3, 4], [1, 10, 100, 1000]):
model = GradientBoostingRegressor(n_estimators=m, max_depth=1, learning_rate=1, random_state=123)
model.fit(x.reshape(-1, 1), y)
pred = model.predict(w.reshape(-1, 1))
plt.subplot(2, 2, i)
plt.plot(w, np.sin(2 * np.pi * w), 'k', linewidth=1)
plt.plot(w, pred, 'b')
plt.text(0.65, 0.8, f'M = {m}')
plt.subplots_adjust(wspace=0.4, hspace=0.4)
## GBM for Regression on Boston Housing Data
可以看出,估計器個數越多,回歸結果越來越接近真實值。
回歸提升法的Python案例
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.datasets import load_boston
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import plot_roc_curve
from sklearn.inspection import plot_partial_dependence
依舊采用波士頓房價資料集。
Boston = load_boston()
X = pd.DataFrame(Boston.data, columns=Boston.feature_names)
y = Boston.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
model = GradientBoostingRegressor(random_state=123)
model.fit(X_train, y_train)
model.score(X_test, y_test)
gbm的參數太多了,網格化搜不完,使用随機搜尋超參數
# Choose best hyperparamters by RandomizedSearchCV
param_distributions = {'n_estimators': range(1, 300), 'max_depth': range(1, 10),
'subsample': np.linspace(0.1,1,10), 'learning_rate': np.linspace(0.1, 1, 10)}
kfold = KFold(n_splits=10, shuffle=True, random_state=1)
model = RandomizedSearchCV(estimator=GradientBoostingRegressor(random_state=123),
param_distributions=param_distributions, cv=kfold, n_iter=300, random_state=0)
model.fit(X_train, y_train)
model.best_params_
model = model.best_estimator_
model.score(X_test, y_test)
和随機森林一樣,可以畫變量重要性圖
sorted_index = model.feature_importances_.argsort()
plt.barh(range(X.shape[1]), model.feature_importances_[sorted_index])
plt.yticks(np.arange(X.shape[1]), X.columns[sorted_index])
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.title('Gradient Boosting')
plt.tight_layout()
偏依賴圖
plot_partial_dependence(model, X_train, ['LSTAT', 'RM'])
手工循環找最優超參數n(估計器的個數)
# Number of trees and prediction performance
scores = []
for n_estimators in range(1, 301):
model = GradientBoostingRegressor(n_estimators=n_estimators, subsample=0.5, max_depth=5, learning_rate=0.1, random_state=123)
model.fit(X_train, y_train)
pred = model.predict(X_test)
mse = mean_squared_error(y_test, pred)
scores.append(mse)
index = np.argmin(scores)
range(1, 301)[index]
plt.plot(range(1, 301), scores)
plt.axvline(range(1, 301)[index], linestyle='--', color='k', linewidth=1)
plt.xlabel('Number of Trees')
plt.ylabel('MSE')
plt.title('MSE on Test Set')
137時mse最小
分類提升法的Python案例
采用一個玻璃種類的多分類資料集,讀取資料,劃分訓練測試集
#多分類問題
### Boosting for Multiple Classification
Glass = pd.read_csv('Glass.csv')
Glass.shape
X = Glass.iloc[:, :-1]
y = Glass.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=0)
model = GradientBoostingClassifier(random_state=123)
model.fit(X_train, y_train)
model.score(X_test, y_test)
随機搜參
param_distributions = {'n_estimators': range(1, 300),'max_depth': range(1, 10),
'subsample': np.linspace(0.1, 1, 10),'learning_rate': np.linspace(0.1, 1, 10)}
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
model = RandomizedSearchCV(estimator=GradientBoostingClassifier(random_state=123),
param_distributions=param_distributions, n_iter=50, cv=kfold, random_state=66)
model.fit(X_train, y_train)
model.best_params_
model = model.best_estimator_
model.score(X_test, y_test)
變量重要性排序圖
sorted_index = model.feature_importances_.argsort()
plt.barh(range(X.shape[1]), model.feature_importances_[sorted_index])
plt.yticks(np.arange(X.shape[1]), X.columns[sorted_index])
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.title('Gradient Boosting')
偏依賴圖
# Partial Dependence Plot
plot_partial_dependence(model, X_train, ['Mg'], target=1)
plt.title('Reponse: Glass.Type = 1')
計算混淆矩陣
# Prediction Performance
prob = model.predict_proba(X_test)
prob[:3, :]
pred = model.predict(X_test)
pred[:5]
pd.crosstab(y_test, pred, rownames=['Actual'], colnames=['Predicted'])
XGboos的Pythont案例
### XGBoost for Boston Housing Data
# conda install py-xgboost
import xgboost as xgb
X, y= load_boston(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
objective='reg:squarederror'參數表示回歸問題
model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=300, max_depth=6,
subsample=0.6, colsample_bytree=0.8, learning_rate=0.1, random_state=0)
model.fit(X_train, y_train)
model.score(X_test, y_test)
計算誤差
pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, pred))
rmse
交叉驗證找超參數
params = {'objective': 'reg:squarederror', 'max_depth': 6, 'subsample': 0.6,
'colsample_bytree': 0.8, 'learning_rate': 0.1}
dtrain = xgb.DMatrix(data=X_train, label=y_train)
type(dtrain)
results = xgb.cv(dtrain=dtrain, params=params, nfold=10, metrics="rmse",
num_boost_round=300, as_pandas=True, seed=123)
results.shape
results.tail()
# Plot CV Errors
plt.plot(range(1, 301), results['train-rmse-mean'], 'k', label='Training Error')
plt.plot(range(1, 301), results['test-rmse-mean'], 'b', label='Test Error')
plt.xlabel('Number of Trees')
plt.ylabel('RMSE')
plt.axhline(0, linestyle='--', color='k', linewidth=1)
plt.legend()
plt.title('CV Errors for XGBoost')
xgboost的分類問題
### XGBoost for spam Data
spam = pd.read_csv('spam.csv')
X = spam.iloc[:, :-1]
y = spam.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=0)
model = xgb.XGBClassifier(objective='binary:logistic', n_estimators=300, max_depth=6,
subsample=0.6, colsample_bytree=0.8, learning_rate=0.1, random_state=0)
model.fit(X_train, y_train)
binary:logistic表是使用邏輯損失函數去分類
計算混淆矩陣
model.score(X_test, y_test)
prob = model.predict_proba(X_test)
prob[:5, :]
pred = model.predict(X_test)
pred[:5]
pd.crosstab(y_test, pred, rownames=['Actual'], colnames=['Predicted'])
xgboost和lightgbm用法還有很多,後面應該還會單獨出一章