1. Data Preparation
from xgboost import XGBClassifier
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from matplotlib import pyplot
import seaborn as sns
%matplotlib inline
dpath = './data/'
train = pd.read_csv(dpath +"Otto_train.csv")
train.head()
y_train = train['target']
y_train = y_train.map(lambda s: s[6:])       # strip the 'Class_' prefix, e.g. 'Class_3' -> '3'
y_train = y_train.map(lambda s: int(s)-1)    # shift labels to 0..8, as XGBoost expects
train = train.drop(["id", "target"], axis=1) # keep only the feature columns
X_train = np.array(train)
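The two `map` calls above turn targets like 'Class_1' ... 'Class_9' into the 0-8 integer labels that XGBoost expects. As a quick sanity check, the mapping can be tried on a toy Series (hypothetical values in the Otto target format):

import pandas as pd

# Hypothetical targets in the Otto "Class_k" format; the mapping should give k-1
toy = pd.Series(['Class_1', 'Class_5', 'Class_9'])
print(toy.map(lambda s: int(s[6:]) - 1).tolist())  # -> [0, 4, 8]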
2. Parameter Tuning
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=3)
n_estimators = 1000  # a large value is fine here: cv with early stopping returns a suitable n_estimators
xgb1 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=n_estimators,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.3,
    colsample_bytree=0.8,
    colsample_bylevel=0.7,
    objective='multi:softprob',
    seed=3)
xgtrain = xgb.DMatrix(X_train, label=y_train)
xgb1.set_params(num_class=9)  # Otto has 9 product categories (Class_1 .. Class_9)
cvresult = xgb.cv(xgb1.get_xgb_params(), xgtrain, num_boost_round=n_estimators,
                  folds=kfold, metrics='mlogloss', early_stopping_rounds=10)
n_estimators = cvresult.shape[0]  # with early stopping, cv stops at the best round
# Retrain the model with the best n_estimators found by cross-validation
xgb1.set_params(n_estimators=n_estimators)
xgb1.fit(X_train, y_train)
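For reference, the same "pick n_estimators by early stopping" step can also be done through the sklearn interface with a held-out validation split instead of xgb.cv. A minimal sketch, assuming a pre-2.0 xgboost where fit() still accepts eval_metric and early_stopping_rounds:

from sklearn.model_selection import train_test_split

# Hold out 20% of the training data purely for early stopping
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=3)

xgb_es = XGBClassifier(learning_rate=0.1, n_estimators=1000, max_depth=5,
                       min_child_weight=1, objective='multi:softprob', seed=3)
xgb_es.fit(X_tr, y_tr,
           eval_set=[(X_val, y_val)],
           eval_metric='mlogloss',
           early_stopping_rounds=10,
           verbose=False)
print(xgb_es.best_iteration)  # round with the lowest validation mlogloss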
3. Plot the CV Curves
test_means = cvresult['test-mlogloss-mean']
test_stds = cvresult['test-mlogloss-std']
train_means = cvresult['train-mlogloss-mean']
train_stds = cvresult['train-mlogloss-std']
x_axis = range(0, cvresult.shape[0])
pyplot.errorbar(x_axis, test_means, yerr=test_stds, label='Test')
pyplot.errorbar(x_axis, train_means, yerr=train_stds, label='Train')
pyplot.title("XGBoost n_estimators vs Log Loss")
pyplot.xlabel('n_estimators')
pyplot.ylabel('Log Loss')
pyplot.legend()  # labels were set above but never displayed without this
pyplot.savefig('n_estimators4_1.png')
pyplot.show()
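Because early stopping truncates the cv output at the best iteration, the last row of cvresult reports the CV mlogloss at the chosen round:

print('best n_estimators: %d' % cvresult.shape[0])
print('CV mlogloss at best round: %.5f' % cvresult['test-mlogloss-mean'].iloc[-1])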
# Zoom in: drop the first 100 rounds so the tail of the curves is readable
cvresult = cvresult.iloc[100:]
# plot
test_means = cvresult['test-mlogloss-mean']
test_stds = cvresult['test-mlogloss-std']
train_means = cvresult['train-mlogloss-mean']
train_stds = cvresult['train-mlogloss-std']
x_axis = range(100,cvresult.shape[0]+100)
fig = pyplot.figure(figsize=(10, 10), dpi=100)
pyplot.errorbar(x_axis, test_means, yerr=test_stds, label='Test')
pyplot.errorbar(x_axis, train_means, yerr=train_stds, label='Train')
pyplot.title("XGBoost n_estimators vs Log Loss")
pyplot.xlabel('n_estimators')
pyplot.ylabel('Log Loss')
pyplot.legend()
pyplot.savefig('n_estimators_detail.png')
pyplot.show()
Appendix: Parameter Tuning with GridSearchCV
max_depth = [6,7,8]
min_child_weight = [4,5,6]
param_test2_2 = dict(max_depth=max_depth, min_child_weight=min_child_weight)
# Note: make sure how many classes y has; for a binary problem, objective can be set to 'binary:logistic'
xgb2_2 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=699,  # best n_estimators from the first tuning round
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.3,
    colsample_bytree=0.8,
    colsample_bylevel=0.7,
    objective='multi:softprob',
    seed=3)
xgb2_2.set_params(num_class=9)  # Otto has 9 product categories
gsearch2_2 = GridSearchCV(xgb2_2, param_grid=param_test2_2, scoring='neg_log_loss',
                          n_jobs=-1, cv=kfold, return_train_score=True)  # train scores are read below
gsearch2_2.fit(X_train, y_train)
# summarize results
print("Best: %f using %s" % (gsearch2_2.best_score_, gsearch2_2.best_params_))
test_means = gsearch2_2.cv_results_['mean_test_score']
test_stds = gsearch2_2.cv_results_['std_test_score']
train_means = gsearch2_2.cv_results_['mean_train_score']
train_stds = gsearch2_2.cv_results_['std_train_score']
# plot results
# cv_results_ orders candidates with the parameter names sorted and the last one
# (min_child_weight) varying fastest, so rows correspond to max_depth
test_scores = np.array(test_means).reshape(len(max_depth), len(min_child_weight))
train_scores = np.array(train_means).reshape(len(max_depth), len(min_child_weight))

for i, value in enumerate(min_child_weight):
    pyplot.plot(max_depth, test_scores[:, i], label='test_min_child_weight:' + str(value))
# for i, value in enumerate(min_child_weight):
#     pyplot.plot(max_depth, train_scores[:, i], label='train_min_child_weight:' + str(value))
pyplot.legend()
pyplot.xlabel('max_depth')
pyplot.ylabel('- Log Loss')
pyplot.show()
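After the search, the winning pair can be folded back into the working model before the next tuning round (gamma, subsample, etc.). A short sketch; the actual best values depend on the run above:

# Adopt the best max_depth / min_child_weight found by the grid search
xgb2_2.set_params(**gsearch2_2.best_params_)
xgb2_2.fit(X_train, y_train)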