1. Data Preparation
from xgboost import XGBClassifier
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from matplotlib import pyplot
import seaborn as sns
%matplotlib inline
dpath = './data/'
train = pd.read_csv(dpath +"Otto_train.csv")
train.head()
y_train = train['target']
y_train = y_train.map(lambda s: s[6:])       # strip the 'Class_' prefix, e.g. 'Class_3' -> '3'
y_train = y_train.map(lambda s: int(s)-1)    # shift labels to 0..8, as XGBoost expects
train = train.drop(["id", "target"], axis=1) # keep only the feature columns
X_train = np.array(train)
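The two `map` calls above turn targets like 'Class_1' ... 'Class_9' into the 0-8 integer labels that XGBoost expects. As a quick sanity check, the mapping can be tried on a toy Series (hypothetical values in the Otto target format):

import pandas as pd

# Hypothetical targets in the Otto "Class_k" format; the mapping should give k-1
toy = pd.Series(['Class_1', 'Class_5', 'Class_9'])
print(toy.map(lambda s: int(s[6:]) - 1).tolist())  # -> [0, 4, 8]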
2. Parameter Tuning
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=3)
n_estimators = 1000  # a large value is fine here: cv with early stopping returns a suitable n_estimators
xgb1 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=n_estimators,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.3,
    colsample_bytree=0.8,
    colsample_bylevel=0.7,
    objective='multi:softprob',
    seed=3)
xgtrain = xgb.DMatrix(X_train, label=y_train)
xgb1.set_params(num_class=9)  # Otto has 9 product categories (Class_1 .. Class_9)
cvresult = xgb.cv(xgb1.get_xgb_params(), xgtrain, num_boost_round=n_estimators,
                  folds=kfold, metrics='mlogloss', early_stopping_rounds=10)
n_estimators = cvresult.shape[0]  # with early stopping, cv stops at the best round
# Retrain the model with the best n_estimators found by cross-validation
xgb1.set_params(n_estimators=n_estimators)
xgb1.fit(X_train, y_train)
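For reference, the same "pick n_estimators by early stopping" step can also be done through the sklearn interface with a held-out validation split instead of xgb.cv. A minimal sketch, assuming a pre-2.0 xgboost where fit() still accepts eval_metric and early_stopping_rounds:

from sklearn.model_selection import train_test_split

# Hold out 20% of the training data purely for early stopping
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=3)

xgb_es = XGBClassifier(learning_rate=0.1, n_estimators=1000, max_depth=5,
                       min_child_weight=1, objective='multi:softprob', seed=3)
xgb_es.fit(X_tr, y_tr,
           eval_set=[(X_val, y_val)],
           eval_metric='mlogloss',
           early_stopping_rounds=10,
           verbose=False)
print(xgb_es.best_iteration)  # round with the lowest validation mlogloss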
3. Plot the CV Curves
test_means = cvresult['test-mlogloss-mean']
test_stds = cvresult['test-mlogloss-std']
train_means = cvresult['train-mlogloss-mean']
train_stds = cvresult['train-mlogloss-std']
x_axis = range(0, cvresult.shape[0])
pyplot.errorbar(x_axis, test_means, yerr=test_stds, label='Test')
pyplot.errorbar(x_axis, train_means, yerr=train_stds, label='Train')
pyplot.title("XGBoost n_estimators vs Log Loss")
pyplot.xlabel('n_estimators')
pyplot.ylabel('Log Loss')
pyplot.legend()  # labels were set above but never displayed without this
pyplot.savefig('n_estimators4_1.png')
pyplot.show()
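Because early stopping truncates the cv output at the best iteration, the last row of cvresult reports the CV mlogloss at the chosen round:

print('best n_estimators: %d' % cvresult.shape[0])
print('CV mlogloss at best round: %.5f' % cvresult['test-mlogloss-mean'].iloc[-1])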
# Zoom in: drop the first 100 rounds so the tail of the curves is readable
cvresult = cvresult.iloc[100:]
# plot
test_means = cvresult['test-mlogloss-mean']
test_stds = cvresult['test-mlogloss-std']
train_means = cvresult['train-mlogloss-mean']
train_stds = cvresult['train-mlogloss-std']
x_axis = range(100,cvresult.shape[0]+100)
fig = pyplot.figure(figsize=(10, 10), dpi=100)
pyplot.errorbar(x_axis, test_means, yerr=test_stds, label='Test')
pyplot.errorbar(x_axis, train_means, yerr=train_stds, label='Train')
pyplot.title("XGBoost n_estimators vs Log Loss")
pyplot.xlabel('n_estimators')
pyplot.ylabel('Log Loss')
pyplot.legend()
pyplot.savefig('n_estimators_detail.png')
pyplot.show()
Appendix: Parameter Tuning with GridSearchCV
max_depth = [6,7,8]
min_child_weight = [4,5,6]
param_test2_2 = dict(max_depth=max_depth, min_child_weight=min_child_weight)
# Note: make sure how many classes y has; for a binary problem, objective can be set to 'binary:logistic'
xgb2_2 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=699,  # best n_estimators from the first tuning round
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.3,
    colsample_bytree=0.8,
    colsample_bylevel=0.7,
    objective='multi:softprob',
    seed=3)
xgb2_2.set_params(num_class=9)  # Otto has 9 product categories
gsearch2_2 = GridSearchCV(xgb2_2, param_grid=param_test2_2, scoring='neg_log_loss',
                          n_jobs=-1, cv=kfold, return_train_score=True)  # train scores are read below
gsearch2_2.fit(X_train, y_train)
# summarize results
print("Best: %f using %s" % (gsearch2_2.best_score_, gsearch2_2.best_params_))
test_means = gsearch2_2.cv_results_['mean_test_score']
test_stds = gsearch2_2.cv_results_['std_test_score']
train_means = gsearch2_2.cv_results_['mean_train_score']
train_stds = gsearch2_2.cv_results_['std_train_score']
# plot results
# cv_results_ orders candidates with the parameter names sorted and the last one
# (min_child_weight) varying fastest, so rows correspond to max_depth
test_scores = np.array(test_means).reshape(len(max_depth), len(min_child_weight))
train_scores = np.array(train_means).reshape(len(max_depth), len(min_child_weight))

for i, value in enumerate(min_child_weight):
    pyplot.plot(max_depth, test_scores[:, i], label='test_min_child_weight:' + str(value))
# for i, value in enumerate(min_child_weight):
#     pyplot.plot(max_depth, train_scores[:, i], label='train_min_child_weight:' + str(value))
pyplot.legend()
pyplot.xlabel('max_depth')
pyplot.ylabel('- Log Loss')
pyplot.show()
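After the search, the winning pair can be folded back into the working model before the next tuning round (gamma, subsample, etc.). A short sketch; the actual best values depend on the run above:

# Adopt the best max_depth / min_child_weight found by the grid search
xgb2_2.set_params(**gsearch2_2.best_params_)
xgb2_2.fit(X_train, y_train)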