1.数据读取与介绍
- 导入相关库及模块
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
- 利用pandas进行数据读取,通过info()函数了解该数据的大致信息
# Load the shot records from the CSV file and summarise the resulting frame
# (column names, dtypes and non-null counts).
file_name = 'data.csv'
data = pd.read_csv(file_name)
print('****该数据的大致信息如下****')
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would append a stray "None" line after the summary.
data.info()
- 打印前五行数据
- 获取该数据的行数与列数
输出为:该数据共有30697条记录,25个特征项
通过以上结果显示,我们得到:该数据在shot_made_flag字段上缺失值较多,且该字段为标签项,0表示未射入球门,1表示射入球门,所以需删去shot_made_flag项为NaN的记录行。
# Keep only the rows whose shot_made_flag label is present, then print the
# frame summary again to confirm the remaining row count.
label_missing = data['shot_made_flag'].isnull()
data = data[~label_missing]
data.info()
2.特征数据可视化展示
- 将射球时相对于球门的位置(loc_x,loc_y),(lat,lon)在图形中展示出来
# Two side-by-side scatter plots of the shot coordinates: the (loc_x, loc_y)
# columns on the left and the (lon, lat) columns on the right.
# Set the canvas size.
plt.figure(figsize=(12, 12))
# One tuple per subplot: position, title, axis labels, data columns, colour.
panels = [
    (121, 'the location of the shot', 'loc_x', 'loc_y', 'loc_x', 'loc_y', 'g'),
    (122, 'the site of the shot', 'longitude', 'latitude', 'lon', 'lat', 'r'),
]
for position, title, x_label, y_label, x_col, y_col, colour in panels:
    plt.subplot(position)
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    # Very low alpha so that dense regions show up as darker areas.
    plt.scatter(data[x_col], data[y_col], color=colour, alpha=0.02)
3.数据预处理
上面可视化图形显示,科比的射门位置大致呈半圆形,因此构建两个新的字段值dist和angle,其中dist=sqrt(loc_x^2 + loc_y^2),angle为夹角,即angle=arctan(loc_y/loc_x)(当loc_x=0时取π/2)。
# Derive polar-style features from the (loc_x, loc_y) coordinates:
#   dist  = sqrt(loc_x^2 + loc_y^2)
#   angle = arctan(loc_y / loc_x), or pi/2 where loc_x == 0
data['dist'] = np.sqrt(data['loc_x']**2 + data['loc_y']**2)
loc_x_zero = data['loc_x'] == 0
# Start from the loc_x == 0 value so the column is float from the outset,
# then fill the remaining rows with a single .loc assignment. Chained
# indexing (data['angle'][mask] = ...) may write to a temporary copy and is
# a no-op under pandas copy-on-write.
data['angle'] = np.pi / 2
data.loc[~loc_x_zero, 'angle'] = np.arctan(
    data.loc[~loc_x_zero, 'loc_y'] / data.loc[~loc_x_zero, 'loc_x']
)
- 构建新的字段remaining_time
- 打印字段action_type、combined_shot_type、shot_type的取值,以及shot_type各取值的出现次数
# Print the distinct values of each shot-description column, followed by the
# number of rows per shot_type value.
for column in ('action_type', 'combined_shot_type', 'shot_type'):
    print(data[column].unique())
print(data['shot_type'].value_counts())
打印字段season
输出:array([‘2000-01’, ‘2001-02’, ‘2002-03’, ‘2003-04’, ‘2004-05’, ‘2005-06’,
‘2006-07’, ‘2007-08’, ‘2008-09’, ‘2009-10’, ‘2010-11’, ‘2011-12’,
‘2012-13’, ‘2013-14’, ‘2014-15’, ‘2015-16’, ‘1996-97’, ‘1997-98’,
‘1998-99’, ‘1999-00’], dtype=object)
- 构建新列
# Replace each season label with the integer after its first '-'
# (for example '2000-01' becomes 1), then inspect the distinct values.
data['season'] = [int(label.split('-')[1]) for label in data['season']]
data['season'].unique()
- 可视化shot_distance与dist之间的关系
# Scatter the derived dist column against the original shot_distance column
# to see how the two relate.
plt.figure(figsize=(5, 5))
plt.title('dist and shot_distance')
plt.xlabel('dist')
plt.ylabel('shot_distance')
plt.scatter(data['dist'], data['shot_distance'], color='blue')
- 删除多余字段
# Columns removed before modelling.
drops = [
    'shot_id', 'team_id', 'team_name', 'shot_zone_area', 'shot_zone_range',
    'shot_zone_basic', 'matchup', 'lon', 'lat', 'seconds_remaining',
    'minutes_remaining', 'shot_distance', 'loc_x', 'loc_y', 'game_event_id',
    'game_id', 'game_date',
]
# Drop everything in one call using the `columns` keyword: the positional
# `axis` argument (data.drop(label, 1)) is not accepted by pandas >= 2.0.
data = data.drop(columns=drops)
- 将分类字段转化为数值型
# One-hot encode the categorical columns. Each listed column is replaced by
# indicator columns named "<column>_<value>", appended after the remaining
# columns.
categorical_vars = ['action_type', 'combined_shot_type', 'shot_type', 'opponent', 'period', 'season']
# get_dummies(columns=...) does the encode-and-drop in one step and avoids the
# positional `axis` arguments of pd.concat(..., 1) / data.drop(var, 1), which
# pandas >= 2.0 rejects.
data = pd.get_dummies(data, columns=categorical_vars)
4.使用scikit-learn建立模型
- 构造训练集
# Split the frame into the feature matrix (every column except the label)
# and the label column shot_made_flag.
train_label = data['shot_made_flag']
# DataFrame.drop returns a new frame, so `data` itself is left untouched.
train_kobe = data.drop(columns='shot_made_flag')
- 导入相关库及模块
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix,log_loss
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
- 机器学习步骤如下:
在这里面,我们采用随机森林集成算法,对科比是否进球进行合理分类,代码主要实现目标为:
- 寻求随机森林中树的最优构建数量
- 寻求树的最优深度,防止过度拟合现象发生
# Find the best n_estimators for RandomForestClassifier.
# Each candidate forest size is scored by the mean log loss over a shuffled
# K-fold cross-validation; the size with the lowest mean loss is kept in
# best_n, and every mean loss is recorded in scores_n.
print('Finding best n_estimators for RandomForestClassifier...')
n_splits = 10
min_score = 100000
best_n = 0
scores_n = []
range_n = np.logspace(0, 2, num=3).astype(int)  # candidates: 1, 10, 100
for n in range_n:
    print("the number of trees : {0}".format(n))
    t1 = time.time()
    rfc_score = 0.
    rfc = RandomForestClassifier(n_estimators=int(n))
    # sklearn.model_selection.KFold is configured with n_splits and yields
    # (train, test) index arrays from .split(); the older
    # KFold(n, n_folds=...) signature raises TypeError with this class.
    folds = KFold(n_splits=n_splits, shuffle=True)
    for train_k, test_k in folds.split(train_kobe):
        rfc.fit(train_kobe.iloc[train_k], train_label.iloc[train_k])
        # log_loss scores predicted probabilities; hard 0/1 labels from
        # predict() would collapse it to a rescaled error rate.
        pred = rfc.predict_proba(train_kobe.iloc[test_k])
        rfc_score += log_loss(train_label.iloc[test_k], pred, labels=rfc.classes_) / n_splits
    scores_n.append(rfc_score)
    if rfc_score < min_score:
        min_score = rfc_score
        best_n = n
    t2 = time.time()
    print('Done processing {0} trees ({1:.3f}sec)'.format(n, t2-t1))
print(best_n, min_score)
# Find the best max_depth for RandomForestClassifier.
# Uses the forest size already chosen in best_n. Each candidate depth is
# scored by the mean log loss over a shuffled K-fold cross-validation; the
# depth with the lowest mean loss is kept in best_m, and every mean loss is
# recorded in scores_m.
print('Finding best max_depth for RandomForestClassifier...')
n_splits = 10
min_score = 100000
best_m = 0
scores_m = []
range_m = np.logspace(0, 2, num=3).astype(int)  # candidates: 1, 10, 100
for m in range_m:
    print("the max depth : {0}".format(m))
    t1 = time.time()
    rfc_score = 0.
    rfc = RandomForestClassifier(max_depth=int(m), n_estimators=int(best_n))
    # sklearn.model_selection.KFold is configured with n_splits and yields
    # (train, test) index arrays from .split(); the older
    # KFold(n, n_folds=...) signature raises TypeError with this class.
    folds = KFold(n_splits=n_splits, shuffle=True)
    for train_k, test_k in folds.split(train_kobe):
        rfc.fit(train_kobe.iloc[train_k], train_label.iloc[train_k])
        # log_loss scores predicted probabilities; hard 0/1 labels from
        # predict() would collapse it to a rescaled error rate.
        pred = rfc.predict_proba(train_kobe.iloc[test_k])
        rfc_score += log_loss(train_label.iloc[test_k], pred, labels=rfc.classes_) / n_splits
    scores_m.append(rfc_score)
    if rfc_score < min_score:
        min_score = rfc_score
        best_m = m
    t2 = time.time()
    # The value reported here is a depth, not a tree count.
    print('Done processing max depth {0} ({1:.3f}sec)'.format(m, t2-t1))
print(best_m, min_score)
输出结果如下:
结果显示:最优树数量为100,树最大深度为10
- 可视化在不同数量树以及树深度下随机森林的对数损失(log loss,即交叉熵),对数损失越大,表明此时预测的不确定性越大,即预测的准确性越低