天天看點

模組化-特征篩選

第二次任務:對已經完成預處理的資料變量,使用IV值和隨機森林的特征重要性進行篩選。

目錄:

1、導入資料

2、IV值計算

3、importance計算

4、特征篩選

1、導入資料

# Import required packages
import numpy as np
import pandas as pd
import LR as lr  # NOTE(review): project-local module, never used in the visible code — confirm it is needed
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

#### Load the preprocessed data set (output of the earlier preprocessing task)
data =  pd.read_csv(r'/data/1/home/mabufa/data/task/data_task02.csv')

#### Separate the binary label ('status') from the feature columns
label = data['status']
data_var = data.drop(['status'], axis=1)


2、IV值計算

####定義IV值計算公式
def calc_iv(df, feature, target, pr = False):
    '''
    Compute the Information Value (IV) of one feature against a binary target.

    input:
        df: pandas.DataFrame containing `feature` and `target` columns
        feature: independent variable (column name)
        target: good/bad label column; assumed binary with 0 = good, 1 = bad
        pr: True to enable printing of the per-value table and total IV
    output:
        iv: float, total IV of the feature
        data: pandas.DataFrame, one row per distinct feature value with
              counts, WoE and per-value IV contribution
    '''
    # Work on a copy of the two relevant columns: the original code did
    # `df[feature] = df[feature].fillna('NULL')`, mutating the caller's
    # frame and coercing numeric columns to object dtype as a side effect.
    sub = df[[feature, target]].copy()
    sub[feature] = sub[feature].fillna('NULL')

    # Aggregate per distinct value in a single groupby pass instead of the
    # original O(n_unique * n) loop that re-filtered the frame per value.
    all_cnt = sub.groupby(feature, sort=False).size()
    good_cnt = (sub.loc[sub[target] == 0]
                .groupby(feature, sort=False).size()
                .reindex(all_cnt.index, fill_value=0))
    bad_cnt = (sub.loc[sub[target] == 1]
               .groupby(feature, sort=False).size()
               .reindex(all_cnt.index, fill_value=0))

    data = pd.DataFrame({'Variable': feature,
                         'Value': all_cnt.index,
                         'All': all_cnt.to_numpy(),
                         'Good': good_cnt.to_numpy(),
                         'Bad': bad_cnt.to_numpy()})

    data['Share'] = data['All'] / data['All'].sum()          # share of each group
    data['Bad Rate'] = data['Bad'] / data['All']
    data['Distribution Good'] = (data['All'] - data['Bad']) / (data['All'].sum() - data['Bad'].sum())
    data['Distribution Bad'] = data['Bad'] / data['Bad'].sum()
    data['WoE'] = np.log(data['Distribution Good'] / data['Distribution Bad'])

    # Groups with zero good or zero bad produce +/-inf WoE; neutralize them
    # so they contribute nothing to the IV sum.
    data = data.replace({'WoE': {np.inf: 0, -np.inf: 0}})

    data['IV'] = data['WoE'] * (data['Distribution Good'] - data['Distribution Bad'])

    data = data.sort_values(by=['Variable', 'Value'], ascending=[True, True])
    data = data.reset_index(drop=True)

    if pr:
        print(data)
        print('IV = ',data['IV'].sum())

    iv = data['IV'].sum()
    # BUG FIX: the original ended with `data = data.append(data)`, which
    # returned every row duplicated (and DataFrame.append was removed in
    # pandas 2.0). The table is returned as-is.
    return iv, data
           
## Compute the IV of every feature column against the 'status' label.
IV_dict = {}
f_col = data_var.columns

for col in f_col:
    IV_1, df = calc_iv(data, col, 'status')
    IV_dict[col] = IV_1

# Rank features by IV in descending order to ease the later selection step.
IV_dict_sorted = sorted(IV_dict.items(), key=lambda kv: kv[1], reverse=True)
IV_name = [name for name, _ in IV_dict_sorted]
IV_values = [value for _, value in IV_dict_sorted]

# Bar chart of the ranked IVs.
plt.figure(figsize=(20,6))
plt.title('feature IV')
plt.bar(range(len(IV_values)),IV_values)
           
模組化-特征篩選

3、importance計算

#### Random forest: tune n_estimators, then inspect feature importances.

# Stage 1: coarse scan over a wide range of forest sizes.
param = {'n_estimators': list(range(10, 1001, 50))}
coarse_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=2019),
    param_grid=param,
    cv=5,
)
coarse_search.fit(data_var, label)
coarse_search.best_estimator_

# Stage 2: refine around the optimum found by the coarse scan.
param = {'n_estimators': list(range(770, 870, 10))}
forest_grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=2019),
    param_grid=param,
    cv=5,
)
forest_grid.fit(data_var, label)
rnd_clf = forest_grid.best_estimator_
rnd_clf
           
## Map each feature name to its random-forest importance score.
f_importance = {}

importances = rnd_clf.feature_importances_
# argsort gives ascending order; reverse the index array for descending.
indices = np.argsort(importances)[::-1]
f_importance = {f_col[idx]: importances[idx] for idx in indices}

# Rank features by importance in descending order for the selection step.
im_dict_sorted = sorted(f_importance.items(), key=lambda kv: kv[1], reverse=True)
im_name = [name for name, _ in im_dict_sorted]
im_values = [score for _, score in im_dict_sorted]

# Bar chart of the ranked importances.
plt.figure(figsize=(20,6))
plt.title('feature importances')
plt.bar(range(len(im_values)),im_values)
           
模組化-特征篩選

4、特征篩選

## Combine the IV and importance rankings into a single table.

df_iv = pd.DataFrame(IV_dict_sorted, columns=['vars', 'iv'])
df_im = pd.DataFrame(im_dict_sorted, columns=['vars', 'importances'])
df_iv_im = df_iv.merge(df_im[['vars', 'importances']], on=['vars'], how='left')

## Feature selection: keep features with IV > 0.1 (strong predictive power)
## and importance above a simple 0.015 threshold.

keep_mask = (df_iv_im['iv'] > 0.1) & (df_iv_im['importances'] > 0.015)
df_iv_im = df_iv_im[keep_mask]


繼續閱讀