波士頓房價預測(二)
在(一)的基礎上進行了資料的異常處理
庫:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
讀取資料:
# Load the training and test sets from CSV files in the working directory.
train_data = pd.read_csv('train_dataset.csv')
test_data = pd.read_csv('test_dataset.csv')

# Detach the target column so that train_data keeps only the other columns.
train_price = train_data.pop('PRICE')

# Labels of the columns that remain in train_data.
col_list = list(train_data.columns)
提取測試ID,為送出檔案做準備:
# Take the ID column out of the test frame and keep it separately.
test_id = test_data.pop('ID')

# Identifier strings: every raw ID prefixed with 'id_'.
L = [f'id_{raw_id!s}' for raw_id in test_id]
異常值處理的核心程式碼:
通過箱線圖判斷異常值:以下門檻值(Q1 − 1.5 × box_scale × IQR)和上門檻值(Q3 + 1.5 × box_scale × IQR)作為分界,把小於下門檻值的資料替換為第一四分位數 Q1,把大於上門檻值的資料替換為第三四分位數 Q3
def box_plot_outliers(data_ser, box_scale=3):
    """Replace box-plot outliers in a series with the nearest quartile.

    The fences sit at ``Q1 - 1.5 * box_scale * IQR`` and
    ``Q3 + 1.5 * box_scale * IQR`` (the IQR is scaled by ``box_scale``
    first and by a fixed 1.5 afterwards).  A value above the upper fence
    is replaced by Q3, a value below the lower fence is replaced by Q1,
    and every other value is kept as it is.  Nothing is dropped, so the
    result has one entry per input element.

    The two quartiles are printed to stdout as ``Q3 Q1``.

    :param data_ser: pandas.Series holding the values to clean.
    :param box_scale: multiplier applied to the IQR; defaults to 3.
    :return: list of the cleaned values, in the order of ``data_ser``.
    """
    # Compute each quartile once and reuse it for the fences and as the
    # replacement values.
    new_up = data_ser.quantile(0.75)
    new_low = data_ser.quantile(0.25)
    print(new_up, new_low)

    iqr = box_scale * (new_up - new_low)
    # Lower fence
    val_low = new_low - iqr * 1.5
    # Upper fence
    val_up = new_up + iqr * 1.5

    ans = []
    for x in data_ser:
        # The two checks run in sequence on purpose: the second one sees
        # the value produced by the first.
        if x > val_up:
            x = new_up
        if x < val_low:
            x = new_low
        ans.append(x)
    return ans
同時處理訓練資料和測試資料的異常值:
# Run the outlier replacement over every column of the training frame and of
# the test frame; each column is cleaned using its own quartiles.
# NOTE(review): a column whose first and third quartiles coincide has both
# fences at that value, so every differing entry gets overwritten -- worth
# checking for binary/indicator columns.
all_data = [train_data, test_data]
for frame in all_data:
    for column in frame.columns:
        frame[column] = box_plot_outliers(frame[column], 1.5)
導庫,建立模型,訓練模型,并求出線下mse:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Hold out part of the training data for an offline check; random_state is
# fixed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    train_data, train_price, random_state=1
)

# Fit an ordinary least-squares model on the training part.
linreg = LinearRegression()
model = linreg.fit(x_train, y_train)

# Predict on the held-out frame itself rather than on a bare array: the model
# was fitted on a DataFrame, and passing an array drops the column names and
# makes scikit-learn warn about missing feature names. The predicted values
# are the same either way.
y_hat = linreg.predict(x_test)

# Offline mean squared error on the held-out part.
mse = np.mean((y_hat - np.asarray(y_test)) ** 2)
print(mse)
預測測試資料,並生成送出檔案:
# Predict prices for the test features, passed to the model as a plain array.
test_price = linreg.predict(test_data.to_numpy())

# Round every prediction to one decimal place.
price = [round(value, 1) for value in test_price]

# Assemble the submission table (prefixed IDs plus rounded predictions) and
# write it out without the index column.
sub = pd.DataFrame({'ID': L, 'value': price})
sub.to_csv('answer_10.csv', index=False)
線上 MSE 為:13.34479