最近幫一位小夥伴做了一個針對太原鏈家二手房的資料採集與分析專案。先看效果圖,原始碼附在後面。
采集
# -*- coding: utf-8 -*-
import re
import csv
import time
import math
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup
# Browser-like request headers used for the warm-up request below.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36',
    'Referer': 'https://ty.lianjia.com/ershoufang/',
}

# One shared session for the whole crawl; the landing page is fetched once up
# front (presumably so any cookies it sets are reused by later requests).
session = requests.Session()
session.get('https://ty.lianjia.com/ershoufang/', headers=headers)

# Listing-page URL template: filled with (district slug, 1-based page number).
url = 'https://ty.lianjia.com/ershoufang/{}/pg{}/'

# District display name -> URL slug used in the listing URLs.
area_dic = {
    '杏花嶺區': 'xinghualingqu',
    '迎澤區': 'yingzequ',
    '萬柏林區': 'wanbolinqu',
    '小店區': 'xiaodianqu',
    '尖草坪區': 'jiancaopingqu',
    '晉源區': 'jinyuanqu',
    '陽曲縣': 'yangquxian',
    '婁煩縣': 'loufanxian',
    '古交市': 'gujiaoshi',
    '清徐縣': 'qingxuxian',
}
# Write the CSV header row.
def csv_head():
    """Create ``taiyuan.csv`` with its header row if it has no content yet.

    The file is opened in append mode so existing rows are never truncated.
    The header is written only when the file is missing or empty; otherwise a
    re-run would insert a second header line in the middle of the data.
    """
    import os

    ky = 'taiyuan'
    path = f'{ky}.csv'
    head = ['area', 'title', 'community', 'position', 'tax', 'total_price',
            'unit_price', 'hourseType', 'hourseSize', 'direction', 'fitment']

    already_has_content = os.path.exists(path) and os.path.getsize(path) > 0

    # newline='' stops the csv module from emitting a blank line between rows.
    with open(path, 'a+', newline='', encoding='utf-8-sig') as csv_file:
        if not already_has_content:
            csv.writer(csv_file).writerow(head)
# Persist one record to the local CSV file.
def save(info):
    """Append ``info`` as a single row to ``taiyuan.csv``.

    Args:
        info: An iterable of cell values for one row.
    """
    ky = 'taiyuan'
    # newline='' stops the csv module from emitting a blank line between rows;
    # the ``with`` block closes the handle even if writing the row fails.
    with open(f'{ky}.csv', 'a+', newline='', encoding='utf-8-sig') as csv_file:
        csv.writer(csv_file).writerow(info)
def re_match(re_pattern, string, errif=None):
    """Return the first ``re.findall`` hit for ``re_pattern``, stripped.

    Args:
        re_pattern: Regular expression; callers in this file use exactly one
            capture group, so the hit is that group's text.
        string: Text to search.
        errif: Value returned when the pattern does not match at all.

    Returns:
        The first match with surrounding whitespace removed, or ``errif``.
    """
    try:
        return re.findall(re_pattern, string)[0].strip()
    except IndexError:
        # findall returned an empty list: no match anywhere in the text.
        return errif
def _parse_listing(area_name, listing_html):
    """Build one CSV row from a listing's HTML, or return None if unparseable.

    The row has 11 values in the order: area, title, community, position,
    tax, total_price, unit_price, hourseType, hourseSize, direction, fitment.
    """
    total_price = re_match('class="totalPrice"><span>(.*?)</span>萬', listing_html)
    unit_price = re_match('data-price="(.*?)"', listing_html)
    # Tag line holding layout, size, orientation and decoration, '|'-separated.
    icon_text = re_match('class="houseIcon"></span>(.*?)</div>', listing_html)
    if total_price is None or unit_price is None or icon_text is None:
        return None

    icons = icon_text.split('|')
    if len(icons) < 4:
        return None

    try:
        return [
            area_name,
            # Listing title
            re_match('target="_blank">(.*?)</a><!--', listing_html),
            # Residential community name
            re_match('xiaoqu.*?target="_blank">(.*?)</a>', listing_html),
            # Location
            re_match('<a href.*?target="_blank">(.*?)</a>.*?class="address">', listing_html),
            # Tax-related tag (e.g. deed held for 5+ years); may be None
            re_match('class="taxfree">(.*?)</span>', listing_html),
            float(total_price),
            float(unit_price),
            icons[0].strip(),
            float(icons[1].replace('平米', '')),
            icons[2].strip(),
            icons[3].strip(),
        ]
    except ValueError:
        # A numeric field contained non-numeric text.
        return None


def collect():
    """Crawl every district in ``area_dic`` and append each listing to the CSV.

    For each district the summary page is fetched to read the listing count,
    then the result pages are walked (30 listings per page, capped at 3000
    listings) and every parseable listing is written through ``save``.
    Districts whose count cannot be read, and listings missing a required
    field, are skipped instead of aborting the whole crawl.
    """
    for key_, value_ in area_dic.items():
        # Number of listings in this district.
        start_url = 'https://ty.lianjia.com/ershoufang/{}/'.format(value_)
        html = session.get(start_url, headers=headers).text
        house_num = re_match('共找到<span> (.*?) </span>套.*二手房', html)
        try:
            house_count = int(house_num)
        except (TypeError, ValueError):
            print('{}: 無法解析房源數量,已略過'.format(key_))
            continue
        print('💚{}: 社群房源共計「{}」套'.format(key_, house_num))
        time.sleep(1)

        # Site limit: at most 100 pages (3000 listings) per district.
        total_page = math.ceil(min(3000, house_count) / 30.0)
        for i in tqdm(range(total_page), desc=key_):
            html = session.get(url.format(value_, i + 1), headers=headers).text
            soup = BeautifulSoup(html, 'lxml')
            for info in soup.find_all(class_="info clear"):
                row = _parse_listing(key_, str(info))
                if row is not None:
                    save(row)
if __name__ == '__main__':
    # Prepare the CSV header, then crawl all districts.
    csv_head()
    collect()
清洗分析
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
from pylab import mpl
import matplotlib.pyplot as plt
mpl.rcParams['font.sans-serif'] = ['FangSong'] # Set the default sans-serif font (assumes FangSong is installed so the Chinese chart text renders)
# Load the scraped data file; every function below reads this module-level frame.
df = pd.read_csv('taiyuan.csv')
def rinse():
    """Print an overview of ``df`` and drop implausibly large listings in place.

    Rows whose ``hourseSize`` exceeds 1000 are removed from the module-level
    frame, so every chart drawn afterwards sees the cleaned data.
    """
    # A bare df.head() / df.describe() only displays in a notebook; print the
    # results so the overview is visible when run as a script.
    print(df.head())
    print(df.describe())
    oversized = df[df['hourseSize'] > 1000].index
    df.drop(oversized, inplace=True)
    df.info()
def _explode(count, emphasized):
    """Return a pie ``explode`` list of length ``count`` offsetting the first ``emphasized`` wedges."""
    return [0.05 if i < emphasized else 0 for i in range(count)]


def _fold_small(counts, threshold):
    """Keep categories above ``threshold`` and sum all the others into '其它'."""
    # .copy() so the extra label is added to an independent Series.
    major = counts[counts > threshold].copy()
    # <= so a category with exactly ``threshold`` rows is not lost from both sides.
    major['其它'] = counts[counts <= threshold].sum()
    return major


def pie_chart():
    """Draw a 2x2 grid of pie charts for district, layout, orientation and decoration shares.

    The figure is saved as a JPEG and then shown. Explode offsets are sized
    from the data, so any number of categories can be drawn.
    """
    # Share of listings per district, largest first.
    area_house_count = df.groupby('area')['area'].count().sort_values(ascending=False)

    # Share per layout; layouts with 700 or fewer listings are grouped together.
    hourseType_count = df.groupby('hourseType')['hourseType'].count().sort_values(ascending=False)
    new_hourseType_count = _fold_small(hourseType_count, 700)

    # Share per orientation; orientations with 800 or fewer listings are grouped.
    direction_count = df.groupby('direction')['direction'].count()
    new_direction_count = _fold_small(direction_count, 800).sort_values(ascending=False)

    # Share per decoration type.
    fitment_count = df.groupby('fitment')['fitment'].count().sort_values(ascending=False)

    # (title, data, number of leading wedges to offset, start angle)
    panels = [
        ("不同行政區房源數量占比情況", area_house_count, 4, 90),
        ("不同戶型房源數量占比情況", new_hourseType_count, 1, 90),
        ("不同朝向房源數量占比情況", new_direction_count, 0, 90),
        ("不同裝修類型的占比情況", fitment_count, 1, 45),
    ]

    fig = plt.figure(figsize=(10, 8), dpi=80)
    for position, (title, counts, emphasized, angle) in enumerate(panels, start=1):
        ax = fig.add_subplot(2, 2, position)
        counts.plot.pie(ax=ax, shadow=True, autopct='%0.f%%',
                        explode=_explode(len(counts), emphasized),
                        labeldistance=1.1, startangle=angle)
        ax.set_title(title)

    fig.savefig('不同行政區、戶型、朝向、裝修類型占比餅狀圖.jpg')
    plt.show()
def price_contrast():
    """Plot mean total price and mean unit price per district as two bar charts.

    The figure is saved as a JPEG and then shown.
    """
    # Mean total price per district, highest first.
    mean_total = df.groupby('area')['total_price'].mean().sort_values(ascending=False)
    # Mean unit price per district, highest first.
    mean_unit = df.groupby('area')['unit_price'].mean().sort_values(ascending=False)

    fig = plt.figure(figsize=(15, 5), dpi=80)

    ax1 = fig.add_subplot(1, 2, 1)
    mean_total.plot.bar(ax=ax1, alpha=0.7, color='#1E90FF')
    ax1.set_title("太原不同地區總價對比")
    # The bars' height is the price, so the price label belongs on the y axis.
    ax1.set_ylabel('價格')
    ax1.set_ylim([100, 200])  # fixed y-axis range
    ax1.grid(alpha=0.5, color='#CD3700', linestyle='--', axis='y')

    ax2 = fig.add_subplot(1, 2, 2)
    mean_unit.plot.bar(ax=ax2, alpha=0.7, color='#4876FF')
    ax2.set_title("太原不同地區單價對比")
    ax2.set_ylim([5000, 20000])
    ax2.grid(alpha=0.5, color='#CD3700', linestyle='--', axis='y')

    fig.savefig('不同區單價總價對比圖.jpg')
    plt.show()
def district():
    """Plot the ten locations with the highest mean total price as horizontal bars.

    The figure is saved as a JPEG and then shown.
    """
    top_positions = (
        df.groupby('position')['total_price']
        .mean()
        .sort_values(ascending=False)
        .head(10)  # only the top ten locations are drawn
    )

    # Own figure: without it the chart would be drawn on whatever axes are
    # current, which may belong to a previously drawn chart.
    fig, ax = plt.subplots()
    top_positions.plot.barh(
        ax=ax, alpha=0.7,
        color=['#CD3700', '#9ACD32', '#7EC0EE', 'y', 'orange',
               '#4876FF', '#EEA9B8', '#EE7942', '#CD69C9', '#668B8B'])
    ax.set_title("太原房價排名前十的地段")
    ax.grid(color='#DDA0DD', linestyle='--', alpha=0.5)
    fig.savefig('太原房價排名前十地段圖.jpg')
    plt.show()
def xing_ying():
    """Compare mean total price by location inside 杏花嶺區 and 迎澤區, side by side.

    The figure is saved as a JPEG and then shown.
    """
    # Mean total price per location within 杏花嶺區, highest first.
    xinghualing_price = (
        df[df['area'] == '杏花嶺區'].groupby('position')['total_price']
        .mean().sort_values(ascending=False)
    )
    # Mean total price per location within 迎澤區, highest first.
    yingze_price = (
        df[df['area'] == '迎澤區'].groupby('position')['total_price']
        .mean().sort_values(ascending=False)
    )

    fig = plt.figure(figsize=(15, 4), dpi=80)

    ax1 = fig.add_subplot(1, 2, 1)
    xinghualing_price.plot.barh(
        ax=ax1, alpha=0.7,
        color=['#A4D3EE', 'y', '#4876FF', '#A4D3EE', '#A4D3EE', '#A4D3EE',
               '#A4D3EE', '#A4D3EE', '#A4D3EE', '#A4D3EE', '#A4D3EE', '#668B8B'])
    ax1.set_title("杏花嶺區的房價對比")

    ax2 = fig.add_subplot(1, 2, 2)
    yingze_price.plot.barh(
        ax=ax2, alpha=0.7,
        color=['#CD3700', 'orange', '#A4D3EE', '#A4D3EE', '#A4D3EE', '#A4D3EE',
               '#A4D3EE', '#A4D3EE', '#A4D3EE', '#A4D3EE', '#A4D3EE', '#A4D3EE'])
    ax2.set_title("迎澤區的房價對比")

    fig.savefig('杏花嶺區迎澤區房價對比圖.jpg')
    plt.show()
def price_10():
    """Plot the ten communities with the highest mean total price as horizontal bars.

    The figure is saved as a JPEG and then shown.
    """
    community_top10 = (
        df.groupby('community')['total_price']
        .mean()
        .sort_values(ascending=False)
        .head(10)
    )

    # Own figure so the chart never lands on a previously drawn chart's axes.
    fig, ax = plt.subplots()
    community_top10.plot.barh(ax=ax, alpha=0.7, width=0.7)
    # Labels are applied after plotting so the plot call cannot replace them.
    ax.set_xlabel('總價')
    ax.set_ylabel('名稱')
    ax.set_title("社群房總價排名前十的小區分析")
    fig.savefig('社群房均價總價排名前10圖.jpg')
    plt.show()
def orientation():
    """Plot mean unit price per orientation as a bar chart, highest first.

    Only the unit price is analysed. The figure is saved as a JPEG and shown.
    """
    direction_unit_price = (
        df.groupby('direction')['unit_price'].mean().sort_values(ascending=False)
    )

    fig, ax = plt.subplots(figsize=(15, 5), dpi=80)
    direction_unit_price.plot.bar(ax=ax, alpha=0.7)
    ax.set_title("房屋朝向對價格的影響")
    ax.grid(color='#DDA0DD', linestyle='--', alpha=0.5, axis='y')
    fig.savefig('房屋朝向對價格影響圖.jpg')
    plt.show()
def finish():
    """Plot mean unit price per decoration type as a bar chart, highest first.

    The figure is saved as a JPEG and then shown.
    """
    fit_price = df.groupby('fitment')['unit_price'].mean().sort_values(ascending=False)

    # Own figure so the chart never lands on a previously drawn chart's axes.
    fig, ax = plt.subplots()
    fit_price.plot.bar(ax=ax, color=['#FF7F50', '#00E00D', '#FFA500', '#7B68EE'])
    ax.set_title("不同裝修和單價的關系")
    # NOTE(review): the label says 萬元 per square metre; confirm this matches
    # the unit actually stored in the unit_price column.
    ax.set_ylabel("單價(萬元/每平方米)")
    fig.savefig('不同裝修和單價關系圖.jpg')
    plt.show()
def size():
    """Show the listing-size distribution and the size-versus-total-price scatter.

    Left panel: normalised histogram of ``hourseSize`` with a density curve.
    Right panel: scatter of ``hourseSize`` against ``total_price``.
    The figure is saved as a JPEG and then shown.
    """
    fig = plt.figure(figsize=(15, 5))

    ax1 = fig.add_subplot(1, 2, 1)
    # density=True plots frequencies rather than raw counts, so the histogram
    # shares a scale with the density curve drawn on the same axes.
    df['hourseSize'].hist(bins=20, ax=ax1, color='#F4A460', density=True)
    df['hourseSize'].plot(kind='kde', style='--', ax=ax1)
    ax1.set_title("房間大小分布密度分析")

    ax2 = fig.add_subplot(1, 2, 2)
    ax2.scatter(df['hourseSize'], df['total_price'], s=4)
    ax2.set_title("房間大小與房價關系分析")

    fig.savefig('房屋大小分布密度與房價關系圖.jpg')
    plt.show()
def area():
    """Plot the mean listing size per decoration type as a bar chart, largest first.

    The figure is saved as a JPEG and then shown.
    """
    fit_average_size = (
        df.groupby('fitment')['hourseSize'].mean().sort_values(ascending=False)
    )

    # Own figure so the chart never lands on a previously drawn chart's axes.
    fig, ax = plt.subplots()
    fit_average_size.plot.bar(ax=ax, color=['#FF7F50', '#00E00D', '#FFA500', '#7B68EE'])
    ax.set_ylabel('面積')
    ax.set_title("不同裝修房間的平均面積")
    fig.savefig('不同裝修房間的平均面積圖.jpg')
    plt.show()
def price_num():
    """Plot how many listings fall into each 50-wide total-price interval.

    Intervals run from 50 to 950; prices outside that range are not counted.
    The figure is saved as a JPEG and then shown.
    """
    bins_arr = np.arange(50, 1000, 50)
    bins = pd.cut(df['total_price'], bins_arr)
    # observed=False keeps empty price intervals in the chart and states the
    # choice explicitly for the categorical grouper.
    totalprice_counts = df['total_price'].groupby(bins, observed=False).count()

    # Own figure so the chart never lands on a previously drawn chart's axes.
    fig, ax = plt.subplots()
    totalprice_counts.plot.barh(ax=ax, alpha=0.7, width=0.7)
    # Horizontal bars: bar length (x) is the listing count, y is the price range.
    ax.set_xlabel("社群房數量")
    ax.set_ylabel('萬元')
    ax.set_title("太原不同總價區間内的社群房源數量分析")
    fig.savefig('太原不同總價區間社群房源數量圖.jpg')
    plt.show()
if __name__ == '__main__':
    # rinse() is left disabled, as in the original script; enable it to print
    # a data overview and drop listings larger than 1000 before charting.
    #rinse()
    pie_chart()
    price_contrast()
    district()
    xing_ying()
    price_10()
    orientation()
    finish()
    size()
    area()
    price_num()