
Scraping Lianjia second-hand housing data with Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os

import openpyxl
import pandas as pd
import requests
from bs4 import BeautifulSoup
from openpyxl.utils.dataframe import dataframe_to_rows

session = requests.Session()

# ========================= generate urls ==========================

def generate_home_url(city):  # build the ershoufang landing-page url for a city
    return 'http://' + city + '.lianjia.com/ershoufang/'

def generate_area_page_url(page_count, city, path):  # yield the paged urls pg1 .. pg(page_count - 1) for one district
    url = 'http://' + city + '.lianjia.com' + path + 'pg{}/'
    for page_index in range(1, page_count):
        yield url.format(page_index)

# ========================= fetch & parse ==========================

def update_session():
    # Mimic request headers captured from a real browser session; without them the
    # server answers 403 (a simple anti-scraping check on Lianjia's side). Note that
    # Host and Cookie below are tied to a captured Beijing (bj) session, so adjust
    # them if you scrape another city.
    headers = {
        'Host': 'bj.lianjia.com',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch, br',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cookie': 'TY_SESSION_ID=25a21767-af26-4543-b2b4-b92f7d6028b5; TY_SESSION_ID=f5cecba1-d783-4d40-b86d-72ee2accfccf; select_city=110000; lianjia_ssid=7ea6e0a0-dd03-48c2-9031-987bda2481c2; lianjia_uuid=435b41db-4268-4e59-9852-c4cd50e86646; sajssdk_2015_cross_new_user=1; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2216ff914d8d8522-08cf45a790e359-5e130c17-1024000-16ff914d8d9a86%22%2C%22%24device_id%22%3A%2216ff914d8d8522-08cf45a790e359-5e130c17-1024000-16ff914d8d9a86%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D'
    }
    session.headers.clear()
    session.headers.update(headers)

def get_all_area_path(area_url):  # parse the landing page and collect each district's url path
    update_session()
    res = session.get(area_url)

    urls = {}
    if res.status_code == 200:
        soup = BeautifulSoup(res.text, 'lxml')
        areas = soup.find_all('div', attrs={'data-role': 'ershoufang'})
        for item in areas:
            for a in item.find_all(name='a'):
                urls[a.text] = a.attrs['href']
    return urls

def get_all_page_urls(page_url):  # parse one listing page and collect the detail-page urls it links to
    update_session()
    res = session.get(page_url)

    urls = []
    if res.status_code == 200:
        soup = BeautifulSoup(res.text, 'html.parser')
        infos = soup.find_all('div', attrs={'class': 'info clear'})
        for div in infos:
            urls.append(div.a.attrs['href'])
    return urls

def get_page_by_url(page_url):  # parse one detail page and collect the fields we want
    print("get_page_by_url:" + page_url)
    update_session()
    try:
        res = session.get(page_url, timeout=(30, 120))
        if res.status_code == 200:
            info = {}
            soup = BeautifulSoup(res.text, 'lxml')
            info['标题'] = soup.select('.main')[0].text
            info['总价'] = soup.select('.total')[0].text + '万'
            info['每平方售价'] = soup.select('.unitPriceValue')[0].text
            info['参考总价'] = soup.select('.taxtext')[0].text
            info['建造时间'] = soup.select('.subInfo')[2].text
            info['小区名称'] = soup.select('.info')[0].text
            info['所在区域'] = soup.select('.info a')[0].text + ':' + soup.select('.info a')[1].text
            # the listing id is the last path segment, e.g. .../ershoufang/101106686239.html
            info['链家编号'] = page_url.rsplit('/', 1)[-1].split('.html')[0]

            # the "base" and "transaction" panels are both <li> lists whose
            # <span class="label"> holds the field name, so harvest them uniformly
            base_labels = ['房屋户型', '所在楼层', '建筑面积', '户型结构', '套内面积',
                           '建筑类型', '房屋朝向', '建筑结构', '装修情况', '梯户比例',
                           '供暖方式', '配备电梯', '产权年限']
            transaction_labels = ['挂牌时间', '交易权属', '上次交易', '房屋用途', '房屋年限',
                                  '产权所属', '抵押信息', '房本备件']
            for css_class, labels in (('base', base_labels), ('transaction', transaction_labels)):
                for panel in soup.find_all('div', attrs={'class': css_class}):
                    for li in panel.find_all(name='li'):
                        span = li.find('span', attrs={'class': 'label'})
                        if span.text in labels:
                            info[span.text] = li.text.replace(span.text, '').replace('\n', '').strip()
            return info
    except Exception as e:
        print(str(e))
    return None

def do_write_workbook(ws, data):
    if data is None:
        return
    frame = pd.DataFrame(data, index=['0'])
    for r in dataframe_to_rows(frame, index=False, header=True):
        if '标题' in str(r):
            continue  # skip the header row; create_sheet already wrote one
        ws.append(r)

def create_sheet(wb, sheet_name):
    # sample record from one real listing; only its keys matter -- they fix the
    # column order of the header row written below (the values are never written)
    data = {'标题': '西四环,珠江峰景精装两居,正对小区花园,看房方便', '总价': '590万', '每平方售价': '57399元/平米',
            '参考总价': '首付及贷款情况请咨询经纪人', '建造时间': '2007年建/板塔结合', '小区名称': '珠江峰景',
            '所在区域': '丰台:岳各庄', '链家编号': '101106686239', '房屋户型': '2室1厅1厨1卫',
            '所在楼层': '高楼层 (共11层)', '建筑面积': '102.79㎡', '户型结构': '平层',
            '套内面积': '84.24㎡', '建筑类型': '板塔结合', '房屋朝向': '南',
            '建筑结构': '钢混结构', '装修情况': '其他', '梯户比例': '一梯四户',
            '供暖方式': '集中供暖', '配备电梯': '有', '产权年限': '70年',
            '挂牌时间': '2020-01-04', '交易权属': '商品房', '上次交易': '2008-07-07', '房屋用途': '普通住宅',
            '房屋年限': '满五年', '产权所属': '非共有', '抵押信息': '暂无数据', '房本备件': '已上传房本照片'}
    frame = pd.DataFrame(data, index=['0'])
    ws = wb.create_sheet(sheet_name)

    for r in dataframe_to_rows(frame, index=False, header=True):
        ws.append(r)
        break  # dataframe_to_rows yields the header row first; stop after writing it
    return ws

def has_sheet(wb, key):
    # wb.get_sheet_names() is deprecated in openpyxl; sheetnames is the current api
    return key in wb.sheetnames

def fetch_all_area():
    city = 'bj'
    page_count = 15
    workbook_file = '链家二手房.xlsx'
    area_path_map = get_all_area_path(generate_home_url(city))
    # load_workbook fails if the file is missing, so create an empty workbook on the first run
    if not os.path.exists(workbook_file):
        openpyxl.Workbook().save(workbook_file)
    wb = openpyxl.load_workbook(workbook_file)

    for key, val in area_path_map.items():
        print('key:' + str(key) + ' val:' + str(val))
        if has_sheet(wb, key):  # a run can die halfway and be restarted, so skip districts that already have a sheet
            continue
        ws = create_sheet(wb, key)
        for area_url in generate_area_page_url(page_count, city, val):
            for page_url in get_all_page_urls(area_url):
                do_write_workbook(ws, get_page_by_url(page_url))
        wb.save(workbook_file)
        wb = openpyxl.load_workbook(workbook_file)

if __name__ == '__main__':
    fetch_all_area()
           

My draft failed to save and I don't feel like writing it all out again, so let's just go straight to the code.

This borrows from the code at https://blog.csdn.net/liujiayu2/article/details/86007384.

The data scraped here is organized a bit differently: listings are stored one sheet per district, whereas the original post just walked the flat page list. Open the Lianjia ershoufang page and you will see the district/subway filter bar below; the district links there give each district's path, which is joined into a full URL and then fetched page by page. The detail-page scrape also captures many extra fields, such as heating type and listing date.
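As a minimal sketch of the URL scheme this relies on (the /ershoufang/dongcheng/ path is a made-up example here; real paths come from get_all_area_path):

# hypothetical district path, just to show what the generator yields
for url in generate_area_page_url(4, 'bj', '/ershoufang/dongcheng/'):
    print(url)
# http://bj.lianjia.com/ershoufang/dongcheng/pg1/
# http://bj.lianjia.com/ershoufang/dongcheng/pg2/
# http://bj.lianjia.com/ershoufang/dongcheng/pg3/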
