微網誌資料爬取

前言

這次我們來擷取一下微網誌，包括時間、評論數、點贊數和内容。本次以“四川農業大學的”為關鍵詞擷取到相關資料。來看一看效果吧。

編寫代碼

1.分析網頁

還是老套路，右鍵點選檢查。發現我們需要的資料原來的網頁裡就有。這就很簡單了。

唯一需要注意的是微網誌你不登入你就只能看到第一頁的搜尋結果，登入後也只能看到前50頁的搜尋結果。所以我們的請求頭裡必須帶上cookie。還有一點，微網誌内容過長時，它會折疊起來。展開的内容標籤就在折疊内容標籤的下方。如果是沒有展開全文的内容的話，p標籤下就只會有一個a標籤。

然後我們來看看請求的參數。這個請求的表單内q是搜尋的内容，timescope是搜尋的時間範圍，page是頁數。因為我這次就只是擷取以“四川農業大學的”為關鍵字，2021年1月1日至2021年5月5日内的微網誌，所需要更改的參數只有page，所以我沒有採用表單的形式請求，而是直接在url上進行拼接。如果不會使用表單的話可以看我上一篇文章。8684網站航班資料擷取那篇文章就採用了表單請求的方式。

2.編寫代碼

首先是擷取資料的函數。

def get_html(url):
    """Download one Weibo search-result page and return its HTML.

    Args:
        url: Complete search URL, including the query string and page number.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise None.

    Raises:
        requests.exceptions.RequestException: if the connection fails or the
            server does not answer within the timeout.
    """
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        # Placeholder value: replace it with the cookie string of a real session.
        'Cookie': '你的cookie',
        'Host': 's.weibo.com',
        'Pragma': 'no-cache',
        'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="90", "Google Chrome";v="90"',
        'sec-ch-ua-mobile': '?0',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36'
    }
    # Without a timeout requests waits indefinitely, so a single stalled
    # connection would block the whole crawl.
    res = requests.get(url, headers=headers, timeout=10)
    if res.status_code == 200:
        print('擷取成功')
        return res.text
    print('失敗')
    # Explicit so callers can see that a failed download yields None.
    return None

然後是解析資料的函數。

def jianxi(res):
    """Parse one search-result page into a list of post records.

    Args:
        res: HTML text of a result page, or None/'' when nothing was
            downloaded.

    Returns:
        A list of dicts with the keys '時間', '評論數', '點贊數' and '内容'
        (all values are strings).  The list is empty when ``res`` is empty or
        contains no parsable post card.
    """
    if not res:
        return []
    data = []
    # NOTE(review): the published pattern was just '(.*?)', which can only
    # match empty strings.  Its delimiters were presumably HTML comments that
    # were stripped when the article was rendered; they are assumed here to
    # be the card-wrap markers -- confirm against the live page markup.
    cards = re.findall('<!--card-wrap-->(.*?)<!--/card-wrap-->', res, re.S)
    for card in cards:
        if not card.strip():
            continue
        xp = etree.HTML(card)
        if xp is None:
            continue
        # Prefer the expanded text; fall back to the short text node.
        n = xp.xpath('//p[@class="txt" and @node-type="feed_list_content_full"]//text()')
        if not n:
            n = xp.xpath('//p[@class="txt" and @node-type="feed_list_content"]//text()')
        t = xp.xpath('//div[@class="content"]/p[@class="from"]/a[1]/text()')
        p = xp.xpath('//div[@class="card-act"]//li[3]/a/text()')
        if not t or not p:
            # Indexing [0] on an empty result used to raise IndexError; a card
            # without a timestamp or comment link is skipped instead.
            continue
        d = xp.xpath('//div[@class="card-act"]//li[4]//em/text()')
        d = d[0] if d else '0'
        # Keep only the digits of the comment label; no digits means zero.
        p = ''.join(re.findall(r'\d+', p[0])) or '0'
        t = ''.join(t[0].split())
        # Normalise whitespace inside each text node and concatenate the
        # nodes.  This replaces formatting str(list) and deleting brackets
        # and quotes, which also removed those characters from the post text.
        n = ''.join(' '.join(piece.split()) for piece in n)
        # Drop non-printable characters (e.g. zero-width spaces) directly
        # instead of deleting fragments of their escaped repr.
        n = ''.join(ch for ch in n if ch.isprintable())
        n = re.sub(r'收起全文d', '', n)
        data.append({'時間': t, '評論數': p, '點贊數': d, '内容': n})
    return data

最後是寫入表格的函數。

def write_data(datas):
    """Write all collected records to a new first sheet of the workbook.

    Args:
        datas: List of dicts; the first value of each dict is a list of
            records keyed by the column titles used below.

    The workbook file is opened with load_workbook, so it must already exist.
    """
    path = '四川農業大學相關微網誌.xlsx'
    book = load_workbook(path)
    sheet = book.create_sheet('四川農業大學相關微網誌', 0)
    columns = {
        'A': '時間',
        'B': '評論數',
        'C': '點贊數',
        'D': '内容',
    }
    # Row 1 holds the column titles.
    for letter, title in columns.items():
        sheet[letter + '1'] = title
    # Records from every page are written one after another from row 2 on.
    row = 2
    for page in datas:
        records = list(page.values())[0]
        for idx in range(len(records)):
            for letter, field in columns.items():
                sheet[letter + str(row)] = records[idx][field]
            row += 1
    book.save(path)

3.總的代碼

#coding:utf-8
from openpyxl import Workbook
from openpyxl import load_workbook
from lxml import etree
import requests
import time,re

def get_html(url):
    """Download one Weibo search-result page and return its HTML.

    Args:
        url: Complete search URL, including the query string and page number.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise None.

    Raises:
        requests.exceptions.RequestException: if the connection fails or the
            server does not answer within the timeout.
    """
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        # Placeholder value: replace it with the cookie string of a real session.
        'Cookie': '你的cookie',
        'Host': 's.weibo.com',
        'Pragma': 'no-cache',
        'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="90", "Google Chrome";v="90"',
        'sec-ch-ua-mobile': '?0',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36'
    }
    # Without a timeout requests waits indefinitely, so a single stalled
    # connection would block the whole crawl.
    res = requests.get(url, headers=headers, timeout=10)
    if res.status_code == 200:
        print('擷取成功')
        return res.text
    print('失敗')
    # Explicit so callers can see that a failed download yields None.
    return None

def jianxi(res):
    """Parse one search-result page into a list of post records.

    Args:
        res: HTML text of a result page, or None/'' when nothing was
            downloaded.

    Returns:
        A list of dicts with the keys '時間', '評論數', '點贊數' and '内容'
        (all values are strings).  The list is empty when ``res`` is empty or
        contains no parsable post card.
    """
    if not res:
        return []
    data = []
    # NOTE(review): the published pattern was just '(.*?)', which can only
    # match empty strings.  Its delimiters were presumably HTML comments that
    # were stripped when the article was rendered; they are assumed here to
    # be the card-wrap markers -- confirm against the live page markup.
    cards = re.findall('<!--card-wrap-->(.*?)<!--/card-wrap-->', res, re.S)
    for card in cards:
        if not card.strip():
            continue
        xp = etree.HTML(card)
        if xp is None:
            continue
        # Prefer the expanded text; fall back to the short text node.
        n = xp.xpath('//p[@class="txt" and @node-type="feed_list_content_full"]//text()')
        if not n:
            n = xp.xpath('//p[@class="txt" and @node-type="feed_list_content"]//text()')
        t = xp.xpath('//div[@class="content"]/p[@class="from"]/a[1]/text()')
        p = xp.xpath('//div[@class="card-act"]//li[3]/a/text()')
        if not t or not p:
            # Indexing [0] on an empty result used to raise IndexError; a card
            # without a timestamp or comment link is skipped instead.
            continue
        d = xp.xpath('//div[@class="card-act"]//li[4]//em/text()')
        d = d[0] if d else '0'
        # Keep only the digits of the comment label; no digits means zero.
        p = ''.join(re.findall(r'\d+', p[0])) or '0'
        t = ''.join(t[0].split())
        # Normalise whitespace inside each text node and concatenate the
        # nodes.  This replaces formatting str(list) and deleting brackets
        # and quotes, which also removed those characters from the post text.
        n = ''.join(' '.join(piece.split()) for piece in n)
        # Drop non-printable characters (e.g. zero-width spaces) directly
        # instead of deleting fragments of their escaped repr.
        n = ''.join(ch for ch in n if ch.isprintable())
        n = re.sub(r'收起全文d', '', n)
        data.append({'時間': t, '評論數': p, '點贊數': d, '内容': n})
    return data

def write_data(datas):
    """Write all collected records to a new first sheet of the workbook.

    Args:
        datas: List of dicts; the first value of each dict is a list of
            records keyed by the column titles used below.

    The workbook file is opened with load_workbook, so it must already exist.
    """
    path = '四川農業大學相關微網誌.xlsx'
    book = load_workbook(path)
    sheet = book.create_sheet('四川農業大學相關微網誌', 0)
    columns = {
        'A': '時間',
        'B': '評論數',
        'C': '點贊數',
        'D': '内容',
    }
    # Row 1 holds the column titles.
    for letter, title in columns.items():
        sheet[letter + '1'] = title
    # Records from every page are written one after another from row 2 on.
    row = 2
    for page in datas:
        records = list(page.values())[0]
        for idx in range(len(records)):
            for letter, field in columns.items():
                sheet[letter + str(row)] = records[idx][field]
            row += 1
    book.save(path)

if __name__ == '__main__':
    # Create an empty workbook on disk first: write_data opens the file with
    # load_workbook, which needs it to exist.
    wb = Workbook()
    wb.save('四川農業大學相關微網誌.xlsx')
    # The published URL read "suball=1×cope=...": the "&times" prefix of
    # "&timescope" had been rendered as the "×" entity, which dropped the
    # date-range parameter.  It is restored here.
    # NOTE(review): the path segment is kept exactly as published; the search
    # keyword is presumably taken from the q parameter -- confirm.
    base_url = 'https://s.weibo.com/weibo/%25E8%25B5%25B5%25E7%259D%25BF%25E5%258F%2597%25E4%25BC%25A4?q=%E5%9B%9B%E5%B7%9D%E5%86%9C%E4%B8%9A%E5%A4%A7%E5%AD%A6%E7%9A%84&typeall=1&suball=1&timescope=custom:2021-01-01:2021-05-05&Refer=g&page='
    datas = []
    for i in range(1, 51):
        url = base_url + str(i)
        res = get_html(url)
        if res is not None:
            # Only parse pages that were actually downloaded; a failed request
            # yields None, which the parser cannot handle.
            data = jianxi(res)
            print(i, data)
            datas.append({str(i): data})
        # Pause between requests regardless of the outcome.
        time.sleep(0.5)
    write_data(datas)

微網誌資料爬取

目錄

前言

編寫代碼

1.分析網頁

2.編寫代碼

3.總的代碼

總結

繼續閱讀

TestLink導出用例轉換工具(XML2Excel)

YAML簡介和PyYAML安全操作YAML支援的類型YAML的優點：yaml的基本文法python操作

Small tricks

403 Forbidden，You don't have permission to access / on this server.Forbidden

libsvm for python 安裝

學習軟體測試基礎測試第七天

Zeppelin 配置通路 REST APIApache Zeppelin Configuration REST API

【Torch】最簡潔logging使用指南

27. Remove Element(清單)題目代碼

sort()函數到底是怎樣進行數字排序的

Cloud Studio初體驗

使用 ctypes 進行 Python 和 C 的混合程式設計

【python】【資料處理】畫多元資料分布圖

【python】netconf協定對接管理裝置

「Python 網絡自動化」NETCONF —— Python 使用 NETCONF 管理配置 H3C 網絡裝置

在python中建立excel并寫入