天天看点

Python 2.7 爬取微博热搜

闲来无聊,搞搞热搜,先拿微博练手

将爬取下来的数据发到自己的邮箱

相关的 module(requests、beautifulsoup4、lxml)通过 pip install 安装即可

效果展示

什么都是扯淡,直接贴代码

#!/usr/bin/python
# -*- coding: utf-8 -*-

import requests
from bs4 import BeautifulSoup
import smtplib
from email.mime.text import MIMEText
from email.header import Header
import traceback
import time
import sys
# Python 2 only: reload(sys) makes sys.setdefaultencoding callable again so
# that implicit str <-> unicode conversions use UTF-8 instead of ASCII.
# NOTE(review): neither reload() as a builtin nor setdefaultencoding exists on
# Python 3 - these two lines must be dropped if the script is ever ported.
reload(sys)
sys.setdefaultencoding('utf-8')

# Base URL of the Weibo search site. It is used to build the hot-search request
# URL and is prefixed to the href of every scraped entry to form its link.
weibo_url = "http://s.weibo.com"


class HotSearchInfo:
    """Plain record describing one entry of the hot-search table.

    Attributes (stored exactly as passed to the constructor):
        isForceTop: pinned-to-top marker.
        index: rank of the entry.
        title: display title.
        url: link to the entry.
        num: popularity figure.
        flag: label shown next to the entry.
    """

    def __init__(self, isForceTop, index, title, url, num, flag):
        # No validation or conversion is performed; values are kept verbatim.
        self.isForceTop, self.index, self.title = isForceTop, index, title
        self.url, self.num, self.flag = url, num, flag

    def __str__(self):
        """Return a one-line, human-readable summary of all six fields."""
        template = u'置顶: %s, 排名: %s, 标题: %s, 链接: %s, 热度: %s, 标识: %s'
        fields = (
            self.isForceTop,
            self.index,
            self.title,
            self.url,
            self.num,
            self.flag,
        )
        return template % fields


# Fetch the hot-search page
def get_html():
    """Download the Weibo real-time hot-search page.

    Returns:
        The response body as text when the server answers with HTTP 200.
        An empty string when the request fails or any other status code is
        returned; the reason is printed so the failure is not silent.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    params = {
        'cate': 'realtimehot'
    }
    try:
        # requests has no default timeout; without one a stalled connection
        # would block the script indefinitely.
        r = requests.get('%s/top/summary?' % weibo_url, params=params,
                         headers=headers, timeout=10)
    except requests.RequestException as e:
        # Only network/HTTP-layer failures are handled here, so programming
        # errors and Ctrl-C are no longer swallowed by a bare except.
        print("error: %s" % e)
        return ""
    if r.status_code != 200:
        print("error: unexpected HTTP status %s" % r.status_code)
        return ""
    return r.text


# Convert a scraped table cell to an integer without evaluating it as code
def _parse_int(text, default=0):
    """Return the last whitespace-separated integer token in *text*.

    Args:
        text: cell text scraped from the page (may be None or empty).
        default: value returned when no token parses as an integer.
    """
    if not text:
        return default
    for token in reversed(text.split()):
        try:
            return int(token)
        except ValueError:
            continue
    return default


# Parse the hot-search page into a list of hot-search entries
def parse_html(html):
    """Extract the hot-search entries from the page markup.

    Args:
        html: markup of the hot-search page; may be empty.

    Returns:
        A list of HotSearchInfo objects, one per table row that has both a
        "td-01" cell and a link inside its "td-02" cell. The list is empty
        when *html* is empty or the expected container/tbody is not found.
    """
    result = []
    if not html:
        return result
    soup = BeautifulSoup(html, 'lxml')
    # The div that wraps the hot-search table
    table = soup.find("div", attrs={"id": "pl_top_realtimehot"})
    # The tbody that holds the actual rows
    tbody = table.find("tbody") if table is not None else None
    if tbody is None:
        return result
    # Walk every row and collect its fields
    for tr in tbody.find_all('tr'):
        td01 = tr.find("td", attrs={"class": "td-01"})
        td02 = tr.find("td", attrs={"class": "td-02"})
        td03 = tr.find("td", attrs={"class": "td-03"})
        link = td02.find("a") if td02 is not None else None
        if td01 is None or link is None:
            # Not a data row (e.g. a header or spacer row); skip instead of
            # failing with AttributeError.
            continue
        # Pinned-to-top marker: 1 when the rank cell carries an "icon-top" icon
        isForceTop = 1 if td01.find("i", attrs={"class": "icon-top"}) is not None else 0
        # Rank. The text comes from a remote page, so it is parsed with int()
        # rather than eval(): eval would execute arbitrary expressions and
        # raises on any non-numeric cell.
        index = _parse_int(td01.text)
        # Title
        title = link.text
        # Link, made absolute with the site base URL
        href = link.get('href')
        url = "%s%s" % (weibo_url, href) if href else ""
        # Popularity figure
        num_tag = td02.find("span")
        num = _parse_int(num_tag.text) if num_tag is not None else 0
        # Label shown in the third cell; falls back to the "none" marker
        flag_tag = td03.find("i") if td03 is not None else None
        flag = flag_tag.text if flag_tag is not None else u"无"
        result.append(HotSearchInfo(isForceTop, index, title, url, num, flag))
    return result


# Output
def output(tr):
    """Run the parse -> render -> mail pipeline on the page markup in *tr*.

    The markup is parsed into entries, the entries are rendered as an HTML
    table, and that table is handed to send_mail. Nothing is returned.
    """
    send_mail(build_content(parse_html(tr)))


def build_content(list):
    """Render hot-search entries as an HTML table for the e-mail body.

    Args:
        list: iterable of objects exposing ``index``, ``url``, ``title``,
            ``num`` and ``flag`` attributes, or None. (The parameter name
            shadows the builtin; it is kept for caller compatibility.)

    Returns:
        A unicode HTML document with one table row per entry, or an empty
        string when *list* is None.
    """
    # Local import keeps the block self-contained; xml.sax.saxutils is
    # available on both Python 2 and Python 3.
    from xml.sax.saxutils import escape

    if list is None:
        return u""

    def esc(value):
        # u"%s" % value coerces ints / byte strings to text first. The values
        # are scraped from a remote page, so &, <, > and " are escaped before
        # being placed into markup and attribute values.
        return escape(u"%s" % value, {u'"': u"&quot;"})

    # The title is wrapped in the anchor so it is actually clickable.
    row = u"<tr><td>%s</td><td><a href=\"%s\">%s</a><span>%s</span></td><td>%s</td></tr>"
    rows = [
        row % (esc(info.index), esc(info.url), esc(info.title),
               esc(info.num), esc(info.flag))
        for info in list
    ]
    # Header cells live in <th> inside a single <tr>; every tag is closed.
    table = (u"<html><table><thead><tr><th>序号</th><th>关键词</th><th>热度标识</th></tr></thead>"
             u"<tbody>%s</tbody></table></html>")
    return table % u"".join(rows)


def send_mail(content):
    """Send *content* as an HTML e-mail through smtp.163.com over SSL.

    Args:
        content: HTML body of the message.

    Delivery problems are reported on stdout together with the traceback;
    they are not raised to the caller. The sender, authorization code and
    recipients are hard-coded placeholders inside this function.
    """
    sender = '[email protected]'
    pwd = "xxx"
    receivers = ['[email protected]']

    message = MIMEText(content, 'html', 'utf-8')
    message['Subject'] = Header('微博热搜榜单-%s' % time.strftime("%Y-%m-%d"), 'utf-8')
    message['From'] = sender
    message['To'] = ",".join(receivers)

    # Bound before the try so the finally clause can never hit an unbound name.
    smtp_obj = None
    try:
        # Passing the host to the constructor opens the connection right away
        # (default SMTP-over-SSL port).
        smtp_obj = smtplib.SMTP_SSL("smtp.163.com")
        # The password here is the authorization code configured in the
        # mailbox settings, not the mailbox login password.
        smtp_obj.login(sender, pwd)
        smtp_obj.sendmail(sender, receivers, message.as_string())
        print("邮件发送成功")
    except (smtplib.SMTPException, IOError):
        # IOError also covers socket/SSL failures (connection refused, DNS,
        # handshake errors), which are not SMTPException on Python 2.
        print("Error: 无法发送邮件, %s" % traceback.format_exc())
    finally:
        if smtp_obj is not None:
            smtp_obj.close()


def main():
    """Fetch the hot-search page and, if anything came back, process it."""
    html = get_html()
    # get_html() signals failure with an empty string, never None, so test
    # for emptiness; otherwise a failed download would be passed on to the
    # parser.
    if not html:
        print("get none")
        return
    output(html)


# Run only when executed as a script, so importing this module does not fetch
# the page and send a mail as a side effect.
if __name__ == "__main__":
    main()
           

千万不要使用阿里云的邮箱,发不出去