python爬蟲相關
- 1、爬取圖檔至本地,目前僅能實作將百度帖裡面的圖檔下載至本地,百度圖檔裡面的圖檔還不能實作,等完善
- 2、抓取糗百的段子,檢視作者、段子内容,點贊個數
- 3、更加人性化的檢視方式,按下回車顯示一個段子,如果按[q|Q]就直接退出
代碼
- 1、用到的模組:urllib、urllib2、re
- 2、url比對用到了re模組
- 3、檔案下載使用了urllib.urlretrieve()來將分析出來的圖檔下載至本地
1、百度貼吧圖檔
#!/usr/bin/env python
#-*- coding:utf8 -*-
import urllib, urllib2
import re
def getHtml(url):
    """Fetch *url* and return the raw HTML body as a byte string.

    Network errors propagate to the caller unchanged.
    """
    page = urllib2.urlopen(url)
    try:
        return page.read()
    finally:
        # BUG FIX: the original never closed the response, leaking the
        # underlying socket on every call.
        page.close()
def getImage(html):
'''需要注意這裡的.*?表示的是非貪婪比對,如果遇到第一個>,那麼就停止比對'''
re_img = re.compile(r'<img class="BDE_Image" src="(.*?)".*?>')
img_list = re_img.findall( html )
i = 1
for imgurl in img_list:
print imgurl
'''使用urllib.urlretrieve()來将分析出來的圖檔下載下傳至本地
'''
urllib.urlretrieve(imgurl, filename='%s.jpg' % i)
i += 1
if __name__ == '__main__':
    # Earlier test threads, kept for reference:
    #url = 'http://tieba.baidu.com/p/3999261766'
    #url = 'http://tieba.baidu.com/p/4957363500'
    url = 'http://tieba.baidu.com/p/2263349749'
    # Download the thread's HTML, then pull every post image out of it.
    html = getHtml(url)
    getImage(html)
2、一次性擷取指定頁面的糗百段子
#!/usr/bin/env python
#-*- coding:utf8 -*-
'''
# 爬取糗百段子
1. 攫取段子
2. 過濾帶有圖檔的段子
3. 實作每按一次回車顯示一個段子的釋出人,段子内容,點贊個數
'''
import urllib, urllib2
import re
page = 2
url = 'http://www.qiushibaike.com/hot/page/' + str(page)
# print( url )
# User-Agent: 封裝
headers = {'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:45.0) Gecko/20100101 Firefox/45.0"}
try:
req = urllib2.Request(url, headers=headers)
rsp = urllib2.urlopen(req)
html = rsp.read()
except urllib2.URLError, e:
if hasattr(e, 'code'):
print e.code
if hasattr(e, 'reason'):
print e.reason
re_page = re.compile(r'<div class="author.*?>.*?<a.*?<img src=".*?" alt="(.*?)"/>.*?<a.*?<div class="content">.*?<span>(.*?)</span>.*?<div class="stats">.*?<i class="number">(\d+)</i>', re.S)
items = re_page.findall( html )
# print(items)
for item in items:
for i in item:
print(i)
3、實作每按一次回車顯示一個段子的釋出人,段子内容,點贊個數
#!/usr/bin/env python
#-*- coding:utf8 -*-
'''
# 爬取糗百段子
1. 攫取段子
2. 實作每按一次回車隻顯示一個段子
'''
import urllib, urllib2
import re
import sys
def getPage(page_num):
    """Fetch one page of the Qiushibaike 'hot' listing.

    page_num -- 1-based page number appended to the listing URL.
    Returns the page HTML as a byte string, or None when the request fails
    (the error is printed rather than raised).
    """
    url = 'http://www.qiushibaike.com/hot/page/' + str(page_num)
    # Spoof a desktop browser; the site blocks urllib2's default agent string.
    headers = {'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:45.0) Gecko/20100101 Firefox/45.0"}
    try:
        req = urllib2.Request(url, headers=headers)
        rsp = urllib2.urlopen(req)
        try:
            return rsp.read()  # return the page source
        finally:
            # BUG FIX: the original never closed the response socket.
            rsp.close()
    except urllib2.URLError as e:
        # BUG FIX: the original returned right after printing e.code, so the
        # e.reason diagnostic below was unreachable for HTTP errors.
        if hasattr(e, 'code'):
            print('連接配接伺服器失敗, 錯誤代碼: %s' % e.code)
        if hasattr(e, 'reason'):
            print('連接配接伺服器失敗,錯誤原因: %s' % e.reason)
        return None
def getPageContent(page_num=1):
    """Scrape one listing page into structured records.

    Returns a list of (page_num, author, joke_text, vote_count) tuples,
    each string stripped of surrounding whitespace.  Returns an empty list
    when the page could not be fetched.
    """
    html = getPage(page_num)
    # BUG FIX: getPage() returns None on failure; the original passed that
    # straight to findall(), which raises TypeError.
    if html is None:
        return []
    # Capture groups: author name in <h2>, joke body in <span>, vote count
    # in <i class="number">; re.S makes '.' match newlines.
    re_page = re.compile(r'<div class="author.*?>.*?<a.*?<img src=".*?"/>.*?</a>.*?<a.*?>.*?<h2>(.*?)</h2>.*?</a>.*?<a.*?<div class="content">.*?<span>(.*?)</span>.*?<div class="stats">.*?<i class="number">(\d+)</i>', re.S)
    items = re_page.findall(html)
    page_contents = []
    for item in items:
        page_contents.append((page_num, item[0].strip(), item[1].strip(), item[2].strip()))
    return page_contents
def getOneStory(page_contents):
    """Interactively replay scraped jokes, one per Enter keypress.

    page_contents -- sequence of (page, author, text, votes) tuples.
    Typing 'q' or 'Q' (then Enter) exits the whole program via sys.exit().
    """
    for story in page_contents:
        # BUG FIX: the original bound this to 'input', shadowing the builtin.
        keypress = raw_input()
        if keypress.lower() == 'q':
            sys.exit()
        print('第%s頁\t釋出人:%s\t贊: %s\n%s' % (story[0], story[1], story[3], story[2]))
if '__main__' == __name__:
    print("Loading web content from web site ...\n Press [q|Q] to exit, and press 'Enter' see next content: \n")
    # Walk the hot-list pages forever: fetch one page, replay it one joke
    # per keypress, then move on to the next page.
    page_no = 1
    while True:
        getOneStory(getPageContent(page_no))
        page_no += 1
未完,待續……
Yesterday is history.
Tomorrow is a mystery.
But today is a gift.
That is why it's called the present.
The old game: give a wolf a taste, then keep him hungry.