天天看點

python爬蟲

python爬蟲相關

  • 1、爬取圖檔至本地,目前僅能實作將百度貼吧裡面的圖檔下載至本地,百度圖檔裡面的圖檔還不能實作,等完善
  • 2、抓取糗百的段子,檢視作者、段子内容,點贊個數
  • 3、更加人性化的檢視方式,按下回車顯示一個段子,如果按[q|Q]就直接退出

代碼

  • 1、用到的模組,urllib,urllib2,re
  • 2、url比對用到了re模組
  • 3、檔案下載使用了urllib.urlretrieve()來將分析出來的圖檔下載至本地

1、百度貼吧圖檔

#!/usr/bin/env python
#-*- coding:utf8 -*-

import urllib, urllib2
import re

def getHtml(url):
	"""Fetch *url* and return the raw page source as a string.

	The response object is closed after reading so the underlying
	HTTP connection is released (the original leaked it).
	"""
	page = urllib2.urlopen(url)
	try:
		return page.read()
	finally:
		page.close()

def getImage(html):
	"""Find every post image in *html* and download it to the current
	directory as 1.jpg, 2.jpg, ...

	Returns None; files are written as a side effect.
	"""
	# Non-greedy .*? stops at the first '>', so each <img ...> tag is
	# matched on its own instead of swallowing the rest of the page.
	re_img = re.compile(r'<img class="BDE_Image" src="(.*?)".*?>')
	img_list = re_img.findall(html)

	# enumerate(..., 1) replaces the original hand-maintained counter.
	for i, imgurl in enumerate(img_list, 1):
		print(imgurl)
		# urllib.urlretrieve() saves the image to the local disk.
		urllib.urlretrieve(imgurl, filename='%s.jpg' % i)

if __name__ == '__main__':
	# Other threads that were used while testing:
	#   http://tieba.baidu.com/p/3999261766
	#   http://tieba.baidu.com/p/4957363500
	url = 'http://tieba.baidu.com/p/2263349749'
	# Fetch the thread page, then download every post image from it.
	getImage(getHtml(url))

           

2、一次性擷取指定頁面的糗百段子

#!/usr/bin/env python
#-*- coding:utf8 -*-

'''
# 爬取糗百段子
1. 攫取段子
2. 過濾帶有圖檔的段子
3. 實作每按一次回車顯示一個段子的釋出人,段子内容,點贊個數
'''

import urllib, urllib2
import re

# Listing page of the "hot" section to scrape.
page = 2
url = 'http://www.qiushibaike.com/hot/page/' + str(page)
# Spoof a desktop browser User-Agent: the site rejects urllib2's default one.
headers = {'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:45.0) Gecko/20100101 Firefox/45.0"}

try:
	req = urllib2.Request(url, headers=headers)
	rsp = urllib2.urlopen(req)
	html = rsp.read()
except urllib2.URLError as e:
	# Report whatever detail the error carries, then stop: without a page
	# body there is nothing to parse (the original fell through and
	# crashed with a NameError on `html`).
	if hasattr(e, 'code'):
		print(e.code)
	if hasattr(e, 'reason'):
		print(e.reason)
	raise SystemExit(1)

# Capture groups: 1 = author name (from the avatar's alt text),
# 2 = joke text, 3 = vote count.  re.S lets .*? span newlines.
re_page = re.compile(r'<div class="author.*?>.*?<a.*?<img src=".*?" alt="(.*?)"/>.*?<a.*?<div class="content">.*?<span>(.*?)</span>.*?<div class="stats">.*?<i class="number">(\d+)</i>', re.S)
items = re_page.findall(html)

# Print each captured field of each joke on its own line.
for item in items:
	for field in item:
		print(field)
           

3、實作每按一次回車顯示一個段子的釋出人,段子内容,點贊個數

#!/usr/bin/env python
#-*- coding:utf8 -*-

'''
# 爬取糗百段子
1. 攫取段子
2. 實作每按一次回車隻顯示一個段子
'''

import urllib, urllib2
import re
import sys

def getPage(page_num):
   """Fetch one listing page of the 'hot' section.

   Returns the raw HTML source as a string, or None when the request
   fails (the failure is reported on stdout).
   """
   url = 'http://www.qiushibaike.com/hot/page/' + str(page_num)
   # Spoof a desktop browser User-Agent; the site blocks the default one.
   headers = {'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:45.0) Gecko/20100101 Firefox/45.0"}

   try:
      req = urllib2.Request(url, headers=headers)
      rsp = urllib2.urlopen(req)
      try:
         return rsp.read() # full page source
      finally:
         rsp.close() # always release the connection (original leaked it)
   except urllib2.URLError as e:
      # Print every detail available; the original returned after the
      # `code` branch, so `reason` was never shown when both existed.
      if hasattr(e, 'code'):
         print('連接配接伺服器失敗, 錯誤代碼: %s' % e.code)
      if hasattr(e, 'reason'):
         print('連接配接伺服器失敗,錯誤原因: %s' % e.reason)
      return None

def getPageContent(page_num=1, html=None):
   """Extract the jokes of one listing page.

   Parameters:
      page_num -- listing page number to fetch (default 1).
      html     -- optional pre-fetched page source; when given, no
                  network request is made (also makes offline testing
                  possible).

   Returns a list of 4-tuples (page_num, author, content, vote count)
   with each text field stripped of surrounding whitespace, or an empty
   list when the page could not be fetched (the original crashed with
   TypeError on findall(None) in that case).
   """
   if html is None:
      html = getPage(page_num)
   if html is None:
      return []

   # Groups: 1 = author (inside <h2>), 2 = joke text, 3 = vote count.
   # re.S lets .*? span newlines in the page source.
   re_page = re.compile(r'<div class="author.*?>.*?<a.*?<img src=".*?"/>.*?</a>.*?<a.*?>.*?<h2>(.*?)</h2>.*?</a>.*?<a.*?<div class="content">.*?<span>(.*?)</span>.*?<div class="stats">.*?<i class="number">(\d+)</i>', re.S)

   return [(page_num, author.strip(), content.strip(), votes.strip())
           for author, content, votes in re_page.findall(html)]

def getOneStory(page_contents):
   """Interactively show the stories in *page_contents* one at a time.

   Waits for a keypress before each story: plain Enter shows the next
   one, 'q' or 'Q' exits the program via sys.exit().
   """
   for story in page_contents:
      # `key` instead of the original `input`, which shadowed a builtin.
      key = raw_input()
      if key.lower() == 'q':
         sys.exit()
      # story = (page number, author, content, vote count)
      print('第%s頁\t釋出人:%s\t贊: %s\n%s' % (story[0], story[1], story[3], story[2]))

if '__main__' == __name__:
   print("Loading web content from web site ...\n Press [q|Q] to exit, and press 'Enter' see next content: \n")
   num = 1

   # Walk forward through the listing pages forever; getOneStory() exits
   # the whole process when the user presses q.
   while True:
      getOneStory(getPageContent(num))
      num += 1
           

未完,待續……

Yesterday is history.

Tomorrow is a mystery.

But today is a gift.

That is why it's called the present.

The old game: give a wolf a taste, then keep him hungry.