
Python Web Scraping Notes

Using the urllib library
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
1.urlopen
The basic module for sending HTTP requests
# Examples:
# 1. Print the page source
import urllib.request
response = urllib.request.urlopen('https://www.laosiji.com')			#returns an http.client.HTTPResponse object
html = response.read().decode('utf-8')
print(html)
# 2. Print the response status information
import urllib.request
response = urllib.request.urlopen('https://www.laosiji.com')
print(response.status)													#print the response status code
print(response.getheaders())											#print all response headers
# 3. Look up the site owner (whois)
import whois
request = whois.whois('www.laosiji.com')
print(request)
# 4. Identify the technologies the site is built with
import builtwith
request = builtwith.parse('https://www.laosiji.com')
print(request)
# 5. Set a custom User-Agent
import urllib.request
user_agent="wswp"
url='https://www.laosiji.com'
headers = {'User-agent':user_agent}
request = urllib.request.Request(url,headers=headers)
request_all = urllib.request.urlopen(request)
html = request_all.read().decode('utf-8')
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
2.Request
# Examples:
# 1. Sending form data and headers
from urllib import request, parse
url = 'http://httpbin.org/post'
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
    'Host': 'httpbin.org'
    }
dict = {
    'name': 'Germey'
    }
data = bytes(parse.urlencode(dict), encoding='utf8')					   #data must be encoded into a bytes object
req = request.Request(url=url, data=data, headers=headers, method='POST')  #url: request URL; data: request body; headers: request headers; method: HTTP method
response = request.urlopen(req)
print(response.read().decode('utf-8'))
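A complementary sketch (not from the original notes): the same parse.urlencode helper can also build a GET query string; http://httpbin.org/get is assumed here as a test endpoint that echoes the parameters back.
from urllib import request, parse
base_url = 'http://httpbin.org/get'							#assumed echo endpoint for testing
params = {'name': 'Germey', 'age': 22}
url = base_url + '?' + parse.urlencode(params)				#GET parameters go into the URL, not the request body
response = request.urlopen(url)
print(response.read().decode('utf-8'))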
# 2. Timeout handling
import socket
import urllib.request
import urllib.error
url='https://www.laosiji.com'
try:
    response = urllib.request.urlopen(url, timeout=0.1)
except (urllib.error.URLError, socket.timeout):			#an unreachable server or a slow response raises one of these
    print("timeout")
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
3. Exception handling
code: the HTTP status code
reason: the cause of the error
headers: the response headers
# Example
from urllib import request,error
try:
    response = request.urlopen('http://cuiqingcai.com/index.htm')
except error.HTTPError as e:
    print(e.reason, e.code, e.headers, sep='\n')
except error.URLError as e:
    print(e.reason)
else:
    print('Request Successfully')



===========================================================================================================
Using the requests library
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
1. GET requests
# 1. Simple page download
import requests
url = "https://www.laosiji.com"
request = requests.get(url)

print(request.headers)						#response headers
print(request.request.headers)				#the request headers that were sent
request.encoding = "ISO-8859-1"				#override the encoding used for .text
html = request.text							#page source as text
html_status = request.status_code			#response status code
html_encoding = request.encoding			#encoding currently in effect (guessed from the headers unless overridden above)
html_url = request.url 						#the full URL that was requested
# 2. Add request headers and URL parameters
import requests
url = "https://www.laosiji.com"
headers = {"User-Agent":"Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"}
kw = {'wd':'長城'}						    #params accepts a dict or string of query parameters, e.g. Baidu search uses wd=...
request = requests.get(url,params=kw,headers=headers)
html = request.text
download_content = request.content          #raw response body as bytes (useful for downloading files)
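Since request.content returns the raw bytes of the response, a minimal download sketch follows; the image URL and file name are placeholders assumed for illustration.
import requests
img_url = "https://www.laosiji.com/logo.png"		#placeholder URL, assumed for illustration
resp = requests.get(img_url)
with open("logo.png", "wb") as f:					#binary data must be written in "wb" mode
    f.write(resp.content)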
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
2. POST requests
# Example
import requests
url = "https://www.laosiji.com"
headers = {"User-Agent":"Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"}
data = {
    "type":"AUTO",
    "i":"i love python",
    "doctype":"json",
    "xmlVersion":"1.8",
    "keyfrom":"fanyi.web",
    "ue":"UTF-8",
    "action":"FY_BY_ENTER",
    "typoResult":"true"
    }
request = requests.post(url,data=data,headers=headers)
html = request.text
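When the server returns JSON (as the doctype=json field above suggests), response.json() parses it directly. A minimal sketch, assuming http://httpbin.org/post as an echo endpoint:
import requests
resp = requests.post("http://httpbin.org/post", data={"name": "Germey"})
result = resp.json()						#parse the JSON body into a dict
print(result["form"])						#httpbin echoes the submitted form data under the "form" key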
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
3. Proxies

# The proxies parameter
import requests
url = "https://www.laosiji.com"
proxies = {
  "http": "http://12.34.56.79:9527",
  "https": "http://12.34.56.79:9527",
}				# choose the proxy according to the URL scheme
request = requests.get(url,proxies = proxies)
html = request.text

# Authenticated (private) proxy
import requests
# If the proxy requires HTTP Basic Auth, use the user:password@host:port format:
proxy = { "http": "http://username:password@proxy_host:16816" }		#placeholder credentials and host
response = requests.get("http://www.baidu.com", proxies = proxy)
print(response.text)
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
4. Web client authentication (HTTP Basic Auth)
import requests
auth=('test', '123456')
response = requests.get('http://192.168.199.107', auth = auth)
print(response.text)
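The same credentials can also be passed through requests.auth.HTTPBasicAuth, which makes the auth type explicit; a minimal sketch reusing the address above:
import requests
from requests.auth import HTTPBasicAuth
response = requests.get('http://192.168.199.107', auth=HTTPBasicAuth('test', '123456'))
print(response.status_code)					#200 if the credentials are accepted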
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
5. Cookies and Session
#Cookies
import requests
response = requests.get("https://www.laosiji.com")
cookiejar = response.cookies
cookiedict = requests.utils.dict_from_cookiejar(cookiejar)
print(cookiejar)
print(cookiedict)
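Cookies can also be attached to an outgoing request with the cookies parameter; a minimal sketch with made-up cookie values:
import requests
cookies = {"token": "abc123", "uid": "1001"}				#made-up values for illustration
response = requests.get("https://www.laosiji.com", cookies=cookies)
print(response.request.headers.get("Cookie"))				#the Cookie header that was actually sent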
#Session
import requests
session = requests.Session()
headers = {"User-Agent":"Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"}
data = {"email":"your_email@example.com","password":"administrator"}			#replace with real account credentials
session.post("https://www.renren.com/Plogin.do", data=data, headers=headers)	#log in; the session keeps the login cookies
response = session.get("https://www.renren.com/41004312/profile")				#later requests through the same session reuse those cookies
print(response.text)

#Handling SSL certificate verification
import requests
response = requests.get("https://www.12306.cn",verify=True)				#to skip certificate verification, change True to False
print(response.text)
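With verify=False the request still works, but urllib3 prints an InsecureRequestWarning; a sketch of silencing it, assuming urllib3 is available alongside requests:
import requests
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)		#silence the warning triggered by verify=False
response = requests.get("https://www.12306.cn", verify=False)
print(response.status_code)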



===========================================================================================================
Using the BeautifulSoup module from the bs4 library
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" target="_blank" rel="external nofollow"  class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" target="_blank" rel="external nofollow"  class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" target="_blank" rel="external nofollow"  class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html,'lxml')
print(soup.title)						#print the <title> tag
print(soup.title.string)				#print only the title text
print(soup.head.title.stripped_strings)	#a generator of the text fragments; iterate it with a for loop
print(soup.prettify())					#pretty-print the parsed document
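Because stripped_strings is a generator, iterate it with a for loop; a minimal sketch reusing the soup built above:
for s in soup.body.stripped_strings:		#each text fragment with surrounding whitespace removed
    print(s)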
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
#find_all usage
#Signature: find_all(name, attrs, recursive, string, limit, **kwargs)
import re
print(soup.find_all("a",type="text/javascript"))	#find all <a> tags; extra keyword arguments filter by attribute
for tag in soup.find_all(re.compile('a')):
    print(tag)									#print every tag whose name matches the regular expression
print(soup.find_all(id=re.compile('123123')))	#tags whose id attribute matches the pattern
print(soup.find_all('a',recursive=False))		#do not search descendants; the default is True
print(soup.find_all(string="藍光"))				#search the text content for the given string
print(soup.find(id=re.compile('123123')).get("class"))			#find() returns a single tag (or None), so its "class" attribute can be read
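find_all can also filter on attributes with an attrs dict, and the returned list can be iterated to read attributes; a sketch against the html snippet above:
for a in soup.find_all("a", attrs={"class": "sister"}):		#all <a class="sister"> tags in the snippet
    print(a.get("href"), a.get("id"))						#e.g. http://example.com/elsie link1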
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
print(soup.div.attrs)					#all attributes of the first <div> tag as a dict
print(soup.div["id"])					#value of the id attribute of that <div>
print(soup.head.contents)				#child nodes of <head> as a list
print(soup.head.children)				#child nodes of <head> as a generator
print(soup.head.descendants)			#all descendant nodes of <head> as a generator
print(soup.li.parent)					#the parent tag of the first <li>
print(soup.li.parents)					#all ancestors of the first <li>
print(soup.head.next_sibling)			#the next sibling node of <head>
print(soup.head.next_siblings)			#all following siblings of <head>
print(soup.head.previous_sibling)		#the previous sibling node of <head>
print(soup.head.previous_siblings)		#all preceding siblings of <head>
for tag in soup.find_all(True):
    print(tag)							#print every tag in the document
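children, descendants, parents and the *_siblings attributes above are generators, so printing them only shows a generator object; a sketch of iterating one of them with the soup built above:
for i, child in enumerate(soup.body.children):		#walk the direct children of <body>
    print(i, child)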


--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
html='''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html,'lxml')
for i in soup.select('ul li'):			#CSS selectors search level by level
    print(i.string)						#the text content of each matched tag
    print(i["class"])					#the class attribute of each matched tag
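Besides tag selectors, select also understands id and class CSS syntax, and select_one returns only the first match; a sketch against the same html:
print(soup.select('#list-2 .element'))					#elements inside the ul with id="list-2"
print(soup.select('ul.list'))							#every <ul> whose class includes "list"
print(soup.select_one('.panel-heading h4').string)		#first match only; prints "Hello"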

--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---