Using the urllib library
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
1.urlopen
The basic interface for making HTTP requests
#Examples:
#1. Print a site's source code
import urllib.request
response = urllib.request.urlopen('https://www.laosiji.com') #returns an http.client.HTTPResponse object
html = response.read().decode('utf-8')
print(html)
#2. Print the response status and headers
import urllib.request
response = urllib.request.urlopen('https://www.laosiji.com')
print(response.status) #the response status code
print(response.getheaders()) #all of the response headers
#3. Look up the site's owner (WHOIS record)
import whois
record = whois.whois('www.laosiji.com')
print(record)
#4. Detect the technologies a site is built with
import builtwith
result = builtwith.parse('https://www.laosiji.com')
print(result)
#5. Set the User-Agent
import urllib.request
user_agent = 'wswp'
url = 'https://www.laosiji.com'
headers = {'User-agent': user_agent}
request = urllib.request.Request(url, headers=headers)
response = urllib.request.urlopen(request)
html = response.read().decode('utf-8')
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
2.Request
#Examples:
#1. Sending parameters with a request
from urllib import request, parse
url = 'http://httpbin.org/post'
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
    'Host': 'httpbin.org'
}
form = {
    'name': 'Germey'
}
data = bytes(parse.urlencode(form), encoding='utf8') #data must be converted to a bytes object
req = request.Request(url=url, data=data, headers=headers, method='POST') #url: target address; data: request body; headers: request headers; method: HTTP method
response = request.urlopen(req)
print(response.read().decode('utf-8'))
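#Extra: the same urlencode helper also builds GET query strings; a minimal sketch (assumes httpbin.org's /get endpoint, which echoes the query back):
from urllib import request, parse
params = parse.urlencode({'name': 'Germey', 'age': 22}) #URL-encode the parameters
url = 'http://httpbin.org/get?' + params #append them to the URL for a GET request
response = request.urlopen(url)
print(response.read().decode('utf-8')) #httpbin echoes the query back as JSON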
#2. Timeout handling
import socket
import urllib.request
import urllib.error
url = 'https://www.laosiji.com'
try:
    response = urllib.request.urlopen(url, timeout=0.1)
except (urllib.error.URLError, socket.timeout): #catch the specific timeout errors instead of a bare except
    print("timeout")
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
3.Exception handling
code: the HTTP status code that was returned
reason: the cause of the failure
headers: the response headers
#Example
from urllib import request, error
try:
    response = request.urlopen('http://cuiqingcai.com/index.htm')
except error.HTTPError as e: #HTTPError is a subclass of URLError, so catch it first
    print(e.reason, e.code, e.headers, sep='\n')
except error.URLError as e:
    print(e.reason)
else:
    print('Request Successfully')
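#Extra: because HTTPError is a subclass of URLError, a single URLError handler also works; a sketch that tells the two cases apart with hasattr:
from urllib import request, error
try:
    response = request.urlopen('http://cuiqingcai.com/index.htm')
except error.URLError as e:
    if hasattr(e, 'code'): #only HTTPError carries a status code
        print('HTTP error:', e.code)
    else: #a plain URLError, e.g. a DNS failure
        print('URL error:', e.reason)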
===========================================================================================================
Using the requests library
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
1.GET requests
#1. Simple source download
import requests
url = "https://www.laosiji.com"
response = requests.get(url)
print(response.headers) #the response headers
print(response.request.headers) #the headers that were actually sent
response.encoding = "ISO-8859-1" #override the encoding
html = response.text #the page source as text
html_status = response.status_code #the response status code
html_encoding = response.encoding #the encoding of the response body, e.g. "utf-8"
html_url = response.url #the full, final URL of the request
#2. Adding request headers and query parameters
import requests
url = "https://www.laosiji.com"
headers = {"User-Agent":"Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"}
kw = {'wd':'長城'} #params takes a dict (or string) of query parameters, e.g. Baidu queries use wd=""
response = requests.get(url, params=kw, headers=headers)
html = response.text
download_jsp = response.content #the raw response body as bytes
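#Extra: when an endpoint returns JSON, requests can decode it directly; a small sketch against httpbin.org (which echoes query parameters back):
import requests
response = requests.get('http://httpbin.org/get', params={'wd': '長城'})
data = response.json() #parse the JSON body into a dict
print(data['args']) #httpbin echoes the query parameters here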
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
2.POST requests
#Example
import requests
url = "https://www.laosiji.com"
headers = {"User-Agent":"Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"}
data = {
    "type":"AUTO",
    "i":"i love python",
    "doctype":"json",
    "xmlVersion":"1.8",
    "keyfrom":"fanyi.web",
    "ue":"UTF-8",
    "action":"FY_BY_ENTER",
    "typoResult":"true"
}
response = requests.post(url, data=data, headers=headers)
html = response.text
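#Extra: if the server expects a JSON body rather than form data, the json parameter serializes it and sets the Content-Type header; a minimal sketch against httpbin.org:
import requests
response = requests.post('http://httpbin.org/post', json={'i': 'i love python'}) #sends application/json
print(response.json()['json']) #httpbin echoes the parsed JSON body back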
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
3.Proxies
#The proxies parameter
import requests
url = "https://www.laosiji.com"
proxies = {
    "http": "http://12.34.56.79:9527",
    "https": "http://12.34.56.79:9527",
} #the proxy is picked by the scheme of the URL being requested
response = requests.get(url, proxies=proxies)
html = response.text
#Authenticated (private) proxies
import requests
#If the proxy requires HTTP Basic Auth, use this form:
proxy = { "http": "http://mr_mao_hacker:[email protected]:16816" }
response = requests.get("http://www.baidu.com", proxies=proxy)
print(response.text)
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
4.Web client authentication
import requests
auth = ('test', '123456') #(username, password)
response = requests.get('http://192.168.199.107', auth=auth)
print(response.text)
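#Extra: the (user, password) tuple is shorthand for HTTPBasicAuth; the explicit form below is equivalent and makes the auth scheme visible (same test host as above):
import requests
from requests.auth import HTTPBasicAuth
response = requests.get('http://192.168.199.107', auth=HTTPBasicAuth('test', '123456'))
print(response.status_code)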
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
5.Cookies and Session
#Cookies
import requests
response = requests.get("https://www.laosiji.com")
cookiejar = response.cookies #a RequestsCookieJar holding the returned cookies
cookiedict = requests.utils.dict_from_cookiejar(cookiejar) #convert the jar into a plain dict
print(cookiejar)
print(cookiedict)
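#Extra: cookies can also be sent with a request via the cookies parameter; a minimal sketch (the cookie name and value are made up for illustration):
import requests
cookies = {'sessionid': 'abc123'} #hypothetical cookie
response = requests.get('https://www.laosiji.com', cookies=cookies)
print(response.status_code)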
#Session
import requests
session = requests.Session()
headers = {"User-Agent":"Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"}
data = {"email":"[email protected]","password":"administrator"}
session.post("https://www.laosiji.com/Plogin.do", data=data, headers=headers) #log in; the session keeps the returned cookies
response = session.get("https://www.renren.com/41004312/profile") #later requests reuse those cookies automatically
print(response.text)
#Handling SSL certificate verification
import requests
response = requests.get("https://www.12306.cn", verify=True) #set verify=False to skip certificate verification
print(response.text)
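#Extra: a sketch of the verify=False path; urllib3 emits an InsecureRequestWarning, which can be silenced explicitly:
import requests
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) #suppress the insecure-request warning
response = requests.get('https://www.12306.cn', verify=False) #skip certificate verification
print(response.status_code)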
===========================================================================================================
Using the BeautifulSoup module from the bs4 library
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" target="_blank" rel="external nofollow" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" target="_blank" rel="external nofollow" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" target="_blank" rel="external nofollow" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml') #use the lxml HTML parser ('xml' would treat the document as XML)
print(soup.title) #the <title> tag itself
print(soup.title.string) #only its text content
print(soup.head.title.stripped_strings) #a generator, not the text; iterate it with a for loop
print(soup.prettify()) #the document, pretty-printed
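#Extra: since stripped_strings is a generator, printing it directly only shows the generator object; iterate to see the values:
for text in soup.stripped_strings: #each text fragment, with surrounding whitespace removed
    print(text)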
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
#Using find_all
#Signature: find_all(name, attrs, recursive, string, **kwargs)
import re
print(soup.find_all("a", type="text/javascript")) #all <a> tags, optionally filtered by attribute conditions
for tag in soup.find_all(re.compile('a')): #tags whose name matches the regex
    print(tag)
print(soup.find_all(id=re.compile('123123'))) #tags whose id attribute matches the regex
print(soup.find_all('a', recursive=False)) #search direct children only; the default is True
print(soup.find_all(string="藍光")) #match against text content instead of tag names
for tag in soup.find_all(id=re.compile('123123')): #find_all returns a list, so read attributes tag by tag
    print(tag.get("class")) #the "class" attribute of each matched tag
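#Extra: find returns the first match (or None) instead of a list, which is handier when a single tag is expected; a sketch against the sample document above:
first_link = soup.find('a', id='link1') #the first <a> with id="link1"
if first_link is not None:
    print(first_link.get('class')) #-> ['sister']
    print(first_link['href']) #-> http://example.com/elsie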
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
print(soup.div.attrs) #all attributes of the first <div>, as a dict
print(soup.div["id"]) #the value of that <div>'s id attribute
print(soup.head.contents) #a list of <head>'s direct children
print(soup.head.children) #a generator over <head>'s direct children
print(soup.head.descendants) #a generator over all of <head>'s descendants
print(soup.li.parent) #the first <li> tag's direct parent
print(soup.li.parents) #a generator over all of that <li>'s ancestors
print(soup.head.next_sibling) #the next sibling of <head>
print(soup.head.next_siblings) #a generator over all following siblings of <head>
print(soup.head.previous_sibling) #the previous sibling of <head>
print(soup.head.previous_siblings) #a generator over all preceding siblings of <head>
for tag in soup.find_all(True):
    print(tag) #every tag in the document
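#Extra: children, descendants, parents and the sibling variants are generators, so they print as generator objects unless iterated; a small sketch:
for child in soup.body.children: #iterate to see each direct child of <body>
    print(repr(child)) #tags and bare text nodes (newlines) alike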
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
html='''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml') #lxml HTML parser, as above
for i in soup.select('ul li'): #CSS selector: <li> tags nested under a <ul>
    print(i.string) #the text content of each match
    print(i["class"]) #the class attribute of each match
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---