Using the urllib library
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
1. urlopen
The basic module for making HTTP requests.
# Examples:
# 1. Print a site's source code
import urllib.request
response = urllib.request.urlopen('https://www.laosiji.com') # urlopen returns an http.client.HTTPResponse object
html = response.read().decode('utf-8')
print(html)
# 2. Print the response status and headers
import urllib.request
response = urllib.request.urlopen('https://www.laosiji.com')
print(response.status) # print the response status code
print(response.getheaders()) # print all of the response headers
# 3. Look up the site's owner (WHOIS, via the whois library)
import whois
request = whois.whois('www.laosiji.com')
print(request)
# 4. Identify the technologies the site is built with (via the builtwith library)
import builtwith
request = builtwith.parse('https://www.laosiji.com')
print(request)
# 5. Set a custom User-Agent
import urllib.request
user_agent="wswp"
url='https://www.laosiji.com'
headers = {'User-agent':user_agent}
request = urllib.request.Request(url,headers=headers)
request_all = urllib.request.urlopen(request)
html = request_all.read().decode('utf-8')
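# A quick sanity check (sketch only): httpbin.org/get echoes the request headers back, so the custom User-Agent can be verified; the httpbin URL is used purely for illustration.
import json
import urllib.request
req = urllib.request.Request('http://httpbin.org/get', headers={'User-agent': 'wswp'})
with urllib.request.urlopen(req) as resp:
    echoed = json.loads(resp.read().decode('utf-8'))
print(echoed['headers'].get('User-Agent')) # expected: wswp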
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
2. Request
# Examples:
# 1. Sending data with the request
from urllib import request, parse
url = 'http://httpbin.org/post'
headers = {
'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
'Host': 'httpbin.org'
}
form = {                     # renamed from "dict" to avoid shadowing the built-in
    'name': 'Germey'
}
data = bytes(parse.urlencode(form), encoding='utf8') # data must be converted to a bytes object
req = request.Request(url=url, data=data, headers=headers, method='POST') # url: request URL; data: request body; headers: request headers; method: HTTP method
response = request.urlopen(req)
print(response.read().decode('utf-8'))
# 2. Timeout handling
import urllib.request
import urllib.error
url = 'https://www.laosiji.com'
try:
    response = urllib.request.urlopen(url, timeout=0.1)
except urllib.error.URLError:   # catch the specific exception instead of using a bare except
    print("timeout")
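# To tell a timeout apart from other failures (a sketch): on a timeout, e.reason holds a socket.timeout instance.
import socket
import urllib.error
import urllib.request
try:
    urllib.request.urlopen('https://www.laosiji.com', timeout=0.1)
except urllib.error.URLError as e:
    if isinstance(e.reason, socket.timeout):
        print('request timed out')
    else:
        print('failed for another reason:', e.reason)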
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
3. Exception handling
code: the HTTP status code returned
reason: the cause of the error
headers: the response headers
# Example
from urllib import request,error
try:
    response = request.urlopen('http://cuiqingcai.com/index.htm')
except error.HTTPError as e:
    print(e.reason, e.code, e.headers, sep='\n')
except error.URLError as e:
    print(e.reason)
else:
    print('Request Successfully')
===========================================================================================================
Using the requests library
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
1. GET requests
# 1. Simple source download
import requests
url = "https://www.laosiji.com"
request = requests.get(url)
print(request.headers) # the response headers (Response objects have .headers, not .head)
print(request.request.headers) # the headers that were actually sent with the request
request.encoding = "ISO-8859-1" # override the text encoding
html = request.text # the page source as a decoded string
html_status = request.status_code # the response status code
html_encoding = request.encoding # the character encoding taken from the response headers, e.g. "utf-8"
html_url = request.url # the final URL of the request (.urlopen is not a Response attribute)
# 2. Adding request headers and query parameters
import requests
url = "https://www.laosiji.com"
headers = {"User-Agent":"Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"}
kw = {'wd':'长城'} # params takes a dict or string of query parameters, e.g. a Baidu search uses wd=""
request = requests.get(url,params=kw,headers=headers)
html = request.text
download_jsp = request.content # the raw response body as bytes (.contents is not a Response attribute)
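# Because .content is raw bytes it can be written straight to disk — a sketch of downloading a binary file (the favicon path is only an assumed example URL):
import requests
r = requests.get("https://www.laosiji.com/favicon.ico", timeout=10)
with open("favicon.ico", "wb") as f:
    f.write(r.content)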
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
2. POST requests
# Example
import requests
url = "https://www.laosiji.com"
headers = {"User-Agent":"Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"}
data = {
"type":"AUTO",
"i":"i love python",
"doctype":"json",
"xmlVersion":"1.8",
"keyfrom":"fanyi.web",
"ue":"UTF-8",
"action":"FY_BY_ENTER",
"typoResult":"true"
}
request = requests.post(url,data=data,headers=headers)
html = request.text
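# The data above asks for doctype=json; a JSON response body can be parsed with .json(). A sketch against httpbin.org/post, which simply echoes the submitted form back:
import requests
r = requests.post("http://httpbin.org/post", data={"i": "i love python"})
print(r.json()["form"]) # -> {'i': 'i love python'}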
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
3. Proxies
# The proxies parameter
import requests
url = "https://www.laosiji.com"
proxies = {
"http": "http://12.34.56.79:9527",
"https": "http://12.34.56.79:9527",
} # choose a different proxy depending on the protocol scheme
request = requests.get(url,proxies = proxies)
html = request.text
# Authenticated (private) proxy
import requests
# If the proxy requires HTTP Basic Auth, the credentials can be put in the proxy URL:
proxy = { "http": "mr_mao_hacker:[email protected]:16816" }
response = requests.get("http://www.baidu.com", proxies = proxy)
print(response.text)
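# Proxies can also come from the environment (a sketch; requests reads HTTP_PROXY / HTTPS_PROXY because trust_env defaults to True):
import os
import requests
os.environ["HTTP_PROXY"] = "http://12.34.56.79:9527" # same example proxy address as above
r = requests.get("http://www.baidu.com")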
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
4. Web client authentication (HTTP Basic Auth)
import requests
auth=('test', '123456')
response = requests.get('http://192.168.199.107', auth = auth)
print(response.text)
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
5. Cookies and Session
# Cookies
import requests
response = requests.get("https://www.laosiji.com")
cookiejar = response.cookies
cookiedict = requests.utils.dict_from_cookiejar(cookiejar)
print(cookiejar)
print(cookiedict)
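# Cookies can also be sent with a request through the cookies parameter (a sketch; the cookie name and value are made up):
import requests
cookies = {"sessionid": "abc123"}
r = requests.get("https://www.laosiji.com", cookies=cookies)
print(r.status_code)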
# Session
import requests
session = requests.Session()
headers = {"User-Agent":"Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"}
data = {"email":"[email protected]","password":"administrator"}
session.post("https://www.laosiji.com/Plogin.do", data=data, headers=headers) # log in; the session keeps the returned cookies
response = session.get("https://www.renren.com/41004312/profile") # subsequent requests reuse those cookies
print(response.text)
# Handling SSL certificate verification
import requests
response = requests.get("https://www.12306.cn",verify=True) # to skip certificate verification, change True to False
print(response.text)
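# A sketch of actually skipping verification: pass verify=False; urllib3 then emits an InsecureRequestWarning, which can be silenced as shown.
import requests
import urllib3
urllib3.disable_warnings()
response = requests.get("https://www.12306.cn", verify=False)
print(response.status_code)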
===========================================================================================================
Using the BeautifulSoup module from the bs4 library
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" target="_blank" rel="external nofollow" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" target="_blank" rel="external nofollow" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" target="_blank" rel="external nofollow" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html,'lxml') # parse as HTML ('xml' would treat this fragment as an XML document)
print(soup.title) # print the <title> tag
print(soup.title.string) # print only the title text
print(soup.head.title.stripped_strings) # this is a generator; iterate over it with a for loop to get the text
print(soup.prettify()) # pretty-print the parsed document
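# Because stripped_strings is a generator, iterating it yields the text — a short sketch using the html above:
for s in soup.head.title.stripped_strings:
    print(s) # -> The Dormouse's story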
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
# Using find_all
# Usage: find_all(tag_name, attrs, string)
import re
print(soup.find_all("a",type="text/javascript")) # find all <a> tags; keyword arguments act as extra attribute filters
for tag in soup.find_all(re.compile('a')):
    print(tag) # print every tag whose name matches the regex
print(soup.find_all(id=re.compile('123123'))) # tags whose id attribute matches the regex '123123'
print(soup.find_all('a',recursive=False)) # do not search descendants; the default is True
print(soup.find_all(string="蓝光")) # search the string content for an exact match
soup.find(id=re.compile('123123')).get("class") # get the "class" attribute of the matched tag (find_all returns a list, so use find here)
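# A usage sketch against the "Dormouse" html above: iterate the matches and read attributes from each Tag (find_all returns a list of Tag objects):
for a in soup.find_all('a', class_='sister'):
    print(a.get('href'), a.get('id'))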
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
# (generic navigation examples; soup.div / soup.li assume a document that contains those tags)
print(soup.div.attrs) # all attributes of the <div> tag as a dict
print(soup.div["id"]) # the value of the div's id attribute
print(soup.head.contents) # list of the direct children of <head>
print(soup.head.children) # generator over the direct children of <head>
print(soup.head.descendants) # generator over all descendants of <head>
print(soup.li.parent) # the parent tag of the first <li>
print(soup.li.parents) # generator over all ancestors of the first <li>
print(soup.head.next_sibling) # the next sibling node of <head>
print(soup.head.next_siblings) # generator over all following siblings of <head>
print(soup.head.previous_sibling) # the previous sibling node of <head>
print(soup.head.previous_siblings) # generator over all preceding siblings of <head>
for tag in soup.find_all(True):
    print(tag) # print every tag in the document
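# children, descendants, parents and the sibling attributes are generators, so they need to be iterated — a sketch using the "Dormouse" html:
for child in soup.head.children:
    print(child) # direct children of <head>, here just the <title> tag
for sib in soup.p.next_siblings:
    print(repr(sib)) # everything after the first <p>, including whitespace text nodes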
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
html='''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html,'lxml') # parse as HTML
for i in soup.select('ul li'): # CSS selector: descendant search, <li> tags inside <ul> tags
    print(i.string) # the text content of each matched element
    print(i["class"]) # the class attribute of each matched element
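# Further selector sketches against the same html: select by class and by id
print(soup.select('.panel-heading h4')[0].string) # -> Hello
print(soup.select('#list-2 .element')[0].string) # -> Foo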
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---