Using the urllib library
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
1. urlopen
The basic module for making HTTP requests.
# Examples:
# 1. Print a site's source code
import urllib.request
response = urllib.request.urlopen('https://www.laosiji.com') # urlopen returns an http.client.HTTPResponse object
html = response.read().decode('utf-8')
print(html)
# 2. Print the response status and headers
import urllib.request
response = urllib.request.urlopen('https://www.laosiji.com')
print(response.status) # print the response status code
print(response.getheaders()) # print all of the response headers
# 3. Look up the site's owner (WHOIS, via the whois library)
import whois
request = whois.whois('www.laosiji.com')
print(request)
# 4. Identify the technologies the site is built with (via the builtwith library)
import builtwith
request = builtwith.parse('https://www.laosiji.com')
print(request)
# 5. Set a custom User-Agent
import urllib.request
user_agent="wswp"
url='https://www.laosiji.com'
headers = {'User-agent':user_agent}
request = urllib.request.Request(url,headers=headers)
request_all = urllib.request.urlopen(request)
html = request_all.read().decode('utf-8')
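# A quick sanity check (sketch only): httpbin.org/get echoes the request headers back, so the custom User-Agent can be verified; the httpbin URL is used purely for illustration.
import json
import urllib.request
req = urllib.request.Request('http://httpbin.org/get', headers={'User-agent': 'wswp'})
with urllib.request.urlopen(req) as resp:
    echoed = json.loads(resp.read().decode('utf-8'))
print(echoed['headers'].get('User-Agent')) # expected: wswp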
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
2. Request
# Examples:
# 1. Sending data with the request
from urllib import request, parse
url = 'http://httpbin.org/post'
headers = {
'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
'Host': 'httpbin.org'
}
form = {                     # renamed from "dict" to avoid shadowing the built-in
    'name': 'Germey'
}
data = bytes(parse.urlencode(form), encoding='utf8') # data must be converted to a bytes object
req = request.Request(url=url, data=data, headers=headers, method='POST') # url: request URL; data: request body; headers: request headers; method: HTTP method
response = request.urlopen(req)
print(response.read().decode('utf-8'))
# 2. Timeout handling
import urllib.request
import urllib.error
url = 'https://www.laosiji.com'
try:
    response = urllib.request.urlopen(url, timeout=0.1)
except urllib.error.URLError:   # catch the specific exception instead of using a bare except
    print("timeout")
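# To tell a timeout apart from other failures (a sketch): on a timeout, e.reason holds a socket.timeout instance.
import socket
import urllib.error
import urllib.request
try:
    urllib.request.urlopen('https://www.laosiji.com', timeout=0.1)
except urllib.error.URLError as e:
    if isinstance(e.reason, socket.timeout):
        print('request timed out')
    else:
        print('failed for another reason:', e.reason)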
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
3. Exception handling
code: the HTTP status code returned
reason: the cause of the error
headers: the response headers
# Example
from urllib import request,error
try:
    response = request.urlopen('http://cuiqingcai.com/index.htm')
except error.HTTPError as e:
    print(e.reason, e.code, e.headers, sep='\n')
except error.URLError as e:
    print(e.reason)
else:
    print('Request Successfully')
===========================================================================================================
Using the requests library
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
1. GET requests
# 1. Simple source download
import requests
url = "https://www.laosiji.com"
request = requests.get(url)
print(request.headers) # the response headers (Response objects have .headers, not .head)
print(request.request.headers) # the headers that were actually sent with the request
request.encoding = "ISO-8859-1" # override the text encoding
html = request.text # the page source as a decoded string
html_status = request.status_code # the response status code
html_encoding = request.encoding # the character encoding taken from the response headers, e.g. "utf-8"
html_url = request.url # the final URL of the request (.urlopen is not a Response attribute)
# 2. Adding request headers and query parameters
import requests
url = "https://www.laosiji.com"
headers = {"User-Agent":"Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"}
kw = {'wd':'长城'} # params takes a dict or string of query parameters, e.g. a Baidu search uses wd=""
request = requests.get(url,params=kw,headers=headers)
html = request.text
download_jsp = request.content # the raw response body as bytes (.contents is not a Response attribute)
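# Because .content is raw bytes it can be written straight to disk — a sketch of downloading a binary file (the favicon path is only an assumed example URL):
import requests
r = requests.get("https://www.laosiji.com/favicon.ico", timeout=10)
with open("favicon.ico", "wb") as f:
    f.write(r.content)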
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
2. POST requests
# Example
import requests
url = "https://www.laosiji.com"
headers = {"User-Agent":"Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"}
data = {
"type":"AUTO",
"i":"i love python",
"doctype":"json",
"xmlVersion":"1.8",
"keyfrom":"fanyi.web",
"ue":"UTF-8",
"action":"FY_BY_ENTER",
"typoResult":"true"
}
request = requests.post(url,data=data,headers=headers)
html = request.text
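# The data above asks for doctype=json; a JSON response body can be parsed with .json(). A sketch against httpbin.org/post, which simply echoes the submitted form back:
import requests
r = requests.post("http://httpbin.org/post", data={"i": "i love python"})
print(r.json()["form"]) # -> {'i': 'i love python'}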
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
3. Proxies
# The proxies parameter
import requests
url = "https://www.laosiji.com"
proxies = {
"http": "http://12.34.56.79:9527",
"https": "http://12.34.56.79:9527",
} # choose a different proxy depending on the protocol scheme
request = requests.get(url,proxies = proxies)
html = request.text
# Authenticated (private) proxy
import requests
# If the proxy requires HTTP Basic Auth, the credentials can be put in the proxy URL:
proxy = { "http": "mr_mao_hacker:[email protected]:16816" }
response = requests.get("http://www.baidu.com", proxies = proxy)
print(response.text)
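# Proxies can also come from the environment (a sketch; requests reads HTTP_PROXY / HTTPS_PROXY because trust_env defaults to True):
import os
import requests
os.environ["HTTP_PROXY"] = "http://12.34.56.79:9527" # same example proxy address as above
r = requests.get("http://www.baidu.com")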
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
4. Web client authentication (HTTP Basic Auth)
import requests
auth=('test', '123456')
response = requests.get('http://192.168.199.107', auth = auth)
print(response.text)
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
5. Cookies and Session
# Cookies
import requests
response = requests.get("https://www.laosiji.com")
cookiejar = response.cookies
cookiedict = requests.utils.dict_from_cookiejar(cookiejar)
print(cookiejar)
print(cookiedict)
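# Cookies can also be sent with a request through the cookies parameter (a sketch; the cookie name and value are made up):
import requests
cookies = {"sessionid": "abc123"}
r = requests.get("https://www.laosiji.com", cookies=cookies)
print(r.status_code)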
# Session
import requests
session = requests.Session()
headers = {"User-Agent":"Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"}
data = {"email":"[email protected]","password":"administrator"}
session.post("https://www.laosiji.com/Plogin.do", data=data, headers=headers) # log in; the session keeps the returned cookies
response = session.get("https://www.renren.com/41004312/profile") # subsequent requests reuse those cookies
print(response.text)
# Handling SSL certificate verification
import requests
response = requests.get("https://www.12306.cn",verify=True) # to skip certificate verification, change True to False
print(response.text)
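# A sketch of actually skipping verification: pass verify=False; urllib3 then emits an InsecureRequestWarning, which can be silenced as shown.
import requests
import urllib3
urllib3.disable_warnings()
response = requests.get("https://www.12306.cn", verify=False)
print(response.status_code)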
===========================================================================================================
Using the BeautifulSoup module from the bs4 library
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" target="_blank" rel="external nofollow" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" target="_blank" rel="external nofollow" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" target="_blank" rel="external nofollow" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html,'lxml') # parse as HTML ('xml' would treat this fragment as an XML document)
print(soup.title) # print the <title> tag
print(soup.title.string) # print only the title text
print(soup.head.title.stripped_strings) # this is a generator; iterate over it with a for loop to get the text
print(soup.prettify()) # pretty-print the parsed document
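# Because stripped_strings is a generator, iterating it yields the text — a short sketch using the html above:
for s in soup.head.title.stripped_strings:
    print(s) # -> The Dormouse's story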
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
# Using find_all
# Usage: find_all(tag_name, attrs, string)
import re
print(soup.find_all("a",type="text/javascript")) # find all <a> tags; keyword arguments act as extra attribute filters
for tag in soup.find_all(re.compile('a')):
    print(tag) # print every tag whose name matches the regex
print(soup.find_all(id=re.compile('123123'))) # tags whose id attribute matches the regex '123123'
print(soup.find_all('a',recursive=False)) # do not search descendants; the default is True
print(soup.find_all(string="蓝光")) # search the string content for an exact match
soup.find(id=re.compile('123123')).get("class") # get the "class" attribute of the matched tag (find_all returns a list, so use find here)
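# A usage sketch against the "Dormouse" html above: iterate the matches and read attributes from each Tag (find_all returns a list of Tag objects):
for a in soup.find_all('a', class_='sister'):
    print(a.get('href'), a.get('id'))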
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
# (generic navigation examples; soup.div / soup.li assume a document that contains those tags)
print(soup.div.attrs) # all attributes of the <div> tag as a dict
print(soup.div["id"]) # the value of the div's id attribute
print(soup.head.contents) # list of the direct children of <head>
print(soup.head.children) # generator over the direct children of <head>
print(soup.head.descendants) # generator over all descendants of <head>
print(soup.li.parent) # the parent tag of the first <li>
print(soup.li.parents) # generator over all ancestors of the first <li>
print(soup.head.next_sibling) # the next sibling node of <head>
print(soup.head.next_siblings) # generator over all following siblings of <head>
print(soup.head.previous_sibling) # the previous sibling node of <head>
print(soup.head.previous_siblings) # generator over all preceding siblings of <head>
for tag in soup.find_all(True):
    print(tag) # print every tag in the document
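# children, descendants, parents and the sibling attributes are generators, so they need to be iterated — a sketch using the "Dormouse" html:
for child in soup.head.children:
    print(child) # direct children of <head>, here just the <title> tag
for sib in soup.p.next_siblings:
    print(repr(sib)) # everything after the first <p>, including whitespace text nodes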
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
html='''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html,'lxml') # parse as HTML
for i in soup.select('ul li'): # CSS selector: descendant search, <li> tags inside <ul> tags
    print(i.string) # the text content of each matched element
    print(i["class"]) # the class attribute of each matched element
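# Further selector sketches against the same html: select by class and by id
print(soup.select('.panel-heading h4')[0].string) # -> Hello
print(soup.select('#list-2 .element')[0].string) # -> Foo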
--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---