天天看点

Python3 urllib 笔记

urllib

1. request

request存在于urllib库中,导入方法:

from urllib import request

1.1 urlopen

urlopen的方法以及参数如下:

用于打开一个url

  • url: 打开的url
  • data: POST提交的数据(请求体,需为bytes类型)
  • timeout: 超时时间

用法如下:

def demo():
    """Open ``_url`` with ``urlopen`` and print the status code and headers.

    An ``HTTPError`` is printed and the function returns early, because no
    response object exists in that case.  Other exceptions propagate.
    """
    try:
        # The timeout is given in seconds so a stalled server cannot block
        # the call indefinitely.
        s = request.urlopen(_url, timeout=10)
    except request.HTTPError as e:
        print(e)
        return
    # The response is a context manager; leaving the block closes it even if
    # reading or decoding fails.
    with s:
        msg = s.info()
        # Kept to illustrate reading and decoding the body; not printed.
        data = s.read().decode('utf-8')
        # print_list(dir(s))    # uncomment to list all attributes and methods
        print(s.status)
        print(msg)
           

urlopen返回的响应对象的方法:

- info(): 返回一个HTTPMessage对象

返回response headers信息

- read(size)

- readline()

- readlines()

- close()

- getcode(): 获得应答码

1.2 urlretrieve

用于下载一个远程文件

  • url: 地址
  • filename: 保存文件名
  • reporthook: 下载状态报告
  • data: POST提交的数据(请求体,需为bytes类型)
  • 返回(filename, HTTPMessage)

reporthook

def progress(blk, blk_size, total_size)

- blk: 传输的块数

- blk_size: 块大小

- total_size: 数据总大小

测试代码

def progress(blk, blk_size, total_size):
    """Print download progress; usable as the ``reporthook`` of ``urlretrieve``.

    Args:
        blk: Number of blocks transferred so far.
        blk_size: Size of one block in bytes.
        total_size: Total size of the data in bytes.  A non-positive value
            is treated as "size unknown".
    """
    downloaded = blk * blk_size
    if total_size <= 0:
        # Without a usable total no percentage can be computed; this also
        # avoids a ZeroDivisionError.
        print('%d/unknown' % downloaded)
        return
    # The final block is usually only partly filled, so blk * blk_size can
    # exceed the real size; clamp so the report never goes past 100%.
    downloaded = min(downloaded, total_size)
    print('%d/%d - %.02f%%' % (downloaded, total_size, downloaded * 100.0 / total_size))

def retrieve():
    """Save the resource at ``_url`` to ``index.html``, reporting via ``progress``."""
    destination = 'index.html'
    # The return value of urlretrieve is not used in this demo.
    request.urlretrieve(_url, destination, reporthook=progress)
           

1.3 Request

class Request:

    def __init__(self, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False,
                 method=None):
           

可以使用Request添加或修改http头

def build_opener_demo():
    """Build (without sending) a request carrying form data and a custom User-Agent."""
    form_fields = {'username': 'jjhfen00', 'password': '962886do621374#'}
    encoded_form = parse.urlencode(form_fields)
    # The form string is encoded to UTF-8 bytes before being used as the body.
    body = encoded_form.encode('utf-8')
    custom_headers = {'User-Agent': 'Mozilla/5.0'}
    req = request.Request('http://www.douban.com', data=body, headers=custom_headers)
    # Further headers could be attached with req.add_header(name, value).
           

1.4 build_opener

"""
Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP, FTP and when applicable HTTPS.

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used."""
def build_opener(*handlers)
           
  • 参数为Handler列表
    • BaseHandler为父类
    • HTTPHandler
    • HTTPSHandler
    • HTTPCookieProcessor
  • 返回OpenerDirector
def build_opener_demo():
    """POST form data through a custom opener and print the decoded response body.

    The opener is built with an ``HTTPHandler`` whose debug output is
    enabled.
    """
    data = {'username': 'jjhfen00', 'password': '962886do621374#'}
    header = {'User-Agent': 'Mozilla/5.0'}
    # The urlencoded form is encoded to UTF-8 bytes before it is passed as
    # the request body.
    req = request.Request('http://www.douban.com', data=parse.urlencode(data).encode('utf-8'), headers=header)
    # req.add_header(name, value) can attach further headers.
    # debuglevel=1 turns on the handler's debug output.
    opener = request.build_opener(request.HTTPHandler(debuglevel=1))
    # urlopen() has no ``headers`` parameter, so custom headers have to go
    # through a Request object as above.
    # The response is closed on leaving the block, even if read/decode fails.
    with opener.open(req) as s:
        print(s.read().decode('utf-8'))
           

output

<!DOCTYPE HTML>
<html lang="zh-cms-Hans" class="">
<head>
<meta charset="UTF-8">
<meta name="description" content="提供图书、电影、音乐唱片的推荐、评论和价格比较,以及城市独特的文化生活。">
<meta name="keywords" content="豆瓣,广播,登陆豆瓣">
<meta property="qc:admins" content="" />
<meta property="wb:webmaster" content="d4a17a4fa24c2" />
<meta name="mobile-agent" content="format=xhtml; url=http://m.douban.com">
<title>豆瓣</title>
           

2. parse

2.1 urlencode

  • 把字典数据转换为url编码
  • 用途
  • 对url参数进行编码
  • 对post上去的form数据进行编码
def urlencode():
    """Round-trip a dict through ``parse.urlencode`` and ``parse.parse_qs``.

    Prints the encoded query string, then the dict obtained by parsing it
    back.
    """
    params = {'score': 100, 'name': 'basic spider', 'comment': 'very good'}
    qs = parse.urlencode(params)        # dict -> query string
    print(qs)
    print(parse.parse_qs(qs))           # query string -> dict of value lists
           

output:

name=basic+spider&score=100&comment=very+good
{'name': ['basic spider'], 'score': ['100'], 'comment': ['very good']}

2.2 parse_qs

  • 把url编码转换为字典数据
def parse_test():
    """Parse the query parameters of a sample Baidu search URL and print them.

    Only the query component is handed to ``parse_qs``.  Passing the whole
    URL would fold scheme, host and path into the first key
    (``'https://www.baidu.com/s?wd'`` instead of ``'wd'``).
    """
    url = 'https://www.baidu.com/s?wd=python%E5%AD%A6%E4%B9%A0&rsv_spt=1&rsv_iqid=0x9949316d002fc16e&issp=1&f=8&rsv_bp=1&rsv_idx=2&ie=utf-8&tn=baiduhome_pg&rsv_enter=1&oq=python&inputT=1288&rsv_t=332846lzDwGW5S%2FPcPxNwi%2BGLXjhhSsLTTEoq7CDUafIYdFz%2FW7jKJOOhPpuzg1oFlve&rsv_sug3=14&rsv_sug1=14&rsv_sug7=100&rsv_pq=f4c9c20c002dabfe&bs=python'
    # urlsplit isolates the part after '?'.
    query = parse.urlsplit(url).query
    result = parse.parse_qs(query)
    print_dict(result)
           

output

('https://www.baidu.com/s?wd', ['python学习'])
('rsv_sug7', ['100'])
('rsv_iqid', ['0x9949316d002fc16e'])
('oq', ['python'])
('rsv_bp', ['1'])
('issp', ['1'])
('rsv_t', ['332846lzDwGW5S/PcPxNwi+GLXjhhSsLTTEoq7CDUafIYdFz/W7jKJOOhPpuzg1oFlve'])
('rsv_idx', ['2'])
('rsv_sug1', ['14'])
('rsv_spt', ['1'])
('rsv_enter', ['1'])
('ie', ['utf-8'])
('rsv_sug3', ['14'])
('inputT', ['1288'])
('tn', ['baiduhome_pg'])
('bs', ['python'])
('rsv_pq', ['f4c9c20c002dabfe'])
('f', ['8'])
           

3. cookie

需要使用的类:

  • http.cookiejar
    • from http import cookiejar

  • request.HTTPCookieProcessor

CookieJar:

class CookieJar:
    """Collection of HTTP cookies.

    You may not need to know about this class: try
    urllib.request.build_opener(HTTPCookieProcessor).open(url).
    """
    def __init__(self, policy=None):
           

提供一个解析并且保存cookie的接口

HTTPCookieProcessor:

class HTTPCookieProcessor(BaseHandler):
    def __init__(self, cookiejar=None):
           
def cookie_demo():
    """Request douban.com twice through a cookie-aware opener and print the results.

    Prints the headers of the first response, then the contents of the
    cookie jar, then the headers of a second response obtained through the
    same opener.
    """
    cookiejar_ = cookiejar.CookieJar()
    handler = request.HTTPCookieProcessor(cookiejar_)
    # debuglevel=1 turns on the HTTP handler's debug output.
    opener = request.build_opener(handler, request.HTTPHandler(debuglevel=1))
    # Each response is closed on leaving its block, even if printing fails.
    with opener.open('http://www.douban.com') as s:
        print_list(s.getheaders())

    separator = '=' * 70
    print(separator)
    # NOTE(review): _cookies is a private attribute of CookieJar; it is
    # printed here only to show the jar's contents.
    print('cookiejar = ', cookiejar_._cookies)
    print(separator)
    with opener.open('http://www.douban.com') as s:
        print(s.info())     # response headers as an HTTPMessage
           

output

('Server', 'dae')
('Date', 'Sun, 28 Feb 2016 17:55:14 GMT')
('Content-Type', 'text/html; charset=utf-8')
('Content-Length', '94309')
('Connection', 'close')
('X-Douban-Mobileapp', '0')
('Expires', 'Sun, 1 Jan 2006 01:00:00 GMT')
('X-Douban-Newbid', 'Iit6RGmQAE4')
('Pragma', 'no-cache')
('Cache-Control', 'must-revalidate, no-cache, private')
('P3P', 'CP="IDC DSP COR ADM DEVi TAIi PSA PSD IVAi IVDi CONi HIS OUR IND CNT"')
('Set-Cookie', 'bid="Iit6RGmQAE4"; path=/; domain=.douban.com; expires=Mon, 27-Feb-2017 17:55:14 GMT')
('Set-Cookie', 'll="118172"; path=/; domain=.douban.com; expires=Mon, 27-Feb-2017 17:55:14 GMT')
('X-DAE-Node', 'sindar21c')
('X-DAE-App', 'sns')
('Strict-Transport-Security', 'max-age=0;')
======================================================================
cookiejar =  {'.douban.com': {'/': {'ll': Cookie(version=0, name='ll', value='"118172"', port=None, port_specified=False, domain='.douban.com', domain_specified=True, domain_initial_dot=True, path='/', path_specified=True, secure=False, expires=1488218114, discard=False, comment=None, comment_url=None, rest={}, rfc2109=False), 'bid': Cookie(version=0, name='bid', value='"Iit6RGmQAE4"', port=None, port_specified=False, domain='.douban.com', domain_specified=True, domain_initial_dot=True, path='/', path_specified=True, secure=False, expires=1488218114, discard=False, comment=None, comment_url=None, rest={}, rfc2109=False)}}}
======================================================================
Server: dae
Date: Sun, 28 Feb 2016 17:55:14 GMT
Content-Type: text/html; charset=utf-8
Content-Length: 
Connection: close
X-Douban-Mobileapp: 0
Expires: Sun, 1 Jan 2006 01:00:00 GMT
Pragma: no-cache
Cache-Control: must-revalidate, no-cache, private
X-DAE-Node: sindar17b
X-DAE-App: sns
Strict-Transport-Security: max-age=0;