
Python Learning (2)

Crawling a subset of the links on a web page: the script below recursively collects every link whose path starts with /m/ on www.ftchinese.com.

#!/usr/bin/python
# coding: utf-8

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()

def getlink(pageurl):
    global pages
    html = urlopen('http://www.ftchinese.com' + pageurl)
    bs_data = BeautifulSoup(html, 'lxml')
    # Uncomment the next two lines to drop into the ipdb debugger here:
    # from ipdb import set_trace
    # set_trace()
    # Keep only anchors whose href starts with /m/
    for link in bs_data.find_all('a', href=re.compile("^(/m/)")):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                # We have found a page we have not seen before
                newpage = link.attrs['href']
                print(newpage)
                pages.add(newpage)
                getlink(newpage)   # recurse into the new page

getlink("")

<code>&lt;br&gt;</code>

<code></code>

This post was reproduced from 妙曼's 51CTO blog. Original article: http://blog.51cto.com/yanruohan/1913551. Please contact the original author for reprint permission.