
Python Learning (2)

Crawling a subset of the links on a web page: the script below recursively collects every link whose path starts with /m/ on www.ftchinese.com.

#!/usr/bin/python
# coding: utf-8

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()

def getlink(pageurl):
    global pages
    html = urlopen('http://www.ftchinese.com' + pageurl)
    bs_data = BeautifulSoup(html, 'lxml')
    # Uncomment the next two lines to drop into the ipdb debugger here:
    # from ipdb import set_trace
    # set_trace()
    # Keep only anchors whose href starts with /m/
    for link in bs_data.find_all('a', href=re.compile("^(/m/)")):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                # We have found a page we have not seen before
                newpage = link.attrs['href']
                print(newpage)
                pages.add(newpage)
                getlink(newpage)   # recurse into the new page

getlink("")

<code>&lt;br&gt;</code>

<code></code>

This post was reproduced from 妙曼's 51CTO blog. Original article: http://blog.51cto.com/yanruohan/1913551. Please contact the original author for reprint permission.