The above are Baidu Baike's and Wikipedia's definitions of a web crawler. Simply put, a crawler is a tool that fetches content from a target site, usually following a predefined set of rules automatically; smarter crawlers, like those behind search engines, analyze the target site's structure on their own. Here we only discuss basic crawler principles.
### How Crawlers Work
A web crawler framework consists of three main parts: a controller, a parser, and an index store. The actual crawling happens in the parser: it downloads a page and then processes it, stripping out JS script tags, CSS, whitespace, HTML markup, and so on. Since the parser does the crawler's basic work, its workflow is:

Entry access -> Download content -> Analyze structure -> Extract content
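As a minimal sketch of that workflow, here is what the parser stage might look like with requests and BeautifulSoup (the same libraries the full script below relies on; the volume URL is just an example page):

```python
# -*- coding: utf-8 -*-
# Minimal sketch of the parser stage: download a page, strip the script
# and style blocks, and extract the remaining visible text.
import requests
from bs4 import BeautifulSoup

def parse(url):
    res = requests.get(url)                            # entry access -> download content
    soup = BeautifulSoup(res.content, 'html.parser')   # analyze structure
    for tag in soup(['script', 'style']):              # drop JS and CSS content
        tag.decompose()
    return soup.get_text(separator=' ', strip=True)    # extract content

print(parse('http://www.luoo.net/music/721')[:200])
```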
To dig deeper, let's analyze a site [落网: http://luoo.net] and extract content from it.
Step 1: Define the goal

Fetch every track from one volume of the target site.

Step 2: Analyze the page structure

Open one of Luoo's volumes and inspect the tracks in the playlist with Chrome's developer tools. The red boxes on the right mark semantic structures that deserve special attention, as shown below:

The red boxes above highlight the track names, track numbers, and so on. The actual audio file URL is nowhere to be seen, so we keep looking: clicking a track makes it play immediately in the browser, and the Network panel of Chrome's developer tools then reveals the request for the actual audio file, as shown below:

From this analysis we know where the playlist lives and how the audio file paths are built; next we implement this in Python.
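Concretely, the volume page and the CDN path for each mp3 follow two URL templates (these are the same constants the full script below uses; the volume and track id here are just example values):

```python
# URL patterns recovered from the page analysis above.
luoo_site = 'http://www.luoo.net/music/'
luoo_site_mp3 = 'http://luoo-mp3.kssws.ks-cdn.com/low/luoo/radio%s/%s.mp3'

vol, track_id = '721', '05'                # example: volume 721, track 05
print(luoo_site + vol)                     # the playlist (volume) page
print(luoo_site_mp3 % (vol, track_id))     # the actual mp3 file
```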
For setting up a Python environment, please Google it yourself.

The main third-party dependencies are requests, BeautifulSoup (bs4), and faker.

The idea splits into two parts: the first issues requests, parses the playlist out of each page, and pushes it onto a queue; the second pulls entries off the queue and downloads the files locally. Parsing a playlist is fast while downloading is slow, so several download threads run in parallel.
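This is the standard producer-consumer pattern built on Python's queue and threading modules; a stripped-down sketch (the volume numbers are placeholders):

```python
# Producer-consumer skeleton: one thread fills the queue, several drain it.
import queue
import threading

q = queue.Queue()

def producer():
    for vol in ['680', '721']:        # placeholder volume list
        q.put(vol)                    # hand each playlist to the downloaders

def consumer():
    while True:
        vol = q.get()                 # blocks until work is available
        print('downloading vol.%s' % vol)
        q.task_done()                 # tells q.join() this item is finished

threading.Thread(target=producer).start()
for _ in range(3):                    # several downloaders run in parallel
    threading.Thread(target=consumer, daemon=True).start()
q.join()                              # wait until every item is processed
```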
The main code is as follows:
```python
# -*- coding: utf-8 -*-
'''by sudo rm -rf  http://imchenkun.com'''
import os
import random
import threading
import queue

import requests
from bs4 import BeautifulSoup
from faker import Faker

fake = Faker()

luoo_site = 'http://www.luoo.net/music/'
luoo_site_mp3 = 'http://luoo-mp3.kssws.ks-cdn.com/low/luoo/radio%s/%s.mp3'

proxy_ips = ['27.15.236.236']  # replace with your own proxy IPs

headers = {
    'Connection': 'keep-alive',
    'User-Agent': fake.user_agent()
}


def random_proxies():
    """Pick a random proxy for each request."""
    return {'http': random.choice(proxy_ips)}


def fix_characters(s):
    """Strip characters that are not allowed in file names."""
    for c in ['<', '>', ':', '"', '/', '\\', '|', '?', '*']:
        s = s.replace(c, '')
    return s


class LuooSpider(threading.Thread):
    """Producer: fetch each volume page, parse the playlist, queue it."""

    def __init__(self, url, vols, queue=None):
        threading.Thread.__init__(self)
        print('[luoo spider]')
        print('=' * 20)
        self.url = url
        self.queue = queue
        self.vols = vols

    def run(self):
        for vol in self.vols:
            self.spider(vol)
        print('\ncrawl end\n\n')

    def spider(self, vol):
        url = self.url + vol
        print('crawling: ' + url + '\n')
        res = requests.get(url, proxies=random_proxies())
        soup = BeautifulSoup(res.content, 'html.parser')
        title = soup.find('span', attrs={'class': 'vol-title'}).text
        cover = soup.find('img', attrs={'class': 'vol-cover'})['src']
        desc = soup.find('div', attrs={'class': 'vol-desc'})
        track_names = soup.find_all('a', attrs={'class': 'trackname'})
        track_count = len(track_names)
        tracks = []
        for track in track_names:
            # Before vol.12, track numbers 1-9 are a single digit (1-9);
            # from then on they are zero-padded to two digits (01-09).
            _id = str(int(track.text[:2])) if (int(vol) < 12) else track.text[:2]
            _name = fix_characters(track.text[4:])
            tracks.append({'id': _id, 'name': _name})
        phases = {
            'phase': vol,                # volume number
            'title': title,              # volume title
            'cover': cover,              # volume cover
            'desc': desc,                # volume description
            'track_count': track_count,  # number of tracks
            'tracks': tracks             # track list (id, name)
        }
        self.queue.put(phases)


class LuooDownloader(threading.Thread):
    """Consumer: take playlists off the queue and download each track."""

    def __init__(self, url, dist, queue=None):
        threading.Thread.__init__(self)
        self.url = url
        self.queue = queue
        self.dist = dist

    def run(self):
        while True:
            phases = self.queue.get()  # blocks until the spider queues a volume
            self.download(phases)
            self.queue.task_done()

    def download(self, phases):
        for track in phases['tracks']:
            file_url = self.url % (phases['phase'], track['id'])

            local_file_dir = '%s/%s' % (self.dist, phases['phase'])
            if not os.path.exists(local_file_dir):
                os.makedirs(local_file_dir)

            local_file = '%s/%s.%s.mp3' % (local_file_dir, track['id'], track['name'])
            if not os.path.isfile(local_file):
                print('downloading: ' + track['name'])
                res = requests.get(file_url, proxies=random_proxies(), headers=headers)
                with open(local_file, 'wb') as f:
                    f.write(res.content)
                print('done.\n')
            else:
                print('break: ' + track['name'])


if __name__ == '__main__':
    spider_queue = queue.Queue()

    luoo = LuooSpider(luoo_site, vols=['680', '721', '725', '720'], queue=spider_queue)
    luoo.daemon = True
    luoo.start()

    downloader_count = 5
    for i in range(downloader_count):
        luoo_download = LuooDownloader(luoo_site_mp3, 'D:/luoo', queue=spider_queue)
        luoo_download.daemon = True
        luoo_download.start()

    luoo.join()          # wait for the spider to finish all volumes
    spider_queue.join()  # then wait until every queued volume is downloaded
```
Running the code produces the result shown in the figure below.
This article is reposted from 技术花妞妞's 51CTO blog; original link: http://blog.51cto.com/xiaogongju/1972647