Python爬蟲

以上是百度百科和維基百科對網絡爬蟲的定義。簡單來說，爬蟲就是抓取目标網站内容的工具，一般是根據定義的行為自動進行抓取；更智能的爬蟲會自動分析目标網站的結構，類似於搜尋引擎的爬蟲。我們這裡隻讨論基本的爬蟲原理。

###爬蟲工作原理

網絡爬蟲架構主要由控制器、解析器和索引庫三大部分組成，而爬蟲的工作原理主要展現在解析器這個環節。解析器的主要工作是下載網頁并進行頁面處理，主要是将一些JS腳本标簽、CSS代碼内容、空格字元、HTML标簽等内容處理掉。爬蟲的基本工作由解析器完成，所以解析器的具體流程是：

入口訪問 -> 下載内容 -> 分析結構 -> 提取内容

這裡我們通過分析一個網站（落網：http://luoo.net），對網站内容進行提取來進一步了解！

第一步确定目的

抓取目标網站的某一期所有音樂

第二步分析頁面結構

訪問落網的某一期刊，通過Chrome的開發者模式檢視播放清單中的歌曲，右側用紅色框線圈出來的是一些需要特别注意的語義結構，見下圖所示：

以上紅色框線圈出的地方主要有歌曲名稱、歌曲的編号等，這裡并沒有看到歌曲的實際檔案位址，所以我們繼續檢視：點選某一個歌曲就會立即在浏覽器中播放，這時我們可以在Chrome開發者模式的Network中看到實際請求的播放檔案，如下圖所示：

根據以上分析我們可以得到播放清單的位置和音樂檔案的路徑，接下來我們通過Python來實作這個目的。

Python環境安裝請自行Google

主要依賴第三方庫：requests、BeautifulSoup（bs4）、faker

主要思路是分成兩部分：第一部分用來發起請求、分析出播放清單，然後丢到隊列中；第二部分從隊列中逐條取出并下載檔案到本地。一般分析清單的速度更快，下載速度比較慢，可以借助多線程同時進行下載。

主要代碼如下:

Python

#-*- coding: utf-8 -*-
'''by sudo rm -rf http://imchenkun.com'''
import os
import requests
from bs4 import BeautifulSoup
import random
from faker import Factory
import Queue
import threading

# faker instance, used to generate a User-Agent string for the headers below.
fake = Factory.create()

# Base URL of a volume page; the volume number is appended to it.
luoo_site = 'http://www.luoo.net/music/'

# MP3 URL template; the two %s slots are the volume number and the track id.
luoo_site_mp3 = 'http://luoo-mp3.kssws.ks-cdn.com/low/luoo/radio%s/%s.mp3'

proxy_ips = ['27.15.236.236']  # replace with your own proxy IP(s)

# Request headers shared by the spider and the downloaders.
headers = {
    'Connection': 'keep-alive',
    'User-Agent': fake.user_agent()
}


def random_proxies():
    """Return a proxies mapping whose 'http' entry is a random item of proxy_ips."""
    ip_index = random.randint(0, len(proxy_ips) - 1)
    res = {'http': proxy_ips[ip_index]}
    return res


def fix_characters(s):
    """Return ``s`` with the characters ``< > : " / \\ | ? *`` removed.

    The result is used as part of a local file name by the downloader.
    """
    for c in '<>:"/\\|?*':
        s = s.replace(c, '')
    return s


class LuooSpider(threading.Thread):
    """Worker thread that scrapes volume pages and queues their track lists."""

    def __init__(self, url, vols, queue=None):
        """Store the crawl settings.

        url   -- base URL; a volume number is appended to build the page URL
        vols  -- iterable of volume numbers given as strings
        queue -- queue that receives one dict per crawled volume
        """
        threading.Thread.__init__(self)
        print('[luoo spider]')
        print('=' * 20)
        self.url = url
        self.queue = queue
        self.vol = '1'
        self.vols = vols

    def run(self):
        """Crawl every requested volume in order, then report completion."""
        for vol in self.vols:
            self.spider(vol)
        print('\ncrawl end\n\n')

    def spider(self, vol):
        """Fetch and parse one volume page and put its description on the queue."""
        url = self.url + vol
        print('crawling: ' + url + '\n')
        res = requests.get(url, proxies=random_proxies(), headers=headers)
        soup = BeautifulSoup(res.content, 'html.parser')
        title = soup.find('span', attrs={'class': 'vol-title'}).text
        cover = soup.find('img', attrs={'class': 'vol-cover'})['src']
        desc = soup.find('div', attrs={'class': 'vol-desc'})
        track_names = soup.find_all('a', attrs={'class': 'trackname'})
        track_count = len(track_names)
        # Before volume 12 track numbers 1-9 are a single digit (1..9); from
        # then on they are two digits, zero-padded on the left (01..09).
        single_digit_ids = int(vol) < 12
        tracks = []
        for track in track_names:
            _id = str(int(track.text[:2])) if single_digit_ids else track.text[:2]
            _name = fix_characters(track.text[4:])
            tracks.append({'id': _id, 'name': _name})
        phases = {
            'phase': vol,                # volume number
            'title': title,              # volume title
            'cover': cover,              # volume cover image
            'desc': desc,                # volume description
            'track_count': track_count,  # number of tracks
            'tracks': tracks             # track list (track id, track name)
        }
        self.queue.put(phases)


class LuooDownloader(threading.Thread):
    """Worker thread that takes volume dicts off a queue and saves their tracks."""

    def __init__(self, url, dist, queue=None):
        """Store the download settings.

        url   -- MP3 URL template with two ``%s`` slots (volume, track id)
        dist  -- directory under which one sub-directory per volume is created
        queue -- queue the volume dicts are taken from
        """
        threading.Thread.__init__(self)
        self.url = url
        self.queue = queue
        self.dist = dist
        self.__counter = 0

    def run(self):
        """Process queued volumes forever, marking each one done afterwards."""
        while True:
            # get() blocks until an item is available, so no polling is needed.
            phases = self.queue.get()
            try:
                self.download(phases)
            except Exception as exc:
                # Keep the worker alive so one failed volume cannot stall the
                # queue for the volumes that follow it.
                print('download failed: %s' % exc)
            finally:
                self.queue.task_done()

    def download(self, phases):
        """Download every track of one volume that is not already on disk."""
        local_file_dict = '%s/%s' % (self.dist, phases['phase'])
        if not os.path.exists(local_file_dict):
            os.makedirs(local_file_dict)
        for track in phases['tracks']:
            file_url = self.url % (phases['phase'], track['id'])
            local_file = '%s/%s.%s.mp3' % (local_file_dict, track['id'], track['name'])
            if os.path.isfile(local_file):
                print('break: ' + track['name'])
                continue
            print('downloading: ' + track['name'])
            res = requests.get(file_url, proxies=random_proxies(), headers=headers)
            if res.status_code != 200:
                # Do not store an error page under an .mp3 name.
                print('failed (HTTP %d): ' % res.status_code + track['name'])
                continue
            with open(local_file, 'wb') as f:
                f.write(res.content)
            print('done.\n')


if __name__ == '__main__':
    spider_queue = Queue.Queue()

    luoo = LuooSpider(luoo_site, vols=['680', '721', '725', '720'], queue=spider_queue)
    luoo.daemon = True
    luoo.start()

    downloader_count = 5
    for i in range(downloader_count):
        luoo_download = LuooDownloader(luoo_site_mp3, 'D:/luoo', queue=spider_queue)
        luoo_download.daemon = True
        luoo_download.start()

    # Every thread is a daemon, so the main thread has to wait: first for the
    # spider to queue all volumes, then for the workers to finish them.
    luoo.join()
    spider_queue.join()

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

#-*- coding: utf-8 -*-

'''by sudo rm -rf http://imchenkun.com'''

import os

import requests

from bs4 import BeautifulSoup

import random

from faker import Factory

import Queue

import threading

# faker instance; used below to generate the User-Agent header value.
fake = Factory.create()

# Base URL of a volume page; the spider appends the volume number to it.
luoo_site = 'http://www.luoo.net/music/'

# MP3 URL template; the downloader fills the two %s slots with the volume
# number and the track id.
luoo_site_mp3 = 'http://luoo-mp3.kssws.ks-cdn.com/low/luoo/radio%s/%s.mp3'

proxy_ips = [ '27.15.236.236' ] # replace with your own proxy IP(s)

# HTTP headers sent with the download requests.
headers = {

'Connection': 'keep-alive',

'User-Agent': fake.user_agent()

}

def random_proxies():
    """Build a proxies mapping from one randomly chosen entry of ``proxy_ips``.

    Returns a dict with the single key ``'http'``; the callers in this file
    pass it as the ``proxies`` argument of ``requests.get``.
    """
    chosen = proxy_ips[random.randrange(len(proxy_ips))]
    return {'http': chosen}

def fix_characters(s):
    """Return ``s`` with the characters ``< > : " / \\ | ? *`` removed.

    The spider applies this to track names before they are used as part of a
    local file name. The backslash entry is a single backslash: the previous
    two-backslash literal only matched two consecutive backslashes and let a
    lone one through.
    """
    for illegal in '<>:"/\\|?*':
        s = s.replace(illegal, '')
    return s

class LuooSpider(threading.Thread):
    """Worker thread that scrapes volume pages and queues their track lists.

    For every volume number in ``vols`` the page at ``url + vol`` is fetched
    and parsed, and one dict describing the volume is put on ``queue``.
    """

    def __init__(self, url, vols, queue=None):
        """Store the crawl settings.

        url   -- base URL; a volume number is appended to build the page URL
        vols  -- iterable of volume numbers given as strings
        queue -- queue that receives one dict per crawled volume
        """
        threading.Thread.__init__(self)
        # Single-argument print calls are parenthesised so they behave the
        # same on Python 2 and are also valid Python 3 syntax.
        print('[luoo spider]')
        print('=' * 20)
        self.url = url
        self.queue = queue
        self.vol = '1'
        self.vols = vols

    def run(self):
        """Crawl every requested volume in order, then report completion."""
        for vol in self.vols:
            self.spider(vol)
        print('\ncrawl end\n\n')

    def spider(self, vol):
        """Fetch and parse one volume page and put its description on the queue.

        The queued dict has the keys ``phase``, ``title``, ``cover``, ``desc``,
        ``track_count`` and ``tracks``; each track is a dict with ``id`` and
        ``name``.
        """
        # Honour the URL given to the constructor instead of the module global.
        url = self.url + vol
        print('crawling: ' + url + '\n')
        # Send the same headers the downloader uses.
        res = requests.get(url, proxies=random_proxies(), headers=headers)
        soup = BeautifulSoup(res.content, 'html.parser')
        title = soup.find('span', attrs={'class': 'vol-title'}).text
        cover = soup.find('img', attrs={'class': 'vol-cover'})['src']
        desc = soup.find('div', attrs={'class': 'vol-desc'})
        track_names = soup.find_all('a', attrs={'class': 'trackname'})
        track_count = len(track_names)
        # Before volume 12 track numbers 1-9 are a single digit (1..9); from
        # then on they are two digits, zero-padded on the left (01..09).
        single_digit_ids = int(vol) < 12
        tracks = []
        for track in track_names:
            # The link text starts with the two-character track number; the
            # name is taken from the fifth character on.
            _id = str(int(track.text[:2])) if single_digit_ids else track.text[:2]
            _name = fix_characters(track.text[4:])
            tracks.append({'id': _id, 'name': _name})
        phases = {
            'phase': vol,                # volume number
            'title': title,              # volume title
            'cover': cover,              # volume cover image
            'desc': desc,                # volume description
            'track_count': track_count,  # number of tracks
            'tracks': tracks             # track list (track id, track name)
        }
        self.queue.put(phases)

class LuooDownloader(threading.Thread):
    """Worker thread that takes volume dicts off a queue and saves their tracks.

    Each queued dict must provide the keys ``'phase'`` and ``'tracks'``, where
    ``'tracks'`` is a list of dicts with ``'id'`` and ``'name'``; these are the
    only fields ``download`` reads.
    """

    def __init__(self, url, dist, queue=None):
        """Store the download settings.

        url   -- MP3 URL template with two ``%s`` slots (volume, track id)
        dist  -- directory under which one sub-directory per volume is created
        queue -- queue the volume dicts are taken from
        """
        # The base-class initialiser has to run before start() can be called,
        # and run()/download() read self.queue and self.url.
        threading.Thread.__init__(self)
        self.url = url
        self.queue = queue
        self.dist = dist
        self.__counter = 0

    def run(self):
        """Process queued volumes forever, marking each one done afterwards."""
        while True:
            # get() blocks until an item is available, so the thread sleeps
            # instead of spinning on qsize().
            phases = self.queue.get()
            try:
                self.download(phases)
            except Exception as exc:
                # Keep the worker alive so one failed volume cannot stall the
                # queue for the volumes that follow it.
                print('download failed: %s' % exc)
            finally:
                # Lets a Queue.join() in the main thread return once every
                # queued volume has been handled.
                self.queue.task_done()

    def download(self, phases):
        """Download every track of one volume that is not already on disk.

        Files are written to ``<dist>/<phase>/<id>.<name>.mp3``. Tracks whose
        file already exists are skipped, and a non-200 response is reported
        and skipped rather than saved.
        """
        local_file_dict = '%s/%s' % (self.dist, phases['phase'])
        if not os.path.exists(local_file_dict):
            os.makedirs(local_file_dict)
        for track in phases['tracks']:
            file_url = self.url % (phases['phase'], track['id'])
            local_file = '%s/%s.%s.mp3' % (local_file_dict, track['id'], track['name'])
            if os.path.isfile(local_file):
                print('break: ' + track['name'])
                continue
            print('downloading: ' + track['name'])
            res = requests.get(file_url, proxies=random_proxies(), headers=headers)
            if res.status_code != 200:
                # Do not store an error page under an .mp3 name.
                print('failed (HTTP %d): ' % res.status_code + track['name'])
                continue
            with open(local_file, 'wb') as f:
                f.write(res.content)
            print('done.\n')


if __name__ == '__main__':
    spider_queue = Queue.Queue()

    luoo = LuooSpider(luoo_site, vols=['680', '721', '725', '720'], queue=spider_queue)
    luoo.daemon = True
    luoo.start()

    downloader_count = 5
    for i in range(downloader_count):
        luoo_download = LuooDownloader(luoo_site_mp3, 'D:/luoo', queue=spider_queue)
        luoo_download.daemon = True
        luoo_download.start()

    # Every thread is a daemon, so the process would exit as soon as the main
    # thread reached the end of the script. Wait for the spider to queue all
    # volumes, then for the workers to report each of them as done.
    luoo.join()
    spider_queue.join()

以上代碼執行後結果如下圖所示

本文轉自技術花妞妞 51CTO部落格，原文連結:http://blog.51cto.com/xiaogongju/1972647

Python爬蟲

繼續閱讀

來自python的【條件控制/語句循環/break/continue/else/pass】一、條件控制二、語句循環

無法解析的外部符号 wmain，該符号在函數 "void cdecl mainCRTStartupHelper(struct HINSTANCE *,unsigned short con......

TestLink導出用例轉換工具(XML2Excel)

YAML簡介和PyYAML安全操作YAML支援的類型YAML的優點：yaml的基本文法python操作

Small tricks

libsvm for python 安裝

學習軟體測試基礎測試第七天

Zeppelin 配置通路 REST APIApache Zeppelin Configuration REST API

【Torch】最簡潔logging使用指南

27. Remove Element(清單)題目代碼

Cloud Studio初體驗

使用 ctypes 進行 Python 和 C 的混合程式設計

【python】【資料處理】畫多元資料分布圖

【python】netconf協定對接管理裝置

「Python 網絡自動化」NETCONF —— Python 使用 NETCONF 管理配置 H3C 網絡裝置

在python中建立excel并寫入