The proxy pool is divided into four modules: a storage module, a getter module, a tester module, and an API module.
Module | Role |
---|---|
Storage module | Uses a Redis sorted set to deduplicate proxies and mark their status. It is also the central, foundational module that ties the other modules together. |
Getter module | Periodically crawls proxies from proxy sites and hands them to the storage module, which saves them to the database. |
Tester module | Periodically fetches all proxies from the storage module, tests each one, and marks it differently (raising or lowering its score) depending on the result. |
API module | Exposes the service as a Web API: it connects to the database and returns usable proxies over the Web. |
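Before diving into the code, note the storage module's core idea: a Redis sorted set stores each proxy as a member whose score marks its status, so deduplication comes for free. A minimal sketch with redis-py, assuming a local Redis instance; the key name and scores mirror the constants used below:

import redis

r = redis.StrictRedis(host='localhost', port=6379, db=5, decode_responses=True)
# a sorted set deduplicates automatically: re-adding a member only updates its score
r.zadd('proxies', {'127.0.0.1:8888': 10})   # a new proxy starts at the initial score
r.zadd('proxies', {'127.0.0.1:8888': 100})  # a verified proxy is bumped to the maximum score
print(r.zscore('proxies', '127.0.0.1:8888'))  # 100.0
print(r.zrangebyscore('proxies', 100, 100))   # all fully verified proxies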
Implementation of the storage module:
import redis
from random import choice
MAX_SCORE = 100
MIN_SCORE = 0
INITIAL_SCORE = 10
REDIS_HOST = "localhost"
REDIS_PORT = 6379
DB_NUM = 5
REDIS_KEY = 'proxies'
class PoolEmptyError(Exception):
    """Raised when the proxy pool is empty."""
    pass

class RedisClient(object):
    """Redis database client."""
    def __init__(self, host=REDIS_HOST, port=REDIS_PORT, db=DB_NUM):
        """
        Initialize the client.
        :param host: Redis host
        :param port: Redis port
        :param db: Redis database number
        """
        self.db = redis.StrictRedis(host=host, port=port, db=db, decode_responses=True)
    def add(self, proxy, score=INITIAL_SCORE):
        """
        Add a proxy with the initial score.
        :param proxy: proxy
        :param score: score
        :return: result of the add
        """
        # zscore returns None when the member is absent; checking "is None"
        # avoids treating a legitimate score of 0 as "not present"
        if self.db.zscore(REDIS_KEY, proxy) is None:
            return self.db.zadd(REDIS_KEY, {proxy: score})
    def random(self):
        """
        Get a random valid proxy. Try the proxies with the maximum score first;
        if none have it, fall back to the top 100 by rank; if the pool is
        empty, raise PoolEmptyError.
        :return: a random proxy
        """
        result = self.db.zrangebyscore(REDIS_KEY, MAX_SCORE, MAX_SCORE)
        if len(result):
            return choice(result)
        else:
            result = self.db.zrevrange(REDIS_KEY, 0, 100)
            if len(result):
                return choice(result)
            else:
                raise PoolEmptyError
    def decrease(self, proxy):
        """
        Decrease a proxy's score by one; if the score has reached the minimum,
        remove the proxy instead.
        :param proxy: proxy
        :return: the proxy's new score, or the result of the removal
        """
        score = self.db.zscore(REDIS_KEY, proxy)
        if score and score > MIN_SCORE:
            print('Proxy', proxy, 'current score', score, 'decreased by 1')
            return self.db.zincrby(REDIS_KEY, -1, proxy)
        else:
            print('Proxy', proxy, 'current score', score, 'removed')
            return self.db.zrem(REDIS_KEY, proxy)
    def exists(self, proxy):
        """
        Check whether a proxy is already in the pool.
        :param proxy: proxy
        :return: whether it exists
        """
        return self.db.zscore(REDIS_KEY, proxy) is not None
    def max(self, proxy):
        """
        Set a proxy's score to MAX_SCORE.
        :param proxy: proxy
        :return: result of the update
        """
        print('Proxy', proxy, 'is valid, setting score to', MAX_SCORE)
        return self.db.zadd(REDIS_KEY, {proxy: MAX_SCORE})
    def count(self):
        """
        Get the number of proxies in the pool.
        :return: count
        """
        return self.db.zcard(REDIS_KEY)

    def all(self):
        """
        Get all proxies.
        :return: list of (proxy, score) tuples for every proxy in the pool
        """
        return self.db.zrangebyscore(REDIS_KEY, MIN_SCORE, MAX_SCORE, withscores=True)
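A quick usage sketch of the client, assuming a local Redis instance (the proxy address is just an example):

conn = RedisClient()
conn.add('127.0.0.1:8888')       # enters the pool with INITIAL_SCORE (10)
conn.max('127.0.0.1:8888')       # a passing test promotes it to MAX_SCORE
conn.decrease('127.0.0.1:8888')  # a failing test costs one point
print(conn.random())             # prefers proxies at MAX_SCORE
print(conn.count())              # pool size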
The getter module:
import re
from pyquery import PyQuery as pq
from utils import get_page
class ProxyMetaclass(type):
    """
    Metaclass that adds two class attributes, __CrawlFunc__ and
    __CrawlFuncCount__: the names of all crawl_* methods and their count.
    :return: reference to the newly created class
    """
    def __new__(cls, name, bases, attrs):
        count = 0
        attrs['__CrawlFunc__'] = []
        for k, v in attrs.items():
            if 'crawl_' in k:
                attrs['__CrawlFunc__'].append(k)
                count += 1
        attrs['__CrawlFuncCount__'] = count
        return type.__new__(cls, name, bases, attrs)
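# After class creation, Crawler.__CrawlFunc__ holds the names of all crawl_*
# methods (e.g. ['crawl_daili66', 'crawl_kuaidaili', ...]) and
# Crawler.__CrawlFuncCount__ holds their number, so adding a new proxy source
# only requires defining another crawl_* method.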
class Crawler(object, metaclass=ProxyMetaclass):
    """
    Crawler class: every crawl_* method is a generator that yields proxies
    scraped from one proxy site.
    """
    def get_proxies(self, callback):
        """
        Collect the proxies yielded by one crawl_* method into a list.
        :param callback: the name of a crawl_* method, e.g. crawl_daili66
        :return: list of proxies
        """
        proxies = []
        # look the method up by name instead of building a string for eval
        for proxy in getattr(self, callback)():
            print('Successfully fetched proxy', proxy)
            proxies.append(proxy)
        return proxies
    def crawl_daili66(self, page_count=4):
        """
        66ip.cn free proxies
        :param page_count: number of pages to crawl
        :return: generator of proxies
        """
        start_url = 'http://www.66ip.cn/{}.html'
        headers = {
            "Cookie": "__jsluid=d67ba1bf483046976b81a6b33a372b1c; Hm_lvt_1761fabf3c988e7f04bec51acd4073f4=1555505708; Hm_lpvt_1761fabf3c988e7f04bec51acd4073f4=1555508780",
            "Host": "www.66ip.cn",
        }
        urls = [start_url.format(page) for page in range(1, page_count + 1)]
        count = page_count
        for url in urls:
            headers_referer = {
                "Referer": "http://www.66ip.cn/{}".format(count - page_count + 1)
            }
            # note: headers.update(headers_referer) returns None; it merges the
            # key/value pairs of headers_referer into headers in place
            headers.update(headers_referer)
            page_count -= 1
            html = get_page(url, options=headers)
            if html:
                doc = pq(html)
                trs = doc('.containerbox table tr:gt(0)').items()
                for tr in trs:
                    ip = tr.find('td:nth-child(1)').text()
                    port = tr.find('td:nth-child(2)').text()
                    yield ":".join([ip, port])
    def crawl_kuaidaili(self):
        """
        kuaidaili.com free proxies
        :return: generator of proxies
        """
        for i in range(1, 4):
            start_url = 'http://www.kuaidaili.com/free/inha/{}/'.format(i)
            html = get_page(start_url)
            if html:
                ip_address = re.compile(r'<td data-title="IP">(.*?)</td>')
                re_ip_address = ip_address.findall(html)
                port = re.compile(r'<td data-title="PORT">(.*?)</td>')
                re_port = port.findall(html)
                for address, port in zip(re_ip_address, re_port):
                    address_port = address + ':' + port
                    yield address_port.replace(' ', '')
    def crawl_ip3366(self):
        """
        ip3366.net free proxies
        :return: generator of proxies
        """
        for page in range(1, 4):
            start_url = 'http://www.ip3366.net/free/?stype=1&page={}'.format(page)
            html = get_page(start_url)
            # guard against a failed fetch, which returns None
            if html:
                # \s* matches whitespace, including the line breaks between the tags
                ip_address = re.compile(r'<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
                re_ip_address = ip_address.findall(html)
                for address, port in re_ip_address:
                    result = address + ':' + port
                    yield result.replace(' ', '')
    def crawl_xicidaili(self):
        """
        xicidaili.com high-anonymity proxies
        :return: generator of proxies
        """
        for i in range(1, 3):
            start_url = 'http://www.xicidaili.com/nn/{}'.format(i)
            headers = {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                'Cookie': '_free_proxy_session=BAh7B0kiD3Nlc3Npb25faWQGOgZFVEkiJWRjYzc5MmM1MTBiMDMzYTUzNTZjNzA4NjBhNWRjZjliBjsAVEkiEF9jc3JmX3Rva2VuBjsARkkiMUp6S2tXT3g5a0FCT01ndzlmWWZqRVJNek1WanRuUDBCbTJUN21GMTBKd3M9BjsARg%3D%3D--2a69429cb2115c6a0cc9a86e0ebe2800c0d471b3',
                'Host': 'www.xicidaili.com',
                'Referer': 'http://www.xicidaili.com/nn/3',
                'Upgrade-Insecure-Requests': '1',
            }
            html = get_page(start_url, options=headers)
            if html:
                find_trs = re.compile(r'<tr class.*?>(.*?)</tr>', re.S)
                trs = find_trs.findall(html)
                for tr in trs:
                    find_ip = re.compile(r'<td>(\d+\.\d+\.\d+\.\d+)</td>')
                    re_ip_address = find_ip.findall(tr)
                    find_port = re.compile(r'<td>(\d+)</td>')
                    re_port = find_port.findall(tr)
                    for address, port in zip(re_ip_address, re_port):
                        address_port = address + ':' + port
                        yield address_port.replace(' ', '')
    def crawl_iphai(self):
        """iphai.com proxies"""
        start_url = 'http://www.iphai.com/'
        html = get_page(start_url)
        if html:
            find_tr = re.compile(r'<tr>(.*?)</tr>', re.S)
            trs = find_tr.findall(html)
            for s in range(1, len(trs)):  # skip the header row
                find_ip = re.compile(r'<td>\s+(\d+\.\d+\.\d+\.\d+)\s+</td>', re.S)
                re_ip_address = find_ip.findall(trs[s])
                find_port = re.compile(r'<td>\s+(\d+)\s+</td>', re.S)
                re_port = find_port.findall(trs[s])
                for address, port in zip(re_ip_address, re_port):
                    address_port = address + ':' + port
                    yield address_port.replace(' ', '')
    def crawl_data5u(self):
        """data5u.com free proxies"""
        start_url = 'http://www.data5u.com/free/gngn/index.shtml'
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Cookie': 'JSESSIONID=47AA0C887112A2D83EE040405F837A86',
            'Host': 'www.data5u.com',
            'Referer': 'http://www.data5u.com/free/index.shtml',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
        }
        html = get_page(start_url, options=headers)
        if html:
            ip_address = re.compile(r'<span><li>(\d+\.\d+\.\d+\.\d+)</li>.*?<li class="port.*?>(\d+)</li>', re.S)
            re_ip_address = ip_address.findall(html)
            for address, port in re_ip_address:
                result = address + ':' + port
                yield result.replace(' ', '')
if __name__ == "__main__":
    c = Crawler()
    # test the crawlers one by one
    print(c.get_proxies("crawl_daili66"))
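Per the table above, the getter module passes everything it crawls to the storage module; that glue code is not shown in this post. A minimal sketch of what it could look like, assuming the RedisClient and Crawler classes defined above (the Getter name and the pool-size cap are illustrative, not from this post):

from db import RedisClient

POOL_UPPER_THRESHOLD = 10000  # illustrative cap so the pool does not grow without bound

class Getter(object):
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def run(self):
        if self.redis.count() < POOL_UPPER_THRESHOLD:
            # iterate the method names collected by ProxyMetaclass
            for callback in self.crawler.__CrawlFunc__:
                for proxy in self.crawler.get_proxies(callback):
                    self.redis.add(proxy)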
The shared utility module:
import requests
from requests.exceptions import ConnectionError
from db import RedisClient
base_headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
'Accept-Encoding': 'gzip, deflate, sdch',
'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7'
}
def get_page(url, options=None, proxy=None):
    """
    Fetch a page, routing the request through a random proxy from the pool.
    :param url: URL to fetch
    :param options: extra headers merged over the defaults
    :param proxy: unused here; the proxy is drawn from the pool instead
    :return: page text on HTTP 200, otherwise None
    """
    # avoid a mutable default argument; merge extra headers over the defaults
    headers = dict(base_headers, **(options or {}))
    print('Fetching', url)
    try:
        redis = RedisClient()
        proxies = {
            "http": "http://" + redis.random(),
        }
        response = requests.get(url, headers=headers, proxies=proxies)
        print('Fetched', url, response.status_code)
        if response.status_code == 200:
            return response.text
    except ConnectionError:
        print('Failed to fetch', url)
        return None
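With get_page in place, the tester module described in the table would periodically walk the pool and adjust scores via max() and decrease(). A minimal synchronous sketch, assuming the RedisClient above; TEST_URL and the timeout are illustrative, and a full tester implementation is beyond this post:

import requests
from db import RedisClient

TEST_URL = 'http://www.baidu.com'  # illustrative target for the health check

def test_all():
    conn = RedisClient()
    for proxy, score in conn.all():  # all() returns (proxy, score) tuples
        try:
            response = requests.get(TEST_URL,
                                    proxies={'http': 'http://' + proxy},
                                    timeout=5)
            if response.status_code == 200:
                conn.max(proxy)       # usable: promote to MAX_SCORE
            else:
                conn.decrease(proxy)  # bad response: lose a point
        except requests.exceptions.RequestException:
            conn.decrease(proxy)      # unreachable: lose a point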
For more, see the next post in this series.
References:
GitHub: https://github.com/Python3WebSpider/ProxyPool