Flask開發系列之Flask+redis實作IP代理池
代理池的要求
- 多站抓取,異步檢測:多站抓取指的是我們需要從各大免費的IP代理網站,把它們公開的免費代理抓取下來;異步檢測指的是:通過異步請求的方式,利用這些代理去請求網站,如果能正常請求就證明代理可用,如果不能正常請求就證明代理不可用,這時就可以把這個代理剔除掉。異步指的是:我們不需要一直等待代理請求網站,等得到response之後再執行相應的操作就可以了;異步可以提高檢測效率。
- 定時篩選,持續更新:我們維護一個代理池,我們需要做的是需要定時從裡面拿出一部分來檢測,剔除掉不可用的代理。這可以保證代理是可用的
- 提供接口,易於提取:代理實際上是維護在一個隊列中,隊列可以使用資料庫存儲,也可以使用一些資料結構來存儲;但是如果要獲取代理的話,需要提供一個簡單的接口,最簡單的是web形式的接口。本文主要示範利用python的flask包來提供接口,之後使用python請求網址,就可以從網頁中拿到代理的資訊了。
代理池的架構
![](https://img.laitimes.com/img/__Qf2AjLwojIjJCLyojI0JCLiETPwJWZ3ZCMwcTP39DciV2duQ0MlQ0MlE0YpZlMkVXQU1EerR1TzEkeNRDMp10dNpmT4VlaNRTRU1EejRUT1UERNlHMp5ENjpnTzEFVNZ3YE1UNFRUT5hTaORzY650MRRVT2NmMiNnSywEd5ITW110MaZHetlVdO1GT0UERNl3YXJGc5kHT20ESjBjUIF2Lc12bj5SYphXa5VWen5WY35iclN3Ztl2Lc9CX6MHc0RHaiojIsJye.webp)
- 擷取器:從各大網站平台抓取代理:ip和端口
- 過濾器:剔除掉不可用的代理
- 將可用代理放到代理隊列
- 定時檢測器:剔除不可用的代理
- API:通過接口形式拿到代理對象,方便使用
測試實作版
import requests
import re
import time
import redis
from bloom_filter import BloomFilter
import ast
# Connection pool and client for the local Redis server that holds the
# "ip_list" list of validated proxies.
pool = redis.ConnectionPool(
    host='localhost',
    port=6379,
    password='xxx',
    decode_responses=True,
)
r = redis.Redis(connection_pool=pool)

# Bloom filter consulted by stone() to skip proxies that were already stored.
# NOTE(review): error_rate=0.1 presumably means roughly 1 in 10 new proxies
# may be wrongly reported as duplicates — confirm this is acceptable.
bloombloom = BloomFilter(error_rate=0.1, max_elements=10000)
# Pre-seed the filter with one entry so that proxy is never stored.
bloombloom.add(str({'http': '117.91.232.53:9999'}))
def get_ip(i):
    """Scrape one page of the kuaidaili free-proxy listing.

    Args:
        i: Zero-based page index; page ``i + 1`` of the site is requested.

    Returns:
        A list of ``"ip:port"`` strings found on the page. The list is empty
        when nothing matched or when the page could not be downloaded.
    """
    url = 'https://www.kuaidaili.com/free/inha/' + str(i + 1)
    try:
        # Without a timeout an unresponsive site would block the crawl forever.
        html = requests.get(url=url, timeout=10).text
    except requests.RequestException as e:
        # One unreachable page should not abort the remaining pages.
        print(e)
        return []
    # Raw string so the regex escapes reach ``re`` unchanged.
    regip = r'<td.*?>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>.*?<td.*?>(\d{1,5})</td>'
    ipstr = re.findall(regip, html, re.S)
    time.sleep(1)  # throttle consecutive page requests
    ip_list = [host + ':' + port for host, port in ipstr]
    print('共收集到%d個代理ip' % len(ip_list))
    print(ip_list)
    return ip_list
def valVer(proxys):
    """Probe freshly scraped proxies and return the ones that work.

    Each candidate is used to fetch http://www.baidu.com with a 2 second
    timeout; a 200 response counts as success, anything else (including any
    exception) counts as failure.

    Args:
        proxys: Iterable of ``"ip:port"`` strings.

    Returns:
        A list of requests-style proxies dicts (``{protocol: "ip:port"}``)
        for the candidates that answered with HTTP 200 in this call.

    Side effects:
        Updates the module-level ``goodNum`` / ``badNum`` counters and
        appends every working proxy to the module-level ``good_list``.
    """
    global badNum, goodNum, good_list
    working = []
    for proxy_host in proxys:
        try:
            if 'https' in proxy_host:
                protocol = 'https'
            else:
                protocol = 'http'
            proxies = {protocol: proxy_host}
            print('現在正在測試的IP:', proxies)
            response = requests.get('http://www.baidu.com', proxies=proxies, timeout=2)
            if response.status_code == 200:
                goodNum += 1
                working.append(proxies)
                good_list.append(proxies)
                print(proxy_host, 'success proxy')
            else:
                badNum += 1
                print(proxy_host, 'bad proxy')
        except Exception as e:
            # Any failure while probing marks the proxy as bad.
            print(e)
            badNum += 1
    print('success proxy num : ', goodNum)
    print('bad proxy num : ', badNum)
    print("這次:", working)
    print("此時全部:", good_list)
    return working
def time_valVer(proxys):
    """Re-check stored proxies and remove the dead ones from Redis.

    Args:
        proxys: Entries read from the ``ip_list`` Redis list. Each entry is
            the ``str()`` of a requests-style proxies dict, for example
            ``"{'http': '1.2.3.4:80'}"``.

    Returns:
        None. Working proxies are appended to the module-level ``good_list``;
        failing ones are removed from ``ip_list``.
    """
    for raw in proxys:
        print('現在正在定時測試的IP:', raw)
        try:
            proxy = ast.literal_eval(raw)
        except (ValueError, SyntaxError, TypeError) as e:
            # Not a parseable entry; leave it alone and move on.
            print(e)
            continue
        try:
            response = requests.get('http://www.baidu.com', proxies=proxy, timeout=2)
            alive = response.status_code == 200
        except requests.RequestException as e:
            # A dead proxy usually fails with a timeout or connection error
            # rather than a non-200 status, so this also counts as a failure.
            print(e)
            alive = False
        if alive:
            good_list.append(proxy)
            print(proxy, 'success proxy')
            continue
        try:
            # redis-py 3+ argument order is (name, count, value). The value
            # must be the stored string, not the parsed dict.
            r.lrem("ip_list", 1, raw)
        except redis.RedisError as e:
            print(e)
        print(proxy, 'bad proxy')
def stone(good):
    """Store working proxies in Redis, skipping ones already seen.

    Args:
        good: Iterable of proxies dicts as returned by ``valVer``.

    Returns:
        None. Each new proxy is recorded in the ``bloombloom`` filter and
        pushed, as its ``str()`` form, onto the ``ip_list`` Redis list.
    """
    for IP in good:
        key = str(IP)
        if key in bloombloom:
            # The original passed IP as a second print argument, so the
            # "%s" placeholder was printed literally; apply the format.
            print("%s不能存儲,有相同的IP" % key)
            continue
        print("存儲的IP:", IP)
        bloombloom.add(key)
        r.rpush("ip_list", key)
if __name__ == '__main__':
    # Module-level state that valVer() updates through ``global``.
    badNum = 0
    goodNum = 0
    good_list = []
    for i in range(0, 10):
        # NOTE(review): with range(0, 10) this condition is never true (the
        # only multiple of 10 is i == 0, which is excluded), so the periodic
        # re-validation below never runs — confirm the intended cadence.
        if i % 10 == 0 and i != 0:
            # One LRANGE fetches the whole list. This replaces the
            # LLEN + per-index LINDEX loop, which also reused ``i``.
            proxy_list = r.lrange("ip_list", 0, -1)
            time_valVer(proxy_list)
        else:
            ip_list = get_ip(i)
            good = valVer(ip_list)
            stone(good)
from flask import Flask
import redis # 導入redis子產品,通過python操作redis 也可以直接在redis主機的服務端操作緩存資料庫
# Redis client for the local server; the view below reads "ip_list" from it.
r = redis.Redis(
    host='localhost',
    port=6379,
    password='xxx',
    decode_responses=True,
)
# Flask application that exposes the proxy list over HTTP.
app = Flask(__name__)
@app.route('/ip/<int:index>')
def reponse(index):
    """Return the proxy stored at position ``index`` of the ``ip_list`` list.

    Args:
        index: Non-negative list position taken from the URL.

    Returns:
        The stored proxy string, or a 404 response when the list has no
        element at that position.
    """
    print(index)
    # Look up once so the logged value and the returned value always agree.
    proxy = r.lindex("ip_list", index)
    print(proxy)
    if proxy is None:
        # Returning None from a view makes Flask fail with a 500 error.
        return 'no proxy at index %d' % index, 404
    return proxy
if __name__ == '__main__':
    # Start Flask's built-in server with debug mode enabled.
    # NOTE(review): debug mode is meant for local development only.
    app.run(debug=True)
擷取ip:
改進版
import requests
import re
import time
import redis
from bloom_filter import BloomFilter
import ast
# Connection pool and client for the local Redis server that holds the
# "ip_list" list of validated proxies.
pool = redis.ConnectionPool(
    host='localhost',
    port=6379,
    password='XXX',
    decode_responses=True,
)
r = redis.Redis(connection_pool=pool)

# Bloom filter consulted by stone_redis() to skip proxies already stored.
# NOTE(review): error_rate=0.1 presumably means roughly 1 in 10 new proxies
# may be wrongly reported as duplicates — confirm this is acceptable.
bloombloom = BloomFilter(error_rate=0.1, max_elements=10000)
def get_ip(i):
    """Scrape one page of the kuaidaili free-proxy listing.

    Args:
        i: Zero-based page index; page ``i + 1`` of the site is requested.

    Returns:
        A list of ``"ip:port"`` strings found on the page. The list is empty
        when nothing matched or when the page could not be downloaded.
    """
    url = 'https://www.kuaidaili.com/free/inha/' + str(i + 1)
    try:
        # Without a timeout an unresponsive site would block the crawl forever.
        html = requests.get(url=url, timeout=10).text
    except requests.RequestException as e:
        # One unreachable page should not abort the remaining pages.
        print(e)
        return []
    # Raw string so the regex escapes reach ``re`` unchanged.
    regip = r'<td.*?>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>.*?<td.*?>(\d{1,5})</td>'
    ipstr = re.findall(regip, html, re.S)
    time.sleep(1)  # throttle consecutive page requests
    ip_list = [host + ':' + port for host, port in ipstr]
    print('共收集到%d個代理ip' % len(ip_list))
    print(ip_list)
    return ip_list
def valVer(proxys):
    """Probe freshly scraped proxies and return the ones that work.

    Each candidate is used to fetch http://www.baidu.com with a 2 second
    timeout; a 200 response counts as success, anything else (including any
    exception) counts as failure.

    Args:
        proxys: Iterable of ``"ip:port"`` strings.

    Returns:
        A list of requests-style proxies dicts (``{protocol: "ip:port"}``)
        for the candidates that answered with HTTP 200 in this call.

    Side effects:
        Updates the module-level ``goodNum`` / ``badNum`` counters and
        appends every working proxy to the module-level ``good_list``.
    """
    global badNum, goodNum, good_list
    working = []
    for proxy_host in proxys:
        try:
            if 'https' in proxy_host:
                protocol = 'https'
            else:
                protocol = 'http'
            proxies = {protocol: proxy_host}
            response = requests.get('http://www.baidu.com', proxies=proxies, timeout=2)
            if response.status_code == 200:
                goodNum += 1
                working.append(proxies)
                good_list.append(proxies)
            else:
                badNum += 1
        except Exception as e:
            # Any failure while probing marks the proxy as bad.
            print(e)
            badNum += 1
    print('success proxy num : ', goodNum)
    print('bad proxy num : ', badNum)
    print("這次:", working)
    print("此時全部:", good_list)
    return working
def time_valVer(proxys):
    """Re-check stored proxies and remove the dead ones from Redis.

    Args:
        proxys: Entries read from the ``ip_list`` Redis list. Each entry is
            the ``str()`` of a requests-style proxies dict, for example
            ``"{'http': '1.2.3.4:80'}"``.

    Returns:
        None. Proxies that fail the check are removed from ``ip_list``.
    """
    for raw in proxys:
        print('現在正在定時測試的IP:', raw)
        try:
            proxy = ast.literal_eval(raw)
        except (ValueError, SyntaxError, TypeError) as e:
            # Not a parseable entry; leave it alone and move on.
            print(e)
            continue
        try:
            response = requests.get('http://www.baidu.com', proxies=proxy, timeout=2)
            alive = response.status_code == 200
        except requests.RequestException as e:
            # A dead proxy usually fails with a timeout or connection error
            # rather than a non-200 status, so this also counts as a failure.
            print(e)
            alive = False
        if alive:
            continue
        try:
            # redis-py 3+ argument order is (name, count, value). The value
            # must be the stored string, not the parsed dict.
            r.lrem("ip_list", 1, raw)
        except redis.RedisError as e:
            print(e)
def stone_redis(good):
    """Store working proxies in Redis, skipping ones already seen.

    Args:
        good: Iterable of proxies dicts as returned by ``valVer``.

    Returns:
        None. Each new proxy is recorded in the ``bloombloom`` filter and
        pushed, as its ``str()`` form, onto the ``ip_list`` Redis list.
    """
    for IP in good:
        key = str(IP)
        if key in bloombloom:
            # The original passed IP as a second print argument, so the
            # "%s" placeholder was printed literally; apply the format.
            print("%s不能存儲,有相同的IP" % key)
            continue
        print("存儲的IP:", IP)
        bloombloom.add(key)
        r.rpush("ip_list", key)
def init():
    """Load the proxies already stored in Redis into the Bloom filter.

    Reads every element of the ``ip_list`` Redis list, prints it, and adds it
    to ``bloombloom`` so that ``stone_redis`` treats it as already stored.
    """
    # One LRANGE replaces LLEN plus two LINDEX calls per element. It also
    # guarantees the printed value is the one added to the filter.
    for entry in r.lrange("ip_list", 0, -1):
        print(entry)
        bloombloom.add(entry)
if __name__ == '__main__':
    # Module-level state that valVer() updates through ``global``.
    badNum = 0
    goodNum = 0
    good_list = []
    init()
    for i in range(0, 10):
        # On every even, non-zero iteration, re-validate what is stored
        # instead of scraping a new page.
        # NOTE(review): this means listing pages 3, 5, 7 and 9 (i = 2, 4, 6,
        # 8) are never scraped — confirm that is intended.
        if i % 2 == 0 and i != 0:
            # One LRANGE fetches the whole list. This replaces the
            # LLEN + per-index LINDEX loop, which also reused ``i``.
            proxy_list = r.lrange("ip_list", 0, -1)
            time_valVer(proxy_list)
        else:
            ip_list = get_ip(i)
            good = valVer(ip_list)
            stone_redis(good)
from flask import Flask, abort, request, jsonify
import redis # 導入redis子產品,通過python操作redis 也可以直接在redis主機的服務端操作緩存資料庫
# Redis client for the local server; the view below reads "ip_list" from it.
r = redis.Redis(
    host='localhost',
    port=6379,
    password='XXX',
    decode_responses=True,
)
# Flask application that exposes the proxy list over HTTP.
app = Flask(__name__)
@app.route('/ip/<int:index>', methods=['GET'])
def reponse(index):
    """Return the proxy at position ``index`` of ``ip_list`` as JSON.

    Args:
        index: Non-negative list position taken from the URL.

    Returns:
        A JSON response of the form ``{"ip": <stored string>}``. The value is
        ``null`` when the list has no element at that position.
    """
    print(index)
    # Look up once so the logged value and the returned value always agree.
    proxy = r.lindex("ip_list", index)
    print(proxy)
    return jsonify({"ip": proxy})
if __name__ == '__main__':
    # Start Flask's built-in server with debug mode enabled.
    # NOTE(review): debug mode is meant for local development only.
    app.run(debug=True)
擷取ip: