在前幾天寫的一鍵抓取網站所有連結的腳本往後衍生了以下的兩個腳本,一個是查詢網站友情連結,另一個是查詢網站的死鍊。我這裡只是初步實作了功能,還有很多地方需要優化,比如說查詢友情連結腳本會存在帶www與不帶www不能共存識別的問題,查詢網站死鍊的腳本運作好慢的問題,這個問題是我目前解決不了的,我的能力還有限。
很多人說,爬蟲學的好,“牢飯”吃的飽。所以,在爬蟲教程中,都會勸說大家善良,但是我現在能力有限,可以隨便放開造,如果有喜歡一起學習的朋友,可以加我微信,互相討論,共同學習。下面分享這兩個腳本源代碼,供大家欣賞,^_^。
一、友情連結查詢
Python
import requests
from bs4 import BeautifulSoup
import time
url=input("輸入主域名:")
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}
def shouye():
    """Fetch the home page at the module-level `url`, collect every external
    absolute link (one that contains 'http' but not our own domain), and hand
    the de-duplicated list to fanhui() for the reciprocal-link check.

    Fix vs. original: anchors without an href yield None; the original hid
    the resulting AttributeError behind a bare ``except:``. We now skip
    None explicitly instead of swallowing every exception.
    """
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.content.decode('utf-8', "ignore"), 'lxml')
    alla = []
    for lia in soup.find_all('a'):
        dana = lia.get("href")
        # Keep only real hrefs that are absolute and point off-site.
        if dana is not None and 'http' in dana and url not in dana:
            alla.append(dana)
    # De-duplicate while preserving first-seen order.
    alla = sorted(set(alla), key=alla.index)
    fanhui(alla)
def fanhui(alla):
    """For each external URL in `alla`, fetch the page and print a notice
    when the site is unreachable or when none of its links mentions our
    domain (module-level `url`).

    Fix vs. original: after a parse failure the loop now ``continue``s;
    previously it fell through and re-scanned the PREVIOUS iteration's
    stale ``soup`` (NameError on the very first iteration).
    """
    for duiurl in alla:
        try:
            r = requests.get(duiurl, headers=headers)
        except requests.RequestException:
            print('該網站打不開', duiurl)
            continue
        try:
            soup = BeautifulSoup(r.content.decode('utf-8', "ignore"), 'lxml')
        except Exception as ex:
            print(duiurl, ex)
            continue  # was missing: original kept going with stale soup
        # Does any href on the partner page point back at our domain?
        hrefs = [lia.get("href") for lia in soup.find_all('a')]
        if not any(h is not None and url in h for h in hrefs):
            print('該網站沒有我們網站連結', duiurl)
if __name__ == '__main__':
    # Time the whole crawl and print the elapsed seconds.
    started = time.time()
    shouye()
    print(time.time() - started)
![](https://img.laitimes.com/img/__Qf2AjLwojIjJCLyojI0JCLicmbw5iMkBzY3AjY1EmNkNjYxEDZlRWNzUWNyYDZ1kjN2kDOx8CX0JXZ252bj91Ztl2Lc52YucWbp5GZzNmLn9Gbi1yZtl2Lc9CX6MHc0RHaiojIsJye.png)
二、死連結查詢
Python
import requests
from bs4 import BeautifulSoup
# 程序
from threading import Thread
import time
bbb=[]
jishu=0
def shouye():
    """Read the root domain from stdin, collect every href on its home page,
    and crawl each link on its own thread via neiye(), which records
    discovered URLs in the global `bbb` list.

    Fixes vs. original:
    * ``for i in range(5)`` started 5 identical threads PER LINK, crawling
      every page five times; one thread per link is enough.
    * None hrefs (anchors without href) are filtered out — they crashed
      the worker thread inside neiye().
    * Decoding uses errors='ignore' so one bad byte cannot abort the run.
    """
    url = input("輸入主域名:")
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.content.decode('utf-8', 'ignore'), 'lxml')
    alla = []
    for lia in soup.find_all('a'):
        dana = lia.get("href")
        if dana is not None:
            alla.append(dana)
    # De-duplicate while preserving first-seen order.
    alla = sorted(set(alla), key=alla.index)
    # One worker thread per unique link.
    t_list = []
    for lianjie in alla:
        t = Thread(target=neiye, args=(lianjie, url))
        t_list.append(t)
        t.start()
    # Wait for every worker before the caller inspects `bbb`.
    for t in t_list:
        t.join()
def neiye(lianjie, url):
    """Recursively crawl `lianjie` (a link discovered on the site `url`),
    appending every newly seen href to the global visited list `bbb`.

    Fixes vs. original:
    * ``requests.get`` is now guarded — one unreachable URL previously
      raised inside the worker thread and killed it;
    * the local list ``bba`` was appended to but never read (dead code) —
      removed;
    * decoding uses errors='ignore' so a single bad byte no longer aborts
      the page.

    NOTE(review): `bbb` is shared by many threads with no lock; the
    "not in bbb" test plus append is racy, so a page may occasionally be
    crawled twice. TODO: protect with a threading.Lock.
    """
    global bbb
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}
    # Resolve relative hrefs against the root domain (same heuristic as
    # the original: not perfect, urljoin would be more correct).
    if lianjie.find(url) != -1:
        ciurl = lianjie
    elif lianjie.find('http') == -1 and lianjie.find('/') != -1:
        ciurl = url + lianjie
    else:
        ciurl = url + '/' + lianjie
    try:
        r = requests.get(ciurl, headers=headers)
    except requests.RequestException:
        return  # unreachable page: nothing more to discover here
    try:
        soup = BeautifulSoup(r.content.decode('utf-8', 'ignore'), 'lxml')
        suoyoua = soup.find_all('a')
    except Exception:
        return  # unparseable page: skip it
    alla = []
    for lia in suoyoua:
        dana = lia.get("href")
        if dana is not None:
            alla.append(dana)
    # De-duplicate while preserving first-seen order.
    alla = sorted(set(alla), key=alla.index)
    # Depth-first: record each unseen link, then crawl it in turn.
    for lian2 in alla:
        if lian2 not in bbb:
            bbb.append(lian2)
            neiye(lian2, url)
if __name__ == '__main__':
    startime = time.time()
    shouye()
    # Final order-preserving de-dup of every URL the crawl collected.
    bbb = sorted(set(bbb), key=bbb.index)
    # Hoisted out of the loop — it was rebuilt on every iteration.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}
    # Re-request each URL and print its HTTP status to spot dead links.
    for num, ads in enumerate(bbb):
        if ads.find('http') == -1:
            # NOTE(review): relative links are resolved against a
            # hard-coded domain; this should be the domain entered in
            # shouye() — TODO: thread it through instead.
            ads = 'http://zhuxiaoedu.com' + ads
        print(num, ads)
        try:
            r = requests.get(ads, headers=headers)
        except Exception as e:
            print(e)
            continue
        print(r.status_code)
    print(time.time() - startime)