天天看點

python爬蟲搜狐新聞_Python爬蟲——主題抓取搜狐新聞(步驟及代碼實作),爬取...

import requests

from bs4 import BeautifulSoup

import jieba

from gensim.corpora.dictionary import Dictionary

import re

import jieba.analyse as ana

def getdata():
    """Collect title/source pairs from pages 1-9 of a Sohu category feed.

    Each page of the ``sceneId=1460`` feed is downloaded, every
    ``mobileTitle`` and ``originalSource`` value is pulled out of the
    response text with a regular expression, and the two lists are paired
    up in order of appearance. The pairs from all pages are merged into a
    single dictionary.

    Returns:
        dict: ``mobileTitle`` text mapped to the ``originalSource`` text it
        was paired with. If a title occurs more than once, the value seen
        last wins.

    Raises:
        requests.RequestException: if a request fails, times out, or the
            server replies with an HTTP error status.
    """
    # Browser-like headers captured from a real session.
    # NOTE(review): the cookie carries session identifiers from that capture
    # and has presumably expired -- confirm whether the endpoint needs it.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36',
        'cookie': 'itssohu=true; BAIDU_SSP_lcr=https://news.hao123.com/wangzhi; IPLOC=CN3300; SUV=201021142102FD7T; reqtype=pc; gidinf=x099980109ee124d51195e802000a3aab2e8ca7bf7da; t=1603261548713; jv=78160d8250d5ed3e3248758eeacbc62e-kuzhE2gk1603261903982; ppinf=2|1603261904|1604471504|bG9naW5pZDowOnx1c2VyaWQ6Mjg6MTMxODgwMjEyODc2ODQzODI3MkBzb2h1LmNvbXxzZXJ2aWNldXNlOjMwOjAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMHxjcnQ6MTA6MjAyMC0xMC0yMXxlbXQ6MTowfGFwcGlkOjY6MTE2MDA1fHRydXN0OjE6MXxwYXJ0bmVyaWQ6MTowfHJlbGF0aW9uOjA6fHV1aWQ6MTY6czExZjVhZTI2NTJiNmM3Nnx1aWQ6MTY6czExZjVhZTI2NTJiNmM3Nnx1bmlxbmFtZTowOnw; pprdig=L2Psu-NwDR2a1BZITLwhlxdvI2OrHzl6jqQlF3zP4z70gqsyYxXmf5dCZGuhPFZ-XWWE5mflwnCHURGUQaB5cxxf8HKpzVIbqTJJ3_TNhPgpDMMQdFo64Cqoay43UxanOZJc4-9dcAE6GU3PIufRjmHw_LApBXLN7sOMUodmfYE; ppmdig=1603261913000000cfdc2813caf37424544d67b1ffee4770',
    }

    # Compiled once; the same two patterns are applied to every page.
    title_re = re.compile(r'"mobileTitle":"(.*?)",')
    link_re = re.compile(r'"originalSource":"(.*?)"')

    news_dictall = {}
    for p in range(1, 10):
        # The `_` query value grows by 8 per page from a fixed base;
        # presumably it imitates a browser's cache-busting timestamp.
        p2 = 1603263206992 + p * 8
        url = (
            'https://v2.sohu.com/public-api/feed?scene=CATEGORY&sceneId=1460&page='
            + str(p) + '&size=20&_=' + str(p2)
        )

        # A timeout keeps one stalled connection from hanging the whole run.
        res = requests.get(url, headers=headers, timeout=10)
        res.raise_for_status()

        # Match against the raw response text. Feeding it through an HTML
        # parser first and matching on the re-serialised document would
        # entity-escape characters such as & < > inside the captured values.
        body = res.text
        news = title_re.findall(body)
        herf = link_re.findall(body)

        # Titles and sources are paired purely by position.
        # NOTE(review): if an entry lacks one of the two fields, the pairs
        # after it shift -- confirm against the real response shape.
        news_dictall.update(zip(news, herf))

    return news_dictall

def ifsim(topicwords, stopwords_path=r'D:\作業\python\文本挖掘\資料集\新聞資料集\data\stopwords.txt'):
    """Print and return the fetched headlines that share a keyword with a topic.

    Every headline returned by ``getdata()`` is reduced to its extracted
    keywords with ``jieba.analyse.extract_tags``. A headline is kept when at
    least one of those keywords also appears in *topicwords*.

    Args:
        topicwords: Iterable of topic keywords to match against.
        stopwords_path: Stop-word file passed to
            ``jieba.analyse.set_stop_words``. Defaults to the location that
            was previously hard-coded.

    Returns:
        dict: The subset of ``getdata()``'s result whose titles matched.
        The same dictionary is also printed.
    """
    topic_set = set(topicwords)

    # Load the stop words before any network traffic so that a missing or
    # mistyped path fails immediately.
    ana.set_stop_words(stopwords_path)

    news_dicfin = {}
    for title, link in getdata().items():
        # Up to 50 keywords per headline, stop words excluded. The keywords
        # go straight into a set: a gensim Dictionary built from
        # one-word documents would hold exactly these same tokens.
        docwords = set(ana.extract_tags(title, topK=50, withWeight=False))

        # Relevance test: any overlap with the topic keywords qualifies.
        if not topic_set.isdisjoint(docwords):
            news_dicfin[title] = link

    print(news_dicfin)
    return news_dicfin

if __name__ == "__main__":
    # Script entry point: filter the fetched headlines with a fixed set of
    # pandemic-related keywords.
    topicwords = {
        "疫情",
        "新冠",
        "肺炎",
        "确診",
        "病例",
    }
    ifsim(topicwords)