import random
import time

import pymongo
import requests
from bs4 import BeautifulSoup
from multiprocessing import Pool

# MongoDB: the 'list_urls' collection holds item links scraped from listing pages.
client = pymongo.MongoClient('localhost', 27017)
ganji = client['ganji']
list_urls = ganji['list_urls']
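# Optional (not in the original script): a unique index on 'url' would let
# MongoDB reject duplicate links if a page is ever crawled twice. If enabled,
# wrap the insert_one() call below in a try/except for
# pymongo.errors.DuplicateKeyError.
# list_urls.create_index('url', unique=True)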
# One-off snippet used to collect the channel URLs below from the Beijing
# Ganji second-hand index page:
# url = 'http://bj.ganji.com/wu/'
# response = requests.get(url)
# soup = BeautifulSoup(response.text, 'lxml')
# urls = soup.select('div > dl > dt > a')
# for new_url in urls:
#     add_new_url = 'http://bj.ganji.com' + new_url.get('href')
#     print(add_new_url)
channel_list = '''
http://bj.ganji.com/shouji/
http://bj.ganji.com/shoujihaoma/
http://bj.ganji.com/shoujipeijian/
http://bj.ganji.com/bijibendiannao/
http://bj.ganji.com/taishidiannaozhengji/
http://bj.ganji.com/diannaoyingjian/
http://bj.ganji.com/wangluoshebei/
http://bj.ganji.com/shumaxiangji/
http://bj.ganji.com/youxiji/
http://bj.ganji.com/xuniwupin/
http://bj.ganji.com/jiaju/
http://bj.ganji.com/jiadian/
http://bj.ganji.com/zixingchemaimai/
http://bj.ganji.com/rirongbaihuo/
http://bj.ganji.com/yingyouyunfu/
http://bj.ganji.com/fushixiaobaxuemao/
http://bj.ganji.com/meironghuazhuang/
http://bj.ganji.com/yundongqicai/
http://bj.ganji.com/yueqi/
http://bj.ganji.com/tushu/
http://bj.ganji.com/bangongjiaju/
http://bj.ganji.com/wujingongju/
http://bj.ganji.com/nongyongpin/
http://bj.ganji.com/xianzhilipin/
http://bj.ganji.com/shoucangpin/
http://bj.ganji.com/baojianpin/
http://bj.ganji.com/laonianyongpin/
http://bj.ganji.com/gou/
http://bj.ganji.com/qitaxiaochong/
http://bj.ganji.com/xiaofeika/
http://bj.ganji.com/menpiao/
http://bj.ganji.com/bangong/
http://bj.ganji.com/ershoubijibendiannao/
http://bj.ganji.com/ruanjiantushu/
http://bj.ganji.com/diannao/
http://bj.ganji.com/shuma/
http://bj.ganji.com/qitawupin/
http://bj.ganji.com/ershoufree/
http://bj.ganji.com/wupinjiaohuan/
'''
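# Optional guard (not in the original): channel_list.split() already drops the
# blank lines; wrapping it in set() would also drop any duplicate channels.
# channels = sorted(set(channel_list.split()))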
def get_links_from(channel, page):
    # Listing pages are assumed to be paginated as '<channel><page>/'; if the
    # site uses a prefixed segment (e.g. 'o<page>/'), adjust the format string.
    url = '{}{}/'.format(channel, page)
    print(url)
    time.sleep(random.randint(1, 5))  # throttle requests to avoid being blocked
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    # A 'pageBox' div only appears on valid listing pages; skip empty ones.
    if soup.find('div', class_='pageBox'):
        links = soup.select('dd.feature > div > ul > li > a')
        titles = soup.select('#wrapper > div.leftBox > div.layoutlist > dl > dt > a')
        for link, title in zip(links, titles):
            data = {
                'url': link.get('href'),
                'title': title.get('title'),
            }
            # print(data)
            list_urls.insert_one(data)
    # print(url + ' has been crawled')
    # print('channel finished crawling')
def get_all_links_from(channel):
    # Walk the first 100 listing pages of one channel.
    for i in range(1, 101):
        get_links_from(channel, i)
if __name__ == '__main__':
    # One worker per channel; Pool() defaults to the number of CPU cores.
    pool = Pool()
    pool.map(get_all_links_from, channel_list.split())
    pool.close()
    pool.join()
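# A minimal progress monitor, meant to be run in a separate interpreter while
# the crawler works (a sketch, not part of the original script):
#
# import time
# import pymongo
# client = pymongo.MongoClient('localhost', 27017)
# while True:
#     print(client['ganji']['list_urls'].count_documents({}))
#     time.sleep(5)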