A Python crawler: scraping Ganji.com listing pages and storing the listings in MongoDB

import requests, pymongo, time, random
from bs4 import BeautifulSoup
from multiprocessing import Pool

# Connect to the local MongoDB instance and use the 'ganji' database;
# the scraped listing URLs go into the 'list_urls' collection.
client = pymongo.MongoClient('localhost', 27017)
ganji = client['ganji']
list_urls = ganji['list_urls']
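
Note that channel_list below repeats several channels, so the same listing can end up inserted more than once. As an optional sketch (assuming you want MongoDB itself to reject duplicates), a unique index on the url field does that; with it in place, insert_one() raises pymongo.errors.DuplicateKeyError for an already-stored URL, which the caller would then need to catch and ignore.

# Optional: a unique index so repeated channels cannot create duplicate documents.
# list_urls.create_index('url', unique=True)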

# One-off snippet used to collect the channel list from the Beijing Ganji
# second-hand goods index page:
# url = 'http://bj.ganji.com/wu/'
# response = requests.get(url)
# soup = BeautifulSoup(response.text, 'lxml')
#
# urls = soup.select('div > dl > dt > a')
# for new_url in urls:
#     add_new_url = 'http://bj.ganji.com' + new_url.get('href')
#     print(add_new_url)
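
For reference, a runnable version of that one-off snippet might look like the sketch below; it assumes the 'div > dl > dt > a' selector still matches the category links on http://bj.ganji.com/wu/ and that those hrefs are site-relative.

def discover_channels(index_url='http://bj.ganji.com/wu/'):
    # Download the index page and collect the category links,
    # turning relative hrefs into absolute Ganji URLs.
    response = requests.get(index_url)
    soup = BeautifulSoup(response.text, 'lxml')
    channels = []
    for a in soup.select('div > dl > dt > a'):
        href = a.get('href')
        if href and href.startswith('/'):
            channels.append('http://bj.ganji.com' + href)
    return channels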

channel_list = '''
http://bj.ganji.com/shouji/
http://bj.ganji.com/shoujihaoma/
http://bj.ganji.com/shoujipeijian/
http://bj.ganji.com/bijibendiannao/
http://bj.ganji.com/taishidiannaozhengji/
http://bj.ganji.com/diannaoyingjian/
http://bj.ganji.com/wangluoshebei/
http://bj.ganji.com/shumaxiangji/
http://bj.ganji.com/youxiji/
http://bj.ganji.com/xuniwupin/
http://bj.ganji.com/jiaju/
http://bj.ganji.com/jiadian/
http://bj.ganji.com/zixingchemaimai/
http://bj.ganji.com/rirongbaihuo/
http://bj.ganji.com/yingyouyunfu/
http://bj.ganji.com/fushixiaobaxuemao/
http://bj.ganji.com/meironghuazhuang/
http://bj.ganji.com/yundongqicai/
http://bj.ganji.com/yueqi/
http://bj.ganji.com/tushu/
http://bj.ganji.com/bangongjiaju/
http://bj.ganji.com/wujingongju/
http://bj.ganji.com/nongyongpin/
http://bj.ganji.com/xianzhilipin/
http://bj.ganji.com/shoucangpin/
http://bj.ganji.com/baojianpin/
http://bj.ganji.com/laonianyongpin/
http://bj.ganji.com/gou/
http://bj.ganji.com/qitaxiaochong/
http://bj.ganji.com/xiaofeika/
http://bj.ganji.com/menpiao/
http://bj.ganji.com/jiaju/
http://bj.ganji.com/rirongbaihuo/
http://bj.ganji.com/shouji/
http://bj.ganji.com/shoujihaoma/
http://bj.ganji.com/bangong/
http://bj.ganji.com/nongyongpin/
http://bj.ganji.com/jiadian/
http://bj.ganji.com/ershoubijibendiannao/
http://bj.ganji.com/ruanjiantushu/
http://bj.ganji.com/yingyouyunfu/
http://bj.ganji.com/diannao/
http://bj.ganji.com/xianzhilipin/
http://bj.ganji.com/fushixiaobaxuemao/
http://bj.ganji.com/meironghuazhuang/
http://bj.ganji.com/shuma/
http://bj.ganji.com/laonianyongpin/
http://bj.ganji.com/xuniwupin/
http://bj.ganji.com/qitawupin/
http://bj.ganji.com/ershoufree/
http://bj.ganji.com/wupinjiaohuan/
'''

def get_links_from(channel, page):
    # Build the listing URL for this channel and page number.
    url = '{}{}/'.format(channel, page)
    print(url)
    time.sleep(random.randint(1, 5))  # throttle requests a little
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    # Only pages that actually contain listings have the pagination box.
    if soup.find('div', class_="pageBox"):
        links = soup.select('dd.feature > div > ul > li > a')
        titles = soup.select('#wrapper > div.leftBox > div.layoutlist > dl > dt > a')
        for link, title in zip(links, titles):
            data = {
                'url': link.get('href'),
                'title': title.get('title')
            }
            # print(data)
            list_urls.insert_one(data)
        # print(url + ' finished crawling')
    # print('this channel finished crawling')

def get_all_links_from(channel):
    # Walk through listing pages 1-100 of one channel.
    for i in range(1, 101):
        get_links_from(channel, i)

if __name__ =='__main__':

pool = Pool()

pool.map(get_all_links_from,channel_list.split())
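
Once pool.map() returns, the result can be checked straight from the same collection; a minimal sketch (count_documents requires pymongo 3.7 or newer):

# Inspect what the crawl stored: total number of listings plus one sample document.
print(list_urls.count_documents({}))
print(list_urls.find_one())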