今天在爬職位資訊的時候,先用的是requests模組,後來發現擷取的資料全部是亂碼,於是果斷換成selenium+Chrome,成功擷取資料並儲存到MongoDB。
在前期校驗擷取成果時,建議用selenium+Chrome,後期測試通過,換成無界面的PhantomJS
代碼如下:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pyquery import PyQuery as pq
# from config import *
import pymongo
# Start a Chrome session through the local chromedriver binary and fix the
# window size before any page is requested.
browser = webdriver.Chrome('/home/worker/Desktop/driver/chromedriver')
browser.set_window_size(1400, 900)
# Explicit wait bound to this browser, with a 10 second timeout.
wait = WebDriverWait(browser, 10)

# Connect to the MongoDB server on localhost.
client = pymongo.MongoClient(host='127.0.0.1', port=27017)
# Database name.
db = client['Job']
# Collection (table) name.
coll = db['job']
# Walk result pages 1-100 of the 51job "Python" search, parse every listing
# row with pyquery and store one MongoDB document per row.
try:
    for i in range(1, 101):
        # The page number is substituted into the last path segment.
        browser.get(
            'https://search.51job.com/list/'
            '020000,000000,0000,00,9,99,Python,2,{}.html?'.format(i)
        )
        # Do not read the page source until the result list is in the DOM.
        wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#resultList > div'))
        )
        html = browser.page_source
        doc = pq(html)
        # siblings('.el'): among the siblings of the '.title' element, keep
        # only the '.el' elements (one per job listing).
        items = doc('#resultList .title').siblings('.el').items()
        # Debugging tip: when some fields come back empty, write `html` to a
        # file and compare it with what the selectors expect.
        for item in items:
            job = {
                # eq(0): take the text of the first matching '.t1' tag only.
                'job_name': item.find('.t1').eq(0).text(),
                'com_name': item.find('.t2').text(),
                'addr': item.find('.t3').text(),
                # A missing or empty salary cell yields an empty result, not
                # an exception, so the None fallback is applied explicitly.
                'job_money': item.find('.t4').text() or None,
                'job_time': item.find('.t5').text(),
            }
            # insert_one replaces Collection.insert, which is deprecated
            # since PyMongo 3.0 and removed in 4.0.
            coll.insert_one(job)
finally:
    # Always release the browser process and the database connection, even
    # when a page times out or an insert fails.
    browser.quit()
    client.close()
![](https://img.laitimes.com/img/__Qf2AjLwojIjJCLyojI0JCLiAzNvwVZ2x2bzNXak9CX90TQNNkRrFlQKBTSvwFbslmZvwFMwQzLcVmepNHdu9mZvwFVywUNMZTY18CX052bm9CXuVzVhVXOsNGbSNzYoFjMMBjVtJWd0ckW65UbM5WOHJWa5kHT20ESjBjUIF2LcRHelR3LcJzLctmch1mclRXY39zM5cTOzgTN5EzNwcDM4EDMy8CX0Vmbu4GZzNmLn9Gbi1yZtl2Lc9CX6MHc0RHaiojIsJye.jpg)