Python 使用 Selenium 爬取淘寶商品資訊

由於淘寶會識別自動化工具，直接進入登入頁面滑動二維碼一直會報錯，所以採取了曲線救國的方式：改用微博帳號來登入淘寶。我剛自學《Python3網絡爬蟲開發實戰》，以下代碼和書裡的代碼有一點點差別（注意：本文代碼使用的是 Python 2 語法，例如 reload(sys) 與 from urllib import quote）。廢話不多說，直接上代碼。

#coding=utf-8
"""
__author__ = zenghaisheng

"""
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from urllib import quote
from bs4 import BeautifulSoup

# --- User configuration -----------------------------------------------------
# Search keyword; index_page URL-quotes it into the Taobao search URL.
# (The misspelt names are kept because index_page refers to them as-is.)
KEYTWORD = "white something you want to search"
# Weibo credentials that index_page types into the Weibo login form.
WEIBO_NAME = "white your weibo name"
WEIBO_PASSWOORD = 'white your weibo password'

# --- Shared Selenium objects ------------------------------------------------
# A single Chrome session, started at import time and used by every function.
browser = webdriver.Chrome()
# Module-level explicit wait with a 10-second timeout.
wait = WebDriverWait(browser, 10)

def index_page(page):
    """Log in to Taobao through Weibo and scrape one search-result page.

    Opens the Taobao search URL for ``KEYTWORD``, switches the login form to
    password mode, follows the Weibo login link, submits the Weibo
    credentials, optionally jumps to result page ``page`` through the pager
    input, and finally hands the rendered page to ``get_goods_msg``.

    Args:
        page: Result page number to scrape. Values greater than 1 are typed
            into the pager's input box and submitted; otherwise the page that
            loads after login is scraped as-is.

    Returns:
        The iterable of product dicts produced by ``get_goods_msg()``, or an
        empty list when any step raises (the exception is printed), so the
        caller can always iterate over the result.
    """
    print('正在爬取第{}頁'.format(page))
    try:
        url = "https://s.taobao.com/search?q=" + quote(KEYTWORD)
        browser.get(url)

        # Click the toggle that switches the form to password login.
        browser.find_element(By.CLASS_NAME, 'login-switch').click()

        # Follow the link to the Weibo login page.
        browser.find_element(By.CLASS_NAME, 'weibo-login').click()

        # Fill in the Weibo credentials and submit with ENTER.
        name_input = browser.find_element(By.NAME, 'username')
        name_input.send_keys(WEIBO_NAME)
        password_input = browser.find_element(By.NAME, 'password')
        password_input.send_keys(WEIBO_PASSWOORD)
        submit = browser.find_element(By.CLASS_NAME, 'W_btn_g')
        submit.send_keys(Keys.ENTER)

        # After a successful login the browser is sent back to Taobao; the
        # explicit waits below poll for up to 10 seconds for that page.
        wait = WebDriverWait(browser, 10)

        if page > 1:
            input_page = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, '.m-page .form > input')))
            sumbit_go_page = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '.m-page .form .J_Submit')))
            input_page.clear()
            input_page.send_keys(page)
            sumbit_go_page.send_keys(Keys.ENTER)

        # Do not parse until at least one result item is present in the DOM.
        wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, '.m-itemlist .items .item')))
        return get_goods_msg()

    except Exception as e:
        # Best-effort scrape: report the problem and give the caller an
        # empty, iterable result instead of None.
        print(e)
        return []

def get_goods_msg():
    """Yield one dict per product on the page currently loaded in ``browser``.

    The page source is parsed once with BeautifulSoup (lxml parser) and every
    node with class ``J_MouserOnverReq`` is treated as one product. Each node
    is searched directly instead of being serialised and re-parsed.

    Yields:
        dict with the keys ``data_imgurl`` (image URL), ``data_href``
        (product URL), ``data_title`` (stripped title text), ``data_price``
        (price text) and ``data_pay_peoples`` (the deal-count text with the
        "人付款" suffix removed).

    Nodes that lack any of the expected elements or attributes are skipped,
    so a single differently-structured entry does not abort the iteration.
    """
    soup = BeautifulSoup(browser.page_source, 'lxml')
    for item in soup.find_all(class_='J_MouserOnverReq'):
        img = item.find(class_='J_ItemPic img')
        link = item.find(class_='pic-link')
        title = item.find(class_='title')
        prices = item.select('.ctx-box .price strong')
        deal = item.find(class_='deal-cnt')
        if (img is None or link is None or title is None
                or deal is None or not prices):
            continue

        # Tag.get returns None rather than raising when the attribute is
        # absent.
        img_src = img.get('data-src')
        href = link.get('data-href')
        if img_src is None or href is None:
            continue

        yield dict(
            # Both attribute values are prefixed with the scheme here.
            data_imgurl='https:' + img_src,
            data_href='https:' + href,
            data_title=title.get_text().strip(),
            data_price=prices[0].get_text(),
            # Text of the deal-count element without the "人付款" suffix.
            data_pay_peoples=deal.get_text().replace("人付款", ''),
        )

if __name__ == "__main__":
    # Result page number to scrape.
    page_num = 1
    try:
        # index_page prints the error and may hand back nothing usable when
        # a step fails; fall back to an empty list so the loop never
        # iterates over None.
        goods_msgs = index_page(page_num) or []
        for good_msg in goods_msgs:
            print(good_msg)
    finally:
        # Always shut the browser session down, even if scraping raised, so
        # no browser/driver process is left running.
        browser.quit()

python使用Selenium爬取淘寶商品資訊

繼續閱讀

TestLink導出用例轉換工具(XML2Excel)

利用Selenium內建TestLink做自動化測試

YAML簡介和PyYAML安全操作YAML支援的類型YAML的優點：yaml的基本文法python操作

Small tricks

libsvm for python 安裝

學習軟體測試基礎測試第七天

Zeppelin 配置通路 REST APIApache Zeppelin Configuration REST API

【Torch】最簡潔logging使用指南

27. Remove Element(清單)題目代碼

sort()函數到底是怎樣進行數字排序的

Cloud Studio初體驗

使用 ctypes 進行 Python 和 C 的混合程式設計

【python】【資料處理】畫多元資料分布圖

【python】netconf協定對接管理裝置

「Python 網絡自動化」NETCONF —— Python 使用 NETCONF 管理配置 H3C 網絡裝置

在python中建立excel并寫入