天天看點

python使用Selenium爬取淘寶商品資訊

       由於淘寶會識別自動化工具,直接進入登入頁面滑動二維碼時一直會報錯,所以採取了曲線救國的方式:透過微博帳號來登入淘寶。我剛自學《Python3網絡爬蟲開發實戰》,下面的代碼和書裡的代碼有一點點差別。廢話不多說,直接上代碼。

#coding=utf-8
"""
__author__ = zenghaisheng

"""
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from urllib import quote
from bs4 import BeautifulSoup

# Module-level Chrome session shared by index_page() and get_goods_msg().
# NOTE: this runs at import time, so loading the module starts a browser.
browser = webdriver.Chrome()
# Explicit-wait helper bound to the shared browser, with a 10 second timeout.
wait = WebDriverWait(browser,10)
# Search keyword; index_page() URL-quotes it and appends it to the Taobao
# search URL. The value is a placeholder to be replaced before running
# ("white" is presumably a typo for "write").
KEYTWORD = "white something you want to search"
# Weibo credentials that index_page() types into the login form.
# Both values are placeholders to be replaced before running.
WEIBO_NAME = "white your weibo name"
WEIBO_PASSWOORD = 'white your weibo password'

def index_page(page):
    """Log in to Taobao through Weibo and scrape one page of search results.

    Opens the Taobao search page for ``KEYTWORD``, switches the login box to
    password login, follows the Weibo login link, submits the Weibo
    credentials, optionally jumps to the requested result page through the
    pager form, waits for the item list and delegates parsing to
    ``get_goods_msg``.

    Args:
        page: Number of the result page to scrape. Values greater than 1
            are typed into the pager form; otherwise the page that loads
            after login is scraped as-is.

    Returns:
        The iterable of product dicts produced by ``get_goods_msg``. If any
        step fails, the error is printed and an empty list is returned, so
        the result can always be iterated safely.
    """
    print('正在爬取第{}頁'.format(page))
    # One waiter for the whole flow: every lookup polls for up to 10 seconds
    # instead of failing immediately while the page is still loading.
    waiter = WebDriverWait(browser, 10)
    try:
        url = "https://s.taobao.com/search?q=" + quote(KEYTWORD)
        browser.get(url)

        # Switch the login box to password login.
        login_switch = waiter.until(
            EC.element_to_be_clickable((By.CLASS_NAME, 'login-switch')))
        login_switch.click()

        # Jump to the Weibo login page.
        weibo_login = waiter.until(
            EC.element_to_be_clickable((By.CLASS_NAME, 'weibo-login')))
        weibo_login.click()

        name_input = waiter.until(
            EC.presence_of_element_located((By.NAME, 'username')))
        name_input.send_keys(WEIBO_NAME)
        password_input = waiter.until(
            EC.presence_of_element_located((By.NAME, 'password')))
        password_input.send_keys(WEIBO_PASSWOORD)
        # The form is submitted by pressing ENTER on the button, as before.
        submit = waiter.until(
            EC.presence_of_element_located((By.CLASS_NAME, 'W_btn_g')))
        submit.send_keys(Keys.ENTER)

        # After a successful login the browser is redirected back to Taobao.
        if page > 1:
            input_page = waiter.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, '.m-page .form > input')))
            submit_go_page = waiter.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '.m-page .form .J_Submit')))
            input_page.clear()
            input_page.send_keys(str(page))
            submit_go_page.send_keys(Keys.ENTER)

        # Do not parse until at least one product card is in the DOM.
        waiter.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, '.m-itemlist .items .item')))
        return get_goods_msg()

    except Exception as e:
        # Best-effort scraper: report the problem but still hand the caller
        # something iterable rather than an implicit None.
        print(e)
        return []

def get_goods_msg():
    """Yield one dict per product card found in the browser's current page.

    Parses ``browser.page_source`` with BeautifulSoup and walks every element
    carrying the ``J_MouserOnverReq`` class. Cards that lack the image, link,
    title or price are skipped so that a single malformed card does not abort
    the whole page.

    Yields:
        dict with the keys ``data_imgurl`` (image URL), ``data_href``
        (product URL), ``data_title`` (stripped title text), ``data_price``
        (price text) and ``data_pay_peoples`` (the deal-count text with the
        "人付款" suffix removed, or an empty string when the card has no
        deal-count element).
    """
    soup = BeautifulSoup(browser.page_source, 'lxml')
    for card in soup.find_all(class_='J_MouserOnverReq'):
        # Search the Tag directly; re-serialising and re-parsing each card
        # is unnecessary work.
        image = card.find(class_='J_ItemPic img')
        link = card.find(class_='pic-link')
        title = card.find(class_='title')
        prices = card.select('.ctx-box .price strong')
        if image is None or link is None or title is None or not prices:
            continue

        img_src = image.get('data-src')
        href = link.get('data-href')
        if img_src is None or href is None:
            continue

        # Number of buyers; the element may be missing on some cards.
        deal = card.find(class_='deal-cnt')
        if deal is None:
            pay_peoples = ''
        else:
            pay_peoples = deal.get_text().replace("人付款", '')

        yield dict(
            # "https:" is prepended to the attribute values, as in the original.
            data_imgurl='https:' + img_src,
            data_href='https:' + href,
            data_title=title.get_text().strip(),
            data_price=prices[0].get_text(),
            data_pay_peoples=pay_peoples,
        )

if __name__ == "__main__":
    # Number of the search-result page to scrape.
    page_num = 1
    try:
        # index_page() may return None when scraping fails; fall back to an
        # empty list so the loop below never iterates over None.
        goods_msgs = index_page(page_num) or []
        # The results are produced lazily from the live page, so the browser
        # must stay open until the loop has finished.
        for good_msg in goods_msgs:
            print(good_msg)
    finally:
        # Always shut down Chrome and its driver process, even on errors.
        browser.quit()