由於淘寶會識別自動化工具,直接進入登入頁面滑動二維碼一直會報錯,所以採取了曲線救國的方式,改用微博賬號來登入淘寶。我剛自學《Python3網絡爬蟲開發實戰》,以下代碼和書裡的代碼有一點點差別。廢話不多說,直接上代碼。
#coding=utf-8
"""
__author__ = zenghaisheng
"""
import sys
# Python 2-only idiom (neither the `reload` builtin nor
# `sys.setdefaultencoding` exists in Python 3): re-import `sys` and set the
# process-wide default string encoding to UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
# Python 2 location of `quote`; used by index_page to URL-escape the keyword.
from urllib import quote
from bs4 import BeautifulSoup
# A Chrome WebDriver session is started at import time and shared by
# index_page and get_goods_msg through this module-level name.
browser = webdriver.Chrome()
# Explicit-wait helper with a 10 second timeout bound to the shared browser.
# NOTE(review): index_page builds its own wait object, so this one appears
# unused by the functions visible in this file.
wait = WebDriverWait(browser,10)
# Search keyword; index_page appends it (URL-quoted) to the Taobao search URL.
# The value is a placeholder to be replaced by the user. The identifier
# spelling is kept as-is because index_page refers to it by this name.
KEYTWORD = "white something you want to search"
# Weibo account name typed into the login form by index_page (placeholder).
WEIBO_NAME = "white your weibo name"
# Weibo password typed into the login form by index_page (placeholder).
# The identifier spelling is kept as-is because index_page refers to it.
WEIBO_PASSWOORD = 'white your weibo password'
def index_page(page):
    """Log in to Taobao via Weibo and return the goods of one result page.

    Loads the Taobao search URL for ``KEYTWORD``, switches the login box to
    password mode, follows the Weibo login link, submits ``WEIBO_NAME`` and
    ``WEIBO_PASSWOORD``, optionally jumps to result page ``page`` through the
    pager, waits for the item list and hands over to ``get_goods_msg``.

    Args:
        page: Number of the search-result page to scrape. For values greater
            than 1 the number is typed into the pager's input box and
            submitted; otherwise the page shown after login is used.

    Returns:
        The iterable produced by ``get_goods_msg()`` on success. If any step
        raises, the error is printed and an empty iterator is returned, so
        callers can always loop over the result.
    """
    print('正在爬取第{}頁'.format(page))
    # One function-local wait drives every lookup below: elements that appear
    # only after a navigation or click are polled for (up to 10 seconds)
    # instead of being looked up once and failing immediately.
    page_wait = WebDriverWait(browser, 10)
    try:
        url = "https://s.taobao.com/search?q=" + quote(KEYTWORD)
        browser.get(url)

        # Switch the login box to password login. The code expects the
        # search URL to present Taobao's login form at this point.
        login_switch = page_wait.until(
            EC.element_to_be_clickable((By.CLASS_NAME, 'login-switch')))
        login_switch.click()

        # Follow the link to the Weibo login page.
        weibo_login = page_wait.until(
            EC.element_to_be_clickable((By.CLASS_NAME, 'weibo-login')))
        weibo_login.click()

        # Fill in the Weibo credentials and submit with ENTER.
        name_input = page_wait.until(
            EC.presence_of_element_located((By.NAME, 'username')))
        name_input.send_keys(WEIBO_NAME)
        password_input = page_wait.until(
            EC.presence_of_element_located((By.NAME, 'password')))
        password_input.send_keys(WEIBO_PASSWOORD)
        submit = page_wait.until(
            EC.presence_of_element_located((By.CLASS_NAME, 'W_btn_g')))
        submit.send_keys(Keys.ENTER)

        # After a successful login the browser is expected to land back on
        # the Taobao result list.
        if page > 1:
            input_page = page_wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, '.m-page .form > input')))
            sumbit_go_page = page_wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '.m-page .form .J_Submit')))
            input_page.clear()
            input_page.send_keys(page)
            sumbit_go_page.send_keys(Keys.ENTER)

        # Do not parse until at least one result item is in the DOM.
        page_wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, '.m-itemlist .items .item')))
        return get_goods_msg()
    except Exception as e:
        # Best-effort scrape: report the problem, but give the caller
        # something iterable rather than an implicit None.
        print(e)
        return iter(())
def get_goods_msg():
    """Yield one dict per goods item found in the browser's current page.

    Parses ``browser.page_source`` with BeautifulSoup (lxml parser) and walks
    every element carrying the ``J_MouserOnverReq`` class. This is a
    generator, so the page source is read when iteration starts, not when
    the function is called.

    Yields:
        dict with the keys

        * ``data_imgurl`` - ``'https:'`` + the ``data-src`` attribute of the
          element whose class attribute is ``J_ItemPic img``
        * ``data_href`` - ``'https:'`` + the ``data-href`` attribute of the
          ``pic-link`` element
        * ``data_title`` - stripped text of the ``title`` element
        * ``data_price`` - text of the first ``.ctx-box .price strong`` match
        * ``data_pay_peoples`` - text of the ``deal-cnt`` element with the
          "人付款" suffix removed

        A field is ``None`` when its element or attribute is absent from the
        item, so one incomplete item no longer aborts the whole iteration.
    """
    soup = BeautifulSoup(browser.page_source, 'lxml')
    for item in soup.find_all(class_='J_MouserOnverReq'):
        # Search inside the already-parsed tag; serialising it with str()
        # and parsing it a second time is unnecessary work per item.
        img_tag = item.find(class_='J_ItemPic img')
        link_tag = item.find(class_='pic-link')
        title_tag = item.find(class_='title')
        price_tags = item.select('.ctx-box .price strong')
        deal_tag = item.find(class_='deal-cnt')

        # Tag.get returns None for a missing attribute instead of raising.
        img_src = img_tag.get('data-src') if img_tag is not None else None
        href = link_tag.get('data-href') if link_tag is not None else None

        yield dict(
            # Image URL of the item; the attribute value is prefixed with
            # the scheme exactly as before.
            data_imgurl=('https:' + img_src) if img_src else None,
            # Link to the item page.
            data_href=('https:' + href) if href else None,
            # Item title.
            data_title=(title_tag.get_text().strip()
                        if title_tag is not None else None),
            # Item price.
            data_price=price_tags[0].get_text() if price_tags else None,
            # Number of buyers, with the "人付款" suffix stripped.
            data_pay_peoples=(deal_tag.get_text().replace("人付款", '')
                              if deal_tag is not None else None),
        )
if __name__ == "__main__":
    # Number of the search-result page to scrape.
    page_num = 1
    try:
        goods_msgs = index_page(page_num)
        # Guard against a None result so a failed scrape ends quietly
        # instead of raising TypeError in the loop below.
        if goods_msgs is None:
            goods_msgs = []
        for good_msg in goods_msgs:
            print(good_msg)
    finally:
        # Always shut down the module-level Chrome session; otherwise the
        # browser window and its driver process outlive the script.
        browser.quit()