--***2019-3-27測試有效***----
第一步:
打開cmd,輸入scrapy startproject taobao_s建立一個項目。
![](https://img.laitimes.com/img/__Qf2AjLwojIjJCLyojI0JCLiETPwJWZ3ZCMwcTP39zZuBnLuVzRjVXWE1kenpnTyUFRNhHMT1ENnRVT4VkaNJTRq10dJRVT0UERNlHMD1EMVpWT10EVNZXSU1ENFRUT5hzQNBTVq1UNNRVT2NmMiNnSywEd5ITW110MaZHetlVdO1GT0UERNl3YXJGc5kHT20ESjBjUIF2Lc12bj5SYphXa5VWen5WY35iclN3Ztl2Lc9CX6MHc0RHaiojIsJye.png)
接着cd 進入我們的項目檔案夾内輸入scrapy genspider taobao www.taobao.com建立一個爬蟲
檔案内是這樣的,tools是我建的一個工具模組,裡面有一個處理資料的函數和selenium登入的函數。
class TaobaoSpider(scrapy.Spider):
    """Spider that scrapes Taobao product-search results.

    A selenium-driven Firefox browser (logged in via ``register()``) renders
    each search page; the rendered HTML is handed to ``parse`` through the
    request ``meta`` dict, and pagination is done by clicking the site's
    "next page" button in the browser.
    """
    name = 'taobao'
    # allowed_domains = ['www.taobao.com']
    base_url = ['https://s.taobao.com/search?q=']
    pages = 100  # maximum number of result pages to crawl
    re_headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
        'referer': 'https://www.taobao.com/',
        'accept-encoding': 'gzip, deflate, b',
    }
    i = 1  # current page counter, carried through request meta

    def start_requests(self):
        # Search keyword comes from the project settings.
        keys = self.settings.get('KEYS')
        # Log in with selenium; returns the browser and a cookies dict.
        self.browser, cookies = register()
        self.browser.get(self.base_url[0] + keys)
        # Scroll to the bottom so lazily-loaded items are rendered.
        self.browser.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        url_i = self.browser.current_url  # remembered for error recovery
        html = self.browser.page_source
        yield scrapy.Request(url=self.base_url[0] + keys, headers=self.re_headers,
                             cookies=cookies, callback=self.parse,
                             meta={'html': html, 'i': self.i, 'url': url_i})

    def parse(self, response):
        time.sleep(5)  # crude rate limiting; tune as needed
        html = response.meta.get('html')
        i = response.meta.get("i")
        url_i = response.meta.get("url")
        i += 1
        # Stop after the configured number of pages
        # (was a hard-coded 100 that ignored the `pages` attribute).
        if i > self.pages:
            return
        try:
            soup = BeautifulSoup(html, 'html.parser')
            nodes = soup.select('#mainsrp-itemlist > div > div > div > div')
            for node in nodes:  # renamed from `list` to avoid shadowing the builtin
                item = TaobaoSItem()
                url = node.select('a[class="pic-link J_ClickStat J_ItemPicA"]')[0].attrs.get('href', '')
                name = node.select("a[class='J_ClickStat']")[0].get_text().strip()
                name = data_cleaning(name)
                price = node.select('div[class="price g_price g_price-highlight"] strong')[0].get_text()
                num = node.select('div[class="deal-cnt"]')[0].get_text()
                shop_name = node.select("a[class='shopname J_MouseEneterLeave J_ShopInfo']")[0].get_text().strip()
                shop_name = data_cleaning(shop_name)
                item['url'] = url
                item['name'] = name
                item['price'] = price
                item['num'] = num
                item['shop_name'] = shop_name
                yield item
            # From page 2 onward there are two elements with this class
            # (previous / next page); the last one is "next page".
            button = self.browser.find_elements(By.XPATH, '//a[@class="J_Ajax num icon-tag"]')[-1]
            button.click()  # go to the next page
            time.sleep(random.random() * 2)
            self.browser.execute_script("window.scrollTo(0, document.body.scrollHeight)")  # scroll down
            html = self.browser.page_source
            yield scrapy.Request(url=response.url, headers=self.re_headers, callback=self.parse,
                                 meta={'html': html, 'i': i, 'url': url_i}, dont_filter=True)
        except Exception as e:
            # If Taobao detects us, log in again and resume from the saved url.
            time.sleep(10)
            print(e)
            # quit() (not close()) so the whole browser process is terminated.
            self.browser.quit()
            self.browser, cookies = register()
            self.browser.get(url=url_i)
            time.sleep(random.random() * 2)
            self.browser.execute_script("window.scrollTo(0, document.body.scrollHeight)")
            html = self.browser.page_source
            yield scrapy.Request(url=response.url, headers=self.re_headers, callback=self.parse,
                                 meta={'html': html, 'i': i, 'url': url_i}, dont_filter=True)

    def close(spider, reason):
        # Called by scrapy on shutdown; terminate the browser process
        # (quit() ends the driver, close() would only close one window).
        spider.browser.quit()
這是tools
def data_cleaning(data):
    """Strip spaces, single quotes and newlines from a scraped string.

    Fixes the original, whose raw-string check ``r'\n' in data`` only matched
    a literal backslash-n sequence, so real newline characters were never
    removed; plain ``str.replace`` also makes the fixed-token ``re.sub``
    calls unnecessary.

    :param data: raw text scraped from the page
    :return: the text with ' ', "'" and '\n' characters removed
    """
    for token in (' ', "'", '\n'):
        data = data.replace(token, '')
    return data
def register():
    """Log in to Taobao with selenium and return ``(browser, cookie_dict)``.

    Taobao can detect selenium, so login sometimes fails; we retry until we
    have collected more than 10 cookies, which is taken as a successful
    login. NOTE(review): the original text was missing the ``try:`` line
    (fused into a comment), shadowed the builtins ``input`` and ``list``,
    and leaked the browser process on retry (``close()`` instead of
    ``quit()``) — all fixed here.
    """
    while True:
        options = webdriver.FirefoxOptions()
        options.add_argument('-headless')  # headless browser
        browser = webdriver.Firefox(firefox_options=options)
        # browser = webdriver.Firefox()
        browser.get('https://login.taobao.com/member/login.jhtml')  # open the login page
        try:
            # The page sometimes shows QR-code login first; switch to the
            # password form when the switch link is present.
            switch_link = WebDriverWait(browser, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'forget-pwd.J_Quick2Static')))
            switch_link.click()
        except Exception as e:
            # Already on the password form; no switch needed.
            print(e)
        user = browser.find_element(By.ID, 'TPL_username_1')      # username field
        password = browser.find_element(By.ID, 'TPL_password_1')  # password field
        user.send_keys(USER)
        time.sleep(random.random() * 2)
        password.send_keys(PASSWORD)
        time.sleep(random.random() * 1)
        # Taobao detects selenium mainly through navigator.webdriver being
        # True; override the getter so it reports False.
        browser.execute_script("Object.defineProperties(navigator,{webdriver:{get:() => false}})")
        action = ActionChains(browser)
        time.sleep(random.random() * 1)
        butt = browser.find_element(By.ID, 'nc_1_n1z')
        browser.switch_to.frame(browser.find_element(By.ID, '_oid_ifr_'))
        browser.switch_to.default_content()
        # Slider captcha after entering credentials: press, hold and drag.
        action.click_and_hold(butt).perform()
        action.reset_actions()
        action.move_by_offset(285, 0).perform()
        time.sleep(random.random() * 1)
        button = browser.find_element(By.ID, 'J_SubmitStatic')  # login button
        time.sleep(random.random() * 2)
        button.click()
        time.sleep(random.random() * 2)
        # browser.get('https://www.taobao.com/')
        # scrapy wants cookies as a plain dict, not selenium's list of dicts.
        cookie_dict = {}
        for cookie in browser.get_cookies():
            cookie_dict[cookie['name']] = cookie['value']
        if len(cookie_dict) > 10:  # enough cookies -> login succeeded
            break
        # Failed attempt: terminate this browser process and retry.
        browser.quit()
    return browser, cookie_dict
然後是資料儲存
class TaobaoSPipeline(object):
    """Pipeline that writes each scraped item to a plain-text file,
    one ``str(dict)`` per line. Could be swapped for a database or CSV."""

    def open_spider(self, spider):
        # Called when the spider starts; (re)creates the file in the
        # current directory. encoding is pinned to utf-8 so writing
        # Chinese shop names does not depend on the platform locale
        # (the original bare open() can raise UnicodeEncodeError on Windows).
        self.f = open('淘寶店鋪資料.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # Snapshot the item into a plain dict and persist it as one line.
        data = {key: item[key] for key in ('url', 'name', 'price', 'num', 'shop_name')}
        self.f.write(str(data) + '\n')
        return item

    def close_spider(self, spider):
        # Called when the spider finishes; release the file handle.
        self.f.close()
代碼位址 https://github.com/18370652038/taobao.git