Web scraping learning notes
Scraping job listings from 58.com (58同城)
Why write a crawler
A crawler pulls the key information out of web pages so that it can be analyzed afterwards. We are in the age of data, data is a valuable resource, and a crawler is one way to collect that resource.
Goal of this article
There are plenty of scraping tools around, most of them Python based, but as a beginner I suggest not relying too heavily on ready-made frameworks, because you learn little about what happens inside them; with scrapy, for example, it is hard to see what it actually calls internally. This article therefore uses urllib2 + BeautifulSoup + MySQL to collect job listings from 58.com, and the most important part is analyzing the page source to find the information we need.
Fetching the page source
# Excerpt from the full script below; `headers` and `start` are defined there
url = "http://hz.58.com/tech/" + "pn" + str(start) + "/"
request = urllib2.Request(url=url, headers=headers)
response = urllib2.urlopen(request, timeout=60)
html = response.read().decode('utf-8')
soup = BeautifulSoup(html, 'lxml')
Parsing the 58.com listing page
for item in all_dl:
    # each <dl> is one listing; the <a> inside <dt> links to the detail page
    job = item.find('dt').find('a')
    info = getdatas.getInfo(job['href'])
    if info != 0:
        count += insertmysql.insertMysql(info)
        print "rows stored so far: %d" % count
    time.sleep(5)
start = start + 1
Each item here is one job posting; we then follow its link into the second-level (detail) page and scrape the details of that posting.
The detail page
On the detail page we again fetch the page source and then use BeautifulSoup to pick out the key fields; the usage of BeautifulSoup is documented on its official site.
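As a quick illustration of how find and get_text work (the HTML fragment below is made up for the example, it is not taken from 58.com):

from bs4 import BeautifulSoup

fragment = '<div class="pos_base_info"><span class="pos_salary">5000-8000</span></div>'
soup = BeautifulSoup(fragment, 'lxml')
# class_ (with a trailing underscore) matches the HTML class attribute
print soup.find('span', class_='pos_salary').get_text()   # prints 5000-8000

The getInfo function below applies the same calls to a real detail page: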
def getInfo(url):
    headers = {}
    headers["User-Agent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36"
    try:
        # proxies = {'http': proxy_ip}
        request = urllib2.Request(url=url, headers=headers)
        # request.set_proxy(proxy_ip, 'http')
        response = urllib2.urlopen(request)
        html = response.read().decode('utf-8')
        # html = requests.get(url, headers=headers, proxies=proxies)
        html = BeautifulSoup(html, 'lxml')
        info = {}
        info['id'] = str(uuid.uuid4())   # stored as a 36-character string primary key
        info['title'] = html.find('div', class_='item_con pos_info').find('span', class_='pos_name').get_text()
        temp = html.find('div', class_='pos_base_info').find('span', class_='pos_salary').get_text()
        info['salary_min'] = int(re.findall(r"(\d+)\-", temp)[0])
        info['salary_max'] = int(re.findall(r"\-(\d+)", temp)[0])
        info['company'] = html.find('div', class_='item_con company_baseInfo').find('p', class_='comp_baseInfo_title').find('a', class_='baseInfo_link').get_text()
        temp = html.find('div', class_='item_con company_baseInfo').find('p', class_='comp_baseInfo_scale').get_text()
        info['scale_min'] = int(re.findall(r"(\d+)\-", temp)[0])
        info['scale_max'] = int(re.findall(r"\-(\d+)", temp)[0])
        info['address'] = html.find('div', class_='item_con work_adress').find('p', class_='detail_adress').get_text()
        return info
    except Exception, e:
        # any missing element or network error makes the page unusable; signal that with 0
        return 0
I use a uuid as the primary key and scrape the main fields of each posting: salary, company size, company address and so on. Some detail pages on 58.com do not follow this standard layout, though, so to get more complete data you have to handle those page variants case by case.
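A minimal sketch of such defensive handling, using a hypothetical parse_salary helper (it is not part of the code above); it returns None instead of raising when a page deviates from the standard layout:

import re

def safe_text(tag):
    # return the tag's text, or None when the lookup matched nothing
    return tag.get_text() if tag is not None else None

def parse_salary(html):
    # html is the BeautifulSoup object of a detail page
    base = html.find('div', class_='pos_base_info')
    temp = safe_text(base.find('span', class_='pos_salary')) if base else None
    if temp is None:
        return None
    match = re.search(r"(\d+)\-(\d+)", temp)
    if match is None:
        return None   # e.g. salary shown as "面议" (negotiable) instead of a range
    return int(match.group(1)), int(match.group(2))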
Storing the data in MySQL
I use MySQL as the database; connecting to MySQL from Python is straightforward:
db = MySQLdb.connect(host='localhost', user='root', passwd='123', db='58city', port=3306,charset='utf8')
cursor = db.cursor()
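The insert statement below assumes that a jobs table already exists; the post does not show its schema, so the column types here are my own guess based on the fields being stored:

cursor.execute('''
    CREATE TABLE IF NOT EXISTS jobs (
        id         CHAR(36) PRIMARY KEY,   -- uuid4 stored as a 36-character string
        title      VARCHAR(255),
        salary_min INT,
        salary_max INT,
        company    VARCHAR(255),
        scale_min  INT,
        scale_max  INT,
        address    VARCHAR(255)
    ) DEFAULT CHARSET = utf8
''')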
Then write the extracted fields into MySQL:
cursor.execute(
    'insert into jobs(id,title,salary_min,salary_max,company,scale_min,scale_max,address) values(%s,%s,%s,%s,%s,%s,%s,%s)',
    (id, title, salary_min, salary_max, company, scale_min, scale_max, address))
db.commit()
cursor.close()   # close the cursor before the connection, not after
db.close()
The code will inevitably hit errors at runtime, so it is best to wrap the database work in a try/except block.
except Exception, e:
    print "database error:", e
    return 0
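A slightly safer variant closes the cursor and connection in a finally block, so they are released even when the insert raises (a sketch, not the exact code used in this project):

import MySQLdb

db = MySQLdb.connect(host='localhost', user='root', passwd='123', db='58city', port=3306, charset='utf8')
cursor = db.cursor()
try:
    # id and title come from the parsed info dict; the column list is shortened for the sketch
    cursor.execute('insert into jobs(id,title) values(%s,%s)', (id, title))
    db.commit()
except Exception, e:
    db.rollback()        # discard the partial write on failure
    print "database error:", e
finally:
    cursor.close()       # release the cursor first ...
    db.close()           # ... then the connection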
Dealing with anti-crawling
We can route requests through proxy IPs so that our own address does not get banned, and add a sleep between requests so that we do not crawl too fast and get noticed by the site; a short sketch of both ideas follows.
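This sketch is built around the commented-out proxy lines in the source below; it assumes getip.get_ips() returns a list of 'host:port' strings (that helper module is not shown in this post):

import random
import time
import urllib2
import getip   # local helper module, assumed to return proxy addresses

# url and headers are defined as in the full script below
proxy_list = getip.get_ips()                 # e.g. ['1.2.3.4:8080', ...]
proxy_ip = random.choice(proxy_list)         # pick a different proxy for each request
request = urllib2.Request(url=url, headers=headers)
request.set_proxy(proxy_ip, 'http')          # route this request through the chosen proxy
response = urllib2.urlopen(request, timeout=60)
time.sleep(random.uniform(3, 8))             # randomized pause so requests do not arrive at a fixed rate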
Full source code
# coding:utf8
# ---- Main script: walks the 58.com listing pages and hands each posting to getdatas / insertmysql ----
import random
import urllib2
import time
from bs4 import BeautifulSoup
import getdatas
import insertmysql
import requests

ISOTIMEFORMAT = '%Y-%m-%d %X'
headers = {}
headers["User-Agent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36"
import getip

# get the tags
# start
print "********** START **********"
print time.strftime(ISOTIMEFORMAT, time.localtime())
try:
    start = 33
    count = 0
    # proxy_list = getip.get_ips()
    while True:
        try:
            # proxy_ip = random.choice(proxy_list)
            # proxies = {'http': proxy_ip}
            url = "http://hz.58.com/tech/" + "pn" + str(start) + "/"
            request = urllib2.Request(url=url, headers=headers)
            # request.set_proxy(proxy_ip, 'http')
            response = urllib2.urlopen(request, timeout=60)
            html = response.read().decode('utf-8')
            # html = requests.get(url, headers=headers, proxies=proxies)
            soup = BeautifulSoup(html, 'lxml')
            all_dl = soup.find('div', id='infolist').findAll('dl')
            if len(all_dl) == 0:
                # an empty listing page means we have run out of pages
                break
            for item in all_dl:
                job = item.find('dt').find('a')
                info = getdatas.getInfo(job['href'])
                if info != 0:
                    count += insertmysql.insertMysql(info)
                    print "rows stored so far: %d" % count
                time.sleep(5)
            start = start + 1
            print start
            time.sleep(5)
            # print info_list['director']
        except Exception, e:
            print e.message + "1"   # "1" marks an error inside the page loop
except Exception, e:
    print e.message + '2'           # "2" marks an error outside the page loop
# coding:utf8
# ---- getdatas.py: parses one detail page and returns its fields as a dict ----
import urllib2
import urllib
import json
import time
import re
import random
import uuid
import requests
from bs4 import BeautifulSoup

def getInfo(url):
    headers = {}
    headers["User-Agent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36"
    try:
        # proxies = {'http': proxy_ip}
        request = urllib2.Request(url=url, headers=headers)
        # request.set_proxy(proxy_ip, 'http')
        response = urllib2.urlopen(request)
        html = response.read().decode('utf-8')
        # html = requests.get(url, headers=headers, proxies=proxies)
        html = BeautifulSoup(html, 'lxml')
        info = {}
        info['id'] = str(uuid.uuid4())   # stored as a 36-character string primary key
        info['title'] = html.find('div', class_='item_con pos_info').find('span', class_='pos_name').get_text()
        temp = html.find('div', class_='pos_base_info').find('span', class_='pos_salary').get_text()
        info['salary_min'] = int(re.findall(r"(\d+)\-", temp)[0])
        info['salary_max'] = int(re.findall(r"\-(\d+)", temp)[0])
        info['company'] = html.find('div', class_='item_con company_baseInfo').find('p', class_='comp_baseInfo_title').find('a', class_='baseInfo_link').get_text()
        temp = html.find('div', class_='item_con company_baseInfo').find('p', class_='comp_baseInfo_scale').get_text()
        info['scale_min'] = int(re.findall(r"(\d+)\-", temp)[0])
        info['scale_max'] = int(re.findall(r"\-(\d+)", temp)[0])
        info['address'] = html.find('div', class_='item_con work_adress').find('p', class_='detail_adress').get_text()
        return info
    except Exception, e:
        # any missing element or network error makes the page unusable; signal that with 0
        return 0
# -*- coding:utf-8 -*-
# ---- insertmysql.py: writes one parsed posting into the jobs table ----
import MySQLdb
import MySQLdb.cursors
import getCity

def insertMysql(info):
    if info is None:
        print "there is no information"
        return 0
    else:
        try:
            db = MySQLdb.connect(host='localhost', user='root', passwd='123', db='58city', port=3306, charset='utf8')
            cursor = db.cursor()
            id = info['id']
            title = info['title']
            salary_min = info['salary_min']
            salary_max = info['salary_max']
            company = info['company']
            scale_min = info['scale_min']
            scale_max = info['scale_max']
            address = info['address']
            cursor.execute(
                'insert into jobs(id,title,salary_min,salary_max,company,scale_min,scale_max,address) values(%s,%s,%s,%s,%s,%s,%s,%s)',
                (id, title, salary_min, salary_max, company, scale_min, scale_max, address))
            db.commit()
            cursor.close()   # close the cursor before the connection
            db.close()
            return 1
        except Exception, e:
            print "database error:", e
            return 0