Scraping job listings from 58.com with Python

Web scraping learning notes

Fetching job listings from 58同城

Why write a crawler

We write crawlers to pull the key information out of web pages so that we can analyze it. In the data era, data is a valuable resource, and crawlers are how we collect it.

Goal of this article

There are many crawling tools today, mostly built on Python. As a beginner, I suggest not leaning too heavily on ready-made frameworks, because you learn little about what happens underneath; with scrapy, for example, it is hard to see what it calls internally. This article uses urllib2 + BeautifulSoup + MySQL to collect job listings from 58同城. The crucial part is analyzing the page source to locate the information we need.

Fetching the page source

url = "http://hz.58.com/tech/" + "pn"+str(start)+"/"

request = urllib2.Request(url=url,headers=headers)

response = urllib2.urlopen(request,timeout=60)

html = response.read().decode('utf-8')

soup = BeautifulSoup(html,'lxml')

Walking the 58 listing page

for item in all_dl:
    job = item.find('dt').find('a')
    info = getdatas.getInfo(job['href'])
    if info != 0:
        count += insertmysql.insertMysql(info)
        print "rows stored so far: %d" % count
    time.sleep(5)
start = start + 1

Each item here is one job listing. We then follow the link into the second-level page to pull the details of that posting.
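To sanity-check the selectors before writing the full loop, here is a minimal sketch of my own (assuming the same layout the article targets: a div with id infolist containing dl/dt/a elements) that prints the first few detail-page links:

# coding:utf8
import urllib2
from bs4 import BeautifulSoup

headers = {"User-Agent": "Mozilla/5.0"}
url = "http://hz.58.com/tech/pn1/"
request = urllib2.Request(url=url, headers=headers)
html = urllib2.urlopen(request, timeout=60).read().decode('utf-8')
soup = BeautifulSoup(html, 'lxml')
for dl in soup.find('div', id='infolist').findAll('dl')[:3]:
    print dl.find('dt').find('a')['href']  # link to one posting's detail page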

The second-level page

Here too we first fetch the page source, then use BeautifulSoup to match the key fields; the BeautifulSoup documentation on its official site covers the API in detail.
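For readers new to BeautifulSoup, a toy illustration of the find/class_/get_text pattern used throughout getInfo below:

# coding:utf8
from bs4 import BeautifulSoup

snippet = '<div class="pos_base_info"><span class="pos_salary">5000-8000</span></div>'
soup = BeautifulSoup(snippet, 'lxml')
# find(tag, class_=...) matches on the CSS class; get_text() strips the tags
print soup.find('div', class_='pos_base_info').find('span', class_='pos_salary').get_text()
# prints: 5000-8000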

def getInfo(url):
    headers = {}
    headers["User-Agent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36"
    try:
        # proxies = {'http': proxy_ip}
        request = urllib2.Request(url=url, headers=headers)
        # request.set_proxy(proxy_ip, 'http')
        response = urllib2.urlopen(request)
        html = response.read().decode('utf-8')
        # html = requests.get(url, headers=headers, proxies=proxies)
        soup = BeautifulSoup(html, 'lxml')
        info = {}
        info['id'] = str(uuid.uuid4())  # str() so the UUID can be stored as a varchar
        info['title'] = soup.find('div', class_='item_con pos_info').find('span', class_='pos_name').get_text()
        # the salary is shown as a range such as "5000-8000"
        temp = soup.find('div', class_='pos_base_info').find('span', class_='pos_salary').get_text()
        info['salary_min'] = int(re.findall(r"(\d+)-", temp)[0])
        info['salary_max'] = int(re.findall(r"-(\d+)", temp)[0])
        info['company'] = soup.find('div', class_='item_con company_baseInfo').find('p', class_='comp_baseInfo_title').find('a', class_='baseInfo_link').get_text()
        # the company scale is also a range, e.g. "100-499人"
        temp = soup.find('div', class_='item_con company_baseInfo').find('p', class_='comp_baseInfo_scale').get_text()
        info['scale_min'] = int(re.findall(r"(\d+)-", temp)[0])
        info['scale_max'] = int(re.findall(r"-(\d+)", temp)[0])
        info['address'] = soup.find('div', class_='item_con work_adress').find('p', class_='detail_adress').get_text()
        return info
    except Exception, e:
        return 0  # pages that do not follow this layout end up here

I use a UUID as the primary key and scrape the main fields of each posting: salary, company scale, company address, and so on. Some posting pages on 58 do not follow this standard layout, however, so if you want more complete data you need to handle those cases separately.
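One way to do that case handling is a small helper (hypothetical, not part of the original code) that returns None instead of raising when a node is missing, so each field can degrade gracefully:

def safe_text(soup, div_class, child_tag, child_class):
    # return the text of div.<div_class> > <child_tag>.<child_class>, or None
    node = soup.find('div', class_=div_class)
    if node is not None:
        node = node.find(child_tag, class_=child_class)
    return node.get_text() if node is not None else None

# e.g. info['title'] = safe_text(soup, 'item_con pos_info', 'span', 'pos_name')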

Storing the data in MySQL

The database here is MySQL, and connecting to it from Python is straightforward:

db = MySQLdb.connect(host='localhost', user='root', passwd='123', db='58city', port=3306, charset='utf8')
cursor = db.cursor()
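The article never shows the jobs table definition. A schema matching the insert statement below (column names taken from the insert; the types and lengths are my assumption) could be created once with the cursor above:

cursor.execute('''
    create table if not exists jobs (
        id         varchar(36) primary key,
        title      varchar(255),
        salary_min int,
        salary_max int,
        company    varchar(255),
        scale_min  int,
        scale_max  int,
        address    varchar(255)
    ) default charset=utf8
''')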

Then write the fields into MySQL:

cursor.execute(
    'insert into jobs(id,title,salary_min,salary_max,company,scale_min,scale_max,address) values(%s,%s,%s,%s,%s,%s,%s,%s)',
    (id, title, salary_min, salary_max, company, scale_min, scale_max, address))
db.commit()
cursor.close()  # close the cursor before the connection
db.close()

Scraping code will inevitably run into pages and rows that break our assumptions, so it is best to wrap the database work in try/except:

except Exception, e:
    print "database error: %s" % e
    return 0

Anti-scraping countermeasures

We can route requests through proxy IPs so our own address does not get banned, and sleep between requests so we do not crawl fast enough for the site to notice.
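The main script below imports a getip module that the listing does not include. A minimal stand-in (my sketch: it simply reads one "ip:port" proxy per line from a local proxies.txt, a filename of my choosing) would be:

# coding:utf8
# getip.py: stand-in proxy source, reading "ip:port" lines from a local file

def get_ips():
    with open('proxies.txt') as f:
        return [line.strip() for line in f if line.strip()]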

The full source code follows: the main crawler script, then getdatas.py (the detail-page parser) and insertmysql.py (the database writer).

# coding:utf8
import random
import time
import urllib2

import requests
from bs4 import BeautifulSoup

import getdatas
import getip
import insertmysql

ISOTIMEFORMAT = '%Y-%m-%d %X'

headers = {}
headers["User-Agent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36"

print "********** START **********"
print time.strftime(ISOTIMEFORMAT, time.localtime())

try:
    start = 33
    count = 0
    # proxy_list = getip.get_ips()
    while True:
        try:
            # proxy_ip = random.choice(proxy_list)
            # proxies = {'http': proxy_ip}
            url = "http://hz.58.com/tech/" + "pn" + str(start) + "/"
            request = urllib2.Request(url=url, headers=headers)
            # request.set_proxy(proxy_ip, 'http')
            response = urllib2.urlopen(request, timeout=60)
            html = response.read().decode('utf-8')
            # html = requests.get(url, headers=headers, proxies=proxies)
            soup = BeautifulSoup(html, 'lxml')
            all_dl = soup.find('div', id='infolist').findAll('dl')
            if len(all_dl) == 0:
                break  # no more listings: we ran past the last page
            for item in all_dl:
                job = item.find('dt').find('a')
                info = getdatas.getInfo(job['href'])
                if info != 0:
                    count += insertmysql.insertMysql(info)
                    print "rows stored so far: %d" % count
                time.sleep(5)  # be polite between detail pages
            start = start + 1
            print start
            time.sleep(5)
        except Exception, e:
            print "error in page loop: %s" % e
except Exception, e:
    print "fatal error: %s" % e

# coding:utf8
# getdatas.py: parse one job detail page
import json
import random
import re
import time
import urllib
import urllib2
import uuid

import requests
from bs4 import BeautifulSoup


def getInfo(url):
    headers = {}
    headers["User-Agent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36"
    try:
        # proxies = {'http': proxy_ip}
        request = urllib2.Request(url=url, headers=headers)
        # request.set_proxy(proxy_ip, 'http')
        response = urllib2.urlopen(request)
        html = response.read().decode('utf-8')
        # html = requests.get(url, headers=headers, proxies=proxies)
        soup = BeautifulSoup(html, 'lxml')
        info = {}
        info['id'] = str(uuid.uuid4())  # str() so the UUID can be stored as a varchar
        info['title'] = soup.find('div', class_='item_con pos_info').find('span', class_='pos_name').get_text()
        # the salary is shown as a range such as "5000-8000"
        temp = soup.find('div', class_='pos_base_info').find('span', class_='pos_salary').get_text()
        info['salary_min'] = int(re.findall(r"(\d+)-", temp)[0])
        info['salary_max'] = int(re.findall(r"-(\d+)", temp)[0])
        info['company'] = soup.find('div', class_='item_con company_baseInfo').find('p', class_='comp_baseInfo_title').find('a', class_='baseInfo_link').get_text()
        # the company scale is also a range, e.g. "100-499人"
        temp = soup.find('div', class_='item_con company_baseInfo').find('p', class_='comp_baseInfo_scale').get_text()
        info['scale_min'] = int(re.findall(r"(\d+)-", temp)[0])
        info['scale_max'] = int(re.findall(r"-(\d+)", temp)[0])
        info['address'] = soup.find('div', class_='item_con work_adress').find('p', class_='detail_adress').get_text()
        return info
    except Exception, e:
        return 0  # pages that do not follow this layout end up here

# -*- coding:utf-8 -*-
# insertmysql.py: write one parsed posting to MySQL
import MySQLdb
import MySQLdb.cursors

import getCity


def insertMysql(info):
    if info is None:
        print "no information to insert"
        return 0
    else:
        try:
            db = MySQLdb.connect(host='localhost', user='root', passwd='123', db='58city', port=3306, charset='utf8')
            cursor = db.cursor()
            id = info['id']
            title = info['title']
            salary_min = info['salary_min']
            salary_max = info['salary_max']
            company = info['company']
            scale_min = info['scale_min']
            scale_max = info['scale_max']
            address = info['address']
            cursor.execute(
                'insert into jobs(id,title,salary_min,salary_max,company,scale_min,scale_max,address) values(%s,%s,%s,%s,%s,%s,%s,%s)',
                (id, title, salary_min, salary_max, company, scale_min, scale_max, address))
            db.commit()
            cursor.close()  # close the cursor before the connection
            db.close()
            return 1
        except Exception, e:
            print "database error: %s" % e
            return 0
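Once rows have accumulated, the analysis mentioned at the start can begin with simple queries. For example, a sketch reusing the connection settings above to compute the average advertised salary range:

# coding:utf8
import MySQLdb

db = MySQLdb.connect(host='localhost', user='root', passwd='123', db='58city', port=3306, charset='utf8')
cursor = db.cursor()
cursor.execute('select avg(salary_min), avg(salary_max) from jobs')
print cursor.fetchone()  # a tuple (avg_min, avg_max)
cursor.close()
db.close()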