天天看點

python找房源_python抓取鍊家房源資訊

#-*-coding:utf-8-*-

from __future__ import print_function

import json
import re
import time
import urllib
import urllib2  # Python 2 only module

import lxml
import requests
from bs4 import BeautifulSoup
from lxml import etree
from pymongo import MongoClient

# MongoDB handle: scraped listings are inserted into the ``House`` collection
# of the ``test`` database on the local server.
client = MongoClient('localhost', 27017)
db = client.test
House = db.House

# Browser-like headers sent with every page request.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    # 'br' is deliberately not advertised: requests only decodes Brotli when
    # an optional Brotli package is installed, so accepting it could hand the
    # HTML parser an undecoded body.
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    # NOTE(review): placeholder value; presumably a real session cookie has
    # to be filled in before running -- confirm.
    'Cookie': '......',
    'Host': 'bj.lianjia.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
}

# Prefix of the listing index pages; the page number is appended to it.
URL = 'https://bj.lianjia.com/ershoufang/pg'

def download(url):
    """Fetch ``url`` and return the response body as text.

    The request is sent with the module-level ``headers``. A failed attempt
    (connection problem, timeout, or HTTP error status) is reported on stdout
    and retried once.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded response body, or None when both attempts fail.
    """
    num_try = 2
    while num_try > 0:
        num_try -= 1
        try:
            # A timeout keeps one stalled connection from hanging the crawl.
            response = requests.get(url, headers=headers, timeout=10)
            # Treat 4xx/5xx answers as failures instead of parsing an error page.
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            # requests reports its failures as RequestException subclasses;
            # urllib2.URLError is never raised by requests.get.
            print('Download error', e)
    return None


def _parse_house_info(text):
    """Split one ``houseInfo`` text on ``|`` into exactly six stripped fields.

    Returns None when fewer than five fields are present. When only five are
    present, the sixth (elevator) slot is filled with the string "None".
    """
    fields = [part.strip() for part in text.split('|')]
    if len(fields) < 5:
        return None
    elevator = fields[5] if len(fields) > 5 else "None"
    return fields[:5] + [elevator]


def get_message(url):
    """Scrape one listing page and insert every listing into ``House``.

    Each ``priceInfo`` div is paired by position with the ``houseInfo`` div
    at the same index. Every resulting record is printed as compact JSON and
    then inserted into the module-level ``House`` collection.

    Args:
        url: Address of the listing page to scrape.

    Returns:
        None. Nothing is stored when the page could not be downloaded.
    """
    html = download(url)
    if html is None:
        # Download already reported the error; there is nothing to parse.
        return

    soup = BeautifulSoup(html, 'html.parser')
    prices = soup.find_all('div', 'priceInfo')
    infos = soup.find_all('div', attrs={'class': 'houseInfo'})

    for price_div, info_div in zip(prices, infos):
        fields = _parse_house_info(info_div.get_text())
        if fields is None or price_div.span is None:
            # Skip entries that do not have the expected layout rather than
            # aborting the whole page.
            continue
        address, house_type, area, toward, decorate, elevate = fields

        # Build the document directly instead of formatting a JSON string and
        # parsing it back, which failed on values containing quotes or
        # backslashes. The 'Elevete' spelling is kept unchanged so stored
        # documents keep the same field name as before.
        record = {
            'Address': address,
            'House_type': house_type,
            'Area': area,
            # '%s' turns the parser's string object into a plain string.
            'Price': '%s' % price_div.span.string,
            'Toward': toward,
            'Decorate': decorate,
            'Elevete': elevate,
        }
        # Print before inserting: insert_one adds an '_id' value to the dict
        # that json.dumps cannot serialize.
        print(json.dumps(record, ensure_ascii=False, separators=(',', ':')))
        House.insert_one(record)


if __name__ == '__main__':
    page_count = 100  # pages 1..page_count are crawled
    delay = 1         # seconds slept after each page

    t = time.time()
    print(t)
    for num in range(1, page_count + 1):
        url = URL + str(num)
        print(url)
        get_message(url)
        time.sleep(delay)
    t1 = time.time()
    print('Total time:')
    # Subtract the time spent sleeping so only the crawl itself is reported.
    print(t1 - t - page_count * delay)