系列目录
- 基于Python网络爬虫与推荐算法新闻推荐平台
- 新闻推荐平台功能详解----新闻爬虫
- 新闻推荐平台功能详解----数据分析系统
文章目录
- 系列目录
- 功能介绍
- 一、结构
- 二、具体实现
-
- 1.标签推荐
- 2.热度推荐
- 3. 地区推荐
- 总结
功能介绍
将分析系统产生的分析结果数据进行对应的新闻推荐,推荐方式如下三种:
- 标签推荐
- 热度推荐
- 地区推荐
一、结构
新闻爬虫包括三部分:URL采集器、详情页采集器、定时器
推荐类别 | 实现方式 |
---|---|
标签推荐 | 用户注册时可以选择标签,并且在用户阅读过程中会对标签进行正向/反向的反馈,反馈的结果会影响用户标签的权重。将用户的标签与新闻关键词进行相似度计算,相似度越高意味着新闻与用户感兴趣的标签相关性越高,反之则相关性越低。 |
热度推荐 | 新闻的热度来源于用户对新闻的阅读与评论,在一定程度上新闻热度与用户的评论阅读量成正相关,再加上时间作为热度的另一个参考,距离当前时间越近的新闻越可能成为热点新闻。 |
地区推荐 | 用户在登录/使用新闻阅读平台时会留下用户访问的IP地址,通过IP地址进行地区分析可以得到用户所在地的大概范围,通过对省份地区与新闻内容的匹配可以得到与用户所在地区相关的新闻内容,从而也作为新闻推荐的一种方式 |
二、具体实现
1.标签推荐
# -*- coding: utf-8 -*-
'''
Author:Z
Desc:通过用户标签进行内容用户推送分析,并把推送结果导入数据库
'''
import datetime
import logging
import os
from logging.handlers import TimedRotatingFileHandler
import pymysql
from Spider.settings import DB_HOST, DB_USER, DB_PASSWD, DB_NAME, DB_PORT
logger = logging.getLogger(__name__)
logger.setLevel(level=logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(levelname)-7s - %(message)s')
# 2. 初始化handler,并配置formater
log_file_handler = TimedRotatingFileHandler(filename="Recommend/recommend/rlg.log",
when="S", interval=10,
backupCount=20)
log_file_handler.setFormatter(formatter)
# 3. 向logger对象中添加handler
logger.addHandler(log_file_handler)
class NewsRecommend:
    """Tag-based news recommender.

    Computes the Jaccard similarity between each user's interest tags and the
    keyword tags extracted for every news article, then stores the matches in
    the ``news_api_recommend`` table with ``species`` = 0 (tag recommendation).
    """

    # Class-level logger so every method logs through the same channel
    # (the original mixed the root logger and the module logger).
    _logger = logging.getLogger(__name__)

    def __init__(self, file):
        # ``file``: path of a tab-separated "<newsid>\t<tag,tag,...>" keyword file.
        self.file = file
        self.db = self.connect()
        self.cursor = self.db.cursor()
        self.user_dict = self.loadDBData()
        self.news_tags = self.loadFileData()
        self.result = self.getRecResult()

    def connect(self):
        """Open and return a MySQL connection using the Spider settings
        (host, user, password, database, port; utf8 charset)."""
        db = pymysql.Connect(host=DB_HOST, user=DB_USER, password=DB_PASSWD,
                             database=DB_NAME, port=DB_PORT, charset='utf8')
        return db

    def loadDBData(self):
        """Fetch ``(userid, tags)`` rows for every user.

        Returns an empty tuple on a database error — the original left
        ``message`` unbound and raised ``UnboundLocalError`` instead.
        """
        self._logger.info("从数据库获取数据")
        sql_s = 'select userid,tags from news_api_user'
        message = ()  # safe default when the query fails
        try:
            self.cursor.execute(sql_s)
            message = self.cursor.fetchall()
        except Exception:
            self._logger.error("Database Error")
            self.db.rollback()
        return message

    def loadFileData(self):
        """Parse the keyword file into ``{newsid: "tag,tag,..."}``.

        Malformed lines (wrong number of tab-separated fields) are logged
        and skipped.
        """
        print("开始加载文件数据:%s" % self.file)
        news_tags = dict()
        # "with" guarantees the handle is closed; the original leaked it.
        with open(self.file, "r", encoding="utf-8") as fh:
            for line in fh:
                try:
                    newid, newtags = line.strip().split("\t")
                except ValueError:
                    self._logger.info("读取分词数据过程中出现错误,错误行为:{}".format(line))
                    continue
                news_tags[newid] = newtags
                self._logger.info("Loading:{}".format(newtags))
        return news_tags

    def getRecResult(self):
        """Compute per-user tag recommendations.

        Returns a list of ``[userid, newsid, cor]`` rows — at most 20 per
        user — where ``cor`` is the Jaccard index |user∩news| / |user∪news|
        rounded to two decimals.
        """
        news_cor_list = list()
        for user in self.user_dict:
            # user[0] = userid, user[1] = comma-separated tag string
            usertags = set(user[1].split(","))
            count = 0
            for newsid, tagstr in self.news_tags.items():
                newstags = set(tagstr.split(","))
                cor = len(usertags & newstags) / len(usertags | newstags)
                if cor > 0.0:
                    count += 1
                    news_cor_list.append([user[0], int(newsid), round(cor, 2)])
                    if count >= 20:
                        # Cap reached — the original kept scanning all
                        # remaining articles without appending anything.
                        break
        self._logger.info("news_cor_list:{}".format(news_cor_list))
        return news_cor_list

    def writeToMySQL(self):
        """Insert the recommendation rows (unread, species=0) into MySQL.

        Failed inserts are logged and rolled back individually.
        """
        self._logger.info("将数据写入数据库...")
        # Hoisted out of the loop: the date is loop-invariant.
        time = datetime.datetime.now().strftime("%Y-%m-%d")
        # Parameterized query — the original interpolated values into the SQL string.
        sql_i = ('insert into news_api_recommend'
                 '(userid, newsid, hadread, cor, species, time) '
                 'values (%s, %s, 0, %s, 0, %s)')
        for row in self.result:
            try:
                self.cursor.execute(sql_i, (int(row[0]), int(row[1]), float(row[2]), time))
                self.db.commit()
            except Exception:
                self._logger.error("rollback:{}".format(row))
                self.db.rollback()
        self._logger.info("推荐内容数据写入完成....")
def beginNewsRecommendByTags():
    """Entry point: run the tag-based recommender over every keyword file."""
    keyword_dir = "Recommend/data/keywords/"
    for name in os.listdir(keyword_dir):
        print("开始计算文件 %s 下的新闻相关度。" % name)
        recommender = NewsRecommend(keyword_dir + name)
        recommender.writeToMySQL()
    print("\n相关度计算完毕")
2.热度推荐
# -*- coding: utf-8 -*-
'''
Author:Z
Desc:通过热值对用户进行推送新闻
'''
import datetime
import logging
from logging.handlers import TimedRotatingFileHandler
import pymysql
from Spider.settings import DB_HOST, DB_USER, DB_PASSWD, DB_NAME, DB_PORT
logger = logging.getLogger(__name__)
logger.setLevel(level=logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(levelname)-7s - %(message)s')
# 2. 初始化handler,并配置formater
log_file_handler = TimedRotatingFileHandler(filename="Recommend/recommend/hlg.log",
when="S", interval=10,
backupCount=20)
log_file_handler.setFormatter(formatter)
# 3. 向logger对象中添加handler
logger.addHandler(log_file_handler)
class NewsRecommendByHotValue():
    """Hot-value recommender.

    Pushes the current 20 hottest news articles to every user by inserting
    rows into ``news_api_recommend`` with ``species`` = 2 and ``cor`` = 1.00.
    """

    # Single logger for the class (the original mixed root logging and ``logger``).
    _logger = logging.getLogger(__name__)

    def __init__(self):
        self.db = self.connect()
        self.cursor = self.db.cursor()
        self.userlist = self.loadDBData()
        # Runs the whole recommendation as a side effect of construction.
        self.result = self.getRecResult()

    def connect(self):
        """Open and return a MySQL connection using the Spider settings."""
        db = pymysql.Connect(host=DB_HOST, user=DB_USER, password=DB_PASSWD,
                             database=DB_NAME, port=DB_PORT, charset='utf8')
        return db

    def loadDBData(self):
        """Fetch ``(userid,)`` rows for all users.

        Returns an empty tuple on a database error — the original left
        ``useridlist`` unbound and raised ``UnboundLocalError``.
        """
        sql_s = 'select userid from news_api_user'
        useridlist = ()  # safe default when the query fails
        try:
            self.cursor.execute(sql_s)
            useridlist = self.cursor.fetchall()
        except Exception:
            self._logger.error("Database Error")
            self.db.rollback()
        return useridlist

    def getRecResult(self):
        """Select the 20 hottest articles and insert them as recommendations
        for every user; failed inserts are logged and rolled back."""
        sql_s = 'select news_id,news_hot from news_api_newshot order by news_hot DESC limit 0,20;'
        self.cursor.execute(sql_s)
        newsidlist = self.cursor.fetchall()
        # Hoisted out of the loop: the date is loop-invariant.
        time = datetime.datetime.now().strftime("%Y-%m-%d")
        # Parameterized insert — the original interpolated values into the SQL
        # string; hot recommendations always carry cor = 1.00.
        sql_w = ('insert into news_api_recommend'
                 '(userid, newsid, hadread, cor, species, time) '
                 'values (%s, %s, 0, %s, 2, %s)')
        for user in self.userlist:
            for newsid in newsidlist:
                self._logger.info("insert userid={} newsid={}".format(user[0], newsid[0]))
                try:
                    self.cursor.execute(sql_w, (int(user[0]), int(newsid[0]), 1.0, time))
                    self.db.commit()
                except Exception:
                    self._logger.error("rollback:{}".format(newsid[0]))
                    self.db.rollback()
        return True
def beginrecommendbyhotvalue():
    """Entry point: trigger one run of the hot-value based recommendation."""
    recommender = NewsRecommendByHotValue()
    return None
3. 地区推荐
# -*- coding: utf-8 -*-
'''
@FileName:NewsRecommendByCity.py
@Description:通过ip获取到用户登录的所在区域,并通过区域进行内容匹配,然后给用户进行新闻推荐
@Author:Zline
@Time:2021/3/21 9:11
@Copyright:©2019-2021 Zline
'''
import datetime
import logging
import os
import re
from logging.handlers import TimedRotatingFileHandler
import pymysql
import requests
from Spider.settings import DB_HOST, DB_USER, DB_PASSWD, DB_NAME, DB_PORT
logger = logging.getLogger(__name__)
logger.setLevel(level=logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(levelname)-7s - %(message)s')
# 2. 初始化handler,并配置formater
log_file_handler = TimedRotatingFileHandler(filename="Recommend/recommend/clg.log",
when="S", interval=10,
backupCount=20)
log_file_handler.setFormatter(formatter)
# 3. 向logger对象中添加handler
logger.addHandler(log_file_handler)
# http://ip.ws.126.net/ipquery?ip=223.104.63.12
class NewsRecommendByCity():
    """Region-based recommender.

    Resolves each user's login IP to a location string via the 126.net geo
    API, stores the region on the user row, and recommends news whose tag
    set contains the full region string (``species`` = 1).
    """

    # Single logger for the class (the original mixed root logging, ``logger`` and prints).
    _logger = logging.getLogger(__name__)

    def __init__(self):
        self.file = self.getFile()
        self.db = self.connect()
        self.cursor = self.db.cursor()
        self.userslist = self.getUserData()
        self.news_tags = self.loadFileData()
        self.region = self.getRegion()
        self.reco = self.getRecommendByCity()
        # Runs the DB writes as a side effect of construction.
        self.result = self.writeToMySQL()

    def connect(self):
        """Open and return a MySQL connection using the Spider settings."""
        db = pymysql.Connect(host=DB_HOST, user=DB_USER, password=DB_PASSWD,
                             database=DB_NAME, port=DB_PORT, charset='utf8')
        return db

    def getRecommendByCity(self):
        """Match each user's resolved region against news tags.

        A news item matches only when one of its comma-separated tags equals
        the user's full region string. Returns ``[userid, newsid, 1]`` rows.
        """
        city_cor_list = list()
        region_map = dict(self.region)
        for user in self.userslist:
            userid = user[0]
            self._logger.info("region:{}".format(self.region))
            # Singleton set so the match is a cheap set intersection;
            # users with no resolved region yield {None}, which matches nothing.
            city_key = {region_map.get(userid)}
            for newsid in self.news_tags:
                newstags = set(self.news_tags[newsid].split(","))
                if len(city_key & newstags) > 0:
                    city_cor_list.append([int(userid), int(newsid), 1])
                    self._logger.info("city_cor_list.append:{}".format(str(userid) + ":" + str(newsid)))
        return city_cor_list

    def getRegion(self):
        """Resolve each user's IP to a location string via the 126.net API.

        Returns ``{userid: location}``. Users whose lookup fails or returns
        no ``lo="..."`` field are skipped — the original raised IndexError
        on an empty match and could hang forever without a timeout.
        """
        poslist = dict()
        for user in self.userslist:
            userid, ip = user[0], user[1]
            url = 'http://ip.ws.126.net/ipquery?ip=' + str(ip)
            try:
                res = requests.get(url, timeout=10)
                pos = re.findall('lo="(.*?)"', res.text)
            except requests.RequestException:
                self._logger.error("ip lookup failed:{}".format(ip))
                continue
            if pos:
                poslist[userid] = pos[0]
            else:
                self._logger.error("no location in response for ip:{}".format(ip))
        return poslist

    def getUserData(self):
        """Fetch ``(userid, ip)`` rows for all users; empty tuple on DB error."""
        users = ()
        sql_s = 'select userid,ip from news_api_user '
        try:
            self.cursor.execute(sql_s)
            users = self.cursor.fetchall()
        except Exception:
            self._logger.error("Database Error")
        return users

    def getFile(self):
        """Return the path of the first keyword file, or None when the
        directory is empty (the original fell off the loop and returned None implicitly)."""
        original_data_path = "Recommend/data/keywords/"
        for file in os.listdir(original_data_path):
            return original_data_path + file
        return None

    def loadFileData(self):
        """Parse the keyword file into ``{newsid: "tag,tag,..."}``;
        malformed lines are reported and skipped."""
        print("开始加载文件数据:%s" % self.file)
        news_tags = dict()
        # "with" guarantees the handle is closed; the original leaked it.
        with open(self.file, "r", encoding="utf-8") as fh:
            for line in fh:
                try:
                    newid, newtags = line.strip().split("\t")
                except ValueError:
                    print("读取分词数据过程中出现错误,错误行为:{}".format(line))
                    continue
                news_tags[newid] = newtags
        return news_tags

    def writeToMySQL(self):
        """Persist each user's region (province suffix stripped) and insert
        the matched recommendation rows (species=1). Returns 1 on completion."""
        self._logger.info("将数据写入数据库...")
        time = datetime.datetime.now().strftime("%Y-%m-%d")
        region_map = dict(self.region)
        # Parameterized — the original interpolated the region string into the SQL.
        sql_u_region = "update news_api_user set region=%s where userid=%s"
        for user in self.userslist:
            userid = user[0]
            location = region_map.get(userid)
            if location is None:
                # Lookup failed for this user — the original crashed with
                # AttributeError calling .replace() on None here.
                continue
            try:
                self.cursor.execute(sql_u_region, (location.replace("省", ""), userid))
                self.db.commit()
            except Exception:
                self._logger.error("rollback:{}".format(userid))
                self.db.rollback()
        sql_i = ('insert into news_api_recommend'
                 '(userid, newsid, hadread, cor, species ,time) '
                 'values (%s, %s, 0, %s, 1, %s)')
        for row in self.reco:
            try:
                self.cursor.execute(sql_i, (int(row[0]), int(row[1]), float(row[2]), time))
                self.db.commit()
            except Exception:
                self._logger.error("rollback:{}".format(row))
                self.db.rollback()
        self._logger.info("推荐内容数据写入完成....")
        return 1
def beginrecommendbycity():
    """Entry point: trigger one run of the region (city) based recommendation."""
    recommender = NewsRecommendByCity()
    return None
总结
目前采用的推荐方式仅有三种,预计中还有基于好友的协同过滤的推荐方式,但是并没有实现好友系统,该功能就只能搁置了。
此外,这样单方面的推荐系统准确率还是相对较低的,与有机器学习的推荐算法比起来还是有差距的,因此如果还想提升推荐算法的使用体验的话,就需要使用机器学习了,争取下一个版本能用上机器学习吧~
项目完整的源码已更新,有需要的可以自行下载😀
欢迎提交问题和错误
个人码云主页,欢迎交流!!
个人GitHub主页,欢迎交流!!