数据集
-
Dataset1.txt
328 个同学的身高、体重、性别数据(78 个女生、250 个男生)
-
Dataset2.txt
124 个同学的数据(40 女、84 男)
-
Dataset3.txt
90 个同学的数据(16 女,74 男)
百度网盘 提取码:8plu
工作一
以dataset1为训练数据库,假设身高与体重满足高斯分布,
- 进行高斯分布的参数估计,
- 并进行基于最小错误率的贝叶斯分类,分别考虑男女的先验概率,0.5-0.5;0.6-0.4;0.7-0.3,0.8-0.2,
- 并以dataset2和dataset3为测试数据库分析分类性能,
- 并探讨先验概率对分类性能的影响。
导入数据
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
import numpy as np
import csv
# Load the training set and the two test sets as arrays of strings.
# Judging by the sample output, each CSV row is "height,weight,sex".
p = r'train_data.csv'
with open(p, encoding='utf-8') as f:
    train_data = np.loadtxt(f, str, delimiter=",")
print("打印训练集前5个数据集:\n", train_data[:5])

p = r'test_data1.csv'
with open(p, encoding='utf-8') as f:
    test_data1 = np.loadtxt(f, str, delimiter=",")
print("打印测试集1前5个数据集:\n", test_data1[:5])

p = r'test_data2.csv'
with open(p, encoding='utf-8') as f:
    test_data2 = np.loadtxt(f, str, delimiter=",")
# Fixed label: these rows come from test set 2 (it used to say test set 1).
print("打印测试集2前5个数据集:\n", test_data2[:5])
# Split every dataset into its feature columns (first two) and label column (third).
X_train, X_test1, X_test2 = train_data[:, :2], test_data1[:, :2], test_data2[:, :2]
y_train, y_test1, y_test2 = train_data[:, 2], test_data1[:, 2], test_data2[:, 2]

# Report the shapes of the feature matrices and label vectors.
print("查看X_train特征维度:", X_train.shape, "\n查看y_train特征维度:", y_train.shape)
print("查看X_test1特征维度:", X_test1.shape, "\n查看y_test1特征维度:", y_test1.shape)
# Fixed label: the second value is the shape of y_test2, not X_test2.
print("查看X_test2特征维度:", X_test2.shape, "\n查看y_test2特征维度:", y_test2.shape)

# Convert the string features to floats.
X_train = X_train.astype(float)
X_test1 = X_test1.astype(float)
X_test2 = X_test2.astype(float)

# Encode the labels: "M" -> 1, anything else (i.e. "F") -> 0.
y_train = np.array([1 if sex == 'M' else 0 for sex in y_train])
y_test1 = np.array([1 if sex == 'M' else 0 for sex in y_test1])
y_test2 = np.array([1 if sex == 'M' else 0 for sex in y_test2])
import matplotlib.pyplot as plt
from matplotlib.pylab import mpl
from matplotlib.ticker import FuncFormatter

# Use a CJK-capable font and keep the minus sign renderable with it.
mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['axes.unicode_minus'] = False

# One scatter plot per dataset: males (label 1) as green circles,
# females (label 0) as red crosses.
fig, ax = plt.subplots(3, 1, figsize=(9, 25))
for _axis, _features, _labels in zip(
    ax,
    (X_train, X_test1, X_test2),
    (y_train, y_test1, y_test2),
):
    _axis.scatter(_features[_labels == 1, 0], _features[_labels == 1, 1],
                  marker="o", c="green", label="男士", alpha=1)
    _axis.scatter(_features[_labels == 0, 0], _features[_labels == 0, 1],
                  marker="x", c="red", label="女士", alpha=1)
    _axis.set_xlim(140, 200)
    _axis.set_ylim(35, 110)
    # Fixed: column 0 (x axis) is height and column 1 (y axis) is weight;
    # the two labels used to be swapped.
    _axis.set_xlabel('身高')
    _axis.set_ylabel('体重')
    _axis.legend()

# Fixed: the first subplot shows the training set, not a test set.
ax[0].set_title('训练集的散点分布')
ax[1].set_title('test_data1的散点分布')
ax[2].set_title('test_data2的散点分布')
plt.show()
打印训练集前5个数据集:
[['161' '46' 'F']
['160' '56' 'F']
['163' '50' 'F']
['169' '54' 'F']
['160' '48' 'F']]
打印测试集1前5个数据集:
[['151' '42' 'F']
['153' '48' 'F']
['155' '43' 'F']
['158' '49' 'F']
['158' '58' 'F']]
打印测试集1前5个数据集:
[['163' '48' 'F']
['169' '50' 'F']
['170' '50' 'F']
['167' '50' 'F']
['167' '55' 'F']]
查看X_train特征维度: (328, 2)
查看y_train特征维度: (328,)
查看X_test1特征维度: (124, 2)
查看y_test1特征维度: (124,)
查看X_test2特征维度: (90, 2)
查看X_test2特征维度: (90,)
一步步实现朴素贝叶斯
from collections import Counter
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
def get_prior(label):
    """Estimate the class prior probabilities from a label vector.

    The priors are returned in ascending order of the class label, so for
    labels encoded as 0 (F) and 1 (M) the result is always ``[P(F), P(M)]``.
    The previous implementation followed the order in which the labels first
    appeared, which silently swapped the priors whenever the data started
    with a male sample.

    Args:
        label: 1-D sequence/array of class labels.

    Returns:
        numpy array with the relative frequency of each distinct label.
    """
    label = np.asarray(label)
    # np.unique returns the distinct labels sorted, with matching counts.
    _, counts = np.unique(label, return_counts=True)
    prior = counts / len(label)
    print("训练集的先验概率[F,M]:", prior)
    return prior
# Class labels, in the order used by every per-class array below
# (labels were encoded earlier as 0 = F, 1 = M).
n_class = [0, 1]
prior = get_prior(y_train)


def avgs(data, label):
    """Return the per-class feature means, one row per entry of n_class."""
    class_means = []
    for cls in n_class:
        class_means.append(data[label == cls].mean(axis=0))
    return np.array(class_means)


avg = avgs(X_train, y_train)
print("train_data的均值:\n", avg)


# NOTE: this name shadows the builtin ``vars``; kept because it is the
# function's existing public name.
def vars(data, label):
    """Return the per-class feature variances, one row per entry of n_class."""
    class_vars = []
    for cls in n_class:
        class_vars.append(data[label == cls].var(axis=0))
    return np.array(class_vars)


var = vars(X_train, y_train)
print("train_data的方差:\n", var)
# Compute the class-conditional likelihoods.
def Calculate_likelihood(row):
    """Return the Gaussian likelihood of every sample under every class.

    Uses the module-level per-class means ``avg`` and variances ``var`` and
    treats the two features as independent (the per-feature densities are
    multiplied together).

    Args:
        row: array whose elements can be reshaped to (n_samples, 1, 2).

    Returns:
        Array of shape (n_samples, n_classes).
    """
    samples = row.reshape(-1, 1, 2)
    normaliser = 1 / np.sqrt(2 * np.pi * var)
    exponent = -(samples - avg) ** 2 / (2 * var)
    per_feature_density = normaliser * np.exp(exponent)
    return per_feature_density.prod(axis=2)


# Posterior = likelihood * prior / evidence; predict the class with the
# largest posterior for each sample of the second test set.
a = Calculate_likelihood(X_test2)
probs = a * prior
prob_sum = probs.sum(axis=1)
b = np.argmax(probs / prob_sum[:, None], axis=1)
def get_acc(y_test, y_hat):
    """Print the accuracy and the per-class precision and recall.

    Index 0 of the per-class scores is reported as female and index 1 as
    male, matching the 0/1 label encoding used in this file.
    """
    accuracy = (y_hat == y_test).sum() / len(y_hat)
    print("准确度:", accuracy)
    precision = precision_score(y_test, y_hat, average=None)
    print("女生的精确度:", precision[0], "\n男生的精确度:", precision[1])
    recall = recall_score(y_test, y_hat, average=None)
    print("女生的召回率:", recall[0], "\n男生的召回率:", recall[1])


get_acc(y_test2, b)
训练集的先验概率[F,M]: [0.23780488 0.76219512]
train_data的均值:
[[162.32051282 51.40384615]
[174.996 67.234 ]]
train_data的方差:
[[20.91009204 26.77601085]
[28.313984 97.986244 ]]
准确度: 0.8666666666666667
女生的精确度: 0.6
男生的精确度: 0.9428571428571428
女生的召回率: 0.75
男生的召回率: 0.8918918918918919
封装为类
from collections import Counter
class GaussianNB1():
    """Minimal two-class Gaussian naive Bayes classifier.

    Every per-class quantity (priors, means, variances, likelihood columns)
    is ordered like ``self.n_class``, i.e. ``[class 0, class 1]``.
    """

    def __init__(self):
        self.prior = None   # class priors, ordered like self.n_class
        self.avgs = None    # per-class feature means, shape (n_classes, n_features)
        self.vars = None    # per-class feature variances, same shape as avgs
        self.n_class = [0, 1]

    def get_prior(self, label):
        """Return the relative frequency of each class in ``label``.

        The result is ordered like ``self.n_class``. The previous
        implementation followed the order in which labels first appeared,
        which swapped the priors whenever the first sample was of class 1.
        """
        label = np.asarray(label)
        return np.array([np.mean(label == cls) for cls in self.n_class])

    def avg_s(self, data, label):
        """Return the per-class feature means."""
        return np.array([data[label == i].mean(axis=0) for i in self.n_class])

    def var_s(self, data, label):
        """Return the per-class feature variances."""
        return np.array([data[label == i].var(axis=0) for i in self.n_class])

    def fit(self, data, label, prior=None):
        """Estimate the Gaussian parameters and set the class priors.

        Args:
            data: feature matrix of shape (n_samples, n_features).
            label: class label of every sample.
            prior: optional priors ordered like ``self.n_class``; when
                omitted they are estimated from ``label``.
        """
        if prior is None:
            self.prior = self.get_prior(label)
        else:
            self.prior = np.asarray(prior, dtype=float)
        self.avgs = self.avg_s(data, label)
        self.vars = self.var_s(data, label)

    def Calculate_likelihood(self, data_test):
        """Return the likelihood of every sample under every class.

        Features are treated as independent, so the per-feature Gaussian
        densities are multiplied. The result has shape
        (n_samples, n_classes).
        """
        # Take the feature count from the fitted means rather than
        # hard-coding 2, so any number of features works.
        n_features = self.avgs.shape[1]
        samples = np.asarray(data_test).reshape(-1, 1, n_features)
        normaliser = 1 / np.sqrt(2 * np.pi * self.vars)
        exponent = -(samples - self.avgs) ** 2 / (2 * self.vars)
        return (normaliser * np.exp(exponent)).prod(axis=2)

    def predict(self, data_test):
        """Return the index of the most probable class for every sample."""
        joint = self.Calculate_likelihood(data_test) * self.prior
        # Dividing by the evidence does not change the argmax, so it is
        # skipped; this also avoids 0/0 when every likelihood underflows.
        return joint.argmax(axis=1)

    def get_acc(self, y_test, y_hat):
        """Print and return the accuracy and per-class precision.

        Returns:
            Tuple ``(accuracy, precision of class 0, precision of class 1)``.
        """
        accuracy = (y_hat == y_test).sum() / len(y_hat)
        # Compute the precision once instead of once per use.
        precision = precision_score(y_test, y_hat, average=None)
        print("准确度:", accuracy)
        print("女生的精确度:", precision[0], "\n男生的精确度:", precision[1])
        return (accuracy, precision[0], precision[1])
# Accuracy / female precision / male precision for each prior setting on test_data1.
last1, last2, last3 = [], [], []
clf1 = GaussianNB1()
clf1.fit(X_train, y_train)
print("如果按照训练集计算的先验证概率")
y_hat = clf1.predict(X_test1)
clf1.get_acc(y_test1, y_hat)
# Each pair below is written as [male prior, female prior], but the
# classifier orders its classes as [F (label 0), M (label 1)]. The pair is
# therefore reversed before it is handed to fit(); previously the male prior
# was applied to the female class and vice versa.
for i in [[0.5, 0.5], [0.6, 0.4], [0.7, 0.3], [0.8, 0.2]]:
    print("如果按照", i, "的先验证概率")
    clf1.fit(X_train, y_train, i[::-1])
    y_hat = clf1.predict(X_test1)
    a, b, c = clf1.get_acc(y_test1, y_hat)
    last1.append(a)
    last2.append(b)
    last3.append(c)
如果按照训练集计算的先验证概率
准确度: 0.8870967741935484
女生的精确度: 0.825
男生的精确度: 0.9166666666666666
如果按照 [0.5, 0.5] 的先验证概率
准确度: 0.8629032258064516
女生的精确度: 0.7555555555555555
男生的精确度: 0.9240506329113924
如果按照 [0.6, 0.4] 的先验证概率
准确度: 0.8629032258064516
女生的精确度: 0.7446808510638298
男生的精确度: 0.935064935064935
如果按照 [0.7, 0.3] 的先验证概率
准确度: 0.8467741935483871
女生的精确度: 0.7142857142857143
男生的精确度: 0.9333333333333333
如果按照 [0.8, 0.2] 的先验证概率
准确度: 0.8145161290322581
女生的精确度: 0.660377358490566
男生的精确度: 0.9295774647887324
# Accuracy / female precision / male precision for each prior setting on test_data2.
last4, last5, last6 = [], [], []
# Each pair below is written as [male prior, female prior], but the
# classifier orders its classes as [F (label 0), M (label 1)]. The pair is
# therefore reversed before it is handed to fit(); previously the male prior
# was applied to the female class and vice versa.
for i in [[0.5, 0.5], [0.6, 0.4], [0.7, 0.3], [0.8, 0.2]]:
    print("如果按照", i, "的先验证概率")
    clf1.fit(X_train, y_train, i[::-1])
    y_hat = clf1.predict(X_test2)
    a, b, c = clf1.get_acc(y_test2, y_hat)
    last4.append(a)
    last5.append(b)
    last6.append(c)
如果按照 [0.5, 0.5] 的先验证概率
准确度: 0.8888888888888888
女生的精确度: 0.6363636363636364
男生的精确度: 0.9705882352941176
如果按照 [0.6, 0.4] 的先验证概率
准确度: 0.8666666666666667
女生的精确度: 0.5833333333333334
男生的精确度: 0.9696969696969697
如果按照 [0.7, 0.3] 的先验证概率
准确度: 0.8555555555555555
女生的精确度: 0.56
男生的精确度: 0.9692307692307692
如果按照 [0.8, 0.2] 的先验证概率
准确度: 0.8111111111111111
女生的精确度: 0.4827586206896552
男生的精确度: 0.9672131147540983
import matplotlib
import matplotlib.pyplot as plt

# Use a CJK-capable font (SimHei) so the Chinese labels are not garbled.
matplotlib.rcParams['font.sans-serif'] = ['SimHei']

fig, ax = plt.subplots(1, 2, figsize=(15, 5))
# x axis: the first number of each prior pair used in the sweeps above.
x = [0.5, 0.6, 0.7, 0.8]

# Left panel: results on test_data1; right panel: results on test_data2.
# In each panel the red curve is the overall accuracy and the green/blue
# curves are the female/male precision returned by get_acc.
ax[0].plot(x, last1, "r", marker='o', ms=10, label="整体")
ax[0].plot(x, last2, "g", marker='*', ms=10, label="女士")
ax[0].plot(x, last3, "b", marker='*', ms=10, label="男士")
ax[1].plot(x, last4, "r", marker='o', ms=10, label="整体")
ax[1].plot(x, last5, "g", marker='*', ms=10, label="女士")
ax[1].plot(x, last6, "b", marker='*', ms=10, label="男士")

for _axis in ax:
    _axis.set_ylim(0.3, 1.05)
    _axis.set_xlabel('男生的先验概率')
    _axis.set_ylabel('准确度')
    _axis.legend()

# Fixed: the left title was missing its dataset number (test_data1 is Dataset2).
ax[0].set_title("Dataset2测试结果图示")
ax[1].set_title('Dataset3测试结果图示')
plt.savefig("a.jpg")
plt.show()
分析
随着先验概率的增加:
男性分类正确率呈上升趋势;
女性分类正确率呈下降趋势;
整体分类正确率在二者之间波动;
分析:
先验概率的选择对试验结果有一定的影响;
先验概率与数据集的整体分布有关
基于sklearn朴素贝叶斯模型
# Build the scikit-learn Gaussian naive Bayes model and train it on the
# training set (fit returns the estimator itself).
clf = GaussianNB().fit(X_train, y_train)
GaussianNB(priors=None)
# Check the model's accuracy on both test sets.
for _caption, _features, _labels in (
    ("tsst_data1的准确率", X_test1, y_test1),
    ("tsst_data2的准确率", X_test2, y_test2),
):
    print(_caption, clf.score(_features, _labels))
tsst_data1的准确率 0.8870967741935484
tsst_data2的准确率 0.8666666666666667
工作二
基于线性分类器进行性别分类,同样以dataset1作为训练样本,dataset2和dataset3作为测试样本。
基于线性判别分析
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 13 12:02:21 2020
fisher 算法
@author: lihuanyu
"""
#%%LDA算法的实现
def Calculate_means_cov(X, y):
    """Compute the class means and the Fisher (LDA) projection direction.

    Args:
        X: feature matrix of shape (n_samples, n_features).
        y: labels, 0 for the first class (girls) and 1 for the second (boys).

    Returns:
        Tuple ``(means, w)`` where ``means`` has shape (2, n_features) and
        ``w = S_w^{-1} (means[0] - means[1])`` has shape (n_features, 1).
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y)
    n_class = [0, 1]
    means = np.array([X[y == i].mean(axis=0) for i in n_class])

    # Within-class scatter: every class is centred on ITS OWN mean. The
    # previous code centred the boys on the girls' mean, which inflated the
    # scatter matrix and skewed the projection direction.
    n_features = means.shape[1]
    s_w = np.zeros((n_features, n_features))
    for cls, class_mean in zip(n_class, means):
        centered = X[y == cls] - class_mean
        s_w += centered.T.dot(centered)

    # The pseudo-inverse is SVD based like the original inversion, but it
    # also copes with a singular scatter matrix.
    s_w_inv = np.linalg.pinv(s_w)
    return (means, s_w_inv.dot((means[0] - means[1]).reshape(n_features, 1)))
#%% Decision rule
# Fit once and keep the result; an extra call whose return value was
# discarded has been removed.
mean, w = Calculate_means_cov(X_train, y_train)
# Projections of the two class means onto the Fisher direction w.
kernel_girl = np.dot(w.T, mean[0].reshape(len(mean[0]), 1))
kernel_boy = np.dot(w.T, mean[1].reshape(len(mean[1]), 1))
def Distinguish(X, y):
    """Classify every row of X and print the accuracy against y.

    A sample is projected onto the module-level direction ``w`` and is
    assigned class 1 when its projection is strictly closer to
    ``kernel_boy`` than to ``kernel_girl``; otherwise it is assigned class 0.
    """
    y_pre = []
    for sample in X:
        projection = np.dot(w.T, sample.reshape(2, 1))
        closer_to_boy = abs(projection - kernel_boy) < abs(projection - kernel_girl)
        y_pre.append(1 if closer_to_boy else 0)
    print(accuracy_score(y_pre, y))


Distinguish(X_test1, y_test1)
Distinguish(X_test2, y_test2)
#%% Plot the training data and the decision boundary
import matplotlib.pyplot as plt
from matplotlib.pylab import mpl
from matplotlib.ticker import FuncFormatter

plt.scatter(X_train[y_train == 0, 0], X_train[y_train == 0, 1], color='red')
plt.scatter(X_train[y_train == 1, 0], X_train[y_train == 1, 1], color='blue')

# The classifier assigns a sample to whichever projected class mean is
# nearer, so the boundary is the set of points whose projection equals the
# midpoint of the two projected means:
#     w0 * x + w1 * y = (kernel_girl + kernel_boy) / 2
# The previous expression `-(w[0]*0.1*line_x) / w[1]*0.1` ignored this
# threshold and, through operator precedence, also scaled the line by 0.01.
_w0, _w1 = w.ravel()
_threshold = ((kernel_girl + kernel_boy) / 2).item()
line_x = np.arange(150, 190)
line_y = (_threshold - _w0 * line_x) / _w1

plt.plot(line_x, line_y, linewidth=3.0, label='fisher boundary line ')
plt.legend(loc='upper right')
plt.xlabel('feature 1')
plt.ylabel('feature 2')
plt.show()
0.8870967741935484
0.8777777777777778
分析
线性分类器的效果并不是很理想,我们可以尝试利用 SVM 分类器或集成学习等方法来进一步改进。
工作三
以dataset1为训练数据库,进行基于Parzen窗方法的概率密度估计,并在0.5-0.5先验概率条件下进行基于最小错误率的贝叶斯分类,同时以dataset2和dataset3为测试数据库分析分类性能。
from sklearn.neighbors import KernelDensity
import numpy as np
from collections import Counter
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
# Fit one Gaussian-kernel density estimator per class on the training data.
# Density estimate for the male samples (label 1).
kde_man = KernelDensity(kernel='gaussian', bandwidth=0.75)
kde_man.fit(X_train[y_train == 1])
# Density estimate for the female samples (label 0).
kde_woman = KernelDensity(kernel='gaussian', bandwidth=0.75)
kde_woman.fit(X_train[y_train == 0])
# Test set 1: score_samples returns log-densities, so exponentiate them to
# get class-conditional densities (column 0 = female, column 1 = male).
a = kde_woman.score_samples(X_test1).reshape(-1, 1)
b = kde_man.score_samples(X_test1).reshape(-1, 1)
c = np.exp(np.hstack((a, b)))
# Apply equal priors, normalise to posteriors and pick the larger one.
probs = c * [0.5, 0.5]
prob_sum = probs.sum(axis=1)
b = np.argmax(probs / prob_sum[:, None], axis=1)
def get_acc(y_test, y_hat):
    """Print accuracy and per-class precision/recall, labelled for test_data1.

    Index 0 of the per-class scores is reported as female and index 1 as
    male, matching the 0/1 label encoding used in this file.
    """
    accuracy = (y_hat == y_test).sum() / len(y_hat)
    print("test_data1准确度:", accuracy)
    precision = precision_score(y_test, y_hat, average=None)
    print("女生的精确度:", precision[0], "\n男生的精确度:", precision[1])
    recall = recall_score(y_test, y_hat, average=None)
    print("女生的召回率:", recall[0], "\n男生的召回率:", recall[1])


get_acc(y_test1, b)
test_data1准确度: 0.8548387096774194
女生的精确度: 0.7619047619047619
男生的精确度: 0.9024390243902439
女生的召回率: 0.8
男生的召回率: 0.8809523809523809
# Test set 2: score_samples returns log-densities, so exponentiate them to
# get class-conditional densities (column 0 = female, column 1 = male).
a = kde_woman.score_samples(X_test2).reshape(-1, 1)
b = kde_man.score_samples(X_test2).reshape(-1, 1)
c = np.exp(np.hstack((a, b)))
# Apply equal priors, normalise to posteriors and pick the larger one.
probs = c * [0.5, 0.5]
prob_sum = probs.sum(axis=1)
b = np.argmax(probs / prob_sum[:, None], axis=1)
def get_acc(y_test, y_hat):
    """Print accuracy and per-class precision/recall, labelled for test_data2.

    Index 0 of the per-class scores is reported as female and index 1 as
    male, matching the 0/1 label encoding used in this file.
    """
    accuracy = (y_hat == y_test).sum() / len(y_hat)
    print("test_data2准确度:", accuracy)
    precision = precision_score(y_test, y_hat, average=None)
    print("女生的精确度:", precision[0], "\n男生的精确度:", precision[1])
    recall = recall_score(y_test, y_hat, average=None)
    print("女生的召回率:", recall[0], "\n男生的召回率:", recall[1])


get_acc(y_test2, b)
test_data2准确度: 0.8666666666666667
女生的精确度: 0.6428571428571429
男生的精确度: 0.9078947368421053
女生的召回率: 0.5625
男生的召回率: 0.9324324324324325
分析
我们的核密度估计采用的是高斯核,带宽为0.75。在先验概率相同的情况下(均为0.5),我们发现从总体上而言test_data1和test_data2的准确率都比较好,但是就各自的类别而言,男生的查准率和查全率都比较高,而女生的效果不是很理想,我认为这与先验概率有一定关系。