Use sklearn.datasets' make_blobs(N, n_features=2, centers=3, cluster_std=(1, 2.5, 1), random_state=2)
to generate three clusters with different variances, then rotate and stretch the data to make the clusters more irregular.
# a plain 2x2 matrix used to rotate and stretch the data
m = np.array(((1, 1), (1, 3)))
data_r = data3.dot(m)
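To see what this transform does, note that m maps the unit vectors (1, 0) and (0, 1) to (1, 1) and (1, 3): a shear plus a stretch, so round blobs become tilted ellipses. A minimal check (the variable names are just for illustration):
import numpy as np
m = np.array(((1, 1), (1, 3)))
axes = np.array([[1, 0], [0, 1]])  # the two coordinate axes
print(axes.dot(m))                 # [[1 1] [1 3]]: each axis is sheared and stretched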
#!/usr/bin/python
# -*- coding:utf-8 -*-
import numpy as np
import matplotlib.colors
import matplotlib.pyplot as plt
import sklearn.datasets as ds
from sklearn.cluster import KMeans
from sklearn.cluster import AffinityPropagation
from sklearn.metrics import euclidean_distances
from sklearn.cluster import MeanShift
from sklearn.cluster import AgglomerativeClustering
from sklearn.neighbors import kneighbors_graph
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import SpectralClustering
def expand(a, b):
d = (b - a) * 0.1
return a-d, b+d
if __name__ == "__main__":
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['axes.unicode_minus'] = False
N = 300
centers = 3
# the three clusters have different variances
data, y_raw = ds.make_blobs(N, n_features=2, centers=centers, cluster_std=(1, 2.5, 1), random_state=2)
# data3 keeps only 170 samples (100 + 50 + 20)
data3 = np.vstack((data[y_raw == 0][:], data[y_raw == 1][:50], data[y_raw == 2][:20]))
# y3: ground-truth labels for data3
y3 = np.array([0] * 100 + [1] * 50 + [2] * 20)
# a plain 2x2 matrix used to rotate and stretch the data
m = np.array(((1, 1), (1, 3)))
data_r = data3.dot(m)
Analyze the clustering quality of several different methods.
1 KMeans is the most commonly used clustering method. It only requires the number of clusters to be specified, but it is unsuitable for clusters with unequal variances or non-spherical shapes.
data_list = [data_r, data_r]
y_list = [y3, y3]
titles = ['Original data', 'KMeans']
model = KMeans(n_clusters=3, init='k-means++', n_init=5)
# white background; figure size 8 x 9 inches
plt.figure(figsize=(8, 9), facecolor='w')
for i, (x, y, title) in enumerate(zip(data_list, y_list, titles), start=1):
plt.subplot(2, 1, i)
plt.title(title)
if i % 2 == 1:
y_pred = y
else:
y_pred = model.fit_predict(x)
plt.scatter(x[:, 0], x[:, 1], c=y_pred, s=30, edgecolors='none')
x1_min, x2_min = np.min(x, axis=0)
x1_max, x2_max = np.max(x, axis=0)
x1_min, x1_max = expand(x1_min, x1_max)
x2_min, x2_max = expand(x2_min, x2_max)
plt.xlim((x1_min, x1_max))
plt.ylim((x2_min, x2_max))
plt.grid(True, ls=':')
plt.tight_layout(pad=2, rect=(0, 0, 1, 0.97))
plt.suptitle('Effect of the data distribution on KMeans clustering', fontsize=18)
plt.show()
As the figure shows, KMeans partitions the data as if all clusters were spherical Gaussians with equal variance, which does not match these stretched, unequal-variance clusters.
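Beyond eyeballing the figure, the mismatch can be quantified with the adjusted Rand index, which compares the KMeans assignment against the true labels (1.0 would be a perfect match). A minimal sketch, assuming the variables from the script above are in scope:
from sklearn.metrics import adjusted_rand_score
y_km = model.fit_predict(data_r)  # KMeans labels on the rotated data
print('ARI:', adjusted_rand_score(y3, y_km))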
2 MeanShift requires a bandwidth to be set.
# median pairwise Euclidean distance sets the scale for the bandwidth sweep
m = euclidean_distances(data_r)
preference = -np.median(m)
print('Preference:', preference)
plt.figure(figsize=(12, 9), facecolor='w')
for i, mul in enumerate(np.linspace(0.1, 0.4, 8)):
# eight bandwidth multipliers between 0.1 and 0.4
band_width = -mul * preference  # preference is negative, so band_width > 0
model = MeanShift(bin_seeding=True, bandwidth=band_width)
ms = model.fit(data_r)
centers = ms.cluster_centers_
y_hat = ms.labels_
n_clusters = np.unique(y_hat).size
print('multiplier:', mul, 'bandwidth:', band_width, 'clusters found:', n_clusters)
plt.subplot(2, 4, i+1)
plt.title('Bandwidth: %.2f, clusters: %d' % (band_width, n_clusters))
# interpolate hex colors from red (0xFF0000 = 16711680) to blue (0x0000FF = 255)
clrs = []
for c in np.linspace(16711680, 255, n_clusters, dtype=int):
clrs.append('#%06x' % c)
# clrs = plt.cm.Spectral(np.linspace(0, 1, n_clusters))
for k, clr in enumerate(clrs):
cur = (y_hat == k)
plt.scatter(data_r[cur, 0], data_r[cur, 1], c=clr, edgecolors='none')
plt.scatter(centers[:, 0], centers[:, 1], s=150, c=clrs, marker='*', edgecolors='k')
plt.grid(True, ls=':')
plt.tight_layout(pad=2)
plt.suptitle('MeanShift clustering', fontsize=15)
plt.subplots_adjust(top=0.9)
plt.show()
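Rather than sweeping multiples of the median distance by hand, sklearn also provides estimate_bandwidth, which derives a bandwidth from a quantile of the pairwise distances (quantile=0.2 below is just an illustrative choice). A minimal sketch, reusing data_r from above:
from sklearn.cluster import MeanShift, estimate_bandwidth
bw = estimate_bandwidth(data_r, quantile=0.2)  # quantile of pairwise point distances
ms2 = MeanShift(bandwidth=bw, bin_seeding=True).fit(data_r)
print('estimated bandwidth:', bw, 'clusters:', np.unique(ms2.labels_).size)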
3 AffinityPropagation does not need the number of clusters; adjusting the preference value produces different clusterings.
# pairwise squared Euclidean distances
m = euclidean_distances(data_r, squared=True)
preference = -np.median(m)
plt.figure(figsize=(12, 9), facecolor='w')
for i, mul in enumerate(np.linspace(1, 10, 16)):
p = mul * preference
model = AffinityPropagation(affinity='euclidean', preference=p)
af = model.fit(data_r)
center_indices = af.cluster_centers_indices_
n_clusters = len(center_indices)
print('multiplier = %.1f' % mul, 'preference =', p, 'clusters found:', n_clusters)
y_hat = af.labels_
plt.subplot(4, 4, i + 1)
plt.title('Preference: %.2f, clusters: %d' % (p, n_clusters))
# same red-to-blue hex gradient as in the MeanShift section
clrs = []
for c in np.linspace(16711680, 255, n_clusters, dtype=int):
clrs.append('#%06x' % c)
# clrs = plt.cm.Spectral(np.linspace(0, 1, n_clusters))
for k, clr in enumerate(clrs):
cur = (y_hat == k)
plt.scatter(data_r[cur, 0], data_r[cur, 1], s=15, c=clr, edgecolors='none')
center = data_r[center_indices[k]]
for x in data_r[cur]:
plt.plot([x[0], center[0]], [x[1], center[1]], color=clr, lw=0.5, zorder=1)
plt.scatter(data_r[center_indices, 0], data_r[center_indices, 1], s=80, c=clrs, marker='*', edgecolors='k',
zorder=2)
plt.grid(True, ls=':')
plt.tight_layout()
plt.suptitle('Affinity Propagation clustering', fontsize=20)
plt.subplots_adjust(top=0.92)
plt.show()
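When preference is left unset, AffinityPropagation defaults it to the median of the input similarities, and the n_iter_ attribute reveals whether the message passing converged before max_iter. A short sketch, reusing data_r from above:
ap = AffinityPropagation(affinity='euclidean').fit(data_r)  # default preference = median similarity
print('clusters:', len(ap.cluster_centers_indices_), 'iterations used:', ap.n_iter_)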
4 Hierarchical clustering also needs the number of clusters, a distance metric, and a linkage criterion that defines how similarity between two clusters is compared: "ward", "complete", "average".
Judging from the results, ward produces by far the best clustering here.
plt.figure(figsize=(10, 8), facecolor='w')
linkages = ("ward", "complete", "average")
plt.subplot(2, 2, 1)
plt.scatter(data_r[:, 0], data_r[:, 1], c=y3, s=12, edgecolors='k')
plt.title('Original data', fontsize=12)
plt.grid(True, ls=':')
data_min1, data_min2 = np.min(data_r, axis=0)
data_max1, data_max2 = np.max(data_r, axis=0)
plt.xlim(expand(data_min1, data_max1))
plt.ylim(expand(data_min2, data_max2))
connectivity = kneighbors_graph(data_r, n_neighbors=3, mode='distance', metric='minkowski', p=2,
include_self=True)
connectivity = 0.5 * (connectivity + connectivity.T)  # symmetrize the kNN graph
for i, linkage in enumerate(linkages):
ac = AgglomerativeClustering(n_clusters=3, affinity='euclidean',
connectivity=connectivity, linkage=linkage)
ac.fit(data_r)
y = ac.labels_
plt.subplot(2, 2, i + 2)
plt.scatter(data_r[:, 0], data_r[:, 1], c=y, s=12, edgecolors='k')
plt.title(linkage, fontsize=12)
plt.grid(True, ls=':')
plt.xlim(expand(data_min1, data_max1))
plt.ylim(expand(data_min2, data_max2))
plt.suptitle('Different merge strategies for hierarchical clustering', fontsize=15)
plt.tight_layout(pad=0.5, rect=(0, 0, 1, 0.95))
plt.show()
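The merge sequence behind agglomerative clustering can also be drawn as a dendrogram; a minimal sketch using SciPy (assumed to be installed alongside sklearn), again on data_r:
from scipy.cluster.hierarchy import linkage, dendrogram
Z = linkage(data_r, method='ward')  # full merge tree under ward linkage
dendrogram(Z, no_labels=True)       # the height of each merge is the inter-cluster distance
plt.show()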
5 Density-based clustering (DBSCAN) does not need the number of clusters; it needs a neighborhood radius (eps) and a minimum number of supporting samples. As the radius and the sample count grow, the number of clusters found decreases; a poorly chosen radius or count will not give a usable result.
# (eps, min_samples) pairs; only the first 8 are plotted below
params = ((0.1, 2), (0.1, 3), (0.2, 3), (0.3, 3), (0.5, 3), (0.5, 5), (0.5, 10), (1., 3), (1., 10), (1., 20))
plt.figure(figsize=(9, 17), facecolor='w')
plt.suptitle('DBSCAN clustering', fontsize=15)
data1 = StandardScaler().fit_transform(data_r)  # standardize so the eps values are on a common scale
for i in range(8):
eps, min_samples = params[i]
model = DBSCAN(eps=eps, min_samples=min_samples)
model.fit(data1)
y_hat = model.labels_
core_indices = np.zeros_like(y_hat, dtype=bool)
core_indices[model.core_sample_indices_] = True
y_unique = np.unique(y_hat)
n_clusters = y_unique.size - (1 if -1 in y_hat else 0)
print(y_unique, 'clusters found:', n_clusters)
plt.subplot(2, 4, i + 1)
clrs = plt.cm.Spectral(np.linspace(0, 0.8, y_unique.size))
for k, clr in zip(y_unique, clrs):
cur = (y_hat == k)
if k == -1:
plt.scatter(data1[cur, 0], data1[cur, 1], s=10, c='k')
continue
plt.scatter(data1[cur, 0], data1[cur, 1], s=15, c=clr, edgecolors='k')
plt.scatter(data1[cur & core_indices][:, 0], data1[cur & core_indices][:, 1], s=30, c=clr, marker='o',
edgecolors='k')
x1_min, x2_min = np.min(data1, axis=0)
x1_max, x2_max = np.max(data1, axis=0)
x1_min, x1_max = expand(x1_min, x1_max)
x2_min, x2_max = expand(x2_min, x2_max)
plt.xlim((x1_min, x1_max))
plt.ylim((x2_min, x2_max))
plt.grid(True, ls=':', color='#606060')
plt.title(r'$\epsilon$ = %.1f, m = %d, clusters: %d' % (eps, min_samples, n_clusters), fontsize=12)
plt.tight_layout()
plt.subplots_adjust(top=0.9)
plt.show()
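A common heuristic for picking eps is the k-distance plot: sort every point's distance to its k-th nearest neighbor and look for the elbow; eps values near the elbow tend to separate dense regions from noise. A sketch on the standardized data1, with k = 5 as an illustrative stand-in for min_samples:
from sklearn.neighbors import NearestNeighbors
k = 5
dist, _ = NearestNeighbors(n_neighbors=k + 1).fit(data1).kneighbors(data1)
plt.plot(np.sort(dist[:, -1]))  # last column: distance to the k-th true neighbor (column 0 is the point itself)
plt.ylabel('distance to %d-th nearest neighbor' % k)
plt.show()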
6 Spectral clustering needs the number of clusters, a similarity measure, and the variance s.
Different variance values give different clusterings; here a variance of about 21.54 comes close to the original labels.
m_distance = euclidean_distances(data_r, squared=True)
plt.figure(figsize=(12, 8), facecolor='w')
plt.suptitle('Spectral clustering', fontsize=16)
clrs = plt.cm.Spectral(np.linspace(0, 0.8, 3))
for i, s in enumerate(np.logspace(-4, 4, 16)):
# note: m_distance already holds squared distances, so this kernel is effectively exp(-d^4 / s^2)
af = np.exp(-m_distance ** 2 / (s ** 2)) + 1e-6
model = SpectralClustering(n_clusters=3, affinity='precomputed', assign_labels='kmeans',
random_state=1)
# spectral clustering is fit on the pairwise similarity matrix between samples
y_hat = model.fit_predict(af)
plt.subplot(4, 4, i + 1)
for k, clr in enumerate(clrs):
cur = (y_hat == k)
plt.scatter(data_r[cur, 0], data_r[cur, 1], s=40, c=clr, edgecolors='k')
x1_min, x2_min = np.min(data_r, axis=0)
x1_max, x2_max = np.max(data_r, axis=0)
x1_min, x1_max = expand(x1_min, x1_max)
x2_min, x2_max = expand(x2_min, x2_max)
plt.xlim((x1_min, x1_max))
plt.ylim((x2_min, x2_max))
plt.grid(True, ls=':', color='#808080')
plt.title(r'$\sigma$ = %.2f' % s, fontsize=13)
plt.tight_layout()
plt.subplots_adjust(top=0.9)
plt.show()
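Building the affinity matrix by hand makes the role of the variance explicit, but SpectralClustering can also compute a standard RBF affinity internally via the gamma parameter, which plays the role of 1/sigma^2 in exp(-gamma * d^2). Since the manual kernel above squares the already-squared distances, the two are not numerically identical; the gamma value below is purely illustrative:
sc = SpectralClustering(n_clusters=3, affinity='rbf', gamma=0.002, random_state=1)
y_sc = sc.fit_predict(data_r)  # affinity computed internally as exp(-gamma * d^2)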
7 Gaussian mixture models: fit with the EM algorithm; unlike KMeans, each component models its own covariance.
8 Gaussian mixture model under a Dirichlet-process prior (DPGMM), which can shrink away unneeded components.
#!/usr/bin/python
# -*- coding:utf-8 -*-
import numpy as np
import matplotlib.colors
import matplotlib.pyplot as plt
import sklearn.datasets as ds
from sklearn.cluster import KMeans
from sklearn.metrics import euclidean_distances
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
def expand(a, b):
d = (b - a) * 0.1
return a-d, b+d
if __name__ == "__main__":
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['axes.unicode_minus'] = False
N = 300
centers = 3
# the three clusters have different variances
data, y_raw = ds.make_blobs(N, n_features=2, centers=centers, cluster_std=(1, 2.5, 1), random_state=2)
# data3 keeps only 170 samples (100 + 50 + 20)
data3 = np.vstack((data[y_raw == 0][:], data[y_raw == 1][:50], data[y_raw == 2][:20]))
# y3: ground-truth labels for data3
y3 = np.array([0] * 100 + [1] * 50 + [2] * 20)
# a plain 2x2 matrix used to rotate and stretch the data
m = np.array(((1, 1), (1, 3)))
data_r = data3.dot(m)
model = KMeans(n_clusters=3, init='k-means++', n_init=5)
model.fit(data_r)
gmm = GaussianMixture(n_components=3, covariance_type='full', random_state=0)
gmm.fit(data_r)
dpgmm = BayesianGaussianMixture(n_components=3, covariance_type='full', max_iter=1000, n_init=5,
weight_concentration_prior_type='dirichlet_process', weight_concentration_prior=0.1)
dpgmm.fit(data_r)
modelList = [model, gmm, dpgmm]
titles = ['Original data', 'KMeans', 'Gaussian mixture model', 'DPGMM']
plt.figure(figsize=(10, 10), facecolor='w')
for i, title in enumerate(titles):
plt.subplot(2, 2, i+1)
plt.title(title)
if i == 0:
y_pred = y3
else:
y_pred = modelList[i-1].predict(data_r)
plt.scatter(data_r[:, 0], data_r[:, 1], c=y_pred, s=30, edgecolors='none')
x1_min, x2_min = np.min(data_r, axis=0)
x1_max, x2_max = np.max(data_r, axis=0)
x1_min, x1_max = expand(x1_min, x1_max)
x2_min, x2_max = expand(x2_min, x2_max)
plt.xlim((x1_min, x1_max))
plt.ylim((x2_min, x2_max))
plt.grid(True, ls=':')
plt.tight_layout(pad=2, rect=(0, 0, 1, 0.97))
plt.suptitle('Gaussian mixture model clustering', fontsize=18)
plt.show()
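When the number of components is not known up front, GaussianMixture exposes bic() and aic() for model selection; a minimal sketch sweeping n_components on data_r:
bics = []
for k in range(1, 7):
    g = GaussianMixture(n_components=k, covariance_type='full', random_state=0).fit(data_r)
    bics.append(g.bic(data_r))  # lower BIC = better fit/complexity trade-off
print('best n_components by BIC:', int(np.argmin(bics)) + 1)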