1.項目背景 在做交通路線分析的時候,客戶需要找出車輛的行車規律,我們将車輛每天的行車路線當做一個資料樣本,總共有365天或是更多,從這些資料中通過聚類來獲得行車路線規律統計分析。
我首先想到是K-means算法,不過它的算法思想是任選K個中心點,然後不停的疊代,在疊代的過程中需要不停的更新中心點。在我們着這個項目中,此方案不能解決,因為我們是通過編輯距離來計算兩條路線的相似度。可以參考(1.交通聚類:編輯距離 (Levenshtein距離)Java實作) 這篇文章了解一下編輯距離。當我們第一步選出k個中心點後,并且兩兩計算編輯距離,然後再重新選擇中心點,這時問題出來了,我們得到了編輯距離的均值,是數字化的。如果在根據這個均值兩兩比較編輯距離就無法實作了。故此方案廢棄。
2.層次聚類概述(Hierarchical Clustering) 基于層次的聚類方法(系統聚類方法):對給定的資料集進行層次分解,直到某種條件滿足為止。
1.凝聚的層次聚類:自底向上,首先将每個對象作為一個族開始,每一步合并兩個最近的簇,直到滿足簇的數目。如AGNES算法。每一項自成一類;疊代,将最近的兩類合并為一類。
2.分裂的層次聚類: 自頂向下,從包含所有對象的一個簇開始,每一步分裂一個簇,直到簇的數目。如DLANA算法。将所有項看做一類;找出最不相似的項分裂出去成為兩類。 相信大家讀完上邊的陳述已經明白是怎麼回事了。下邊是代碼,供以後用到的朋友學習研究。
3.代碼: package agenes;
import java.util.ArrayList;
import java.util.List;
public class Cluster {
private List dataPoints = new ArrayList(); // 類簇中的樣本點
private String clusterName;
public List getDataPoints() {
return dataPoints;
}
public void setDataPoints(List dataPoints) {
this.dataPoints = dataPoints;
}
public String getClusterName() {
return clusterName;
}
public void setClusterName(String clusterName) {
this.clusterName = clusterName;
}
}
package agenes;
import java.util.ArrayList;
import java.util.List;
public class ClusterAnalysis {
public List startAnalysis(List dataPoints,int ClusterNum){
List finalClusters=new ArrayList();
List originalClusters=initialCluster(dataPoints);
finalClusters=originalClusters;
while(finalClusters.size()>ClusterNum){
double min=Double.MAX_VALUE;
int mergeIndexA=0;
int mergeIndexB=0;
for(int i=0;i
for(int j=0;j
if(i!=j){
Cluster clusterA=finalClusters.get(i);
Cluster clusterB=finalClusters.get(j);
List dataPointsA=clusterA.getDataPoints();
List dataPointsB=clusterB.getDataPoints();
for(int m=0;m
for(int n=0;n
double tempDis=getDistance(dataPointsA.get(m),dataPointsB.get(n));
if(tempDis
min=tempDis;
mergeIndexA=i;
mergeIndexB=j;
}
}
}
}
} //end for j
}// end for i
//合并cluster[mergeIndexA]和cluster[mergeIndexB]
finalClusters=mergeCluster(finalClusters,mergeIndexA,mergeIndexB);
}//end while
return finalClusters;
}
private List mergeCluster(List clusters,int mergeIndexA,int mergeIndexB){
if (mergeIndexA != mergeIndexB) {
// 将cluster[mergeIndexB]中的DataPoint加入到 cluster[mergeIndexA]
Cluster clusterA = clusters.get(mergeIndexA);
Cluster clusterB = clusters.get(mergeIndexB);
List dpA = clusterA.getDataPoints();
List dpB = clusterB.getDataPoints();
for (DataPoint dp : dpB) {
DataPoint tempDp = new DataPoint();
// tempDp.setDataPointName(dp.getDataPointName());
// tempDp.setDimensioin(dp.getDimensioin());
// tempDp.setCluster(clusterA);
tempDp = dp;
tempDp.setCluster(clusterA);
dpA.add(tempDp);
}
clusterA.setDataPoints(dpA);
// List clusters中移除cluster[mergeIndexB]
clusters.remove(mergeIndexB);
}
return clusters;
}
// 初始化類簇
private List initialCluster(List dataPoints){
List originalClusters=new ArrayList();
for(int i=0;i
DataPoint tempDataPoint=dataPoints.get(i);
List tempDataPoints=new ArrayList();
tempDataPoints.add(tempDataPoint);
Cluster tempCluster=new Cluster();
tempCluster.setClusterName("Cluster "+String.valueOf(i));
tempCluster.setDataPoints(tempDataPoints);
tempDataPoint.setCluster(tempCluster);
originalClusters.add(tempCluster);
}
return originalClusters;
}
//計算兩個樣本點之間的歐幾裡得距離
private double getDistance(DataPoint dpA, DataPoint dpB){
double distance=0;
double[] dimA = dpA.getDimensioin();
double[] dimB = dpB.getDimensioin();
if (dimA.length == dimB.length) {
for (int i = 0; i < dimA.length; i++) {
double temp=Math.pow((dimA[i]-dimB[i]),2);
distance=distance+temp;
}
distance=Math.pow(distance, 0.5);
}
return distance;
}
public static void main(String[] args){
ArrayList dpoints = new ArrayList();
double[] a={2,3};
double[] b={2,4};
double[] c={1,4};
double[] d={1,3};
double[] e={2,2};
double[] f={3,2};
double[] g={8,7};
double[] h={8,6};
double[] i={7,7};
double[] j={7,6};
double[] k={8,5};
// double[] l={100,2};//孤立點
double[] m={8,20};
double[] n={8,19};
double[] o={7,18};
double[] p={7,17};
double[] q={8,20};
dpoints.add(new DataPoint(a,"a"));
dpoints.add(new DataPoint(b,"b"));
dpoints.add(new DataPoint(c,"c"));
dpoints.add(new DataPoint(d,"d"));
dpoints.add(new DataPoint(e,"e"));
dpoints.add(new DataPoint(f,"f"));
dpoints.add(new DataPoint(g,"g"));
dpoints.add(new DataPoint(h,"h"));
dpoints.add(new DataPoint(i,"i"));
dpoints.add(new DataPoint(j,"j"));
dpoints.add(new DataPoint(k,"k"));
// dataPoints.add(new DataPoint(l,"l"));
// dpoints.add(new DataPoint(l,"l"));
dpoints.add(new DataPoint(m,"m"));
dpoints.add(new DataPoint(n,"n"));
dpoints.add(new DataPoint(o,"o"));
dpoints.add(new DataPoint(p,"p"));
dpoints.add(new DataPoint(q,"q"));
int clusterNum=3; //類簇數
ClusterAnalysis ca=new ClusterAnalysis();
List clusters=ca.startAnalysis(dpoints, clusterNum);
for(Cluster cl:clusters){
System.out.println("------"+cl.getClusterName()+"------");
List tempDps=cl.getDataPoints();
for(DataPoint tempdp:tempDps){
System.out.println(tempdp.getDataPointName());
}
}
}
}
package agenes;
public class DataPoint {
String dataPointName; // 樣本點名
Cluster cluster; // 樣本點所屬類簇
private double dimensioin[]; // 樣本點的次元
public DataPoint() {
}
public DataPoint(double[] dimensioin, String dataPointName) {
this.dataPointName = dataPointName;
this.dimensioin = dimensioin;
}
public double[] getDimensioin() {
return dimensioin;
}
public void setDimensioin(double[] dimensioin) {
this.dimensioin = dimensioin;
}
public Cluster getCluster() {
return cluster;
}
public void setCluster(Cluster cluster) {
this.cluster = cluster;
}
public String getDataPointName() {
return dataPointName;
}
public void setDataPointName(String dataPointName) {
this.dataPointName = dataPointName;
}
}