天天看点

R-聚类分析

输入data为矩阵,clusterNumbers为聚类的个数,该算法使用的是计量地理的思想。使用的测试数据如下:
           
0.0,0.375,0.483,1.749,1.516,0.375
0.0,0.0,0.776,1.596,1.336,1.743
0.0,0.0,0.0,1.926,1.662,2.154
0.0,0.0,0.0,0.0,0.501,0.693
0.0,0.0,0.0,0.0,0.0,0.589
0.0,0.0,0.0,0.0,0.0,0.0
           
# cluster(data, clusterNumbers)
#
# Agglomerative clustering driven by a pairwise distance table. On every pass
# the smallest non-zero entry of the working table is located, the rows/columns
# involved are merged into one cluster, and a new row/column holding the
# smallest distance from that cluster to every remaining entry is appended.
#
# Arguments
#   data           square matrix (or data frame) of pairwise distances. A cell
#                  holding 0 is treated as "no distance"; for every pair the
#                  larger of ori[a,b] and ori[b,a] is used, so a table that is
#                  filled on one side of the diagonal only is sufficient.
#                  NOTE(review): row names are converted with as.numeric() and
#                  compared against nrow(data), so the code assumes they are
#                  "1".."n" - confirm against callers.
#   clusterNumbers requested number of clusters. The loop only stops once the
#                  number of clusters equals this value AND no more than
#                  nrow(data)/20 rows are left in the working table (or the
#                  table is empty).
#
# Value
#   An n x n numeric matrix. Each non-zero row lists the members of one
#   cluster, left-aligned and padded with 0. The row name of a used row is the
#   internal id that was given to that cluster.
#
# Locals worth knowing
#   order1                names (ids) of the elements taking part in the merge
#   isexist               rows of clusterresult that already contain a member
#   newdata               members of the clusters being merged into one
#   Flag_DeleteRowAndCol  positions removed from the working table
#   Flag_DeleteName       names belonging to those positions
#   counDIM               id given to the next cluster row/column
#   calcdis               members of the cluster created in the current pass
#   newcluster            bookkeeping table, three rows per pass: the distances
#                         of the new cluster, and for each distance the two
#                         original elements that produced it
#   leaveMatrixDim        size of the working table at the start of the pass
#
# NOTE(review): when the minimum occurs at several cells in the same pass, all
# rows/columns involved are put into the same cluster.
cluster = function(data,clusterNumbers)
{
   name=rownames(data)
   ori=data
   counDIM=1+nrow(data)
   # Ids above this value denote clusters, ids up to it denote original elements.
   maxOriName=as.numeric(rownames(ori)[nrow(ori)])

   # Distance between two original elements, whichever triangle stores it.
   pairDistance=function(a,b)
   {
      max(ori[a,b],ori[b,a])
   }

   # Index of the last row that holds a non-zero value (0 for an empty matrix).
   lastUsedRow=function(m)
   {
      hits=which(m!=0, arr.ind = TRUE)
      if(length(hits)>0) max(hits[,1]) else 0
   }

   leaveMatrixDim=nrow(ori)
   clusterresult=matrix(0,leaveMatrixDim,leaveMatrixDim)
   rownames(clusterresult)=1:leaveMatrixDim
   colnames(clusterresult)=1:leaveMatrixDim

   newcluster=matrix(0,10*leaveMatrixDim,leaveMatrixDim)
   rownames(newcluster)=1:(10*leaveMatrixDim)
   colnames(newcluster)=1:(leaveMatrixDim)

   # First free row of newcluster; advances by three on every pass.
   finalDIMofData=nrow(data)+1
   m_iswhile=TRUE

   while(m_iswhile)
   {
      # Closest pair in the working table (zeros are ignored).
      minValue= min(data[data!=0])
      getrowandcol=which(data==minValue, arr.ind = TRUE)
      order1=c(getrowandcol)
      Flag_DeleteRowAndCol=order1
      Flag_DeleteName=as.numeric(rownames(data)[order1])
      order1=unique(Flag_DeleteName)
      leaveMatrixDim=nrow(data)

      # A cluster id takes part in the merge: look up, in newcluster, which
      # original elements produced this distance and continue with those.
      if(any(order1>maxOriName))
      {
         calcrc=which(newcluster == minValue, arr.ind = TRUE)
         for(i in seq_len(nrow(calcrc)))
         {
            calcrow=calcrc[i,1]
            calccol=calcrc[i,2]
            order1=c(order1, newcluster[c(calcrow+1,calcrow+2),calccol])
         }
         order1=order1[order1<=maxOriName]
         order1=unique(order1)
      }

      # Find the result rows that already contain one of the elements.
      for(ex in 1:length(order1))
      {
         if(ex==1)
         {
            isexist=which(clusterresult==order1[ex], arr.ind = TRUE)
         }else
         {
            isexist=rbind(isexist, which(clusterresult==order1[ex], arr.ind = TRUE))
         }
      }
      # Keep the row numbers only; they identify the clusters concerned.
      if(nrow(isexist)>0)
         isexist=unique(isexist[,1])

      # Case 1: the elements live in several clusters -> merge those clusters.
      if(length(isexist)>1)
      {
         newdata=c(clusterresult[isexist,])
         newdata=unique(c(newdata,order1))
         newdata=newdata[newdata>0]

         # Drop the merged rows and pad with empty rows at the bottom so that
         # used rows stay contiguous at the top. drop = FALSE keeps a matrix
         # even when a single row is left.
         clusterresult=clusterresult[-isexist,,drop=FALSE]
         for(i in seq_along(isexist))
         {
            clusterresult=rbind(clusterresult,rep(0,nrow(ori)))
         }

         irow=lastUsedRow(clusterresult)+1
         clusterresult[irow,seq_along(newdata)]=newdata
         rownames(clusterresult)[irow]=counDIM
      }
      # Case 2: exactly one cluster is concerned -> extend it.
      if(length(isexist)==1)
      {
         irow=isexist[1]
         # Only append members that are not in the row yet; appending the
         # duplicates as well could run past the last column.
         newMembers=setdiff(order1,clusterresult[irow,])
         icol=(max(which(clusterresult[irow,]!=0, arr.ind = TRUE)))+1
         if(length(newMembers)>0)
         {
            clusterresult[irow,icol+seq_along(newMembers)-1]=newMembers
         }
         rownames(clusterresult)[irow]=counDIM
      }
      # Case 3: none of the elements is clustered yet -> open a new cluster.
      if(length(isexist)<1)
      {
         irow=lastUsedRow(clusterresult)+1
         clusterresult[irow,seq_along(order1)]=order1
         rownames(clusterresult)[irow]=counDIM
      }

      # Names of the original elements that are still unclustered.
      name=name[!(name%in%as.character(Flag_DeleteName))]

      # Remove the merged rows/columns. drop = FALSE keeps the table
      # two-dimensional when only one row is left.
      data=data[-Flag_DeleteRowAndCol,-Flag_DeleteRowAndCol,drop=FALSE]

      # A cluster that has just been absorbed no longer needs its three
      # bookkeeping rows in newcluster.
      for(i in seq_along(Flag_DeleteName))
      {
         if(Flag_DeleteName[i]>nrow(ori))
         {
            calcrow=nrow(ori)+3*(Flag_DeleteName[i]-nrow(ori)-1)+1
            newcluster[c(calcrow,calcrow+1,calcrow+2),]=0
         }
      }

      # Remove duplicate members inside every used result row.
      for(i in seq_len(lastUsedRow(clusterresult)))
      {
         a=unique(clusterresult[i,(clusterresult[i,]>0)])
         clusterresult[i,]=0
         clusterresult[i,seq_along(a)]=a
      }

      # Continue while something is left to merge and the stop rule is not met.
      if(nrow(data)>0 && (lastUsedRow(clusterresult)!=clusterNumbers || nrow(data)>(nrow(ori)/20)))
      {
         leavename=rownames(data)

         # Members of the cluster created (or extended) in this pass.
         getculsterrow=which(rownames(clusterresult)==counDIM,arr.ind = TRUE)[1]
         calcdis=clusterresult[getculsterrow,which(clusterresult[getculsterrow,]!=0, arr.ind = TRUE)]

         for(i in seq_along(leavename))
         {
            other=as.numeric(leavename[i])
            if(other>nrow(ori))
            {
               # The remaining entry is itself a cluster: compare against all
               # of its members.
               otherrow=which(rownames(clusterresult)==other,arr.ind = TRUE)[1]
               disd=clusterresult[otherrow,which(clusterresult[otherrow,]!=0, arr.ind = TRUE)]
            }else
            {
               disd=other
            }

            # Smallest non-zero distance between the two groups; a 0 is only
            # kept when no pair has a non-zero distance.
            new1=pairDistance(calcdis[1],as.numeric(disd[1]))
            bestOther=as.numeric(disd[1])
            bestNew=calcdis[1]
            for(j in seq_along(calcdis))
            {
               for(k in seq_along(disd))
               {
                  new2=pairDistance(calcdis[j],as.numeric(disd[k]))
                  if(new1==0 || (new1>new2 && new2!=0))
                  {
                     new1=new2
                     bestOther=as.numeric(disd[k])
                     bestNew=calcdis[j]
                  }
               }
            }
            newcluster[finalDIMofData,i]=new1
            newcluster[finalDIMofData+1,i]=bestOther
            newcluster[finalDIMofData+2,i]=bestNew
         }

         # One distance per remaining entry (taken by position so that a
         # distance of 0 cannot shorten the column).
         add=newcluster[finalDIMofData,]
         add=add[seq_along(leavename)]
         data=cbind(data, t(t(add)))#new column: distances to the new cluster
         data=rbind(data,c(rep(0,(nrow(data)+1))))#new row: all zeros
         rownames(data)[nrow(data)]=counDIM
         colnames(data)[nrow(data)]=counDIM
         counDIM=counDIM+1

         # Later passes identify entries by name, so restore the names of the
         # remaining original elements (no-op when none are left).
         rownames(data)[seq_along(name)]=name
         colnames(data)[seq_along(name)]=name

         finalDIMofData=finalDIMofData+3
      }else
      {
         m_iswhile=FALSE
      }

      cat('待聚类元素个数:',leaveMatrixDim,'\n')
      cat('newcluster中共有数据行:',lastUsedRow(newcluster),'\n')
      cat('已完成聚类个数:',lastUsedRow(clusterresult),'\n')

   }
   clusterresult

}
# Scratch driver for cluster(). It relies on objects that are not created in
# this file (C, AREA_COST_Order) and on a local text file.
cluster(C, 1)

# Build the working table from AREA_COST_Order: clear row i from column i
# leftwards (the j == 0 step selects nothing), then clear every cell equal to 1.
b <- AREA_COST_Order
for (i in 1:nrow(AREA_COST_Order)) {
  for (j in i:0) {
    b[i, j] <- 0
  }
}
b[b == 1] <- 0

# cluster() identifies rows and columns by numeric names.
rownames(b) <- 1:nrow(b)
colnames(b) <- 1:nrow(b)
diag(b) <- 0

result_cluster <- cluster(b, 4)

# Globals mirroring the arguments of the call above.
data <- as.data.frame(b)

clusterNumbers <- 4

# Re-run on the table read from disk (no header line, comma separated).
b <- read.csv(
  "K:\\本科\\学习\\C++\\程序\\聚类分析\\ClusterAnalysis\\Table 7-5.txt",
  sep = ",",
  header = FALSE
)
cluster(b, 1)

# Last row of result_cluster that holds a non-zero value.
(max(which(result_cluster != 0, arr.ind = TRUE)[, 1]))

           
上一篇: R-矩阵
下一篇: R-导入数据