天天看點

從原始文檔到KNN分類算法實作(一)

    1. 建立原始文檔

      ~/corpus/C1下建三個檔案:0,1,2。内容分别為:

      眼睛 明亮 健康 身體 發達
      1 高大 身材 胳膊 勇猛 四肢
      2 胳膊 健康 身體 高大 健康
      ~/corpus/C2下建三個檔案:3,4,5。内容分别為:
      3 美國 很 發達
      4 美國 經濟 富強
      5 日本 富強 科技 發達
    2. 建立檔案夾~/frequency,運作程式WordCount.java。該程式的輸入檔案是~/corpus,運作結束後在~/frequency下産生了6個新檔案:0—5,分别記錄了原始文檔中每個詞出現的次數。

      View Code

      1 /**
       2  * Author: Orisun
       3  * Date: Aug 29, 2011
       4  * FileName: WordCount.java
       5  * Function: calculate word frequency in each document.
       6  */
       7 package featureselect;
       8 
       9 import java.io.BufferedReader;
      10 import java.io.BufferedWriter;
      11 import java.io.File;
      12 import java.io.FileReader;
      13 import java.io.FileWriter;
      14 import java.util.HashMap;
      15 import java.util.Iterator;
      16 import java.util.Map.Entry;
      17 
      18 public class WordCount {
      19 
      20     //參數檔案是(經過預處理的)原始訓練文檔集所在的檔案夾
      21     public void wordCount(File srcFile) {
      22         if (srcFile.isDirectory()) {
      23             File[] children = srcFile.listFiles();
      24             for (File child : children) {
      25                 wordCount(child);
      26             }
      27         } else if (srcFile.isFile()) {
      28             HashMap<String,Integer> wordfreq=new HashMap<String,Integer>();        //存儲單詞出現的次數
      29             try {
      30                 FileReader fr = new FileReader(srcFile);
      31                 BufferedReader br = new BufferedReader(fr);
      32                 String line=null;
      33                 while((line=br.readLine())!=null){
      34                     String[] words=line.split("\\s+");
      35                     for(String word:words){
      36                         if(!wordfreq.containsKey(word)){
      37                             wordfreq.put(word, new Integer(1));
      38                         }else{
      39                             int n=wordfreq.get(word);
      40                             wordfreq.put(word, new Integer(n+1));
      41                         }
      42                     }
      43                 }
      44                 br.close();
      45                 
      46                 //把詞頻寫入新檔案
      47                 File newFile=new File("/home/orisun/frequency",srcFile.getName());
      48                 newFile.createNewFile();
      49                 FileWriter fw=new FileWriter(newFile);
      50                 BufferedWriter bw=new BufferedWriter(fw);
      51                 Iterator<Entry<String, Integer>> iter=wordfreq.entrySet().iterator();
      52                 while(iter.hasNext()){
      53                     Entry<String,Integer> entry=iter.next();
      54                     bw.write(entry.getKey());
      55                     bw.write("\t");
      56                     bw.write(String.valueOf(entry.getValue()));
      57                     bw.newLine();
      58                 }
      59                 bw.flush();
      60                 bw.close();
      61             } catch (Exception e) {
      62                 e.printStackTrace();
      63             }
      64         }
      65     }
      66     
      67     public static void main(String[] args){
      68         WordCount inst=new WordCount();
      69         File file=new File("/home/orisun/corpus");
      70         if(!file.exists()){
      71             System.out.println("檔案不存在,程式退出.");
      72             System.exit(2);
      73         }
      74         inst.wordCount(file);
      75     }
      76 }      
    3. 下一步是運作WordDocMatrix.java。運作時它的輸入是第2步産生的~/frequency下的六個檔案,輸出是一個檔案~/matrix,該檔案記錄了每個單詞在各個文檔中出現的次數。
      1 2 3 4 5
      發達 1 1 1
      健康 1 2
      富強 1 1
      日本 1
      四肢 1
      眼睛 1
      高大 1 1
      1
      身體 1 1
      身材 1
      明亮 1
      胳膊 1 1
      經濟 1
      勇猛 1
      科技 1
      美國 1 1

View Code

/**
 * Author: Orisun
 * Date: Aug 29, 2011
 * FileName: WordDocMatrix.java
 * Function: word-document matrix denote each word frequency in each document.
 */
package featureselect;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map.Entry;

public class WordDocMatrix {

    // The training set has classnumber classes with docnumber/classnumber
    // documents each (documents are numbered 0 .. docnumber-1).
    public static final int docnumber = 3000;
    public static final int classnumber = 3;
    // word -> per-document occurrence counts, indexed by document number
    HashMap<String, ArrayList<Short>> matrix = new HashMap<String, ArrayList<Short>>();

    /**
     * Argument is a folder whose files each hold "word&lt;TAB&gt;count" lines for
     * one document; each file NAME must be the numeric document id.
     */
    public void buildMatrix(File srcFile) {
        if (srcFile.isDirectory()) {
            File[] children = srcFile.listFiles();
            for (File child : children) {
                buildMatrix(child);
            }
        } else if (srcFile.isFile()) {
            int filename = Integer.parseInt(srcFile.getName());
            BufferedReader br = null;
            try {
                br = new BufferedReader(new FileReader(srcFile));
                String line;
                while ((line = br.readLine()) != null) {
                    String[] pair = line.split("\\s+");
                    ArrayList<Short> al = matrix.get(pair[0]);
                    if (al == null) {
                        // first time we see the word: start with an all-zero row
                        al = new ArrayList<Short>(docnumber);
                        for (int i = 0; i < docnumber; i++)
                            al.add((short) 0);
                        matrix.put(pair[0], al);
                    }
                    short orig = al.get(filename);
                    al.set(filename, (short) (orig + Short.parseShort(pair[1])));
                }
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                // the original leaked the reader when an exception occurred
                try { if (br != null) br.close(); } catch (Exception ignored) { }
            }
        }
    }

    // Print the matrix to a file for verification (printing everything would
    // make the file too large to load, possibly even to open).
    public static void main(String[] args) {
        WordDocMatrix inst = new WordDocMatrix();
        try {
            File Mfile = new File("/home/orisun/freq");
            if (!Mfile.exists()) {
                System.out.println("檔案不存在,程式退出.");
                System.exit(2);
            }
            inst.buildMatrix(Mfile);
            File file = new File("/home/orisun/matrix");
            file.createNewFile();
            PrintStream ps = new PrintStream(file);
            inst.printMatrix(ps, inst.matrix);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Write the matrix as "word&lt;TAB&gt;c0&lt;TAB&gt;c1&lt;TAB&gt;..." lines.
     * NOTE: this flushes AND closes the supplied stream, so do not pass
     * System.out if you still need it afterwards.
     */
    public void printMatrix(PrintStream out, HashMap<String, ArrayList<Short>> matrix) {
        try {
            for (Entry<String, ArrayList<Short>> entry : matrix.entrySet()) {
                out.print(entry.getKey());
                out.print("\t");
                for (int i = 0; i < docnumber; i++) {
                    out.print(String.valueOf(entry.getValue().get(i)));
                    out.print("\t");
                }
                out.println();
            }
            out.flush();
            out.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

4.IG.java在整個算法過程中也不必單獨運作,它隻是為後面的特征詞選擇提供函數調用。運作時它的輸入是~/frequency下的六個檔案,通過調用initMatrix從檔案讀入word-doc矩陣,最後計算并輸出各個詞的資訊增益值IG。

發達 0.0566330123
健康 0.3182570841
富強 0.3182570841
日本 0.1323041247
四肢 0.1323041247
眼睛 0.1323041247
高大 0.3182570841
0.1323041247
身體 0.3182570841
身材 0.1323041247
明亮 0.1323041247
胳膊 0.3182570841
經濟 0.1323041247
勇猛 0.1323041247
科技 0.1323041247
美國 0.3182570841

View Code

/**
 * Author: Orisun
 * Date: Aug 29, 2011
 * FileName: IG.java
 * Function: calculate the information gain of each word in documents
 */
package featureselect;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map.Entry;

public class IG {

    // word -> per-document occurrence counts (docnumber entries per word)
    public static HashMap<String, ArrayList<Short>> matrix = new HashMap<String, ArrayList<Short>>();
    HashMap<String, Double> map = new HashMap<String, Double>(); // information-gain value of each word

    /**
     * Load the word-document matrix from srcFile. Each line is
     * "word c0 c1 ... c(docnumber-1)".
     */
    public static void initMatrix(File srcFile) {
        if (!(srcFile.exists() && srcFile.isFile())) {
            System.out.println("Matrix檔案不存在.程式退出.");
            System.exit(2);
        }
        BufferedReader br = null;
        try {
            br = new BufferedReader(new FileReader(srcFile));
            String line;
            while ((line = br.readLine()) != null) {
                String[] content = line.split("\\s+");
                String word = content[0];
                ArrayList<Short> al = new ArrayList<Short>(WordDocMatrix.docnumber);
                for (int i = 0; i < WordDocMatrix.docnumber; i++) {
                    al.add(Short.parseShort(content[i + 1]));
                }
                matrix.put(word, al);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // close on the error path too (the original leaked on exception)
            try { if (br != null) br.close(); } catch (IOException ignored) { }
        }
    }

    /**
     * Compute the information gain of every word in the matrix and store the
     * results in {@code map}.
     *
     * NOTE(review): the srcFile parameter is never used — the matrix is always
     * (re-)loaded from the hard-coded /home/orisun/matrix path. The parameter
     * is kept only for interface compatibility with existing callers.
     */
    public void calIG(File srcFile) {
        initMatrix(new File("/home/orisun/matrix"));
        // H(C): class entropy, assuming uniformly sized classes
        double entropy = Math.log(WordDocMatrix.classnumber);
        for (Entry<String, ArrayList<Short>> entry : matrix.entrySet()) {
            String word = entry.getKey();
            ArrayList<Short> al = entry.getValue();
            int category = WordDocMatrix.docnumber / WordDocMatrix.classnumber; // docs per class
            int wcount = 0; // number of documents containing the word
            int[] wcount_class = new int[WordDocMatrix.classnumber]; // per-class count of docs containing the word
            double pw; // P(word appears in a document)
            double[] pcw = new double[WordDocMatrix.classnumber];   // P(class | word present)
            double[] pcw_b = new double[WordDocMatrix.classnumber]; // P(class | word absent)
            for (int i = 0; i < WordDocMatrix.classnumber; i++) {
                for (int j = 0; j < category; j++) {
                    // documents of class i occupy indices [i*category, (i+1)*category)
                    if (al.get(j + i * category) > 0) {
                        wcount_class[i]++;
                    }
                }
                wcount += wcount_class[i];
            }
            pw = 1.0 * wcount / WordDocMatrix.docnumber;
            for (int i = 0; i < WordDocMatrix.classnumber; i++) {
                pcw[i] = 1.0 * wcount_class[i] / wcount;
                pcw_b[i] = 1.0 * (category - wcount_class[i])
                        / (WordDocMatrix.docnumber - wcount);
            }
            double d1 = 0.0;
            double d2 = 0.0;
            for (int i = 0; i < WordDocMatrix.classnumber; i++) {
                // Double.MIN_VALUE keeps log() finite when a probability is 0;
                // the 0-probability term then contributes 0 to the sum
                d1 += pcw[i] * Math.log(pcw[i] + Double.MIN_VALUE);
                d2 += pcw_b[i] * Math.log(pcw_b[i] + Double.MIN_VALUE);
            }
            // IG(word) = H(C) - H(C|word)
            double ig = entropy + pw * d1 + (1.0 - pw) * d2;
            map.put(word, ig);
        }
    }

    public static void main(String[] args) {
        IG inst = new IG();
        File matrixFile = new File("/home/orisun/matrix");
        initMatrix(matrixFile);
        File freqFile = new File("/home/orisun/frequency");
        // calIG re-loads the matrix itself; the call above only validates it
        inst.calIG(freqFile);
    }
}

5.FS.java需要運作。它的輸入是~/frequency下的六個檔案,調用IG.java中的calIG方法,并對各單詞的資訊增益進行從大到小排序,最後把IG值最大的N個特征項輸出到檔案~/features中,~/features是FS.java運作時建立的。這裡我們選擇6個特征詞,IG值從大到小排列為:

  1. 健康 富強 高大 身體 胳膊 美國
    這也是在FS.features中的存儲順序(FS.features是ArrayList類型)。
  2. View Code
    1 /**
     2  * Author: Orisun
     3  * Date: Aug 29, 2011
     4  * FileName: FS.java
     5  * Function: feature select with information gain
     6  */
     7 package featureselect;
     8 
     9 import java.io.BufferedWriter;
    10 import java.io.File;
    11 import java.io.FileWriter;
    12 import java.util.ArrayList;
    13 import java.util.Collections;
    14 import java.util.HashMap;
    15 import java.util.Iterator;
    16 import java.util.Map.Entry;
    17 import java.util.Map;
    18 import java.util.Comparator;
    19 
    20 public class FS {
    21 
    22     public static ArrayList<String> features=new ArrayList<String>();//存放最終選擇的特征詞
    23     
    24     //選擇資訊增益值最大的n個單詞作為特征項
    25     public void featureSelect(int n){
    26         IG inst=new IG();
    27         inst.calIG(new File("/home/orisun/frequency"));
    28         ArrayList<Entry<String,Double>> list=sort(inst.map);
    29         Iterator<Entry<String,Double>> iter=list.iterator();
    30         int index=0;
    31         while(index++<n && iter.hasNext()){
    32             Entry<String,Double> entry=iter.next();
    33             //System.out.println(entry.getKey()+"  "+entry.getValue());
    34             features.add(entry.getKey());
    35         }
    36     }
    37     
    38     //Map按value進行排序
    39     public ArrayList<Entry<String,Double>> sort(HashMap<String,Double> arg){
    40         ArrayList<Entry<String,Double>> al=new ArrayList<Entry<String,Double>>(arg.entrySet());
    41         Collections.sort(al, new Comparator<Map.Entry<String,Double>>(){
    42             public int compare(Map.Entry<String, Double> o1,Map.Entry<String,Double> o2){
    43                 double res=o2.getValue()-o1.getValue();
    44                 if(res<0)
    45                     return -1;
    46                 else if(res>0)
    47                     return 1;
    48                 else
    49                     return 0;
    50         }
    51         });
    52         return al;
    53     }
    54     
    55     //把最終選擇的特征詞存入檔案
    56     public static void main(String[] args){
    57         FS inst=new FS();
    58         inst.featureSelect(6);
    59         try{    
    60             File file=new File("/home/orisun/features");
    61             file.createNewFile();
    62             FileWriter fw=new FileWriter(file);
    63             BufferedWriter bw=new BufferedWriter(fw);
    64             Iterator<String> iter=FS.features.iterator();
    65             while(iter.hasNext()){
    66                 String feature=iter.next();
    67                 bw.write(feature);
    68                 bw.newLine();
    69             }
    70             bw.flush();
    71             bw.close();
    72         } catch (Exception e) {
    73             e.printStackTrace();
    74         }
    75     }
    76 }
          

6.建立文檔向量模型, TVSM.java需要運作。它首先從檔案 ~/features中讀取特征項存入 TVSM.features中( TVSM.features是 HashSet類型),此時特征項在 TVSM.features中的順序為:

  1. 健康 富強 胳膊 高大 美國 身體

    然後調用IG.initMatrix,從檔案中讀入word-doc矩陣,計算出特征項在文檔中的權重,最終得出文檔向量模型。

    歸一化之前6個文檔向量分别為:

    1.0986122886

    0.0

    0.0

    0.0

    0.0

    1.0986122886

    0.0

    0.0

    1.0986122886

    1.0986122886

    0.0

    0.0

    2.1972245773

    0.0

    1.0986122886

    1.0986122886

    0.0

    1.0986122886

    0.0

    0.0

    0.0

    0.0

    1.0986122886

    0.0

    0.0

    1.0986122886

    0.0

    0.0

    1.0986122886

    0.0

    0.0

    1.0986122886

    0.0

    0.0

    0.0

    0.0

    歸一化之後6個文檔向量分别為:

0.4472135954

0.0

0.0

0.0

0.0

0.70710678118

0.0

0.0

0.70710678118

0.70710678118

0.0

0.0

0.8944271909

0.0

0.70710678118

0.70710678118

0.0

0.70710678118

0.0

0.0

0.0

0.0

0.70710678118

0.0

0.0

0.70710678118

0.0

0.0

0.70710678118

0.0

0.0

0.70710678118

0.0

0.0

0.0

0.0

View Code

/**
 * Author: Orisun
 * Date: Aug 29, 2011
 * FileName: DVM.java
 * Function: build document vector model for documents in train
 */
package DVM;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map.Entry;
import java.util.Vector;

public class TVSM {

    HashSet<String> features = new HashSet<String>(); // final selected feature words
    int len; // number of features == length of each document vector
    File path = new File("/home/orisun/dvm"); // output folder for the document vectors
    // document name -> vector BEFORE normalization
    HashMap<String, Vector<Double>> dv = new HashMap<String, Vector<Double>>();
    // per-dimension sum of squared weights across all documents (for L2 norm)
    double[] sqrt;

    public void printFeature() {
        for (String feature : features) {
            System.out.print(feature + "\t");
        }
        System.out.println();
    }

    /**
     * Read the feature terms (first token of each line) from the file produced
     * by feature selection.
     */
    public void initFeatures(File file) {
        BufferedReader br = null;
        try {
            br = new BufferedReader(new FileReader(file));
            String line;
            while ((line = br.readLine()) != null) {
                features.add(line.split("\\s+")[0]);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // the original never closed this reader
            try { if (br != null) br.close(); } catch (Exception ignored) { }
        }
        len = features.size();
        sqrt = new double[len];
    }

    /**
     * Build the (un-normalized) weight vector for every frequency file under
     * srcFile, accumulating per-dimension squared sums in {@code sqrt} for the
     * later normalization pass.
     */
    public void buildDVM(File srcFile) {
        if (srcFile.isDirectory()) {
            File[] children = srcFile.listFiles();
            for (File child : children) {
                buildDVM(child);
            }
        } else if (srcFile.isFile()) {
            // NOTE(review): this re-reads the matrix file once per document;
            // hoisting the call out of the recursion would avoid repeated I/O
            featureselect.IG.initMatrix(new File("/home/orisun/matrix"));
            Object[] feature_array = features.toArray();
            HashMap<String, Double> fea_wei = new HashMap<String, Double>();

            int filename = Integer.parseInt(srcFile.getName());
            Vector<Double> v = new Vector<Double>(len);
            for (int i = 0; i < len; i++)
                v.add(0.0);
            BufferedReader br = null;
            try {
                br = new BufferedReader(new FileReader(srcFile));
                String line;
                while ((line = br.readLine()) != null) { // one word per line
                    String word = line.split("\\s+")[0];
                    if (!features.contains(word))
                        continue;
                    int tf;     // frequency of the feature in this document
                    int Ni = 0; // number of documents containing the feature
                    int N = featureselect.WordDocMatrix.docnumber; // total documents
                    ArrayList<Short> al = featureselect.IG.matrix.get(word);
                    tf = al.get(filename);
                    for (int i = 0; i < N; i++) {
                        if (al.get(i) > 0)
                            Ni++;
                    }
                    // tf * idf style weight; Double.MIN_VALUE keeps log()
                    // finite when Ni == 0
                    double weight = -1.0 * tf
                            * Math.log(1.0 * Ni / N + Double.MIN_VALUE);
                    fea_wei.put(word, weight);
                }
                for (int i = 0; i < feature_array.length; i++) {
                    String feat = feature_array[i].toString();
                    double w = 0.0;
                    if (fea_wei.containsKey(feat))
                        w = fea_wei.get(feat);
                    v.set(i, w);
                    sqrt[i] += Math.pow(w, 2);
                }
                dv.put(String.valueOf(filename), v);
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                // the original never closed this reader at all
                try { if (br != null) br.close(); } catch (Exception ignored) { }
            }
        }
    }

    /**
     * Normalize every document vector dimension-wise by the accumulated L2
     * norm and write each vector to one file per document under {@code path}.
     */
    public void unionVector() {
        for (Entry<String, Vector<Double>> entry : dv.entrySet()) {
            String fname = entry.getKey();
            Vector<Double> v = entry.getValue();

            File newFile = new File(path, fname);
            BufferedWriter bw = null;
            try {
                newFile.createNewFile();
                bw = new BufferedWriter(new FileWriter(newFile));

                int index = 0;
                for (double d : v) {
                    double norm = Math.sqrt(sqrt[index]);
                    // guard 0/0 -> NaN when a feature has zero weight in
                    // every document (the original wrote "NaN" in that case)
                    d = norm > 0.0 ? d / norm : 0.0;
                    bw.write(String.valueOf(d));
                    bw.newLine();
                    index++;
                }
                bw.flush();
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                try { if (bw != null) bw.close(); } catch (Exception ignored) { }
            }
        }
    }

    public static void main(String[] args) {
        TVSM inst = new TVSM();
        File feaFile = new File("/home/orisun/features");
        inst.initFeatures(feaFile);
        File freqFile = new File("/home/orisun/corpus");

        if (!freqFile.exists()) {
            System.out.println("檔案不存在,程式退出.");
            System.exit(2);
        }
        inst.buildDVM(freqFile);
        inst.unionVector();
    }
}

7.建立一個檔案~/unknown,檔案内容為:

  1. 美國 比 日本 富強
    接下來我們要判斷這個文檔屬于C1類還是C2類。

8.利用WordCount.java程式(當然參數檔案要作修改)我們統計出unknown檔案中各個詞語出現的次數,寫入原檔案中。

9.運作ADVM.java建立unknown檔案的文檔向量模型,向量寫入檔案~/dvm2/unknown中。其文檔向量為(不需要進行歸一化):

  1. 0.0

    1.0986122886681098

    0.0

    0.0

    1.0986122886681098

    0.0

  2. View Code
    1 /**
      2  * Author: Orisun
      3  * Date: Aug 31, 2011
      4  * FileName: ADVM.java
      5  * Function: build document vector model for a new come document
      6  */
      7 package DVM;
      8 
      9 import java.io.BufferedReader;
     10 import java.io.BufferedWriter;
     11 import java.io.File;
     12 import java.io.FileReader;
     13 import java.io.FileWriter;
     14 import java.util.ArrayList;
     15 import java.util.HashMap;
     16 import java.util.HashSet;
     17 import java.util.Iterator;
     18 import java.util.Vector;
     19 
     20 public class AVSM {
     21 
     22     HashSet<String> features = new HashSet<String>();// 存放最終選擇的特征詞
     23     int len; // 特征項的個數,亦即文檔向量的長度
     24     File path = new File("/home/orisun/dvm2"); // 存放文檔向量的路徑
     25 
     26     // 從檔案中讀入特征項。參數檔案存儲經過特征選擇後剩下的特征項。
     27     public void initFeatures(File file) {
     28         try {
     29             FileReader fr = new FileReader(file);
     30             BufferedReader br = new BufferedReader(fr);
     31             String line = null;
     32             while ((line = br.readLine()) != null) {
     33                 features.add(line.split("\\s+")[0]);
     34             }
     35         } catch (Exception e) {
     36             e.printStackTrace();
     37         }
     38         len = features.size();
     39     }
     40     
     41     //參數檔案存放文檔中單詞的頻數
     42     public void buildDVM(File srcFile) {
     43         if (srcFile.isDirectory()) {
     44             File[] children = srcFile.listFiles();
     45             for (File child : children) {
     46                 buildDVM(child);
     47             }
     48         } else if (srcFile.isFile()) {
     49             featureselect.IG.initMatrix(new File("/home/orisun/matrix"));
     50             Object[] feature_array = features.toArray();
     51             HashMap<String, Double> fea_wei = new HashMap<String, Double>();
     52 
     53             String filename = srcFile.getName();
     54             File newFile = new File(path, filename);
     55             Vector<Double> v = new Vector<Double>(len);
     56             for (int i = 0; i < len; i++)
     57                 v.add(0.0);
     58             try {
     59                 newFile.createNewFile();
     60                 FileReader fr = new FileReader(srcFile);
     61                 BufferedReader br = new BufferedReader(fr);
     62                 String line = null;
     63                 while ((line = br.readLine()) != null) { // 逐個讀取文檔中的詞語
     64                     String[] content=line.split("\\s+");
     65                     String word = content[0];
     66                     int tf=Integer.parseInt(content[1]);// 特征項在本文檔中出現的頻率
     67                     if (!features.contains(word))
     68                         continue;
     69                     int Ni = 0; // 出現特征項的文檔數目
     70                     int N = featureselect.WordDocMatrix.docnumber; // 全部文檔數目
     71                     ArrayList<Short> al = featureselect.IG.matrix.get(word);
     72                     for (int i = 0; i < N; i++) {
     73                         if (al.get(i) > 0)
     74                             Ni++;
     75                     }
     76                     //System.out.println("word="+word+"\tfilenmae="+filename+"\ttf="+tf+"\tNi="+Ni);
     77                     double weight = -1.0 * tf
     78                             * Math.log(1.0 * Ni / N + Double.MIN_VALUE);
     79                     fea_wei.put(word, weight);
     80                 }
     81                 for(int i=0;i<feature_array.length;i++){
     82                     String feat=feature_array[i].toString();
     83                     double w=0.0;
     84                     if(fea_wei.containsKey(feat))
     85                         w=fea_wei.get(feat);
     86                     v.set(i, w);
     87                 }
     88                 
     89                 //把文檔向量寫入dvm路徑下的檔案
     90                 FileWriter fw=new FileWriter(newFile);
     91                 BufferedWriter bw=new BufferedWriter(fw);
     92                 Iterator<Double> iter=v.iterator();
     93                 while(iter.hasNext()){
     94                     bw.write(String.valueOf(iter.next()));
     95                     bw.newLine();
     96                 }
     97                 bw.flush();
     98                 bw.close();
     99             } catch (Exception e) {
    100                 e.printStackTrace();
    101             }
    102 
    103         }
    104     }
    105     public static void main(String[] args){
    106         AVSM inst=new AVSM();
    107         File feaFile=new File("/home/orisun/features"); 
    108         inst.initFeatures(feaFile);
    109         //inst.printFeature();
    110         File freqFile=new File("/home/orisun/unknown");
    111         inst.buildDVM(freqFile);
    112         if(!freqFile.exists()){
    113             System.out.println("檔案不存在,程式退出.");
    114             System.exit(2);
    115         }
    116         
    117     }
    118 }      
    10.運作KNN.java。計算出unknown向量和6個訓練向量的夾角餘弦為:
1
2
3 0.7071
4 1
5 0.7071

餘弦值越大說明夾角越小,說明向量越相似。顯然unknown檔案與文檔3、4、5很相似,是以它應該屬于第2類!

View Code

/**
 * Author: Orisun
 * Date: Aug 30, 2011
 * FileName: KNN.java
 * Function: K-Nearest Neighbour Algorithm
 */
package classification;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Vector;
import java.util.Map.Entry;

import featureselect.FS;

public class KNN {
    // training-document name -> cosine similarity to the unknown document
    HashMap<String, Double> dist = new HashMap<String, Double>();
    Vector<Double> uv = new Vector<Double>(); // vector of the document to classify

    /**
     * Load the vector of the document to classify; srcFile holds one component
     * per line.
     */
    public void initUV(File srcFile) {
        if (!srcFile.exists()) {
            System.err.println("File not found.Program exit with failure.");
            System.exit(2);
        }
        BufferedReader br = null;
        try {
            br = new BufferedReader(new FileReader(srcFile));
            String line;
            while ((line = br.readLine()) != null) {
                uv.add(Double.valueOf(line.trim()));
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // close on the error path too (the original leaked on exception)
            try { if (br != null) br.close(); } catch (IOException ignored) { }
        }
    }

    /**
     * srcFile is the folder holding the training-set document vectors; fills
     * {@code dist} with the cosine similarity of each to {@code uv}.
     */
    public void calDist(File srcFile) {
        File[] children = srcFile.listFiles();

        for (File child : children) {
            String filename = child.getName();
            Vector<Double> v = new Vector<Double>();
            BufferedReader br = null;
            try {
                br = new BufferedReader(new FileReader(child));
                String line;
                while ((line = br.readLine()) != null) {
                    v.add(Double.valueOf(line.trim()));
                }
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                try { if (br != null) br.close(); } catch (IOException ignored) { }
            }
            int len = v.size();
            double d = cos(v, uv, len);
            dist.put(filename, d);
            System.out.println("到" + filename + "的距離是" + d);
        }
    }

    /**
     * Cosine of the angle between two vectors over their first len components.
     * The larger the absolute value, the smaller the angle, i.e. the more
     * similar (closer) the vectors.
     */
    public double cos(Vector<Double> v1, Vector<Double> v2, int len) {
        double mul = 0.0;
        double p1 = 0.0, p2 = 0.0;
        for (int i = 0; i < len; i++) {
            double one = v1.get(i);
            double two = v2.get(i);
            mul += one * two;
            p1 += Math.pow(one, 2);
            p2 += Math.pow(two, 2);
        }
        return Math.abs(mul) / Math.sqrt(p1 * p2);
    }

    /**
     * Vote among the k nearest training documents (largest cosine similarity)
     * and print the predicted class.
     */
    public void knn(int k) {
        // sort similarities to all training vectors, descending
        FS fs = new FS();
        ArrayList<Entry<String, Double>> dist_list = fs.sort(dist);
        int c1 = 0, c2 = 0;
        // guard: never index past the number of available training vectors
        int limit = Math.min(k, dist_list.size());
        for (int i = 0; i < limit; i++) {
            Entry<String, Double> entry = dist_list.get(i);
            int fileNum = Integer.parseInt(entry.getKey());
            if (fileNum >= 0 && fileNum < 3)
                c1++;
            else if (fileNum >= 3 && fileNum < 6)
                c2++;
        }
        if (c1 > c2)
            System.out.println("屬于第1類!");
        else if (c2 > c1) // BUGFIX: was (c2 > 1), which mis-reported ties (e.g. c1==c2==2) as class 2
            System.out.println("屬于第2類!");
        else
            System.out.println("屬于兩類的可能性一樣大!");
    }

    public static void main(String[] args) {
        KNN inst = new KNN();
        File uvFile = new File("/home/orisun/dvm2/unknown");
        inst.initUV(uvFile);
        File tvFiles = new File("/home/orisun/dvm");
        inst.calDist(tvFiles);
        inst.knn(3);
    }
}

以上隻是簡單的模拟,在實際文本分類中訓練集很大時,word-doc matrix會很大,必然造成記憶體溢出。解決方法參見我的從原始文檔到KNN分類算法實作(二)