天天看点

朴素贝叶斯文本分类算法源代码

附录:朴素贝叶斯文本分类算法源代码

1、算法实现

实验的实现使用的是C#。训练数据存放在 D:\train 文件夹中,测试数据存放在 D:\test 文件夹中(与代码中读取的路径一致)。两个文件夹中的单个文档均以“类别号 (索引号).txt”的格式命名,例如“1 (25).txt”,其中类别号为1-4,分别对应于计算机、艺术、历史和体育;索引号为1-100,对应每个类别的100个文档。

1.1  定义、初始化部分:

// Vocabulary: every distinct (non-ignored) word seen in the training documents.
HashSet<string> dict = new HashSet<string>();

// Punctuation marks and stop words that are skipped both while training and while classifying.
// NOTE(review): "," and ":" are ASCII here; if the corpus uses full-width punctuation these
// two entries may have been altered in transcription — confirm against the data.
HashSet<string> ignore = new HashSet<string>()
{
    "——", "“", ",", "。", "”", ":", "、", "【", "】", "的", "了", "在", "是", "我", "有",
    "和", "就", "不", "人", "都", "一", "一个", "上", "也", "很", "到", "说", "要", "去", "你", "会",
    "着", "没有", "看", "好", "自己", "这"
};

// Per-document word counts: document index -> (word -> number of occurrences in that document).
Dictionary<int, Dictionary<string, int>> posts = new Dictionary<int, Dictionary<string, int>>();

// Number of text categories considered.
int num_class = 4;

// "n" in the algorithm: total number of word positions in the training text of each class.
int[] num = new int[num_class];

// Prior probability of each class.
double[] P1 = new double[num_class];

// Conditional probabilities P(word | class), indexed [class, vocabulary position].
// Only declared here: its second dimension is the vocabulary size (dict.Count), which does not
// exist until the training documents have been read, so the array has to be allocated after
// that point rather than at declaration time.
double[,] P2;

// Number of misclassified test documents per class. A freshly allocated int[] is already
// zero-filled, so no explicit clearing loop is required.
int[] err = new int[num_class];
           

1.2 训练部分

(1)获取vocabulary

// Training pass over every document of every class: builds the vocabulary (dict), records the
// per-document word counts (posts) and accumulates the per-class word-position totals (num).
for (int pr = 1; pr <= num_class; pr++)
{
    for (int i = 1; i <= 100; i++)
    {
        // Read the whole document. The using block closes the file even if reading throws,
        // so 400 file handles are not left waiting for the garbage collector.
        List<string> lines = new List<string>();
        using (StreamReader sr = new StreamReader(@"d:\train\" + pr.ToString() + " (" + i.ToString() + ").txt"))
        {
            while (!sr.EndOfStream)
            {
                lines.Add(sr.ReadLine().Trim());
            }
        }

        // Join the trimmed lines with a space: concatenating them directly would fuse the last
        // word of one line with the first word of the next into a single bogus token.
        string[] arr = string.Join(" ", lines.ToArray()).Split(' ');

        // Documents are numbered consecutively: class pr occupies indices (pr-1)*100 .. (pr-1)*100+99.
        int docIndex = (pr - 1) * 100 + i - 1;
        Dictionary<string, int> counts = new Dictionary<string, int>();
        posts[docIndex] = counts;

        foreach (string token in arr)
        {
            // GetWord is defined elsewhere in the project; presumably it normalises the raw
            // token — verify against its definition.
            string word = GetWord(token);
            if (word.Trim().Length == 0)
            {
                continue;
            }
            if (ignore.Contains(word)) // stop word or punctuation mark
            {
                continue;
            }

            // Count only the positions that take part in the model. Counting blank and ignored
            // tokens as well would inflate the denominator of P(word | class) with words that
            // can never appear in a numerator.
            num[pr - 1]++;

            // HashSet.Add is a no-op for a word that is already present.
            dict.Add(word);

            // TryGetValue leaves 'seen' at 0 for a word not yet counted in this document.
            int seen;
            counts.TryGetValue(word, out seen);
            counts[word] = seen + 1;
        }
    }
}
Console.WriteLine("{0} words in dict.", dict.Count); // size of the vocabulary
           

(2)计算先验概率和条件概率

// Vocabulary size; known only now that every training document has been read.
int num_vocabulary = dict.Count;

// Size the conditional-probability table here, where the vocabulary size actually exists:
// one row per class, one column per vocabulary word.
P2 = new double[num_class, num_vocabulary];

// Prior probability P(Vj): uniform over the classes.
for (int i = 0; i < num_class; i++)
{
    P1[i] = 1.0 / num_class;
}

// Conditional probability P(word | class) with add-one (Laplace) smoothing.
for (int i = 0; i < num_class; i++)
{
    // Column index of the current word. It follows the enumeration order of dict, so any code
    // that looks a word up in P2 must derive the index from the same enumeration order.
    int j = 0;
    foreach (string word in dict)
    {
        // Total occurrences of this word across the 100 training documents of class i.
        int occurrences = 0;
        for (int k = 100 * i; k < 100 * i + 100; k++)
        {
            int inDocument;
            if (posts[k].TryGetValue(word, out inDocument))
            {
                occurrences += inDocument;
            }
        }

        P2[i, j] = (double)(occurrences + 1) / (double)(num[i] + num_vocabulary);
        j++;
    }
}
           

1.3  分类部分

// Map every vocabulary word to its column in P2. The columns were assigned by enumerating dict,
// so enumerating the (unchanged) set again yields the same positions. Building the map once
// replaces a linear scan of the whole vocabulary for every token of every test document.
Dictionary<string, int> wordIndex = new Dictionary<string, int>();
int nextColumn = 0;
foreach (string vocabularyWord in dict)
{
    wordIndex[vocabularyWord] = nextColumn;
    nextColumn++;
}

// Classify every document of the test set and tally the misclassifications per true class.
for (int pr = 1; pr <= num_class; pr++)
{
    for (int i = 1; i <= 100; i++)
    {
        // Running (rescaled) score of this document for each class, seeded with the priors.
        double[] P3 = new double[num_class];
        for (int k = 0; k < num_class; k++)
        {
            P3[k] = P1[k];
        }

        // Read the whole document; the using block closes the file even if reading throws.
        List<string> cs_lines = new List<string>();
        using (StreamReader cs_sr = new StreamReader(@"d:\test\" + pr.ToString() + " (" + i.ToString() + ").txt"))
        {
            while (!cs_sr.EndOfStream)
            {
                cs_lines.Add(cs_sr.ReadLine().Trim());
            }
        }

        // Join with a space so that words on either side of a line break are not fused together.
        string[] cs_arr = string.Join(" ", cs_lines.ToArray()).Split(' ');

        foreach (string token in cs_arr)
        {
            // GetWord is defined elsewhere in the project; presumably it normalises the raw
            // token — verify against its definition.
            string word = GetWord(token);
            if (word.Trim().Length == 0)
            {
                continue;
            }
            if (ignore.Contains(word))
            {
                continue;
            }

            // Words never seen during training carry no information and are skipped.
            int column;
            if (!wordIndex.TryGetValue(word, out column))
            {
                continue;
            }

            for (int k = 0; k < num_class; k++)
            {
                P3[k] = P3[k] * P2[k, column];
            }

            // The products shrink quickly. While every score is below the threshold, scale all
            // of them by the same factor: this keeps them comparable without changing which one
            // is largest. The check covers all num_class scores rather than four fixed slots.
            bool rescale = true;
            while (rescale)
            {
                bool anyPositive = false;
                for (int k = 0; k < num_class; k++)
                {
                    if (!(P3[k] < 0.0001))
                    {
                        rescale = false;
                        break;
                    }
                    if (P3[k] > 0)
                    {
                        anyPositive = true;
                    }
                }

                // Scores that are all exactly zero can never be lifted above the threshold;
                // stop instead of looping forever.
                if (!anyPositive)
                {
                    rescale = false;
                }

                if (rescale)
                {
                    for (int k = 0; k < num_class; k++)
                    {
                        P3[k] *= 1000;
                    }
                }
            }
        }

        // Max is defined elsewhere in the project.
        // NOTE(review): the comparison with pr assumes Max returns a 1-based class number — confirm.
        int result = Max(P3);
        if (result != pr)
        {
            err[pr - 1]++;
        }

        // Single-digit document numbers are printed with a leading zero.
        string paddedIndex = (1 <= i && i <= 9 ? "0" : "") + i.ToString();
        Console.Write(pr.ToString() + " (" + paddedIndex + ").txt: " + "类别" + result.ToString() + "   ");
    }
    Console.Write("\n");
}
           

继续阅读