附录:朴素贝叶斯文本分类算法源代码
1、算法实现
实验使用C#实现。训练数据存放在 D:\train 文件夹中,测试数据存放在 D:\test 文件夹中。每个文档均以“类别号 (索引号).txt”的格式命名(例如“1 (1).txt”),其中类别号为1-4,分别对应于计算机、艺术、历史和体育;索引号为1-100,对应每个类别的100个文档。
1.1 定义、初始化部分:
1 HashSet<string> dict = new HashSet<string>();//vocabulary: every distinct word kept from the training samples
2 HashSet<string> ignore = new HashSet<string>() {"——", "“", ",", "。", "”", ":", "、","【", "】", "的", "了", "在", "是", "我", "有", "和", "就", "不", "人", "都", "一", "一个", "上", "也", "很", "到", "说", "要", "去", "你", "会","着", "没有", "看", "好", "自己", "这" };//punctuation marks and stop words that are skipped
3 Dictionary<int, Dictionary<string, int>> posts = new Dictionary<int, Dictionary<string, int>>();//document id -> (word -> number of occurrences in that document)
4 int num_class = 4;//four text categories are considered
5 int[] num = new int[num_class];//n in the algorithm: total number of word positions in each category
6 double[] P1;//prior probability of each category
7 P1 = new double[num_class];
8 double[,] P2;//conditional probability of each vocabulary word given each category
9 P2 = new double[num_class, num_vocabulary];//NOTE(review): num_vocabulary is only computed from dict.Count later in the listing, after the vocabulary is built -- confirm this allocation runs after that point in the real program
10 int[] err;//number of misclassified test documents per category
11 err = new int[num_class];
12 for (int m = 0; m < num_class; m++)
13     err[m] = 0;//explicit reset (a freshly allocated int[] is already zero-filled)
1.2 训练部分
(1)获取vocabulary
14 for (int pr = 1; pr <= num_class; pr++)//walk the whole training set: build the vocabulary and load every document into posts
15 {
16     for (int i = 1; i <= 100; i++)
17     {
18         string context = "";
19         using (StreamReader sr = new StreamReader(@"d:\train\" + pr.ToString() + " (" + i.ToString() + ").txt"))//using closes the file even if reading throws
20         {
21             string line;
22             while ((line = sr.ReadLine()) != null)
23                 context = context + line.Trim() + " ";//keep a separator so words on adjacent lines do not fuse together
24         }
25         string[] arr = context.Split(' ');
26         Dictionary<string, int> post = new Dictionary<string, int>();//word -> occurrences in this document
27         posts[(pr - 1) * 100 + i - 1] = post;//document id: 0-99 for category 1, 100-199 for category 2, ...
28 
29         for (int idx = 0; idx < arr.Length; ++idx)//visit every token of the document
30         {
31             string word = GetWord(arr[idx]);
32             if (word.Trim().Length == 0)
33                 continue;
34             if (ignore.Contains(word))//stop word or punctuation mark
35                 continue;
36             dict.Add(word);//HashSet.Add is a no-op for words that are already present
37             //count only the positions that are actually kept, so n equals the sum of the per-word counts used in (count + 1) / (n + |V|)
38             num[pr - 1]++;
39             int seen;
40             post.TryGetValue(word, out seen);//seen stays 0 when the word is new to this document
41             post[word] = seen + 1;
42         }
43     }
44 }
45 Console.WriteLine("{0} words in dict.", dict.Count);//print the vocabulary size
(2)计算先验概率和条件概率
46 int num_vocabulary = dict.Count;
47 for (int i = 0; i < num_class; i++)//prior P(Vj): uniform over the categories
48     P1[i] = (double)1 / (double)num_class;
49 P2 = new double[num_class, num_vocabulary];//size the table here, where the final vocabulary size is known
50 for (int i = 0; i < num_class; i++)//conditional probabilities P(word | category i)
51 {
52     int j = 0;//column of the current word; follows the enumeration order of dict, which stays fixed while dict is not modified
53     foreach (string word in dict)
54     {
55         int num1 = 0;//total occurrences of this word across the documents of category i
56         for (int k = 100 * i; k < 100 * i + 100; k++)
57         {
58             int occurrences;
59             if (posts[k].TryGetValue(word, out occurrences))
60                 num1 += occurrences;
61         }
62         P2[i, j] = (double)(num1 + 1) / (double)(num[i] + num_vocabulary);//add-one (Laplace) smoothed estimate
63         j++;
64     }
65 }
1.3 分类部分
66 Dictionary<string, int> column = new Dictionary<string, int>();//word -> its column in P2, built once instead of scanning dict for every token
67 foreach (string word in dict)//same enumeration order that was used when P2 was filled
68     column[word] = column.Count;
69 for (int pr = 1; pr <= num_class; pr++)//walk every document of the test set
70 {
71     for (int i = 1; i <= 100; i++)
72     {
73         double[] P3 = new double[num_class];//score of this document for each category
74         for (int k = 0; k < num_class; k++)
75             P3[k] = P1[k];//start from the prior
76 
77         string cs_context = "";
78         using (StreamReader cs_sr = new StreamReader(@"d:\test\" + pr.ToString() + " (" + i.ToString() + ").txt"))//using closes the file even if reading throws
79         {
80             string cs_line;
81             while ((cs_line = cs_sr.ReadLine()) != null)
82                 cs_context = cs_context + cs_line.Trim() + " ";//keep a separator so words on adjacent lines do not fuse together
83         }
84         string[] cs_arr = cs_context.Split(' ');
85 
86         for (int idx = 0; idx < cs_arr.Length; ++idx)//multiply in the conditional probability of every known word
87         {
88             string token = GetWord(cs_arr[idx]);
89             if (token.Trim().Length == 0)
90                 continue;
91             if (ignore.Contains(token))
92                 continue;
93             int num2;//column of the token in P2
94             if (!column.TryGetValue(token, out num2))//words never seen in training are skipped
95                 continue;
96             for (int k = 0; k < num_class; k++)
97                 P3[k] = P3[k] * P2[k, num2];
98 
99             double largest = P3[0];
100             for (int k = 1; k < num_class; k++)
101                 if (P3[k] > largest)
102                     largest = P3[k];
103             while (largest > 0 && largest < 0.0001)//all scores are tiny: scale them up together so they stay comparable; the > 0 guard prevents an endless loop
104             {
105                 for (int k = 0; k < num_class; k++)
106                     P3[k] *= 1000;
107                 largest *= 1000;
108             }
109         }
110 
111         int result = Max(P3);//presumably the 1-based number of the best-scoring category, since it is compared with pr -- confirm against Max
112         if (result != pr)
113             err[pr - 1]++;
114         if (1 <= i && i <= 9)//pad single-digit indices with a leading zero
115             Console.Write(pr.ToString() + " (0" + i.ToString() + ").txt: " + "类别" + result.ToString() + " ");
116         else
117             Console.Write(pr.ToString() + " (" + i.ToString() + ").txt: " + "类别" + result.ToString() + " ");
118     }
119     Console.Write("\n");
120 }